diff --git a/.gitignore b/.gitignore
index 62302db..038a450 100644
--- a/.gitignore
+++ b/.gitignore
@@ -86,3 +86,5 @@ CLAUDE.md
docs/_build/
internal_docs/
.worktrees/
+# Added by code-review-graph
+.code-review-graph/
diff --git a/docs/source/design_doc/marufs_kernel_module_architecture.md b/docs/source/design_doc/marufs_kernel_module_architecture.md
new file mode 100644
index 0000000..d55383d
--- /dev/null
+++ b/docs/source/design_doc/marufs_kernel_module_architecture.md
@@ -0,0 +1,194 @@
+# marufs Kernel Module Architecture
+
+## Module Components
+
+```mermaid
+graph TB
+ subgraph VFS["VFS Layer"]
+ super["super.c
mount / umount / format"]
+ dir["dir.c
readdir / lookup
create / unlink"]
+ inode["inode.c
iget / new_inode / evict"]
+ file["file.c
mmap / ftruncate / ioctl"]
+ end
+
+ subgraph Data["Data Layer"]
+ index["index.c
hash index
insert / lookup / delete"]
+ region["region.c
RAT allocator
alloc / free"]
+ nrht_m["nrht.c
Name-Ref Hash Table
name_offset / find_name"]
+ end
+
+ subgraph Security["Security Layer"]
+ acl["acl.c
perm check / grant
chown / set_default"]
+ end
+
+ subgraph Maint["Maintenance"]
+ gc["gc.c
4-phase GC sweep"]
+ sysfs["sysfs.c
/sys/fs/marufs/ stats"]
+ end
+
+ super --> dir & inode & file
+ dir --> index
+ file --> index & region & acl & nrht_m
+ inode --> index
+ gc -.-> index & region & nrht_m
+
+ style VFS fill:#1B4F72,stroke:#1B4F72,stroke-width:2px,color:#fff
+ style Data fill:#145A32,stroke:#145A32,stroke-width:2px,color:#fff
+ style Security fill:#78281F,stroke:#78281F,stroke-width:2px,color:#fff
+ style Maint fill:#2C3E50,stroke:#2C3E50,stroke-width:2px,color:#fff
+```
+
+| Layer | File | Role |
+|-------|------|------|
+| VFS | `super.c` | Module init, mount/umount, DAX device setup, mkfs (format) |
+| VFS | `dir.c` | Directory operations: readdir, lookup, create, unlink, d_revalidate |
+| VFS | `inode.c` | Inode lifecycle: iget (from CXL index), new_inode, evict |
+| VFS | `file.c` | File operations: mmap (DAX fault), ftruncate (region alloc), ioctl dispatch |
+| Data | `index.c` | Global partitioned index: CAS-based insert/lookup/delete, hash chain walk |
+| Data | `region.c` | RAT (Region Allocation Table): contiguous space finder, alloc/free entries |
+| Data | `nrht.c` | Name-Ref Hash Table: name_offset, find_name, batch operations |
+| Security | `acl.c` | Permission enforcement: delegation table check, perm_grant, chown |
+| Maintenance | `gc.c` | Background GC: 4-phase sweep (dead process, stale index, local tracker, NRHT) |
+| Maintenance | `sysfs.c` | sysfs interface: `/sys/fs/marufs/` stats and configuration |
+
+## CXL Memory Layout
+
+```mermaid
+block-beta
+ columns 4
+
+ sb_label["◼ Global Superblock"]:4
+ sb["Superblock (256B)"]:4
+
+ space:4
+
+ gi_label["◼ Global Index"]:4
+ sh0["Shard Header 0 (64B)"]
+ sh1["Shard Header 1 (64B)"]
+ sh2["Shard Header 2 (64B)"]
+ sh3["Shard Header 3 (64B)"]
+ bk0["Buckets 0 (256 × 4B)"]
+ bk1["Buckets 1 (256 × 4B)"]
+ bk2["Buckets 2 (256 × 4B)"]
+ bk3["Buckets 3 (256 × 4B)"]
+ en0["Entries 0 (256 × 64B)"]
+ en1["Entries 1 (256 × 64B)"]
+ en2["Entries 2 (256 × 64B)"]
+ en3["Entries 3 (256 × 64B)"]
+
+ space:4
+
+ rat_label["◼ Region Allocation Table"]:4
+ rat_hdr["RAT Header (128B)"]:4
+ r0["RAT Entry 0 (2 KB)"]
+ r1["RAT Entry 1 (2 KB)"]
+ r_dot["... (× 253)"]
+ r255["RAT Entry 255 (2 KB)"]
+
+ space:4
+
+ rg["Region 0, 1, 2, ... (2 MB aligned each)"]:4
+
+ style sb_label fill:#1B4F72,color:#fff,font-weight:bold
+ style sb fill:#2E86C1,color:#fff
+ style gi_label fill:#145A32,color:#fff,font-weight:bold
+ style sh0 fill:#1E8449,color:#fff
+ style sh1 fill:#1E8449,color:#fff
+ style sh2 fill:#1E8449,color:#fff
+ style sh3 fill:#1E8449,color:#fff
+ style bk0 fill:#27AE60,color:#fff
+ style bk1 fill:#27AE60,color:#fff
+ style bk2 fill:#27AE60,color:#fff
+ style bk3 fill:#27AE60,color:#fff
+ style en0 fill:#52BE80,color:#fff
+ style en1 fill:#52BE80,color:#fff
+ style en2 fill:#52BE80,color:#fff
+ style en3 fill:#52BE80,color:#fff
+ style rat_label fill:#78281F,color:#fff,font-weight:bold
+ style rat_hdr fill:#C0392B,color:#fff
+ style r0 fill:#E74C3C,color:#fff
+ style r1 fill:#E74C3C,color:#fff
+ style r_dot fill:#E74C3C,color:#fff
+ style r255 fill:#E74C3C,color:#fff
+ style rg fill:#2C3E50,color:#fff
+```
+
+| Block | Size | Description |
+|-------|------|-------------|
+| Superblock | 256B (4 CL) | FS geometry, shard count, offsets, mounted node bitmask (`active_nodes`) |
+| Shard Header | 64B (1 CL) × 4 | Per-shard bucket/entry array offsets (immutable after format) |
+| Buckets | 4B × 256 per shard | Hash chain head pointers (`head_entry_idx` or `BUCKET_END`) |
+| Entries | 64B (1 CL) × 256 per shard | Index entries: state, name_hash, region_id, next_in_bucket |
+| RAT Header | 128B (2 CL) | max_entries, alloc_lock (CAS spinlock), allocation stats |
+| RAT Entry | 2 KB (32 CL) × 256 | CL0: phys_offset/size, CL1: name, CL2: ACL, CL3-31: delegation |
+| Region Data | 2 MB aligned each | Actual file data, variable size |
+
+## ACL (Access Control List)
+
+```mermaid
+flowchart TD
+ subgraph GI_path["Global Index"]
+ gi_path["filename → Index Entry
(hash → shard → bucket → chain)"]
+ end
+
+ subgraph RAT_path["RAT Entry (via region_id)"]
+ cl0["CL0: phys_offset, size"]
+ cl2["CL2: Owner + default_perms"]
+ cl3["CL3-31: Delegation Table
(up to 29 entries)"]
+ end
+
+ subgraph RD_path["Region Data"]
+ region["mmap / read / write / unlink
(open is always allowed)"]
+ end
+
+ gi_path -->|"region_id"| RAT_path
+ cl0 -->|"phys_offset"| region
+ cl2 -.->|"owner / default"| region
+ cl3 -.->|"delegated perms"| region
+
+ style GI_path fill:#145A32,stroke:#145A32,stroke-width:2px,color:#fff
+ style RAT_path fill:#1B4F72,stroke:#1B4F72,stroke-width:2px,color:#fff
+ style RD_path fill:#2C3E50,stroke:#2C3E50,stroke-width:2px,color:#fff
+```
+
+- **Data path** (solid): `filename → Index Entry → region_id → CL0.phys_offset → Region`
+- **Permission path** (dotted): Owner (implicit all) → default_perms (non-owner baseline) → Delegation Table (per-node/pid grants)
+- open() always allowed — permission check at mmap / read / write / unlink
+- Delegations stored on CXL — immediately visible cross-node
+
+## NRHT (Name-Ref Hash Table)
+
+```mermaid
+flowchart TD
+ subgraph GI["Global Index"]
+ gi_entry["filename → region_id
(file-level lookup)"]
+ end
+
+ subgraph NRHT_F["NRHT File"]
+ nrht_entry["name → (offset, target_region_id)
(application-level reference)"]
+ end
+
+ subgraph Regions["Region files"]
+ r0["Region 0 (data)"]
+ r1["Region 1 (data)"]
+ r2["Region 2 (data)"]
+ end
+
+ gi_entry -->|"region_id=0"| r0
+ gi_entry -->|"region_id=1"| r1
+ gi_entry -->|"region_id=2"| r2
+ gi_entry -->|"region_id=5 (NRHT)"| NRHT_F
+
+ nrht_entry -->|"target=0, offset=0x1000"| r0
+ nrht_entry -->|"target=1, offset=0x2000"| r1
+ nrht_entry -->|"target=0, offset=0x3000"| r0
+
+ style GI fill:#145A32,stroke:#145A32,stroke-width:2px,color:#fff
+ style NRHT_F fill:#1B4F72,stroke:#1B4F72,stroke-width:2px,color:#fff
+ style Regions fill:#2C3E50,stroke:#2C3E50,stroke-width:2px,color:#fff
+```
+
+- **Global Index**: `filename → region_id` — filesystem-level file lookup
+- **NRHT**: `name → (offset, target_region_id)` — application-level intra-region references (e.g., KV cache keys)
+- A single NRHT can freely reference **multiple regions** (N:M relationship)
+- NRHT files are regular regions registered in the Global Index (own RAT entry)
diff --git a/marufs_kernel/.clang-format b/marufs_kernel/.clang-format
new file mode 100644
index 0000000..1cc151e
--- /dev/null
+++ b/marufs_kernel/.clang-format
@@ -0,0 +1,806 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 11.
+#
+# For more information, see:
+#
+# Documentation/dev-tools/clang-format.rst
+# https://clang.llvm.org/docs/ClangFormat.html
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: false
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ AfterExternBlock: false
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ SplitEmptyFunction: true
+ SplitEmptyRecord: true
+ SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeInheritanceComma: false
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeComma
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 80
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: false
+
+# Taken from:
+# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
+# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
+# | LC_ALL=C sort -u
+ForEachMacros:
+ - '__ata_qc_for_each'
+ - '__bio_for_each_bvec'
+ - '__bio_for_each_segment'
+ - '__evlist__for_each_entry'
+ - '__evlist__for_each_entry_continue'
+ - '__evlist__for_each_entry_from'
+ - '__evlist__for_each_entry_reverse'
+ - '__evlist__for_each_entry_safe'
+ - '__for_each_mem_range'
+ - '__for_each_mem_range_rev'
+ - '__for_each_thread'
+ - '__hlist_for_each_rcu'
+ - '__map__for_each_symbol_by_name'
+ - '__pci_bus_for_each_res0'
+ - '__pci_bus_for_each_res1'
+ - '__pci_dev_for_each_res0'
+ - '__pci_dev_for_each_res1'
+ - '__perf_evlist__for_each_entry'
+ - '__perf_evlist__for_each_entry_reverse'
+ - '__perf_evlist__for_each_entry_safe'
+ - '__rq_for_each_bio'
+ - '__shost_for_each_device'
+ - '__sym_for_each'
+ - '_for_each_counter'
+ - 'apei_estatus_for_each_section'
+ - 'ata_for_each_dev'
+ - 'ata_for_each_link'
+ - 'ata_qc_for_each'
+ - 'ata_qc_for_each_raw'
+ - 'ata_qc_for_each_with_internal'
+ - 'ax25_for_each'
+ - 'ax25_uid_for_each'
+ - 'bio_for_each_bvec'
+ - 'bio_for_each_bvec_all'
+ - 'bio_for_each_folio_all'
+ - 'bio_for_each_integrity_vec'
+ - 'bio_for_each_segment'
+ - 'bio_for_each_segment_all'
+ - 'bio_list_for_each'
+ - 'bip_for_each_vec'
+ - 'bond_for_each_slave'
+ - 'bond_for_each_slave_rcu'
+ - 'bpf_for_each'
+ - 'bpf_for_each_reg_in_vstate'
+ - 'bpf_for_each_reg_in_vstate_mask'
+ - 'bpf_for_each_spilled_reg'
+ - 'bpf_object__for_each_map'
+ - 'bpf_object__for_each_program'
+ - 'btree_for_each_safe128'
+ - 'btree_for_each_safe32'
+ - 'btree_for_each_safe64'
+ - 'btree_for_each_safel'
+ - 'card_for_each_dev'
+ - 'cgroup_taskset_for_each'
+ - 'cgroup_taskset_for_each_leader'
+ - 'cpu_aggr_map__for_each_idx'
+ - 'cpufreq_for_each_efficient_entry_idx'
+ - 'cpufreq_for_each_entry'
+ - 'cpufreq_for_each_entry_idx'
+ - 'cpufreq_for_each_valid_entry'
+ - 'cpufreq_for_each_valid_entry_idx'
+ - 'css_for_each_child'
+ - 'css_for_each_descendant_post'
+ - 'css_for_each_descendant_pre'
+ - 'damon_for_each_region'
+ - 'damon_for_each_region_from'
+ - 'damon_for_each_region_safe'
+ - 'damon_for_each_scheme'
+ - 'damon_for_each_scheme_safe'
+ - 'damon_for_each_target'
+ - 'damon_for_each_target_safe'
+ - 'damos_for_each_core_filter'
+ - 'damos_for_each_core_filter_safe'
+ - 'damos_for_each_ops_filter'
+ - 'damos_for_each_ops_filter_safe'
+ - 'damos_for_each_quota_goal'
+ - 'damos_for_each_quota_goal_safe'
+ - 'data__for_each_file'
+ - 'data__for_each_file_new'
+ - 'data__for_each_file_start'
+ - 'def_for_each_cpu'
+ - 'device_for_each_child_node'
+ - 'device_for_each_child_node_scoped'
+ - 'dma_fence_array_for_each'
+ - 'dma_fence_chain_for_each'
+ - 'dma_fence_unwrap_for_each'
+ - 'dma_resv_for_each_fence'
+ - 'dma_resv_for_each_fence_unlocked'
+ - 'do_for_each_ftrace_op'
+ - 'drm_atomic_crtc_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane_state'
+ - 'drm_atomic_for_each_plane_damage'
+ - 'drm_client_for_each_connector_iter'
+ - 'drm_client_for_each_modeset'
+ - 'drm_connector_for_each_possible_encoder'
+ - 'drm_exec_for_each_locked_object'
+ - 'drm_exec_for_each_locked_object_reverse'
+ - 'drm_for_each_bridge_in_chain_scoped'
+ - 'drm_for_each_connector_iter'
+ - 'drm_for_each_crtc'
+ - 'drm_for_each_crtc_reverse'
+ - 'drm_for_each_encoder'
+ - 'drm_for_each_encoder_mask'
+ - 'drm_for_each_fb'
+ - 'drm_for_each_legacy_plane'
+ - 'drm_for_each_plane'
+ - 'drm_for_each_plane_mask'
+ - 'drm_for_each_privobj'
+ - 'drm_gem_for_each_gpuvm_bo'
+ - 'drm_gem_for_each_gpuvm_bo_safe'
+ - 'drm_gpusvm_for_each_range'
+ - 'drm_gpuva_for_each_op'
+ - 'drm_gpuva_for_each_op_from_reverse'
+ - 'drm_gpuva_for_each_op_reverse'
+ - 'drm_gpuva_for_each_op_safe'
+ - 'drm_gpuvm_bo_for_each_va'
+ - 'drm_gpuvm_bo_for_each_va_safe'
+ - 'drm_gpuvm_for_each_va'
+ - 'drm_gpuvm_for_each_va_range'
+ - 'drm_gpuvm_for_each_va_range_safe'
+ - 'drm_gpuvm_for_each_va_safe'
+ - 'drm_mm_for_each_hole'
+ - 'drm_mm_for_each_node'
+ - 'drm_mm_for_each_node_in_range'
+ - 'drm_mm_for_each_node_safe'
+ - 'dsa_switch_for_each_available_port'
+ - 'dsa_switch_for_each_cpu_port'
+ - 'dsa_switch_for_each_cpu_port_continue_reverse'
+ - 'dsa_switch_for_each_port'
+ - 'dsa_switch_for_each_port_continue_reverse'
+ - 'dsa_switch_for_each_port_safe'
+ - 'dsa_switch_for_each_user_port'
+ - 'dsa_switch_for_each_user_port_continue_reverse'
+ - 'dsa_tree_for_each_cpu_port'
+ - 'dsa_tree_for_each_user_port'
+ - 'dsa_tree_for_each_user_port_continue_reverse'
+ - 'dso__for_each_symbol'
+ - 'elf_hash_for_each_possible'
+ - 'elf_symtab__for_each_symbol'
+ - 'evlist__for_each_cpu'
+ - 'evlist__for_each_entry'
+ - 'evlist__for_each_entry_continue'
+ - 'evlist__for_each_entry_from'
+ - 'evlist__for_each_entry_reverse'
+ - 'evlist__for_each_entry_safe'
+ - 'flow_action_for_each'
+ - 'for_each_acpi_consumer_dev'
+ - 'for_each_acpi_dev_match'
+ - 'for_each_active_dev_scope'
+ - 'for_each_active_drhd_unit'
+ - 'for_each_active_iommu'
+ - 'for_each_active_irq'
+ - 'for_each_active_route'
+ - 'for_each_aggr_pgid'
+ - 'for_each_alloc_capable_rdt_resource'
+ - 'for_each_and_bit'
+ - 'for_each_andnot_bit'
+ - 'for_each_available_child_of_node'
+ - 'for_each_available_child_of_node_scoped'
+ - 'for_each_bench'
+ - 'for_each_bio'
+ - 'for_each_board_func_rsrc'
+ - 'for_each_btf_ext_rec'
+ - 'for_each_btf_ext_sec'
+ - 'for_each_bvec'
+ - 'for_each_capable_rdt_resource'
+ - 'for_each_card_auxs'
+ - 'for_each_card_auxs_safe'
+ - 'for_each_card_components'
+ - 'for_each_card_dapms'
+ - 'for_each_card_pre_auxs'
+ - 'for_each_card_prelinks'
+ - 'for_each_card_rtds'
+ - 'for_each_card_rtds_safe'
+ - 'for_each_card_widgets'
+ - 'for_each_card_widgets_safe'
+ - 'for_each_cgroup_storage_type'
+ - 'for_each_child_of_node'
+ - 'for_each_child_of_node_scoped'
+ - 'for_each_child_of_node_with_prefix'
+ - 'for_each_clear_bit'
+ - 'for_each_clear_bit_from'
+ - 'for_each_clear_bitrange'
+ - 'for_each_clear_bitrange_from'
+ - 'for_each_cmd'
+ - 'for_each_cmsghdr'
+ - 'for_each_collection'
+ - 'for_each_comp_order'
+ - 'for_each_compatible_node'
+ - 'for_each_compatible_node_scoped'
+ - 'for_each_component_dais'
+ - 'for_each_component_dais_safe'
+ - 'for_each_conduit'
+ - 'for_each_console'
+ - 'for_each_console_srcu'
+ - 'for_each_cpu'
+ - 'for_each_cpu_and'
+ - 'for_each_cpu_andnot'
+ - 'for_each_cpu_from'
+ - 'for_each_cpu_or'
+ - 'for_each_cpu_wrap'
+ - 'for_each_dapm_widgets'
+ - 'for_each_dedup_cand'
+ - 'for_each_dev_addr'
+ - 'for_each_dev_scope'
+ - 'for_each_dma_cap_mask'
+ - 'for_each_dpcm_be'
+ - 'for_each_dpcm_be_rollback'
+ - 'for_each_dpcm_be_safe'
+ - 'for_each_dpcm_fe'
+ - 'for_each_drhd_unit'
+ - 'for_each_dss_dev'
+ - 'for_each_efi_memory_desc'
+ - 'for_each_efi_memory_desc_in_map'
+ - 'for_each_element'
+ - 'for_each_element_extid'
+ - 'for_each_element_id'
+ - 'for_each_enabled_cpu'
+ - 'for_each_endpoint_of_node'
+ - 'for_each_event'
+ - 'for_each_event_tps'
+ - 'for_each_evictable_lru'
+ - 'for_each_fib6_node_rt_rcu'
+ - 'for_each_fib6_walker_rt'
+ - 'for_each_file_lock'
+ - 'for_each_free_mem_range'
+ - 'for_each_free_mem_range_reverse'
+ - 'for_each_func_rsrc'
+ - 'for_each_gpiochip_node'
+ - 'for_each_group_evsel'
+ - 'for_each_group_evsel_head'
+ - 'for_each_group_member'
+ - 'for_each_group_member_head'
+ - 'for_each_hstate'
+ - 'for_each_hwgpio'
+ - 'for_each_hwgpio_in_range'
+ - 'for_each_if'
+ - 'for_each_inject_fn'
+ - 'for_each_insn'
+ - 'for_each_insn_op_loc'
+ - 'for_each_insn_prefix'
+ - 'for_each_intid'
+ - 'for_each_iommu'
+ - 'for_each_ip_tunnel_rcu'
+ - 'for_each_irq_desc'
+ - 'for_each_irq_nr'
+ - 'for_each_lang'
+ - 'for_each_link_ch_maps'
+ - 'for_each_link_codecs'
+ - 'for_each_link_cpus'
+ - 'for_each_link_platforms'
+ - 'for_each_lru'
+ - 'for_each_matching_node'
+ - 'for_each_matching_node_and_match'
+ - 'for_each_media_entity_data_link'
+ - 'for_each_mem_pfn_range'
+ - 'for_each_mem_range'
+ - 'for_each_mem_range_rev'
+ - 'for_each_mem_region'
+ - 'for_each_member'
+ - 'for_each_memory'
+ - 'for_each_migratetype_order'
+ - 'for_each_missing_reg'
+ - 'for_each_mle_subelement'
+ - 'for_each_mod_mem_type'
+ - 'for_each_mon_capable_rdt_resource'
+ - 'for_each_mp_bvec'
+ - 'for_each_net'
+ - 'for_each_net_continue_reverse'
+ - 'for_each_net_rcu'
+ - 'for_each_netdev'
+ - 'for_each_netdev_continue'
+ - 'for_each_netdev_continue_rcu'
+ - 'for_each_netdev_continue_reverse'
+ - 'for_each_netdev_dump'
+ - 'for_each_netdev_feature'
+ - 'for_each_netdev_in_bond_rcu'
+ - 'for_each_netdev_rcu'
+ - 'for_each_netdev_reverse'
+ - 'for_each_netdev_safe'
+ - 'for_each_new_connector_in_state'
+ - 'for_each_new_crtc_in_state'
+ - 'for_each_new_mst_mgr_in_state'
+ - 'for_each_new_plane_in_state'
+ - 'for_each_new_plane_in_state_reverse'
+ - 'for_each_new_private_obj_in_state'
+ - 'for_each_new_reg'
+ - 'for_each_nhlt_endpoint'
+ - 'for_each_nhlt_endpoint_fmtcfg'
+ - 'for_each_nhlt_fmtcfg'
+ - 'for_each_node'
+ - 'for_each_node_by_name'
+ - 'for_each_node_by_type'
+ - 'for_each_node_mask'
+ - 'for_each_node_numadist'
+ - 'for_each_node_state'
+ - 'for_each_node_with_cpus'
+ - 'for_each_node_with_property'
+ - 'for_each_nonreserved_multicast_dest_pgid'
+ - 'for_each_numa_hop_mask'
+ - 'for_each_of_allnodes'
+ - 'for_each_of_allnodes_from'
+ - 'for_each_of_cpu_node'
+ - 'for_each_of_graph_port'
+ - 'for_each_of_graph_port_endpoint'
+ - 'for_each_of_pci_range'
+ - 'for_each_old_connector_in_state'
+ - 'for_each_old_crtc_in_state'
+ - 'for_each_old_mst_mgr_in_state'
+ - 'for_each_old_plane_in_state'
+ - 'for_each_old_private_obj_in_state'
+ - 'for_each_oldnew_connector_in_state'
+ - 'for_each_oldnew_crtc_in_state'
+ - 'for_each_oldnew_mst_mgr_in_state'
+ - 'for_each_oldnew_plane_in_state'
+ - 'for_each_oldnew_plane_in_state_reverse'
+ - 'for_each_oldnew_private_obj_in_state'
+ - 'for_each_online_cpu'
+ - 'for_each_online_cpu_wrap'
+ - 'for_each_online_node'
+ - 'for_each_online_pgdat'
+ - 'for_each_or_bit'
+ - 'for_each_page_ext'
+ - 'for_each_path'
+ - 'for_each_pci_bridge'
+ - 'for_each_pci_dev'
+ - 'for_each_pcm_streams'
+ - 'for_each_physmem_range'
+ - 'for_each_populated_zone'
+ - 'for_each_possible_cpu'
+ - 'for_each_possible_cpu_wrap'
+ - 'for_each_present_blessed_reg'
+ - 'for_each_present_cpu'
+ - 'for_each_present_section_nr'
+ - 'for_each_prime_number'
+ - 'for_each_prime_number_from'
+ - 'for_each_probe_cache_entry'
+ - 'for_each_process'
+ - 'for_each_process_thread'
+ - 'for_each_prop_codec_conf'
+ - 'for_each_prop_dai_codec'
+ - 'for_each_prop_dai_cpu'
+ - 'for_each_prop_dlc_codecs'
+ - 'for_each_prop_dlc_cpus'
+ - 'for_each_prop_dlc_platforms'
+ - 'for_each_property_of_node'
+ - 'for_each_pt_level_entry'
+ - 'for_each_rdt_resource'
+ - 'for_each_reg'
+ - 'for_each_reg_filtered'
+ - 'for_each_reloc'
+ - 'for_each_reloc_from'
+ - 'for_each_requested_gpio'
+ - 'for_each_requested_gpio_in_range'
+ - 'for_each_reserved_child_of_node'
+ - 'for_each_reserved_mem_range'
+ - 'for_each_reserved_mem_region'
+ - 'for_each_rtd_ch_maps'
+ - 'for_each_rtd_codec_dais'
+ - 'for_each_rtd_components'
+ - 'for_each_rtd_cpu_dais'
+ - 'for_each_rtd_dais'
+ - 'for_each_rtd_dais_reverse'
+ - 'for_each_sband_iftype_data'
+ - 'for_each_script'
+ - 'for_each_sec'
+ - 'for_each_set_bit'
+ - 'for_each_set_bit_from'
+ - 'for_each_set_bit_wrap'
+ - 'for_each_set_bitrange'
+ - 'for_each_set_bitrange_from'
+ - 'for_each_set_clump8'
+ - 'for_each_sg'
+ - 'for_each_sg_dma_page'
+ - 'for_each_sg_page'
+ - 'for_each_sgtable_dma_page'
+ - 'for_each_sgtable_dma_sg'
+ - 'for_each_sgtable_page'
+ - 'for_each_sgtable_sg'
+ - 'for_each_sibling_event'
+ - 'for_each_sta_active_link'
+ - 'for_each_subelement'
+ - 'for_each_subelement_extid'
+ - 'for_each_subelement_id'
+ - 'for_each_sublist'
+ - 'for_each_subsystem'
+ - 'for_each_suite'
+ - 'for_each_supported_activate_fn'
+ - 'for_each_supported_inject_fn'
+ - 'for_each_sym'
+ - 'for_each_thread'
+ - 'for_each_token'
+ - 'for_each_unicast_dest_pgid'
+ - 'for_each_valid_link'
+ - 'for_each_vif_active_link'
+ - 'for_each_vma'
+ - 'for_each_vma_range'
+ - 'for_each_vsi'
+ - 'for_each_wakeup_source'
+ - 'for_each_zone'
+ - 'for_each_zone_zonelist'
+ - 'for_each_zone_zonelist_nodemask'
+ - 'func_for_each_insn'
+ - 'fwnode_for_each_available_child_node'
+ - 'fwnode_for_each_child_node'
+ - 'fwnode_for_each_parent_node'
+ - 'fwnode_graph_for_each_endpoint'
+ - 'gadget_for_each_ep'
+ - 'genradix_for_each'
+ - 'genradix_for_each_from'
+ - 'genradix_for_each_reverse'
+ - 'hash_for_each'
+ - 'hash_for_each_possible'
+ - 'hash_for_each_possible_rcu'
+ - 'hash_for_each_possible_rcu_notrace'
+ - 'hash_for_each_possible_safe'
+ - 'hash_for_each_rcu'
+ - 'hash_for_each_safe'
+ - 'hashmap__for_each_entry'
+ - 'hashmap__for_each_entry_safe'
+ - 'hashmap__for_each_key_entry'
+ - 'hashmap__for_each_key_entry_safe'
+ - 'hctx_for_each_ctx'
+ - 'hists__for_each_format'
+ - 'hists__for_each_sort_list'
+ - 'hlist_bl_for_each_entry'
+ - 'hlist_bl_for_each_entry_rcu'
+ - 'hlist_bl_for_each_entry_safe'
+ - 'hlist_for_each'
+ - 'hlist_for_each_entry'
+ - 'hlist_for_each_entry_continue'
+ - 'hlist_for_each_entry_continue_rcu'
+ - 'hlist_for_each_entry_continue_rcu_bh'
+ - 'hlist_for_each_entry_from'
+ - 'hlist_for_each_entry_from_rcu'
+ - 'hlist_for_each_entry_rcu'
+ - 'hlist_for_each_entry_rcu_bh'
+ - 'hlist_for_each_entry_rcu_notrace'
+ - 'hlist_for_each_entry_safe'
+ - 'hlist_for_each_entry_srcu'
+ - 'hlist_for_each_safe'
+ - 'hlist_nulls_for_each_entry'
+ - 'hlist_nulls_for_each_entry_from'
+ - 'hlist_nulls_for_each_entry_rcu'
+ - 'hlist_nulls_for_each_entry_safe'
+ - 'i3c_bus_for_each_i2cdev'
+ - 'i3c_bus_for_each_i3cdev'
+ - 'idr_for_each_entry'
+ - 'idr_for_each_entry_continue'
+ - 'idr_for_each_entry_continue_ul'
+ - 'idr_for_each_entry_ul'
+ - 'iio_for_each_active_channel'
+ - 'in_dev_for_each_ifa_rcu'
+ - 'in_dev_for_each_ifa_rtnl'
+ - 'in_dev_for_each_ifa_rtnl_net'
+ - 'inet_bind_bucket_for_each'
+ - 'interval_tree_for_each_span'
+ - 'intlist__for_each_entry'
+ - 'intlist__for_each_entry_safe'
+ - 'kcore_copy__for_each_phdr'
+ - 'key_for_each'
+ - 'key_for_each_safe'
+ - 'klp_for_each_func'
+ - 'klp_for_each_func_safe'
+ - 'klp_for_each_func_static'
+ - 'klp_for_each_object'
+ - 'klp_for_each_object_safe'
+ - 'klp_for_each_object_static'
+ - 'kunit_suite_for_each_test_case'
+ - 'kvm_for_each_memslot'
+ - 'kvm_for_each_memslot_in_gfn_range'
+ - 'kvm_for_each_vcpu'
+ - 'libbpf_nla_for_each_attr'
+ - 'list_for_each'
+ - 'list_for_each_codec'
+ - 'list_for_each_codec_safe'
+ - 'list_for_each_continue'
+ - 'list_for_each_entry'
+ - 'list_for_each_entry_continue'
+ - 'list_for_each_entry_continue_rcu'
+ - 'list_for_each_entry_continue_reverse'
+ - 'list_for_each_entry_from'
+ - 'list_for_each_entry_from_rcu'
+ - 'list_for_each_entry_from_reverse'
+ - 'list_for_each_entry_lockless'
+ - 'list_for_each_entry_rcu'
+ - 'list_for_each_entry_reverse'
+ - 'list_for_each_entry_safe'
+ - 'list_for_each_entry_safe_continue'
+ - 'list_for_each_entry_safe_from'
+ - 'list_for_each_entry_safe_reverse'
+ - 'list_for_each_entry_srcu'
+ - 'list_for_each_from'
+ - 'list_for_each_prev'
+ - 'list_for_each_prev_safe'
+ - 'list_for_each_rcu'
+ - 'list_for_each_safe'
+ - 'llist_for_each'
+ - 'llist_for_each_entry'
+ - 'llist_for_each_entry_safe'
+ - 'llist_for_each_safe'
+ - 'lwq_for_each_safe'
+ - 'map__for_each_symbol'
+ - 'map__for_each_symbol_by_name'
+ - 'mas_for_each'
+ - 'mas_for_each_rev'
+ - 'mci_for_each_dimm'
+ - 'media_device_for_each_entity'
+ - 'media_device_for_each_intf'
+ - 'media_device_for_each_link'
+ - 'media_device_for_each_pad'
+ - 'media_entity_for_each_pad'
+ - 'media_pipeline_for_each_entity'
+ - 'media_pipeline_for_each_pad'
+ - 'mlx5_lag_for_each_peer_mdev'
+ - 'mptcp_for_each_subflow'
+ - 'msi_domain_for_each_desc'
+ - 'msi_for_each_desc'
+ - 'mt_for_each'
+ - 'nanddev_io_for_each_block'
+ - 'nanddev_io_for_each_page'
+ - 'neigh_for_each_in_bucket'
+ - 'neigh_for_each_in_bucket_rcu'
+ - 'neigh_for_each_in_bucket_safe'
+ - 'netdev_for_each_lower_dev'
+ - 'netdev_for_each_lower_private'
+ - 'netdev_for_each_lower_private_rcu'
+ - 'netdev_for_each_mc_addr'
+ - 'netdev_for_each_synced_mc_addr'
+ - 'netdev_for_each_synced_uc_addr'
+ - 'netdev_for_each_uc_addr'
+ - 'netdev_for_each_upper_dev_rcu'
+ - 'netdev_hw_addr_list_for_each'
+ - 'nft_rule_for_each_expr'
+ - 'nla_for_each_attr'
+ - 'nla_for_each_attr_type'
+ - 'nla_for_each_nested'
+ - 'nla_for_each_nested_type'
+ - 'nlmsg_for_each_attr'
+ - 'nlmsg_for_each_msg'
+ - 'nr_neigh_for_each'
+ - 'nr_neigh_for_each_safe'
+ - 'nr_node_for_each'
+ - 'nr_node_for_each_safe'
+ - 'of_for_each_phandle'
+ - 'of_property_for_each_string'
+ - 'of_property_for_each_u32'
+ - 'pci_bus_for_each_resource'
+ - 'pci_dev_for_each_resource'
+ - 'pcl_for_each_chunk'
+ - 'pcl_for_each_segment'
+ - 'pcm_for_each_format'
+ - 'perf_config_items__for_each_entry'
+ - 'perf_config_sections__for_each_entry'
+ - 'perf_config_set__for_each_entry'
+ - 'perf_cpu_map__for_each_cpu'
+ - 'perf_cpu_map__for_each_cpu_skip_any'
+ - 'perf_cpu_map__for_each_idx'
+ - 'perf_evlist__for_each_entry'
+ - 'perf_evlist__for_each_entry_reverse'
+ - 'perf_evlist__for_each_entry_safe'
+ - 'perf_evlist__for_each_evsel'
+ - 'perf_evlist__for_each_mmap'
+ - 'perf_evsel_for_each_per_thread_period_safe'
+ - 'perf_hpp_list__for_each_format'
+ - 'perf_hpp_list__for_each_format_safe'
+ - 'perf_hpp_list__for_each_sort_list'
+ - 'perf_hpp_list__for_each_sort_list_safe'
+ - 'plist_for_each'
+ - 'plist_for_each_continue'
+ - 'plist_for_each_entry'
+ - 'plist_for_each_entry_continue'
+ - 'plist_for_each_entry_safe'
+ - 'plist_for_each_safe'
+ - 'pnp_for_each_card'
+ - 'pnp_for_each_dev'
+ - 'protocol_for_each_card'
+ - 'protocol_for_each_dev'
+ - 'queue_for_each_hw_ctx'
+ - 'radix_tree_for_each_slot'
+ - 'radix_tree_for_each_tagged'
+ - 'rb_for_each'
+ - 'rbtree_postorder_for_each_entry_safe'
+ - 'rdma_for_each_block'
+ - 'rdma_for_each_port'
+ - 'rdma_umem_for_each_dma_block'
+ - 'resource_list_for_each_entry'
+ - 'resource_list_for_each_entry_safe'
+ - 'rhl_for_each_entry_rcu'
+ - 'rhl_for_each_rcu'
+ - 'rht_for_each'
+ - 'rht_for_each_entry'
+ - 'rht_for_each_entry_from'
+ - 'rht_for_each_entry_rcu'
+ - 'rht_for_each_entry_rcu_from'
+ - 'rht_for_each_entry_safe'
+ - 'rht_for_each_from'
+ - 'rht_for_each_rcu'
+ - 'rht_for_each_rcu_from'
+ - 'rq_for_each_bvec'
+ - 'rq_for_each_segment'
+ - 'rq_list_for_each'
+ - 'rq_list_for_each_safe'
+ - 'sample_read_group__for_each'
+ - 'scsi_for_each_prot_sg'
+ - 'scsi_for_each_sg'
+ - 'sctp_for_each_hentry'
+ - 'sctp_skb_for_each'
+ - 'sec_for_each_insn'
+ - 'sec_for_each_insn_continue'
+ - 'sec_for_each_insn_from'
+ - 'sec_for_each_sym'
+ - 'shdma_for_each_chan'
+ - 'shost_for_each_device'
+ - 'sk_for_each'
+ - 'sk_for_each_bound'
+ - 'sk_for_each_bound_safe'
+ - 'sk_for_each_entry_offset_rcu'
+ - 'sk_for_each_from'
+ - 'sk_for_each_rcu'
+ - 'sk_for_each_safe'
+ - 'sk_nulls_for_each'
+ - 'sk_nulls_for_each_from'
+ - 'sk_nulls_for_each_rcu'
+ - 'snd_array_for_each'
+ - 'snd_pcm_group_for_each_entry'
+ - 'snd_soc_dapm_widget_for_each_path'
+ - 'snd_soc_dapm_widget_for_each_path_safe'
+ - 'snd_soc_dapm_widget_for_each_sink_path'
+ - 'snd_soc_dapm_widget_for_each_source_path'
+ - 'sparsebit_for_each_set_range'
+ - 'strlist__for_each_entry'
+ - 'strlist__for_each_entry_safe'
+ - 'sym_for_each_insn'
+ - 'sym_for_each_insn_continue_reverse'
+ - 'symbols__for_each_entry'
+ - 'tb_property_for_each'
+ - 'tcf_act_for_each_action'
+ - 'tcf_exts_for_each_action'
+ - 'test_suite__for_each_test_case'
+ - 'tool_pmu__for_each_event'
+ - 'ttm_bo_lru_for_each_reserved_guarded'
+ - 'ttm_resource_manager_for_each_res'
+ - 'udp_lrpa_for_each_entry_rcu'
+ - 'udp_portaddr_for_each_entry'
+ - 'udp_portaddr_for_each_entry_rcu'
+ - 'usb_hub_for_each_child'
+ - 'v4l2_device_for_each_subdev'
+ - 'v4l2_m2m_for_each_dst_buf'
+ - 'v4l2_m2m_for_each_dst_buf_safe'
+ - 'v4l2_m2m_for_each_src_buf'
+ - 'v4l2_m2m_for_each_src_buf_safe'
+ - 'virtio_device_for_each_vq'
+ - 'vkms_config_for_each_connector'
+ - 'vkms_config_for_each_crtc'
+ - 'vkms_config_for_each_encoder'
+ - 'vkms_config_for_each_plane'
+ - 'vkms_config_connector_for_each_possible_encoder'
+ - 'vkms_config_encoder_for_each_possible_crtc'
+ - 'vkms_config_plane_for_each_possible_crtc'
+ - 'while_for_each_ftrace_op'
+ - 'workloads__for_each'
+ - 'xa_for_each'
+ - 'xa_for_each_marked'
+ - 'xa_for_each_range'
+ - 'xa_for_each_start'
+ - 'xas_for_each'
+ - 'xas_for_each_conflict'
+ - 'xas_for_each_marked'
+ - 'xbc_array_for_each_value'
+ - 'xbc_for_each_key_value'
+ - 'xbc_node_for_each_array_value'
+ - 'xbc_node_for_each_child'
+ - 'xbc_node_for_each_key_value'
+ - 'xbc_node_for_each_subkey'
+ - 'ynl_attr_for_each'
+ - 'ynl_attr_for_each_nested'
+ - 'ynl_attr_for_each_payload'
+ - 'zorro_for_each_dev'
+ - 'zpci_bus_for_each'
+
+IncludeBlocks: Preserve
+IncludeCategories:
+ - Regex: '.*'
+ Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentGotoLabels: false
+IndentPPDirectives: None
+IndentWidth: 8
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+PenaltyBreakAssignment: 10
+PenaltyBreakBeforeFirstCallParameter: 30
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatementsExceptForEachMacros
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 8
+UseTab: Always
+...
diff --git a/marufs_kernel/.gitignore b/marufs_kernel/.gitignore
new file mode 100644
index 0000000..5e1bdb0
--- /dev/null
+++ b/marufs_kernel/.gitignore
@@ -0,0 +1,27 @@
+# Kernel build artifacts
+*.ko
+*.o
+*.mod
+*.mod.c
+*.order
+*.symvers
+.*.cmd
+Module.symvers
+modules.order
+
+# Test binaries
+tests/bench_name_ref
+tests/test_chown_race
+tests/test_cross_process
+tests/test_dupname
+tests/test_ioctl
+tests/test_mmap
+tests/test_mmap_cuda
+tests/test_overlap
+tests/test_mmap_notrunc
+tests/test_negative
+tests/test_nrht_race
+tests/test_gc_deleg
+tests/test_pid_reuse
+tests/test_postexec_attack
+tests/dax_zero
diff --git a/marufs_kernel/LICENSE b/marufs_kernel/LICENSE
new file mode 100644
index 0000000..9efa6fb
--- /dev/null
+++ b/marufs_kernel/LICENSE
@@ -0,0 +1,338 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+
+ Copyright (C)
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, see .
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ , 1 April 1989
+ Moe Ghoul, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/marufs_kernel/Makefile b/marufs_kernel/Makefile
new file mode 100644
index 0000000..ae40ba6
--- /dev/null
+++ b/marufs_kernel/Makefile
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# Makefile for MARUFS - WORM filesystem for CXL shared memory
+#
+# Partitioned Global Index architecture
+#
+# Build artifacts are collected in the 'build/' directory
+#
+
+# Module name configuration
+# Change this to avoid conflicts with existing modules
+MODULE_NAME := marufs
+
+obj-m := $(MODULE_NAME).o
+
+# Source files (from src/ directory)
+$(MODULE_NAME)-objs := src/super.o src/inode.o src/file.o src/dir.o src/region.o src/index.o src/nrht.o src/acl.o src/cache.o src/sysfs.o src/sysfs_me.o src/sysfs_gc.o src/sysfs_nrht.o src/sysfs_debug.o src/gc.o src/me.o src/me_order.o src/me_request.o src/bootstrap.o
+
+# Kernel build directory (default to running kernel)
+KDIR ?= /lib/modules/$(shell uname -r)/build
+
+# Pass module name as compiler definition
+ccflags-y += -DMARUFS_MODULE_NAME=\"$(MODULE_NAME)\"
+
+# Include path for marufs_uapi.h (shared kernel/userspace header)
+ccflags-y += -I$(src)/include
+
+# CXL 2.0 compatibility: explicit cache flush for cross-host visibility
+ccflags-y += -DCONFIG_MARUFS_CXL2_COMPAT
+
+# Build output directory
+BUILD_DIR := build
+
+# Current directory (use CURDIR — immune to stale PWD env var)
+PWD := $(CURDIR)
+
+# Default target: build the module
+all: collect-artifacts
+
+# Create build directory
+prepare:
+ @mkdir -p $(BUILD_DIR)
+
+# Build kernel module in-place (depends on prepare)
+build-module: prepare
+ $(MAKE) -C $(KDIR) M=$(PWD) modules
+
+# Collect all build artifacts to build/ and clean source dirs (depends on build)
+collect-artifacts: build-module
+ @echo "Collecting build artifacts to $(BUILD_DIR)/"
+ @mkdir -p $(BUILD_DIR)
+ @cp -f *.o *.ko *.mod* .*.o .*.cmd Module.symvers modules.order $(BUILD_DIR)/ 2>/dev/null || true
+ @cp -f src/*.o src/.*.cmd $(BUILD_DIR)/ 2>/dev/null || true
+ @rm -f *.o *.ko *.mod *.mod.c *.mod.o .*.o .*.cmd Module.symvers modules.order 2>/dev/null || true
+ @rm -f src/*.o src/.*.cmd 2>/dev/null || true
+ @echo "Build artifacts collected in $(BUILD_DIR)/"
+
+# Clean build artifacts
+clean:
+ $(MAKE) -C $(KDIR) M=$(PWD) clean
+ @sh -c 'rm -f *.o *.ko *.mod* .*.cmd Module.symvers modules.order 2>/dev/null; true'
+ @sh -c 'rm -f src/*.o src/.*.cmd 2>/dev/null; true'
+ @rm -rf $(BUILD_DIR)
+ @echo "✓ Build artifacts cleaned"
+
+# Install the module
+install: all
+ sudo insmod $(BUILD_DIR)/$(MODULE_NAME).ko
+
+# Load the module
+load: all
+ sudo insmod $(BUILD_DIR)/$(MODULE_NAME).ko
+
+# Unload the module
+unload:
+ sudo rmmod marufs
+
+# Reload the module
+reload: unload load
+
+# Show module info
+info: all
+ modinfo $(BUILD_DIR)/$(MODULE_NAME).ko
+
+# Check for coding style issues (requires kernel checkpatch.pl)
+check:
+ $(KDIR)/scripts/checkpatch.pl --no-tree -f *.c *.h
+
+.PHONY: all clean install load unload reload info check
diff --git a/marufs_kernel/README.md b/marufs_kernel/README.md
new file mode 100644
index 0000000..7e21b65
--- /dev/null
+++ b/marufs_kernel/README.md
@@ -0,0 +1,192 @@
+# marufs kernel module
+
+Linux kernel filesystem module for CXL shared memory. Provides per-region access control via VFS.
+
+## Build & Install
+
+```bash
+sudo ./install.sh # build + load module
+sudo ./install.sh --mount /dev/dax6.0 --format # build + load + format + mount
+sudo ./uninstall.sh # unmount + unload module
+```
+
+## Auto-load on Boot
+
+```bash
+sudo ./setup-autoload.sh # module auto-load only
+sudo ./setup-autoload.sh --mount /dev/dax6.0 # + auto-mount at boot
+sudo ./setup-autoload.sh --status # check current config
+sudo ./setup-autoload.sh --uninstall # remove all config
+```
+
+## Tests
+
+Tests require a CXL DAX device.
+
+```bash
+# setup → test → teardown
+sudo ./tests/setup_local_multinode.sh --teardown
+sudo ./tests/setup_local_multinode.sh --device /dev/dax6.0
+sudo ./tests/setup_local_multinode.sh --status
+sudo ./tests/test_local_multinode.sh --no-cleanup --skip-setup
+sudo ./tests/setup_local_multinode.sh --teardown
+```
+
+Individual test binaries (built automatically by the test suite):
+
+| Binary | Description |
+|--------|-------------|
+| `test_ioctl` | Two-phase create, name-ref, permission delegation |
+| `test_mmap` | mmap data integrity (single + cross-node) |
+| `test_mmap_cuda` | mmap permission + cudaHostRegister (requires CUDA) |
+| `test_cross_process` | Cross-process create/truncate/mmap/unlink visibility |
+| `test_chown_race` | CHOWN concurrency and race condition tests |
+| `test_overlap` | Concurrent ftruncate physical overlap check |
+
+## Documentation
+
+Architecture docs are in `docs/`:
+
+| Document | Description |
+|----------|-------------|
+| [0_user_guide](docs/0_user_guide.md) | User-facing flow: multi-node mount, region lifecycle, name-ref sharing, security model (vLLM/LMCache scenario) |
+| [1_arch_metadata_layout](docs/1_arch_metadata_layout.md) | CXL memory layout, superblock/shard/RAT/NRHT structs |
+| [2_arch_entry_lifecycle](docs/2_arch_entry_lifecycle.md) | State machines for index, RAT, delegation entries |
+| [3_arch_gc](docs/3_arch_gc.md) | GC thread: tombstone sweep, dead process reclaim, orphan tracking |
+| [4_arch_nrht](docs/4_arch_nrht.md) | NRHT (Name-Ref Hash Table) structure and operations |
+| [5_arch_acl](docs/5_arch_acl.md) | Permission model: owner/default_perms/delegation |
+| [6_arch_mount_io](docs/6_arch_mount_io.md) | Mount/unmount flow, read/write/mmap I/O paths |
+
+---
+
+## Userspace API Guide
+
+### Mount Options
+
+```bash
+# Or use install.sh (recommended):
+sudo ./install.sh --mount /dev/dax0.0 --format # node_id=1 (default)
+sudo ./install.sh --mount /dev/dax0.0 --node-id 2 # second node
+
+# Manual mount (device arg is ignored — daxdev is passed via -o):
+sudo mount -t marufs -o node_id=1,daxdev=/dev/dax0.0,format none /mnt/marufs
+sudo mount -t marufs -o node_id=2,daxdev=/dev/dax0.0 none /mnt/marufs2
+```
+
+| Option | Description |
+|--------|-------------|
+| `node_id=N` | Node identifier for this mount (required, N > 0) |
+| `daxdev=/dev/daxX.Y` | DEV_DAX device path |
+| `format` | Initialize CXL memory. Use only on the first mount |
+
+### File Lifecycle
+
+```c
+#include "marufs_uapi.h" // include/marufs_uapi.h
+
+// Phase 1: Reserve metadata slot (no physical space)
+int fd = open("/mnt/marufs/my_region", O_CREAT | O_RDWR, 0644);
+
+// Phase 2: Allocate physical region (rounded up to 2MB)
+ftruncate(fd, 128 * 1024 * 1024); // 128MB
+
+// Phase 3: Access data via mmap (zero-copy)
+void *map = mmap(NULL, 128*1024*1024, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+// Option A: CPU write — requires sfence to flush WC buffers
+memcpy(map, src_data, data_size);
+__builtin_ia32_sfence();
+
+// Option B: GPU write — CUDA handles coherence, no sfence needed
+cudaHostRegister(map, 128*1024*1024, cudaHostRegisterDefault);
+cudaMemcpy(map, device_ptr, data_size, cudaMemcpyDeviceToHost);
+
+// Phase 4: Grant access to other nodes via ioctl
+// Option A: Set default permissions (applies to all non-owners)
+struct marufs_perm_req preq = {0};
+preq.perms = MARUFS_PERM_READ | MARUFS_PERM_WRITE;
+ioctl(fd, MARUFS_IOC_PERM_SET_DEFAULT, &preq);
+
+// Option B: Grant specific permissions to a (node_id, pid) pair
+struct marufs_perm_req greq = {0};
+greq.node_id = 2;
+greq.pid = 12345;
+greq.perms = MARUFS_PERM_READ | MARUFS_PERM_WRITE;
+ioctl(fd, MARUFS_IOC_PERM_GRANT, &greq);
+
+// Phase 5: Cleanup
+munmap(map, 128*1024*1024);
+close(fd);
+// To explicitly delete the region, call unlink() after close().
+// Otherwise, the region is automatically reclaimed by GC after the owner process exits.
+unlink("/mnt/marufs/my_region");
+```
+
+- Region size is immutable: a second `ftruncate()` on a file with size > 0 returns `-EACCES`
+- `ftruncate(fd, 0)` is a no-op (does not enter Phase 2)
+- Data writes are only possible via `mmap(PROT_WRITE)` — the `write()` syscall always returns `-EACCES`
+- Up to 256 regions (files) can be created
+
+### Permission System
+
+marufs uses its own delegation system instead of POSIX file permissions.
+Permission checks occur not at `open()` time, but at **actual data access** (mmap, read, ioctl).
+
+**Permission bits:**
+
+| Constant | Value | Meaning |
+|----------|-------|---------|
+| `MARUFS_PERM_READ` | 0x0001 | `read()`, `mmap(PROT_READ)` |
+| `MARUFS_PERM_WRITE` | 0x0002 | `mmap(PROT_WRITE)` |
+| `MARUFS_PERM_DELETE` | 0x0004 | `unlink()` |
+| `MARUFS_PERM_ADMIN` | 0x0008 | `chown`, `perm_set_default` |
+| `MARUFS_PERM_IOCTL` | 0x0010 | NRHT ioctls |
+| `MARUFS_PERM_GRANT` | 0x0020 | Delegate permissions to third parties (excluding ADMIN/GRANT) |
+
+**Check order:** Owner (all perms) → default_perms → delegation table → deny (`-EACCES`)
+
+**Cross-node access example:**
+
+```mermaid
+sequenceDiagram
+ participant N1 as Node 1 (owner)
+ participant CXL as CXL Memory
+ participant N2 as Node 2 (accessor)
+
+ N1->>CXL: open("region_a", O_CREAT)
+ N1->>CXL: ftruncate(fd, 128MB)
+
+ N2->>CXL: mmap(PROT_READ)
+ CXL-->>N2: EACCES (no permission)
+
+ N1->>CXL: ioctl(PERM_SET_DEFAULT, {perms=READ})
+
+ N2->>CXL: open("region_a", O_RDONLY)
+ N2->>CXL: mmap(PROT_READ)
+ CXL-->>N2: OK
+ N2->>CXL: mmap(PROT_WRITE)
+ CXL-->>N2: EACCES (READ only)
+
+ N1->>CXL: ioctl(PERM_GRANT, {node=2, pid=PID, perms=WRITE})
+
+ N2->>CXL: mmap(PROT_WRITE)
+ CXL-->>N2: OK
+```
+
+### ioctl Reference
+
+| Command | Direction | Struct | Required Permission |
+|---------|-----------|--------|---------------------|
+| `MARUFS_IOC_PERM_GRANT` | W | `marufs_perm_req` | ADMIN or GRANT |
+| `MARUFS_IOC_PERM_SET_DEFAULT` | W | `marufs_perm_req` | ADMIN |
+
+### Error Codes
+
+| Error | Context | Meaning |
+|-------|---------|---------|
+| `EACCES` | mmap, read, ioctl, unlink, ftruncate | Insufficient permissions or immutable region (second ftruncate) |
+| `ENODATA` | mmap | Phase 1 state (before ftruncate) |
+| `ENOSPC` | open(O_CREAT), PERM_GRANT | RAT entries or delegation table full |
+| `ENAMETOOLONG` | open(O_CREAT) | Filename exceeds 63 bytes |
+| `EINVAL` | ioctls | Invalid parameter |
+| `EAGAIN` | PERM_GRANT | CAS conflict (retry needed) |
diff --git a/marufs_kernel/docs/0_user_guide.md b/marufs_kernel/docs/0_user_guide.md
new file mode 100644
index 0000000..dd0be29
--- /dev/null
+++ b/marufs_kernel/docs/0_user_guide.md
@@ -0,0 +1,477 @@
+# MARUFS User Guide — vLLM / LMCache Scenario
+
+Audience: cluster admins bringing up marufs on multiple nodes, and
+application developers (e.g., LMCache backends) using marufs to share
+KV cache through CXL memory.
+
+This guide describes *what* happens from the user's point of view and
+*why* each step exists. For byte layouts, locking protocols, and GC
+internals, follow the links in each section to the architecture docs.
+
+---
+
+## 1. Actors and the mental model
+
+Three roles interact with a single CXL shared memory pool:
+
+| Role | Does | Example |
+|------|------|---------|
+| **Admin** | Formats the CXL device once, mounts marufs on every node | Cluster operator |
+| **Producer app** | Creates a region, writes KV data via `mmap`, publishes a name | vLLM prefill instance |
+| **Consumer app** | Looks up the name on another node, `mmap`s the same physical memory, reads zero-copy | vLLM decode instance |
+
+Key invariants the user can rely on:
+
+- All nodes see the same bytes — marufs is a filesystem view of shared
+ physical memory, not a replicated store. No data is copied between nodes.
+- Regions are the unit of allocation. A region has an owner (the node
+ that created it) and a size that is fixed after first `ftruncate()`.
+- Permissions live on the region, not on the file's POSIX mode. `open()`
+ always succeeds; the kernel denies at `mmap`/`read`/`ioctl` time.
+- **`O_CLOEXEC` is required** on every `open()` that will be used for
+ `mmap`/`read`/`ioctl`. The kernel returns `-EACCES` at first
+ data-access if the fd does not have `FD_CLOEXEC` set. This closes the
+ post-exec privilege retention class of attacks (an inherited fd
+ surviving an `execve` into hostile code).
+
+### Deployment topology
+
+Every node runs its own kernel with the marufs module loaded, and
+every node mounts the **same** CXL DAX device. There is no server,
+no replication daemon, no network path between nodes — coordination
+happens inside the CXL pool itself.
+
+Every node mounts at the same path (default `/mnt/marufs`; configurable
+per node with `--mount-point`).
+
+```mermaid
+flowchart TB
+ subgraph N1["Node 1 (node_id=1)"]
+ P1[vLLM worker] --> V1[VFS]
+ V1 --> M1[marufs.ko]
+ end
+ subgraph N2["Node 2 (node_id=2)"]
+ P2[vLLM worker] --> V2[VFS]
+ V2 --> M2[marufs.ko]
+ end
+ subgraph NN["Node N (node_id=N)"]
+ PN[vLLM worker] --> VN[VFS]
+ VN --> MN[marufs.ko]
+ end
+ M1 -- mmap --> CXL[(CXL shared memory
— regions + NRHT + metadata)]
+ M2 -- mmap --> CXL
+ MN -- mmap --> CXL
+```
+
+What to notice on this picture:
+
+- **Every kernel sees the same physical bytes.** The arrows to the
+ CXL pool are not network hops; they are load/store through the
+ DAX mapping.
+- **This generalizes.** One vLLM worker per node is the minimum; a
+ node can run many workers — they all share the single marufs
+ mount and are distinguished by PID. Delegations are
+ `(node_id, pid)`-scoped, so each process is addressable
+ independently.
+- **Node failure is isolated.** If Node 1 crashes, its owned regions
+ and delegations become reclaimable by GC on the surviving nodes;
+ Node 2 keeps operating on its mmap'd pointers.
+
+---
+
+## 2. Admin flow — bringing up a multi-node cluster
+
+The CXL pool is formatted **once** by the first node. Every other node
+mounts the same device without `format`.
+
+```bash
+# Node 1 (first boot, one-time format)
+sudo ./install.sh --mount /dev/dax0.0 --node-id 1 --format
+
+# Node 2, 3, ... (format is omitted)
+sudo ./install.sh --mount /dev/dax0.0 --node-id 2
+sudo ./install.sh --mount /dev/dax0.0 --node-id 3
+
+# Custom mount directory (default: /mnt/marufs)
+sudo ./install.sh --mount /dev/dax0.0 --node-id 2 --mount-point /data/marufs
+```
+
+Requirements:
+
+- Every node must use a **distinct `node_id`** (`N > 0`). The ID is
+ stamped into every region the node creates and is used for
+ cross-node permission targeting and crash detection.
+- The same physical DAX device must be exposed on every node (CXL
+ pool). Mount point paths can differ per node, but using an identical
+ path (e.g. `/mnt/marufs`) everywhere simplifies scripts that open
+ regions by absolute path.
+- No shared network filesystem, no ZooKeeper, no coordinator daemon —
+ coordination uses CXL memory itself.
+
+Key `install.sh` options:
+
+| Flag | Meaning |
+|------|---------|
+| `--mount ` | DAX device to bind (e.g. `/dev/dax0.0`) |
+| `--node-id ` | Unique node identifier, `N > 0` (default: `1`) |
+| `--mount-point ` | Filesystem mount directory (default: `/mnt/marufs`) |
+| `--format` | Initialize CXL memory. Only on the *first* node, *first* boot |
+
+Additional mount options (pass via `mount -o` or `/etc/fstab`):
+
+| Option | Meaning |
+|--------|---------|
+| `me_strategy=` | Cross-node write coordination strategy for NRHT (default: `request`). Per-NRHT override via `marufs_nrht_init_req.me_strategy`. |
+
+See: [6_arch_mount_io.md](6_arch_mount_io.md) for the on-mount
+discovery and bootstrapping logic.
+
+---
+
+## 3. Application flow — region lifecycle
+
+marufs regions are created in phases. Only the producer node goes
+through the full sequence; consumers skip to the mmap step.
+
+```
+Phase 1: reserve open(O_CREAT) → metadata slot only, no bytes
+Phase 2: allocate ftruncate(size) → physical region pinned (2MB aligned, rounded up)
+Phase 3: use mmap(...) / memcpy → zero-copy access
+Phase 4: grant ioctl(PERM_*) → grant cross-node access
+(Phase 5: publish) ioctl(NAME_OFFSET) → register a name in NRHT (optional, see §4)
+Phase 6: cleanup munmap + close + unlink (or wait for GC)
+```
+
+Mapped onto the deployment topology, each phase touches a different
+slice of CXL shared memory:
+
+```mermaid
+sequenceDiagram
+ participant P as vLLM worker
(Node 1, producer)
+ participant K1 as marufs.ko
(Node 1)
+ participant CXL as CXL shared memory
+ participant K2 as marufs.ko
(Node 2)
+ participant C as vLLM worker
(Node 2, consumer)
+
+ rect rgb(235, 245, 255)
+ Note over P,CXL: Phase 1 — reserve
+ P->>K1: open("/mnt/marufs/llama3-kv", O_CREAT)
+ K1->>CXL: allocate Index slot (no data bytes)
+ K1-->>P: fd (size=0)
+ end
+
+ rect rgb(235, 255, 240)
+ Note over P,CXL: Phase 2 — allocate
+ P->>K1: ftruncate(fd, 8 GiB)
+ K1->>CXL: reserve 2 MiB-aligned region in RAT
+ K1-->>P: OK (size fixed, immutable)
+ end
+
+ rect rgb(255, 250, 230)
+ Note over P,CXL: Phase 3 — use (zero-copy)
+ P->>K1: mmap(fd, PROT_RW)
+ K1-->>P: mapped pointer
+ P->>CXL: memcpy via pointer + sfence
+ end
+
+ rect rgb(255, 210, 210)
+ Note over CXL,C: Before grant — consumer access is denied
+ C->>K2: open(region, O_RDONLY) + mmap(PROT_READ)
+ K2->>CXL: check perms — no default, no delegation
+ CXL-->>K2: deny
+ K2-->>C: -EACCES
+ end
+
+ rect rgb(255, 235, 200)
+ Note over P,CXL: Phase 4 — grant (security)
+ alt Option A: broadcast to all non-owners
+ P->>K1: ioctl(PERM_SET_DEFAULT, READ)
+ K1->>CXL: write default_perms on RAT entry
+ else Option B: grant to specific (node_id, pid)
+ P->>K1: ioctl(PERM_GRANT, {node_id=2, pid=201, perms=RW})
+ K1->>CXL: insert delegation row + stamp pid birth_time
+ end
+ end
+
+ rect rgb(245, 235, 255)
+ Note over P,CXL: Phase 5 — publish name-ref (optional)
+ Note over P,K1: One-time NRHT setup (producer only, skip if already done)
+ P->>K1: open("/mnt/marufs/kv_nrht", O_CREAT)
+ K1-->>P: nrht_fd
+ P->>K1: ioctl(nrht_fd, NRHT_INIT, {max_entries, num_shards, num_buckets})
+ K1->>CXL: format NRHT region (hash table layout)
+ P->>K1: ioctl(nrht_fd, PERM_SET_DEFAULT, IOCTL)
+ Note over P,K1: Publish each name
+ P->>K1: ioctl(nrht_fd, NAME_OFFSET, "kv_blk_AAAA" → region_fd, offset)
+ K1->>CXL: insert into NRHT
+ end
+
+ rect rgb(230, 248, 250)
+ Note over CXL,C: Consumer path — Node 2 uses only NRHT lookup + mmap
+ C->>K2: open("/mnt/marufs/nrht_file")
+ K2-->>C: nrht_fd
+ C->>K2: ioctl(nrht_fd, FIND_NAME, "kv_blk_AAAA")
+ K2->>CXL: read NRHT
+ CXL-->>K2: (region_name, offset)
+ K2-->>C: region_name, offset
+ C->>K2: open(region_name, O_RDONLY) + mmap(PROT_READ)
+ K2->>CXL: check default_perms / delegation
+ CXL-->>K2: OK
+ K2-->>C: mapped pointer
+ C->>CXL: direct load at offset (zero-copy)
+ end
+
+ rect rgb(240, 240, 240)
+ Note over P,CXL: Phase 6 — cleanup
+ P->>K1: munmap + close + unlink (or exit)
+ K1->>CXL: mark Index TOMBSTONE, RAT DELETING
+ Note over CXL: GC reclaims later
+ end
+```
+
+Phase boundaries are meaningful:
+
+- `open(O_CREAT)` reserves only an index slot. Before `ftruncate()`,
+ `mmap()` returns `-ENODATA`.
+- The first successful `ftruncate()` fixes the size permanently. A
+ second `ftruncate()` with a non-zero size returns `-EACCES`.
+- `write()` is not supported — data must flow through `mmap`. This
+ enforces the zero-copy contract with CXL memory.
+- For CPU writes the application is responsible for `sfence` (WC
+ buffer flush) after `memcpy`. CUDA handles its own coherence for
+ GPU-originated writes.
+- No KV bytes traverse the network — only the `name → (region, offset)`
+ metadata is exchanged via ioctl; the data itself is a direct CXL load.
+- If a consumer also needs to write (e.g. store evicted blocks), the
+ producer issues `PERM_GRANT` for that `(node_id, pid)`, or sets
+ `default_perms = READ|WRITE` for broad write-back.
+- If the producer exits without `unlink`, GC reclaims the region after
+ owner-process death is detected. Consumers that still hold `mmap`
+ mappings keep working until they `munmap`.
+
+See: [2_arch_entry_lifecycle.md](2_arch_entry_lifecycle.md) for the
+per-state CAS transitions, and the README "Userspace API Guide" for
+a complete C code example.
+
+---
+
+## 4. Name-ref (NRHT) — publishing and peer lookup
+
+Regions are identified by POSIX filenames (`/mnt/marufs/my_region`).
+For LMCache-style workloads that share *ranges* of a big region
+(one region per model → many KV blocks inside it), marufs provides
+a name-ref table (NRHT) that maps an arbitrary string key to
+`(region, offset)`.
+
+```mermaid
+flowchart LR
+ subgraph NRHT["NRHT (name-ref table)"]
+ K1["'kv_blk_AAAA'"]
+ K2["'kv_blk_BBBB'"]
+ K3["'kv_blk_CCCC'"]
+ end
+ subgraph REGION["Region file: llama3-kv"]
+ R0["KV block 0
offset 0x00000"]
+ R1["KV block 1
offset 0x20000"]
+ R2["KV block 2
offset 0x40000"]
+ R3["KV block 3
offset 0x60000"]
+ R4["KV block 4
offset 0x80000"]
+ R5["KV block 5
offset 0xA0000"]
+ end
+ K1 -. "(llama3-kv, 0x00000)" .-> R0
+ K2 -. "(llama3-kv, 0x20000)" .-> R1
+ K3 -. "(llama3-kv, 0x80000)" .-> R4
+```
+
+A region is the unit of allocation (big, 2 MiB-aligned). An NRHT entry
+is just a named pointer into that region — many names can coexist
+inside one region. This is the core pattern LMCache uses: one region
+per model, N name-refs per block.
+
+### 4.1 One-time setup — creating the NRHT file
+
+An NRHT is itself a marufs region, but with a special internal layout
+(hash table instead of flat data). It must be created and formatted
+once by the producer (or an admin tool), then reused by every peer
+that wants to look up or publish names.
+
+```c
+/* Producer, one-time setup: create + format the NRHT region. */
+int nrht_fd = open("/mnt/marufs/kv_nrht", O_CREAT | O_RDWR | O_CLOEXEC, 0644);
+
+struct marufs_nrht_init_req init = {
+ .max_entries = 0, /* 0 → default (524288) */
+ .num_shards = 0, /* 0 → default (64, pow2) */
+ .num_buckets = 0, /* 0 → default (max_entries / 4) */
+ .me_strategy = 1, /* 0 = order-driven, 1 = request-driven (default). */
+};
+ioctl(nrht_fd, MARUFS_IOC_NRHT_INIT, &init); /* requires PERM_ADMIN */
+
+/* Optionally grant IOCTL perm to consumers so they can FIND_NAME. */
+struct marufs_perm_req preq = { .perms = MARUFS_PERM_IOCTL };
+ioctl(nrht_fd, MARUFS_IOC_PERM_SET_DEFAULT, &preq);
+```
+
+After this, every node just does `open("/mnt/marufs/kv_nrht")` to get
+an `nrht_fd`; no further init is needed (idempotent creation only
+happens the first time). `NRHT_INIT` is ADMIN-gated, so only the
+region's owner (or a delegated admin) can format it.
+
+Peers lazily join the NRHT's cross-node coordination ring on their
+first `NAME_OFFSET` / `FIND_NAME` call, which adds a one-time setup
+latency (~ms). Latency-sensitive consumers can pre-warm that path:
+
+```c
+/* Optional: join the NRHT ring up front. Idempotent — safe to call
+ * repeatedly; the first call bears the full join cost. */
+ioctl(nrht_fd, MARUFS_IOC_NRHT_JOIN);
+```
+
+### 4.2 Publishing and looking up names
+
+Producer-side publish (`nrht_fd` obtained via `open(nrht_path)`):
+
+```c
+struct marufs_name_offset_req req = {0};
+strncpy(req.name, "kv_blk_AAAA", sizeof(req.name) - 1);
+req.target_region_fd = region_fd;
+req.offset = kv_block_offset; // bytes within region data area
+ioctl(nrht_fd, MARUFS_IOC_NAME_OFFSET, &req);
+```
+
+Consumer-side lookup on another node (same `nrht_fd` step):
+
+```c
+struct marufs_find_name_req req = {0};
+strncpy(req.name, "kv_blk_AAAA", sizeof(req.name) - 1);
+if (ioctl(nrht_fd, MARUFS_IOC_FIND_NAME, &req) < 0) {
+ /* -ENOENT = name not published; -EACCES = no IOCTL perm */
+ return -errno;
+}
+/* On success the kernel fills:
+ * req.region_name — target region filename (no leading slash)
+ * req.offset — byte offset within that region's data area
+ */
+
+/* Step 2: open the region and mmap the byte range you need. */
+char path[PATH_MAX];
+snprintf(path, sizeof(path), "/mnt/marufs/%s", req.region_name);
+int region_fd = open(path, O_RDONLY | O_CLOEXEC);
+
+/* mmap offset must be page-aligned; round down and adjust if needed. */
+void *base = mmap(NULL, kv_block_size, PROT_READ, MAP_SHARED,
+ region_fd, req.offset);
+/* `base` is a direct pointer into CXL memory — zero-copy read.
+ * If the owner has not granted READ to this (node_id, pid),
+ * mmap returns MAP_FAILED with errno = EACCES (see §5).
+ */
+```
+
+Batch variants (`MARUFS_IOC_BATCH_FIND_NAME`, `MARUFS_IOC_BATCH_NAME_OFFSET`)
+process multiple entries per syscall — preferred for LMCache
+block-level access patterns.
+
+See: [4_arch_nrht.md](4_arch_nrht.md).
+
+### 4.3 Per-entry ref/pin counters
+
+Each NRHT entry carries two user-managed `__u32` counters — `ref_count`
+and `pin_count` — meant for application-defined eviction policies.
+Both reset to 0 on every fresh insert. Caller semantics:
+
+- **inc**: returns `-EOVERFLOW` if current value is `UINT32_MAX`.
+- **dec**: returns `-EINVAL` if current value is already 0.
+
+Each ioctl acquires the entry's NRHT shard ME for the read-modify-write,
+so concurrent `inc`/`dec` from any node serialize correctly.
+
+```c
+struct marufs_refcnt_req cr = {0};
+strncpy(cr.name, "kv_blk_AAAA", sizeof(cr.name) - 1);
+
+/* refcount on cache pin/release. */
+ioctl(nrht_fd, MARUFS_IOC_NRHT_REF_INC, &cr); /* cr.count = post-op value */
+ioctl(nrht_fd, MARUFS_IOC_NRHT_REF_DEC, &cr);
+
+/* pin to block eviction across a critical region. */
+ioctl(nrht_fd, MARUFS_IOC_NRHT_PIN_INC, &cr);
+/* ... mmap + read/write ... */
+ioctl(nrht_fd, MARUFS_IOC_NRHT_PIN_DEC, &cr);
+```
+
+`MARUFS_IOC_FIND_NAME` returns the current `ref_count` and `pin_count`
+alongside `offset` — no separate read ioctl needed:
+
+```c
+struct marufs_find_name_req fr = {0};
+strncpy(fr.name, "kv_blk_AAAA", sizeof(fr.name) - 1);
+ioctl(nrht_fd, MARUFS_IOC_FIND_NAME, &fr);
+/* fr.ref_count, fr.pin_count = snapshot (lock-free read, may be stale). */
+```
+
+---
+
+## 5. Security — who can read/write a region
+
+marufs uses a **delegation model**, not POSIX file bits. The check
+happens at data-access time (`mmap`, `read`, permission-gated ioctls),
+not at `open()`.
+
+In addition to the delegation check, every data-access entry point
+requires the calling fd to be marked `FD_CLOEXEC` (set via
+`open(..., O_CLOEXEC)` or `fcntl(fd, F_SETFD, FD_CLOEXEC)`). An fd
+without `FD_CLOEXEC` would survive `execve` and let post-exec code
+reach the region; the kernel returns `-EACCES` at `mmap`/`read`/`ioctl`
+to close that path regardless of the new exe identity.
+
+Owner (the node that created the region) always has full access.
+For everyone else, the kernel consults, in order:
+
+1. **Default permissions** — set once via `PERM_SET_DEFAULT`; applies
+ to every non-owner accessor.
+2. **Delegation table** — per `(node_id, pid)` grants via `PERM_GRANT`.
+3. **Deny** — returns `-EACCES`.
+
+```mermaid
+flowchart LR
+ Start([mmap / read / ioctl
on region R]) --> Q1{Caller is
R's owner?}
+ Q1 -- yes --> Allow([Allow — all perms])
+ Q1 -- no --> Q2{default_perms
covers this op?}
+ Q2 -- yes --> Allow
+ Q2 -- no --> Q3{Delegation for
node_id, pid
with this perm?}
+ Q3 -- yes --> Q4{target pid's
birth_time still
matches?}
+ Q4 -- yes --> Allow
+ Q4 -- no --> Deny([Deny — EACCES
stale PID, reaped by GC])
+ Q3 -- no --> Deny
+```
+
+Permission bits:
+
+| Bit | Syscall |
+|-----|---------|
+| `MARUFS_PERM_READ` | `mmap(PROT_READ)`, `read()` |
+| `MARUFS_PERM_WRITE` | `mmap(PROT_WRITE)` |
+| `MARUFS_PERM_DELETE` | `unlink()` |
+| `MARUFS_PERM_ADMIN` | `chown`, `PERM_SET_DEFAULT` |
+| `MARUFS_PERM_IOCTL` | NRHT ioctls |
+| `MARUFS_PERM_GRANT` | Delegate further (ADMIN/GRANT themselves cannot be re-delegated) |
+
+PID reuse is handled by stamping each delegation with the target
+process's birth time: a reborn PID does not inherit old grants.
+Delegations from a dead process are reclaimed by the GC thread.
+
+See: [5_arch_acl.md](5_arch_acl.md) for the check order in code,
+and [3_arch_gc.md](3_arch_gc.md) for dead-process reclaim.
+
+---
+
+## 6. Where to go next
+
+| If you want to know... | Read |
+|---|---|
+| The exact on-disk layout (superblock, shards, RAT, NRHT) | [1_arch_metadata_layout](1_arch_metadata_layout.md) |
+| State machines for index/RAT/delegation entries | [2_arch_entry_lifecycle](2_arch_entry_lifecycle.md) |
+| How dead regions and stale delegations are reclaimed | [3_arch_gc](3_arch_gc.md) |
+| NRHT shard/bucket structure and insert protocol | [4_arch_nrht](4_arch_nrht.md) |
+| ACL check path and delegation internals | [5_arch_acl](5_arch_acl.md) |
+| Mount bootstrap, VFS read/write/mmap paths | [6_arch_mount_io](6_arch_mount_io.md) |
diff --git a/marufs_kernel/docs/1_arch_metadata_layout.md b/marufs_kernel/docs/1_arch_metadata_layout.md
new file mode 100644
index 0000000..78d5464
--- /dev/null
+++ b/marufs_kernel/docs/1_arch_metadata_layout.md
@@ -0,0 +1,327 @@
+# Doc 1: CXL Metadata Layout
+
+> **Source files**: `marufs_layout.h` (all struct definitions, `enum marufs_layout`), `marufs.h` (`marufs_sb_info`, `shard_cache`)
+
+---
+
+## 1. CXL Memory Physical Layout
+
+Default configuration: 4 shards, 256 buckets/shard, 256 entries/shard, 256 RAT entries.
+
+```mermaid
+block-beta
+ columns 4
+
+ gsb_label["◼ Global Superblock"]:4
+ sb["Superblock (256B)"]:4
+
+ space:4
+
+ rht_label["◼ Region Hash Table"]:4
+ st0["Shard Header 0 (64B)"]
+ st1["Shard Header 1 (64B)"]
+ st2["Shard Header 2 (64B)"]
+ st3["Shard Header 3 (64B)"]
+ bk0["Buckets 0 (256 × 4B)"]
+ bk1["Buckets 1 (256 × 4B)"]
+ bk2["Buckets 2 (256 × 4B)"]
+ bk3["Buckets 3 (256 × 4B)"]
+ en0["Entries 0 (256 × 64B)"]
+ en1["Entries 1 (256 × 64B)"]
+ en2["Entries 2 (256 × 64B)"]
+ en3["Entries 3 (256 × 64B)"]
+
+ space:4
+
+ rat_label["◼ Region Allocation Table"]:4
+ rat_hdr["RAT Header (128B)"]:4
+ r0["RAT Entry 0 (2 KB)"]
+ r1["RAT Entry 1 (2 KB)"]
+ r_dot["... (× 253)"]
+ r255["RAT Entry 255 (2 KB)"]
+
+ space:4
+
+ rg["Region 0, 1, 2, ... (2 MB aligned each)"]:4
+
+ style gsb_label fill:#1B4F72,color:#fff,font-weight:bold
+ style sb fill:#2E86C1,color:#fff
+ style rht_label fill:#145A32,color:#fff,font-weight:bold
+ style st0 fill:#1E8449,color:#fff
+ style st1 fill:#1E8449,color:#fff
+ style st2 fill:#1E8449,color:#fff
+ style st3 fill:#1E8449,color:#fff
+ style bk0 fill:#27AE60,color:#fff
+ style bk1 fill:#27AE60,color:#fff
+ style bk2 fill:#27AE60,color:#fff
+ style bk3 fill:#27AE60,color:#fff
+ style en0 fill:#52BE80,color:#fff
+ style en1 fill:#52BE80,color:#fff
+ style en2 fill:#52BE80,color:#fff
+ style en3 fill:#52BE80,color:#fff
+ style rat_label fill:#78281F,color:#fff,font-weight:bold
+ style rat_hdr fill:#C0392B,color:#fff
+ style r0 fill:#E74C3C,color:#fff
+ style r1 fill:#E74C3C,color:#fff
+ style r_dot fill:#E74C3C,color:#fff
+ style r255 fill:#E74C3C,color:#fff
+ style rg fill:#2C3E50,color:#fff
+```
+
+### Category Overview
+
+| Category | Color | Description |
+|----------|:-----:|-------------|
+| **Global Superblock** | 🔵 Blue | Describes the entire FS geometry. Read once at mount time and cached in an in-memory struct. Single instance |
+| **Region Hash Table** | 🟢 Green | Sharded hash index for filename → region_id mapping. Shard Header (immutable pointers) → Bucket (chain head) → Entry (state-managed) hierarchy |
+| **Region Allocation Table** | 🔴 Red | Per-region (per-file) metadata storage. RAT Header (global alloc management) + RAT Entry (physical allocation, name, ACL, delegation unified) |
+| **Region Data** | ⚫ Gray | Actual file data. Variable size, 2 MB aligned |
+
+### Key Fields by Block
+
+| Category | Block | Key Fields | Role |
+|:--------:|-------|------------|------|
+| 🔵 | **Superblock** | `magic`, `num_shards`, `buckets_per_shard`, `entries_per_shard`, `active_nodes` | FS geometry + mounted node management |
+| 🟢 | **Shard Header** | `bucket_array_offset`, `entry_array_offset` | Per-shard array positions (immutable after format) |
+| 🟢 | **Bucket** | `head_entry_idx` | Hash chain start (entry index or `BUCKET_END`) |
+| 🟢 | **Index Entry** | `state`, `name_hash`, `region_id`, `next_in_bucket` | filename → region mapping |
+| 🔴 | **RAT Header** | `magic`, `max_entries`, `alloc_lock` | Region allocation table meta + global lock |
+| 🔴 | **RAT Entry** | CL0: `state`, `phys_offset`, `size` / CL1: `name` / CL2: ACL / CL3-31: delegation | Unified file metadata (32 CL) |
+| ⚫ | **Region** | *(raw data)* | Actual file data (variable size, 2 MB aligned) |
+
+---
+
+## 2. Filename → Region Data Access Path
+
+Full path from a filename to the actual Region data. Traverses Region Hash Table → RAT → Region Data.
+
+```mermaid
+flowchart LR
+ subgraph rht["Region Hash Table"]
+ direction TB
+ F["filename"]
+ F -->|"SHA-256 → 64-bit"| H["name_hash"]
+ H -->|"bits 63..48"| S["Shard"]
+ H -->|"bits 47..32"| B["Bucket"]
+ B -->|"chain walk"| E["Index Entry"]
+ end
+
+ subgraph rat["RAT"]
+ direction TB
+ RAT_CL0["CL0: state, phys_offset, size"]
+ RAT_CL1["CL1: name"]
+ RAT_CL2["CL2: owner, default_perms"]
+ RAT_CL3["CL3-31: delegation table"]
+ end
+
+ subgraph region["Region Data"]
+ direction TB
+ RG["Region"]
+ end
+
+ E -->|"region_id"| RAT_CL0
+ RAT_CL0 -->|"phys_offset + size"| RG
+ RAT_CL2 -.->|"perm check"| RG
+ RAT_CL3 -.->|"delegation check"| RG
+
+ style rht fill:#145A32,stroke:#145A32,stroke-width:2px,color:#fff
+ style rat fill:#78281F,stroke:#78281F,stroke-width:2px,color:#fff
+ style region fill:#2C3E50,stroke:#2C3E50,stroke-width:2px,color:#fff
+```
+
+- **Region Hash Table path**: `filename` → SHA-256 hash → shard selection (upper bits) → bucket selection (middle bits) → entry chain walk → `name_hash` match → extract `region_id`
+- **RAT access**: Direct indexing via `rat.entries[region_id]` → get `phys_offset`/`size` from CL0
+- **ACL check**: Check owner/`default_perms` in CL2 → if insufficient, scan delegation table in CL3-31 → grant or deny Region access
+
+---
+
+## 3. Struct Details
+
+### 3.1 `marufs_superblock` (256 B = 4 CL)
+
+Single instance describing the entire filesystem geometry.
+
+**Purpose**: Shard count, offsets of each area, mounted node management.
+
+**Usage**:
+- **mount**: `marufs_read_superblock()` validates magic/version then copies to `sbi`
+- **format**: `marufs_format_device()` initializes all fields
+- **active_nodes**: `marufs_le64_cas()` sets/clears bits at mount/unmount (node registration)
+- **runtime**: `marufs_gsb_get()` accesses directly from CXL (includes RMB)
+
+```c
+struct marufs_superblock { /* 256B = 4 CL */
+ /* ── CL0: identity + layout + integrity ── */
+ uint32 magic; /* 0x4D415255 ("MARU") */
+ uint32 version;
+ uint64 total_size; /* Total CXL memory size */
+ uint64 shard_table_offset;
+ uint64 rat_offset;
+ uint64 active_nodes; /* Mounted node bitmask, CAS */
+ uint32 num_shards; /* power of 2 */
+ uint32 buckets_per_shard; /* power of 2 */
+ uint32 entries_per_shard;
+ uint32 checksum; /* CRC32 */
+ /* ── CL1–CL3: reserved ── */
+ uint8 reserved[200];
+};
+```
+---
+
+### 3.2 `marufs_shard_header` (64 B = 1 CL)
+
+**Immutable** metadata describing the hash bucket/entry array locations for each shard.
+
+**Purpose**: Stores absolute offsets of bucket array and entry array per shard.
+
+**Usage**:
+- **mount**: `marufs_init_shard_table()` validates then caches pointers in `shard_cache` (DRAM)
+- **runtime**: Index insert/lookup/delete access via `shard_cache` → avoids CXL round-trip
+- Never modified after format (read-only)
+
+```c
+struct marufs_shard_header { /* 64B = 1 CL */
+ uint32 magic; /* 0x4D534844 ("MSHD") */
+ uint32 shard_id; /* [0..num_shards) */
+ uint32 num_buckets; /* power of 2 */
+ uint32 num_entries;
+ uint64 bucket_array_offset; /* Absolute offset within device */
+ uint64 entry_array_offset;
+ uint8 reserved[32];
+};
+```
+---
+
+### 3.3 `marufs_index_entry` (64 B = 1 CL)
+
+Core unit of the Global Index. Maps filename → region_id.
+
+**Purpose**: Hash-chain based file lookup. `state` field is the CAS target.
+
+**Usage**:
+- **insert**: `marufs_index_insert()` — `CAS(state, EMPTY, INSERTING)` → fill fields → TENTATIVE → shard lock → bucket prepend → post-insert dedup → VALID → unlock
+- **lookup**: `marufs_index_lookup()` — chain walk + `name_hash` matching
+- **delete**: `marufs_index_delete()` — `CAS(VALID, TOMBSTONE)` (stays in chain, reused by insert path)
+- **GC**: Reclaims stale INSERTING/TOMBSTONE entries (`gc.c`)
+
+```c
+struct marufs_index_entry { /* 64B = 1 CL */
+ uint32 state; /* EMPTY(0)/INSERTING(1)/TENTATIVE(2)/VALID(3)/TOMBSTONE(4) */
+ uint32 next_in_bucket; /* Hash chain link (0xFFFFFFFF = end) */
+ uint64 name_hash; /* SHA-256 → 64-bit truncate */
+ uint32 region_id; /* RAT entry ID */
+ uint32 node_id; /* Inserting node (for stale detection) */
+ uint64 created_at; /* Creation time ns (for stale detection) */
+ uint8 reserved[32];
+};
+```
+---
+
+### 3.4 `marufs_rat_entry` (2,048 B = 32 CL)
+
+Per-region (per-file) metadata. Unifies physical allocation, name, ownership, permissions, and delegation into a single entry.
+
+**Purpose**: Manages all file metadata within a single RAT entry.
+
+**Usage**:
+- **alloc**: `marufs_rat_alloc_entry()` — CAS `FREE→ALLOCATING` + `alloc_lock`
+- **init**: `marufs_region_init()` — sets phys_offset/size via ftruncate path → `WRITE_ONCE(state, ALLOCATED)`
+- **inode load**: `marufs_iget()` — restores inode metadata from RAT entry
+- **ACL**: `acl.c` — reads owner/default_perms/delegation
+- **I/O**: `file.c` — syncs `modified_at`, `size` (CL0 access)
+- **GC**: `gc.c` — orphan detection and reclaim, `marufs_rat_free_entry()`
+
+```c
+struct marufs_rat_entry { /* 2048B = 32 CL */
+ /* ── CL0: Hot I/O metadata ── */
+ uint32 state; /* FREE(0)/ALLOCATING(1)/ALLOCATED(2)/DELETING(3) */
+ uint32 region_type; /* DATA(0)/NRHT(1) */
+ uint64 phys_offset; /* Data area offset (0 = unallocated) */
+ uint64 size; /* Region size (variable, 2 MB aligned) */
+ uint64 alloc_time; /* Allocation time (ns) */
+ uint64 modified_at; /* Last modification time (ns) */
+ uint8 reserved0[24];
+
+ /* ── CL1: Name — cold ── */
+ char name[64]; /* null-terminated (MARUFS_NAME_MAX+1) */
+
+ /* ── CL2: ACL ── */
+ uint16 default_perms; /* Default permissions for non-owners */
+ uint16 owner_node_id; /* Owning node (max 64) */
+ uint32 owner_pid;
+ uint64 owner_birth_time; /* PID reuse prevention */
+ uint32 uid;
+ uint32 gid;
+ uint16 mode;
+ uint16 deleg_num_entries; /* Active delegation count (max 29) */
+ uint8 reserved2[36];
+
+ /* ── CL3-CL31: Delegation table ── */
+ struct marufs_deleg_entry deleg_entries[29];
+};
+```
+
+> CL0 is the hot path accessed on every I/O path. `marufs_rat_entry_get()` issues RMB for CL0 only.
+> CL1 (name) is accessed only on hash match. No CL miss during chain walk.
+---
+
+### 3.5 `marufs_deleg_entry` (64 B = 1 CL)
+
+A slot that delegates fine-grained permissions to a specific `(node_id, pid)` pair.
+
+**Purpose**: Dynamic delegation of individual permissions such as READ/WRITE/DELETE.
+
+**Usage**:
+- **grant**: `marufs_perm_grant()` — CAS `EMPTY→GRANTING` → fill fields → `WRITE_ONCE(state, ACTIVE)`, existing entries use CAS-loop to OR perms (upsert)
+- **check**: `marufs_check_permission()` — scans delegation table
+- **GC**: Cleans up stale delegations from dead processes (`ACTIVE→EMPTY`)
+
+```c
+struct marufs_deleg_entry { /* 64B = 1 CL */
+ uint32 state; /* EMPTY(0)/GRANTING(1)/ACTIVE(2) */
+ uint32 node_id; /* Target node */
+ uint32 pid; /* Target PID */
+ uint32 perms; /* Permission bitmask */
+ uint64 birth_time; /* PID reuse prevention */
+ uint64 granted_at; /* Grant time (ns) */
+ uint8 reserved[32];
+};
+```
+---
+
+### 3.6 `marufs_rat` (RAT Header, 128 B = 2 CL)
+
+RAT header. Located immediately before the entry array.
+
+```c
+struct marufs_rat { /* 128B header + entries */
+ uint32 magic; /* 0x4D524154 ("MRAT") */
+ uint32 version;
+ uint32 num_entries; /* Allocated entry count */
+ uint32 max_entries; /* Maximum (256) */
+ uint64 device_size;
+ uint64 rat_offset;
+ uint64 regions_start; /* Region data start offset */
+ uint64 total_allocated;
+ uint64 total_free;
+ uint32 alloc_lock; /* CAS spinlock (0=unlocked) */
+ uint8 reserved[68];
+ struct marufs_rat_entry entries[256];
+};
+```
+---
+
+## 4. Cacheline Boundary Summary
+
+All CXL structs are designed with cacheline (64 B) boundaries in mind:
+
+| Struct | Size | CL Count | Key CL Separation |
+|--------|------|----------|-------------------|
+| `marufs_superblock` | 256 B | 4 | CL0: all live fields, CL1-3: reserved |
+| `marufs_shard_header` | 64 B | 1 | Entire struct in a single CL |
+| `marufs_index_entry` | 64 B | 1 | Entire struct in a single CL (CAS target = `state`) |
+| `marufs_rat_entry` | 2,048 B | 32 | CL0: hot I/O, CL1: name (cold), CL2: ACL, CL3-31: delegation |
+| `marufs_deleg_entry` | 64 B | 1 | Entire struct in a single CL |
+| `marufs_rat` header | 128 B | 2 | CL0: identity+stats, CL1: reserved |
+
+> CL separation rationale: CXL memory access is cacheline-granular. Separating hot path (state checks, I/O metadata) from cold data (name, delegation) prevents unnecessary cacheline transfers.
diff --git a/marufs_kernel/docs/2_arch_entry_lifecycle.md b/marufs_kernel/docs/2_arch_entry_lifecycle.md
new file mode 100644
index 0000000..86b537c
--- /dev/null
+++ b/marufs_kernel/docs/2_arch_entry_lifecycle.md
@@ -0,0 +1,201 @@
+# Doc 2: Entry Lifecycle (State Machines)
+
+> **Source files**: `index.c` (Global Index insert/lookup/delete), `region.c` (RAT alloc/init/free), `acl.c` (delegation grant/check), `gc.c` (3-phase GC sweep), `marufs_layout.h` (state enum definitions)
+
+---
+
+## Overview
+
+All state transitions are performed via CAS (compare-and-swap), providing lock-free operation within a single node. In multi-node environments, the atomicity guarantee of CAS varies depending on CXL hardware cache coherence level (CXL 3.0 vs 2.0), so cross-node lock-free operation is not fully guaranteed.
+
+| Entry Type | Location | States | Definition |
+|------------|----------|--------|------------|
+| Global Index Entry | Shard Entry Array | 5 | `enum marufs_entry_state` |
+| RAT Entry | Region Allocation Table | 4 | `enum marufs_rat_entry_state` |
+| Delegation Entry | Inside RAT Entry (CL3-31) | 3 | `enum marufs_deleg_state` |
+
+---
+
+## 2-1. Global Index Entry
+
+Core unit of the filename → region_id mapping. `marufs_index_entry` (64B = 1CL).
+
+### State Transition Diagram
+
+```mermaid
+flowchart LR
+ EMPTY -->|"open(O_CREAT)"| INSERTING
+ INSERTING -->|"fields written"| TENTATIVE
+ TENTATIVE -->|"shard lock + dedup pass"| VALID
+ TENTATIVE -->|"dedup loss / link fail"| TOMBSTONE
+ VALID -->|"unlink()"| TOMBSTONE
+ INSERTING -->|"GC stale (5s)"| TOMBSTONE
+ TOMBSTONE -->|"insert chain reuse"| INSERTING
+ TOMBSTONE -->|"insert inline unlink"| EMPTY
+```
+
+- **INSERTING**: Slot claimed, fields being written. Other nodes ignore it. Stale detection via `node_id` + `created_at`.
+- **TENTATIVE**: Fields written, entry linked to chain. Visible to dedup but not to lookup. Transitions to VALID under shard lock after dedup passes.
+- **TOMBSTONE**: `unlink()` performs `CAS(VALID, TOMBSTONE)` for logical deletion. Stays in chain; cleaned up during next insert's dup check.
+
+### Transition Details
+
+| Transition | Event | CAS Condition | Function |
+|------------|-------|---------------|----------|
+| EMPTY → INSERTING | `open(O_CREAT)` → index insert | `CAS(state, EMPTY, INSERTING)` | `__marufs_index_insert()` step 3b (flat scan) |
+| TOMBSTONE → INSERTING | `open(O_CREAT)` → chain reuse | `CAS(state, TOMBSTONE, INSERTING)` | `__marufs_index_insert()` step 3a |
+| INSERTING → TENTATIVE | Insert: fields written | `WRITE_LE32(entry->state, MARUFS_ENTRY_TENTATIVE)` | `__marufs_index_insert()` step 5 |
+| TENTATIVE → VALID | Insert: shard lock + dedup pass | `WRITE_LE32(entry->state, MARUFS_ENTRY_VALID)` | `__marufs_index_insert()` step 9 (under lock) |
+| TENTATIVE → TOMBSTONE | Insert: dedup loss or link failure | `WRITE_LE32(entry->state, MARUFS_ENTRY_TOMBSTONE)` | `__marufs_index_insert()` (under lock) |
+| VALID → TOMBSTONE | `unlink()` → logical delete (stays in chain) | `CAS(VALID, TOMBSTONE)` | `marufs_index_delete()` |
+| TOMBSTONE → EMPTY | Inline unlink during insert dup check chain walk | `CAS(state, TOMBSTONE, EMPTY)` + `CAS(prev_next, cur, next)` | `marufs_index_check_duplicate()` |
+| INSERTING → TOMBSTONE | GC Phase 2: stale timeout exceeds 5s | `CAS(state, INSERTING, TOMBSTONE)` | `marufs_gc_sweep_stale_entries()` → `marufs_entry_reclaim_slot()` |
+
+### Insert Protocol (Shard-Lock Serialized)
+
+1. **Pre-insert dup check** (lock-free, best-effort): Walk bucket chain for existing VALID entry
+2. **Slot claim**: CAS EMPTY/TOMBSTONE → INSERTING. Stamp `node_id` and `created_at` (for GC stale detection)
+3. **Field write**: Write `name_hash`, `region_id`. While INSERTING, exclusive ownership
+4. **TENTATIVE**: `WRITE_LE32(state, TENTATIVE)` — visible to dedup walkers but not to lookup
+5. **Acquire shard lock** (`spin_lock` in DRAM `marufs_shard_cache.insert_lock`)
+6. **Bucket prepend**: `marufs_index_link_to_bucket()` — `CAS(bucket_head, old_head, entry_idx)` (skip if chain-reuse)
+7. **Post-insert dedup**: Re-walk chain for VALID duplicates. Loser (higher `entry_idx`) → TOMBSTONE
+8. **Publish**: Winner `WRITE_LE32(state, VALID)` — visible to lookup
+9. **Release shard lock**
+
+Chain reuse path (step 3a): TOMBSTONE slot is already linked in the chain, so bucket prepend is skipped.
+
+### TOCTOU Defense: Post-insert Dedup (Under Lock)
+
+The shard lock serializes link + dedup + publish within a node, eliminating the window where two VALID entries with the same name are visible to lookup. Cross-node serialization is handled by the token ring protocol.
+
+- **Winner determination**: Lower `entry_idx` wins
+- **Loser rollback**: Higher `entry_idx` transitions to TOMBSTONE (under lock), returns `-EEXIST`
+
+### Stale INSERTING Detection (`marufs_is_stale_inserting()`)
+
+| Condition | Action |
+|-----------|--------|
+| `node_id == this_node` + `created_at > 5s` | Immediate reclaim |
+| `node_id == this_node` + `created_at == 0` | Register in local tracker, check timeout on next GC |
+| `node_id == 0` (orphan) | Register in local tracker (no CXL write — prevents cacheline clobbering) |
+| `node_id != this_node` | Skip — that node's GC handles it |
+
+---
+
+## 2-2. RAT Entry
+
+Per-region (per-file) metadata. `marufs_rat_entry` (2048B = 32CL).
+
+### State Transition Diagram
+
+```mermaid
+flowchart LR
+ FREE -->|"open(O_CREAT)"| ALLOCATING
+ ALLOCATING -->|"field init complete"| ALLOCATED
+ ALLOCATED -->|"unlink() or GC"| DELETING
+ DELETING -->|"index cleanup + free"| FREE
+ ALLOCATING -->|"GC orphan recovery"| FREE
+ DELETING -->|"GC stuck recovery"| FREE
+```
+
+- **ALLOCATING**: Fields being initialized (`owner_pid`, `owner_birth_time`, `name`, `phys_offset`). GC reclaims to FREE if owner is dead.
+- **DELETING**: Deletion in progress. Transitions to FREE after index entry cleanup. GC performs full cleanup then FREE if owner is dead.
+
+### Transition Details
+
+| Transition | Event | CAS Condition | Function |
+|------------|-------|---------------|----------|
+| FREE → ALLOCATING | `open(O_CREAT)` → region alloc | `CAS(state, FREE, ALLOCATING)` | `marufs_rat_alloc_entry()` |
+| ALLOCATING → ALLOCATED | Alloc: after field init complete | `WRITE_ONCE(state, ALLOCATED)` | `marufs_rat_alloc_entry()` |
+| ALLOCATED → DELETING | `unlink()` or GC dead process reclaim | `CAS(state, ALLOCATED, DELETING)` | `marufs_unlink()` or `marufs_gc_reclaim_dead_regions()` |
+| DELETING → FREE | After index entry cleanup, RAT release | Multi-stage CAS: CAS(DELETING→FREE), fallback CAS(ALLOCATED→FREE), fallback CAS(ALLOCATING→FREE) | `marufs_rat_free_entry()` |
+| ALLOCATING → FREE | GC Phase 1: owner dead, not yet registered in index | `marufs_gc_cleanup_rat_entry()` | `marufs_gc_reclaim_dead_regions()` |
+| DELETING → FREE | GC Phase 1: owner dead + stuck DELETING recovery | `marufs_gc_cleanup_rat_entry()` | `marufs_gc_reclaim_dead_regions()` |
+
+### 2-Phase Allocation
+
+RAT allocation proceeds in two phases:
+
+1. **`open(O_CREAT)`** → `marufs_rat_alloc_entry()`: `CAS(FREE→ALLOCATING)` → field init (`name`, `owner_pid`, `owner_birth_time`, `alloc_time`) → `WRITE_ONCE(ALLOCATED)`. At this point `phys_offset=0`, `size=0` (reservation state)
+2. **`ftruncate(N)`** → `marufs_region_init()`: Acquire `alloc_lock` → search for contiguous space → write `phys_offset`, `size`. Physical space is actually allocated at this point
+
+`alloc_lock` is used only during `region_init()`. CAS alone cannot prevent races between contiguous space search and allocation (stale lock is force-released after 5s timeout).
+
+### DELETING Path
+
+Deletion order matters:
+
+1. Index entry: `CAS(VALID→TOMBSTONE)` — no longer found by lookup
+2. RAT entry: `CAS(ALLOCATED→DELETING)` — marks deletion in progress
+3. RAT entry: field reset + multi-stage CAS to FREE — slot available for reuse
+
+Reversing the order (FREE RAT first) would create a dangling reference where the index entry points to a freed RAT.
+
+### Orphan Recovery (`marufs_is_orphaned()`)
+
+`marufs_is_orphaned()` is a pure predicate; node_id filtering is the caller's responsibility (`gc.c:marufs_gc_reclaim_dead_regions()`). Conditions checked by this function:
+
+1. Owner process is dead (`marufs_owner_is_dead()`: pid lookup → `task_struct` → `birth_time` comparison)
+2. `owner_pid == 0` (crash during ALLOCATING): `alloc_time` exceeds 5s
+3. No active delegations (`marufs_has_active_delegations()` = false)
+
+All conditions must be met for reclaim to proceed.
+
+---
+
+## 2-3. Delegation Entry
+
+A slot that delegates permissions to a specific (node_id, pid). `marufs_deleg_entry` (64B = 1CL).
+Fixed array inside RAT entry (`deleg_entries[29]`, CL3-CL31).
+
+### State Transition Diagram
+
+```mermaid
+flowchart LR
+ EMPTY -->|"ioctl(PERM_GRANT)"| GRANTING
+ GRANTING -->|"field write complete"| ACTIVE
+ ACTIVE -->|"GC dead delegation sweep"| EMPTY
+ GRANTING -->|"GC stale (5s)"| EMPTY
+ ACTIVE -->|"re-grant — perms OR"| ACTIVE
+```
+
+- **GRANTING**: Slot claimed, fields being written (`node_id`, `pid`, `perms`, `granted_at`). `birth_time` is 0 — lazy stamped on first access.
+- **ACTIVE**: Valid delegation. Permission check matches `(node_id, pid, birth_time)`. Re-grant to same target uses CAS-loop to OR perms.
+
+### Transition Details
+
+| Transition | Event | CAS Condition | Function |
+|------------|-------|---------------|----------|
+| EMPTY → GRANTING | Owner calls `ioctl(PERM_GRANT)` | `CAS(state, EMPTY, GRANTING)` | `marufs_deleg_grant()` |
+| GRANTING → ACTIVE | Grant: after field write + WMB complete | `WRITE_ONCE(state, ACTIVE)` | `marufs_deleg_grant()` |
+| ACTIVE → EMPTY | GC Phase 1: delegated process deemed dead | `CAS(state, ACTIVE, EMPTY)` | `marufs_gc_sweep_dead_delegations()` |
+| GRANTING → EMPTY | GC: stale GRANTING timeout (5s, crash during grant) | `CAS(state, GRANTING, EMPTY)` | `marufs_gc_sweep_dead_delegations()` |
+| ACTIVE → ACTIVE | Owner re-grants to same (node_id, pid) | `CAS-loop(perms, old, old\|new)` | `marufs_deleg_try_upsert()` |
+
+### Upsert Pattern
+
+When an existing (node_id, pid) match is found, instead of allocating a new slot, `perms` bits are OR'd via CAS-loop:
+
+```
+do {
+ old_perms = READ_LE32(de->perms);
+ new_perms = old_perms | req->perms;
+} while (CAS(&de->perms, old_perms, new_perms) != old_perms);
+```
+
+### Lazy birth_time
+
+`birth_time` is set to `0` at grant time. When the delegated process first undergoes a permission check, it is stamped via `CAS(birth_time, 0, current->start_boottime)`. This means:
+
+- No need to know the target process's `birth_time` at grant time
+- PID reuse detection activates only after the first access
+
+### Dead Process Detection
+
+Conditions for GC to clean up a delegation (`marufs_gc_sweep_dead_delegations()`):
+
+1. `de->node_id == sbi->node_id` (only clean up this node's delegations)
+2. `de->pid != 0`
+3. `marufs_owner_is_dead(de_pid, de_birth)` = true
+4. Cleanup: `CAS(ACTIVE→EMPTY)` → `deleg_num_entries` atomic dec
diff --git a/marufs_kernel/docs/3_arch_gc.md b/marufs_kernel/docs/3_arch_gc.md
new file mode 100644
index 0000000..6231555
--- /dev/null
+++ b/marufs_kernel/docs/3_arch_gc.md
@@ -0,0 +1,344 @@
+# Doc 3: GC Logic
+
+> **Source files**: `gc.c` (full GC implementation), `acl.c` (`marufs_owner_is_dead()`, delegation sweep), `index.c` (`marufs_is_stale_inserting()` calls), `sysfs.c` (GC manual control interface)
+
+---
+
+## Overview
+
+marufs GC is a crash-safe 3-phase sweep structure where a 10-second periodic kthread reclaims orphaned resources left by dead processes in CXL shared memory.
+
+Core design principles:
+
+- **Each node reclaims only its own resources** — `node_id` comparison prevents cross-node interference
+- **CAS-based state transitions** — safe under concurrent GC/insert/delete races
+- **Admin node (node_id==1) handles orphans exclusively** — `node_id==0` orphan entries are tracked and claimed only by the admin node via local tracker, eliminating CAS contention
+- **2-phase orphan reclaim** — local track (5s) → claim (node_id preemption + timestamp) → normal path reclaim (5s)
+
+| Property | Value |
+|----------|-------|
+| Interval | 10s (`MARUFS_GC_INTERVAL_MS`) |
+| kthread name | `marufs-gc-{node_id}` |
+| Stale timeout | 5s (`MARUFS_STALE_TIMEOUT_NS`) |
+| Shard sweep ratio | 1/4 per cycle (`MARUFS_GC_SHARD_DIVISOR`) |
+| Local tracker capacity | 64 (`MARUFS_GC_ORPHAN_MAX`) |
+
+---
+
+## GC Architecture Overview
+
+### Thread Lifecycle
+
+```mermaid
+flowchart LR
+ Stopped["Stopped"] -->|"gc_start() (mount)"| Running["Running"]
+ Running -->|"gc_paused = 1"| Paused["Paused"]
+ Paused -->|"gc_paused = 0"| Running
+ Running -->|"gc_stop() (unmount)"| Stopped
+ Stopped -->|"gc_restart() (sysfs)"| Running
+```
+
+- **`gc_paused`**: Atomic flag. When 1, sweep is skipped (used for testing)
+- **`gc_epoch`**: Incremented each cycle. Liveness check via sysfs `gc_status`
+- **Module reference**: `try_module_get()`/`module_put()` prevents rmmod during GC execution
+
+### sysfs Control Interface (`/sys/fs/marufs/`)
+
+| File | Permissions | Purpose |
+|------|-------------|---------|
+| `gc_trigger` | W(0200) | Manually trigger dead-process region reclaim |
+| `gc_pause` | RW(0644) | `echo 1` pause / `echo 0` resume / `echo {node_id}:{0\|1}` per-node |
+| `gc_stop` | W(0200) | `echo {node_id}` or `echo all` to stop GC thread |
+| `gc_restart` | W(0200) | `echo {node_id}` or `echo all` to restart GC thread |
+| `gc_status` | R(0444) | Outputs liveness as `node{id}:{running\|stopped} epoch={N}` |
+
+### Full GC Cycle Flow
+
+```mermaid
+flowchart LR
+ Start["kthread start"] --> Sleep["sleep(10s)"]
+ Sleep --> CheckStop{should_stop?}
+ CheckStop -->|Yes| Exit["exit"]
+ CheckStop -->|No| CheckPause{gc_paused?}
+ CheckPause -->|Yes| Sleep
+ CheckPause -->|No| Phase1["Phase 1
Dead Process
Region Cleanup"]
+ Phase1 --> Phase2["Phase 2
Stale Entry
Sweep"]
+ Phase2 --> Phase3["Phase 3
Local Tracker
Sweep"]
+ Phase3 --> Phase4["Phase 4
NRHT Stale
Sweep"]
+ Phase4 --> Epoch["gc_epoch++"]
+ Epoch --> Sleep
+
+ style Phase1 fill:#E74C3C,color:#fff
+ style Phase2 fill:#2E86C1,color:#fff
+ style Phase3 fill:#1E8449,color:#fff
+ style Phase4 fill:#8E44AD,color:#fff
+```
+
+---
+
+## Phase 1: Dead Process Region Cleanup
+
+**Function**: `marufs_gc_reclaim_dead_regions()`
+
+Iterates over the entire RAT (256 entries), reclaiming regions owned by dead processes.
+
+### Full Flow
+
+```mermaid
+flowchart LR
+ Free{state == FREE?} -->|Yes| Skip["skip"]
+ Free -->|No| DelegSweep["1.1 Delegation Sweep"]
+ DelegSweep --> Orphaned{"1.2 Orphan check"}
+ Orphaned -->|"indeterminate"| Track["register in
local tracker (RAT)"]
+ Orphaned -->|No| Skip
+ Orphaned -->|Yes| StateCheck{state?}
+ StateCheck -->|ALLOCATING/DELETING| Cleanup["cleanup_rat_entry()"]
+ StateCheck -->|ALLOCATED| CAS["CAS(ALLOCATED→DELETING)"]
+ CAS --> CASok{success?}
+ CASok -->|No| Skip
+ CASok -->|Yes| Cleanup
+
+ style DelegSweep fill:#2E86C1,color:#fff
+ style Orphaned fill:#1E8449,color:#fff
+```
+
+### 1.1 Delegation Sweep
+
+**Function**: `marufs_gc_sweep_dead_delegations()`
+
+Runs for every ALLOCATED RAT entry. Cleans up this node's dead delegations and stale GRANTING entries.
+
+#### Sweep Flow
+
+```mermaid
+flowchart LR
+ State{state?}
+ State -->|GRANTING| GrantedAt{"granted_at == 0?"}
+ GrantedAt -->|Yes| Track["register in
local tracker (DELEG)"]
+ GrantedAt -->|No| Timeout{"timeout 5s?"}
+ Timeout -->|Yes| CAS1["CAS(GRANTING→EMPTY)"]
+ Timeout -->|No| Next["next entry"]
+ State -->|EMPTY| Next
+ State -->|ACTIVE| BirthCheck{"birth_time == 0?"}
+ BirthCheck -->|Yes| TrackUnbound["register in
local tracker (DELEG_UNBOUND)"]
+ BirthCheck -->|No| DeadCheck{"dead delegation?"}
+ DeadCheck -->|No| Next
+ DeadCheck -->|Yes| CAS2["CAS(ACTIVE→EMPTY)"]
+```
+
+**Design rationale**: Each node only cleans up its own ACTIVE delegations → all delegations must be cleared before a region can be reclaimed. GRANTING is timeout-based so any node can clean it. ACTIVE + `birth_time==0` (before lazy init) causes `owner_is_dead()` false-positives, so it's routed to the local tracker (`DELEG_UNBOUND`). After timeout, a sentinel (`birth_time=1`) is written → reclaimed via normal path on next cycle.
+
+### 1.2 Orphan Check (`marufs_is_orphaned()`)
+
+Pure predicate — no side effects. state/node_id filtering is the caller's responsibility.
+
+```mermaid
+flowchart LR
+ PID{owner_pid == 0?}
+ PID -->|Yes| AllocTime{"alloc_time == 0?"}
+ AllocTime -->|Yes| RetFalse["return false
(caller does local track)"]
+ AllocTime -->|No| TimeoutA{"timeout 5s?"}
+ TimeoutA -->|No| RetFalse
+ TimeoutA -->|Yes| Deleg
+ PID -->|No| Dead{"1.2.1 owner dead?"}
+ Dead -->|No| RetFalse
+ Dead -->|Yes| Deleg{"active delegation?"}
+ Deleg -->|Yes| RetFalse2["return false"]
+ Deleg -->|No| RetTrue["return true (orphan)"]
+
+ style Dead fill:#2E86C1,color:#fff
+```
+
+### 1.2.1 `marufs_owner_is_dead()` Logic
+
+**Function**: `acl.c` — Detects PID reuse via `birth_time` (`start_boottime`).
+
+```mermaid
+flowchart LR
+ CheckZero{pid == 0?} -->|Yes| Alive["return false"]
+ CheckZero -->|No| FindPID["find_get_pid(pid)"]
+ FindPID --> Exists{exists?}
+ Exists -->|No| Dead1["return true"]
+ Exists -->|Yes| GetTask["get_pid_task()"]
+ GetTask --> TaskOK{task exists?}
+ TaskOK -->|No| Dead2["return true"]
+ TaskOK -->|Yes| Birth{"birth_time match?"}
+ Birth -->|No| Dead3["return true (PID reuse)"]
+ Birth -->|Yes| Alive2["return false"]
+```
+
+---
+
+## Phase 2: Stale Entry Sweep
+
+**Function**: `marufs_gc_sweep_stale_entries()`
+
+Sweeps `num_shards / 4` shards per cycle. Rotates `gc_next_shard` to cover all shards across 4 cycles.
+
+Transitions stale INSERTING entries to TOMBSTONE. The reason for TOMBSTONE instead of EMPTY: if a crash occurs after `link_to_bucket` but before `publish(VALID)`, the INSERTING entry may already be linked in the chain. An EMPTY entry in the chain could be claimed by flat scan for a different bucket, causing chain corruption. TOMBSTONE ensures safe in-place reuse by the insert path. TOMBSTONE entries are not processed in this phase — they are reused in-place by the insert path's `check_duplicate`.
+
+### Sweep Flow
+
+```mermaid
+flowchart LR
+ Entry["iterate entries"] --> State{state == INSERTING?}
+ State -->|No| Next["next entry"]
+ State -->|Yes| Stale["2.1 is_stale_inserting()"]
+ Stale --> Result{result?}
+ Result -->|stale| Reclaim["CAS(INSERTING→TOMBSTONE)"]
+ Result -->|indeterminate| Track["register in
local tracker (INDEX)"]
+ Result -->|other node| Next
+ Reclaim --> Next
+ Track --> Next
+
+ style Stale fill:#2E86C1,color:#fff
+```
+
+### 2.1 Stale INSERTING Detection (`marufs_is_stale_inserting()`)
+
+Pure function — no side effects. Safe to call from both GC thread and syscall path (index.c insert dup check). Local tracking is handled by the caller (`marufs_gc_sweep_stale_entries`).
+
+| Condition | Return | Meaning |
+|-----------|--------|---------|
+| `node_id == 0` (admin) | 0 | orphan — only admin node (node_id==1) registers in tracker |
+| `node_id == 0` (non-admin) | -1 | orphan but this node is not admin — skip |
+| `node_id != this node` | -1 | owned by another node — skip |
+| `node_id == this`, `created_at == 0` | 0 | timestamp not recorded — indeterminate |
+| `node_id == this`, `created_at > 5s` | 1 | confirmed stale |
+| `node_id == this`, `created_at ≤ 5s` | 0 | still valid |
+
+---
+
+## Phase 3: Local Tracker Sweep
+
+**Function**: `marufs_gc_sweep_orphans()`
+
+Tracks entries in DRAM that couldn't be determined via normal paths (timeout comparison, `owner_is_dead`, etc.) in Phase 1/2. On timeout expiry, **claims** the entry to transition it into a state processable by the normal path. Actual reclaim occurs in the next GC cycle's normal path.
+
+### DRAM Tracker Structure
+
+```c
+enum marufs_orphan_type {
+ MARUFS_ORPHAN_INDEX, /* stale INSERTING index entry */
+ MARUFS_ORPHAN_DELEG, /* stale GRANTING delegation entry */
+ MARUFS_ORPHAN_DELEG_UNBOUND, /* ACTIVE deleg, birth_time not yet bound */
+ MARUFS_ORPHAN_RAT, /* stuck ALLOCATING RAT entry */
+ MARUFS_ORPHAN_NRHT, /* stale INSERTING NRHT entry */
+};
+
+struct marufs_orphan_tracker {
+ void *entry; /* CXL entry pointer */
+ u64 discovered_at; /* first discovery time (ns) */
+ enum marufs_orphan_type type;
+};
+```
+
+- **Registration**: `marufs_gc_track_orphan()` — GC thread only (must not be called from syscall path)
+- **Dedup**: Pointer comparison skips already-registered entries
+- **Capacity overflow**: When full at 64, registration is abandoned → retry after sweep on next cycle
+
+### Sweep Flow
+
+```mermaid
+flowchart LR
+ Entry["iterate tracker"] --> Stuck{"3.1 still_stuck()?"}
+ Stuck -->|No| Remove["remove from tracker"]
+ Stuck -->|Yes| Timeout{"timeout 5s?"}
+ Timeout -->|No| Keep["keep"]
+ Timeout -->|Yes| Claim["3.2 claim_orphan()"]
+ Claim --> Claimed{success?}
+ Claimed -->|Yes| Count["claimed++"]
+ Claimed -->|No| Noop["another node preempted"]
+ Count --> Remove
+ Noop --> Remove
+
+ style Stuck fill:#2E86C1,color:#fff
+ style Claim fill:#1E8449,color:#fff
+```
+
+### 3.1 `still_stuck()` — Per-type Detection
+
+| Type | Stuck condition | Reason |
+|------|----------------|--------|
+| INDEX | `state == INSERTING && node_id == 0` | Crash during insert before node_id was written. Only admin node (node_id==1) registers in tracker |
+| DELEG | `state == GRANTING && (node_id == 0 \|\| granted_at == 0)` | Crash during grant. `node_id == 0`: target node not recorded. `granted_at == 0`: timestamp not recorded. Admin node only tracks `node_id==0` case |
+| DELEG_UNBOUND | `state == ACTIVE && birth_time == 0` | Grant complete (ACTIVE) but delegated process hasn't made first access yet. `birth_time=0` causes `owner_is_dead()` false-positive — cannot process via normal path |
+| RAT | `state == ALLOCATING && owner_node_id == 0` | Crash during alloc before owner node_id was written. Only admin node (node_id==1) registers in tracker |
+| NRHT | `state == INSERTING && inserter_node == 0` | Crash during NRHT insert before inserter_node was written. Only admin node (node_id==1) registers in tracker |
+
+If state has changed or the relevant field has been populated → not stuck → remove from tracker → normal path handles it.
+
+### 3.2 `claim_orphan()` — Per-type Preemption
+
+After timeout expires, preempts `node_id` via CAS and stamps timestamp. Actual reclaim is handled by the next GC cycle's normal path.
+
+| Type | Claim target | Timestamp target | Next cycle reclaim path |
+|------|-------------|-----------------|------------------------|
+| INDEX | `CAS(e->node_id, 0, mine)` | `e->created_at = now` | `stale_sweep` → `is_stale_inserting` = 1 → `entry_reclaim_slot` (INSERTING→TOMBSTONE) |
+| DELEG | `CAS(de->node_id, 0, mine)` or ownership check | `de->granted_at = now` | `sweep_dead_delegations` → timeout → clear + CAS(GRANTING→EMPTY) |
+| DELEG_UNBOUND | `CAS(de->birth_time, 0, 1)` (sentinel) | — | `sweep_dead_delegations` → `owner_is_dead(pid, 1)` → birth_time mismatch → CAS(ACTIVE→EMPTY) |
+| RAT | `CAS(re->owner_node_id, 0, mine)` | `re->alloc_time = now` | `dead_process_regions` → `is_orphaned` → `cleanup_rat_entry` |
+| NRHT | `CAS(e->inserter_node, 0, mine)` | `e->created_at = now` | `nrht_gc_sweep_all` → `nrht_is_stale` = 1 → CAS(INSERTING→TOMBSTONE) |
+
+**DELEG special case**: The grant path writes `node_id` first and `granted_at` later, so `node_id != 0` but `granted_at == 0` can occur. During claim, if `node_id != 0`, only ownership is verified (`de_node == sbi->node_id`) and only the timestamp is updated.
+
+### Orphan Lifecycle (2-phase timeout, admin node only)
+
+`node_id==0` orphans are tracked and claimed **only by admin node (node_id==1)**. Non-admin nodes skip `node_id==0` entries.
+
+```
+[Discovery] node_id=0 or timestamp=0 (admin node only)
+ → register in local tracker (no CXL write)
+ → wait 5s
+ ↓
+[Claim] CAS(node_id, 0, 1) + timestamp = now
+ → remove from tracker
+ → discovered by next GC cycle's normal path (owned by node_id==1)
+ → wait 5s
+ ↓
+[Reclaim] final reclaim via normal path
+ → total max ~60s (2 × timeout)
+```
+
+**2-phase design rationale**:
+- 1st timeout: a live writer may still be writing, so observe without writing to CXL
+- Admin-only claim: only node_id==1 attempts CAS → single preemptor, no contention
+- 2nd timeout: after claim, other nodes may still be reading this entry, so don't reclaim immediately
+
+---
+
+## Edge Cases
+
+### GC vs Concurrent insert/delete Races
+
+| Race scenario | Safety guarantee |
+|---------------|-----------------|
+| GC reclaim vs insert (same slot) | CAS atomicity: GC does `CAS(INSERTING, TOMBSTONE)`, insert does `CAS(state, INSERTING)`. Only one succeeds |
+| GC cleanup vs unlink (same region) | GC preempts with `CAS(ALLOCATED, DELETING)`. unlink's CAS fails, skip |
+| GC delegation sweep vs perm_grant | ACTIVE clear: state is ACTIVE during clear → grant upsert doesn't match. After CAS transition, even if grant takes EMPTY, all fields are overwritten |
+| Phase 2 stale reclaim vs live inserter | `node_id`/`created_at` comparison avoids touching live entries. 5s timeout far exceeds normal insert time |
+
+### Cross-node Visibility
+
+- **WMB before CAS**: `marufs_entry_reclaim_slot()` writes `name_hash=0` → WMB → CAS
+- **RMB before state read**: All phases issue RMB before reading entries → ensures fresh data even on CXL 2.0
+- **CAS failure = skip**: Another node/path already handled it. Proceed to next without retry
+
+### `marufs_is_stale_inserting()` — Thread Safety
+
+Pure function (no side effects). Callable from both GC thread and syscall path (index.c insert dup check). Local tracking is performed only in GC thread-exclusive function `marufs_gc_track_orphan()` — no concurrent access to `gc_orphans[]` array.
+
+### `marufs_can_force_unlink()` — dir.c unlink fallback
+
+On permission check failure, uses `marufs_is_orphaned()` for orphan detection. `is_orphaned` is a pure predicate, safe for syscall path. `owner_node == sbi->node_id` filter allows force unlink only for orphans owned by this node.
+
+### GC Phase Dependencies
+
+Phase 1 → 2 → 3 → 4 order is priority-based:
+
+- **Phase 1 (region reclaim)**: Free dead process regions quickly → available for other nodes. Side task: during RAT scan, records `region_type == NRHT` entries in DRAM bitmap (`gc_nrht_bitmap`) for Phase 4
+- **Phase 2 (stale INSERTING)**: Global Index slot reclaim (INSERTING→TOMBSTONE) + register orphans in tracker when found
+- **Phase 3 (orphan sweep)**: Processes trackers registered by Phase 1/2/4
+- **Phase 4 (NRHT stale sweep)**: Iterates NRHT regions via DRAM bitmap, transitions stale INSERTING entries to TOMBSTONE. `gc_epoch`-based round-robin sweeps ~25% of shards per cycle. See Doc 4 for NRHT structure and state transition details
+
+Each phase can fail independently without affecting the overall GC cycle (errors ignored, retried on next cycle).
diff --git a/marufs_kernel/docs/4_arch_nrht.md b/marufs_kernel/docs/4_arch_nrht.md
new file mode 100644
index 0000000..fb602b7
--- /dev/null
+++ b/marufs_kernel/docs/4_arch_nrht.md
@@ -0,0 +1,450 @@
+# Doc 4: NRHT Structure and Operations
+
+> **Source files**: `nrht.c` (full implementation), `marufs_layout.h` (NRHT structs), `marufs_uapi.h` (ioctl definitions)
+
+---
+
+## 1. Overview
+
+NRHT (Name-Ref Hash Table) is an independent CXL file that stores **name → (offset, target_region_id)** mappings. As an independent region with its own RAT entry, it can freely reference multiple data regions (N:M). Unlike the Global Index (file-level `filename → region_id`), NRHT manages application-level (KV cache keys, etc.) name-refs.
+
+---
+
+## 2. Physical Layout
+
+An NRHT file is allocated as a regular region entry, with the hash table formatted inside its data area.
+
+```mermaid
+block-beta
+ columns 4
+
+ hdr_label["◼ NRHT Header (64B)"]:4
+
+ space:4
+
+ sh_label["◼ Shard Headers (N × 64B)"]:4
+ sh0["Shard Header 0"]
+ sh1["Shard Header 1"]
+ sh2["..."]
+ sh3["Shard Header N"]
+
+ space:4
+
+ sd_label["◼ Per-Shard Data (× N)"]:4
+ bkt0["Buckets 0"]
+ bkt1["Buckets 1"]
+ bkt2["..."]
+ bkt3["Buckets M"]
+ ent0["Entries 0"]
+ ent1["Entries 1"]
+ ent2["..."]
+ ent3["Entries M"]
+
+ style hdr_label fill:#1B4F72,color:#fff,font-weight:bold
+ style sh_label fill:#145A32,color:#fff,font-weight:bold
+ style sh0 fill:#1E8449,color:#fff
+ style sh1 fill:#1E8449,color:#fff
+ style sh2 fill:#1E8449,color:#fff
+ style sh3 fill:#1E8449,color:#fff
+ style sd_label fill:#6C3483,color:#fff,font-weight:bold
+ style bkt0 fill:#8E44AD,color:#fff
+ style bkt1 fill:#8E44AD,color:#fff
+ style bkt2 fill:#8E44AD,color:#fff
+ style bkt3 fill:#8E44AD,color:#fff
+ style ent0 fill:#A569BD,color:#fff
+ style ent1 fill:#A569BD,color:#fff
+ style ent2 fill:#A569BD,color:#fff
+ style ent3 fill:#A569BD,color:#fff
+```
+
+Hash routing (shard/bucket selection) and bucket chain structure are identical to the Global Index — see Doc 1.
+
+---
+
+## 3. Struct Details
+
+### 3.1 NRHT Header (64B, 1 CL)
+
+`marufs_nrht_header` — First 64B of the NRHT file. Describes the overall hash table geometry.
+
+| Size | Field | Description |
+|------|-------|-------------|
+| 4B | `magic` | `MARUFS_NRHT_MAGIC` (0x4E524854 "NRHT") |
+| 4B | `version` | Format version (1) |
+| 4B | `num_shards` | Shard count (power of 2, max 64) |
+| 4B | `buckets_per_shard` | Buckets per shard (power of 2) |
+| 4B | `entries_per_shard` | Max entries per shard |
+| 4B | `owner_region_id` | RAT entry ID of this NRHT file |
+| 8B | `table_size` | Total NRHT allocation size (bytes) |
+| 32B | `reserved` | Padding (64B alignment) |
+
+**Usage**: `nrht_get_header()` — look up `phys_offset` from RAT entry → convert to DAX pointer → validate magic/version.
+
+### 3.2 NRHT Shard Header (64B, 1 CL)
+
+`marufs_nrht_shard_header` — Per-shard geometry + absolute offsets.
+
+| Size | Field | Description |
+|------|-------|-------------|
+| 4B | `num_entries` | Max entries in this shard |
+| 4B | `num_buckets` | Bucket count in this shard |
+| 8B | `bucket_array_offset` | **Absolute** device offset of bucket array |
+| 8B | `entry_array_offset` | Absolute offset of entry array |
+| 4B | `free_hint` | Flat scan start hint (best-effort, no CAS needed) |
+| 4B | `lock` | Shard-level CAS spinlock (0=unlocked, 1=locked) |
+| 32B | `reserved` | Padding |
+
+**Usage**: `nrht_get_shard_ctx()` — read shard header → convert offsets to DAX pointers → cache in `nrht_shard_ctx` struct. `free_hint` is read and updated by insert's flat scan (only advances on insert, delete does not touch it). `lock` is acquired via `CAS(0,1)` spinloop in `marufs_nrht_insert()` to serialize bucket linking and post-insert dedup within a shard.
+
+### 3.3 NRHT Entry (128B, 2 CL)
+
+`marufs_nrht_entry` — Unified entry. CL0 is accessed on every chain walk, CL1 only on hash match.
+
+| CL | Size | Field | Description |
+|----|------|-------|-------------|
+| CL0 | 4B | `state` | CAS target: EMPTY(0) / INSERTING(1) / TENTATIVE(2) / VALID(3) / TOMBSTONE(4) |
+| CL0 | 4B | `next_in_bucket` | Chain link (`BUCKET_END` = end) |
+| CL0 | 8B | `name_hash` | 64-bit SHA-256 truncated hash |
+| CL0 | 8B | `offset` | Offset within target region's data area |
+| CL0 | 4B | `target_region_id` | RAT entry ID of referenced region |
+| CL0 | 4B | `inserter_node` | Inserting node_id (for stale detection) |
+| CL0 | 8B | `created_at` | Creation time ns (stale INSERTING timeout) |
+| CL0 | 24B | `reserved0` | Padding (64B CL boundary) |
+| CL1 | 64B | `name` | Null-terminated name (max 63 chars + NUL) |
+
+The CPU fetches only the CLs actually accessed, so on hash mismatch during chain walk, only CL0 (64B) is read and CL1 (name) is not accessed. This achieves the same CL access pattern without separating into distinct arrays.
+
+---
+
+## 4. NRHT Entry Lifecycle
+
+5-state pattern with shard lock serialization. Delete only transitions to TOMBSTONE; TOMBSTONE entries are reused in-place by the insert path.
+
+### 4.1 State Transition Diagram
+
+```mermaid
+flowchart LR
+ EMPTY -->|"insert (claim)"| INSERTING
+ INSERTING -->|"fields written"| TENTATIVE
+ TENTATIVE -->|"dedup pass"| VALID
+ TENTATIVE -->|"dedup fail"| TOMBSTONE
+ VALID -->|"delete"| TOMBSTONE
+ TOMBSTONE -->|"insert (in-place reuse)"| INSERTING
+ TOMBSTONE -->|"inline unlink"| EMPTY
+ INSERTING -->|"stale (GC Phase 4)"| TOMBSTONE
+```
+
+**Invariant**: Only INSERTING, TENTATIVE, VALID, TOMBSTONE exist in chains. EMPTY is always outside chains (flat scan only).
+
+### 4.2 State Transition Details
+
+| Transition | Event | CAS condition | Function |
+|------------|-------|---------------|----------|
+| EMPTY → INSERTING | Insert: flat scan | `CAS(state, EMPTY, INSERTING)` | `nrht_claim_entry()` |
+| TOMBSTONE → INSERTING | Insert: chain reuse (in-place) | `CAS(state, TOMBSTONE, INSERTING)` | `nrht_claim_entry()` |
+| INSERTING → TENTATIVE | Insert: fields written, pre-lock publish | `WRITE_LE32(state, TENTATIVE)` | `marufs_nrht_insert()` step 4 |
+| TENTATIVE → VALID | Insert: dedup passed under shard lock | `WRITE_LE32(state, VALID)` | `marufs_nrht_insert()` step 8 |
+| TENTATIVE → TOMBSTONE | Insert: dedup lost or link failed under shard lock | `WRITE_LE32(state, TOMBSTONE)` | `marufs_nrht_insert()` step 6-7 |
+| VALID → TOMBSTONE | Delete: logical delete | `CAS(state, VALID, TOMBSTONE)` | `marufs_nrht_delete()` |
+| TOMBSTONE → EMPTY | Dup check: inline unlink (after removing from chain) | `CAS(state, TOMBSTONE, EMPTY)` | `nrht_check_duplicate()` |
+| INSERTING → TOMBSTONE | GC Phase 4: stale reclaim | `CAS(state, INSERTING, TOMBSTONE)` | `marufs_nrht_gc_sweep_all()` |
+
+### 4.3 Lifecycle Differences from Global Index
+
+| Item | Global Index | NRHT |
+|------|-------------|------|
+| States | 4 (EMPTY/INSERTING/VALID/TOMBSTONE) | 5 (+TENTATIVE) |
+| Insert serialization | Lock-free (CAS only) | Shard lock (CAS spinlock on `shard_header->lock`) |
+| Post-insert dedup | Lock-free, lower entry_idx wins | Under shard lock, first to lock wins |
+| Delete | CAS VALID→TOMBSTONE (stays in chain) | Same |
+| Chain reuse targets | TOMBSTONE/EMPTY (first dead entry in-place reuse) | Same |
+| TOMBSTONE → EMPTY | `check_duplicate` inline unlink (after removing from chain) | Same |
+| Stale INSERTING | GC Phase 2: INSERTING→TOMBSTONE | GC Phase 4: INSERTING→TOMBSTONE |
+| Orphan tracking | Phase 3 DRAM tracker | Same (MARUFS_ORPHAN_NRHT) |
+| Flat scan optimization | DRAM `atomic free_hint` | CXL shard header `free_hint` |
+
+---
+
+## 5. Operation Flowcharts
+
+### 5.1 Init
+
+`marufs_nrht_init()` — Format NRHT file.
+
+```mermaid
+flowchart LR
+ Alloc{"physical memory
allocated?"} -->|No| Init["CXL physical alloc"]
+ Alloc -->|Yes| Double{"already formatted?"}
+ Init --> Format
+ Double -->|Yes| Fail(["-EEXIST"])
+ Double -->|No| Format
+
+ Format["zero-fill area"] --> Header["Write Header + Shard Headers"]
+ Header --> Buckets["Initialize bucket arrays"]
+ Buckets --> Done(["success"])
+```
+
+**Two initialization paths**:
+- **ftruncate path**: `ftruncate()` → CXL physical alloc → `NRHT_INIT` → format. Double-init prevention (magic check).
+- **Direct alloc path**: `NRHT_INIT` detects no physical memory → auto-calls `marufs_region_init()`. Double-init check skipped in this case (stale magic from recycled region possible).
+
+### 5.2 Insert
+
+`marufs_nrht_insert()` — Register name → (offset, target_region_id).
+
+```mermaid
+flowchart LR
+ Resolve["hash"] --> Dup{"dup check
(chain walk)"}
+
+ Dup -->|duplicate| Fail(["-EEXIST"])
+ Dup -->|pass| Slot["Claim entry
dead chain reuse (TOMBSTONE/EMPTY)
or flat scan (free_hint)"]
+ Slot -->|fail| Full(["-ENOSPC"])
+ Slot -->|success| Fill["write fields + WMB"]
+ Fill --> Tent["TENTATIVE"]
+ Tent --> Lock["acquire shard lock"]
+ Lock --> Link["Link to bucket
(skip if chain reuse)"]
+ Link --> Dedup["Post-insert dedup"]
+ Dedup -->|"dup found"| TS["TOMBSTONE + unlock"]
+ Dedup -->|"unique"| Valid["VALID + unlock"]
+ Valid --> Done(["success"])
+ TS --> Fail2(["-EEXIST"])
+```
+
+**Slot claim priority**: Chain reuse (first dead entry — TOMBSTONE or EMPTY — in-place reuse, remaining dead entries are inline unlinked + transitioned to EMPTY) → Flat scan (starting from `free_hint`, O(1) amortized). Same pattern as Global Index `check_duplicate`.
+
+**Shard lock**: After writing fields and transitioning to TENTATIVE, the inserter acquires a CAS spinlock on `shard_header->lock`. Under the lock, bucket linking and post-insert dedup execute atomically, eliminating the TOCTOU race where two inserters both pass dedup before seeing each other's entry. The lock scope is per-shard, so inserts to different shards proceed in parallel.
+
+**Post-insert dedup**: Under shard lock, walks the bucket chain looking for another VALID entry with the same name. If found, the current insert loses (transitions to TOMBSTONE). Since the lock serializes inserts within a shard, the first inserter to acquire the lock wins.
+
+### 5.3 Lookup
+
+`marufs_nrht_lookup()` — Look up (offset, target_region_id) by name.
+
+```mermaid
+flowchart LR
+ Resolve["hash"] --> Walk["Start chain walk"]
+
+ Walk --> CL0["Read CL0
state, hash"]
+ CL0 --> Valid{VALID?}
+ Valid -->|No| Next["next entry"]
+ Valid -->|Yes| Hash{hash match?}
+
+ Hash -->|No| Next
+ Hash -->|Yes| CL1["Read CL1
name compare"]
+ CL1 --> Match{match?}
+ Match -->|No| Next
+ Match -->|Yes| Return(["offset, region_id"])
+
+ Next --> End{end of chain?}
+ End -->|No| CL0
+ End -->|Yes| NotFound(["-ENOENT"])
+```
+
+### 5.4 Delete
+
+`marufs_nrht_delete()` — CAS VALID → TOMBSTONE (logical delete only).
+
+```mermaid
+flowchart LR
+ Resolve["hash"] --> Find["Chain walk → VALID entry"]
+ Find --> Found{found?}
+ Found -->|No| NotFound(["-ENOENT"])
+ Found -->|Yes| TS["CAS: VALID → TOMBSTONE"]
+ TS --> Done(["success"])
+```
+
+Delete only performs TOMBSTONE transition. Chain unlink and entry reuse are handled by the insert path's `check_duplicate` (in-place reuse or inline unlink). Completes in a single CXL write (CAS), and CAS includes an implicit full barrier so no separate WMB is needed.
+
+---
+
+## 6. Stale INSERTING Detection and GC
+
+`nrht_is_stale()` — Pure function (no side effects) that determines staleness of INSERTING entries. Used in GC Phase 4.
+
+```mermaid
+flowchart LR
+ Start(["INSERTING found"]) --> Own{"my_node?"}
+ Own -->|No| Other(["-1: other node"])
+ Own -->|"node==0"| Orphan(["0: orphan → DRAM tracker"])
+ Own -->|Yes| TS{"timestamp?"}
+ TS -->|"==0"| Indeterminate(["0: indeterminate → DRAM tracker"])
+ TS -->|present| Age{"> 5s?"}
+ Age -->|No| Fresh(["0: valid"])
+ Age -->|Yes| Stale(["1: stale → TOMBSTONE"])
+```
+
+**GC Phase 4** (`marufs_nrht_gc_sweep_all`): Enumerates NRHT regions from DRAM bitmap (`gc_nrht_bitmap`), sweeps ~25% of shards per cycle via `gc_epoch`-based round-robin. Stale entries are transitioned to TOMBSTONE (not EMPTY, because if a crash occurs after `link_to_bucket` but before `publish`, the entry may already be linked in the chain, and an EMPTY in the chain could be claimed by flat scan for a cross-bucket, causing chain corruption).
+
+**Orphan handling**: Indeterminate cases where `node==0` or `created_at==0` are registered in the DRAM tracker via `marufs_gc_track_orphan(MARUFS_ORPHAN_NRHT)`. Claimed and reclaimed after 2-phase timeout in Phase 3 (see Doc 3).
+
+---
+
+## 7. CL0/CL1 Access Pattern
+
+### 7.1 Access Sequence
+
+```mermaid
+sequenceDiagram
+ participant App as Application
+ participant K as Kernel (nrht.c)
+ participant CXL as CXL Memory
+
+ App->>K: FIND_NAME("layer0_kv_42")
+ K->>CXL: RMB(64) + entry[7] CL0: state=VALID, hash=0xAB..
+ Note over K: hash mismatch → skip (no CL1 access)
+ K->>CXL: RMB(64) + entry[42] CL0: state=VALID, hash=0xCD..
+ Note over K: hash match!
+ K->>CXL: RMB(64) + entry[42] CL1: name="layer0_kv_42"
+ Note over K: name match → hit
+ K->>App: offset=0x1000, region_id=3
+```
+
+Although each entry is 128B (2 CL), the CPU **fetches only the CLs actually accessed**.
+During chain walk, if only CL0 is accessed, CL1 (name) is not fetched.
+CL1 is read only on hash match for name comparison.
+RMB is also applied selectively: `MARUFS_CXL_RMB(e, 64)` (CL0) / `MARUFS_CXL_RMB(&e->name, 64)` (CL1).
+
+### 7.2 CL Access Comparison
+
+| Scenario | CL0 reads | CL1 reads | Total CL reads |
+|----------|-----------|-----------|----------------|
+| Lookup miss (chain 3 hops) | 3 | 0 | 3 |
+| Lookup hit (hit on 2nd hop) | 2 | 1 | 3 |
+| Insert (dup check 3 hops + entry claim) | 3~4 | 0~1 | 4~5 |
+
+---
+
+## 8. Relationship with Global Index
+
+```mermaid
+flowchart TD
+ subgraph GI["Global Index"]
+ gi_entry["filename → region_id
(file location)"]
+ end
+
+ subgraph NRHT_FILE["NRHT File (independent file)"]
+ nrht_entry["name → (offset, target_region_id)
(intra-region reference)"]
+ end
+
+ subgraph Regions["Region Files"]
+ r0["Region 0 (data)"]
+ r1["Region 1 (data)"]
+ r2["Region 2 (data)"]
+ end
+
+ gi_entry -->|"region_id=0"| r0
+ gi_entry -->|"region_id=1"| r1
+ gi_entry -->|"region_id=2"| r2
+ gi_entry -->|"region_id=5
(NRHT is also a region)"| NRHT_FILE
+
+ nrht_entry -->|"target_region_id=0, offset=0x1000"| r0
+ nrht_entry -->|"target_region_id=1, offset=0x2000"| r1
+ nrht_entry -->|"target_region_id=0, offset=0x3000"| r0
+```
+
+- **Global Index**: `filename → region_id` (filesystem level — `open()`, `unlink()`, etc.)
+- **NRHT**: `name → (offset, target_region_id)` (application level — KV cache keys, etc.)
+- A single NRHT can **freely reference multiple regions** (N:M relationship)
+- The NRHT file itself is a **regular region** registered in the Global Index (has its own RAT entry)
+
+---
+
+## 9. Multi-Region Binding Example
+
+KV cache scenario: distribute per-layer KV tensors across multiple regions, unified indexing via a single NRHT.
+
+```mermaid
+flowchart LR
+ subgraph NRHT["NRHT: my_kv_index"]
+ e1["layer0_token_42
→ region_0, offset=0"]
+ e2["layer1_token_42
→ region_1, offset=0"]
+ e3["layer0_token_99
→ region_0, offset=4096"]
+ e4["layer2_token_42
→ region_2, offset=0"]
+ end
+
+ subgraph R0["Region 0: kv_layer0"]
+ d0["KV data (layer 0)"]
+ end
+ subgraph R1["Region 1: kv_layer1"]
+ d1["KV data (layer 1)"]
+ end
+ subgraph R2["Region 2: kv_layer2"]
+ d2["KV data (layer 2)"]
+ end
+
+ e1 --> d0
+ e3 --> d0
+ e2 --> d1
+ e4 --> d2
+```
+
+**Usage flow**:
+
+```
+1. Create regions (per layer)
+ fd_r0 = open("/mnt/marufs/kv_layer0", O_CREAT); ftruncate(fd_r0, 128MB);
+ fd_r1 = open("/mnt/marufs/kv_layer1", O_CREAT); ftruncate(fd_r1, 128MB);
+
+2. Create NRHT file + initialize
+ fd_nrht = open("/mnt/marufs/my_kv_index", O_CREAT);
+ ioctl(fd_nrht, MARUFS_IOC_NRHT_INIT, {max_entries: 8192, num_shards: 4});
+
+3. Register name-ref
+ ioctl(fd_nrht, MARUFS_IOC_NAME_OFFSET, {
+ name: "layer0_token_42", offset: 0, target_region_fd: fd_r0
+ });
+
+4. Look up name-ref
+ ioctl(fd_nrht, MARUFS_IOC_FIND_NAME, {name: "layer0_token_42"});
+ // → offset=0, region_name="kv_layer0"
+```
+
+---
+
+## 10. Sizing
+
+Default configuration (num_shards=64). `bucket_array_size` is 64B boundary aligned.
+
+| total entries | entries/shard | buckets/shard | NRHT Header + Shard Headers | Per-shard size | Total size |
+|---------------|---------------|---------------|-------|-------|-------|
+| 4,096 | 64 | 16 | 4,160B | 64B + 8KB = ~8KB | ~520KB |
+| 32,768 | 512 | 128 | 4,160B | 512B + 64KB = ~65KB | ~4MB |
+| 524,288 (default) | 8,192 | 2,048 | 4,160B | 8KB + 1MB = ~1MB | ~65MB |
+
+- `entries_per_shard = max_entries / num_shards`
+- `buckets_per_shard = entries_per_shard / 4` (default load factor), `roundup_pow_of_two()`
+- Per-shard = `bucket_array(CL-aligned)` + `entries(E × 128B)`
+- `num_shards` is specified in `NRHT_INIT` (default 64, max 64)
+
+---
+
+## 11. Dangling Reference Handling
+
+If the region referenced by `target_region_id` in NRHT is deleted, a dangling reference occurs.
+
+**Handling approach: Lazy validation**
+- On insert: validates that `target_region_id`'s RAT state is `ALLOCATED` → otherwise `-EINVAL`
+- On lookup: current implementation has no dangling check (caller's responsibility)
+- NRHT file and region lifecycles are **completely independent** — deleting a region does not automatically clean up NRHT entries referencing it
+
+---
+
+## 12. Internal Function Summary
+
+| Function | Role |
+|----------|------|
+| `nrht_get_header()` | RAT → phys_offset → DAX pointer → magic/version validation |
+| `nrht_get_shard_ctx()` | Read shard header → entry array DAX pointer → `nrht_shard_ctx` |
+| `nrht_resolve_bucket()` | Common prologue: hash → shard → determine bucket_head |
+| `nrht_name_matches()` | CL1 RMB + hash compare + name compare |
+| `nrht_is_stale()` | Same-node stale INSERTING detection (pure function, for GC integration) |
+| `nrht_claim_entry()` | CAS INSERTING + inserter_node/created_at stamp |
+| `nrht_check_duplicate()` | Chain walk: dup detection + record first dead entry (TOMBSTONE/EMPTY) for in-place reuse + inline unlink remaining dead entries |
+| `nrht_find_chain()` | Chain walk: search VALID entry + return prev_next (prefetch optimized) |
+| `nrht_link_to_bucket()` | CAS bucket prepend (publish is caller's responsibility) |
+| `nrht_post_insert_dedup()` | Under shard lock: chain walk dup detection, returns -EEXIST if duplicate VALID found |
+| `marufs_nrht_init()` | NRHT format: parameter validation → size calculation → zero + header/shard/bucket init |
+| `marufs_nrht_insert()` | Dup check → claim entry → write fields → TENTATIVE → shard lock → link → dedup → VALID/TOMBSTONE → unlock |
+| `marufs_nrht_lookup()` | Resolve bucket → find chain → return offset/region_id |
+| `marufs_nrht_delete()` | Resolve bucket → find chain → CAS VALID→TOMBSTONE (stays in chain) |
+| `marufs_nrht_gc_sweep_all()` | Enumerate NRHT via RAT bitmap → shard round-robin sweep → stale INSERTING→TOMBSTONE + orphan tracking |
diff --git a/marufs_kernel/docs/5_arch_acl.md b/marufs_kernel/docs/5_arch_acl.md
new file mode 100644
index 0000000..9313ef7
--- /dev/null
+++ b/marufs_kernel/docs/5_arch_acl.md
@@ -0,0 +1,268 @@
+# Doc 5: ACL / Permission Model
+
+> **Source files**: `acl.c` (permission check, delegation grant), `file.c` (check call sites, chown, perm_set_default), `dir.c` (unlink permission), `gc.c` (dead delegation sweep)
+
+---
+
+## 1. Overview
+
+marufs access control operates at the **RAT entry level**. Each file (region) has one RAT entry, and ownership, default permission, and delegation table are all stored in that RAT entry's cachelines.
+
+Key characteristics:
+- **Permission check occurs at actual access time**, not at open() (mmap, read, page_mkwrite, etc.)
+- **open() is always allowed** — supports challenge-response auth patterns
+- **Owner has implicit ALL** — no separate permission bit check needed
+- **Delegations are stored on CXL** — immediately visible cross-node
+
+---
+
+## 2. Permission Bits
+
+| Bit | Value | Meaning | Check point |
+|-----|-------|---------|-------------|
+| `PERM_READ` | 0x01 | read(), mmap(PROT_READ) | `read_iter`, `mmap` |
+| `PERM_WRITE` | 0x02 | mmap(PROT_WRITE), page_mkwrite | `mmap`, `page_mkwrite` |
+| `PERM_DELETE` | 0x04 | unlink | `unlink` |
+| `PERM_ADMIN` | 0x08 | chown, perm_set_default | `ioctl(CHOWN)`, `ioctl(PERM_SET_DEFAULT)` |
+| `PERM_IOCTL` | 0x10 | name_offset, clear_name, etc. | `ioctl(NAME_OFFSET)`, `ioctl(CLEAR_NAME)` |
+| `PERM_GRANT` | 0x20 | delegate to third parties | `ioctl(PERM_GRANT)` |
+| `PERM_ALL` | 0x3F | all | — |
+
+---
+
+## 3. Permission Check Flow
+
+**Function**: `marufs_check_permission()`
+
+```mermaid
+flowchart LR
+ Start(["check_permission(rat_id, required)"]) --> State{RAT state
== ALLOCATED?}
+ State -->|No| Deny(["-EACCES"])
+ State -->|Yes| Owner{"3.1 Owner?"}
+ Owner -->|Yes| Allow(["0 (allowed)"])
+ Owner -->|No| Default{"default_perms
sufficient?"}
+ Default -->|Yes| Allow
+ Default -->|No| Deleg{"3.2 Delegation
table scan"}
+ Deleg -->|match| Birth["3.3 Lazy
birth_time"]
+ Birth --> Allow
+ Deleg -->|no match| Deny
+
+ style Owner fill:#1E8449,color:#fff
+ style Deleg fill:#2E86C1,color:#fff
+```
+
+### 3.1 Owner Check (`marufs_is_owner`)
+
+3-stage comparison defending against PID reuse:
+
+```mermaid
+flowchart LR
+ Node{"node_id
== mine?"} -->|No| False(["not owner"])
+ Node -->|Yes| PID{"pid
== current?"}
+ PID -->|No| False
+ PID -->|Yes| Birth{"birth_time
== current?"}
+ Birth -->|No| False
+ Birth -->|Yes| True(["owner"])
+```
+
+- Reads `owner_node_id`, `owner_pid`, `owner_birth_time` from CL2
+- RMB ensures fresh read from CXL
+
+### 3.2 Delegation Table Scan
+
+RAT entry CL3-CL31 holds up to 29 delegation entries. Sequential scan:
+
+```mermaid
+flowchart LR
+ Entry["deleg_entries[i]"] --> State{state == ACTIVE?}
+ State -->|No| Next["next"]
+ State -->|Yes| Match{"(node_id, pid)
match?"}
+ Match -->|No| Next
+ Match -->|Yes| Perms{"(perms & required)
== required?"}
+ Perms -->|No| Next
+ Perms -->|Yes| Birth{"birth_time
verify"}
+ Birth -->|OK| Allow(["match"])
+ Birth -->|mismatch| Next
+```
+
+When `birth_time == 0`: delegation has been granted but the delegated process hasn't made its first access yet. Treated as match and lazy init is performed (3.3).
+
+### 3.3 Lazy birth_time Init
+
+Since the delegated process's `birth_time` may not be known at grant time (it could be a process on another node), it is stamped via CAS on first access:
+
+```c
+if (de->birth_time == 0)
+ CAS(de->birth_time, 0, current->start_boottime);
+```
+
+After this, GC can determine dead processes via `birth_time`.
+
+---
+
+## 4. Permission Check Timing
+
+```mermaid
+sequenceDiagram
+ participant App
+ participant VFS
+ participant marufs
+
+ App->>VFS: open(path, flags)
+ VFS->>marufs: marufs_open()
+ Note over marufs: always allowed (no perm check)
+ marufs-->>App: fd
+
+ App->>VFS: read(fd, buf, len)
+ VFS->>marufs: read_iter()
+ marufs->>marufs: check_permission(PERM_READ)
+ marufs-->>App: data or -EACCES
+
+ App->>VFS: mmap(fd, PROT_READ|PROT_WRITE)
+ VFS->>marufs: mmap()
+ marufs->>marufs: check_permission(PERM_READ)
+ marufs->>marufs: check_permission(PERM_WRITE)
+ marufs-->>App: addr or -EACCES
+
+ App->>App: *addr = data (write fault)
+ VFS->>marufs: page_mkwrite()
+ marufs->>marufs: check_permission(PERM_WRITE)
+ marufs-->>VFS: VM_FAULT_LOCKED or SIGBUS
+
+ App->>VFS: unlink(path)
+ VFS->>marufs: marufs_unlink()
+ marufs->>marufs: check_permission(PERM_DELETE)
+ Note over marufs: on failure, check force_unlink
+ marufs-->>App: 0 or -EACCES
+```
+
+**Why open() is always allowed**: In the pattern where a daemon creates a region and passes the fd to another process, the receiving process may not have a delegation at open time. After open, delegation is granted → permission check at subsequent mmap/read.
+
+---
+
+## 5. Delegation Grant Flow
+
+**Function**: `marufs_deleg_grant()`, **ioctl**: `MARUFS_IOC_PERM_GRANT`
+
+### 5.1 Permission Validation
+
+The scope of grantable permissions depends on the requester's permissions:
+
+| Requester permission | Grantable scope |
+|---------------------|-----------------|
+| Owner or ADMIN | All permissions (including ADMIN, GRANT) |
+| GRANT | READ, WRITE, DELETE, IOCTL only (cannot grant ADMIN or GRANT itself) |
+| Other | -EACCES |
+
+### 5.2 Grant Flow
+
+```mermaid
+flowchart LR
+ Req(["grant(node_id, pid, perms)"]) --> Validate{"perms valid?
node_id > 0?
pid > 0?"}
+ Validate -->|No| Fail(["-EINVAL"])
+ Validate -->|Yes| Scan["Delegation table scan"]
+ Scan --> Found{"existing (node_id, pid)
match?"}
+ Found -->|Yes| Upsert["CAS-loop:
perms |= new_perms"]
+ Found -->|No| Free{"free slot?"}
+ Free -->|No| Full(["-ENOSPC"])
+ Free -->|Yes| Claim["CAS: EMPTY → GRANTING"]
+ Claim -->|fail| Retry{"retry < 3?"}
+ Retry -->|Yes| Scan
+ Retry -->|No| Fail2(["-EAGAIN"])
+ Claim -->|success| Fill["field write + WMB"]
+ Fill --> Publish["state = ACTIVE"]
+ Upsert --> Done(["0 (success)"])
+ Publish --> Done
+```
+
+### 5.3 Delegation Entry State Transitions
+
+```mermaid
+flowchart LR
+ EMPTY -->|"grant (CAS claim)"| GRANTING
+ GRANTING -->|"field write + publish"| ACTIVE
+ ACTIVE -->|"upsert (CAS perms OR)"| ACTIVE
+ ACTIVE -->|"GC: dead process"| EMPTY
+ GRANTING -->|"GC: stale timeout 5s"| EMPTY
+```
+
+| Transition | CAS condition | Function |
+|------------|---------------|----------|
+| EMPTY → GRANTING | `CAS(state, EMPTY, GRANTING)` | `marufs_deleg_grant()` |
+| GRANTING → ACTIVE | `WRITE_ONCE(state, ACTIVE)` | `marufs_deleg_grant()` |
+| ACTIVE → ACTIVE | `CAS-loop(perms, old, old\|new)` | `marufs_deleg_try_upsert()` |
+| ACTIVE → EMPTY | `CAS(state, ACTIVE, EMPTY)` | `gc.c` delegation sweep |
+| GRANTING → EMPTY | `CAS(state, GRANTING, EMPTY)` | `gc.c` delegation sweep |
+
+---
+
+## 6. Default Permission
+
+**Function**: `marufs_ioctl_perm_set_default()`, **ioctl**: `MARUFS_IOC_PERM_SET_DEFAULT`
+
+Requires ADMIN permission. Sets the `default_perms` field of the RAT entry, granting baseline permissions to all non-owners without delegation.
+
+```c
+WRITE_LE16(rat_entry->default_perms, perms);
+MARUFS_CXL_WMB(rat_entry, sizeof(*rat_entry));
+```
+
+Common usage patterns:
+- `default_perms = PERM_READ`: allow read from all nodes
+- `default_perms = PERM_READ | PERM_WRITE`: allow read/write from all nodes
+- `default_perms = 0`: only owner and delegated processes can access
+
+---
+
+## 7. Dead Process Detection
+
+**Function**: `marufs_owner_is_dead()`
+
+```mermaid
+flowchart LR
+ PID{"pid == 0?"} -->|Yes| Alive(["false (not recorded)"])
+ PID -->|No| Find["find_get_pid(pid)"]
+ Find --> Exists{exists?}
+ Exists -->|No| Dead1(["true"])
+ Exists -->|Yes| Task["get_pid_task()"]
+ Task --> TaskOK{task exists?}
+ TaskOK -->|No| Dead2(["true"])
+ TaskOK -->|Yes| Birth{"birth_time
match?"}
+ Birth -->|No| Dead3(["true (PID reuse)"])
+ Birth -->|Yes| Alive2(["false"])
+```
+
+Detects PID reuse via `birth_time` comparison. Uses `start_boottime` (monotonic from boot), so collision probability is extremely low even after reboot.
+
+---
+
+## 8. CXL Storage Locations
+
+Permission-related fields are distributed across the RAT entry:
+
+| CL | Field | Purpose |
+|----|-------|---------|
+| CL2 | `default_perms` (2B) | non-owner baseline |
+| CL2 | `owner_node_id` (2B) | owner node |
+| CL2 | `owner_pid` (4B) | owner process |
+| CL2 | `owner_birth_time` (8B) | PID reuse defense |
+| CL2 | `uid`, `gid`, `mode` (8B) | POSIX compatibility (future) |
+| CL2 | `deleg_num_entries` (2B) | active delegation count |
+| CL3-CL31 | `deleg_entries[29]` (29 × 64B) | delegation table |
+
+Owner check (CL2) and delegation scan (CL3+) are in separate CLs, so the owner fast-path fetches only CL2.
+
+---
+
+## 9. Internal Function Summary
+
+| Function | Role |
+|----------|------|
+| `marufs_check_permission()` | Unified permission check: owner → default → delegation |
+| `marufs_is_owner()` | 3-stage owner detection (node_id → pid → birth_time) |
+| `marufs_deleg_matches()` | Delegation entry matching (node_id + pid + birth_time + perms) |
+| `marufs_deleg_grant()` | Delegation grant: upsert or new entry CAS claim |
+| `marufs_deleg_try_upsert()` | Delegation table scan: perms OR on existing match, else return free slot |
+| `marufs_deleg_entry_clear()` | Delegation entry field reset (state is caller's responsibility) |
+| `marufs_owner_is_dead()` | PID lookup + birth_time comparison for dead process detection |
+| `marufs_ioctl_perm_set_default()` | Set default_perms (requires ADMIN) |
diff --git a/marufs_kernel/docs/6_arch_mount_io.md b/marufs_kernel/docs/6_arch_mount_io.md
new file mode 100644
index 0000000..a2dc395
--- /dev/null
+++ b/marufs_kernel/docs/6_arch_mount_io.md
@@ -0,0 +1,392 @@
+# Doc 6: Mount/Unmount & I/O Flow
+
+> **Source files**: `super.c` (mount/unmount, format), `file.c` (read_iter, mmap, open/release), `dir.c` (create, unlink, readdir, lookup), `inode.c` (iget, setattr, cross-node i_size sync)
+
+---
+
+## 1. Overview
+
+The system-level lifecycle of marufs consists of **mount → file I/O → unmount**. At mount, CXL memory is mapped and metadata structures are validated; at unmount, resources are released in reverse order.
+
+Key characteristics:
+- **CXL DAX device based**: DEV_DAX character device (`/dev/daxN.M`) directly mapped via `memremap`
+- **Two-phase create**: `open(O_CREAT)` → RAT reservation + index insert (lightweight), `ftruncate()` → region allocation (physical space)
+- **d_revalidate=0**: VFS dentry cache invalidation ensures cross-node consistency
+
+---
+
+## 2. Mount Options
+
+| Option | Format | Description |
+|--------|--------|-------------|
+| `node_id` | `node_id=N` (N > 0) | Node ID for multi-node access control |
+| `daxdev` | `daxdev=/dev/daxN.M` | DEV_DAX character device path |
+| `format` | `format` | Perform format at mount time |
+
+---
+
+## 3. Mount Flow
+
+**Function**: `marufs_fill_super()` → `marufs_fill_super_common()`
+
+```mermaid
+flowchart LR
+ Parse["Parse mount options"] --> DevDAX["DEV_DAX init"]
+
+ DevDAX --> Format{"format option?"}
+ Format -->|Yes| DoFormat["marufs_format_device()"]
+ Format -->|No| SkipFormat["Skip (preserve existing data)"]
+
+ DoFormat --> ReadSB
+ SkipFormat --> ReadSB
+
+ ReadSB["Load + validate superblock"] --> ActiveNodes["active_nodes_set
(duplicate mount detection)"]
+
+ ActiveNodes --> ShardTable["Build shard_cache (DRAM)"]
+
+ ShardTable --> RAT["Load + validate RAT"]
+
+ RAT --> VFS["Configure VFS superblock"]
+
+ VFS --> RootInode["Create root inode"]
+
+ RootInode --> Cache["Init entry cache"]
+
+ Cache --> Sysfs["Register sysfs"]
+
+ Sysfs --> GC["Start GC thread"]
+```
+
+### 3.1 DEV_DAX Memory Mapping
+
+`marufs_dax_acquire_devdax()`:
+
+1. Read physical address (`resource`) and size (`size`) from sysfs
+2. `memremap(phys_addr, dev_size, MEMREMAP_WB)` → `sbi->dax_base`
+3. `pfn_valid()` → detect ZONE_DEVICE struct pages (determines `VM_MIXEDMAP` for GPU DMA)
+4. Open DAX device file (`sbi->dax_filp`) → for mmap delegation to device_dax driver
+5. Store `sbi->phys_base` (for remap_pfn_range fallback)
+
+User mmap delegates to device_dax driver for WC (Write-Combining) pgprot. NVIDIA driver recognizes device_dax VMAs, enabling `cudaHostRegister`.
+
+### 3.2 Format Procedure (`marufs_format_device`)
+
+Initializes the entire CXL memory metadata area during format:
+
+1. **Zero clear**: `memset` + WMB
+2. **Superblock**: magic, version, total_size, shard geometry
+3. **Shard table**: per-shard header (magic, bucket/entry array offsets)
+4. **Bucket arrays**: initialize all to `MARUFS_BUCKET_END`
+5. **Entry arrays**: zero (ENTRY_EMPTY = 0)
+6. **RAT**: magic, version, regions_start, total_free
+7. **Verification readback**: RMB then re-verify critical fields
+
+---
+
+## 4. Unmount Flow
+
+**Function**: `marufs_kill_sb()`
+
+```mermaid
+flowchart LR
+ GC["Stop GC thread"] --> ActiveNodes["active_nodes_clear"]
+ ActiveNodes --> Sysfs["Unregister sysfs"]
+ Sysfs --> Cache["Destroy entry cache"]
+ Cache --> Shard["Free shard resources"]
+ Shard --> DaxFilp["Close dax_filp"]
+ DaxFilp --> ReleaseDax["DAX memunmap"]
+ ReleaseDax --> Free["Free superblock"]
+```
+
+Key ordering constraints:
+- **GC must stop first**: If GC is iterating RAT/index when the mapping is released → panic
+- **active_nodes_clear → sysfs/cache release → DAX release**: Notify other nodes of unmount before releasing resources
+
+---
+
+## 5. File I/O Paths
+
+### 5.1 Create (Two-Phase Model)
+
+**Function**: `marufs_create()` (Phase 1), `marufs_setattr()` (Phase 2)
+
+```mermaid
+sequenceDiagram
+ participant App
+ participant VFS
+ participant dir.c
+ participant region.c
+ participant index.c
+ participant inode.c
+
+ Note over App,inode.c: Phase 1: open(O_CREAT) — lightweight reservation
+
+ App->>VFS: open(path, O_CREAT)
+ VFS->>dir.c: marufs_create()
+ dir.c->>region.c: marufs_rat_alloc_entry(name, 0, 0)
+ region.c-->>dir.c: rat_entry_id
+ Note over dir.c: write uid/gid/mode
+ dir.c->>index.c: marufs_index_insert(name, rat_entry_id)
+ index.c-->>dir.c: entry_idx
+ dir.c->>dir.c: marufs_new_inode() + d_instantiate()
+ dir.c-->>App: fd (i_size=0, data_phys_offset=0)
+
+ Note over App,inode.c: Phase 2: ftruncate(size) — physical allocation
+
+ App->>VFS: ftruncate(fd, size)
+ VFS->>inode.c: marufs_setattr(ATTR_SIZE)
+ inode.c->>inode.c: check_permission(PERM_WRITE)
+ Note over inode.c: if i_size>0 then -EACCES (no realloc)
+ inode.c->>region.c: marufs_region_init(rat_entry_id, size)
+ region.c-->>inode.c: phys_offset
+ inode.c->>inode.c: truncate_setsize() + rat_sync_size()
+ inode.c-->>App: 0
+```
+
+Phase 1 failure rollback:
+- index insert failure → `marufs_rat_free_entry()`
+- inode creation failure → `marufs_index_delete()` + `marufs_rat_free_entry()`
+
+### 5.2 Lookup
+
+**Function**: `marufs_lookup()`
+
+```mermaid
+flowchart LR
+ Lookup(["lookup(name)"]) --> Hash["marufs_hash_name()"]
+ Hash --> Shard["shard selection
(hash & shard_mask)"]
+ Shard --> Index["marufs_index_lookup()
bucket → chain walk"]
+ Index --> Found{found?}
+ Found -->|No| Neg["d_splice_alias(NULL)
→ negative dentry"]
+ Found -->|Yes| Iget["marufs_iget()
read metadata from RAT"]
+ Iget --> Splice["d_splice_alias(inode)"]
+```
+
+`marufs_iget()` **always performs a fresh read from CXL**:
+- Even cached inodes re-read `i_size`, `data_phys_offset`, ownership from RAT
+- Returns `-ESTALE` if GC has freed the RAT entry
+
+### 5.3 Read
+
+**Function**: `marufs_read_iter()`
+
+```mermaid
+sequenceDiagram
+ participant App
+ participant VFS
+ participant file.c
+ participant CXL
+
+ App->>VFS: read(fd, buf, len)
+ VFS->>file.c: read_iter(iocb, to)
+
+ file.c->>file.c: check_permission(PERM_READ)
+ Note over file.c: return -EACCES on failure
+
+ file.c->>CXL: read fresh size from RAT entry
+ Note over file.c: cross-node i_size sync
(reflects remote ftruncate)
+
+ alt data_phys_offset == 0
+ file.c->>CXL: re-check RAT phys_offset
+ Note over file.c: update data_phys_offset
after remote ftruncate
+ end
+
+ file.c->>file.c: pos/count boundary check
+ file.c->>CXL: RMB(data_ptr, count)
+ Note over CXL: CPU cache invalidation
(ensures WB reader sees
fresh data from WC writer)
+ CXL-->>file.c: data
+ file.c->>App: copy_to_iter() → bytes read
+```
+
+**Direct read** — bypasses page cache, copies directly from CXL memory via `copy_to_iter()`:
+- Copies directly from `sbi->dax_base + data_phys_offset + pos`
+- RMB ensures cross-node freshness
+
+### 5.4 mmap
+
+**Function**: `marufs_mmap()`
+
+```mermaid
+flowchart LR
+ Start(["mmap(fd, prot, flags)"]) --> RdOnly{"VM_WRITE on
O_RDONLY fd?"}
+ RdOnly -->|Yes| Deny1(["-EACCES"])
+ RdOnly -->|No| PermR["check_permission(PERM_READ)"]
+ PermR -->|fail| Deny2(["-EACCES"])
+ PermR -->|ok| WriteCheck{"VM_WRITE?"}
+ WriteCheck -->|Yes| PermW["check_permission(PERM_WRITE)"]
+ WriteCheck -->|No| Route
+ PermW -->|fail| Deny3(["-EACCES"])
+ PermW -->|ok| Route
+
+ Route{"dax_filp
available?"}
+ Route -->|Yes| Delegate["device_dax mmap delegation
vma_set_file(dax_filp)
→ NVIDIA cudaHostRegister compatible
+ pgprot_writecombine()"]
+ Route -->|No| Fallback["remap_pfn_range
+ pgprot_writecombine()
VM_PFNMAP"]
+
+ Delegate --> Done(["return 0"])
+ Fallback --> Done
+```
+
+Page fault paths after mmap:
+
+| Fault type | Handler | Behavior |
+|-----------|---------|----------|
+| Read fault | `marufs_fault()` → `filemap_fault()` | Map page from page cache |
+| Write fault | `marufs_page_mkwrite()` | `check_permission(PERM_WRITE)` → page lock → dirty marking |
+
+### 5.5 Unlink
+
+**Function**: `marufs_unlink()`
+
+```mermaid
+sequenceDiagram
+ participant App
+ participant VFS
+ participant dir.c
+ participant index.c
+ participant region.c
+
+ App->>VFS: unlink(path)
+ VFS->>dir.c: marufs_unlink()
+
+ dir.c->>dir.c: check_permission(PERM_DELETE)
+ alt permission denied
+ dir.c->>dir.c: marufs_can_force_unlink()?
+ Note over dir.c: dead owner +
no active delegations → allow
+ end
+
+ dir.c->>index.c: marufs_index_delete(name)
+ Note over index.c: CAS: VALID → TOMBSTONE
+
+ dir.c->>region.c: marufs_unlink_cleanup_region()
+ Note over region.c: CAS: ALLOCATED → DELETING
→ marufs_rat_free_entry()
+
+ dir.c->>dir.c: drop_nlink(inode)
+ dir.c-->>App: 0
+```
+
+Deletion order constraint: **index TOMBSTONE first → RAT FREE** (reversing would create dangling reference). If CAS `ALLOCATED→DELETING` fails in `marufs_unlink_cleanup_region()`, GC is assumed to have already handled it, skip.
+
+---
+
+## 6. Readdir
+
+**Function**: `marufs_iterate()`
+
+RAT-based readdir — iterates only the RAT entry array (256 entries) instead of the index entry array (4 shards × 256 = 1024):
+
+```mermaid
+flowchart LR
+ Start(["readdir()"]) --> Dot["ctx->pos=0: emit '.'"]
+ Dot --> DotDot["ctx->pos=1: emit '..'"]
+ DotDot --> Scan["i = ctx->pos - 2"]
+ Scan --> Loop{"i < MAX_RAT_ENTRIES?"}
+ Loop -->|No| Done(["return 0"])
+ Loop -->|Yes| RMB["RMB(rat_entry[i])"]
+ RMB --> State{"state ==
ALLOCATED?"}
+ State -->|No| Next["i++"]
+ State -->|Yes| Name{"name_len > 0?"}
+ Name -->|No| Next
+ Name -->|Yes| Emit["dir_emit(name, ino)"]
+ Emit --> Full{"buffer full?"}
+ Full -->|Yes| Save["ctx->pos = i + 3"]
+ Full -->|No| Next
+ Next --> Loop
+ Save --> Done
+```
+
+`ctx->pos` encoding: `0`=`.`, `1`=`..`, `2+i`=RAT entry[i]. When VFS re-enters after buffer full, it resumes from the saved pos.
+
+---
+
+## 7. d_revalidate=0 — Cross-Node Consistency
+
+```mermaid
+sequenceDiagram
+ participant Node_A
+ participant CXL
+ participant Node_B
+
+ Node_A->>CXL: create("file_x") → index insert + RAT alloc
+ Note over Node_B: "file_x" not in
existing dentry cache
+
+ Node_B->>Node_B: open("file_x")
+ Note over Node_B: d_revalidate() → return 0
→ VFS invalidates dentry
+ Node_B->>CXL: marufs_lookup("file_x")
+ CXL-->>Node_B: index entry (VALID)
+ Node_B->>CXL: marufs_iget() → metadata from RAT
+ Node_B-->>Node_B: return fd
+```
+
+Since `marufs_d_revalidate()` always returns 0, **every open() performs a fresh lookup from CXL**. This means:
+- create/unlink from other nodes is immediately visible
+- Dentry cache performance loss exists, but CXL latency (~200ns) makes it practical
+- `i_size` for already-opened fds is separately refreshed from RAT in `read_iter()`
+
+---
+
+## 8. Cross-Node i_size Synchronization
+
+Path for a local node to observe size set by a remote node's `ftruncate()`:
+
+| Timing | Function | Behavior |
+|--------|----------|----------|
+| `open()` → lookup | `marufs_iget()` → `marufs_inode_fill_from_entry()` | RAT `size` field → `inode->i_size` |
+| `read()` | `marufs_read_iter()` | RAT `size` re-read → `i_size_write()` |
+| `stat()` | `marufs_getattr()` | RAT `size` re-read → `inode->i_size` |
+| write-back | `marufs_write_inode()` → `marufs_rat_sync_size()` | `inode->i_size` → RAT `size` + `modified_at` |
+
+`data_phys_offset` is refreshed the same way: in `read_iter()`, if `xi->data_phys_offset == 0`, RAT `phys_offset` is re-checked to detect whether the region has been initialized after a remote ftruncate.
+
+---
+
+## 9. Internal Function Summary
+
+### Mount/Unmount
+
+| Function | Role |
+|----------|------|
+| `marufs_fill_super()` | Mount entry point: option parsing, DAX init, format, call common |
+| `marufs_fill_super_common()` | superblock → active_nodes_set → shard table → RAT → VFS → root inode → cache → sysfs → GC |
+| `marufs_kill_sb()` | Unmount: GC stop → active_nodes_clear → sysfs → cache → shard resources → dax_filp close → DAX memunmap |
+| `marufs_dax_acquire_devdax()` | Read sysfs phys/size → memremap(WB) → ZONE_DEVICE detection |
+| `marufs_dax_release()` | `memunmap(dax_base)` |
+| `marufs_format_device()` | superblock/shard/bucket/RAT init + WMB + readback verification |
+| `marufs_read_superblock()` | magic/version validation, geometry copy to sbi |
+| `marufs_init_shard_table()` | shard header validation, build shard_cache (DRAM) |
+| `marufs_load_rat()` | RAT magic/version validation, set sbi->rat |
+| `marufs_active_nodes_set()` | CAS to set active_nodes bit (duplicate mount detection) |
+| `marufs_active_nodes_clear()` | CAS to clear active_nodes bit |
+
+### File I/O
+
+| Function | Role |
+|----------|------|
+| `marufs_open()` | `generic_file_open()` + allocate batch buffer (`marufs_file_priv`) |
+| `marufs_release()` | Free batch buffer (`kvfree(private_data)`) |
+| `marufs_read_iter()` | PERM_READ check → cross-node size sync → CXL direct copy |
+| `marufs_write_iter()` | Rejects `write()` syscall (`-EACCES`) |
+| `marufs_mmap()` | PERM_READ/WRITE check → device_dax mmap delegation |
+| `marufs_fault()` | Delegates to `filemap_fault()` (page cache read fault) |
+| `marufs_page_mkwrite()` | PERM_WRITE check → page lock → dirty marking |
+| `marufs_read_folio()` | zero-fill (data is populated via mmap write) |
+| `marufs_ioctl()` | NRHT, permission, chown, etc. |
+
+### Directory
+
+| Function | Role |
+|----------|------|
+| `marufs_lookup()` | Global index hash lookup → `marufs_iget()` |
+| `marufs_create()` | Two-phase Phase 1: RAT reservation + index insert |
+| `marufs_unlink()` | PERM_DELETE check → index TOMBSTONE → RAT DELETING→FREE |
+| `marufs_iterate()` | RAT-based readdir (iterate 256 entries) |
+
+### Inode
+
+| Function | Role |
+|----------|------|
+| `marufs_iget()` | index entry + RAT → create/refresh VFS inode (cross-node fresh read) |
+| `marufs_new_inode()` | Create new VFS inode (caller fills region_id, etc.) |
+| `marufs_setattr()` | Two-phase Phase 2: realloc check → `marufs_region_init()` → size sync |
+| `marufs_getattr()` | Fresh size read from RAT → `generic_fillattr()` |
+| `marufs_write_inode()` | `marufs_rat_sync_size()` — write size + modified_at to RAT |
+| `marufs_evict_inode()` | Page cache truncate + `clear_inode()` |
diff --git a/marufs_kernel/docs/7_arch_me.md b/marufs_kernel/docs/7_arch_me.md
new file mode 100644
index 0000000..0ec9a39
--- /dev/null
+++ b/marufs_kernel/docs/7_arch_me.md
@@ -0,0 +1,409 @@
+# ME Protocol: Token Lifecycle
+
+Cross-node mutual exclusion via CXL shared memory. Two strategies (order-driven, request-driven) share the same layout and differ only in **who's-next** decision.
+
+Source: `src/me.h`, `src/me.c`, `src/me_order.c`, `src/me_request.c`.
+
+---
+
+## 1. Shared State Layout
+
+Physical byte layout:
+
+```
+[Header 64B] [CB × S] [Membership × N] [Slot × S × N]
+```
+
+| Struct | Scope | Writer | Reader | Purpose |
+|---|---|---|---|---|
+| `marufs_me_header` | area | formatter | all | magic/offsets/`format_generation` |
+| `marufs_me_cb` | per-shard | current holder | all | `magic`, `holder`, `generation`, `acquire_count` (reserved `state` field is on-disk but currently unused) |
+| `marufs_me_membership_slot` | per-node | own node | all | `magic`, `status`, `node_id`, `joined_at`, `heartbeat`, `heartbeat_ts`, `pending_shards_mask` |
+| `marufs_me_slot` | per-(shard, node) | see below | target node | `magic`, `from_node`, doorbell (`token_seq`, `cb_gen_at_write`) + request payload (`sequence`, `requesting`, `requested_at`, `granted_at`) |
+
+**Slot writer rule**:
+- OD (token arrival): **current holder** writes successor's slot (`token_seq`, `cb_gen_at_write`).
+- RD (hand-raise): **requester** writes own slot (`requesting=1`, `sequence`, `requested_at`).
+- RD (grant): **current holder** writes requester's slot (`granted_at`, then OD-style `token_seq` bump).
+
+Invariant: each slot has at most **1 writer / 1 reader** at any moment → no multi-host hot-polling of a shared CL.
+
+---
+
+## 2. Overview
+
+Two abstraction levels: per-node lifecycle (coarse phases from mount to unmount) and per-(shard, node) state machine (fine-grained token ownership states within steady state). Thread-level sequence and memory-level byte evolution live in §3.
+
+### 2.1 Lifecycle (per node)
+
+Coarse-grained phases of a single node from mount to unmount. The `steady state` phase is where all useful work happens — §2.2 defines per-shard states within that phase.
+
+```mermaid
+flowchart LR
+ MOUNT([mount]):::evt
+ JOIN["join
claim NONE shards"]:::phase
+ ACTIVE["active
(see §3 for thread view)"]:::active
+ LEAVE["leave
hand off held tokens"]:::phase
+ UMOUNT([unmount]):::evt
+
+ MOUNT --> JOIN --> ACTIVE --> LEAVE --> UMOUNT
+
+ classDef evt fill:#eeeeee,stroke:#555,color:#000
+ classDef phase fill:#e3f2fd,stroke:#1976d2,color:#000
+ classDef active fill:#fff3e0,stroke:#e65100,color:#000,font-weight:bold
+```
+
+### 2.2 State Machine (per shard, per node)
+
+```mermaid
+flowchart TD
+ NONE1["NONE
not a member"]:::none
+ MEMBER["MEMBER
no token"]:::active
+ HOLDER_BUSY["HOLDER_BUSY
holds token, in CS"]:::holder
+ HOLDER_IDLE["HOLDER_IDLE
holds token, no CS"]:::idle
+
+ NONE1 -->|join| MEMBER
+ MEMBER -->|token arrives, self-takeover on timeout| HOLDER_BUSY
+ HOLDER_BUSY -->|release| HOLDER_IDLE
+ HOLDER_IDLE -->|pass_token| MEMBER
+ HOLDER_IDLE -->|reuse token| HOLDER_BUSY
+ HOLDER_IDLE -->|leave| NONE1
+ MEMBER -->|leave| NONE1
+
+ classDef none fill:#eeeeee,stroke:#888,color:#000
+ classDef active fill:#e3f2fd,stroke:#1976d2,color:#000
+ classDef holder fill:#fff3e0,stroke:#e65100,color:#000
+ classDef idle fill:#f3e5f5,stroke:#6a1b9a,color:#000
+```
+
+Crash recovery: no dedicated state transition on remote-holder death. A thread calling `acquire` in the MEMBER state times out in `wait_for_token`, checks the holder's `heartbeat_ts` stall, runs `me_pass_token(self)`, and transitions to HOLDER_BUSY. No distributed watcher — the blocked acquirer is the only trigger.
+
+## 3. Thread Interaction & Memory Access
+
+### 3.1 Order-Driven: Token Pass (A → B)
+
+OD token circulates autonomously — receiver doesn't ask. PollThread_B has real work (flip `is_holder` even when no app thread waits).
+
+```mermaid
+sequenceDiagram
+ autonumber
+ participant AppThread_A
+ participant MEM as CXL_ME
+ participant PollThread_B
+ participant AppThread_B
+
+ rect rgb(255,248,240)
+ Note over AppThread_A: sender: release after CS on shard s
+ AppThread_A->>AppThread_A: ME_UNHOLD, me_shard_passable check
+ AppThread_A->>AppThread_A: read cached_successor (DRAM, = B)
+ AppThread_A->>MEM: me_pass_token: WRITE CB (holder=B, gen++), WMB
+ AppThread_A->>MEM: WRITE Slot[s,B].token_seq++ (doorbell)
+ AppThread_A->>AppThread_A: ME_LOSE_HOLDER(sh) (own is_holder=false)
+ end
+
+ Note over PollThread_B,AppThread_B: receiver side — two mutually exclusive cases:
+ alt Case A: app thread already in acquire() when doorbell arrives
+ rect rgb(240,255,240)
+ Note over AppThread_B: AppThread_B is looping in wait_for_token
+ AppThread_B->>MEM: RMB own Slot.token_seq
+ MEM-->>AppThread_B: bumped
+ AppThread_B->>MEM: RMB CB (double-check holder==self, gen>last_cb_gen)
+ AppThread_B->>AppThread_B: ME_BECOME_HOLDER(sh), return 0
+ AppThread_B->>MEM: acquire_count++
+ Note over AppThread_B: enter CS (PollThread_B's detection would be redundant)
+ end
+ else Case B: no app thread waiting
+ rect rgb(240,248,255)
+ Note over PollThread_B: PollThread_B is fallback consumer — poll_cycle 10us
+ PollThread_B->>MEM: RMB own Slot[s,B].token_seq
+ MEM-->>PollThread_B: seq bumped
+ PollThread_B->>PollThread_B: poll_last_slot_seq = cur, ME_BECOME_HOLDER(sh)
+ PollThread_B->>MEM: tick heartbeat (once per cycle, first holder shard)
+ Note over PollThread_B: passable + successor!=self? → auto-forward
else keep token until app acquires
+ end
+ rect rgb(240,255,240)
+ Note over AppThread_B: later, AppThread_B calls acquire()
+ AppThread_B->>AppThread_B: mutex_lock, ME_HOLD, wait_for_token fast path
+ AppThread_B->>AppThread_B: ME_IS_HOLDER(sh) == true → return 0 immediately
+ AppThread_B->>MEM: acquire_count++
+ Note over AppThread_B: enter CS
+ end
+ end
+
+ rect rgb(255,235,205)
+ Note over AppThread_B: post-CS: release
+ AppThread_B->>AppThread_B: passable + cached_successor!=self → pass to next ring node
(else keep, avoid ping-pong when B dominates)
+ end
+```
+
+**Memory access summary**:
+- A reads: own `CB[s].holder` (confirm self), `cached_successor` (DRAM).
+- A writes: `CB[s]` (holder, gen), then `Slot[s,B]` (doorbell). CB first so any reader seeing new `token_seq` is guaranteed to see fresh CB.
+- B reads: `Slot[s,B].token_seq` (poll), `CB[s]` (double-check).
+- B writes: nothing until post-CS release.
+
+**OD-specific characteristics**:
+- No `requesting` / `pending_shards_mask` writes — token flows purely on ring order.
+- Holder picks successor from `cached_successor` (DRAM, refreshed by poll's `next_active` scan) — no request slot scan.
+- Release decision: pass if passable AND `cached_successor != self`; keep-token if local app hot (avoid cross-node ping-pong).
+- **Receiver-side detection is shared**: both RD and OD use the same `wait_for_token` — if an app thread is already waiting when the doorbell arrives, it detects the `token_seq` bump itself (slow-path loop) and flips `is_holder`. Poll thread's detection is redundant in that case.
+- **Poll thread's OD-unique role**: token can arrive at a node with **no local waiter** (OD doesn't require a request). PollThread_B is then the sole consumer — flips `is_holder`, ticks heartbeat, and may auto-forward to the next ring node (autonomous circulation). If an app thread shows up later, `wait_for_token` fast path (`ME_IS_HOLDER`) returns immediately without CB RMB.
+
+---
+
+### 3.2 Request-Driven: Request + Grant (B wants token from A)
+
+app thread = VFS-side `acquire/release`. poll thread = `poll_cycle` on `MAppThread_BUFS_ME_DEFAULT_POLL_US` period (10µs default). **CB is written only on grant, read only on grant/takeover check**. Steady-state path polls per-node slot + membership only.
+
+```mermaid
+sequenceDiagram
+ autonumber
+ participant AppThread_B
+ participant MEM as CXL_ME
+ participant PollThread_A
+ participant AppThread_A
+
+ rect rgb(240,255,240)
+ Note over AppThread_B: requester: raise hand on shard s
+ AppThread_B->>MEM: WRITE Slot.sequence++ (fence stale grants from prior attempts)
+ AppThread_B->>MEM: WRITE Slot.requesting=1 (raise hand - holder now owns writes to this slot)
+ AppThread_B->>MEM: WMB Slot (publish fully populated request before bit visible)
+ AppThread_B->>MEM: CAS Membership.pending_mask |= (1<>AppThread_A: me_shard_passable check (no local waiters, hold=0)
+ AppThread_A->>MEM: full Slot scan (release path - mask snapshot may be stale)
+ MEM-->>AppThread_A: peer B Slot requesting = 1
+ AppThread_A->>MEM: me_pass_token: CB first (holder=B, gen++), then Slot[s,B].token_seq++
+ AppThread_A->>AppThread_A: own is_holder = false
+ end
+ else Path 2: grant via poll (backstop — holder idle, no release traffic)
+ rect rgb(240,248,255)
+ Note over PollThread_A: PollThread_A on poll_cycle 10us
+ PollThread_A->>MEM: RMB peer Membership.pending_mask (one pass - amortized)
+ MEM-->>PollThread_A: peers_pending has bit s
+ PollThread_A->>MEM: RMB own Slot.token_seq (doorbell - holder transition w/o CB)
+ PollThread_A->>MEM: masked Slot scan (skip RMB on nodes with bit clear)
+ PollThread_A->>MEM: me_pass_token (same CB→doorbell as release path)
+ end
+ end
+
+ rect rgb(240,255,240)
+ Note over AppThread_B: requester wakes, enters CS
+ AppThread_B->>MEM: RMB own Slot.token_seq (changed - wake trigger)
+ AppThread_B->>MEM: RMB CB (double-check: holder=B AND gen>last_cb_gen, rejects phantom/stale pass)
+ AppThread_B->>MEM: WRITE Slot.requesting=0 (return slot ownership to self post-CS)
+ AppThread_B->>MEM: CAS Membership.pending_mask &= ~(1<>AppThread_B: me_shard_passable: yes → scan+grant immediately (else keep token, avoid ping-pong)
+ end
+```
+
+**Memory access summary**:
+- B writes (raise hand): `Slot[s,B].{sequence, requesting, requested_at}`, then `Membership[B].pending_shards_mask` bit.
+- A reads (scan): `Slot[s, idx].requesting` for `idx = me_idx+1 .. max_nodes` (round-robin).
+- A writes (grant): `Slot[s,B].granted_at`, then CB (`holder`, `gen++`), then `Slot[s,B].token_seq++` (doorbell).
+- B reads (wake): `Slot[s,B].token_seq`, `CB[s]`.
+- B writes (post-CS): `Slot[s,B].requesting = 0`, then clear mask bit.
+
+**Writer ownership split of `Slot[s,B]`** (governed by `requesting`):
+- `requesting == 0` → **owner B** writes (raise hand).
+- `requesting == 1` → **holder A** writes (grant + doorbell).
+
+**RD-specific characteristics**:
+- Requester: single app-thread path — set `pending_shards_mask` bit after slot WMB (CAS, 64-retry bound).
+- Holder: `release()` immediate path = **full slot scan** (avoids missing a request due to mask snapshot skip), `poll_cycle` path = **mask-filtered scan** (skip slot RMB on nodes with bit clear).
+- poll thread: detects holder transition via own slot `token_seq` bump (no CB RMB). After `is_holder=true` flip, takes on heartbeat + grant duties from the same cycle onward.
+- Crash detection: no distributed watch in poll. A blocked acquirer hits `wait_for_token` timeout, checks holder's `heartbeat_ts` stall → self-takeover (`me_pass_token(s, self)`).
+
+---
+
+### 3.3 Cacheline Snapshot — Before / After 1 Token Pass
+
+Concrete CL value transitions, complementing §3.1/3.2 sequence diagrams. 2-node example (A=1, B=2) for some shard s, starting gen=5, starting `token_seq` at B = 99.
+
+**Order-driven: A holds, passes to B (A idle, poll_cycle)**
+
+| CL | Field | Before | After step 1 (CB) | After step 2 (doorbell) | Writer |
+|---|---|---|---|---|---|
+| CB | `holder` | 1 | **2** | 2 | A |
+| CB | `generation` | 5 | **6** | 6 | A |
+| CB | `acquire_count` | 42 | 42 | 42 | — |
+| Slot[A] | `token_seq` | 100 | 100 | 100 | — |
+| Slot[B] | `token_seq` | 99 | 99 | **100** | A |
+| Slot[B] | `cb_gen_at_write` | 4 | 4 | **6** | A |
+| Slot[B] | `from_node` | — | — | **1** | A |
+| A DRAM | `sh.is_holder` | true | true | **false** | A (local) |
+| B DRAM | `sh.is_holder` | false | false | false → **true** (on next poll) | B (local) |
+
+Ordering: CB WMB precedes Slot WMB. Any reader seeing the new `token_seq` on Slot[B] is guaranteed to see `holder=2, gen=6` on CB.
+
+**Request-driven: B raises hand, A grants**
+
+| CL | Field | T0 (idle) | T1 (raise hand) | T2 (grant) | T3 (post-CS) | Writer |
+|---|---|---|---|---|---|---|
+| Slot[B] | `sequence` | n | **n+1** | n+1 | n+1 | B |
+| Slot[B] | `requesting` | 0 | **1** | 1 | **0** | B |
+| Slot[B] | `requested_at` | — | **now_ns** | now_ns | now_ns | B |
+| Slot[B] | `granted_at` | — | — | **now_ns** | now_ns | A |
+| Slot[B] | `token_seq` | 99 | 99 | **100** | 100 | A |
+| Slot[B] | `cb_gen_at_write` | 4 | 4 | **6** | 6 | A |
+| Slot[B] | `from_node` | — | — | **1** | 1 | A |
+| Membership[B] | `pending_shards_mask` | old | **old \| (1<B is non-holder for s — own Slot.token_seq unchanged, no CB RMB
+ end
+
+ rect rgb(240,255,240)
+ Note over AppThread_B: phase 1: acquire attempt
+ Note over AppThread_B: request_acquire(s)
+ alt RD
+ AppThread_B->>MEM: WRITE Slot[s,B].requesting = 1
+ AppThread_B->>MEM: CAS Membership[B].pending_shards_mask |= (1<>MEM: RMB Slot[s,B].token_seq (unchanged)
+ end
+ end
+
+ rect rgb(255,248,240)
+ Note over AppThread_B: phase 3: second-chance liveness probe
(counter-based — CXL peers don't share clock origin)
+ AppThread_B->>MEM: RMB CB[s]
+ MEM-->>AppThread_B: holder = A
+ AppThread_B->>MEM: RMB Membership[A].heartbeat
+ MEM-->>AppThread_B: hb_before = N
+ Note over AppThread_B: usleep 100ms on local clock (no cross-node arithmetic)
+ AppThread_B->>MEM: RMB Membership[A].heartbeat
+ MEM-->>AppThread_B: hb_after
+ AppThread_B->>MEM: RMB CB[s] (re-read: catches races during probe sleep)
+ MEM-->>AppThread_B: holder_after, cb_gen_after
+ end
+
+ alt Case A: holder_after == self (late grant landed during probe)
+ rect rgb(240,255,240)
+ Note over AppThread_B: phase 4A: late grant — enter CS directly
+ AppThread_B->>AppThread_B: sync last_cb_gen, last_token_seq, ME_BECOME_HOLDER
+ AppThread_B-->>AppThread_B: return 0
+ end
+ else Case B: holder_after != holder OR hb_after != hb_before
+ rect rgb(224,240,255)
+ Note over AppThread_B: phase 4B: someone else acted, or holder alive — back off
+ AppThread_B-->>AppThread_B: return -ETIMEDOUT (no CB write)
+ Note over AppThread_B: caller retries or propagates error
+ end
+ else Case C: holder unchanged AND counter stuck — A crashed
+ rect rgb(255,230,230)
+ Note over AppThread_B: phase 4C: self-takeover
+ Note over AppThread_B: pr_warn "crash detected ... heartbeat stuck at N"
+ AppThread_B->>MEM: me_pass_token(s, B): CB (holder=B, gen++) + Slot[s,B].token_seq++
+ Note over AppThread_B: return 0 (enter CS)
+ end
+ end
+```
+
+Key properties:
+- **No distributed watch**. poll_cycle RMBs only own doorbell slot + peer membership (for successor+pending). It never touches CB.
+- **Acquirer is the detector**. An idle shard with no waiter is not monitored; first acquirer arrival triggers the check.
+- **Counter-based liveness**. Compares holder's `heartbeat` counter before/after a 100ms observer-local sleep. Counter advance ⇒ alive; stuck ⇒ crashed. `heartbeat_ts` is observability-only — not used for decision since CXL peers don't share a `ktime_get_ns()` zero point (per-node boot times differ, direct subtraction is meaningless).
+- **Why 100ms probe**. Holder ticks heartbeat on every poll cycle (10µs default). 100ms = 10000× safety margin — survives severe scheduler stalls, preemption, brief IRQ storms. Longer window delays takeover but avoids false-positive crash calls.
+- **Generation-fence safety** (Case A only). `me_pass_token(s, self)` bumps `cb->generation` — any in-flight doorbell from A (if A revives) fails the reader's `cb_gen > last_cb_gen` check and is rejected as stale.
+- **Heartbeat location**. Lives in `Membership[A]`, not CB — A's liveness ticks (when alive) don't invalidate CB's read-mostly CL.
+
+---
+
+## 4. Stats & Bench Integration
+
+Three sysfs entries expose ME runtime counters, per-sbi and aggregated across registered MEs. Writing any value resets the writable ones.
+
+| Sysfs attr | Mode | Scope |
+|---|---|---|
+| `/sys/fs/marufs/me_info` | rw | per-instance snapshot (holder/gen/heartbeat) |
+| `/sys/fs/marufs/me_poll_stats` | rw | coarse poll-thread cost (RMB counts, cycle ns) |
+| `/sys/fs/marufs/me_fine_stats` | rw | per-CPU fine-grained — wait latencies, poll-phase breakdown, lock hold, grant age |
+| `/sys/fs/marufs/me_per_shard_acquire` | ro | per-shard acquire counter (hotspot detection) |
+| `/sys/fs/marufs/me_poll_thread_cpu` | ro | poll kthread CPU usage |
+
+### 4.1 `me_poll_stats` (coarse)
+
+| Counter | Increment site |
+|---|---|
+| `poll_cycles` | once per `poll_cycle` invocation |
+| `poll_ns_total` | ktime delta summed per cycle |
+| `poll_rmb_cb` | `me_pass_token` CB magic check — **not** on steady-state poll path |
+| `poll_rmb_slot` | every `RMB(Slot[...])` (own doorbell + grant scan + pass-token) |
+| `poll_rmb_membership` | every `RMB(Membership[...])` (heartbeat tick + peer pass) |
+
+### 4.2 `me_fine_stats` (per-CPU, aggregated on read)
+
+From `struct marufs_me_stats_pcpu`:
+
+| Counter | Meaning |
+|---|---|
+| `wait_count`, `wait_wall_ns`, `wait_cpu_ns` | `wait_for_token` invocations, wall + on-CPU time |
+| `wait_fast_hit` | `ME_IS_HOLDER` early-return (no token wait) |
+| `wait_spin_hit` / `wait_sleep_hit` / `wait_deadline_hit` | exit-reason split |
+| `wait_lat_buckets[12]` | log2(ns) histogram, [<128ns .. ≥128ms] |
+| `poll_ns_membership` / `poll_ns_doorbell` / `poll_ns_scan` | poll-cycle phase breakdown |
+| `lock_hold_count`, `lock_hold_ns_total`, `lock_hold_buckets[12]` | CS hold time (mutex_lock → mutex_unlock) |
+| `grant_age_count`, `grant_age_buckets[12]` | RD `granted_at - requested_at` histogram |
+
+### 4.3 Bench Integration
+
+`tests/test_nrht_race --sweep` reads counters before/after each run and reports per-cycle rates — `cb/c`, `slot/c`, `mem/c` from `me_poll_stats`; `wait_avg`, `spin%`, `fast%`, `hold_avg`, `grant`, `poll_cpu%`, `mem%`/`door%`/`scan%` from `me_fine_stats`. Per-cycle rates are the fair comparison metric — absolute totals shift with cycle count (faster poll = more cycles even if the RMB-per-cycle workload dropped).
+
+Steady-state expectation after doorbell + mask optimizations:
+- `cb/c` ≈ 0 (CB RMB only on grant/takeover/invalidate/crash-probe)
+- `slot/c` ≈ `S` (own doorbell per shard) + grant scans (masked)
+- `mem/c` ≈ `N` (one membership pass per cycle) + 1 (holder heartbeat RMB)
+- `fast%` > 0 under keep-token-dominant workloads; `wait_deadline_hit` ≈ 0 in healthy runs.
+
+---
+
+## References
+
+- `src/me.h` — CXL structs, DRAM companion `struct marufs_me_shard`, inlines, vtable, pending-mask CAS.
+- `src/me.c` — lifecycle, common primitives, poll registry, `wait_for_token` + `me_handle_acquire_deadline` (counter-based crash probe).
+- `src/me_order.c` — order-driven vtable.
+- `src/me_request.c` — request-driven vtable, masked/full scan variants.
+- `src/sysfs.c` — `me_poll_stats`, `me_info`.
+- `tests/test_nrht_race.c` — bench harness with poll-stats integration.
+- `docs/4_arch_nrht.md` — NRHT ME integration (per-shard ME instances).
diff --git a/marufs_kernel/include/marufs_uapi.h b/marufs_kernel/include/marufs_uapi.h
new file mode 100644
index 0000000..8ece3c3
--- /dev/null
+++ b/marufs_kernel/include/marufs_uapi.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: Apache-2.0 WITH Linux-syscall-note */
+/*
+ * marufs_uapi.h - MARUFS userspace API definitions
+ *
+ * Shared header for kernel module and userspace programs.
+ * Contains ioctl structures, commands, and constants.
+ */
+
+#ifndef _MARUFS_UAPI_H
+#define _MARUFS_UAPI_H
+
+#ifdef __KERNEL__
+#include
+#include
+#else
+#include
+#include
+#include
+#endif
+
+/* ── Constants ─────────────────────────────────────────────────────── */
+
+enum marufs_constants {
+ MARUFS_MAX_NODE_ID = 8,
+ MARUFS_NAME_MAX = 63,
+ MARUFS_DELEG_MAX = 29, /* Max delegation entries per region */
+ MARUFS_BATCH_FIND_MAX = 32,
+ MARUFS_BATCH_STORE_MAX = 32,
+};
+
+/* ME (Mutual Exclusion) strategy */
+enum marufs_me_strategy {
+ MARUFS_ME_ORDER = 0, /* Order-driven: token ring circulation */
+ MARUFS_ME_REQUEST = 1, /* Request-driven: holder grants on demand */
+};
+
+/* Permission bitmask */
+enum marufs_perm {
+ MARUFS_PERM_READ = 0x0001, /* read() and mmap(PROT_READ) */
+ MARUFS_PERM_WRITE = 0x0002, /* mmap(PROT_WRITE), page_mkwrite */
+ MARUFS_PERM_DELETE = 0x0004, /* unlink */
+ MARUFS_PERM_ADMIN =
+ 0x0008, /* chown, perm_set_default (ownership control) */
+ MARUFS_PERM_IOCTL = 0x0010, /* name_offset, clear_name ioctls */
+ MARUFS_PERM_GRANT =
+ 0x0020, /* perm_grant to third parties (auth proxy) */
+ MARUFS_PERM_ALL = 0x003F,
+};
+
+/* ── ioctl structures ──────────────────────────────────────────────── */
+
+/* Name-ref registration: name → (target_region:offset) in NRHT.
+ * Used for single ioctl and as array element for batch ioctl. */
+struct marufs_name_offset_req {
+ char name[MARUFS_NAME_MAX + 1]; /* input: name */
+ __u64 offset; /* input: offset within target region's data area */
+ __u64 name_hash; /* input: pre-computed hash (0 = auto) */
+ __s32 target_region_fd; /* input: fd of target region file */
+ __s32 status; /* output: 0=success, negative errno (batch) */
+};
+
+/* Name-ref lookup: name → (region_name, offset) from NRHT.
+ * Used for single ioctl and as array element for batch ioctl. */
+struct marufs_find_name_req {
+ char name[MARUFS_NAME_MAX + 1]; /* input: name to search */
+ char region_name[MARUFS_NAME_MAX +
+ 1]; /* output: target region file name */
+ __u64 offset; /* output: offset within target region's data area */
+ __u64 name_hash; /* input: pre-computed hash (0 = auto) */
+ __s32 status; /* output: 0=found, negative errno (batch) */
+ __u32 ref_count; /* output: NRHT entry ref_count */
+ __u32 pin_count; /* output: NRHT entry pin_count */
+};
+
+/* Batch find name request */
+struct marufs_batch_find_req {
+ __u32 count; /* input: number of entries (max MARUFS_BATCH_FIND_MAX) */
+ __u32 found; /* output: number of entries successfully found */
+ __u64 entries; /* input/output: userspace pointer to marufs_find_name_req[] */
+};
+
+/* Batch name-offset store request */
+struct marufs_batch_name_offset_req {
+ __u32 count; /* input: number of entries (max MARUFS_BATCH_STORE_MAX) */
+ __u32 stored; /* output: number of entries successfully stored */
+ __u64 entries; /* input/output: userspace pointer to marufs_name_offset_req[] */
+};
+
+/* Permission delegation ioctl */
+struct marufs_perm_req {
+ __u32 node_id; /* Target node (must be > 0) */
+ __u32 pid; /* Target PID (must be > 0) */
+ __u32 perms; /* Permission bitmask (MARUFS_PERM_*) */
+ __u32 reserved;
+};
+
+/* Ownership transfer (caller becomes new owner) */
+struct marufs_chown_req {
+ __u32 reserved; /* must be 0 */
+};
+
+/* NRHT ref/pin counter manipulation: name → (counter inc/dec).
+ * Used for REF_INC, REF_DEC, PIN_INC, PIN_DEC ioctls. Looks up the entry
+ * by name, performs the requested op under ME shard lock, returns the
+ * post-op counter value.
+ *
+ * Semantics:
+ * inc: fails with -EOVERFLOW if current value is UINT32_MAX
+ * dec: fails with -EINVAL if current value is 0
+ */
+struct marufs_refcnt_req {
+ char name[MARUFS_NAME_MAX + 1]; /* input: name */
+ __u64 name_hash; /* input: pre-computed hash (0 = auto) */
+ __u32 count; /* output: counter value after op */
+ __s32 status; /* output: 0=success, negative errno (reserved for batch) */
+};
+
+/* NRHT initialization request */
+struct marufs_nrht_init_req {
+ __u32 max_entries; /* 0 = default (524288), total across all shards */
+ __u32 num_shards; /* 0 = default (64), must be power of 2 */
+ __u32 num_buckets; /* 0 = default (max_entries / 4), total across all shards */
+ __u32 me_strategy; /* 0 = order, 1 = request (recommended). Zero-
+ * init ⇒ 0 ⇒ order; pass 1 explicitly for request.
+ * Mount-option default is request (see super.c).
+ */
+};
+
+/* ── ioctl commands ────────────────────────────────────────────────── */
+
+/* Global name management */
+#define MARUFS_IOC_NAME_OFFSET _IOW('X', 1, struct marufs_name_offset_req)
+#define MARUFS_IOC_FIND_NAME _IOWR('X', 2, struct marufs_find_name_req)
+#define MARUFS_IOC_CLEAR_NAME _IOW('X', 3, struct marufs_name_offset_req)
+#define MARUFS_IOC_BATCH_FIND_NAME _IOWR('X', 4, struct marufs_batch_find_req)
+#define MARUFS_IOC_BATCH_NAME_OFFSET \
+ _IOWR('X', 6, struct marufs_batch_name_offset_req)
+
+/* Permission delegation */
+#define MARUFS_IOC_PERM_GRANT _IOW('X', 10, struct marufs_perm_req)
+#define MARUFS_IOC_PERM_SET_DEFAULT _IOW('X', 13, struct marufs_perm_req)
+
+/* Ownership transfer */
+#define MARUFS_IOC_CHOWN _IOW('X', 14, struct marufs_chown_req)
+
+/* NRHT (Name-Ref Hash Table) */
+#define MARUFS_IOC_NRHT_INIT _IOW('X', 20, struct marufs_nrht_init_req)
+/* Explicit ME ring join — optional pre-warm alternative to lazy-init on
+ * first NAME_OFFSET. Idempotent (re-joining a cached instance is a no-op).
+ */
+#define MARUFS_IOC_NRHT_JOIN _IO('X', 21)
+
+/* Per-entry ref/pin counter manipulation (each acquires NRHT shard ME). */
+#define MARUFS_IOC_NRHT_REF_INC _IOWR('X', 22, struct marufs_refcnt_req)
+#define MARUFS_IOC_NRHT_REF_DEC _IOWR('X', 23, struct marufs_refcnt_req)
+#define MARUFS_IOC_NRHT_PIN_INC _IOWR('X', 24, struct marufs_refcnt_req)
+#define MARUFS_IOC_NRHT_PIN_DEC _IOWR('X', 25, struct marufs_refcnt_req)
+
+#endif /* _MARUFS_UAPI_H */
diff --git a/marufs_kernel/install.sh b/marufs_kernel/install.sh
new file mode 100755
index 0000000..dd158c9
--- /dev/null
+++ b/marufs_kernel/install.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+#
+# install.sh - Kernel filesystem module installation script
+#
+# Builds the kernel module, loads the module,
+# and optionally formats + mounts a device.
+#
+# Usage:
+# sudo ./install.sh # Build + load module only
+# sudo ./install.sh --mount /dev/dax0.0 # Build + load + format + mount
+# sudo ./install.sh --skip-build # Load pre-built module
+# sudo ./install.sh --module-name myfs # Custom module name
+
+set -e
+
+# Color definitions
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;36m'
+NC='\033[0m' # No Color
+
+# Default values
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+MODULE_NAME="${MARUFS_MODULE_NAME:-marufs}"
+BUILD_DIR="${MARUFS_BUILD_DIR:-$SCRIPT_DIR/build}"
+NODE_ID=1
+DO_MOUNT=0
+DO_FORMAT=0
+DEVICE=""
+MOUNT_POINT=""
+SKIP_BUILD=false
+
+# Helper functions with module name prefix
+log_info() { echo -e "${BLUE}[INFO]${NC} [$MODULE_NAME] $1"; }
+log_success() { echo -e "${GREEN}[ OK ]${NC} [$MODULE_NAME] $1"; }
+log_error() { echo -e "${RED}[ERR ]${NC} [$MODULE_NAME] $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} [$MODULE_NAME] $1"; }
+
+# Print usage
+usage() {
+ cat << EOF
+Usage: sudo $0 [OPTIONS]
+
+Options:
+ --mount Mount device (e.g., /dev/dax0.0)
+ --format Format device on mount (first mount only)
+ --node-id Node ID (default: 1)
+ --mount-point Mount point (default: /mnt/)
+ --build-dir Build directory (default: build)
+ --module-name Module name (default: $MODULE_NAME)
+ --skip-build Skip build step (use existing binaries)
+ -h, --help Print help
+
+Examples:
+ sudo $0 # Build + load module
+ sudo $0 --mount /dev/dax0.0 --format # Build + load + format + mount
+ sudo $0 --mount /dev/dax0.0 # Build + load + mount (no format)
+ sudo $0 --mount /dev/dax0.0 --node-id 2 # Set node_id=2
+ sudo $0 --module-name myfs --mount /dev/dax0.0 # Custom module name
+EOF
+ exit 1
+}
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --mount) DO_MOUNT=1; DEVICE="$2"; shift 2 ;;
+ --format) DO_FORMAT=1; shift ;;
+ --node-id) NODE_ID="$2"; shift 2 ;;
+ --mount-point) MOUNT_POINT="$2"; shift 2 ;;
+ --build-dir) BUILD_DIR="$2"; shift 2 ;;
+ --module-name) MODULE_NAME="$2"; shift 2 ;;
+ --skip-build) SKIP_BUILD=true; shift ;;
+ -h|--help) usage ;;
+ *) log_error "Unknown option: $1"; usage ;;
+ esac
+done
+
+# Default mount point based on module name
+if [ -z "$MOUNT_POINT" ]; then
+ MOUNT_POINT="/mnt/${MODULE_NAME}"
+fi
+
+# Validate --mount option
+if [ $DO_MOUNT -eq 1 ] && [ -z "$DEVICE" ]; then
+ log_error "Device must be specified when using --mount option"
+ usage
+fi
+
+MODULE_PATH="$BUILD_DIR/${MODULE_NAME}.ko"
+
+echo "============================================"
+echo " ${MODULE_NAME} Installation"
+echo "============================================"
+echo ""
+
+cd "$SCRIPT_DIR"
+
+# Step 1: Build
+if [ "$SKIP_BUILD" = false ]; then
+ log_info "Step 1/4: Building..."
+
+ make clean > /dev/null 2>&1 || true
+ make MODULE_NAME="$MODULE_NAME" -j128
+
+ if [ ! -f "$MODULE_PATH" ]; then
+ log_error "Build failed: $MODULE_PATH not found"
+ exit 1
+ fi
+
+ log_success "Build complete"
+else
+ log_info "Step 1/4: Skipping build (--skip-build)"
+ if [ ! -f "$MODULE_PATH" ]; then
+ log_error "Module not found: $MODULE_PATH. Run without --skip-build first."
+ exit 1
+ fi
+fi
+echo ""
+
+# Step 2: Unload existing module
+log_info "Step 2/4: Checking existing module..."
+
+if grep -q "^${MODULE_NAME} " /proc/modules 2>/dev/null; then
+ if mount | grep -q " type ${MODULE_NAME} "; then
+ log_error "${MODULE_NAME} is mounted. Unmount first or use uninstall.sh"
+ exit 1
+ fi
+ log_warn "Unloading existing ${MODULE_NAME} module..."
+ rmmod "${MODULE_NAME}"
+ log_success "Existing module unloaded"
+else
+ log_info "No existing module loaded"
+fi
+echo ""
+
+# Step 3: Load module
+log_info "Step 3/4: Loading module (node_id=$NODE_ID)..."
+
+insmod "$MODULE_PATH" node_id=$NODE_ID
+
+if ! grep -q "^${MODULE_NAME} " /proc/modules 2>/dev/null; then
+ log_error "Module load failed"
+ exit 1
+fi
+
+log_success "Module loaded (node_id=$NODE_ID)"
+echo ""
+
+# Step 4: Mount (optional)
+if [ $DO_MOUNT -eq 1 ]; then
+ if [ $DO_FORMAT -eq 1 ]; then
+ log_info "Step 4/4: Formatting and mounting..."
+ else
+ log_info "Step 4/4: Mounting..."
+ fi
+
+ if [ ! -e "$DEVICE" ]; then
+ log_error "Device not found: $DEVICE"
+ exit 1
+ fi
+
+ mkdir -p "$MOUNT_POINT"
+
+ if mount | grep -q " $MOUNT_POINT "; then
+ log_warn "$MOUNT_POINT already mounted. Unmounting..."
+ umount "$MOUNT_POINT"
+ fi
+
+ MOUNT_OPTS="daxdev=$DEVICE,node_id=$NODE_ID"
+ if [ $DO_FORMAT -eq 1 ]; then
+ MOUNT_OPTS="$MOUNT_OPTS,format"
+ fi
+
+ log_info "Mounting: $DEVICE -> $MOUNT_POINT (node_id=$NODE_ID)"
+ mount -t "${MODULE_NAME}" -o "$MOUNT_OPTS" none "$MOUNT_POINT"
+
+ if ! mount | grep -q "$MOUNT_POINT type ${MODULE_NAME}"; then
+ log_error "Mount failed"
+ exit 1
+ fi
+
+ log_success "Mounted: $DEVICE -> $MOUNT_POINT"
+else
+ log_info "Step 4/4: Skipping format/mount (no --mount option)"
+fi
+
+echo ""
+echo "============================================"
+echo " ${MODULE_NAME} Installation Complete!"
+echo "============================================"
+echo " Module: ${MODULE_NAME} (node_id=$NODE_ID)"
+if [ $DO_MOUNT -eq 1 ]; then
+ echo " Mount: $DEVICE -> $MOUNT_POINT"
+fi
+echo ""
+echo " Uninstall: sudo ./uninstall.sh"
+echo ""
diff --git a/marufs_kernel/setup-autoload.sh b/marufs_kernel/setup-autoload.sh
new file mode 100755
index 0000000..7098b41
--- /dev/null
+++ b/marufs_kernel/setup-autoload.sh
@@ -0,0 +1,295 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# setup-autoload.sh - Configure MARUFS kernel module auto-load at boot
+#
+# Usage:
+# sudo ./scripts/setup-autoload.sh # Install + auto-load
+# sudo ./scripts/setup-autoload.sh --mount /dev/dax6.0 # + auto-mount (fstab)
+# sudo ./scripts/setup-autoload.sh --mount /dev/dax6.0 --systemd # + auto-mount (systemd)
+# sudo ./scripts/setup-autoload.sh --uninstall # Remove all config
+# sudo ./scripts/setup-autoload.sh --status # Show current state
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Find project root: walk up from SCRIPT_DIR until Makefile with MODULE_NAME is found
+PROJECT_DIR="$SCRIPT_DIR"
+while [ "$PROJECT_DIR" != "/" ]; do
+ [ -f "$PROJECT_DIR/Makefile" ] && grep -q 'MODULE_NAME' "$PROJECT_DIR/Makefile" 2>/dev/null && break
+ PROJECT_DIR="$(dirname "$PROJECT_DIR")"
+done
+
+MODULE_NAME="marufs"
+MODULE_KO="$PROJECT_DIR/build/${MODULE_NAME}.ko"
+KERNEL_VER="$(uname -r)"
+INSTALL_DIR="/lib/modules/${KERNEL_VER}/extra"
+MODPROBE_CONF="/etc/modprobe.d/${MODULE_NAME}.conf"
+MODULES_CONF="/etc/modules-load.d/${MODULE_NAME}.conf"
+SYSTEMD_UNIT="/etc/systemd/system/${MODULE_NAME}-mount.service"
+MOUNT_POINT="/mnt/${MODULE_NAME}"
+NODE_ID=1
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+log_info() { echo -e "${CYAN}[INFO]${NC} $1"; }
+log_ok() { echo -e "${GREEN}[ OK ]${NC} $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
+log_err() { echo -e "${RED}[ERR ]${NC} $1"; }
+
+# --- Argument parsing ---
+ACTION="install"
+DAX_DEVICE=""
+USE_SYSTEMD=false
+
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --mount) DAX_DEVICE="$2"; shift 2 ;;
+ --systemd) USE_SYSTEMD=true; shift ;;
+ --node-id) NODE_ID="$2"; shift 2 ;;
+ --uninstall) ACTION="uninstall"; shift ;;
+ --status) ACTION="status"; shift ;;
+ --help|-h)
+ cat <<'USAGE'
+Usage: sudo ./scripts/setup-autoload.sh [OPTIONS]
+
+Actions:
+ (default) Install module + configure auto-load
+ --uninstall Remove all auto-load/mount config
+ --status Show current configuration state
+
+Options:
+ --mount DEV Also configure auto-mount (e.g., --mount /dev/dax6.0)
+ --systemd Use systemd unit instead of fstab for auto-mount
+ --node-id ID Node ID for mount (default: 1)
+USAGE
+ exit 0
+ ;;
+ *) log_err "Unknown option: $1"; exit 1 ;;
+ esac
+done
+
+# --- Root check ---
+if [ "$(id -u)" -ne 0 ]; then
+ log_err "Must run as root (sudo)"
+ exit 1
+fi
+
+# ============================================================================
+# STATUS
+# ============================================================================
+do_status() {
+ echo "============================================"
+ echo " MARUFS Auto-Load Status"
+ echo "============================================"
+
+ # Module installed?
+ if [ -f "${INSTALL_DIR}/${MODULE_NAME}.ko" ]; then
+ log_ok "Module installed: ${INSTALL_DIR}/${MODULE_NAME}.ko"
+ else
+ log_warn "Module not installed in system path"
+ fi
+
+ # Module loaded?
+ if lsmod | grep -q "^${MODULE_NAME} "; then
+ log_ok "Module loaded"
+ else
+ log_warn "Module not loaded"
+ fi
+
+ # Auto-load config?
+ if [ -f "$MODULES_CONF" ]; then
+ log_ok "Auto-load: $MODULES_CONF"
+ else
+ log_warn "Auto-load not configured"
+ fi
+
+ # modprobe config?
+ if [ -f "$MODPROBE_CONF" ]; then
+ log_ok "Modprobe config: $MODPROBE_CONF"
+ echo " $(cat "$MODPROBE_CONF" | grep -v '^#' | grep -v '^$')"
+ else
+ log_info "No modprobe config (using defaults)"
+ fi
+
+ # Auto-mount?
+ if grep -q "${MODULE_NAME}" /etc/fstab 2>/dev/null; then
+ log_ok "Auto-mount: fstab"
+ grep "${MODULE_NAME}" /etc/fstab | sed 's/^/ /'
+ elif [ -f "$SYSTEMD_UNIT" ]; then
+ local enabled
+ enabled=$(systemctl is-enabled ${MODULE_NAME}-mount.service 2>/dev/null || echo "disabled")
+ log_ok "Auto-mount: systemd ($enabled)"
+ else
+ log_info "Auto-mount not configured"
+ fi
+
+ # Current mounts?
+ if mount | grep -q "${MODULE_NAME}"; then
+ log_ok "Active mounts:"
+ mount | grep "${MODULE_NAME}" | sed 's/^/ /'
+ else
+ log_info "No active mounts"
+ fi
+
+ echo "============================================"
+}
+
+# ============================================================================
+# UNINSTALL
+# ============================================================================
+do_uninstall() {
+ echo "============================================"
+ echo " MARUFS Auto-Load Uninstall"
+ echo "============================================"
+
+ # Remove systemd unit
+ if [ -f "$SYSTEMD_UNIT" ]; then
+ systemctl disable ${MODULE_NAME}-mount.service 2>/dev/null || true
+ rm -f "$SYSTEMD_UNIT"
+ systemctl daemon-reload
+ log_ok "Removed systemd unit"
+ fi
+
+ # Remove fstab entries
+ if grep -q "${MODULE_NAME}" /etc/fstab 2>/dev/null; then
+ sed -i "/${MODULE_NAME}/d" /etc/fstab
+ # Remove leftover comment
+ sed -i '/# MARUFS CXL filesystem/d' /etc/fstab
+ log_ok "Removed fstab entries"
+ fi
+
+ # Remove auto-load config
+ if [ -f "$MODULES_CONF" ]; then
+ rm -f "$MODULES_CONF"
+ log_ok "Removed $MODULES_CONF"
+ fi
+
+ # Remove modprobe config
+ if [ -f "$MODPROBE_CONF" ]; then
+ rm -f "$MODPROBE_CONF"
+ log_ok "Removed $MODPROBE_CONF"
+ fi
+
+ # Remove installed module
+ if [ -f "${INSTALL_DIR}/${MODULE_NAME}.ko" ]; then
+ rm -f "${INSTALL_DIR}/${MODULE_NAME}.ko"
+ depmod -a
+ log_ok "Removed module from system path"
+ fi
+
+ log_ok "Uninstall complete"
+ echo "============================================"
+}
+
+# ============================================================================
+# INSTALL
+# ============================================================================
+do_install() {
+ echo "============================================"
+ echo " MARUFS Auto-Load Setup"
+ echo "============================================"
+
+ # Step 1: Check module exists
+ if [ ! -f "$MODULE_KO" ]; then
+ log_err "Module not found: $MODULE_KO"
+ log_info "Build first: cd $PROJECT_DIR && make -j128"
+ exit 1
+ fi
+ log_ok "Module found: $MODULE_KO"
+
+ # Step 2: Install to system path
+ mkdir -p "$INSTALL_DIR"
+ cp "$MODULE_KO" "$INSTALL_DIR/"
+ depmod -a
+ log_ok "Installed to $INSTALL_DIR/"
+
+ # Verify
+ if ! modinfo "$MODULE_NAME" > /dev/null 2>&1; then
+ log_err "modinfo failed after install"
+ exit 1
+ fi
+ log_ok "modinfo verification passed"
+
+ # Step 3: Auto-load config
+ echo "$MODULE_NAME" > "$MODULES_CONF"
+ log_ok "Auto-load configured: $MODULES_CONF"
+
+ # Step 4: Create mount point
+ mkdir -p "$MOUNT_POINT"
+ log_ok "Mount point: $MOUNT_POINT"
+
+ # Step 5: Auto-mount (if --mount specified)
+ if [ -n "$DAX_DEVICE" ]; then
+ if [ ! -e "$DAX_DEVICE" ]; then
+ log_warn "DAX device $DAX_DEVICE not found (will use nofail)"
+ fi
+
+ if [ "$USE_SYSTEMD" = true ]; then
+ cat > "$SYSTEMD_UNIT" << UNIT
+[Unit]
+Description=Mount MARUFS CXL filesystem
+After=systemd-modules-load.service
+ConditionPathExists=$DAX_DEVICE
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+
+ExecStart=/bin/mount -t ${MODULE_NAME} -o daxdev=${DAX_DEVICE},node_id=${NODE_ID} none ${MOUNT_POINT}
+ExecStart=/bin/chmod 1777 ${MOUNT_POINT}
+
+ExecStop=/bin/umount ${MOUNT_POINT}
+
+[Install]
+WantedBy=multi-user.target
+UNIT
+ systemctl daemon-reload
+ systemctl enable ${MODULE_NAME}-mount.service
+ log_ok "Auto-mount: systemd unit enabled"
+ else
+ # fstab (remove old entries first)
+ sed -i "/${MODULE_NAME}/d" /etc/fstab
+ sed -i '/# MARUFS CXL filesystem/d' /etc/fstab
+ cat >> /etc/fstab << FSTAB
+
+# MARUFS CXL filesystem (auto-generated by setup-autoload.sh)
+none ${MOUNT_POINT} ${MODULE_NAME} daxdev=${DAX_DEVICE},node_id=${NODE_ID},nofail 0 0
+FSTAB
+ log_ok "Auto-mount: fstab configured"
+ fi
+ else
+ log_info "Auto-mount skipped (use --mount /dev/daxX.Y to enable)"
+ fi
+
+ echo ""
+ echo "============================================"
+ log_ok "Setup complete!"
+ echo "============================================"
+ echo ""
+ echo " Module will auto-load on next boot."
+ if [ -n "$DAX_DEVICE" ]; then
+ echo " Filesystem will auto-mount on next boot."
+ echo ""
+ echo " To mount now:"
+ echo " sudo modprobe ${MODULE_NAME}"
+ if [ "$USE_SYSTEMD" = true ]; then
+ echo " sudo systemctl start ${MODULE_NAME}-mount.service"
+ else
+ echo " sudo systemctl daemon-reload"
+ echo " sudo mount ${MOUNT_POINT}"
+ fi
+ fi
+ echo ""
+}
+
+# --- Dispatch ---
+case "$ACTION" in
+ install) do_install ;;
+ uninstall) do_uninstall ;;
+ status) do_status ;;
+esac
diff --git a/marufs_kernel/src/acl.c b/marufs_kernel/src/acl.c
new file mode 100644
index 0000000..8143107
--- /dev/null
+++ b/marufs_kernel/src/acl.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * acl.c - MARUFS Permission Delegation System
+ *
+ * Permission model:
+ * 1. Owner (node_id + pid + birth_time) → implicit ALL permissions
+ * 2. RAT default_perms → baseline for non-owners
+ * 3. Delegation entries in RAT entry → per-entity grants
+ *
+ * Check order (fast path first):
+ * 1. Owner? → allow (same as existing 3-stage ACL)
+ * 2. RAT default_perms sufficient? → allow
+ * 3. Delegation table match? → allow
+ * 4. → deny (-EACCES)
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "me.h"
+
+/*
+ * marufs_get_exe_id - read current process's exe binary identity.
+ *
+ * Used by post-exec privilege retention defense: bind ownership /
+ * delegation to (inode, dev) of the exe binary so that an execve() into
+ * a different binary fails subsequent permission checks.
+ *
+ * Returns true with @out_ino / @out_dev populated, or false if there is
+ * no exe (kernel thread) — caller should treat as deny.
+ */
+bool marufs_get_exe_id(u64 *out_ino, u32 *out_dev)
+{
+ struct mm_struct *mm = current->mm;
+ struct file *exe = mm ? READ_ONCE(mm->exe_file) : NULL;
+
+ if (!exe || !exe->f_inode)
+ return false;
+
+ *out_ino = exe->f_inode->i_ino;
+ *out_dev = new_encode_dev(exe->f_inode->i_sb->s_dev);
+ return true;
+}
+
+/*
+ * marufs_check_exe_id - check current process's exe identity matches.
+ *
+ * Returns true iff @ino and @dev equal the current task's exe inode/dev.
+ * Returns false if no exe (kernel thread) or mismatch.
+ */
+bool marufs_check_exe_id(u64 ino, u32 dev)
+{
+ u64 cur_ino;
+ u32 cur_dev;
+
+ if (!marufs_get_exe_id(&cur_ino, &cur_dev))
+ return false;
+ return cur_ino == ino && cur_dev == dev;
+}
+
+/*
+ * marufs_get_pid_identity - read identity (birth_time + exe id) of an
+ * arbitrary local PID.
+ *
+ * Same-node only: looks up the target task on the current host and reads
+ * (start_boottime, exe inode, exe dev). Used by delegation grant to
+ * eagerly bind a same-node delegation to the target's identity (closes
+ * the lazy-init timing window where an attacker could hijack the target
+ * between grant and first mmap). Cross-node delegations cannot use this —
+ * caller must fall back to lazy-init.
+ *
+ * Returns true with all out params populated on success. False if the
+ * PID is not found, the task has no mm, or no exe.
+ */
+bool marufs_get_pid_identity(u32 target_pid, u64 *out_birth, u64 *out_ino,
+ u32 *out_dev)
+{
+ if (target_pid == 0)
+ return false;
+
+ struct pid *pid_s = find_get_pid(target_pid);
+ if (!pid_s)
+ return false;
+
+ bool ok = false;
+ struct task_struct *task = get_pid_task(pid_s, PIDTYPE_PID);
+ if (task) {
+ if (task->mm) {
+ struct file *exe = READ_ONCE(task->mm->exe_file);
+ if (exe && exe->f_inode) {
+ *out_birth = ktime_to_ns(task->start_boottime);
+ *out_ino = exe->f_inode->i_ino;
+ *out_dev = new_encode_dev(
+ exe->f_inode->i_sb->s_dev);
+ ok = true;
+ }
+ }
+ put_task_struct(task);
+ }
+ put_pid(pid_s);
+ return ok;
+}
+
+/*
+ * marufs_owner_is_dead - check if RAT entry owner process is dead
+ * @owner_pid: PID from RAT entry
+ * @owner_birth_time: birth time from RAT entry (le64 raw value)
+ *
+ * Returns true if the process is dead or PID was reused.
+ */
+bool marufs_owner_is_dead(u32 owner_pid, u64 owner_birth_time)
+{
+ struct pid *pid_s;
+ struct task_struct *task;
+ bool dead = true;
+
+ if (owner_pid == 0)
+ return false; /* pid not yet written (ALLOCATING) — find_get_pid(0) returns init */
+
+ pid_s = find_get_pid(owner_pid);
+ if (!pid_s)
+ return true;
+
+ task = get_pid_task(pid_s, PIDTYPE_PID);
+ if (task) {
+ u64 bt = ktime_to_ns(task->start_boottime);
+ if (bt == owner_birth_time)
+ dead = false;
+ put_task_struct(task);
+ }
+ put_pid(pid_s);
+ return dead;
+}
+
+/*
+ * marufs_is_owner - check if current process is the owner of a RAT entry
+ * @sbi: superblock info
+ * @rat_entry: RAT entry to check
+ *
+ * 4-stage ACL: node_id → pid → birth_time → exe_inode
+ * The exe_inode stage defends against post-exec privilege retention:
+ * an attacker that execve()s a different binary keeps PID + birth_time
+ * but ends up with a different exe inode/dev — fails the check.
+ *
+ * Returns true if current process is the owner.
+ */
+static bool marufs_is_owner(struct marufs_sb_info *sbi,
+ struct marufs_rat_entry *rat_entry)
+{
+ u32 owner_node, owner_pid;
+ u64 owner_birth_time, current_birth_time;
+
+ /* Invalidate CL2 (ACL fields) before reading */
+ MARUFS_CXL_RMB(&rat_entry->default_perms, 64);
+
+ /* Stage 1: node_id check */
+ owner_node = READ_LE16(rat_entry->owner_node_id);
+ if (owner_node != sbi->node_id)
+ return false;
+
+ /* Stage 2: pid check */
+ owner_pid = READ_LE32(rat_entry->owner_pid);
+ if (owner_pid != current->pid)
+ return false;
+
+ /* Stage 3: birth_time check (PID reuse protection) */
+ owner_birth_time = READ_LE64(rat_entry->owner_birth_time);
+ current_birth_time = ktime_to_ns(current->start_boottime);
+ if (owner_birth_time != current_birth_time)
+ return false;
+
+ /* Stage 4: exe_inode check (post-exec privilege retention defense) */
+ u64 owner_ino = READ_LE64(rat_entry->owner_exe_inode_ino);
+ u32 owner_dev = READ_LE32(rat_entry->owner_exe_inode_dev);
+
+ return marufs_check_exe_id(owner_ino, owner_dev);
+}
+
+/*
+ * marufs_deleg_entry_clear - zero all data fields of a delegation entry
+ * @de: delegation entry pointer
+ *
+ * Clears node_id, pid, perms, birth_time, granted_at and issues a WMB.
+ * Does NOT touch de->state — caller is responsible for the state transition
+ * (CAS or direct write depending on ownership context).
+ */
+void marufs_deleg_entry_clear(struct marufs_deleg_entry *de)
+{
+ WRITE_LE32(de->node_id, 0);
+ WRITE_LE32(de->pid, 0);
+ WRITE_LE32(de->perms, 0);
+ WRITE_LE64(de->birth_time, 0);
+ WRITE_LE64(de->granted_at, 0);
+ WRITE_LE64(de->exe_inode_ino, 0);
+ WRITE_LE32(de->exe_inode_dev, 0);
+ MARUFS_CXL_WMB(de, sizeof(*de));
+}
+
+/*
+ * marufs_check_permission_any - compute granted subset of candidate perms
+ * @sbi: superblock info
+ * @rat_entry_id: RAT entry ID
+ * @candidate: permission bitmask to test (MARUFS_PERM_*, OR-combined)
+ * @out_granted: granted subset (intersection of candidate with caller's rights)
+ *
+ * ANY-semantics: returns the actual rights held by the caller within the
+ * requested candidate set. Caller branches on which bits matched (e.g.,
+ * PERM_GRANT: ADMIN can grant anything, GRANT cannot propagate ADMIN/GRANT).
+ *
+ * Aggregation:
+ * 1. Owner → *out_granted = candidate (full match, fast path)
+ * 2. Otherwise: (default_perms ∪ matched deleg entries) & candidate
+ *
+ * Returns 0 on success, -EINVAL on bad input. State != ALLOCATED yields
+ * 0 with *out_granted=0 (caller maps to -EACCES).
+ *
+ * marufs_check_permission() is a thin AND-semantics wrapper over this.
+ */
+int marufs_check_permission_any(struct marufs_sb_info *sbi, u32 rat_entry_id,
+ u32 candidate, u32 *out_granted)
+{
+ struct marufs_rat_entry *rat_entry =
+ marufs_rat_entry_get(sbi, rat_entry_id);
+ if (!rat_entry || !out_granted || candidate == 0)
+ return -EINVAL;
+
+ /* Verify RAT entry is still allocated before reading fields */
+ if (READ_LE32(rat_entry->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ return 0;
+
+ /* Fast path: owner has ALL permissions */
+ *out_granted = 0;
+ if (marufs_is_owner(sbi, rat_entry)) {
+ *out_granted = candidate;
+ return 0;
+ }
+
+ /* Invalidate CL2 to cover default_perms, deleg_num_entries, deleg_entries */
+ MARUFS_CXL_RMB(&rat_entry->default_perms, 64);
+ u32 default_perms = READ_LE16(rat_entry->default_perms);
+ u32 have = 0;
+ have |= default_perms & candidate;
+ if (have == candidate) {
+ *out_granted = have;
+ return 0;
+ }
+
+ for (u32 i = 0; i < rat_entry->deleg_num_entries; i++) {
+ struct marufs_deleg_entry *de =
+ marufs_rat_deleg_entry(rat_entry, i);
+ if (!de)
+ continue;
+ if (READ_LE32(de->state) != MARUFS_DELEG_ACTIVE)
+ continue;
+
+ u32 de_node = READ_LE32(de->node_id);
+ u32 de_pid = READ_LE32(de->pid);
+ if (de_node != sbi->node_id || de_pid != current->pid)
+ continue;
+
+ u64 de_birth = READ_LE64(de->birth_time);
+ u64 cur_birth = ktime_to_ns(current->start_boottime);
+ if (de_birth == 0) {
+ /* Lazy init on first access by matching process */
+ marufs_le64_cas(&de->birth_time, 0, cur_birth);
+ MARUFS_CXL_WMB(de, sizeof(*de));
+ } else if (de_birth != cur_birth) {
+ continue; /* PID reuse */
+ }
+
+ /*
+ * Bind delegation to the caller's exe binary identity.
+ * Defends against post-exec privilege retention: an attacker
+ * that execve()s a different binary (same PID + birth_time,
+ * possibly inherited fd) ends up with a different exe
+ * inode/dev — entry no longer matches.
+ *
+ * Lazy-init mirrors birth_time: de->exe_inode_ino == 0 ⇒
+ * "first match"; capture and CAS-bind. Subsequent calls
+ * compare; mismatch ⇒ skip this entry.
+ */
+ u64 cur_ino;
+ u32 cur_dev;
+ if (!marufs_get_exe_id(&cur_ino, &cur_dev))
+ continue; /* kernel thread / no exe — never matches */
+
+ u64 de_ino = READ_LE64(de->exe_inode_ino);
+ u32 de_dev = READ_LE32(de->exe_inode_dev);
+ if (de_ino == 0) {
+ marufs_le64_cas(&de->exe_inode_ino, 0, cur_ino);
+ marufs_le32_cas(&de->exe_inode_dev, 0, cur_dev);
+ MARUFS_CXL_WMB(de, sizeof(*de));
+ } else if (de_ino != cur_ino || de_dev != cur_dev) {
+ continue; /* exe binary changed (post-exec attack) */
+ }
+
+ u32 de_perms = READ_LE32(de->perms);
+ u32 grant = de_perms & candidate;
+ if (!grant)
+ continue;
+
+ have |= grant;
+ if (have == candidate)
+ break;
+ }
+
+ *out_granted = have;
+ return 0;
+}
+
+/*
+ * marufs_check_permission - AND-semantics permission check
+ *
+ * Returns 0 if every bit in @required_perms is granted, -EACCES if any
+ * bit missing, -EINVAL on bad input.
+ *
+ * Thin wrapper over marufs_check_permission_any().
+ */
+int marufs_check_permission(struct marufs_sb_info *sbi, u32 rat_entry_id,
+ u32 required_perms)
+{
+ u32 have;
+ int ret = marufs_check_permission_any(sbi, rat_entry_id, required_perms,
+ &have);
+ if (ret)
+ return ret;
+ return have == required_perms ? 0 : -EACCES;
+}
+
+/*
+ * marufs_deleg_grant - grant permissions to (node_id, pid)
+ * @sbi: superblock info
+ * @rat_entry_id: RAT entry ID
+ * @req: permission request (node_id, pid, birth_time, perms)
+ *
+ * Upsert: if matching entry exists, OR the new perms in.
+ * Otherwise, find an empty slot and create a new entry.
+ *
+ * Only owner or ADMIN-delegated callers should invoke this
+ * (caller must check before calling).
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+/*
+ * marufs_deleg_try_upsert - scan entries for existing match or first free slot
+ * @deleg_entries: delegation entry array (from rat_entry->deleg_entries)
+ * @num_entries: max entries in table
+ * @req: permission request
+ * @out_free_idx: output first free slot index (MARUFS_DELEG_MAX_ENTRIES if none)
+ *
+ * Return: 0 if upserted existing entry, 1 if free slot found, -ENOSPC if full
+ */
+static int marufs_deleg_try_upsert(struct marufs_deleg_entry *deleg_entries,
+ u32 num_entries, struct marufs_perm_req *req,
+ u32 *out_free_idx)
+{
+ u32 i;
+ u32 free_idx = MARUFS_DELEG_MAX_ENTRIES;
+
+ for (i = 0; i < num_entries; i++) {
+ struct marufs_deleg_entry *de = &deleg_entries[i];
+ u32 state;
+
+ MARUFS_CXL_RMB(de, sizeof(*de));
+ state = READ_LE32(de->state);
+
+ if (state == MARUFS_DELEG_EMPTY) {
+ if (free_idx == MARUFS_DELEG_MAX_ENTRIES)
+ free_idx = i;
+ continue;
+ }
+
+ if (state != MARUFS_DELEG_ACTIVE)
+ continue;
+
+ /* Check for existing match: same (node_id, pid) */
+ if (READ_LE32(de->node_id) == req->node_id &&
+ READ_LE32(de->pid) == req->pid) {
+ /* Upsert: CAS loop to atomically OR in new permissions */
+ u32 old_perms, new_perms;
+ do {
+ old_perms = READ_LE32(de->perms);
+ new_perms = old_perms | req->perms;
+ if (new_perms == old_perms)
+ break; /* Already has all requested bits */
+ } while (marufs_le32_cas(&de->perms, old_perms,
+ new_perms) != old_perms);
+ WRITE_LE64(de->granted_at, ktime_get_real_ns());
+ MARUFS_CXL_WMB(de, sizeof(*de));
+ return 0;
+ }
+ }
+
+ *out_free_idx = free_idx;
+ return (free_idx < num_entries) ? 1 : -ENOSPC;
+}
+
+/*
+ * Caller MUST hold the Global ME for MARUFS_ME_GLOBAL_SHARD_ID. With the ME
+ * held, this function is the sole state-field mutator for this rat_entry's
+ * delegation array — no retry/CAS-dance is needed.
+ *
+ * Readers (check_permission) only CAS `de->birth_time` and are gated by the
+ * state-machine (GRANTING → WMB → ACTIVE), so plain writes + the intermediate
+ * WMB are sufficient for visibility.
+ */
+int marufs_deleg_grant(struct marufs_sb_info *sbi, u32 rat_entry_id,
+ struct marufs_perm_req *req)
+{
+ if (!sbi || !req)
+ return -EINVAL;
+
+ if (req->perms == 0 || (req->perms & ~MARUFS_PERM_ALL))
+ return -EINVAL;
+
+ if (req->node_id == 0 || req->pid == 0)
+ return -EINVAL;
+
+ struct marufs_rat_entry *rat_entry =
+ marufs_rat_entry_get(sbi, rat_entry_id);
+ if (!rat_entry ||
+ READ_CXL_LE32(rat_entry->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ return -EINVAL;
+
+ MARUFS_CXL_RMB(&rat_entry->default_perms, 64);
+
+ /* Single pass: upsert hit, no space, or free slot to claim. */
+ u32 free_idx;
+ int ret = marufs_deleg_try_upsert(rat_entry->deleg_entries,
+ MARUFS_DELEG_MAX_ENTRIES, req,
+ &free_idx);
+ if (ret <= 0)
+ return ret; /* 0 = upserted, -ENOSPC = full */
+
+ struct marufs_deleg_entry *de =
+ marufs_rat_deleg_entry(rat_entry, free_idx);
+ if (!de)
+ return -EINVAL;
+
+ /* Reserve slot visibly so readers racing by see GRANTING, not stale ACTIVE. */
+ WRITE_LE32(de->state, MARUFS_DELEG_GRANTING);
+ MARUFS_CXL_WMB(&de->state, sizeof(de->state));
+
+ WRITE_LE32(de->node_id, req->node_id);
+ WRITE_LE32(de->pid, req->pid);
+ WRITE_LE32(de->perms, req->perms);
+
+ /*
+ * Identity binding (birth_time + exe_inode):
+ * same-node target → eager (resolve target's identity now, closes
+ * the grant→first-mmap timing window where an
+ * attacker could hijack the target via execve)
+ * cross-node target → lazy (kernel can't reach remote task; the
+ * consumer's first mmap captures it)
+ */
+ u64 t_birth = 0, t_ino = 0;
+ u32 t_dev = 0;
+ if (req->node_id == sbi->node_id)
+ marufs_get_pid_identity(req->pid, &t_birth, &t_ino, &t_dev);
+ WRITE_LE64(de->birth_time, t_birth);
+ WRITE_LE64(de->exe_inode_ino, t_ino);
+ WRITE_LE32(de->exe_inode_dev, t_dev);
+
+ WRITE_LE64(de->granted_at, ktime_get_real_ns());
+ MARUFS_CXL_WMB(de, sizeof(*de));
+
+ /* Publish — ensured fields are globally visible before state transition. */
+ WRITE_LE32(de->state, MARUFS_DELEG_ACTIVE);
+ MARUFS_CXL_WMB(&de->state, sizeof(de->state));
+
+ marufs_le16_cas_inc(&rat_entry->deleg_num_entries);
+ return 0;
+}
diff --git a/marufs_kernel/src/acl.h b/marufs_kernel/src/acl.h
new file mode 100644
index 0000000..aa02288
--- /dev/null
+++ b/marufs_kernel/src/acl.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * acl.h - Region-level access control entry points.
+ */
+
+#ifndef _MARUFS_ACL_H
+#define _MARUFS_ACL_H
+
+#include
+
+struct marufs_sb_info;
+struct marufs_deleg_entry;
+struct marufs_perm_req;
+
+void marufs_deleg_entry_clear(struct marufs_deleg_entry *de);
+int marufs_check_permission(struct marufs_sb_info *sbi, u32 rat_entry_id,
+ u32 required_perms);
+int marufs_check_permission_any(struct marufs_sb_info *sbi, u32 rat_entry_id,
+ u32 candidate, u32 *out_granted);
+int marufs_deleg_grant(struct marufs_sb_info *sbi, u32 rat_entry_id,
+ struct marufs_perm_req *req);
+bool marufs_owner_is_dead(u32 owner_pid, u64 owner_birth_time);
+bool marufs_get_exe_id(u64 *out_ino, u32 *out_dev);
+bool marufs_check_exe_id(u64 ino, u32 dev);
+bool marufs_get_pid_identity(u32 target_pid, u64 *out_birth, u64 *out_ino,
+ u32 *out_dev);
+
+#endif /* _MARUFS_ACL_H */
diff --git a/marufs_kernel/src/bootstrap.c b/marufs_kernel/src/bootstrap.c
new file mode 100644
index 0000000..fd58eb8
--- /dev/null
+++ b/marufs_kernel/src/bootstrap.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * bootstrap.c - MARUFS auto-mount bootstrap slot management.
+ *
+ * Bootstrap is ONLY for mount-time election + format gate.
+ * Runtime liveness is owned entirely by ME (not bootstrap).
+ *
+ * Algorithm (last-writer-wins, settle window):
+ * 1. Write magic + random_token + status=CLAIMED to a target slot.
+ * 2. Sleep bootstrap_settle_ms so peers can complete writes.
+ * 3. Reread: if our random_token is still present, we won.
+ *
+ * No CAS needed: a 64B slot is one CL; cross-host CXL writes resolve at
+ * last-store-to-CL granularity. The random_token uniquely identifies the
+ * winner after the settle window.
+ */
+
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "bootstrap.h"
+#include "me.h"
+
+enum marufs_bootstrap_configuration {
+ /* Timing constants (milliseconds) */
+ MARUFS_BOOTSTRAP_SETTLE_MS = 20,
+ MARUFS_BOOTSTRAP_POLL_MS = 50,
+ MARUFS_BOOTSTRAP_FORMAT_TIMEOUT_MS = 30000
+};
+
+/* Module parameter: settle window in ms (tunable for high-latency fabrics) */
+static unsigned int bootstrap_settle_ms = MARUFS_BOOTSTRAP_SETTLE_MS;
+module_param(bootstrap_settle_ms, uint, 0600);
+MODULE_PARM_DESC(bootstrap_settle_ms,
+ "Bootstrap slot claim settle window in ms (default 20)");
+
+/* Module parameter: stuck formatter detection timeout in ms */
+static unsigned int bootstrap_format_timeout_ms =
+ MARUFS_BOOTSTRAP_FORMAT_TIMEOUT_MS;
+module_param(bootstrap_format_timeout_ms, uint, 0600);
+MODULE_PARM_DESC(bootstrap_format_timeout_ms,
+ "Stuck formatter detection timeout in ms (default 30000)");
+
+/*
+ * Module parameter: chaos fault injection — if non-zero, the auto-mount
+ * formatter skips GSB-magic write and slot[0] status→CLAIMED promotion,
+ * leaving slot[0] stuck at FORMATTING forever. Joiners will detect this
+ * via bootstrap_format_timeout_ms and exercise the steal path.
+ * Set BEFORE mounting the formatter node. Default 0 (disabled).
+ */
+static unsigned int bootstrap_inject_stuck_formatter;
+module_param(bootstrap_inject_stuck_formatter, uint, 0600);
+MODULE_PARM_DESC(
+ bootstrap_inject_stuck_formatter,
+ "DEBUG: if non-zero, formatter skips GSB-magic write (chaos test). Default 0.");
+
+/*
+ * Module parameter: chaos race-window expander. If non-zero, after a free
+ * slot is identified but before the status=CLAIMED write, sleep this many
+ * microseconds. Widening this window lets concurrent mounters observe the
+ * same free slot and both write their token, exercising the last-writer-wins
+ * resolution. Default 0 (no artificial delay).
+ */
+static unsigned int bootstrap_debug_pre_write_delay_us;
+module_param(bootstrap_debug_pre_write_delay_us, uint, 0600);
+MODULE_PARM_DESC(
+ bootstrap_debug_pre_write_delay_us,
+ "DEBUG: delay (us) between free-slot scan and CLAIMED write to widen race window. Default 0.");
+
+/*
+ * marufs_bootstrap_should_inject_stuck - readable from mount path before
+ * sbi is fully constructed (unlike the old per-sbi sysfs flag).
+ */
+bool marufs_bootstrap_should_inject_stuck(void)
+{
+ return READ_LE32(bootstrap_inject_stuck_formatter) != 0;
+}
+
+/* ── Internal helpers ─────────────────────────────────────────────────── */
+
+static inline struct marufs_bootstrap_slot *__slot(struct marufs_sb_info *sbi,
+ int idx)
+{
+ return marufs_bootstrap_slot_get(sbi, idx);
+}
+
+/* Generate a non-zero random token (claim race tiebreaker). */
+static u64 __gen_nonzero_token(void)
+{
+ u64 t;
+ do {
+ t = get_random_u64();
+ } while (t == 0);
+ return t;
+}
+
+/*
+ * Write claim fields to a slot, settle, then re-read to verify ownership.
+ * Returns 0 on win (token survived), -EAGAIN on lost race.
+ *
+ * @slot: target bootstrap slot (in CXL memory)
+ * @status: status to write (CLAIMED for normal, FORMATTING for steal)
+ * @token: our random_token (caller already generated)
+ * @observed_token: optional out-param for the token observed after settle
+ * (used by claim() for the "lost race" log line). May be NULL.
+ */
+static int __claim_write_and_verify(struct marufs_bootstrap_slot *slot,
+ u32 status, u64 token, u64 *observed_token)
+{
+ WRITE_LE32(slot->magic, MARUFS_BOOTSTRAP_MAGIC);
+ WRITE_LE64(slot->random_token, token);
+ WRITE_LE32(slot->status, status);
+ MARUFS_CXL_WMB(slot, sizeof(*slot));
+
+ msleep(bootstrap_settle_ms);
+
+ MARUFS_CXL_RMB(slot, sizeof(*slot));
+ u64 obs = READ_CXL_LE64(slot->random_token);
+ if (observed_token)
+ *observed_token = obs;
+ return (obs == token) ? 0 : -EAGAIN;
+}
+
+/*
+ * me_node_is_dead - check whether node @node_id has stopped ticking.
+ *
+ * Cross-host safe: uses observer-local counter snap (mirrors
+ * me_handle_acquire_deadline in me.c). Sample heartbeat counter, sleep
+ * MARUFS_ME_LIVENESS_PROBE_NS on local clock, resample. No counter advance
+ * ⇒ owner not ticking ⇒ dead.
+ *
+ * Cross-host monotonic timestamps don't share an origin, so subtracting
+ * heartbeat_ts is meaningless — only counter advance is portable.
+ *
+ * Returns false if ME not started, node_id invalid, or counter advanced.
+ */
+static bool me_node_is_dead(struct marufs_sb_info *sbi, u32 node_id)
+{
+ struct marufs_me_instance *me = sbi->me;
+ if (!me)
+ return false; /* ME not started — cannot conclude dead */
+ if (!marufs_me_is_valid_node(me, node_id))
+ return false;
+
+ /* membership[] is indexed by internal idx = node_id - 1 */
+ struct marufs_me_membership_slot *ms =
+ me_membership_get(me, node_id - 1);
+ u64 hb_before = READ_CXL_LE64(ms->heartbeat);
+
+ /* Observer-local probe interval — sleep on our clock, not theirs. */
+ u64 probe_us = MARUFS_ME_LIVENESS_PROBE_NS / NSEC_PER_USEC;
+ usleep_range(probe_us, probe_us + probe_us / 4);
+
+ MARUFS_CXL_RMB(ms, sizeof(*ms));
+ u64 hb_after = READ_CXL_LE64(ms->heartbeat);
+
+ /* No advance ⇒ owner stopped ticking ⇒ dead */
+ return (hb_before == hb_after);
+}
+
+/* ── marufs_bootstrap_init_area ────────────────────────────────────────── */
+
+void marufs_bootstrap_init_area(void *base)
+{
+ BUILD_BUG_ON(sizeof(struct marufs_bootstrap_slot) !=
+ MARUFS_BOOTSTRAP_SLOT_SIZE);
+ /*
+ * No-op documentation call. Bootstrap area is zeroed by format or by
+ * the kernel's zero-page guarantee for fresh CXL devices.
+ * EMPTY = 0, magic = 0 ≠ MARUFS_BOOTSTRAP_MAGIC — claim scan correctly
+ * treats all slots as available on a fresh device.
+ */
+ (void)base;
+}
+
+/* ── marufs_bootstrap_claim ─────────────────────────────────────────────── */
+
+int marufs_bootstrap_claim(struct marufs_sb_info *sbi, int *out_slot_idx)
+{
+ struct marufs_bootstrap_slot *slots = marufs_bootstrap_slot_get(sbi, 0);
+ if (!slots)
+ return -EINVAL;
+
+ /*
+ * Single pass: pick the first reusable slot.
+ *
+ * A slot is reusable when:
+ * (a) magic != MARUFS_BOOTSTRAP_MAGIC (fresh/zeroed device), OR
+ * (b) status == EMPTY (graceful umount), OR
+ * (c) status == CLAIMED AND me_node_is_dead(idx+1) (owner crashed).
+ *
+ * FORMATTING is never reusable here — stuck-formatter steal handles
+ * stale FORMATTING on slot[0] via marufs_bootstrap_steal_stuck_slot0().
+ */
+ int free_idx = -1;
+ for (int i = 0; i < MARUFS_BOOTSTRAP_MAX_SLOTS; i++) {
+ struct marufs_bootstrap_slot *s = &slots[i];
+ MARUFS_CXL_RMB(s, sizeof(*s));
+
+ /* (a) uninitialised slot on a fresh device */
+ u32 mg = READ_CXL_LE32(s->magic);
+ if (mg != MARUFS_BOOTSTRAP_MAGIC) {
+ free_idx = i;
+ break;
+ }
+ /* (b) graceful-umount EMPTY */
+ u32 st = READ_CXL_LE32(s->status);
+ if (st == MARUFS_BS_EMPTY) {
+ free_idx = i;
+ break;
+ }
+ /* (c) CLAIMED but ME says node is dead */
+ if (st == MARUFS_BS_CLAIMED && me_node_is_dead(sbi, i + 1)) {
+ free_idx = i;
+ break;
+ }
+ /* FORMATTING — skip (claim in progress, steal path handles it) */
+ }
+
+ if (free_idx < 0)
+ return -EBUSY; /* table full */
+
+ u64 token = __gen_nonzero_token();
+
+ /* Chaos hook: artificially widen the race window between free-slot
+ * detection and the CLAIMED write so concurrent mounters observe the
+ * same free slot and both write their token. Disabled in production. */
+ if (bootstrap_debug_pre_write_delay_us)
+ usleep_range(bootstrap_debug_pre_write_delay_us,
+ bootstrap_debug_pre_write_delay_us + 100);
+
+ /* Write claim: magic → token → status=CLAIMED → settle → verify */
+ struct marufs_bootstrap_slot *tgt = &slots[free_idx];
+ u64 observed;
+ int ret = __claim_write_and_verify(tgt, MARUFS_BS_CLAIMED, token,
+ &observed);
+ if (ret) {
+ pr_info("bootstrap: claim race lost on slot %d (my_token=0x%016llx observed=0x%016llx)\n",
+ free_idx, token, observed);
+ return -EAGAIN; /* lost race; caller retries */
+ }
+
+ sbi->bootstrap_slot_idx = free_idx;
+ sbi->bootstrap_token = token;
+ *out_slot_idx = free_idx;
+ return 0;
+}
+
+/* ── marufs_bootstrap_claim_explicit ───────────────────────────────────── */
+
+int marufs_bootstrap_claim_explicit(struct marufs_sb_info *sbi, u32 node_id)
+{
+ if (node_id == 0 || node_id > MARUFS_BOOTSTRAP_MAX_SLOTS)
+ return -EINVAL;
+
+ int idx = (int)(node_id - 1);
+ struct marufs_bootstrap_slot *tgt = __slot(sbi, idx);
+ MARUFS_CXL_RMB(tgt, sizeof(*tgt));
+ u32 mg = READ_CXL_LE32(tgt->magic);
+ u32 st = READ_CXL_LE32(tgt->status);
+
+ /* Slot must be uninitialized, EMPTY, or stale-CLAIMED */
+ if (mg == MARUFS_BOOTSTRAP_MAGIC && st != MARUFS_BS_EMPTY) {
+ if (st == MARUFS_BS_CLAIMED) {
+ if (!me_node_is_dead(sbi, node_id)) {
+ pr_err("bootstrap: slot %d (node_id %u) CLAIMED and live\n",
+ idx, node_id);
+ return -EBUSY;
+ }
+ pr_info("bootstrap: slot %d (node_id %u) stale CLAIMED, stealing\n",
+ idx, node_id);
+ } else {
+ pr_err("bootstrap: slot %d (node_id %u) not claimable (status=%u)\n",
+ idx, node_id, st);
+ return -EBUSY;
+ }
+ }
+
+ u64 token = __gen_nonzero_token();
+ if (__claim_write_and_verify(tgt, MARUFS_BS_CLAIMED, token, NULL))
+ return -EAGAIN;
+
+ sbi->bootstrap_slot_idx = idx;
+ sbi->bootstrap_token = token;
+ return 0;
+}
+
+/* ── marufs_bootstrap_release ───────────────────────────────────────────── */
+
+void marufs_bootstrap_release(struct marufs_sb_info *sbi)
+{
+ if (sbi->bootstrap_slot_idx < 0)
+ return;
+
+ /*
+ * Graceful umount: write EMPTY directly so the slot is immediately
+ * reusable by the next mounter. No kthread to stop — ME owns liveness.
+ */
+ struct marufs_bootstrap_slot *tgt =
+ __slot(sbi, sbi->bootstrap_slot_idx);
+ WRITE_LE32(tgt->status, MARUFS_BS_EMPTY);
+ MARUFS_CXL_WMB(tgt, sizeof(*tgt));
+ pr_info("bootstrap: node %u slot %d released (EMPTY)\n", sbi->node_id,
+ sbi->bootstrap_slot_idx);
+ sbi->bootstrap_slot_idx = -1;
+}
+
+/* ── marufs_bootstrap_wait_for_format ──────────────────────────────────── */
+
+int marufs_bootstrap_wait_for_format(struct marufs_sb_info *sbi)
+{
+ struct marufs_superblock *gsb = marufs_gsb_get(sbi);
+ struct marufs_bootstrap_slot *slot0 = __slot(sbi, 0);
+ if (!gsb || !slot0)
+ return -EINVAL;
+
+ pr_info("bootstrap: node %u (slot %d) waiting for formatter\n",
+ sbi->node_id, sbi->bootstrap_slot_idx);
+
+ u64 local_start = ktime_get_ns();
+ u64 t_max_ns = (u64)bootstrap_format_timeout_ms * NSEC_PER_MSEC;
+
+ while (1) {
+ u32 slot0_st;
+
+ /* Check if format complete */
+ MARUFS_CXL_RMB(gsb, sizeof(*gsb));
+ if (READ_CXL_LE32(gsb->magic) == MARUFS_MAGIC) {
+ pr_info("bootstrap: format detected, node %u proceeding\n",
+ sbi->node_id);
+ return 0;
+ }
+
+ /* Check stuck formatter: slot[0] stuck at FORMATTING past timeout */
+ MARUFS_CXL_RMB(slot0, sizeof(*slot0));
+ slot0_st = READ_CXL_LE32(slot0->status);
+ if (slot0_st == MARUFS_BS_FORMATTING) {
+ u64 elapsed_ns = ktime_get_ns() - local_start;
+ if (elapsed_ns > t_max_ns) {
+ pr_warn("bootstrap: formatter stuck after %llu ms, entering recovery\n",
+ elapsed_ns / NSEC_PER_MSEC);
+ return -EAGAIN;
+ }
+ }
+
+ msleep_interruptible(MARUFS_BOOTSTRAP_POLL_MS);
+ }
+}
+
+/* ── marufs_bootstrap_set_status ────────────────────────────────────────── */
+
+void marufs_bootstrap_set_status(struct marufs_sb_info *sbi, int slot_idx,
+ enum marufs_bootstrap_status status)
+{
+ struct marufs_bootstrap_slot *tgt =
+ marufs_bootstrap_slot_get(sbi, slot_idx);
+ if (!tgt)
+ return;
+
+ WRITE_LE32(tgt->status, status);
+ MARUFS_CXL_WMB(tgt, sizeof(*tgt));
+}
+
+/* ── __bootstrap_zero_format_output ─────────────────────────────────────── */
+
+/*
+ * Zero the format output area (GSB + shard table), preserving the bootstrap
+ * area. Used before a re-format so stale layout data does not confuse the
+ * new formatter.
+ */
+static void __bootstrap_zero_format_output(struct marufs_sb_info *sbi)
+{
+ u64 regions_start = MARUFS_REGION_OFFSET;
+
+ void *base = sbi->dax_base;
+ memset(base, 0, MARUFS_GSB_SIZE);
+ memset(marufs_dax_ptr(sbi, MARUFS_SHARD_TABLE_OFFSET), 0,
+ regions_start - MARUFS_SHARD_TABLE_OFFSET);
+ MARUFS_CXL_WMB(base, regions_start);
+}
+
+/* ── marufs_bootstrap_steal_stuck_slot0 ────────────────────────────────── */
+
+int marufs_bootstrap_steal_stuck_slot0(struct marufs_sb_info *sbi)
+{
+ struct marufs_bootstrap_slot *slot0 = __slot(sbi, 0);
+ u64 token = __gen_nonzero_token();
+
+ if (__claim_write_and_verify(slot0, MARUFS_BS_FORMATTING, token,
+ NULL)) {
+ pr_info("bootstrap: lost slot[0] steal race, retry\n");
+ return -EAGAIN;
+ }
+
+ pr_info("bootstrap: node %u stole slot[0] for recovery\n",
+ sbi->node_id);
+
+ /* Zero the format output area, preserving bootstrap area.
+ * Strict re-format: safer than detecting partial-format state.
+ */
+ __bootstrap_zero_format_output(sbi);
+
+ sbi->bootstrap_slot_idx = 0;
+ sbi->bootstrap_token = token;
+ return 0;
+}
+
+/* ── Sysfs dump helper ──────────────────────────────────────────────────── */
+
+static const char *const bs_status_names[] = {
+ [MARUFS_BS_EMPTY] = "EMPTY",
+ [MARUFS_BS_CLAIMED] = "CLAIMED",
+ [MARUFS_BS_FORMATTING] = "FORMATTING",
+};
+
+ssize_t marufs_bootstrap_dump_slots(struct marufs_sb_info *sbi, char *buf,
+ size_t bufsize)
+{
+ if (bufsize == 0)
+ return 0;
+
+ struct marufs_bootstrap_slot *slots = marufs_bootstrap_slot_get(sbi, 0);
+ if (!slots)
+ return scnprintf(buf, bufsize, "(bootstrap not initialized)\n");
+
+ ssize_t n = 0;
+ for (int i = 0; i < MARUFS_BOOTSTRAP_MAX_SLOTS; i++) {
+ if ((size_t)n >= bufsize)
+ break;
+
+ struct marufs_bootstrap_slot *s = &slots[i];
+ MARUFS_CXL_RMB(s, sizeof(*s));
+
+ u32 mg = READ_CXL_LE32(s->magic);
+ u32 st = READ_CXL_LE32(s->status);
+ u64 tok = READ_CXL_LE64(s->random_token);
+
+ const char *stname = (st < ARRAY_SIZE(bs_status_names) &&
+ bs_status_names[st]) ?
+ bs_status_names[st] :
+ "?";
+
+ n += scnprintf(buf + n, bufsize - n,
+ "slot[%d] node_id=%d magic=0x%08x status=%s "
+ "token=0x%016llx%s\n",
+ i, i + 1, mg, stname, tok,
+ (i == sbi->bootstrap_slot_idx) ? " " : "");
+ }
+ return n;
+}
diff --git a/marufs_kernel/src/bootstrap.h b/marufs_kernel/src/bootstrap.h
new file mode 100644
index 0000000..beab3db
--- /dev/null
+++ b/marufs_kernel/src/bootstrap.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * bootstrap.h - MARUFS bootstrap slot API.
+ *
+ * Bootstrap is ONLY for mount-time election + format gate.
+ * Runtime liveness is owned entirely by ME.
+ *
+ * Public surface consumed by super.c, sysfs_debug.c.
+ */
+
+#ifndef _MARUFS_BOOTSTRAP_H
+#define _MARUFS_BOOTSTRAP_H
+
+#include
+#include "marufs_bootstrap_layout.h"
+
+struct marufs_sb_info;
+
+/* ── Core bootstrap API ─────────────────────────────────────────── */
+
+/*
+ * marufs_bootstrap_init_area - no-op documentation call; bootstrap area is
+ * zero by construction (fresh device) or by graceful umount EMPTY writes.
+ */
+void marufs_bootstrap_init_area(void *base);
+
+/*
+ * marufs_bootstrap_claim - claim the first available slot.
+ *
+ * Reusable slots: !magic_ok || status==EMPTY ||
+ * (status==CLAIMED && me_node_is_dead(idx+1)).
+ * FORMATTING slots are never reusable here (steal path handles that).
+ *
+ * Write order: magic → token → status=CLAIMED → wmb → settle → rmb → verify.
+ * Returns 0 with *out_slot_idx set on success.
+ * Returns -EAGAIN if lost the race (caller retries from top).
+ * Returns -EBUSY if all slots occupied and live.
+ */
+int marufs_bootstrap_claim(struct marufs_sb_info *sbi, int *out_slot_idx);
+
+/*
+ * marufs_bootstrap_claim_explicit - claim slot[node_id-1] for manual mount.
+ *
+ * Returns -EBUSY if slot is live (CLAIMED + ME alive).
+ */
+int marufs_bootstrap_claim_explicit(struct marufs_sb_info *sbi, u32 node_id);
+
+/*
+ * marufs_bootstrap_release - write status=EMPTY (clean umount).
+ * No kthread to stop. No heartbeat write. Just the EMPTY transition.
+ */
+void marufs_bootstrap_release(struct marufs_sb_info *sbi);
+
+/*
+ * marufs_bootstrap_wait_for_format - joiner polls GSB magic until format done.
+ *
+ * If slot[0].status==FORMATTING and bootstrap_format_timeout_ms elapses
+ * without GSB magic appearing, returns -EAGAIN so caller can steal.
+ * Returns 0 when GSB magic is valid.
+ */
+int marufs_bootstrap_wait_for_format(struct marufs_sb_info *sbi);
+
+/*
+ * marufs_bootstrap_steal_stuck_slot0 - recover from a crashed formatter.
+ *
+ * Overwrites slot[0] with our token, writes status=FORMATTING, settle+verify.
+ * Zeroes format area (preserving bootstrap area) on success.
+ *
+ * Returns 0 if we took ownership of slot[0].
+ * Returns -EAGAIN if another node won the steal race.
+ */
+int marufs_bootstrap_steal_stuck_slot0(struct marufs_sb_info *sbi);
+
+/*
+ * marufs_bootstrap_set_status - write status field to slot @slot_idx with WMB.
+ * Use marufs_bootstrap_promote_claimed() for the common slot[0] CLAIMED case.
+ */
+void marufs_bootstrap_set_status(struct marufs_sb_info *sbi, int slot_idx,
+ enum marufs_bootstrap_status status);
+
+/* ── Sysfs debug helpers ────────────────────────────────────────── */
+
+/*
+ * marufs_bootstrap_dump_slots - print slot table to @buf, bounded by @bufsize.
+ */
+ssize_t marufs_bootstrap_dump_slots(struct marufs_sb_info *sbi, char *buf,
+ size_t bufsize);
+
+/*
+ * marufs_bootstrap_should_inject_stuck - true when the
+ * bootstrap_inject_stuck_formatter module param is non-zero.
+ *
+ * Must be checked via this helper (not sbi field) because the mount path
+ * needs to read it before sbi is fully constructed.
+ */
+bool marufs_bootstrap_should_inject_stuck(void);
+
+#endif /* _MARUFS_BOOTSTRAP_H */
diff --git a/marufs_kernel/src/cache.c b/marufs_kernel/src/cache.c
new file mode 100644
index 0000000..0300fc5
--- /dev/null
+++ b/marufs_kernel/src/cache.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* cache.c - MARUFS Entry Cache (stub) */
+
+#include
+#include
+
+#include "marufs.h"
+
+int marufs_cache_init(struct marufs_sb_info *sbi)
+{
+ sbi->entry_cache = NULL;
+ pr_debug("cache initialized (stub)\n");
+ return 0;
+}
+
+void marufs_cache_destroy(struct marufs_sb_info *sbi)
+{
+ sbi->entry_cache = NULL;
+ pr_debug("cache destroyed (stub)\n");
+}
diff --git a/marufs_kernel/src/cache.h b/marufs_kernel/src/cache.h
new file mode 100644
index 0000000..81d7e33
--- /dev/null
+++ b/marufs_kernel/src/cache.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * cache.h - File entry cache lifecycle.
+ */
+
+#ifndef _MARUFS_CACHE_H
+#define _MARUFS_CACHE_H
+
+struct marufs_sb_info;
+
+int marufs_cache_init(struct marufs_sb_info *sbi);
+void marufs_cache_destroy(struct marufs_sb_info *sbi);
+
+#endif /* _MARUFS_CACHE_H */
diff --git a/marufs_kernel/src/compat.h b/marufs_kernel/src/compat.h
new file mode 100644
index 0000000..dca3f2d
--- /dev/null
+++ b/marufs_kernel/src/compat.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * compat.h - MARUFS kernel version compatibility shim
+ *
+ * Consolidates all LINUX_VERSION_CODE checks.
+ * Covers kernel API changes from 5.x through 6.5+.
+ */
+
+#ifndef _MARUFS_COMPAT_H
+#define _MARUFS_COMPAT_H
+
+#include
+#include
+
+/* VFS idmap parameter abstraction (5.12, 6.3) */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0)
+#define MARUFS_IDMAP_PARAM_COMMA struct mnt_idmap *idmap,
+#define MARUFS_IDMAP_ARG_COMMA idmap,
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 12, 0)
+#define MARUFS_IDMAP_PARAM_COMMA struct user_namespace *mnt_userns,
+#define MARUFS_IDMAP_ARG_COMMA mnt_userns,
+#else
+#define MARUFS_IDMAP_PARAM_COMMA /* empty */
+#define MARUFS_IDMAP_ARG_COMMA /* empty */
+#endif
+
+/*
+ * generic_fillattr() wrapper — inline function to avoid preprocessor
+ * argument counting issues with MARUFS_IDMAP_ARG_COMMA trailing comma macro.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0)
+static inline void marufs_generic_fillattr(struct mnt_idmap *idmap,
+ u32 req_mask, struct inode *inode,
+ struct kstat *stat)
+{
+ generic_fillattr(idmap, req_mask, inode, stat);
+}
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 12, 0)
+static inline void marufs_generic_fillattr(struct user_namespace *mnt_userns,
+ u32 req_mask, struct inode *inode,
+ struct kstat *stat)
+{
+ generic_fillattr(mnt_userns, inode, stat);
+}
+#else
+static inline void marufs_generic_fillattr(u32 req_mask, struct inode *inode,
+ struct kstat *stat)
+{
+ generic_fillattr(inode, stat);
+}
+#endif
+
+/* setattr_prepare() wrapper */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0)
+static inline int marufs_setattr_prepare(struct mnt_idmap *idmap,
+ struct dentry *dentry,
+ struct iattr *attr)
+{
+ return setattr_prepare(idmap, dentry, attr);
+}
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 12, 0)
+static inline int marufs_setattr_prepare(struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct iattr *attr)
+{
+ return setattr_prepare(mnt_userns, dentry, attr);
+}
+#else
+static inline int marufs_setattr_prepare(struct dentry *dentry,
+ struct iattr *attr)
+{
+ return setattr_prepare(dentry, attr);
+}
+#endif
+
+/* SLAB_MEM_SPREAD removed in 6.8 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0)
+#define MARUFS_SLAB_MEM_SPREAD 0
+#else
+#define MARUFS_SLAB_MEM_SPREAD SLAB_MEM_SPREAD
+#endif
+
+/* set_page_dirty() removed in 6.8, replaced by folio_mark_dirty() */
+static inline void marufs_set_page_dirty(struct page *page)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 8, 0)
+ folio_mark_dirty(page_folio(page));
+#else
+ set_page_dirty(page);
+#endif
+}
+
+/* d_revalidate() signature changed in 6.12:
+ * old: int (*)(struct dentry *, unsigned int)
+ * new: int (*)(struct inode *, const struct qstr *, struct dentry *, unsigned int)
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 12, 0)
+#define MARUFS_D_REVALIDATE_ARGS \
+ struct inode *dir, const struct qstr *name, struct dentry *dentry, \
+ unsigned int flags
+#else
+#define MARUFS_D_REVALIDATE_ARGS struct dentry *dentry, unsigned int flags
+#endif
+
+/* s_d_op: use set_default_d_op() on 6.17+ (direct __s_d_op write
+ * skips DCACHE_OP_REVALIDATE flag), fall back to direct s_d_op on older. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 17, 0)
+#define MARUFS_SET_D_OP(sb, ops) set_default_d_op(sb, ops)
+#else
+#define MARUFS_SET_D_OP(sb, ops) ((sb)->s_d_op = (ops))
+#endif
+
+/* call_mmap() renamed to vfs_mmap() in 6.17 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 17, 0)
+#define marufs_call_mmap(file, vma) vfs_mmap(file, vma)
+#else
+#define marufs_call_mmap(file, vma) call_mmap(file, vma)
+#endif
+
+#endif /* _MARUFS_COMPAT_H */
diff --git a/marufs_kernel/src/dir.c b/marufs_kernel/src/dir.c
new file mode 100644
index 0000000..3368b68
--- /dev/null
+++ b/marufs_kernel/src/dir.c
@@ -0,0 +1,347 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * dir.c - MARUFS flat namespace directory operations
+ *
+ * Root directory is a flat namespace containing all files across all regions.
+ * Files are accessed through global sharded index. No chunk directories exist.
+ *
+ * Two-phase create:
+ * open(O_CREAT) -> reserve RAT entry + insert index (lightweight, no space)
+ * ftruncate(N) -> allocate physical region (in inode.c setattr)
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "compat.h"
+#include "marufs.h"
+#include "me.h"
+
+/* ============================================================================
+ * marufs_lookup - Global index hash lookup
+ * ============================================================================ */
+
+static struct dentry *marufs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct super_block *sb = dir->i_sb;
+ struct marufs_sb_info *sbi = marufs_sb_get(sb);
+ struct marufs_index_entry *entry;
+ struct inode *inode = NULL;
+ int ret;
+
+ if (dir->i_ino != MARUFS_ROOT_INO)
+ return ERR_PTR(-ENOENT);
+
+ pr_debug("lookup '%.*s' in global index\n", (int)dentry->d_name.len,
+ dentry->d_name.name);
+
+ /* Search in global index */
+ ret = marufs_index_lookup(sbi, dentry->d_name.name, dentry->d_name.len,
+ &entry);
+ if (ret == 0) {
+ u64 hash;
+ u32 shard_id;
+ u32 entry_idx;
+ struct marufs_index_entry *base;
+
+ /* Found - determine shard_id and entry_idx for inode creation */
+ hash = READ_CXL_LE64(entry->name_hash);
+ shard_id = marufs_shard_idx(hash, sbi->shard_mask);
+
+ base = marufs_shard_entries(sbi, shard_id);
+ if (!base)
+ return ERR_PTR(-EIO);
+ entry_idx = (u32)(entry - base);
+
+ inode = marufs_iget(sb, entry, shard_id, entry_idx);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ pr_debug("found '%.*s' region=%u\n", (int)dentry->d_name.len,
+ dentry->d_name.name, READ_CXL_LE32(entry->region_id));
+ }
+
+ return d_splice_alias(inode, dentry);
+}
+
+/* ============================================================================
+ * marufs_create - lightweight file creation (two-phase model)
+ * ============================================================================
+ *
+ * Phase 1 only: reserves RAT entry and inserts index entry.
+ * No physical space is allocated. i_size = 0, data_phys_offset = 0.
+ * Physical allocation happens in marufs_setattr() via ftruncate().
+ */
+
+/*
+ * __marufs_create_locked - critical section: RAT alloc + index insert.
+ * Caller holds Global ME. On success, *out_rat_entry_id and *out_entry_idx
+ * are set. On failure, any partial allocation is cleaned up internally.
+ */
+static int __marufs_create_locked(struct marufs_sb_info *sbi,
+ struct dentry *dentry, umode_t mode,
+ u32 *out_rat_entry_id, u32 *out_entry_idx)
+{
+ /* Step 1: Reserve RAT entry (size=0, offset=0) */
+ u32 rat_entry_id;
+ int ret = marufs_rat_alloc_entry(sbi, dentry->d_name.name, 0, 0,
+ &rat_entry_id);
+ if (ret) {
+ pr_err("RAT entry reservation failed: %d\n", ret);
+ return ret;
+ }
+
+ /* Write uid/gid/mode to RAT entry (single source of truth) */
+ struct marufs_rat_entry *rat_e =
+ marufs_rat_entry_get(sbi, rat_entry_id);
+ if (rat_e) {
+ WRITE_LE32(rat_e->uid, current_uid().val);
+ WRITE_LE32(rat_e->gid, current_gid().val);
+ WRITE_LE16(rat_e->mode, mode & 0777);
+ MARUFS_CXL_WMB(rat_e, sizeof(*rat_e));
+ }
+
+ /* Step 2: Insert into global index */
+ ret = marufs_index_insert(sbi, dentry->d_name.name, dentry->d_name.len,
+ rat_entry_id, out_entry_idx);
+ if (ret) {
+ pr_err("index insert failed: %d\n", ret);
+ marufs_rat_free_entry(marufs_rat_entry_get(sbi, rat_entry_id));
+ return ret;
+ }
+
+ *out_rat_entry_id = rat_entry_id;
+ return 0;
+}
+
+static int marufs_create(MARUFS_IDMAP_PARAM_COMMA struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ if (dir->i_ino != MARUFS_ROOT_INO)
+ return -ENOENT;
+
+ if (dentry->d_name.len > MARUFS_NAME_MAX)
+ return -ENAMETOOLONG;
+
+ pr_debug("creating file '%.*s' (lightweight, no physical space)\n",
+ (int)dentry->d_name.len, dentry->d_name.name);
+
+ struct super_block *sb = dir->i_sb;
+ struct marufs_sb_info *sbi = marufs_sb_get(sb);
+ if (!sbi->rat) {
+ pr_err("RAT not initialized\n");
+ return -ENOSPC;
+ }
+
+ /* Global ME: serialize RAT alloc + index insert across nodes */
+ int ret = sbi->me->ops->acquire(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ if (ret)
+ return ret;
+ u32 rat_entry_id, entry_idx;
+ ret = __marufs_create_locked(sbi, dentry, mode, &rat_entry_id,
+ &entry_idx);
+ sbi->me->ops->release(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ if (ret)
+ return ret;
+
+ u64 hash = marufs_hash_name(dentry->d_name.name, dentry->d_name.len);
+ u32 shard_id = marufs_shard_idx(hash, sbi->shard_mask);
+
+ /* Resolve entry pointer directly — no redundant lookup needed */
+ struct marufs_index_entry *entry =
+ marufs_shard_entry(sbi, shard_id, entry_idx);
+ if (!entry) {
+ pr_err("post-insert entry resolve failed\n");
+ marufs_index_delete(sbi, dentry->d_name.name,
+ dentry->d_name.len);
+ marufs_rat_free_entry(marufs_rat_entry_get(sbi, rat_entry_id));
+ return -EIO;
+ }
+
+ struct inode *inode = marufs_new_inode(sb, mode);
+ if (IS_ERR(inode)) {
+ pr_err("inode creation failed: %ld\n", PTR_ERR(inode));
+ marufs_index_delete(sbi, dentry->d_name.name,
+ dentry->d_name.len);
+ marufs_rat_free_entry(marufs_rat_entry_get(sbi, rat_entry_id));
+ return PTR_ERR(inode);
+ }
+
+ struct marufs_inode_info *xi = marufs_inode_get(inode);
+ xi->region_id = rat_entry_id;
+ xi->entry_idx = entry_idx;
+ xi->shard_id = shard_id;
+ xi->rat_entry_id = rat_entry_id;
+ xi->region_offset = 0; /* No physical region yet */
+ xi->owner_node_id = sbi->node_id;
+ xi->owner_pid = current->pid;
+ xi->owner_birth_time = ktime_to_ns(current->start_boottime);
+ xi->data_phys_offset = 0; /* Will be set by ftruncate */
+
+ inode->i_ino = marufs_make_ino(rat_entry_id);
+ inode->i_size = 0;
+ inode->i_op = &marufs_file_inode_ops;
+ inode->i_fop = &marufs_file_ops;
+
+ d_instantiate(dentry, inode);
+
+ pr_debug("reserved RAT entry %u for '%.*s' (ftruncate pending)\n",
+ rat_entry_id, (int)dentry->d_name.len, dentry->d_name.name);
+
+ return 0;
+}
+
+/* ============================================================================
+ * marufs_unlink - deletion via global index tombstone
+ * ============================================================================ */
+
+/*
+ * marufs_unlink_cleanup_region - invalidate region header and free RAT entry
+ * @sbi: superblock info
+ * @rat_entry_id: RAT entry ID to clean up
+ */
+static void marufs_unlink_cleanup_region(struct marufs_sb_info *sbi,
+ u32 rat_entry_id)
+{
+ struct marufs_rat_entry *rat_e =
+ marufs_rat_entry_get(sbi, rat_entry_id);
+ if (!rat_e)
+ return;
+
+ /* CAS: ALLOCATED → DELETING (preempt — prevent race with GC) */
+ u32 old_state = marufs_le32_cas(&rat_e->state,
+ MARUFS_RAT_ENTRY_ALLOCATED,
+ MARUFS_RAT_ENTRY_DELETING);
+ if (old_state != MARUFS_RAT_ENTRY_ALLOCATED)
+ return; /* GC already preempted or already FREE */
+
+ marufs_rat_free_entry(rat_e);
+}
+
+static int marufs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct marufs_inode_info *xi = marufs_inode_get(inode);
+ struct marufs_sb_info *sbi = marufs_sb_get(inode->i_sb);
+ int ret;
+
+ pr_debug("unlink '%.*s' rat_entry=%u\n", (int)dentry->d_name.len,
+ dentry->d_name.name, xi->rat_entry_id);
+
+ /* Permission check: DELETE required */
+ ret = marufs_check_permission(sbi, xi->rat_entry_id,
+ MARUFS_PERM_DELETE);
+ if (ret) {
+ /* Dead owner with no active delegations → allow force delete */
+ if (!marufs_can_force_unlink(sbi, xi->rat_entry_id)) {
+ pr_err("no delete permission for rat_entry %u\n",
+ xi->rat_entry_id);
+ return -EACCES;
+ }
+ }
+
+ /* Delete region entry from global index (mark as TOMBSTONE) */
+ ret = marufs_index_delete(sbi, dentry->d_name.name, dentry->d_name.len);
+ if (ret) {
+ pr_err("index delete failed: %d\n", ret);
+ return ret;
+ }
+
+ /* Region cleanup: only if RAT entry still ALLOCATED (skip if GC freed it) */
+ marufs_unlink_cleanup_region(sbi, xi->rat_entry_id);
+
+ drop_nlink(inode);
+ inode_set_ctime_to_ts(inode, current_time(inode));
+
+ pr_debug("unlinked '%.*s'\n", (int)dentry->d_name.len,
+ dentry->d_name.name);
+
+ return 0;
+}
+
+/* ============================================================================
+ * marufs_iterate - scan all shards for readdir
+ * ============================================================================ */
+
+static int marufs_iterate(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct marufs_sb_info *sbi = marufs_sb_get(inode->i_sb);
+ struct marufs_rat_entry *rat_e;
+ u32 i;
+ char name_buf[MARUFS_NAME_MAX + 1];
+
+ /* "." entry */
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
+ return 0;
+ ctx->pos = 1;
+ }
+
+ /* ".." entry */
+ if (ctx->pos == 1) {
+ if (!dir_emit_dotdot(file, ctx))
+ return 0;
+ ctx->pos = 2;
+ }
+
+ /*
+ * RAT-based readdir: scan 256 RAT entries instead of 64*16384 index entries.
+ *
+ * ctx->pos encoding:
+ * 0 = "."
+ * 1 = ".."
+ * 2 + i = RAT entry[i] (i = 0..255)
+ *
+ * On VFS re-entry after buffer full, pos resumes from the next RAT slot.
+ */
+ for (i = ctx->pos - 2; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ unsigned long ino;
+ size_t name_len;
+
+ rat_e = &sbi->rat->entries[i];
+ MARUFS_CXL_RMB(rat_e, sizeof(*rat_e));
+
+ if (READ_LE32(rat_e->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ continue;
+
+ memcpy(name_buf, rat_e->name, MARUFS_NAME_MAX);
+ name_buf[MARUFS_NAME_MAX] = '\0';
+ name_len = strnlen(name_buf, MARUFS_NAME_MAX);
+
+ if (name_len == 0)
+ continue;
+
+ ino = marufs_make_ino(i);
+
+ if (!dir_emit(ctx, name_buf, name_len, ino, DT_REG))
+ return 0;
+
+ ctx->pos = i + 3; /* next iteration resumes from i+1 */
+ }
+
+ return 0;
+}
+
+/* ============================================================================
+ * Operations tables
+ * ============================================================================ */
+
+const struct file_operations marufs_dir_ops = {
+ .owner = THIS_MODULE,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = marufs_iterate,
+ .fsync = noop_fsync,
+};
+
+const struct inode_operations marufs_dir_inode_ops = {
+ .lookup = marufs_lookup,
+ .create = marufs_create,
+ .unlink = marufs_unlink,
+};
diff --git a/marufs_kernel/src/dir.h b/marufs_kernel/src/dir.h
new file mode 100644
index 0000000..7d4fd93
--- /dev/null
+++ b/marufs_kernel/src/dir.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * dir.h - VFS directory file_operations export.
+ */
+
+#ifndef _MARUFS_DIR_H
+#define _MARUFS_DIR_H
+
+#include
+
+extern const struct file_operations marufs_dir_ops;
+
+#endif /* _MARUFS_DIR_H */
diff --git a/marufs_kernel/src/file.c b/marufs_kernel/src/file.c
new file mode 100644
index 0000000..cf3c526
--- /dev/null
+++ b/marufs_kernel/src/file.c
@@ -0,0 +1,878 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* file.c - MARUFS file operations (open, read, mmap, ioctl) */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "compat.h"
+#include "marufs.h"
+#include "me.h"
+
+/* vma hardening: block fork inherit, mremap grow, coredump leak */
+#define MARUFS_VMA_HARDEN_FLAGS (VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP)
+
+/* Pre-allocated batch buffer to avoid kvmalloc per ioctl */
+#define MARUFS_BATCH_BUF_SIZE \
+ max(MARUFS_BATCH_FIND_MAX * sizeof(struct marufs_find_name_req), \
+ MARUFS_BATCH_STORE_MAX * sizeof(struct marufs_name_offset_req))
+
+struct marufs_file_priv {
+ void *batch_buf;
+};
+
+struct marufs_file_ctx {
+ struct inode *inode;
+ struct marufs_inode_info *xi;
+ struct marufs_sb_info *sbi;
+};
+
+static inline void marufs_file_ctx_init(struct marufs_file_ctx *ctx,
+ struct file *file)
+{
+ ctx->inode = file_inode(file);
+ ctx->xi = marufs_inode_get(ctx->inode);
+ ctx->sbi = marufs_sb_get(ctx->inode->i_sb);
+}
+
+/* open() always allowed — permission checks happen at data access time */
+static int marufs_open(struct inode *inode, struct file *file)
+{
+ int ret = generic_file_open(inode, file);
+ if (ret)
+ return ret;
+
+ struct marufs_file_priv *priv =
+ kvmalloc(sizeof(*priv) + MARUFS_BATCH_BUF_SIZE, GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->batch_buf = priv + 1;
+ file->private_data = priv;
+ return 0;
+}
+
+static int marufs_release(struct inode *inode, struct file *file)
+{
+ kvfree(file->private_data);
+ return 0;
+}
+
+/*
+ * marufs_require_cloexec - require FD_CLOEXEC on the caller's fd for @file.
+ *
+ * Post-exec privilege retention defense (fd-level, defense in depth):
+ * even when execve preserves PID + start_boottime + exe_inode (e.g.,
+ * same binary re-exec with hostile argv), an unmarked fd would survive
+ * exec and let the new context reach our data paths. Requiring
+ * FD_CLOEXEC forces the kernel to drop the fd across execve, closing
+ * that class of attack regardless of exe inode equality.
+ *
+ * Looked up at data-access time (mmap/read/write/ioctl) because at
+ * .open the fd is not yet installed in fdtable; only there can we
+ * inspect close_on_exec.
+ */
+static bool marufs_require_cloexec(struct file *file)
+{
+ struct files_struct *files = current->files;
+ if (!files)
+ return false;
+
+ rcu_read_lock();
+
+ struct fdtable *fdt = files_fdtable(files);
+ bool cloexec = false;
+ for (unsigned int i = 0; i < fdt->max_fds; i++) {
+ if (rcu_dereference_raw(fdt->fd[i]) == file) {
+ cloexec = close_on_exec(i, files);
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return cloexec;
+}
+
+/*
+ * Direct CXL read — bypasses page cache, copies to user buffer.
+ *
+ * marufs is a DAX FS: primary data access is via mmap (zero-copy).
+ * read_iter supports read() for debugging/operational tools (cat, hexdump)
+ * while maintaining full permission checks and CXL cache coherence (RMB).
+ *
+ * Page cache path (read_folio) is intentionally blocked (-EIO) because:
+ * - No permission check interface at folio fill time
+ * - DRAM page cache copies break cross-node CXL coherence
+ * - sendfile/splice are not part of the KV cache access model
+ */
+static ssize_t marufs_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct marufs_file_ctx fc;
+ marufs_file_ctx_init(&fc, iocb->ki_filp);
+
+ if (!marufs_require_cloexec(iocb->ki_filp))
+ return -EACCES;
+
+ int ret = marufs_check_permission(fc.sbi, fc.xi->rat_entry_id,
+ MARUFS_PERM_READ);
+ if (ret)
+ return ret;
+
+ /*
+ * Cross-node i_size sync: remote ftruncate updates RAT but not our
+ * DRAM i_size (d_revalidate=0 only affects new lookups, not open fds).
+ */
+ struct marufs_rat_entry *rat_e =
+ marufs_rat_entry_get(fc.sbi, fc.xi->rat_entry_id);
+ if (rat_e) {
+ u64 fresh_size = READ_LE64(rat_e->size);
+ if (fresh_size != (u64)fc.inode->i_size) {
+ inode_lock(fc.inode);
+ i_size_write(fc.inode, fresh_size);
+ inode_unlock(fc.inode);
+ }
+
+ if (fc.xi->data_phys_offset == 0) {
+ u64 phys = READ_CXL_LE64(rat_e->phys_offset);
+ if (phys != 0)
+ fc.xi->data_phys_offset = phys;
+ }
+ }
+
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(to);
+ if (pos >= fc.inode->i_size)
+ return 0;
+ if (pos + count > fc.inode->i_size)
+ count = fc.inode->i_size - pos;
+ if (count == 0)
+ return 0;
+
+ if (unlikely(fc.xi->data_phys_offset == 0)) {
+ pr_debug("read on uninitialized region (ftruncate pending)\n");
+ return 0;
+ }
+
+ if (unlikely(!marufs_validate_region_addr(
+ fc.sbi, fc.xi->data_phys_offset, pos + count))) {
+ pr_err("read range invalid slot_base=0x%llx pos=%lld count=%zu\n",
+ fc.xi->data_phys_offset, pos, count);
+ return -EIO;
+ }
+
+ /* RMB: WC writer → WB reader cache coherence */
+ void *data_ptr =
+ marufs_file_data_at(fc.sbi, fc.xi->data_phys_offset, pos);
+ MARUFS_CXL_RMB(data_ptr, count);
+
+ size_t copied = copy_to_iter(data_ptr, count, to);
+ if (copied == 0)
+ return -EFAULT;
+
+ iocb->ki_pos += copied;
+ return copied;
+}
+
+/* write() rejected — data writes only via mmap(PROT_WRITE) */
+static ssize_t marufs_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ return -EACCES;
+}
+
+/* RAT delegation gate shared by mmap and mprotect paths. */
+static int marufs_authorize_perms(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi, bool need_write)
+{
+ int ret = marufs_check_permission(sbi, xi->rat_entry_id,
+ MARUFS_PERM_READ);
+ if (ret)
+ return ret;
+ if (need_write)
+ return marufs_check_permission(sbi, xi->rat_entry_id,
+ MARUFS_PERM_WRITE);
+ return 0;
+}
+
+/*
+ * vm_ops wrapper hooks. Recover sbi via container_of(vma->vm_ops),
+ * xi via vma->vm_private_data (set by attach, kept alive by igrab).
+ * vma->vm_file may point to dax_filp (device_dax delegate).
+ */
+
+static void marufs_vm_open(struct vm_area_struct *vma)
+{
+ struct marufs_inode_info *xi = vma->vm_private_data;
+
+ if (xi)
+ igrab(&xi->vfs_inode);
+}
+
+static void marufs_vm_close(struct vm_area_struct *vma)
+{
+ struct marufs_inode_info *xi = vma->vm_private_data;
+
+ if (xi)
+ iput(&xi->vfs_inode);
+}
+
+/* mprotect: block escalation past RAT delegation. */
+static int marufs_vm_mprotect(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ if (!(newflags & (VM_READ | VM_WRITE | VM_EXEC)))
+ return 0; /* PROT_NONE */
+
+ struct marufs_inode_info *xi = vma->vm_private_data;
+ if (!xi || !vma->vm_ops)
+ return 0;
+
+ if ((newflags & VM_WRITE) && vma->vm_file &&
+ !(vma->vm_file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ struct marufs_sb_info *sbi =
+ container_of(vma->vm_ops, struct marufs_sb_info, vm_ops);
+
+ return marufs_authorize_perms(sbi, xi, newflags & VM_WRITE);
+}
+
+/*
+ * Lazy-seed sbi->vm_ops from underlying driver's ops, override hooks,
+ * point vma at it. xi stashed in vm_private_data + igrab keeps inode
+ * alive across vma life. .open/.close maintain igrab balance over
+ * vma split/clone.
+ */
+static int marufs_attach_vm_ops(struct vm_area_struct *vma,
+ struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi)
+{
+ mutex_lock(&sbi->vm_ops_lock);
+ if (!sbi->vm_ops_seeded) {
+ if (vma->vm_ops)
+ sbi->vm_ops = *vma->vm_ops;
+ else
+ memset(&sbi->vm_ops, 0, sizeof(sbi->vm_ops));
+ sbi->vm_ops.open = marufs_vm_open;
+ sbi->vm_ops.close = marufs_vm_close;
+ sbi->vm_ops.mprotect = marufs_vm_mprotect;
+ sbi->vm_ops_seeded = true;
+ }
+ mutex_unlock(&sbi->vm_ops_lock);
+
+ if (!igrab(&xi->vfs_inode))
+ return -ESTALE; /* inode being evicted */
+ vma->vm_private_data = xi;
+ vma->vm_ops = &sbi->vm_ops;
+ return 0;
+}
+
+static int marufs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (!marufs_require_cloexec(file))
+ return -EACCES;
+
+ /* Reject VM_WRITE on O_RDONLY fd */
+ if ((vma->vm_flags & VM_WRITE) && !(file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ struct marufs_file_ctx fc;
+ marufs_file_ctx_init(&fc, file);
+
+ struct inode *inode = fc.inode;
+ struct marufs_sb_info *sbi = fc.sbi;
+ struct marufs_inode_info *xi = fc.xi;
+
+ int ret = marufs_authorize_perms(sbi, xi, vma->vm_flags & VM_WRITE);
+ if (ret)
+ return ret;
+
+ if (xi->data_phys_offset == 0 || inode->i_size == 0)
+ return -ENODATA;
+
+ /*
+ * DEV_DAX: delegate to device_dax driver — NVIDIA cudaHostRegister
+ * needs device_dax vm_ops on ZONE_DEVICE pages.
+ * Fallback: remap_pfn_range (no GPU DMA but mmap works).
+ */
+
+ /* Bounds check */
+ u64 user_offset = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long map_size = vma->vm_end - vma->vm_start;
+ if (user_offset + map_size > inode->i_size)
+ return -EINVAL;
+
+ if (unlikely(!marufs_validate_region_addr(sbi, xi->data_phys_offset,
+ user_offset + map_size)))
+ return -EIO;
+
+ if (sbi->dax_filp && sbi->dax_filp->f_op && sbi->dax_filp->f_op->mmap) {
+ /* Delegate to device_dax mmap (GPU DMA capable). */
+ unsigned long orig_pgoff = vma->vm_pgoff;
+
+ vma->vm_pgoff += fc.xi->data_phys_offset >> PAGE_SHIFT;
+
+ vma_set_file(vma, sbi->dax_filp);
+ ret = sbi->dax_filp->f_op->mmap(sbi->dax_filp, vma);
+ pr_debug("mmap DEV_DAX delegated inode=%lu pgoff=%lu ret=%d\n",
+ inode->i_ino, vma->vm_pgoff, ret);
+ if (ret) {
+ vma_set_file(vma, file);
+ vma->vm_pgoff = orig_pgoff;
+ return ret;
+ }
+
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+ vm_flags_set(vma, MARUFS_VMA_HARDEN_FLAGS);
+ /* Keep vm_file = dax_filp; device_dax fault hooks need it.
+ * Recover marufs state via container_of + vm_private_data. */
+ } else {
+ /* Fallback: remap_pfn_range (WC pgprot, no GPU DMA). */
+ phys_addr_t phys_addr =
+ sbi->phys_base + xi->data_phys_offset + user_offset;
+
+ vm_flags_set(vma, VM_PFNMAP | MARUFS_VMA_HARDEN_FLAGS);
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+ ret = remap_pfn_range(vma, vma->vm_start,
+ phys_addr >> PAGE_SHIFT, map_size,
+ vma->vm_page_prot);
+ if (ret) {
+ pr_err("remap_pfn_range failed: %d\n", ret);
+ return ret;
+ }
+
+ pr_debug("mmap DEV_DAX fallback PFN inode=%lu phys=0x%llx\n",
+ inode->i_ino, (unsigned long long)phys_addr);
+ }
+
+ return marufs_attach_vm_ops(vma, sbi, xi);
+}
+
+static loff_t marufs_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_file_llseek(file, offset, whence);
+}
+
+/*
+ * Page cache path not supported — DAX FS uses mmap for data access
+ * and read_iter for read(). sendfile/splice will get -EIO.
+ */
+static int marufs_read_folio(struct file *file, struct folio *folio)
+{
+ folio_unlock(folio);
+ return -EIO;
+}
+
+const struct address_space_operations marufs_aops = {
+ .read_folio = marufs_read_folio,
+ .dirty_folio = filemap_dirty_folio,
+};
+
+/* Resolve a userspace region fd to its RAT entry ID */
+static int nrht_resolve_target_fd(int target_fd, u32 *out_region_id)
+{
+ struct fd f = fdget((unsigned int)target_fd);
+ if (!fd_file(f))
+ return -EBADF;
+
+ if (fd_file(f)->f_op != &marufs_file_ops) {
+ fdput(f);
+ return -EINVAL;
+ }
+
+ struct marufs_file_ctx fc;
+ marufs_file_ctx_init(&fc, fd_file(f));
+
+ *out_region_id = fc.xi->rat_entry_id;
+
+ fdput(f);
+ return 0;
+}
+
+/* Per-entry NRHT store: resolve target fd → insert name-ref */
+static int nrht_store_one(struct marufs_sb_info *sbi, u32 owner_rid,
+ struct marufs_name_offset_req *req)
+{
+ req->name[sizeof(req->name) - 1] = '\0';
+ size_t nlen = strnlen(req->name, sizeof(req->name));
+
+ u32 target_rid;
+ int ret = nrht_resolve_target_fd(req->target_region_fd, &target_rid);
+ if (ret)
+ return ret;
+
+ return marufs_nrht_insert(sbi, owner_rid, req->name, nlen,
+ req->name_hash, req->offset, target_rid);
+}
+
+/* ── ioctl handler: BATCH_NAME_OFFSET (batched NRHT insert) ─────────── */
+static long marufs_ioctl_batch_name_offset(
+ struct marufs_sb_info *sbi, struct marufs_inode_info *xi,
+ struct marufs_batch_name_offset_req *breq, void *batch_buf)
+{
+ if (breq->count == 0 || breq->count > MARUFS_BATCH_STORE_MAX)
+ return -EINVAL;
+
+ struct marufs_name_offset_req *bent = batch_buf;
+ size_t buf_size = (size_t)breq->count * sizeof(*bent);
+
+ if (copy_from_user(bent, (void __user *)u64_to_user_ptr(breq->entries),
+ buf_size))
+ return -EFAULT;
+
+ breq->stored = 0;
+ for (u32 i = 0; i < breq->count; i++) {
+ bent[i].status =
+ nrht_store_one(sbi, xi->rat_entry_id, &bent[i]);
+ if (bent[i].status == 0)
+ breq->stored++;
+ }
+
+ if (copy_to_user((void __user *)u64_to_user_ptr(breq->entries), bent,
+ buf_size))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Per-entry NRHT find: lookup name → fill offset + region_name */
+static int nrht_find_one(struct marufs_sb_info *sbi, u32 owner_rid,
+ struct marufs_find_name_req *req)
+{
+ req->name[sizeof(req->name) - 1] = '\0';
+ size_t nlen = strnlen(req->name, sizeof(req->name));
+
+ u64 offset;
+ u32 target_region_id;
+ u32 ref_count = 0, pin_count = 0;
+ int ret = marufs_nrht_lookup(sbi, owner_rid, req->name, nlen,
+ req->name_hash, &offset, &target_region_id,
+ &ref_count, &pin_count);
+ if (ret)
+ return ret;
+
+ req->offset = offset;
+ req->ref_count = ref_count;
+ req->pin_count = pin_count;
+
+ memset(req->region_name, 0, sizeof(req->region_name));
+ struct marufs_rat_entry *rat_e =
+ marufs_rat_entry_get(sbi, target_region_id);
+ if (rat_e) {
+ MARUFS_CXL_RMB(rat_e->name, sizeof(rat_e->name));
+ memcpy(req->region_name, rat_e->name,
+ min(sizeof(req->region_name) - 1, sizeof(rat_e->name)));
+ }
+ return 0;
+}
+
+/* ── ioctl handler: BATCH_FIND_NAME (batched NRHT lookup) ────────────── */
+static long marufs_ioctl_batch_find_name(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ struct marufs_batch_find_req *breq,
+ void *batch_buf)
+{
+ if (breq->count == 0 || breq->count > MARUFS_BATCH_FIND_MAX)
+ return -EINVAL;
+
+ struct marufs_find_name_req *bent = batch_buf;
+ size_t buf_size = (size_t)breq->count * sizeof(*bent);
+
+ if (copy_from_user(bent, (void __user *)u64_to_user_ptr(breq->entries),
+ buf_size))
+ return -EFAULT;
+
+ breq->found = 0;
+ for (u32 i = 0; i < breq->count; i++) {
+ bent[i].status = nrht_find_one(sbi, xi->rat_entry_id, &bent[i]);
+ if (bent[i].status == 0)
+ breq->found++;
+ }
+
+ if (copy_to_user((void __user *)u64_to_user_ptr(breq->entries), bent,
+ buf_size))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* ── ioctl handler: CLEAR_NAME (remove name-ref from NRHT) ──────────── */
+static long marufs_ioctl_clear_name(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ struct marufs_name_offset_req *req)
+{
+ req->name[sizeof(req->name) - 1] = '\0';
+ size_t name_len = strnlen(req->name, sizeof(req->name));
+
+ return marufs_nrht_delete(sbi, xi->rat_entry_id, req->name, name_len,
+ req->name_hash);
+}
+
+/* ── ioctl handler: PERM_GRANT ──────────────────────────────────────── */
+static long marufs_ioctl_perm_grant(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ struct marufs_perm_req *preq)
+{
+ /* Permission check inside ME: a concurrent chown can strip
+ * ADMIN/GRANT between check and delegation write, so a lock-free
+ * precheck would let stale rights leak across ownership transfer.
+ *
+ * GRANT can grant non-privileged perms; only ADMIN can grant
+ * ADMIN or GRANT itself.
+ */
+ u32 have;
+ int ret = marufs_check_permission_any(
+ sbi, xi->rat_entry_id, MARUFS_PERM_ADMIN | MARUFS_PERM_GRANT,
+ &have);
+ if (ret)
+ return ret;
+ if (!have) {
+ return -EACCES;
+ }
+ if (!(have & MARUFS_PERM_ADMIN) &&
+ (preq->perms & (MARUFS_PERM_ADMIN | MARUFS_PERM_GRANT))) {
+ return -EPERM;
+ }
+
+ ret = sbi->me->ops->acquire(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ if (ret)
+ return ret;
+
+ ret = marufs_deleg_grant(sbi, xi->rat_entry_id, preq);
+ if (ret == 0)
+ pr_debug(
+ "granted perms=0x%x to node=%u pid=%u on rat_entry %u\n",
+ preq->perms, preq->node_id, preq->pid,
+ xi->rat_entry_id);
+
+ sbi->me->ops->release(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ return ret;
+}
+
+/* ── ioctl handler: CHOWN (ownership transfer to caller) ───────────── */
+/*
+ * Caller MUST hold the Global ME. The ADMIN check is done here (not in the
+ * ioctl wrapper) so that the check and the ownership-transfer writes happen
+ * in the same critical section: a concurrent chown that completed first will
+ * have stripped ADMIN (default_perms=0, deleg cleared), so an in-lock recheck
+ * is the only thing that prevents multiple chowns from winning.
+ */
+static long __marufs_ioctl_chown_locked(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ struct marufs_rat_entry *rat_entry)
+{
+ int ret = marufs_check_permission(sbi, xi->rat_entry_id,
+ MARUFS_PERM_ADMIN);
+ if (ret)
+ return ret;
+
+ /* CAS ALLOCATED→ALLOCATING: block GC during ownership transfer */
+ u32 old_state = marufs_le32_cas(&rat_entry->state,
+ MARUFS_RAT_ENTRY_ALLOCATED,
+ MARUFS_RAT_ENTRY_ALLOCATING);
+ if (old_state != MARUFS_RAT_ENTRY_ALLOCATED)
+ return -EAGAIN;
+
+ WRITE_LE64(rat_entry->alloc_time, ktime_get_real_ns());
+ MARUFS_CXL_WMB(rat_entry, 64); // CL0
+
+ WRITE_LE16(rat_entry->default_perms, 0);
+ WRITE_LE16(rat_entry->owner_node_id, sbi->node_id);
+ WRITE_LE32(rat_entry->owner_pid, current->pid);
+ WRITE_LE64(rat_entry->owner_birth_time,
+ ktime_to_ns(current->start_boottime));
+ WRITE_LE16(rat_entry->deleg_num_entries, 0);
+
+ /* Bind owner identity to exe binary (post-exec retention defense) */
+ u64 owner_ino = 0;
+ u32 owner_dev = 0;
+ marufs_get_exe_id(&owner_ino, &owner_dev);
+ WRITE_LE64(rat_entry->owner_exe_inode_ino, owner_ino);
+ WRITE_LE32(rat_entry->owner_exe_inode_dev, owner_dev);
+
+ MARUFS_CXL_WMB(&rat_entry->default_perms, 64); // CL2
+
+ for (u32 i = 0; i < MARUFS_DELEG_MAX_ENTRIES; i++) {
+ struct marufs_deleg_entry *de =
+ marufs_rat_deleg_entry(rat_entry, i);
+ if (!de)
+ continue;
+ WRITE_LE32(de->state, MARUFS_DELEG_EMPTY);
+ MARUFS_CXL_WMB(de, sizeof(*de));
+ }
+
+ /* Publish: ALLOCATING → ALLOCATED (ownership transfer complete) */
+ WRITE_LE32(rat_entry->state, MARUFS_RAT_ENTRY_ALLOCATED);
+ MARUFS_CXL_WMB(rat_entry, sizeof(*rat_entry));
+
+ return 0;
+}
+
+static long marufs_ioctl_chown(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ struct marufs_chown_req *req)
+{
+ struct marufs_rat_entry *rat_entry =
+ marufs_rat_entry_get(sbi, xi->rat_entry_id);
+ if (!rat_entry)
+ return -EIO;
+
+ int ret = sbi->me->ops->acquire(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ if (ret)
+ return ret;
+
+ ret = __marufs_ioctl_chown_locked(sbi, xi, rat_entry);
+
+ sbi->me->ops->release(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ if (ret == 0)
+ pr_info_ratelimited("chown rat_entry %u -> node=%u pid=%d\n",
+ xi->rat_entry_id, sbi->node_id,
+ current->pid);
+ return ret;
+}
+
+/* ── ioctl handler: NRHT_REF/PIN_INC/DEC (per-entry counter ops) ───── */
+static long marufs_ioctl_nrht_refcnt(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ struct marufs_refcnt_req *req,
+ nrht_refcnt_op_t op)
+{
+ req->name[sizeof(req->name) - 1] = '\0';
+ size_t name_len = strnlen(req->name, sizeof(req->name));
+
+ u32 new_count = 0;
+ int ret = op(sbi, xi->rat_entry_id, req->name, name_len, req->name_hash,
+ &new_count);
+ if (ret == 0)
+ req->count = new_count;
+ return ret;
+}
+
+/* ── ioctl handler: NRHT_INIT (format NRHT hash table) ──────────────── */
+static long marufs_ioctl_nrht_init(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ struct marufs_nrht_init_req *nreq)
+{
+ enum marufs_me_strategy strat =
+ (nreq->me_strategy == MARUFS_ME_REQUEST) ? MARUFS_ME_REQUEST :
+ MARUFS_ME_ORDER;
+ return marufs_nrht_init(sbi, xi->rat_entry_id, nreq->max_entries,
+ nreq->num_shards, nreq->num_buckets, strat);
+}
+
+/* ── ioctl handler: PERM_SET_DEFAULT ────────────────────────────────── */
+static long marufs_ioctl_perm_set_default(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ struct marufs_perm_req *preq)
+{
+ if (preq->perms & ~MARUFS_PERM_ALL)
+ return -EINVAL;
+
+ struct marufs_rat_entry *rat_entry =
+ marufs_rat_entry_get(sbi, xi->rat_entry_id);
+ if (!rat_entry)
+ return -EIO;
+
+ int ret = sbi->me->ops->acquire(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ if (ret)
+ return ret;
+
+ /* Re-check ADMIN inside ME: a concurrent perm_set_default or chown
+ * may have stripped our ADMIN since the lock-free precheck. */
+ ret = marufs_check_permission(sbi, xi->rat_entry_id, MARUFS_PERM_ADMIN);
+ if (ret)
+ goto out;
+
+ WRITE_LE16(rat_entry->default_perms, preq->perms);
+ MARUFS_CXL_WMB(rat_entry, sizeof(*rat_entry));
+
+ pr_debug("set default_perms=0x%x on rat_entry %u\n", preq->perms,
+ xi->rat_entry_id);
+
+out:
+ sbi->me->ops->release(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ return ret;
+}
+
+/*
+ * Required perm for ioctl precheck.
+ * Returns 0 when handler does its own check inside an ME critical section
+ * (PERM_GRANT/CHOWN need check+write atomic vs concurrent chown).
+ */
+static u16 marufs_ioctl_required_perm(unsigned int cmd)
+{
+ switch (cmd) {
+ case MARUFS_IOC_PERM_GRANT:
+ case MARUFS_IOC_CHOWN:
+ case MARUFS_IOC_PERM_SET_DEFAULT:
+ return 0; /* handler self-checks */
+ case MARUFS_IOC_NRHT_INIT:
+ return MARUFS_PERM_ADMIN;
+ default:
+ return MARUFS_PERM_IOCTL;
+ }
+}
+
+static long marufs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ if (!marufs_require_cloexec(file))
+ return -EACCES;
+
+ struct marufs_file_ctx fc;
+ marufs_file_ctx_init(&fc, file);
+
+ union {
+ struct marufs_name_offset_req name;
+ struct marufs_find_name_req find;
+ struct marufs_batch_name_offset_req batch_store;
+ struct marufs_batch_find_req batch_find;
+ struct marufs_perm_req perm;
+ struct marufs_chown_req chown;
+ struct marufs_nrht_init_req nrht_init;
+ struct marufs_refcnt_req refcnt;
+ } payload;
+
+ size_t req_size = _IOC_SIZE(cmd);
+ if (req_size > sizeof(payload))
+ return -ENOTTY;
+ if (copy_from_user(&payload, (void __user *)arg, req_size))
+ return -EFAULT;
+
+ u16 req_perm = marufs_ioctl_required_perm(cmd);
+ long ret = 0;
+ if (req_perm) {
+ ret = marufs_check_permission(fc.sbi, fc.xi->rat_entry_id,
+ req_perm);
+ if (ret)
+ return ret;
+ }
+
+ struct marufs_file_priv *priv = file->private_data;
+
+ switch (cmd) {
+ case MARUFS_IOC_NAME_OFFSET:
+ ret = nrht_store_one(fc.sbi, fc.xi->rat_entry_id,
+ &payload.name);
+ break;
+
+ case MARUFS_IOC_BATCH_NAME_OFFSET:
+ ret = marufs_ioctl_batch_name_offset(
+ fc.sbi, fc.xi, &payload.batch_store, priv->batch_buf);
+ break;
+
+ case MARUFS_IOC_FIND_NAME:
+ ret = nrht_find_one(fc.sbi, fc.xi->rat_entry_id, &payload.find);
+ break;
+
+ case MARUFS_IOC_BATCH_FIND_NAME:
+ ret = marufs_ioctl_batch_find_name(
+ fc.sbi, fc.xi, &payload.batch_find, priv->batch_buf);
+ break;
+
+ case MARUFS_IOC_CLEAR_NAME:
+ ret = marufs_ioctl_clear_name(fc.sbi, fc.xi, &payload.name);
+ break;
+
+ case MARUFS_IOC_PERM_GRANT:
+ ret = marufs_ioctl_perm_grant(fc.sbi, fc.xi, &payload.perm);
+ break;
+
+ case MARUFS_IOC_PERM_SET_DEFAULT:
+ ret = marufs_ioctl_perm_set_default(fc.sbi, fc.xi,
+ &payload.perm);
+ break;
+
+ case MARUFS_IOC_CHOWN:
+ ret = marufs_ioctl_chown(fc.sbi, fc.xi, &payload.chown);
+ break;
+
+ case MARUFS_IOC_NRHT_INIT:
+ ret = marufs_ioctl_nrht_init(fc.sbi, fc.xi, &payload.nrht_init);
+ break;
+
+ case MARUFS_IOC_NRHT_JOIN:
+ ret = marufs_nrht_join(fc.sbi, fc.xi->rat_entry_id);
+ break;
+
+ case MARUFS_IOC_NRHT_REF_INC:
+ ret = marufs_ioctl_nrht_refcnt(fc.sbi, fc.xi, &payload.refcnt,
+ marufs_nrht_ref_inc);
+ break;
+
+ case MARUFS_IOC_NRHT_REF_DEC:
+ ret = marufs_ioctl_nrht_refcnt(fc.sbi, fc.xi, &payload.refcnt,
+ marufs_nrht_ref_dec);
+ break;
+
+ case MARUFS_IOC_NRHT_PIN_INC:
+ ret = marufs_ioctl_nrht_refcnt(fc.sbi, fc.xi, &payload.refcnt,
+ marufs_nrht_pin_inc);
+ break;
+
+ case MARUFS_IOC_NRHT_PIN_DEC:
+ ret = marufs_ioctl_nrht_refcnt(fc.sbi, fc.xi, &payload.refcnt,
+ marufs_nrht_pin_dec);
+ break;
+
+ default:
+ return -ENOTTY;
+ }
+
+ /* Centralized copy_to_user for _IOWR ioctls */
+ if (ret == 0 && (_IOC_DIR(cmd) & _IOC_READ))
+ if (copy_to_user((void __user *)arg, &payload, req_size))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+/*
+ * marufs_get_unmapped_area - get aligned virtual address for mmap
+ *
+ * DEV_DAX mode: delegate to device_dax's get_unmapped_area so the
+ * kernel allocates a 2MB-aligned address. Without this, mmap() picks
+ * a 4KB-aligned address and device_dax's mmap handler rejects it.
+ */
+static unsigned long marufs_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ struct marufs_file_ctx fc;
+ marufs_file_ctx_init(&fc, file);
+ struct marufs_sb_info *sbi = fc.sbi;
+
+ if (sbi->dax_filp && sbi->dax_filp->f_op &&
+ sbi->dax_filp->f_op->get_unmapped_area) {
+ return sbi->dax_filp->f_op->get_unmapped_area(
+ sbi->dax_filp, addr, len, pgoff, flags);
+ }
+
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+}
+
+const struct file_operations marufs_file_ops = {
+ .owner = THIS_MODULE,
+ .llseek = marufs_llseek,
+ .read_iter = marufs_read_iter,
+ .write_iter = marufs_write_iter,
+ .mmap = marufs_mmap,
+ .get_unmapped_area = marufs_get_unmapped_area,
+ .open = marufs_open,
+ .release = marufs_release,
+ .fsync = noop_fsync,
+ .unlocked_ioctl = marufs_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
diff --git a/marufs_kernel/src/file.h b/marufs_kernel/src/file.h
new file mode 100644
index 0000000..ae444fd
--- /dev/null
+++ b/marufs_kernel/src/file.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * file.h - VFS file_operations / address_space_operations exports.
+ */
+
+#ifndef _MARUFS_FILE_H
+#define _MARUFS_FILE_H
+
+#include
+
+extern const struct file_operations marufs_file_ops;
+extern const struct address_space_operations marufs_aops;
+
+#endif /* _MARUFS_FILE_H */
diff --git a/marufs_kernel/src/gc.c b/marufs_kernel/src/gc.c
new file mode 100644
index 0000000..bd76022
--- /dev/null
+++ b/marufs_kernel/src/gc.c
@@ -0,0 +1,827 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* gc.c - MARUFS Tombstone Garbage Collection */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+
+#define MARUFS_GC_INTERVAL_MS 10000 /* Run GC every 10 seconds */
+#define MARUFS_GC_SHARD_DIVISOR \
+ 4 /* Round-robin: sweep ~25% of shards per cycle (min 1) */
+
+/*
+ * Admin node orphan policy
+ *
+ * Entries with node_id==0 are crash orphans — the writer died before
+ * stamping its node_id. No node's "node_id == sbi->node_id" filter
+ * matches them, so the normal GC path cannot reach them.
+ *
+ * The "admin node" is the node with the lowest currently-ACTIVE node_id
+ * in the bootstrap slot table. Only the admin node tracks node_id==0
+ * orphans in the local DRAM tracker and later CAS-claims them
+ * (node_id 0→admin_id, timestamp=now). Once claimed, the next GC cycle's
+ * normal path reclaims them like any other dead-process resource.
+ *
+ * Admin role is cached in sbi->cached_admin_node_id; refreshed at mount
+ * and on ME membership changes. Use marufs_is_admin_node(sbi) for checks.
+ *
+ * This eliminates CAS contention that would occur if every node
+ * raced to claim the same orphan entries.
+ *
+ * Affected paths:
+ * - marufs_is_stale_inserting() (index.c) — node_id==0 → admin only
+ * - nrht_is_stale() (nrht.c) — node_id==0 → admin only
+ * - marufs_gc_reclaim_dead_regions() — ALLOCATING + node_id==0
+ * - marufs_gc_sweep_dead_delegations() — GRANTING + granted_at==0
+ */
+
+/*
+ * marufs_current_admin_node_id - return the lowest ACTIVE node_id.
+ *
+ * Scans the bootstrap slot table for ACTIVE slots and returns the lowest
+ * node_id (= slot index + 1). Returns 0 if no ACTIVE slot found (all-dead
+ * or bootstrap area not yet initialised).
+ *
+ * Called at GC sweep entry — O(8) scan, negligible overhead.
+ */
+u32 marufs_current_admin_node_id(struct marufs_sb_info *sbi)
+{
+ struct marufs_bootstrap_slot *slots;
+ u32 lowest = U32_MAX;
+ int i;
+
+ if (!sbi->dax_base)
+ return 0;
+
+ slots = marufs_bootstrap_slot_get(sbi, 0);
+ if (!slots)
+ return 0;
+
+ for (i = 0; i < MARUFS_BOOTSTRAP_MAX_SLOTS; i++) {
+ MARUFS_CXL_RMB(&slots[i], sizeof(slots[i]));
+ if (READ_CXL_LE32(slots[i].status) != MARUFS_BS_CLAIMED)
+ continue;
+ u32 candidate = (u32)(i + 1);
+ if (candidate < lowest)
+ lowest = candidate;
+ }
+
+ return (lowest == U32_MAX) ? 0 : lowest;
+}
+
+/*
+ * marufs_is_admin_node - return true if this node is the current admin.
+ *
+ * Uses the cached value sbi->cached_admin_node_id for the hot path.
+ * The cache is refreshed at mount and on ME membership change events.
+ * GC sweep refreshes it directly via marufs_current_admin_node_id().
+ */
+bool marufs_is_admin_node(struct marufs_sb_info *sbi)
+{
+ return READ_LE32(sbi->cached_admin_node_id) == sbi->node_id;
+}
+
+/*
+ * marufs_entry_reclaim_slot - reclaim stale INSERTING entry to TOMBSTONE
+ * @entry: index entry pointer
+ *
+ * Reclaims to TOMBSTONE (not EMPTY) because the entry may already be
+ * linked to a bucket chain (crash between link_to_bucket and publish).
+ * A chain-in EMPTY would let flat scan claim it for a different bucket,
+ * corrupting the original chain. TOMBSTONE is safe — reused in-place
+ * by the next insert on the same bucket via check_duplicate.
+ *
+ * Returns true if slot was reclaimed.
+ */
+static bool marufs_entry_reclaim_slot(struct marufs_index_entry *entry)
+{
+ WRITE_LE64(entry->name_hash, 0);
+ MARUFS_CXL_WMB(entry, sizeof(*entry));
+ return marufs_le32_cas(&entry->state, MARUFS_ENTRY_INSERTING,
+ MARUFS_ENTRY_TOMBSTONE) ==
+ MARUFS_ENTRY_INSERTING;
+}
+
+/*
+ * marufs_gc_track_orphan - track orphaned entry in DRAM
+ * @sbi: superblock info
+ * @entry: CXL entry pointer (index, deleg, or RAT)
+ * @type: entry type for dispatch during sweep
+ *
+ * Tracks entry locally with discovery timestamp. Does NOT write to CXL
+ * to avoid cacheline clobbering with live writers on other nodes/CPUs.
+ *
+ * Thread safety: gc_orphans[] and gc_orphan_count are only accessed by the
+ * single GC kthread — no locking required.
+ */
+void marufs_gc_track_orphan(struct marufs_sb_info *sbi, void *entry,
+ enum marufs_orphan_type type)
+{
+ /* Already tracked? Skip duplicate registration */
+ u32 i;
+ for (i = 0; i < sbi->gc_orphan_count; i++) {
+ if (sbi->gc_orphans[i].entry == entry)
+ return;
+ }
+
+ /* Full — retry next cycle after sweep frees slots */
+ if (sbi->gc_orphan_count >= MARUFS_GC_ORPHAN_MAX)
+ return;
+
+ i = sbi->gc_orphan_count++;
+ sbi->gc_orphans[i].entry = entry;
+ sbi->gc_orphans[i].discovered_at = ktime_get_real_ns();
+ sbi->gc_orphans[i].type = type;
+}
+
+/*
+ * marufs_gc_is_orphan_stuck - check if tracked entry is still in expected state
+ * Returns true if entry is still stuck (eligible for reclaim on timeout).
+ * Returns false if state changed (should be removed from tracker).
+ */
+static bool marufs_gc_is_orphan_stuck(void *entry, enum marufs_orphan_type type)
+{
+ MARUFS_CXL_RMB(entry, 64); /* Invalidate CL0 (state & node_id field) */
+
+ switch (type) {
+ case MARUFS_ORPHAN_INDEX: {
+ struct marufs_index_entry *e = entry;
+ return READ_CXL_LE32(e->state) == MARUFS_ENTRY_INSERTING &&
+ READ_CXL_LE32(e->node_id) == 0;
+ }
+ case MARUFS_ORPHAN_DELEG: {
+ struct marufs_deleg_entry *de = entry;
+ return READ_CXL_LE32(de->state) == MARUFS_DELEG_GRANTING &&
+ (READ_CXL_LE32(de->node_id) == 0 ||
+ READ_CXL_LE64(de->granted_at) == 0);
+ }
+ case MARUFS_ORPHAN_DELEG_UNBOUND: {
+ struct marufs_deleg_entry *de = entry;
+ return READ_CXL_LE32(de->state) == MARUFS_DELEG_ACTIVE &&
+ READ_CXL_LE64(de->birth_time) == 0;
+ }
+ case MARUFS_ORPHAN_RAT: {
+ struct marufs_rat_entry *re = entry;
+ /* Invalidate CL2 (owner_node_id field) */
+ MARUFS_CXL_RMB(&re->owner_node_id, 64);
+ return READ_LE32(re->state) == MARUFS_RAT_ENTRY_ALLOCATING &&
+ READ_LE16(re->owner_node_id) == 0;
+ }
+ case MARUFS_ORPHAN_NRHT: {
+ struct marufs_nrht_entry *e = entry;
+ return READ_CXL_LE32(e->state) == MARUFS_ENTRY_INSERTING &&
+ READ_CXL_LE32(e->inserter_node) == 0;
+ }
+ default:
+ return false;
+ }
+}
+
+/*
+ * marufs_gc_claim_orphan - claim ownership of a timed-out orphan entry
+ * @sbi: superblock info (for node_id)
+ *
+ * CAS node_id from 0 to this node, then stamp a fresh timestamp.
+ * Actual reclaim happens in the next GC cycle via the normal path
+ * (is_stale_inserting / sweep_dead_delegations / is_orphaned),
+ * which gives a second timeout window before reclaim.
+ *
+ * Returns true if claimed (or already has node_id for DELEG).
+ */
+static bool marufs_gc_claim_orphan(struct marufs_sb_info *sbi, void *entry,
+ enum marufs_orphan_type type)
+{
+ u64 now = ktime_get_real_ns();
+
+ switch (type) {
+ case MARUFS_ORPHAN_INDEX: {
+ struct marufs_index_entry *e = entry;
+
+ if (marufs_le32_cas(&e->node_id, 0, sbi->node_id) != 0)
+ return false;
+ WRITE_LE64(e->created_at, now);
+ MARUFS_CXL_WMB(e, sizeof(*e));
+ return true;
+ }
+ case MARUFS_ORPHAN_DELEG: {
+ struct marufs_deleg_entry *de = entry;
+ u32 de_node = READ_CXL_LE32(de->node_id);
+
+ if (de_node == 0) {
+ if (marufs_le32_cas(&de->node_id, 0, sbi->node_id) != 0)
+ return false;
+ } else if (de_node != sbi->node_id) {
+ return false; /* Another node owns this entry */
+ }
+ WRITE_LE64(de->granted_at, now);
+ MARUFS_CXL_WMB(de, sizeof(*de));
+ return true;
+ }
+ case MARUFS_ORPHAN_DELEG_UNBOUND: {
+ struct marufs_deleg_entry *de = entry;
+ /*
+ * Stamp sentinel birth_time=1 so normal sweep's
+ * owner_is_dead(pid, 1) will flag it for reclaim.
+ * If the real process already bound, CAS fails — harmless.
+ */
+ if (marufs_le64_cas(&de->birth_time, 0, 1) != 0)
+ return false;
+ MARUFS_CXL_WMB(de, sizeof(*de));
+ return true;
+ }
+ case MARUFS_ORPHAN_RAT: {
+ struct marufs_rat_entry *re = entry;
+
+ if (marufs_le16_cas(&re->owner_node_id, 0, sbi->node_id) != 0)
+ return false;
+ WRITE_LE64(re->alloc_time, now);
+ MARUFS_CXL_WMB(re, sizeof(*re));
+ return true;
+ }
+ case MARUFS_ORPHAN_NRHT: {
+ struct marufs_nrht_entry *e = entry;
+
+ if (marufs_le32_cas(&e->inserter_node, 0, sbi->node_id) != 0)
+ return false;
+ WRITE_LE64(e->created_at, now);
+ MARUFS_CXL_WMB(e, 64);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+/*
+ * marufs_gc_sweep_orphans - reclaim timed-out orphaned entries
+ * @sbi: superblock info
+ *
+ * Walks the DRAM orphan tracker. For each entry:
+ * 1. If state changed (no longer stuck) → remove from tracker
+ * 2. If timeout expired → reclaim, then remove
+ * 3. Otherwise → keep tracking
+ *
+ * Returns number of reclaimed entries.
+ */
+static int marufs_gc_sweep_orphans(struct marufs_sb_info *sbi)
+{
+ int reclaimed = 0;
+ u32 i = 0;
+ u64 now = ktime_get_real_ns();
+
+ while (i < sbi->gc_orphan_count) {
+ struct marufs_orphan_tracker *t = &sbi->gc_orphans[i];
+
+ if (!marufs_gc_is_orphan_stuck(t->entry, t->type))
+ goto remove;
+
+ if (now > t->discovered_at &&
+ (now - t->discovered_at) > MARUFS_STALE_TIMEOUT_NS) {
+ if (marufs_gc_claim_orphan(sbi, t->entry, t->type))
+ reclaimed++;
+ goto remove;
+ }
+
+ i++;
+ continue;
+remove:
+ sbi->gc_orphans[i] = sbi->gc_orphans[--sbi->gc_orphan_count];
+ }
+
+ return reclaimed;
+}
+
+/*
+ * marufs_is_stale_inserting - check if INSERTING entry is stale and reclaimable
+ * @sbi: superblock info
+ * @e: index entry (caller verified state == INSERTING)
+ * @shard_id: shard containing the entry
+ * @slot_idx: slot index within shard
+ *
+ * Policy:
+ * - node_id == this node: check created_at timeout
+ * - node_id == 0 (orphan): track in DRAM, never write to CXL
+ * - node_id == other node: skip
+ * - created_at == 0 (same node): use DRAM tracker instead of CXL stamp
+ *
+ * Returns:
+ * 1 = stale + reclaimable → caller may reclaim
+ * 0 = not yet stale (tracked/deferred)
+ * -1 = not this node's entry
+ */
+/*
+ * Pure staleness check — no side effects (no DRAM tracking).
+ * Safe to call from any context (GC thread, syscall path).
+ * GC tombstone_sweep() handles DRAM tracking for return 0 cases.
+ */
+static int marufs_is_stale_inserting(struct marufs_sb_info *sbi,
+ struct marufs_index_entry *e)
+{
+ u32 inserter_node = READ_CXL_LE32(e->node_id);
+ u64 created_at, now;
+
+ if (inserter_node == 0)
+ return marufs_is_admin_node(sbi) ?
+ 0 :
+ -1; /* Only admin node tracks orphans */
+
+ if (inserter_node != sbi->node_id)
+ return -1;
+
+ /* Same node — check CXL timestamp */
+ created_at = READ_CXL_LE64(e->created_at);
+ if (created_at == 0)
+ return 0; /* Timestamp not yet visible — can't determine */
+
+ now = ktime_get_real_ns();
+ if (now > created_at && (now - created_at) > MARUFS_STALE_TIMEOUT_NS)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * marufs_gc_sweep_stale_entries - reclaim stale INSERTING entries in shard
+ * @sbi: superblock info
+ * @shard_id: shard to clean
+ *
+ * TOMBSTONE entries are NOT converted to EMPTY here. Without chain unlink,
+ * converting TOMBSTONE→EMPTY allows the flat EMPTY scan to claim the slot
+ * for a different bucket, corrupting the original bucket's chain. Instead,
+ * TOMBSTONE entries are reused in-place by the insert path (same-bucket
+ * chain reuse in __marufs_index_insert step 3a). Proper TOMBSTONE→EMPTY
+ * with chain unlink will be added with token-ring (PR 2).
+ *
+ * Stale INSERTING entries (crashed mid-insert) are reclaimed to TOMBSTONE
+ * after timeout via CAS(INSERTING → TOMBSTONE).
+ *
+ * Returns number of reclaimed entries, or negative on error.
+ */
+static int marufs_gc_sweep_stale_entries(struct marufs_sb_info *sbi,
+ u32 shard_id)
+{
+ struct marufs_shard_header *sh = marufs_shard_header_get(sbi, shard_id);
+ if (!sh)
+ return -EINVAL;
+
+ u32 num_entries = READ_LE32(sh->num_entries);
+ int inserting_reclaimed = 0;
+
+ for (u32 i = 0; i < num_entries; i++) {
+ struct marufs_index_entry *entry =
+ marufs_shard_entry(sbi, shard_id, i);
+ if (!entry)
+ continue;
+
+ u32 state = READ_CXL_LE32(entry->state);
+ if (state == MARUFS_ENTRY_INSERTING) {
+ int ret = marufs_is_stale_inserting(sbi, entry);
+ if (ret == 1) {
+ if (marufs_entry_reclaim_slot(entry))
+ inserting_reclaimed++;
+ } else if (ret == 0) {
+ marufs_gc_track_orphan(sbi, entry,
+ MARUFS_ORPHAN_INDEX);
+ }
+ }
+ }
+
+ if (inserting_reclaimed > 0)
+ pr_debug(
+ "gc reclaimed %d stale INSERTING entries from shard %u\n",
+ inserting_reclaimed, shard_id);
+
+ return inserting_reclaimed;
+}
+
+/*
+ * marufs_gc_sweep_dead_delegations - clean dead delegation entries for this node
+ * @sbi: superblock info
+ * @rat_entry: RAT entry whose delegation table to scan
+ *
+ * Scans the delegation table in the region header. For entries where
+ * node_id matches this node, checks if the delegated process is still alive.
+ * Dead entries are marked EMPTY.
+ *
+ * Returns number of entries cleaned, or negative on error.
+ */
+static int marufs_gc_sweep_dead_delegations(struct marufs_sb_info *sbi,
+ struct marufs_rat_entry *rat_entry)
+{
+ int cleaned = 0;
+
+ if (READ_CXL_LE32(rat_entry->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ return 0;
+
+ for (u32 i = 0; i < MARUFS_DELEG_MAX_ENTRIES; i++) {
+ struct marufs_deleg_entry *de =
+ marufs_rat_deleg_entry(rat_entry, i);
+ if (!de)
+ continue;
+
+ u32 de_state = READ_CXL_LE32(de->state);
+
+ switch (de_state) {
+ case MARUFS_DELEG_EMPTY:
+ break;
+
+ case MARUFS_DELEG_GRANTING: {
+ u64 granted_at = READ_CXL_LE64(de->granted_at);
+
+ if (granted_at == 0) {
+ u32 de_node = READ_CXL_LE32(de->node_id);
+
+ if (de_node != 0 || marufs_is_admin_node(sbi))
+ marufs_gc_track_orphan(
+ sbi, de, MARUFS_ORPHAN_DELEG);
+ } else if (ktime_get_real_ns() - granted_at >
+ MARUFS_STALE_TIMEOUT_NS) {
+ if (marufs_le32_cas(&de->state,
+ MARUFS_DELEG_GRANTING,
+ MARUFS_DELEG_EMPTY) ==
+ MARUFS_DELEG_GRANTING)
+ cleaned++;
+ }
+ break;
+ }
+ case MARUFS_DELEG_ACTIVE: {
+ u32 de_node = READ_CXL_LE32(de->node_id);
+ u32 de_pid = READ_CXL_LE32(de->pid);
+ u64 de_birth = READ_CXL_LE64(de->birth_time);
+
+ if (de_node != sbi->node_id || de_pid == 0)
+ break;
+
+ /*
+ * birth_time == 0: lazy init not yet done by delegated
+ * process. Cannot use owner_is_dead() because it would
+ * compare real birth_time against 0 and false-positive.
+ * Use granted_at timeout instead (same pattern as GRANTING).
+ */
+ if (de_birth == 0) {
+ marufs_gc_track_orphan(
+ sbi, de, MARUFS_ORPHAN_DELEG_UNBOUND);
+ break;
+ }
+
+ if (!marufs_owner_is_dead(de_pid, de_birth))
+ break;
+
+ if (marufs_le32_cas(&de->state, MARUFS_DELEG_ACTIVE,
+ MARUFS_DELEG_EMPTY) ==
+ MARUFS_DELEG_ACTIVE) {
+ marufs_le16_cas_dec(
+ &rat_entry->deleg_num_entries);
+ cleaned++;
+ }
+ break;
+ }
+ }
+ }
+
+ return cleaned;
+}
+
+/*
+ * marufs_has_active_delegations - check if any active delegation entries exist
+ * @rat_entry: RAT entry to check
+ *
+ * Returns true if at least one ACTIVE delegation entry exists.
+ */
+static bool marufs_has_active_delegations(struct marufs_rat_entry *rat_entry)
+{
+ if (READ_CXL_LE64(rat_entry->phys_offset) == 0)
+ return false;
+
+ for (u32 i = 0; i < MARUFS_DELEG_MAX_ENTRIES; i++) {
+ struct marufs_deleg_entry *de =
+ marufs_rat_deleg_entry(rat_entry, i);
+ if (!de)
+ continue;
+ if (READ_LE32(de->state) == MARUFS_DELEG_ACTIVE)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * marufs_is_orphaned - check if RAT entry owner is dead with no active users
+ * @entry: RAT entry pointer (caller must validate and filter by node_id)
+ *
+ * Returns true when:
+ * 1. Owner process is dead (pid=0 uses alloc_time timeout)
+ * 2. No active delegations remain
+ *
+ * Pure predicate — no side effects. Caller handles DRAM tracking
+ * for alloc_time=0 case (pid=0, timestamp not yet written).
+ */
+static bool marufs_is_orphaned(struct marufs_rat_entry *entry)
+{
+ MARUFS_CXL_RMB(&entry->default_perms, 64);
+
+ u32 owner_pid = READ_LE32(entry->owner_pid);
+ u64 birth_time = READ_LE64(entry->owner_birth_time);
+
+ if (owner_pid == 0) {
+ u64 alloc_time = READ_LE64(entry->alloc_time);
+
+ if (alloc_time == 0)
+ return false; /* Timestamp not yet written */
+
+ u64 now = ktime_get_real_ns();
+ if (now <= alloc_time ||
+ (now - alloc_time) <= MARUFS_STALE_TIMEOUT_NS)
+ return false;
+ } else if (!marufs_owner_is_dead(owner_pid, birth_time)) {
+ return false;
+ }
+
+ if (marufs_has_active_delegations(entry))
+ return false;
+
+ return true;
+}
+
+/*
+ * marufs_can_force_unlink - check if an unowned RAT entry can be force-deleted
+ * @sbi: superblock info
+ * @rat_entry_id: RAT entry index
+ *
+ * For dir.c unlink fallback: allows deleting a file whose owner is dead
+ * and has no active delegations, without requiring DELETE permission.
+ */
+bool marufs_can_force_unlink(struct marufs_sb_info *sbi, u32 rat_entry_id)
+{
+ struct marufs_rat_entry *entry =
+ marufs_rat_entry_get(sbi, rat_entry_id);
+ if (!entry)
+ return false;
+ if (READ_LE16(entry->owner_node_id) != sbi->node_id)
+ return false;
+ return marufs_is_orphaned(entry);
+}
+
+/*
+ * marufs_gc_cleanup_rat_entry - full index + RAT cleanup for a reclaimable entry
+ * @sbi: superblock info
+ * @entry: RAT entry
+ *
+ * Removes index entry by name, then frees the RAT entry.
+ */
+static void marufs_gc_cleanup_rat_entry(struct marufs_sb_info *sbi,
+ struct marufs_rat_entry *entry)
+{
+ char name_buf[MARUFS_NAME_MAX + 1];
+ size_t name_len;
+ int ret;
+
+ MARUFS_CXL_RMB(entry->name, sizeof(entry->name));
+ memcpy(name_buf, entry->name, MARUFS_NAME_MAX);
+ name_buf[MARUFS_NAME_MAX] = '\0';
+ name_len = strnlen(name_buf, MARUFS_NAME_MAX);
+
+ if (name_len > 0) {
+ ret = marufs_index_delete(sbi, name_buf, name_len);
+ if (ret && ret != -ENOENT)
+ pr_warn("gc index_delete failed for '%s': %d\n",
+ name_buf, ret);
+ }
+
+ marufs_rat_free_entry(entry);
+}
+
+/*
+ * marufs_gc_reclaim_dead_regions - reclaim regions from dead processes
+ * @sbi: superblock info
+ *
+ * Three-phase GC for each ALLOCATED RAT entry:
+ * Phase A: Sweep dead delegation entries for this node (all regions)
+ * Phase B: Filter to regions owned by this node with dead owner
+ * Phase C: Skip reclaim if active delegations remain
+ *
+ * A region is only reclaimed when the owner is dead AND all delegation
+ * entries have been cleared (by each node's GC sweeping its own entries).
+ *
+ * Returns number of reclaimed regions.
+ */
+int marufs_gc_reclaim_dead_regions(struct marufs_sb_info *sbi)
+{
+ u32 i;
+ int reclaimed = 0;
+
+ if (!sbi)
+ return -EINVAL;
+
+ for (i = 0; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ struct marufs_rat_entry *entry = marufs_rat_entry_get(sbi, i);
+ if (!entry) {
+ pr_info_ratelimited(
+ "gc: no RAT loaded, skipping dead-process scan\n");
+ return 0;
+ }
+
+ u32 state = READ_LE32(entry->state);
+ if (state == MARUFS_RAT_ENTRY_FREE) {
+ clear_bit(i, sbi->gc_nrht_bitmap);
+ continue;
+ }
+
+ /* Maintain NRHT bitmap for Phase 4 */
+ if (state == MARUFS_RAT_ENTRY_ALLOCATED &&
+ READ_LE32(entry->region_type) == MARUFS_REGION_NRHT)
+ set_bit(i, sbi->gc_nrht_bitmap);
+ else
+ clear_bit(i, sbi->gc_nrht_bitmap);
+
+ /* Sweep dead delegation entries for this node (runs on all regions) */
+ marufs_gc_sweep_dead_delegations(sbi, entry);
+
+ /* Orphan with no owner node (crash before node_id written) */
+ u16 owner_node = READ_LE16(entry->owner_node_id);
+ if (owner_node == 0 && state == MARUFS_RAT_ENTRY_ALLOCATING) {
+ if (marufs_is_admin_node(sbi))
+ marufs_gc_track_orphan(sbi, entry,
+ MARUFS_ORPHAN_RAT);
+ continue;
+ }
+
+ /* Remaining logic is owner-node only */
+ if (owner_node != sbi->node_id)
+ continue;
+
+ if (!marufs_is_orphaned(entry)) {
+ /* Track ALLOCATING with alloc_time not yet written */
+ if (state == MARUFS_RAT_ENTRY_ALLOCATING &&
+ READ_LE32(entry->owner_pid) == 0)
+ marufs_gc_track_orphan(sbi, entry,
+ MARUFS_ORPHAN_RAT);
+ continue;
+ }
+
+ /* ALLOCATED needs CAS to DELETING first (race with unlink) */
+ if (state == MARUFS_RAT_ENTRY_ALLOCATED) {
+ u32 old_state = marufs_le32_cas(
+ &entry->state, MARUFS_RAT_ENTRY_ALLOCATED,
+ MARUFS_RAT_ENTRY_DELETING);
+ if (old_state != MARUFS_RAT_ENTRY_ALLOCATED)
+ continue; /* unlink already preempted */
+ }
+
+ pr_info("gc reclaiming RAT entry %u (state=%u, pid=%u)\n", i,
+ state, READ_LE32(entry->owner_pid));
+ marufs_gc_cleanup_rat_entry(sbi, entry);
+ reclaimed++;
+ }
+
+ return reclaimed;
+}
+
+/*
+ * marufs_gc_thread_fn - background GC thread function
+ * @data: pointer to marufs_sb_info
+ *
+ * Runs periodically to:
+ * 1. Reclaim regions from dead processes
+ * 2. Sweep tombstone entries from shards exceeding threshold
+ * Thread runs every MARUFS_GC_INTERVAL_MS (10 seconds).
+ */
+static int marufs_gc_thread_fn(void *data)
+{
+ struct marufs_sb_info *sbi = data;
+ pr_info("gc thread started for node %u\n", sbi->node_id);
+
+ while (1) {
+ msleep_interruptible(MARUFS_GC_INTERVAL_MS);
+
+ if (kthread_should_stop())
+ break;
+
+ if (atomic_read(&sbi->gc_paused))
+ continue;
+
+ u32 shards_per_cycle =
+ max(sbi->num_shards / MARUFS_GC_SHARD_DIVISOR, 1U);
+
+ /* Refresh admin role cache once per sweep cycle. */
+ WRITE_LE32(sbi->cached_admin_node_id,
+ marufs_current_admin_node_id(sbi));
+
+ /* Phase 1: Reclaim regions from dead processes */
+ marufs_gc_reclaim_dead_regions(sbi);
+ if (kthread_should_stop())
+ break;
+
+ /* Phase 2: Round-robin sweep of stale INSERTING entries */
+ for (u32 s = 0; s < shards_per_cycle; s++) {
+ marufs_gc_sweep_stale_entries(sbi, sbi->gc_next_shard);
+ sbi->gc_next_shard =
+ (sbi->gc_next_shard + 1) % sbi->num_shards;
+ }
+ if (kthread_should_stop())
+ break;
+
+ /* Phase 3: Reclaim timed-out orphaned entries from DRAM tracker */
+ marufs_gc_sweep_orphans(sbi);
+ if (kthread_should_stop())
+ break;
+
+ /* Phase 4: Sweep stale INSERTING entries in NRHT regions */
+ marufs_nrht_gc_sweep_all(sbi);
+ if (kthread_should_stop())
+ break;
+
+ atomic_inc(&sbi->gc_epoch);
+ }
+
+ pr_info("gc thread exiting for node %u (active=%d)\n", sbi->node_id,
+ atomic_read(&sbi->gc_active));
+ return 0;
+}
+
+/*
+ * marufs_gc_start - start background GC thread
+ * @sbi: superblock info
+ *
+ * Called during mount. Creates a kernel thread that periodically
+ * reclaims regions from dead processes.
+ *
+ * Returns 0 on success, negative on error.
+ */
+int marufs_gc_start(struct marufs_sb_info *sbi)
+{
+ if (!sbi)
+ return -EINVAL;
+
+ if (sbi->gc_thread) {
+ pr_warn("gc thread already running\n");
+ return -EEXIST;
+ }
+
+ /* Hold module reference so rmmod waits for GC exit */
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ atomic_set(&sbi->gc_active, 1);
+ atomic_set(&sbi->gc_paused, 0);
+ atomic_set(&sbi->gc_epoch, 0);
+ sbi->gc_orphan_count = 0;
+
+ sbi->gc_thread = kthread_run(marufs_gc_thread_fn, sbi, "marufs-gc-%u",
+ sbi->node_id);
+ if (IS_ERR(sbi->gc_thread)) {
+ int ret = PTR_ERR(sbi->gc_thread);
+ sbi->gc_thread = NULL;
+ atomic_set(&sbi->gc_active, 0);
+ module_put(THIS_MODULE);
+ pr_err("failed to start gc thread: %d\n", ret);
+ return ret;
+ }
+
+ pr_info("gc thread started (interval=%dms)\n", MARUFS_GC_INTERVAL_MS);
+ return 0;
+}
+
+/*
+ * marufs_gc_stop - stop background GC thread
+ * @sbi: superblock info
+ *
+ * Called during unmount. Stops the background GC thread.
+ */
+void marufs_gc_stop(struct marufs_sb_info *sbi)
+{
+ if (!sbi || !sbi->gc_thread)
+ return;
+
+ pr_info("stopping gc thread for node %u\n", sbi->node_id);
+ atomic_set(&sbi->gc_active, 0);
+ kthread_stop(sbi->gc_thread);
+ sbi->gc_thread = NULL;
+ module_put(THIS_MODULE);
+}
+
+/*
+ * marufs_gc_restart - restart GC thread (safe even if thread crashed)
+ * @sbi: superblock info
+ *
+ * Stops existing thread (if any) and starts a fresh one.
+ * kthread_stop() is safe on crashed kthreads (task stays zombie
+ * because kthread_create holds a reference).
+ */
+int marufs_gc_restart(struct marufs_sb_info *sbi)
+{
+ if (!sbi)
+ return -EINVAL;
+
+ pr_info("gc restart requested for node %u\n", sbi->node_id);
+
+ /* Stop existing thread cleanly (safe even if already dead) */
+ marufs_gc_stop(sbi);
+
+ return marufs_gc_start(sbi);
+}
diff --git a/marufs_kernel/src/gc.h b/marufs_kernel/src/gc.h
new file mode 100644
index 0000000..e460df1
--- /dev/null
+++ b/marufs_kernel/src/gc.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * gc.h - Tombstone / dead-process GC types and entry points.
+ *
+ * Orphan tracker stored in DRAM on each sbi (gc_orphans[]). GC sweeps
+ * stale INSERTING/GRANTING/ALLOCATING entries that were abandoned by
+ * dead processes or aborted operations.
+ */
+
+#ifndef _MARUFS_GC_H
+#define _MARUFS_GC_H
+
+#include
+
+#define MARUFS_GC_ORPHAN_MAX 64
+
+enum marufs_orphan_type {
+ MARUFS_ORPHAN_INDEX, /* stale INSERTING index entry */
+ MARUFS_ORPHAN_DELEG, /* stale GRANTING delegation entry */
+ MARUFS_ORPHAN_DELEG_UNBOUND, /* ACTIVE deleg, birth_time not yet bound */
+ MARUFS_ORPHAN_RAT, /* stuck ALLOCATING RAT entry */
+ MARUFS_ORPHAN_NRHT, /* stale INSERTING NRHT entry */
+};
+
+struct marufs_orphan_tracker {
+ void *entry;
+ u64 discovered_at;
+ enum marufs_orphan_type type;
+};
+
+struct marufs_sb_info;
+
+void marufs_gc_track_orphan(struct marufs_sb_info *sbi, void *entry,
+ enum marufs_orphan_type type);
+int marufs_gc_reclaim_dead_regions(struct marufs_sb_info *sbi);
+bool marufs_can_force_unlink(struct marufs_sb_info *sbi, u32 rat_entry_id);
+int marufs_gc_start(struct marufs_sb_info *sbi);
+void marufs_gc_stop(struct marufs_sb_info *sbi);
+int marufs_gc_restart(struct marufs_sb_info *sbi);
+
+/* Admin role helpers (decoupled from hardcoded node_id==1) */
+u32 marufs_current_admin_node_id(struct marufs_sb_info *sbi);
+bool marufs_is_admin_node(struct marufs_sb_info *sbi);
+
+#endif /* _MARUFS_GC_H */
diff --git a/marufs_kernel/src/index.c b/marufs_kernel/src/index.c
new file mode 100644
index 0000000..5194273
--- /dev/null
+++ b/marufs_kernel/src/index.c
@@ -0,0 +1,524 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * index.c - MARUFS Global Index CAS operations
+ *
+ * Lock-free global index for file metadata. Index is sharded by filename hash,
+ * with each shard containing a hash-bucket array and flat entry array.
+ * All mutations use CAS (cmpxchg) on entry state fields and bucket head
+ * pointers for multi-node safety.
+ *
+ * Entry state machine:
+ * EMPTY --CAS--> INSERTING --> TENTATIVE --> VALID --CAS--> TOMBSTONE
+ *
+ * Bucket chains are singly-linked via next_in_bucket, terminating with
+ * MARUFS_BUCKET_END (0xFFFFFFFF).
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+
+/* CAS current state → INSERTING and stamp inserter identity for GC. */
+static inline bool marufs_index_claim_entry(struct marufs_sb_info *sbi,
+ struct marufs_index_entry *e)
+{
+ u32 st = READ_LE32(e->state);
+ if (st != MARUFS_ENTRY_EMPTY && st != MARUFS_ENTRY_TOMBSTONE)
+ return false;
+ if (marufs_le32_cas(&e->state, st, MARUFS_ENTRY_INSERTING) != st)
+ return false;
+
+ WRITE_LE64(e->created_at, ktime_get_real_ns());
+ WRITE_LE32(e->node_id, sbi->node_id);
+ /* WMB deferred to caller (step 4 / link_and_publish) */
+ return true;
+}
+
+/*
+ * marufs_index_check_duplicate - check for duplicate name in bucket chain
+ * @sbi: superblock info
+ * @entries: shard hot entry array
+ * @bucket_head: pointer to bucket head entry
+ * @hash: name hash to match
+ * @name: filename to match
+ * @namelen: filename length
+ * @reuse_idx: output — index of first reusable entry (TOMBSTONE or EMPTY)
+ * found in chain, or MARUFS_BUCKET_END if none. Caller can
+ * reuse this slot instead of scanning the entry array.
+ *
+ * Return: 0 if no duplicate, -EEXIST if name already exists
+ */
+static int marufs_index_check_duplicate(struct marufs_sb_info *sbi,
+ struct marufs_index_entry *entries,
+ u32 *bucket_head, u64 hash,
+ const char *name, size_t namelen,
+ u32 *reuse_idx)
+{
+ MARUFS_CXL_RMB(bucket_head, sizeof(*bucket_head));
+ u32 cur = READ_CXL_LE32(*bucket_head);
+ u32 steps = 0;
+ u32 *prev_next = bucket_head;
+ u32 num_entries = sbi->entries_per_shard;
+ *reuse_idx = MARUFS_BUCKET_END;
+
+ while (cur != MARUFS_BUCKET_END && cur < num_entries) {
+ if (++steps > num_entries) {
+ pr_err("bucket chain cycle detected (dup check)\n");
+ return -EIO;
+ }
+
+ /* Direct access + manual RMB to avoid per-iteration bounds check
+ * in marufs_shard_entry() — caller already validated shard geometry. */
+ struct marufs_index_entry *e = &entries[cur];
+ MARUFS_CXL_RMB(e, sizeof(*e));
+ u32 st = READ_CXL_LE32(e->state);
+
+ if (st == MARUFS_ENTRY_TOMBSTONE || st == MARUFS_ENTRY_EMPTY) {
+ /* Pre-read successor before any modification */
+ u32 next = READ_CXL_LE32(e->next_in_bucket);
+
+ if (*reuse_idx == MARUFS_BUCKET_END) {
+ /* Keep first dead entry for in-place reuse */
+ *reuse_idx = cur;
+ prev_next = (u32 *)&e->next_in_bucket;
+ } else {
+ /*
+ * Inline unlink: skip this dead entry by
+ * pointing prev directly to cur's successor.
+ * Best-effort — CAS failure is harmless.
+ * On success, reclaim to EMPTY so flat scan
+ * can safely reuse (no chain pollution since
+ * entry is no longer linked).
+ */
+ if (marufs_le32_cas(prev_next, cur, next) ==
+ cur) {
+ /* Unlinked — reclaim to EMPTY.
+ * Stale name/hash are harmless:
+ * flat scan only checks state,
+ * and the claimer overwrites all
+ * fields before publishing VALID.
+ * marufs_le32_cas includes CXL WMB. */
+ marufs_le32_cas(&e->state, st,
+ MARUFS_ENTRY_EMPTY);
+ }
+ /* Don't advance prev_next — it now points
+ * to the successor, re-read from there. */
+ }
+ cur = next;
+ continue;
+ }
+
+ /*
+ * Only VALID, TENTATIVE, or INSERTING remain (dead handled
+ * above). Skip INSERTING/TENTATIVE — not yet committed;
+ * concurrent duplicates caught by post-insert dedup (step 7),
+ * stale entries cleaned up by GC.
+ */
+ if (st == MARUFS_ENTRY_VALID &&
+ READ_CXL_LE64(e->name_hash) == hash &&
+ marufs_rat_name_matches(sbi, READ_CXL_LE32(e->region_id),
+ name, namelen)) {
+ pr_debug("index_insert: name '%.*s' already exists\n",
+ (int)namelen, name);
+ return -EEXIST;
+ }
+
+ /* Live entry — advance prev_next past it */
+ prev_next = (u32 *)&e->next_in_bucket;
+ cur = READ_LE32(e->next_in_bucket);
+ }
+
+ return 0;
+}
+
+/*
+ * marufs_index_post_insert_dedup - detect concurrent duplicate under shard lock.
+ * Called after link_to_bucket while holding the shard lock. Walks the
+ * bucket chain looking for another VALID entry with the same name.
+ * If found, returns -EEXIST so the caller can tombstone and release.
+ */
+static int marufs_index_post_insert_dedup(struct marufs_sb_info *sbi,
+ struct marufs_index_entry *entries,
+ u32 *bucket_head,
+ struct marufs_index_entry *entry,
+ u32 entry_idx, u64 hash,
+ const char *name, size_t namelen)
+{
+ MARUFS_CXL_RMB(bucket_head, sizeof(*bucket_head));
+ u32 cur = READ_CXL_LE32(*bucket_head);
+ u32 num_entries = sbi->entries_per_shard;
+ u32 steps = 0;
+
+ while (cur != MARUFS_BUCKET_END && cur < num_entries) {
+ if (++steps > num_entries) {
+ pr_err("bucket chain cycle detected (post-insert dedup)\n");
+ break;
+ }
+
+ struct marufs_index_entry *e = &entries[cur];
+ MARUFS_CXL_RMB(e, sizeof(*e));
+
+ if (cur != entry_idx &&
+ READ_CXL_LE32(e->state) == MARUFS_ENTRY_VALID &&
+ READ_CXL_LE64(e->name_hash) == hash &&
+ marufs_rat_name_matches(sbi, READ_CXL_LE32(e->region_id),
+ name, namelen)) {
+ if (entry_idx > cur) {
+ pr_info("index_insert: post-insert dup '%.*s' "
+ "(entry %u loses to %u)\n",
+ (int)namelen, name, entry_idx, cur);
+ return -EEXIST;
+ }
+ }
+ cur = READ_LE32(e->next_in_bucket);
+ }
+
+ return 0;
+}
+
+/*
+ * marufs_index_link_to_bucket - prepend entry to bucket chain via CAS.
+ * Self-loop guard: if entry is already the head (stale link from previous
+ * life), skip linking to avoid infinite chain walk.
+ * Return: 0 on success, -EAGAIN if CAS retries exhausted
+ */
+/*
+ * Caller holds Global ME (single writer on this node, cross-node also
+ * excluded) → no competitor can race bucket_head, so a plain WRITE suffices.
+ */
+static int marufs_index_link_to_bucket(struct marufs_index_entry *entry,
+ u32 entry_idx, u32 *bucket_head)
+{
+ MARUFS_CXL_RMB(bucket_head, sizeof(*bucket_head));
+ u32 old_head = READ_CXL_LE32(*bucket_head);
+
+ if (unlikely(old_head == entry_idx)) {
+ /* Already the bucket head (stale link from previous life).
+ * Chain successor is intact — just skip linking. */
+ return 0;
+ }
+
+ WRITE_LE32(entry->next_in_bucket, old_head);
+ MARUFS_CXL_WMB(entry, sizeof(*entry));
+
+ WRITE_LE32(*bucket_head, entry_idx);
+ MARUFS_CXL_WMB(bucket_head, sizeof(*bucket_head));
+ return 0;
+}
+
+/*
+ * marufs_index_insert - insert new file into global index
+ * @sbi: superblock info
+ * @name: filename
+ * @namelen: filename length
+ * @region_id: region where data is allocated (= RAT entry ID)
+ * @out_entry_idx: output - global entry index within shard
+ *
+ * Insert protocol (shard-lock serialized):
+ * 1. Pre-insert dup check (lock-free, best-effort)
+ * 2. CAS entry state EMPTY/TOMBSTONE -> INSERTING (reserve slot)
+ * 3. Fill entry fields
+ * 4. WRITE state to TENTATIVE (visible to dedup, not to lookup)
+ * 5. Acquire shard lock
+ * 6. CAS bucket head to link entry into hash chain
+ * 7. Post-insert dedup → loser TOMBSTONE, winner VALID (publish)
+ * 8. Release shard lock
+ *
+ * Return: 0 on success, -ENOSPC if shard full, -EEXIST if name exists
+ */
+/* Internal insert with caller-supplied hash */
+static int __marufs_index_insert(struct marufs_sb_info *sbi, const char *name,
+ size_t namelen, u64 hash, u32 region_id,
+ u32 *out_entry_idx)
+{
+ /* Step 1: select shard + bucket from hash (shard_cache = local DRAM) */
+ u32 shard_id = marufs_shard_idx(hash, sbi->shard_mask);
+ u32 bucket_id = marufs_bucket_idx(hash, sbi->bucket_mask);
+ struct marufs_shard_cache *sc = &sbi->shard_cache[shard_id];
+ if (!sc->entries || !sc->buckets) {
+ pr_err("index_insert: NULL entries/buckets for shard %u\n",
+ shard_id);
+ return -EINVAL;
+ }
+
+ struct marufs_index_entry *entries = sc->entries;
+ u32 *bucket_head = &sc->buckets[bucket_id];
+
+ /*
+ * Step 2: check for duplicate name before reserving slot.
+ * Walk bucket chain to find existing VALID entry with same name.
+ * Also records the first dead entry (TOMBSTONE/EMPTY) for reuse.
+ *
+ * NOTE: This pre-insert check has a TOCTOU window — two nodes can both
+ * pass it and insert the same name. Post-insert dedup after step 6
+ * detects and resolves this: the loser (higher entry_idx) rolls back
+ * to TOMBSTONE and returns -EEXIST.
+ */
+ u32 chain_reuse_idx = MARUFS_BUCKET_END;
+ int ret = marufs_index_check_duplicate(sbi, entries, bucket_head, hash,
+ name, namelen, &chain_reuse_idx);
+ if (ret)
+ return ret;
+
+ /*
+ * Step 3a: try to reuse a dead entry (TOMBSTONE or EMPTY) found in
+ * the bucket chain. The entry is already linked, so we skip
+ * link_and_publish later. Try CAS with both possible states.
+ */
+ struct marufs_index_entry *entry = NULL;
+ u32 entry_idx;
+ bool reused_chain_entry = false;
+
+ /* check_duplicate guarantees reuse_idx was dead; CAS-claim it */
+ if (chain_reuse_idx != MARUFS_BUCKET_END) {
+ struct marufs_index_entry *e = &entries[chain_reuse_idx];
+
+ if (marufs_index_claim_entry(sbi, e)) {
+ entry_idx = chain_reuse_idx;
+ entry = e;
+ reused_chain_entry = true;
+ pr_debug(
+ "index_insert: reusing chain entry %u in shard %u\n",
+ entry_idx, shard_id);
+ }
+ }
+
+ /*
+ * Step 3b: fallback — scan hot entry array for EMPTY slot if no
+ * tombstone was reused. Start from shard_free_hint to skip
+ * known-occupied prefix (H-P1: O(1) amortized vs O(n) linear).
+ */
+ u32 num_entries = sbi->entries_per_shard;
+ if (!entry) {
+ u32 hint = (u32)atomic_read(&sc->free_hint);
+ if (hint >= num_entries)
+ hint = 0;
+
+ for (u32 scan = 0; scan < num_entries; scan++) {
+ entry_idx = hint + scan;
+ if (entry_idx >= num_entries)
+ entry_idx -= num_entries;
+
+ struct marufs_index_entry *e = &entries[entry_idx];
+ MARUFS_CXL_RMB(e, sizeof(*e));
+ if (READ_CXL_LE32(e->state) != MARUFS_ENTRY_EMPTY)
+ continue;
+
+ if (marufs_index_claim_entry(sbi, e)) {
+ hint = entry_idx + 1;
+ if (hint >= num_entries)
+ hint = 0;
+
+ entry = e;
+ atomic_set(&sc->free_hint, hint);
+ break;
+ }
+ }
+ }
+
+ if (!entry) {
+ pr_debug("index_insert: shard %u full (%u entries)\n", shard_id,
+ num_entries);
+ return -ENOSPC;
+ }
+
+ /* Step 4: fill entry fields (INSERTING — exclusively owned) */
+ WRITE_LE64(entry->name_hash, hash);
+ WRITE_LE32(entry->region_id, region_id);
+ MARUFS_CXL_WMB(entry, sizeof(*entry));
+
+ /* Step 5: INSERTING → TENTATIVE (visible to dedup, not to lookup) */
+ WRITE_LE32(entry->state, MARUFS_ENTRY_TENTATIVE);
+ MARUFS_CXL_WMB(entry, sizeof(*entry));
+
+ /* Steps 6-8 run under Global ME (held by caller) — exclusive
+ * link + dedup with no intra/cross-node competitor. */
+
+ if (!reused_chain_entry) {
+ ret = marufs_index_link_to_bucket(entry, entry_idx,
+ bucket_head);
+ if (ret) {
+ WRITE_LE32(entry->state, MARUFS_ENTRY_TOMBSTONE);
+ MARUFS_CXL_WMB(entry, sizeof(*entry));
+ return ret;
+ }
+ }
+
+ /* Post-insert dedup — concurrent inserts resolved here */
+ ret = marufs_index_post_insert_dedup(sbi, entries, bucket_head, entry,
+ entry_idx, hash, name, namelen);
+ if (ret) {
+ WRITE_LE32(entry->state, MARUFS_ENTRY_TOMBSTONE);
+ MARUFS_CXL_WMB(entry, sizeof(*entry));
+ return ret;
+ }
+
+ /* TENTATIVE → VALID (now visible to lookup) */
+ WRITE_LE32(entry->state, MARUFS_ENTRY_VALID);
+ MARUFS_CXL_WMB(entry, sizeof(*entry));
+
+ *out_entry_idx = entry_idx;
+ return 0;
+}
+
+int marufs_index_insert(struct marufs_sb_info *sbi, const char *name,
+ size_t namelen, u32 region_id, u32 *out_entry_idx)
+{
+ u64 hash;
+
+ if (!sbi || !name || !out_entry_idx)
+ return -EINVAL;
+ if (namelen == 0 || namelen > MARUFS_NAME_MAX)
+ return -ENAMETOOLONG;
+ if (unlikely(!marufs_shard_geometry_valid(sbi->buckets_per_shard,
+ sbi->entries_per_shard))) {
+ pr_err("shard corrupted (buckets=%u entries=%u) - reformat needed\n",
+ sbi->buckets_per_shard, sbi->entries_per_shard);
+ return -EIO;
+ }
+
+ hash = marufs_hash_name(name, namelen);
+ return __marufs_index_insert(sbi, name, namelen, hash, region_id,
+ out_entry_idx);
+}
+
+/*
+ * marufs_index_lookup - lookup file by name in global index
+ * @sbi: superblock info
+ * @name: filename to search
+ * @namelen: filename length
+ * @out_entry: output pointer to found entry (NULL if not found)
+ *
+ * Lock-free read: walks bucket chain checking for VALID entries with matching
+ * name_hash and name. Uses rmb() after state check before reading entry fields.
+ *
+ * Return: 0 on success, -ENOENT if not found
+ */
+/* Lookup result — shared between lookup and delete */
+struct marufs_index_result {
+ struct marufs_index_entry *entry; /* matched CXL entry pointer */
+ u32 *prev_next; /* pointer to prev's next_in_bucket (or bucket head) */
+ u32 shard_id; /* shard containing the entry */
+ u32 entry_idx; /* entry index within the shard */
+};
+
+/*
+ * __marufs_index_lookup - internal chain walk for lookup/delete
+ * @sbi: superblock info
+ * @name: filename to search
+ * @namelen: filename length
+ * @result: output — entry pointer, prev link, shard/entry indices
+ *
+ * Return: 0 on success, -ENOENT if not found
+ */
+static int __marufs_index_lookup(struct marufs_sb_info *sbi, const char *name,
+ size_t namelen,
+ struct marufs_index_result *result)
+{
+ u64 hash;
+ u32 shard_id, bucket_idx, num_entries;
+ u32 cur, steps = 0;
+ u32 *prev_next;
+
+ if (!sbi || !name || !result)
+ return -EINVAL;
+ if (namelen == 0 || namelen > MARUFS_NAME_MAX)
+ return -ENOENT;
+
+ hash = marufs_hash_name(name, namelen);
+ shard_id = marufs_shard_idx(hash, sbi->shard_mask);
+ num_entries = sbi->entries_per_shard;
+ bucket_idx = marufs_bucket_idx(hash, sbi->bucket_mask);
+ prev_next = marufs_shard_bucket(sbi, shard_id, bucket_idx);
+ if (!prev_next)
+ return -ENOENT;
+
+ cur = READ_CXL_LE32(*prev_next);
+
+ while (cur != MARUFS_BUCKET_END && cur < num_entries) {
+ struct marufs_index_entry *entry =
+ marufs_shard_entry(sbi, shard_id, cur);
+ u32 next, state;
+
+ if (!entry)
+ return -EIO;
+
+ if (++steps > num_entries) {
+ pr_err("bucket chain cycle detected (lookup, shard %u bucket %u)\n",
+ shard_id, bucket_idx);
+ return -EIO;
+ }
+
+ state = READ_CXL_LE32(entry->state);
+ next = READ_CXL_LE32(entry->next_in_bucket);
+
+ if (state == MARUFS_ENTRY_VALID &&
+ READ_CXL_LE64(entry->name_hash) == hash &&
+ marufs_rat_name_matches(sbi,
+ READ_CXL_LE32(entry->region_id),
+ name, namelen)) {
+ result->entry = entry;
+ result->prev_next = prev_next;
+ result->shard_id = shard_id;
+ result->entry_idx = cur;
+ return 0;
+ }
+
+ prev_next = (u32 *)&entry->next_in_bucket;
+ cur = next;
+ }
+
+ return -ENOENT;
+}
+
+int marufs_index_lookup(struct marufs_sb_info *sbi, const char *name,
+ size_t namelen, struct marufs_index_entry **out_entry)
+{
+ struct marufs_index_result result;
+ int ret;
+
+ if (!out_entry)
+ return -EINVAL;
+
+ ret = __marufs_index_lookup(sbi, name, namelen, &result);
+ *out_entry = ret ? NULL : result.entry;
+ return ret;
+}
+
+/*
+ * marufs_index_delete - delete file from global index
+ * @sbi: superblock info
+ * @name: filename to delete
+ * @namelen: filename length
+ *
+ * CAS VALID → TOMBSTONE (logical delete). Entry stays in chain for
+ * in-place reuse by the next insert on the same bucket (step 3a).
+ *
+ * Return: 0 on success, -ENOENT if not found
+ */
+int marufs_index_delete(struct marufs_sb_info *sbi, const char *name,
+ size_t namelen)
+{
+ struct marufs_index_result result;
+ int ret = __marufs_index_lookup(sbi, name, namelen, &result);
+ if (ret)
+ return ret;
+
+ /* CAS VALID → TOMBSTONE (stays in chain for reuse) */
+ if (marufs_le32_cas(&result.entry->state, MARUFS_ENTRY_VALID,
+ MARUFS_ENTRY_TOMBSTONE) != MARUFS_ENTRY_VALID) {
+ pr_debug("index_delete: CAS failed for '%.*s'\n", (int)namelen,
+ name);
+ return -ENOENT;
+ }
+ /* CAS includes implicit full barrier — no extra WMB needed */
+
+ pr_debug("index_delete: '%.*s' deleted\n", (int)namelen, name);
+ return 0;
+}
diff --git a/marufs_kernel/src/index.h b/marufs_kernel/src/index.h
new file mode 100644
index 0000000..60dcf8e
--- /dev/null
+++ b/marufs_kernel/src/index.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * index.h - Global CAS hash index entry points.
+ */
+
+#ifndef _MARUFS_INDEX_H
+#define _MARUFS_INDEX_H
+
+#include
+
+struct marufs_sb_info;
+struct marufs_index_entry;
+
+int marufs_index_insert(struct marufs_sb_info *sbi, const char *name,
+ size_t namelen, u32 region_id, u32 *out_entry_idx);
+int marufs_index_lookup(struct marufs_sb_info *sbi, const char *name,
+ size_t namelen, struct marufs_index_entry **out_entry);
+int marufs_index_delete(struct marufs_sb_info *sbi, const char *name,
+ size_t namelen);
+
+#endif /* _MARUFS_INDEX_H */
diff --git a/marufs_kernel/src/inode.c b/marufs_kernel/src/inode.c
new file mode 100644
index 0000000..7c22316
--- /dev/null
+++ b/marufs_kernel/src/inode.c
@@ -0,0 +1,466 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * inode.c - MARUFS inode operations
+ *
+ * VFS inode operations implementation for MARUFS partitioned global index.
+ * Maps index entries to VFS inodes and handles metadata updates.
+ *
+ * Two-phase model:
+ * marufs_iget: reads RAT entry, sets data_phys_offset if region initialized
+ * marufs_setattr: ftruncate triggers marufs_region_init (first-time only, WORM)
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "compat.h"
+#include "marufs.h"
+
+/* ============================================================================
+ * inode read operations (index entry → VFS inode)
+ * ============================================================================ */
+
+/*
+ * marufs_resolve_rat_entry - read RAT entry fields into inode info
+ * @sbi: superblock info
+ * @xi: inode info to populate
+ * @rat_entry_id: RAT entry ID to resolve
+ *
+ * Reads ownership, region_offset, and data_phys_offset from RAT entry.
+ * v2: data starts directly at phys_offset (no header in region).
+ *
+ * Return: 0 on success, -ENOENT if RAT entry not allocated, -EINVAL if missing
+ */
+static int marufs_resolve_rat_entry(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi,
+ u32 rat_entry_id)
+{
+ struct marufs_rat_entry *rat_entry;
+ u64 phys_offset;
+
+ rat_entry = marufs_rat_entry_get(sbi, rat_entry_id);
+ if (!rat_entry)
+ return -EINVAL;
+ if (READ_LE32(rat_entry->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ return -ENOENT;
+
+ phys_offset = READ_LE64(rat_entry->phys_offset);
+ xi->region_offset = phys_offset;
+
+ /* Invalidate CL2 (owner_node_id, owner_pid, owner_birth_time, uid, gid, mode, ...) */
+ MARUFS_CXL_RMB(&rat_entry->default_perms, 64);
+
+ xi->owner_node_id = READ_LE16(rat_entry->owner_node_id);
+ xi->owner_pid = READ_LE32(rat_entry->owner_pid);
+ xi->owner_birth_time = READ_LE64(rat_entry->owner_birth_time);
+
+ if (marufs_valid_phys_offset(sbi, phys_offset))
+ xi->data_phys_offset = phys_offset;
+ else
+ xi->data_phys_offset = 0;
+
+ return 0;
+}
+
+/*
+ * marufs_inode_fill_from_entry - populate VFS inode fields from index + RAT
+ * @inode: target inode
+ * @hot: index hot entry (file_size)
+ * @rat_e: RAT entry (uid, gid, mode)
+ *
+ * Reads file_size from index entry, uid/gid/mode from RAT entry.
+ */
+static void marufs_inode_fill_from_entry(struct inode *inode,
+ struct marufs_rat_entry *rat_e)
+{
+ /* Invalidate CL2 (uid, gid, mode are in CL2 starting at owner_node_id) */
+ MARUFS_CXL_RMB(&rat_e->owner_node_id, 64);
+
+ inode->i_mode = S_IFREG | READ_LE16(rat_e->mode);
+ inode->i_uid = make_kuid(&init_user_ns, READ_LE32(rat_e->uid));
+ inode->i_gid = make_kgid(&init_user_ns, READ_LE32(rat_e->gid));
+ inode->i_size = READ_LE64(rat_e->size);
+ inode->i_blocks = (inode->i_size + MARUFS_SECTOR_SIZE - 1) >>
+ MARUFS_SECTOR_SHIFT;
+ set_nlink(inode, 1);
+}
+
+/*
+ * marufs_iget - read inode from global index entry
+ * @sb: Superblock
+ * @ie: Index entry pointer
+ * @shard_id: Shard ID this entry belongs to
+ * @entry_idx: Entry index within shard's entry array
+ *
+ * Creates or updates VFS inode from index entry. Reads latest file_size
+ * from shared CXL memory to support cross-node visibility.
+ *
+ * Two-phase awareness:
+ * phys_offset > 0: region initialized, data_phys_offset = phys_offset
+ * phys_offset == 0: region not yet initialized (ftruncate pending)
+ *
+ * Return: inode pointer on success, ERR_PTR on error
+ */
+struct inode *marufs_iget(struct super_block *sb,
+ struct marufs_index_entry *entry, u32 shard_id,
+ u32 entry_idx)
+{
+ struct marufs_inode_info *xi;
+ struct inode *inode;
+ struct marufs_sb_info *sbi = marufs_sb_get(sb);
+ struct marufs_rat_entry *rat_e;
+ u32 region_id;
+ unsigned long ino;
+ int ret;
+
+ /* Read fresh data from shared memory */
+ MARUFS_CXL_RMB(entry, sizeof(*entry));
+ region_id = READ_CXL_LE32(entry->region_id);
+ ino = marufs_make_ino(region_id);
+
+ /* Check if entry is VALID */
+ if (READ_LE32(entry->state) != MARUFS_ENTRY_VALID)
+ return ERR_PTR(-ENOENT);
+
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ xi = marufs_inode_get(inode);
+ xi->region_id = region_id;
+ xi->entry_idx = entry_idx;
+ xi->shard_id = shard_id;
+ xi->rat_entry_id = region_id;
+
+ if (!(inode->i_state & I_NEW)) {
+ /*
+ * Already cached — refresh from CXL memory for cross-node
+ * visibility. When a RAT entry is reused (file deleted then
+ * new file allocated to the same entry), the cached inode
+ * carries stale data_phys_offset / owner fields. Re-read
+ * everything that can change between uses of the same ino.
+ */
+ rat_e = marufs_rat_entry_get(sbi, region_id);
+ if (rat_e)
+ marufs_inode_fill_from_entry(inode, rat_e);
+
+ /* Re-validate RAT entry: GC may have freed it since caching */
+ ret = marufs_resolve_rat_entry(sbi, xi, region_id);
+ if (ret == -ENOENT) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+ return inode;
+ }
+
+ /* Restore fields from RAT entry */
+ ret = marufs_resolve_rat_entry(sbi, xi, region_id);
+ if (ret == -ENOENT) {
+ iget_failed(inode);
+ return ERR_PTR(-ENOENT);
+ }
+ if (ret == -EINVAL) {
+ /* Invalid RAT entry ID */
+ xi->region_offset = 0;
+ xi->owner_node_id = 0;
+ xi->owner_pid = 0;
+ xi->owner_birth_time = 0;
+ xi->data_phys_offset = 0;
+ }
+
+ rat_e = marufs_rat_entry_get(sbi, region_id);
+ if (rat_e) {
+ u64 alloc_time, mod_time;
+ struct timespec64 ts;
+
+ marufs_inode_fill_from_entry(inode, rat_e);
+
+ /* Timestamps from RAT entry */
+ alloc_time = READ_LE64(rat_e->alloc_time);
+ mod_time = READ_LE64(rat_e->modified_at);
+
+ ts.tv_sec = alloc_time / NSEC_PER_SEC;
+ ts.tv_nsec = alloc_time % NSEC_PER_SEC;
+ inode_set_atime_to_ts(inode, ts);
+
+ ts.tv_sec = mod_time / NSEC_PER_SEC;
+ ts.tv_nsec = mod_time % NSEC_PER_SEC;
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_ctime_to_ts(inode, ts);
+ }
+
+ inode->i_op = &marufs_file_inode_ops;
+ inode->i_fop = &marufs_file_ops;
+ inode->i_mapping->a_ops = &marufs_aops;
+
+ unlock_new_inode(inode);
+ return inode;
+}
+
+/* ============================================================================
+ * New inode creation
+ * ============================================================================ */
+
+/*
+ * marufs_new_inode - create new VFS inode
+ * @sb: Superblock
+ * @mode: file mode (S_IFREG | permissions)
+ *
+ * Creates new in-memory inode before writing to index. Caller must fill in
+ * region_id, slot_idx, entry_idx, shard_id.
+ *
+ * Return: inode pointer on success, ERR_PTR on error
+ */
+struct inode *marufs_new_inode(struct super_block *sb, umode_t mode)
+{
+ struct inode *inode;
+ struct marufs_inode_info *xi;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ xi = marufs_inode_get(inode);
+ marufs_inode_info_init(xi);
+
+ inode->i_ino = 0;
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+
+ {
+ struct timespec64 ts = current_time(inode);
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_ctime_to_ts(inode, ts);
+ }
+
+ inode->i_blocks = 0;
+ set_nlink(inode, 1);
+
+ if (S_ISREG(mode)) {
+ inode->i_op = &marufs_file_inode_ops;
+ inode->i_fop = &marufs_file_ops;
+ inode->i_mapping->a_ops = &marufs_aops;
+ } else if (S_ISDIR(mode)) {
+ inode->i_op = &marufs_dir_inode_ops;
+ inode->i_fop = &marufs_dir_ops;
+ set_nlink(inode, 2);
+ }
+
+ return inode;
+}
+
+/* ============================================================================
+ * inode writeback
+ * ============================================================================ */
+
+/*
+ * marufs_write_inode - write inode metadata back to index entry
+ * @inode: inode to write
+ * @wbc: writeback control
+ *
+ * Updates index entry with current file size and timestamp. Called by VFS
+ * during writeback or sync operations.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+/*
+ * marufs_index_entry_sync_size - sync file size and timestamp to CXL index
+ * @sbi: superblock info
+ * @xi: marufs inode info
+ * @hot: hot entry pointer (must not be NULL)
+ * @size: file size to write
+ *
+ * Writes file_size to hot entry, modified_at to cold entry, then issues WMB.
+ */
+static void marufs_rat_sync_size(struct marufs_sb_info *sbi,
+ struct marufs_inode_info *xi, loff_t size)
+{
+ struct marufs_rat_entry *rat_e =
+ marufs_rat_entry_get(sbi, xi->rat_entry_id);
+
+ if (rat_e) {
+ WRITE_LE64(rat_e->size, size);
+ WRITE_LE64(rat_e->modified_at, ktime_get_real_ns());
+ MARUFS_CXL_WMB(rat_e, sizeof(*rat_e));
+ }
+}
+
+int marufs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct marufs_inode_info *xi = marufs_inode_get(inode);
+ struct marufs_sb_info *sbi = marufs_sb_get(inode->i_sb);
+
+ if (inode->i_ino == MARUFS_ROOT_INO)
+ return 0;
+
+ /* Update file size and timestamp in RAT */
+ marufs_rat_sync_size(sbi, xi, inode->i_size);
+
+ return 0;
+}
+
+/* ============================================================================
+ * inode eviction
+ * ============================================================================ */
+
+/*
+ * marufs_evict_inode - evict inode from cache
+ * @inode: inode to evict
+ *
+ * Called when inode is removed from cache. Cleans up page cache and marks
+ * inode as clean.
+ */
+void marufs_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
+/* ============================================================================
+ * File attribute operations
+ * ============================================================================ */
+
+/*
+ * marufs_getattr - get file attributes
+ *
+ * Reads latest file size from index entry to ensure cross-node consistency,
+ * then fills stat structure.
+ */
+static int marufs_getattr(MARUFS_IDMAP_PARAM_COMMA const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct marufs_inode_info *xi = marufs_inode_get(inode);
+ struct super_block *sb = inode->i_sb;
+ struct marufs_sb_info *sbi = marufs_sb_get(sb);
+
+ marufs_generic_fillattr(MARUFS_IDMAP_ARG_COMMA request_mask, inode,
+ stat);
+
+ if (inode->i_ino != MARUFS_ROOT_INO) {
+ /* Read latest file_size from RAT — write to stat directly
+ * to avoid i_size_write() without i_rwsem */
+ struct marufs_rat_entry *rat_e =
+ marufs_rat_entry_get(sbi, xi->rat_entry_id);
+ if (rat_e) {
+ u64 fresh_size = READ_CXL_LE64(rat_e->size);
+ stat->size = fresh_size;
+ stat->blocks = (fresh_size + MARUFS_SECTOR_SIZE - 1) >>
+ MARUFS_SECTOR_SHIFT;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * marufs_setattr - set file attributes (ftruncate triggers region allocation)
+ *
+ * WORM enforcement:
+ * - First ftruncate (i_size == 0 -> new size): allocates physical region
+ * - Second ftruncate (i_size > 0): rejected with -EACCES
+ *
+ * This is the key function in the two-phase create model:
+ * open(O_CREAT) creates lightweight entry (i_size=0)
+ * ftruncate(size) triggers marufs_region_init() here
+ */
+static int marufs_setattr(MARUFS_IDMAP_PARAM_COMMA struct dentry *dentry,
+ struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ ret = marufs_setattr_prepare(MARUFS_IDMAP_ARG_COMMA dentry, attr);
+ if (ret)
+ return ret;
+
+ /* CXL FS: only ATTR_SIZE (ftruncate) is meaningful — silently
+ * succeed for other attrs (chmod/chown/utimes) since CXL memory
+ * does not persist POSIX attributes.
+ */
+ if (!(attr->ia_valid & ATTR_SIZE))
+ return 0;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ struct marufs_inode_info *xi = marufs_inode_get(inode);
+ struct marufs_sb_info *sbi = marufs_sb_get(inode->i_sb);
+
+ /* MARUFS permission check before region allocation */
+ ret = marufs_check_permission(sbi, xi->rat_entry_id,
+ MARUFS_PERM_WRITE);
+ if (ret)
+ return ret;
+
+ /* WORM: reject size changes if file already has data */
+ if (inode->i_size > 0)
+ return -EACCES;
+
+ /* No-op if setting to 0 */
+ if (attr->ia_size == 0)
+ return 0;
+
+ /*
+ * First-time size set: allocate physical region.
+ * marufs_region_init finds contiguous space, initializes region header,
+ * and updates RAT entry with phys_offset and size.
+ */
+ ret = marufs_region_init(sbi, xi->rat_entry_id, attr->ia_size);
+ if (ret) {
+ pr_err("region_init failed for rat_entry %u: %d\n",
+ xi->rat_entry_id, ret);
+ return ret;
+ }
+
+ /* Update inode from initialized region */
+ {
+ struct marufs_rat_entry *rat_entry;
+ u64 region_offset;
+
+ rat_entry = marufs_rat_entry_get(sbi, xi->rat_entry_id);
+ if (!rat_entry)
+ return -EIO;
+
+ region_offset = READ_LE64(rat_entry->phys_offset);
+ xi->region_offset = region_offset;
+
+ /* v2: data starts directly at phys_offset */
+ xi->data_phys_offset = region_offset;
+ }
+
+ /* Set inode size */
+ truncate_setsize(inode, attr->ia_size);
+
+ {
+ struct timespec64 now = current_time(inode);
+ inode_set_mtime_to_ts(inode, now);
+ inode_set_ctime_to_ts(inode, now);
+ }
+
+ /* Update file size in RAT */
+ marufs_rat_sync_size(sbi, xi, inode->i_size);
+
+ mark_inode_dirty(inode);
+
+ pr_debug("ftruncate rat=%u size=%lld slot_base=0x%llx\n",
+ xi->rat_entry_id, inode->i_size, xi->data_phys_offset);
+ }
+
+ return 0;
+}
+
+/* ============================================================================
+ * File inode operations table
+ * ============================================================================ */
+
+const struct inode_operations marufs_file_inode_ops = {
+ .getattr = marufs_getattr,
+ .setattr = marufs_setattr,
+};
diff --git a/marufs_kernel/src/inode.h b/marufs_kernel/src/inode.h
new file mode 100644
index 0000000..dbffb7c
--- /dev/null
+++ b/marufs_kernel/src/inode.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * inode.h - In-memory inode state and inode entry points.
+ *
+ * `marufs_inode_info` lives here so any layer needing per-file metadata
+ * (region/shard/owner) can include just this header. `vfs_inode` MUST
+ * remain the last field (container_of contract).
+ */
+
+#ifndef _MARUFS_INODE_H
+#define _MARUFS_INODE_H
+
+#include
+#include
+
+struct marufs_index_entry;
+
+struct marufs_inode_info {
+ u32 region_id; /* Region where data is stored (= RAT entry ID) */
+ u32 entry_idx; /* Global index entry index (for writeback) */
+ u32 shard_id; /* Shard this entry belongs to */
+ u32 rat_entry_id;
+ u64 region_offset;
+ u32 owner_node_id;
+ u32 owner_pid;
+ u64 owner_birth_time;
+ u64 data_phys_offset; /* Cached: region phys_offset (= data start) */
+ struct inode vfs_inode; /* VFS inode (must be last!) */
+};
+
+static inline struct marufs_inode_info *marufs_inode_get(struct inode *inode)
+{
+ return container_of(inode, struct marufs_inode_info, vfs_inode);
+}
+
+/* Zero-initialize all marufs-specific fields of inode_info. */
+static inline void marufs_inode_info_init(struct marufs_inode_info *xi)
+{
+ xi->region_id = 0;
+ xi->entry_idx = 0;
+ xi->shard_id = 0;
+ xi->rat_entry_id = 0;
+ xi->region_offset = 0;
+ xi->owner_node_id = 0;
+ xi->owner_pid = 0;
+ xi->owner_birth_time = 0;
+ xi->data_phys_offset = 0;
+}
+
+struct inode *marufs_iget(struct super_block *sb,
+ struct marufs_index_entry *entry, u32 shard_id,
+ u32 entry_idx);
+struct inode *marufs_new_inode(struct super_block *sb, umode_t mode);
+int marufs_write_inode(struct inode *inode, struct writeback_control *wbc);
+void marufs_evict_inode(struct inode *inode);
+
+extern const struct inode_operations marufs_file_inode_ops;
+extern const struct inode_operations marufs_dir_inode_ops;
+
+#endif /* _MARUFS_INODE_H */
diff --git a/marufs_kernel/src/marufs.h b/marufs_kernel/src/marufs.h
new file mode 100644
index 0000000..7d251cd
--- /dev/null
+++ b/marufs_kernel/src/marufs.h
@@ -0,0 +1,473 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs.h - MARUFS in-memory core (sbi, DAX/RAT/shard helpers).
+ *
+ * ============================================================================
+ * MARUFS Architecture: CAS-based Lock-free Distributed Model
+ * ============================================================================
+ *
+ * All nodes can concurrently read/write the global index using CAS
+ * (Compare-And-Swap) operations. No GCS (Global Chunk Server) needed.
+ *
+ * On-disk layout lives in marufs_layout.h (umbrella over per-domain
+ * layout headers). Per-module entry points live in their own headers
+ * (gc.h, acl.h, region.h, index.h, nrht.h, cache.h, inode.h, super.h,
+ * file.h, dir.h). This file owns the in-memory `marufs_sb_info` plus
+ * inline DAX/RAT/shard accessors that all subsystems reuse.
+ */
+
+#ifndef _MARUFS_H
+#define _MARUFS_H
+
+/* ============================================================================
+ * Module name configuration
+ * ============================================================================
+ * MARUFS_MODULE_NAME is defined via compiler flag in Makefile (-DMARUFS_MODULE_NAME).
+ * Change MODULE_NAME in Makefile to load multiple instances without conflicts.
+ * This affects: filesystem type name, sysfs path, and kernel module name.
+ */
+#ifndef MARUFS_MODULE_NAME
+#define MARUFS_MODULE_NAME "marufs" /* Fallback if not defined by Makefile */
+#endif
+
+/* pr_fmt: 모든 pr_* 로그에 모듈 이름 자동 접두사 */
+#undef pr_fmt
+#define pr_fmt(fmt) MARUFS_MODULE_NAME ": " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* On-disk layout + ME types (sbi fields reference both) */
+#include "marufs_layout.h"
+#include "me.h"
+
+/* GC orphan tracker types — referenced by sbi fields below */
+#include "gc.h"
+
+/* Bootstrap slot management */
+#include "bootstrap.h"
+
+/* Per-module entry points (kept here so .c files including only marufs.h
+ * still see all subsystem APIs — preserves existing include patterns). */
+#include "acl.h"
+#include "cache.h"
+#include "dir.h"
+#include "file.h"
+#include "index.h"
+#include "inode.h"
+#include "nrht.h"
+#include "region.h"
+#include "super.h"
+
+/* ============================================================================
+ * Filesystem constants
+ * ============================================================================ */
+
+#define MARUFS_ROOT_INO 1 /* Root directory ino */
+
+/* Stale entry timeout: INSERTING, GRANTING, ALLOCATING states older than
+ * this are considered abandoned and eligible for GC reclaim. */
+#define MARUFS_STALE_TIMEOUT_NS (5ULL * NSEC_PER_SEC)
+
+/* VFS blocksize (for sb->s_blocksize) */
+#define MARUFS_VFS_BLOCK_SIZE 4096
+#define MARUFS_VFS_BLOCK_SIZE_BITS 12
+
+/* Sector size for i_blocks calculation */
+#define MARUFS_SECTOR_SIZE 512
+#define MARUFS_SECTOR_SHIFT 9
+
+/* Root directory mode */
+#define MARUFS_ROOT_DIR_MODE 0755
+
+/* Forward declarations */
+struct marufs_entry_cache;
+struct marufs_me_instance;
+struct marufs_nrht_stats_pcpu;
+
+/* ============================================================================
+ * Local DRAM cache for shard metadata (read-only after format)
+ * ============================================================================
+ *
+ * Eliminates per-operation CXL reads for shard header fields that never
+ * change: bucket/entry array pointers and counts. Populated once at mount
+ * from CXL shard_table, then accessed from local DRAM on every index op.
+ */
+
+struct marufs_shard_cache {
+ u32 *buckets; /* CXL bucket array pointer */
+ struct marufs_index_entry *entries; /* CXL entry array (32B each) */
+ struct marufs_shard_header *header; /* CXL shard header */
+ atomic_t free_hint; /* Next-free scan start (not for correctness) */
+};
+
+/* ============================================================================
+ * In-memory superblock info
+ * ============================================================================
+ *
+ * Combines device/DAX state with CXL memory layout pointers.
+ * One instance per mounted filesystem, stored in sb->s_fs_info.
+ */
+
+struct marufs_sb_info {
+ /* ── Storage backend (DEV_DAX) ────────────────────────────────────── */
+ void *dax_base; /* DAX direct access base (virtual) */
+ phys_addr_t phys_base; /* Physical base address for DAX mmap */
+ long dax_nr_pages; /* DAX mapped page count */
+ u64 total_size; /* Total device size (bytes) */
+ char daxdev_path[128]; /* DEV_DAX device path */
+ struct file *dax_filp; /* DEV_DAX device file for mmap delegation */
+
+ /* ── vm_ops wrapper (mprotect enforcement) ──────────────────────────
+ * Lazy-seeded at first mmap: copy underlying ops, override .mprotect.
+ * All vmas point to &sbi->vm_ops; no per-vma alloc.
+ */
+ struct vm_operations_struct vm_ops;
+ bool vm_ops_seeded;
+ struct mutex vm_ops_lock;
+
+ /* ── Node identity ────────────────────────────────────────────────── */
+ u32 node_id;
+
+ /* ── Cached on-disk layout pointers (set at mount, valid until umount) */
+ struct marufs_superblock *gsb; /* Global superblock */
+ struct marufs_bootstrap_slot *bootstrap_slots; /* slot base array */
+ struct marufs_rat *rat; /* Region allocation table array */
+ void *me_area; /* ME membership area base */
+
+ /* ── Geometry (read from GSB at mount time) ───────────────────────── */
+ u32 num_shards;
+ u32 shard_mask; /* num_shards - 1 */
+ u32 buckets_per_shard;
+ u32 bucket_mask; /* buckets_per_shard - 1 */
+ u32 entries_per_shard;
+
+ /* ── Per-shard DRAM cache (read-only after mount) ─────────────────── */
+ struct marufs_shard_cache *shard_cache;
+ bool has_struct_pages; /* ZONE_DEVICE pages exist (VM_MIXEDMAP) */
+
+ /* ── Entry cache ──────────────────────────────────────────────────── */
+ struct marufs_entry_cache *entry_cache;
+
+ /* ── Mutual Exclusion (ME) ────────────────────────────────────────── */
+ struct marufs_me_instance *me; /* Global ME */
+ struct marufs_me_instance *nrht_me[MARUFS_MAX_RAT_ENTRIES];
+ struct mutex nrht_me_lock; /* serializes nrht_me[] creation */
+ /* Unified poll thread: serves Global ME + all NRHT MEs */
+ struct list_head me_list;
+ struct mutex me_list_lock;
+ struct task_struct *me_poll_thread;
+
+ /* ── NRHT stats (per-CPU) ─────────────────────────────────────────── */
+ /* May be NULL before alloc; recorders guard for that. */
+ struct marufs_nrht_stats_pcpu __percpu *nrht_stats;
+
+ /* ── Garbage collector ────────────────────────────────────────────── */
+ struct task_struct *gc_thread;
+ atomic_t gc_active; /* 0 = GC should self-terminate */
+ atomic_t gc_paused; /* 1 = GC temporarily paused */
+ atomic_t gc_epoch; /* GC cycle counter (liveness check) */
+ u32 gc_next_shard; /* Round-robin sweep position */
+ struct marufs_orphan_tracker gc_orphans[MARUFS_GC_ORPHAN_MAX];
+ u32 gc_orphan_count;
+ DECLARE_BITMAP(gc_nrht_bitmap, MARUFS_MAX_RAT_ENTRIES);
+
+ /* ── Bootstrap auto-mount ─────────────────────────────────────────── */
+ int bootstrap_slot_idx; /* -1 = not claimed; 0..7 = slot index */
+ u64 bootstrap_token; /* our random_token (for verification) */
+
+ /* ── Admin role (lowest active node_id, refreshed dynamically) ───── */
+ u32 cached_admin_node_id;
+};
+
+/* ============================================================================
+ * sbi accessor
+ * ============================================================================ */
+
+static inline struct marufs_sb_info *marufs_sb_get(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/* ── Bootstrap slot pointer accessor ──────────────────────────────────────
+ *
+ * marufs_bootstrap_slot_get - return pointer to bootstrap slot @idx.
+ * Defined here (after marufs_sb_info) because bootstrap.h is included before
+ * the struct definition so it cannot host the inline body.
+ */
+static inline struct marufs_bootstrap_slot *
+marufs_bootstrap_slot_get(struct marufs_sb_info *sbi, int idx)
+{
+ if (WARN_ON_ONCE(idx < 0 || idx >= MARUFS_BOOTSTRAP_MAX_SLOTS))
+ return NULL;
+ if (unlikely(!sbi || !sbi->bootstrap_slots))
+ return NULL;
+ return &sbi->bootstrap_slots[idx];
+}
+
+/*
+ * marufs_bootstrap_promote_claimed - write CLAIMED to slot[0] with WMB.
+ * Convenience for the 3 super.c sites that promote slot[0] after format.
+ * Calls marufs_bootstrap_set_status() (declared in bootstrap.h).
+ */
+static inline void marufs_bootstrap_promote_claimed(struct marufs_sb_info *sbi)
+{
+ marufs_bootstrap_set_status(sbi, 0, MARUFS_BS_CLAIMED);
+}
+
+/* ============================================================================
+ * DAX abstraction API
+ * ============================================================================
+ *
+ * All filesystem code uses these helpers to access device memory (DEV_DAX).
+ */
+
+/* Get direct pointer to device memory at @offset (DAX only) */
+static inline void *marufs_dax_ptr(struct marufs_sb_info *sbi, u64 offset)
+{
+ return (void *)((char *)sbi->dax_base + offset);
+}
+
+/* Return total device size in bytes */
+static inline u64 marufs_dax_size(struct marufs_sb_info *sbi)
+{
+ return sbi->total_size;
+}
+
+/* Validate that a physical offset is within the DAX mapping */
+static inline bool marufs_valid_phys_offset(struct marufs_sb_info *sbi,
+ u64 offset)
+{
+ return offset > 0 && offset < sbi->total_size && sbi->dax_base;
+}
+
+/*
+ * marufs_gsb_get - GSB pointer with RMB.
+ *
+ * Returns sbi->gsb (cached after marufs_read_superblock).
+ * marufs_read_superblock itself computes the pointer directly from dax_base
+ * and assigns sbi->gsb — it does not call this helper.
+ */
+static inline struct marufs_superblock *
+marufs_gsb_get(struct marufs_sb_info *sbi)
+{
+ if (unlikely(!sbi || !sbi->gsb))
+ return NULL;
+ MARUFS_CXL_RMB(sbi->gsb,
+ sizeof(*sbi->gsb) - sizeof(sbi->gsb->reserved));
+ return sbi->gsb;
+}
+
+/*
+ * marufs_rat_get - cached RAT pointer with CL0 RMB.
+ *
+ * sbi->rat is set by marufs_load_rat before any runtime caller reaches here.
+ */
+static inline struct marufs_rat *marufs_rat_get(struct marufs_sb_info *sbi)
+{
+ if (unlikely(!sbi || !sbi->rat))
+ return NULL;
+ MARUFS_CXL_RMB(sbi->rat, 64); /* Invalidate CL0 (hot I/O metadata) */
+ return sbi->rat;
+}
+
+/* RAT entry accessor — CL0 RMB included, caller adds CL2 RMB if needed */
+static inline struct marufs_rat_entry *
+marufs_rat_entry_get(struct marufs_sb_info *sbi, u32 id)
+{
+ if (unlikely(!sbi || !sbi->rat || id >= MARUFS_MAX_RAT_ENTRIES))
+ return NULL;
+
+ struct marufs_rat_entry *e = &sbi->rat->entries[id];
+ MARUFS_CXL_RMB(e, 64); /* Invalidate CL0 (hot I/O metadata) */
+ return e;
+}
+
+/* Direct delegation entry array accessor from RAT entry */
+static inline struct marufs_deleg_entry *
+marufs_rat_deleg_entries(struct marufs_rat_entry *entry)
+{
+ return entry->deleg_entries;
+}
+
+/* Safe single delegation entry accessor with bounds check + RMB */
+static inline struct marufs_deleg_entry *
+marufs_rat_deleg_entry(struct marufs_rat_entry *entry, u32 idx)
+{
+ if (unlikely(idx >= MARUFS_DELEG_MAX_ENTRIES))
+ return NULL;
+
+ struct marufs_deleg_entry *de = &entry->deleg_entries[idx];
+ MARUFS_CXL_RMB(de, sizeof(*de));
+ return de;
+}
+
+/* ── Typed DAX region accessors ──────────────────────────────────────────── */
+
+/*
+ * marufs_shard_header_get - shard header pointer for @shard_id.
+ *
+ * Returns the cached header pointer from sbi->shard_cache[shard_id].header.
+ * Cache is allocated and header pointers populated in
+ * marufs_sbi_init_layout_ptrs() at mount start, BEFORE format/read_superblock,
+ * so this helper is valid for both format-side writes and post-mount reads.
+ */
+static inline struct marufs_shard_header *
+marufs_shard_header_get(struct marufs_sb_info *sbi, u32 shard_id)
+{
+ if (unlikely(!sbi || !sbi->shard_cache ||
+ shard_id >= MARUFS_REGION_NUM_SHARDS))
+ return NULL;
+
+ struct marufs_shard_header *sh = sbi->shard_cache[shard_id].header;
+ if (unlikely(!sh))
+ return NULL;
+
+ MARUFS_CXL_RMB(sh, sizeof(*sh));
+ return sh;
+}
+
+/*
+ * marufs_me_area_get - cached ME membership area base.
+ * sbi->me_area is set in marufs_fill_super_common before marufs_me_create.
+ */
+static inline void *marufs_me_area_get(struct marufs_sb_info *sbi)
+{
+ if (unlikely(!sbi || !sbi->me_area))
+ return NULL;
+ return sbi->me_area;
+}
+
+/*
+ * marufs_file_data_at - file data pointer at (data_phys_offset + pos).
+ *
+ * data_phys_offset is the per-inode physical base; pos is the read offset
+ * within that region. Caller must RMB before accessing the returned pointer.
+ */
+static inline void *marufs_file_data_at(struct marufs_sb_info *sbi,
+ u64 data_phys_offset, loff_t pos)
+{
+ return marufs_dax_ptr(sbi, data_phys_offset + (u64)pos);
+}
+
+/* Validate that [offset, offset+size) is within DAX mapping (overflow-safe) */
+static inline bool marufs_dax_range_valid(struct marufs_sb_info *sbi,
+ u64 offset, u64 size)
+{
+ if (unlikely(!sbi->dax_base || size == 0))
+ return false;
+ if (unlikely(offset >= sbi->total_size))
+ return false;
+ if (unlikely(size >
+ sbi->total_size - offset)) /* overflow-safe subtraction */
+ return false;
+ return true;
+}
+
+/*
+ * marufs_rat_name_matches - check if RAT entry name matches given name
+ * Invalidates CXL cache, then compares with null-terminator awareness.
+ * Returns true if names match.
+ */
+static inline bool marufs_rat_name_matches(struct marufs_sb_info *sbi,
+ u32 region_id, const char *name,
+ size_t namelen)
+{
+ struct marufs_rat_entry *rat_e = marufs_rat_entry_get(sbi, region_id);
+ if (!rat_e)
+ return false;
+ MARUFS_CXL_RMB(rat_e->name, sizeof(rat_e->name));
+ return strncmp(rat_e->name, name, namelen) == 0 &&
+ (namelen >= sizeof(rat_e->name) || rat_e->name[namelen] == '\0');
+}
+
+/* Validate data_phys_offset + access range is within DAX mapping */
+static inline bool marufs_validate_region_addr(struct marufs_sb_info *sbi,
+ u64 data_phys_offset,
+ u64 access_size)
+{
+ if (unlikely(data_phys_offset == 0))
+ return false;
+ return marufs_dax_range_valid(sbi, data_phys_offset, access_size);
+}
+
+/* ============================================================================
+ * Shard helpers
+ * ============================================================================ */
+
+/* Validate shard geometry is non-zero and within sane bounds */
+static inline bool marufs_shard_geometry_valid(u32 num_buckets, u32 num_entries)
+{
+ u32 limit = MARUFS_REGION_ENTRIES_PER_SHARD;
+ return num_buckets > 0 && num_entries > 0 && num_buckets <= limit &&
+ num_entries <= limit;
+}
+
+/* Return shard's bucket array pointer (from local DRAM shard_cache) */
+static inline u32 *marufs_shard_buckets(struct marufs_sb_info *sbi,
+ u32 shard_id)
+{
+ if (unlikely(shard_id >= sbi->num_shards))
+ return NULL;
+ return sbi->shard_cache[shard_id].buckets;
+}
+
+/* Safe bucket access: validate shard_id and bucket_idx within shard bounds */
+static inline u32 *marufs_shard_bucket(struct marufs_sb_info *sbi, u32 shard_id,
+ u32 bucket_idx)
+{
+ if (unlikely(shard_id >= sbi->num_shards))
+ return NULL;
+ struct marufs_shard_cache *sc = &sbi->shard_cache[shard_id];
+ if (unlikely(bucket_idx >= sbi->buckets_per_shard))
+ return NULL;
+ if (unlikely(!sc->buckets))
+ return NULL;
+
+ u32 *bucket = &sc->buckets[bucket_idx];
+ MARUFS_CXL_RMB(bucket, sizeof(*bucket));
+ return bucket;
+}
+
+/* Return shard's entry array pointer (from local DRAM shard_cache) */
+static inline struct marufs_index_entry *
+marufs_shard_entries(struct marufs_sb_info *sbi, u32 shard_id)
+{
+ if (unlikely(shard_id >= sbi->num_shards))
+ return NULL;
+ return sbi->shard_cache[shard_id].entries;
+}
+
+/* Safe entry access: validate shard_id and entry_idx within shard bounds */
+static inline struct marufs_index_entry *
+marufs_shard_entry(struct marufs_sb_info *sbi, u32 shard_id, u32 entry_idx)
+{
+ if (unlikely(shard_id >= sbi->num_shards))
+ return NULL;
+ struct marufs_shard_cache *sc = &sbi->shard_cache[shard_id];
+ if (unlikely(entry_idx >= sbi->entries_per_shard ||
+ !marufs_shard_geometry_valid(1, sbi->entries_per_shard)))
+ return NULL;
+ if (unlikely(!sc->entries))
+ return NULL;
+
+ struct marufs_index_entry *e = &sc->entries[entry_idx];
+ MARUFS_CXL_RMB(e, sizeof(*e));
+ return e;
+}
+
+/* ============================================================================
+ * sysfs entry points (sysfs.c)
+ * ============================================================================ */
+
+int marufs_sysfs_init(void);
+void marufs_sysfs_exit(void);
+int marufs_sysfs_register(struct marufs_sb_info *sbi);
+void marufs_sysfs_unregister(struct marufs_sb_info *sbi);
+
+#endif /* _MARUFS_H */
diff --git a/marufs_kernel/src/marufs_bootstrap_layout.h b/marufs_kernel/src/marufs_bootstrap_layout.h
new file mode 100644
index 0000000..a4e1770
--- /dev/null
+++ b/marufs_kernel/src/marufs_bootstrap_layout.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs_bootstrap_layout.h - Bootstrap slot area on-disk format.
+ *
+ * Bootstrap area sits at MARUFS_BOOTSTRAP_AREA_OFFSET (immediately after the 256B
+ * Global Superblock) and provides a node-election table for auto-mount.
+ * Each slot is exactly one CL (64B). Slot i corresponds to node_id (i+1).
+ *
+ * Bootstrap is ONLY for mount-time election and format gate.
+ * Runtime liveness is owned entirely by ME (not bootstrap heartbeat).
+ *
+ * Slot write order (publication gate):
+ * magic → random_token → status
+ *
+ * Status field is the publication gate: peers treat a slot as claimed only
+ * when status != EMPTY AND random_token != 0.
+ */
+
+#ifndef _MARUFS_BOOTSTRAP_LAYOUT_H
+#define _MARUFS_BOOTSTRAP_LAYOUT_H
+
+#include
+
+/*
+ * Bootstrap slot status values.
+ * EMPTY = 0 is ABI-pinned: zero-init via memset must equal EMPTY.
+ *
+ * Reuse rule:
+ * - EMPTY slots are always claimable.
+ * - CLAIMED slots whose ME node is dead (me_node_is_dead) are claimable.
+ * - FORMATTING slots are NOT claimable (format in progress on slot[0]).
+ * Stuck-formatter steal handles recovery via timeout + token overwrite.
+ * Graceful umount writes EMPTY directly.
+ */
+enum marufs_bootstrap_status {
+ MARUFS_BS_EMPTY = 0, /* available for new claim */
+ MARUFS_BS_CLAIMED = 1, /* node mounted and active */
+ MARUFS_BS_FORMATTING = 2, /* slot[0] only: format in progress */
+};
+
+/*
+ * Bootstrap slot — exactly 64B (one CL).
+ *
+ * 0: magic — MARUFS_BOOTSTRAP_MAGIC when slot is in use
+ * 4: status — enum marufs_bootstrap_status (publication gate)
+ * 8: random_token — per-claim nonce for race verification (0 reserved)
+ * 16..63: reserved (former heartbeat + claim_ts folded here)
+ */
+struct marufs_bootstrap_slot {
+ __le32 magic; /* 0 */
+ __le32 status; /* 4 */
+ __le64 random_token; /* 8 */
+ __u8 reserved[48]; /* 16..63 */
+} __attribute__((packed));
+
+#endif /* _MARUFS_BOOTSTRAP_LAYOUT_H */
diff --git a/marufs_kernel/src/marufs_endian.h b/marufs_kernel/src/marufs_endian.h
new file mode 100644
index 0000000..4ece596
--- /dev/null
+++ b/marufs_kernel/src/marufs_endian.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs_endian.h - Endian + CXL barrier + CAS primitives.
+ *
+ * CXL shared memory needs both endian conversion (on-disk = little-endian)
+ * and compiler/cache barriers for cross-host visibility. These macros are
+ * used by every layer that touches CXL fields, so they live in their own
+ * header to keep includes precise.
+ */
+
+#ifndef _MARUFS_ENDIAN_H
+#define _MARUFS_ENDIAN_H
+
+#include
+
+/* Read little-endian field with compiler barrier (DRAM / general use) */
+#define READ_LE32(field) le32_to_cpu(READ_ONCE(field))
+#define READ_LE64(field) le64_to_cpu(READ_ONCE(field))
+#define READ_LE16(field) le16_to_cpu(READ_ONCE(field))
+
+/* Read little-endian field from CXL memory (caller must issue RMB first) */
+#define READ_CXL_LE32(field) le32_to_cpu(field)
+#define READ_CXL_LE64(field) le64_to_cpu(field)
+#define READ_CXL_LE16(field) le16_to_cpu(field)
+
+/* Write little-endian field with compiler barrier */
+#define WRITE_LE32(field, val) WRITE_ONCE(field, cpu_to_le32(val))
+#define WRITE_LE64(field, val) WRITE_ONCE(field, cpu_to_le64(val))
+#define WRITE_LE16(field, val) WRITE_ONCE(field, cpu_to_le16(val))
+
+/*
+ * CXL multi-node memory barriers
+ *
+ * CXL 3.0: hardware cache coherence guaranteed across hosts.
+ * Standard wmb()/rmb() suffice for cross-node visibility.
+ * CXL 2.0: no cross-host coherence in spec.
+ * Explicit clwb/clflushopt required to flush/invalidate so
+ * writes reach CXL memory and reads fetch fresh data.
+ *
+ * MARUFS_CXL_WMB(addr, len) - Call after writing CXL fields.
+ * MARUFS_CXL_RMB(addr, len) - Call before reading CXL fields.
+ *
+ * Enable CXL 2.0 mode via -DCONFIG_MARUFS_CXL2_COMPAT.
+ */
+#ifdef CONFIG_MARUFS_CXL2_COMPAT
+
+#include
+
+#define MARUFS_CXL_WMB(addr, len) \
+ do { \
+ wmb(); \
+ arch_wb_cache_pmem((void *)(addr), (len)); \
+ } while (0)
+
+#define MARUFS_CXL_RMB(addr, len) \
+ do { \
+ arch_invalidate_pmem((void *)(addr), (len)); \
+ rmb(); \
+ } while (0)
+
+#else /* CXL 3.0: hardware coherence guaranteed across hosts */
+
+#define MARUFS_CXL_WMB(addr, len) wmb()
+#define MARUFS_CXL_RMB(addr, len) rmb()
+
+#endif /* CONFIG_MARUFS_CXL2_COMPAT */
+
+/*
+ * marufs_le{16,32,64}_cas - Atomic compare-and-swap on little-endian CXL
+ * fields. Returns old value in CPU byte order. On success, flushes write
+ * via WMB for cross-node visibility (CXL 2.0). On CXL 2.0 also performs
+ * post-CAS verification: invalidate cacheline + re-read. If a peer
+ * overwrote between our cmpxchg and flush, report mismatch as CAS
+ * failure with the actual current value. CXL 3.0 skips verification.
+ */
+static inline u16 marufs_le16_cas(__le16 *ptr, u16 old, u16 new)
+{
+ u16 exp = cpu_to_le16(old);
+ u16 ret = cmpxchg((u16 *)ptr, exp, cpu_to_le16(new));
+
+ if (ret == exp) {
+ MARUFS_CXL_WMB(ptr, sizeof(*ptr));
+#ifdef CONFIG_MARUFS_CXL2_COMPAT
+ MARUFS_CXL_RMB(ptr, sizeof(*ptr));
+ if (unlikely(READ_CXL_LE16(*ptr) != new))
+ return READ_CXL_LE16(*ptr);
+#endif
+ }
+ return le16_to_cpu(ret);
+}
+
+static inline u32 marufs_le32_cas(__le32 *ptr, u32 old, u32 new)
+{
+ u32 exp = cpu_to_le32(old);
+ u32 ret = cmpxchg((u32 *)ptr, exp, cpu_to_le32(new));
+
+ if (ret == exp) {
+ MARUFS_CXL_WMB(ptr, sizeof(*ptr));
+#ifdef CONFIG_MARUFS_CXL2_COMPAT
+ MARUFS_CXL_RMB(ptr, sizeof(*ptr));
+ if (unlikely(READ_CXL_LE32(*ptr) != new))
+ return READ_CXL_LE32(*ptr);
+#endif
+ }
+ return le32_to_cpu(ret);
+}
+
+static inline u64 marufs_le64_cas(__le64 *ptr, u64 old, u64 new)
+{
+ u64 exp = cpu_to_le64(old);
+ u64 ret = cmpxchg((u64 *)ptr, exp, cpu_to_le64(new));
+
+ if (ret == exp) {
+ MARUFS_CXL_WMB(ptr, sizeof(*ptr));
+#ifdef CONFIG_MARUFS_CXL2_COMPAT
+ MARUFS_CXL_RMB(ptr, sizeof(*ptr));
+ if (unlikely(READ_CXL_LE64(*ptr) != new))
+ return READ_CXL_LE64(*ptr);
+#endif
+ }
+ return le64_to_cpu(ret);
+}
+
+/*
+ * marufs_le{16,32}_cas_inc/dec - CAS-based atomic increment/decrement.
+ * Safe for concurrent multi-node access. Underflow/overflow guarded.
+ */
+static inline void marufs_le16_cas_inc(__le16 *p)
+{
+ u16 old_val;
+ do {
+ old_val = READ_CXL_LE16(*p);
+ if (old_val == U16_MAX)
+ return; /* overflow guard */
+ } while (marufs_le16_cas(p, old_val, old_val + 1) != old_val);
+}
+
+static inline void marufs_le16_cas_dec(__le16 *p)
+{
+ u16 old_val;
+ do {
+ old_val = READ_CXL_LE16(*p);
+ if (old_val == 0)
+ return; /* underflow guard */
+ } while (marufs_le16_cas(p, old_val, old_val - 1) != old_val);
+}
+
+static inline void marufs_le32_cas_inc(__le32 *p)
+{
+ u32 old_val;
+ do {
+ old_val = READ_CXL_LE32(*p);
+ if (old_val == U32_MAX)
+ return; /* overflow guard */
+ } while (marufs_le32_cas(p, old_val, old_val + 1) != old_val);
+}
+
+static inline void marufs_le32_cas_dec(__le32 *p)
+{
+ u32 old_val;
+ do {
+ old_val = READ_CXL_LE32(*p);
+ if (old_val == 0)
+ return; /* underflow guard */
+ } while (marufs_le32_cas(p, old_val, old_val - 1) != old_val);
+}
+
+#endif /* _MARUFS_ENDIAN_H */
diff --git a/marufs_kernel/src/marufs_hash.h b/marufs_kernel/src/marufs_hash.h
new file mode 100644
index 0000000..5f12a9f
--- /dev/null
+++ b/marufs_kernel/src/marufs_hash.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs_hash.h - Hash + index helpers (orthogonal to on-disk layout).
+ *
+ * Filename hashing, shard/bucket selection, VFS ino synthesis, and
+ * power-of-2 alignment. Stand-alone — no dependencies on layout structs.
+ */
+
+#ifndef _MARUFS_HASH_H
+#define _MARUFS_HASH_H
+
+#include
+#include
+#include
+
+/*
+ * marufs_shard_idx - select shard from 64-bit name hash.
+ * Uses upper 16 bits (bits 63..48) masked with shard_mask.
+ * @name_hash: 64-bit hash of filename
+ * @shard_mask: num_shards - 1 (num_shards must be power of 2)
+ */
+static inline u32 marufs_shard_idx(u64 name_hash, u32 shard_mask)
+{
+ return (u32)((name_hash >> 48) & shard_mask);
+}
+
+/*
+ * marufs_bucket_idx - select bucket within shard.
+ * Uses bits 47..32 of hash masked with bucket_mask.
+ * @name_hash: 64-bit hash of filename
+ * @bucket_mask: num_buckets - 1 (num_buckets must be power of 2)
+ */
+static inline u32 marufs_bucket_idx(u64 name_hash, u32 bucket_mask)
+{
+ return (u32)((name_hash >> 32) & bucket_mask);
+}
+
+/*
+ * marufs_make_ino - synthesize VFS inode number from region_id.
+ * +2 to skip ino 0 (null/invalid) and ino 1 (root directory).
+ */
+static inline unsigned long marufs_make_ino(u32 region_id)
+{
+ return (unsigned long)region_id + 2;
+}
+
+/* Extract region_id from VFS inode number. */
+static inline u32 marufs_ino_to_region(unsigned long ino)
+{
+ return (u32)(ino - 2);
+}
+
+/* Align @val up to next @align boundary. @align must be power of 2. */
+static inline u64 marufs_align_up(u64 val, u64 align)
+{
+ return (val + align - 1) & ~(align - 1);
+}
+
+/*
+ * marufs_hash_name - SHA-256 truncated hash for filename.
+ * Upper bits used for shard selection, middle bits for bucket index.
+ */
+static inline u64 marufs_hash_name(const char *name, size_t len)
+{
+ u8 digest[SHA256_DIGEST_SIZE];
+ sha256((const u8 *)name, len, digest);
+ return get_unaligned_le64(digest);
+}
+
+#endif /* _MARUFS_HASH_H */
diff --git a/marufs_kernel/src/marufs_index_layout.h b/marufs_kernel/src/marufs_index_layout.h
new file mode 100644
index 0000000..8b19b93
--- /dev/null
+++ b/marufs_kernel/src/marufs_index_layout.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs_index_layout.h - Global hash index on-disk format.
+ *
+ * Shard table headers and per-bucket-chain index entries.
+ * Lock-free CAS protocol on `state` field drives insert/delete.
+ */
+
+#ifndef _MARUFS_INDEX_LAYOUT_H
+#define _MARUFS_INDEX_LAYOUT_H
+
+#include
+
+/* ── Index entry state (CAS target) ──────────────────────────────── */
+enum marufs_entry_state {
+ MARUFS_ENTRY_EMPTY = 0,
+ MARUFS_ENTRY_INSERTING = 1,
+ MARUFS_ENTRY_TENTATIVE = 2,
+ MARUFS_ENTRY_VALID = 3,
+ MARUFS_ENTRY_TOMBSTONE = 4,
+};
+
+/* ── Region index defaults ─────────────────────────────────────────── */
+enum marufs_region_config {
+ MARUFS_REGION_NUM_SHARDS = 4,
+ MARUFS_REGION_BUCKETS_PER_SHARD = 256,
+ MARUFS_REGION_ENTRIES_PER_SHARD = 256,
+};
+
+/* ── Sentinels / Sizes ─────────────────────────────────────────────── */
+#define MARUFS_BUCKET_END 0xFFFFFFFF
+
+enum {
+ MARUFS_SHARD_HEADER_SIZE = 64,
+ MARUFS_INDEX_ENTRY_SIZE = 64,
+};
+
+/*
+ * Shard Header — 64 bytes each, stored in shard table.
+ * One per shard; describes bucket and entry-array layout within the
+ * global index pool.
+ */
+struct marufs_shard_header {
+ __le32 magic; /* MARUFS_SHARD_MAGIC */
+ __le32 shard_id; /* Shard index [0..num_shards) */
+ __le32 num_buckets; /* Hash bucket count (pow2) */
+ __le32 num_entries; /* Max entries in this shard */
+ __le64 bucket_array_offset; /* Absolute offset in device */
+ __le64 entry_array_offset; /* Index entries (64B each) */
+ __u8 reserved[32]; /* Padding to 64 */
+} __attribute__((packed)); /* Total: 64 bytes */
+
+/*
+ * Index Entry — 64 bytes (1 CL).
+ * Chain linkage + region_id for hash-based file lookup.
+ * Name, file_size, uid/gid/mode, timestamps are all in RAT entry
+ * (single source of truth).
+ */
+struct marufs_index_entry {
+ __le32 state; /* 0: CAS target (EMPTY/INSERTING/TENTATIVE/VALID/TOMBSTONE) */
+ __le32 next_in_bucket; /* 4: hash chain link (MARUFS_BUCKET_END = end) */
+ __le64 name_hash; /* 8: 64-bit SHA-256 truncated hash */
+ __le32 region_id; /* 16: RAT entry ID */
+ __le32 node_id; /* 20: inserter node (stale INSERTING detection) */
+ __le64 created_at; /* 24: ns since epoch (stale INSERTING detection) */
+ __u8 reserved[32]; /* Padding to 64 */
+} __attribute__((packed)); /* Total: 64 bytes */
+
+#endif /* _MARUFS_INDEX_LAYOUT_H */
diff --git a/marufs_kernel/src/marufs_layout.h b/marufs_kernel/src/marufs_layout.h
new file mode 100644
index 0000000..858153d
--- /dev/null
+++ b/marufs_kernel/src/marufs_layout.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs_layout.h - MARUFS on-disk layout (umbrella header).
+ *
+ * CAS-based lock-free distributed model: all nodes can concurrently
+ * read/write the global index. No GCS (Global Chunk Server) needed.
+ *
+ * This header collects the master layout description: magic numbers,
+ * ME area sizing, the global offset table, and compile-time size
+ * validators. Per-domain on-disk structs are defined in dedicated
+ * headers (superblock/index/rat/nrht), each focused on one subsystem.
+ *
+ * CXL Memory Layout (packed; only regions are 2MB-aligned):
+ *
+ * ┌──────────────────────────────────────────────┐
+ * │ Superblock (256B) │ 0x00000
+ * ├──────────────────────────────────────────────┤
+ * │ Shard Table (4 × 64B = 256B) │ 0x00100
+ * ├──────────────────────────────────────────────┤
+ * │ Bucket Arrays (4 × 256 × 4B = 4KB) │ 0x00200
+ * ├──────────────────────────────────────────────┤
+ * │ Entry Arrays (4 × 256 × 64B = 64KB) │ 0x01200
+ * ├──────────────────────────────────────────────┤
+ * │ RAT (hdr + 256 × 2KB ≈ 512KB) │ 0x11200
+ * ├──────────────────────────────────────────────┤
+ * │ ME Area (hdr + CBs + membership ≈ 4KB) │ dynamic (gsb->me_area_offset)
+ * ├──────────── 2MB aligned ─────────────────────┤
+ * │ Region 0 data, Region 1 data, ... │ 0x200000
+ * └──────────────────────────────────────────────┘
+ */
+
+#ifndef _MARUFS_LAYOUT_H
+#define _MARUFS_LAYOUT_H
+
+#include
+
+#include "marufs_uapi.h"
+#include "marufs_endian.h"
+#include "marufs_hash.h"
+
+#include "marufs_superblock_layout.h"
+#include "marufs_bootstrap_layout.h"
+#include "marufs_index_layout.h"
+#include "marufs_rat_layout.h"
+#include "marufs_nrht_layout.h"
+
+/* ── Magic Numbers / Version ───────────────────────────────────────── */
+enum marufs_magic {
+ MARUFS_MAGIC = 0x4D415255, /* "MARU" */
+ MARUFS_BOOTSTRAP_MAGIC = 0x4D425453, /* "MBTS" */
+ MARUFS_SHARD_MAGIC = 0x4D534844, /* "MSHD" */
+ MARUFS_RAT_MAGIC = 0x4D524154, /* "MRAT" */
+ MARUFS_NRHT_MAGIC = 0x4E524854, /* "NRHT" */
+ MARUFS_ME_MAGIC = 0x4D454C4B, /* "MELK" */
+
+ MARUFS_VERSION = 1,
+ MARUFS_NRHT_VERSION = 1,
+ MARUFS_ME_VERSION = 1,
+};
+
+/* ── ME area sizing (referenced by layout offsets and ME init) ────── */
+enum {
+ MARUFS_ME_HEADER_SIZE = 64, /* ME area header (1 CL) */
+ MARUFS_ME_CB_SIZE = 64, /* ME control block (1 CL) */
+ MARUFS_ME_MEMBERSHIP_SLOT_SIZE = 64, /* ME membership slot (1 CL) */
+ MARUFS_ME_SLOT_SIZE = 64, /* ME per-(shard, node) slot (1 CL) */
+};
+
+/* Bootstrap area constants */
+enum {
+ MARUFS_BOOTSTRAP_SLOT_SIZE = 64, /* 1 CL */
+ MARUFS_BOOTSTRAP_MAX_SLOTS = MARUFS_MAX_NODE_ID,
+ MARUFS_BOOTSTRAP_AREA_SIZE = MARUFS_BOOTSTRAP_SLOT_SIZE *
+ MARUFS_BOOTSTRAP_MAX_SLOTS, /* 512B */
+};
+
+/*
+ * marufs_me_area_size - compute total ME area size in bytes.
+ * Layout: [Header 64B] [CB S×64B] [Membership N×64B] [Slot S×N×64B]
+ *
+ * Slot region is allocated for BOTH strategies:
+ * - order-driven: per-node doorbell (token_seq)
+ * - request-driven: per-node hand-raise + doorbell
+ */
+static inline u64 marufs_me_area_size(u32 num_shards, u32 max_nodes)
+{
+ u64 size = MARUFS_ME_HEADER_SIZE;
+
+ size += (u64)num_shards * MARUFS_ME_CB_SIZE;
+ size += (u64)max_nodes * MARUFS_ME_MEMBERSHIP_SLOT_SIZE;
+ size += (u64)num_shards * max_nodes * MARUFS_ME_SLOT_SIZE;
+ return size;
+}
+
+/* ── Global offset table ──────────────────────────────────────────── */
+enum marufs_layout {
+ MARUFS_ALIGN_2MB = 2 * 1024 * 1024,
+
+ MARUFS_GSB_OFFSET = 0,
+ /* Bootstrap area immediately follows the GSB (512B = 8 × 64B slots) */
+ MARUFS_BOOTSTRAP_AREA_OFFSET = MARUFS_GSB_SIZE,
+ /* Shard table starts after GSB + bootstrap area */
+ MARUFS_SHARD_TABLE_OFFSET =
+ MARUFS_BOOTSTRAP_AREA_OFFSET + MARUFS_BOOTSTRAP_AREA_SIZE,
+ MARUFS_INDEX_BUCKET_OFFSET =
+ MARUFS_SHARD_TABLE_OFFSET +
+ MARUFS_REGION_NUM_SHARDS * MARUFS_SHARD_HEADER_SIZE,
+ MARUFS_INDEX_ENTRY_OFFSET =
+ MARUFS_INDEX_BUCKET_OFFSET +
+ MARUFS_REGION_NUM_SHARDS * MARUFS_REGION_BUCKETS_PER_SHARD * 4,
+ MARUFS_RAT_OFFSET = MARUFS_INDEX_ENTRY_OFFSET +
+ MARUFS_REGION_NUM_SHARDS *
+ MARUFS_REGION_ENTRIES_PER_SHARD *
+ MARUFS_INDEX_ENTRY_SIZE,
+ MARUFS_ME_AREA_OFFSET = MARUFS_RAT_OFFSET + MARUFS_RAT_HEADER_SIZE +
+ MARUFS_MAX_RAT_ENTRIES * MARUFS_RAT_ENTRY_SIZE,
+ MARUFS_REGION_OFFSET = MARUFS_ALIGN_2MB,
+};
+
+/* ── Compile-time size validation ────────────────────────────────── */
+
+#define MARUFS_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))
+
+static inline void __marufs_verify_structs(void)
+{
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_superblock) !=
+ MARUFS_GSB_SIZE);
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_bootstrap_slot) !=
+ MARUFS_BOOTSTRAP_SLOT_SIZE);
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_shard_header) !=
+ MARUFS_SHARD_HEADER_SIZE);
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_index_entry) !=
+ MARUFS_INDEX_ENTRY_SIZE);
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_deleg_entry) !=
+ MARUFS_DELEG_ENTRY_SIZE);
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_rat_entry) !=
+ MARUFS_RAT_ENTRY_SIZE);
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_nrht_header) !=
+ MARUFS_NRHT_HEADER_SIZE);
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_nrht_shard_header) !=
+ MARUFS_NRHT_SHARD_HEADER_SIZE);
+ MARUFS_BUILD_BUG_ON(sizeof(struct marufs_nrht_entry) !=
+ MARUFS_NRHT_ENTRY_SIZE);
+}
+
+#endif /* _MARUFS_LAYOUT_H */
diff --git a/marufs_kernel/src/marufs_nrht_layout.h b/marufs_kernel/src/marufs_nrht_layout.h
new file mode 100644
index 0000000..f7fdb34
--- /dev/null
+++ b/marufs_kernel/src/marufs_nrht_layout.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs_nrht_layout.h - Independent Name-Ref Hash Table on-disk format.
+ *
+ * Per-region hash table mapping (name, name_hash) -> (target_region_id,
+ * offset). Header + per-shard headers + bucket arrays + entry arrays
+ * all live within the region's data area.
+ */
+
+#ifndef _MARUFS_NRHT_LAYOUT_H
+#define _MARUFS_NRHT_LAYOUT_H
+
+#include
+
+#include "marufs_uapi.h" /* MARUFS_NAME_MAX */
+
+/* ── NRHT defaults ────────────────────────────────────────────────── */
+enum marufs_nrht_config {
+ MARUFS_NRHT_DEFAULT_NUM_SHARDS = 64,
+ MARUFS_NRHT_MAX_NUM_SHARDS = 64,
+ MARUFS_NRHT_DEFAULT_ENTRIES = 8192,
+ MARUFS_NRHT_MAX_ENTRIES = 8192,
+ MARUFS_NRHT_DEFAULT_LOAD_FACTOR = 4, /* entries per bucket */
+};
+
+/* ── Sizes ────────────────────────────────────────────────────────── */
+enum {
+ MARUFS_NRHT_HEADER_SIZE = 64, /* 1 CL */
+ MARUFS_NRHT_SHARD_HEADER_SIZE = 64, /* 1 CL */
+ MARUFS_NRHT_ENTRY_SIZE = 128, /* 2 CL */
+};
+
+/*
+ * NRHT Header — first 64 bytes (1 CL) of an NRHT file.
+ * Describes the entire hash table layout within the region's data area.
+ */
+struct marufs_nrht_header {
+ __le32 magic; /* 0: MARUFS_NRHT_MAGIC (0x4E524854) */
+ __le32 version; /* 4: format version (1) */
+ __le32 num_shards; /* 8: shard count (power of 2) */
+ __le32 buckets_per_shard; /* 12: buckets per shard (power of 2) */
+ __le32 entries_per_shard; /* 16: max entries per shard */
+ __le32 owner_region_id; /* 20: RAT entry ID of this NRHT file */
+ __le64 table_size; /* 24: total NRHT allocation size (bytes) */
+ __u8 reserved[32]; /* 32: padding to 64B */
+} __attribute__((packed)); /* Total: 64 bytes */
+
+/*
+ * NRHT Shard Header — 64 bytes each, stored after NRHT header.
+ * Contains per-shard geometry and absolute device offsets for bucket
+ * and entry arrays.
+ */
+struct marufs_nrht_shard_header {
+ __le32 num_entries; /* 0: max entries in this shard */
+ __le32 num_buckets; /* 4: bucket count in this shard */
+ __le64 bucket_array_offset; /* 8: absolute offset in device */
+ __le64 entry_array_offset; /* 16: absolute offset in device */
+ __le32 free_hint; /* 24: flat scan start hint (best-effort, no CAS) */
+ __u8 reserved[36]; /* 28: padding to 64B */
+} __attribute__((packed)); /* Total: 64 bytes */
+
+/*
+ * NRHT Entry — 128 bytes (2 CL).
+ * CL0: hot path — accessed on every chain walk.
+ * CL1: cold path — accessed only on hash match for name verification.
+ * CPU only fetches CLs actually accessed, so chain walks that miss on
+ * hash still touch only 1 CL per hop.
+ */
+struct marufs_nrht_entry {
+ /* ── CL0: hot path (bytes 0–63) ────────────────────────────── */
+ __le32 state; /* 0: EMPTY(0) / INSERTING(1) / TENTATIVE(2) / VALID(3) / TOMBSTONE(4) */
+ __le32 next_in_bucket; /* 4: chain link (BUCKET_END = 0xFFFFFFFF) */
+ __le64 name_hash; /* 8: 64-bit SHA-256 truncated hash */
+ __le64 offset; /* 16: offset within target region's data area */
+ __le32 target_region_id; /* 24: which region this offset refers to */
+ __le32 inserter_node; /* 28: node_id that created this entry */
+ __le64 created_at; /* 32: ns since epoch (stale INSERTING detection) */
+ __le32 ref_count; /* 40: user-managed reference count (REF_INC/DEC ioctls) */
+ __le32 pin_count; /* 44: user-managed pin count (PIN_INC/DEC ioctls) */
+ __u8 reserved0[16]; /* 48: padding to 64B CL boundary */
+
+ /* ── CL1: cold path (bytes 64–127) ─────────────────────────── */
+ char name[MARUFS_NAME_MAX + 1]; /* 64: null-terminated name (64B) */
+} __attribute__((packed)); /* Total: 128 bytes */
+
+#endif /* _MARUFS_NRHT_LAYOUT_H */
diff --git a/marufs_kernel/src/marufs_rat_layout.h b/marufs_kernel/src/marufs_rat_layout.h
new file mode 100644
index 0000000..28d80b2
--- /dev/null
+++ b/marufs_kernel/src/marufs_rat_layout.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs_rat_layout.h - Region Allocation Table on-disk format.
+ *
+ * RAT entries embed the per-region delegation table directly (no separate
+ * region header pool). One RAT slot tracks one region file.
+ */
+
+#ifndef _MARUFS_RAT_LAYOUT_H
+#define _MARUFS_RAT_LAYOUT_H
+
+#include
+
+#include "marufs_uapi.h" /* MARUFS_NAME_MAX, MARUFS_DELEG_MAX */
+
+/* ── Region type (stored in RAT entry CL0) ───────────────────────── */
+enum marufs_region_type {
+ MARUFS_REGION_DATA = 0, /* normal data region (default) */
+ MARUFS_REGION_NRHT = 1, /* NRHT name-ref hash table */
+};
+
+/* ── RAT entry state ──────────────────────────────────────────────── */
+enum marufs_rat_entry_state {
+ MARUFS_RAT_ENTRY_FREE = 0,
+ MARUFS_RAT_ENTRY_ALLOCATING = 1,
+ MARUFS_RAT_ENTRY_ALLOCATED = 2,
+ MARUFS_RAT_ENTRY_DELETING = 3,
+};
+
+/* ── Delegation entry state ───────────────────────────────────────── */
+enum marufs_deleg_state {
+ MARUFS_DELEG_EMPTY = 0,
+ MARUFS_DELEG_GRANTING = 1,
+ MARUFS_DELEG_ACTIVE = 2,
+};
+
+#define MARUFS_DELEG_MAX_ENTRIES MARUFS_DELEG_MAX /* from uapi */
+
+/* ── Capacity ─────────────────────────────────────────────────────── */
+enum {
+ MARUFS_MAX_REGIONS = 256, /* max files */
+ MARUFS_MAX_RAT_ENTRIES = 256,
+};
+
+/* ── Sizes ────────────────────────────────────────────────────────── */
+enum {
+ MARUFS_DELEG_ENTRY_SIZE = 64,
+ MARUFS_RAT_ENTRY_SIZE = 2048, /* 32 CL = 2KB */
+ MARUFS_RAT_HEADER_SIZE = 128,
+};
+
+/*
+ * Delegation Entry — 64 bytes each, stored in region header delegation table.
+ * Each entry grants specific permissions to a (node_id, pid) pair plus an
+ * exe-binary identity (post-exec privilege retention defense).
+ */
+struct marufs_deleg_entry {
+ __le32 state; /* MARUFS_DELEG_EMPTY(0) / MARUFS_DELEG_GRANTING(1) / MARUFS_DELEG_ACTIVE(2) */
+ __le32 node_id; /* Target node (0 = any node) */
+ __le32 pid; /* Target PID (0 = all processes on node) */
+ __le32 perms; /* Permission bitmask (MARUFS_PERM_*) */
+ __le64 birth_time; /* PID reuse protection (0 if pid=0) */
+ __le64 granted_at; /* Grant timestamp (ns since epoch) */
+ __le64 exe_inode_ino; /* exe binary inode # (0 = lazy-init on first match) */
+ __le32 exe_inode_dev; /* exe binary fs dev (huge_encode_dev) */
+ __u8 reserved[20]; /* Padding to 64 */
+} __attribute__((packed)); /* Total: 64 bytes */
+
+/*
+ * Region Allocation Entry - tracks variable-sized region files.
+ * Each entry describes one region file with physically contiguous allocation.
+ * Size: 2048 bytes (32 cache lines, 2KB).
+ *
+ * Two-phase lifecycle:
+ * open(O_CREAT): state=ALLOCATED, phys_offset=0, size=0 (reservation)
+ * ftruncate(N): phys_offset and size filled in (physical allocation)
+ */
+struct marufs_rat_entry {
+ /* ── CL0: Hot I/O metadata (64B) ────────────────────────────── */
+ __le32 state; /* lifecycle (FREE/ALLOCATING/ALLOCATED/DELETING) */
+ __le32 region_type; /* MARUFS_REGION_DATA(0) / MARUFS_REGION_NRHT(1) */
+ __le64 phys_offset; /* data area offset (0 = not yet allocated) */
+ __le64 size; /* region size in bytes (2MB aligned) */
+ __le64 alloc_time; /* allocation timestamp (ns) */
+ __le64 modified_at; /* last modification (ns) */
+ __u8 reserved0[24];
+
+ /* ── CL1: Name (64B) — cold, read on hash match only ────────── */
+ char name[MARUFS_NAME_MAX + 1]; /* null-terminated filename */
+
+ /* ── CL2: ACL — ownership + perms + delegation header (64B) ── */
+ __le16 default_perms; /* default non-owner perms (MARUFS_PERM_*) */
+ __le16 owner_node_id; /* node ownership (max 64) */
+ __le32 owner_pid; /* process ownership */
+ __le64 owner_birth_time; /* PID reuse protection */
+ __le64 owner_exe_inode_ino; /* owner exe binary inode # (post-exec retention defense) */
+ __le32 owner_exe_inode_dev; /* owner exe binary fs dev */
+ __le32 uid; /* POSIX owner UID */
+ __le32 gid; /* POSIX owner GID */
+ __le16 mode; /* POSIX mode bits */
+ __le16 deleg_num_entries; /* active delegation count (max 29) */
+ __u8 reserved2[24];
+
+ /* ── CL3-CL31: Delegation entries (29 × 64B = 1856B) ────────── */
+ struct marufs_deleg_entry deleg_entries[MARUFS_DELEG_MAX_ENTRIES];
+} __attribute__((packed)); /* Total: 2048B (32 CL = 2KB) */
+
+/*
+ * Region Allocation Table - global allocator for variable-sized regions.
+ * Stored after index pool, before first region.
+ * Size: 4KB header + (256 × 2KB entries) = 516KB total.
+ */
+struct marufs_rat {
+ /* Header */
+ __le32 magic; /* MARUFS_RAT_MAGIC */
+ __le32 version; /* 1 */
+ __le32 num_entries; /* Number of allocated entries */
+ __le32 max_entries; /* MARUFS_MAX_RAT_ENTRIES */
+
+ /* Device info */
+ __le64 device_size; /* Total device size */
+ __le64 rat_offset; /* Offset of this RAT */
+ __le64 regions_start; /* Where region files start */
+
+ /* Allocation stats */
+ __le64 total_allocated; /* Total allocated bytes */
+ __le64 total_free; /* Remaining free bytes */
+
+ /* Reserved */
+ __u8 reserved[72]; /* Padding to 4KB */
+
+ /* Entry array follows immediately */
+ struct marufs_rat_entry entries[MARUFS_MAX_RAT_ENTRIES];
+} __attribute__((packed));
+
+#endif /* _MARUFS_RAT_LAYOUT_H */
diff --git a/marufs_kernel/src/marufs_superblock_layout.h b/marufs_kernel/src/marufs_superblock_layout.h
new file mode 100644
index 0000000..e40740c
--- /dev/null
+++ b/marufs_kernel/src/marufs_superblock_layout.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * marufs_superblock_layout.h - Global Superblock on-disk format.
+ */
+
+#ifndef _MARUFS_SUPERBLOCK_LAYOUT_H
+#define _MARUFS_SUPERBLOCK_LAYOUT_H
+
+#include
+
+enum {
+ MARUFS_GSB_SIZE = 256, /* Global superblock total size */
+};
+
+/*
+ * Global Superblock — first 256 bytes of CXL memory.
+ * Single instance; describes entire partitioned index layout.
+ */
+struct marufs_superblock {
+ /* ── CL0: identity + layout + integrity (bytes 0–63) ────────── */
+ __le32 magic; /* 0: MARUFS_MAGIC */
+ __le32 uuid; /* 4: Filesystem UUID (per-format random) */
+ __le32 version; /* 8: 1 */
+ __le32 _pad0; /* 12: align total_size to 8B */
+ __le64 total_size; /* 16: Total CXL memory size */
+ __le64 shard_table_offset; /* 24: Shard header array offset */
+ __le64 rat_offset; /* 32: RAT (Region Allocation Table) offset */
+ __le32 num_shards; /* 40: Number of shards (pow2) */
+ __le32 buckets_per_shard; /* 44: buckets per shard */
+ __le32 entries_per_shard; /* 48: Index entries per shard */
+ __le32 checksum; /* 52: CRC32 */
+ __le64 me_area_offset; /* 56: ME area offset (0 = ME disabled) */
+
+ /* ── CL1–CL3: reserved (bytes 64–255) ────────────────────────── */
+ __u8 reserved[192]; /* Padding to 256 */
+} __attribute__((packed));
+
+#endif /* _MARUFS_SUPERBLOCK_LAYOUT_H */
diff --git a/marufs_kernel/src/me.c b/marufs_kernel/src/me.c
new file mode 100644
index 0000000..478d0f8
--- /dev/null
+++ b/marufs_kernel/src/me.c
@@ -0,0 +1,703 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * me.c - MARUFS Cross-node Mutual Exclusion common infrastructure
+ *
+ * Provides: instance lifecycle, poll thread, membership scan helper.
+ * Strategy-specific logic lives in me_order.c / me_request.c.
+ */
+
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "me_stats.h"
+#include "me.h"
+
+/*
+ * marufs_me_hold - intra-node serialize, then mark CS active.
+ * Bump `holding` BEFORE dropping local_waiters: else a poll cycle could
+ * see (holding=0, waiters=0) and grant the token away mid-acquire.
+ */
+void marufs_me_hold(struct marufs_me_instance *me, u32 shard_id)
+{
+ struct marufs_me_shard *sh = me_shard_get(me, shard_id);
+ atomic_inc(&sh->local_waiters);
+ mutex_lock(&sh->local_lock);
+ atomic_inc(&sh->holding);
+ smp_mb();
+ atomic_dec(&sh->local_waiters);
+ me_stats_lock_acquired(sh);
+ me_stats_bump_shard_acquire(me, shard_id);
+}
+
+/* Pair with marufs_me_hold. WMB in me_shard_unhold publishes CXL writes
+ * before the dec, so next holder sees fully published state. */
+void marufs_me_unhold(struct marufs_me_instance *me, u32 shard_id)
+{
+ struct marufs_me_shard *sh = me_shard_get(me, shard_id);
+ me_shard_unhold(sh);
+ me_stats_lock_released(me, sh);
+ mutex_unlock(&sh->local_lock);
+}
+
+/*
+ * me_membership_{set,clear}_pending - flip bit @shard_id in own
+ * pending_shards_mask. Sole cross-node writer; intra-node threads race on
+ * different bits of same word — CAS serializes.
+ *
+ * set_pending must run AFTER slot.requesting WMB so holders observing the
+ * bit see a fully-written request slot.
+ *
+ * Bounded retry + WARN_ONCE: release-path full scan covers a missing
+ * set-bit; a stale set-bit costs one wasted self-skipped poll RMB.
+ */
+#define MARUFS_ME_PENDING_CAS_RETRIES 64
+
+void marufs_me_membership_set_pending(struct marufs_me_instance *me,
+ u32 shard_id)
+{
+ struct marufs_me_membership_slot *ms = me_my_membership_get(me);
+ u64 bit = 1ULL << shard_id;
+
+ for (int r = 0; r < MARUFS_ME_PENDING_CAS_RETRIES; r++) {
+ u64 old = READ_CXL_LE64(ms->pending_shards_mask);
+
+ if (old & bit)
+ return;
+ if (marufs_le64_cas(&ms->pending_shards_mask, old, old | bit) ==
+ old)
+ return;
+ cpu_relax();
+ }
+ WARN_ONCE(1, "me: pending_shards_mask set stuck (shard=%u)\n",
+ shard_id);
+}
+
+void marufs_me_membership_clear_pending(struct marufs_me_instance *me,
+ u32 shard_id)
+{
+ struct marufs_me_membership_slot *ms = me_my_membership_get(me);
+ u64 bit = 1ULL << shard_id;
+
+ for (int r = 0; r < MARUFS_ME_PENDING_CAS_RETRIES; r++) {
+ u64 old = READ_CXL_LE64(ms->pending_shards_mask);
+
+ if (!(old & bit))
+ return;
+ if (marufs_le64_cas(&ms->pending_shards_mask, old,
+ old & ~bit) == old)
+ return;
+ cpu_relax();
+ }
+ WARN_ONCE(1, "me: pending_shards_mask clear stuck (shard=%u)\n",
+ shard_id);
+}
+
+/*
+ * me_membership_tick_heartbeat - advance own heartbeat; deactivate ME if
+ * the slot pointer no longer addresses a real slot (peer reformatted area).
+ */
+void me_membership_tick_heartbeat(struct marufs_me_instance *me)
+{
+ struct marufs_me_membership_slot *ms = me_my_membership_get(me);
+ if (READ_CXL_LE32(ms->magic) != MARUFS_ME_MS_MAGIC) {
+ atomic_set(&me->active, 0);
+ return;
+ }
+ WRITE_LE64(ms->heartbeat, READ_CXL_LE64(ms->heartbeat) + 1);
+ WRITE_LE64(ms->heartbeat_ts, ktime_get_ns());
+ MARUFS_CXL_WMB(&ms->heartbeat, 16);
+}
+
+/*
+ * marufs_me_pass_token - hand off to @new_holder.
+ * Order: CB (holder+gen) → WMB → slot doorbell → WMB. Readers observing a
+ * new token_seq cross-check CB without seeing stale data.
+ * HOLDER_NONE: CB-only, no doorbell. Caller must own the token (or be in
+ * bootstrap/recovery).
+ */
+void marufs_me_pass_token(struct marufs_me_instance *me, u32 shard_id,
+ u32 new_holder)
+{
+ /* Reject invalid node_id: only HOLDER_NONE or [1, max_nodes]. */
+ if (new_holder != MARUFS_ME_HOLDER_NONE &&
+ !marufs_me_is_valid_node(me, new_holder)) {
+ WARN_ONCE(1,
+ "marufs_me: invalid new_holder=%u (max_nodes=%u)\n",
+ new_holder, me->max_nodes);
+ return;
+ }
+
+ /* Magic check: deactivate self if cb pointer no longer addresses a
+ * CB record (peer reformatted the area). */
+ struct marufs_me_cb *cb = me_cb_get(me, shard_id);
+ if (READ_CXL_LE32(cb->magic) != MARUFS_ME_CB_MAGIC) {
+ atomic_set(&me->active, 0);
+ return;
+ }
+
+ /* Ownership guard: only current holder may pass to a peer. is_holder
+ * (DRAM) can be stale, so re-verify against CB. Bypass when
+ * @new_holder == self: takeover and NONE-claim intentionally write
+ * cb=self regardless of prior holder. */
+ if (new_holder != me->node_id &&
+ READ_CXL_LE32(cb->holder) != me->node_id)
+ return;
+
+ /* Step 1: publish CB before doorbell. */
+ u64 new_gen = READ_CXL_LE64(cb->generation) + 1;
+ WRITE_LE32(cb->holder, new_holder);
+ WRITE_LE64(cb->generation, new_gen);
+ MARUFS_CXL_WMB(cb, sizeof(*cb));
+
+ /* Step 2: doorbell (skip on HOLDER_NONE — no slot to ring). */
+ struct marufs_me_shard *sh = me_shard_get(me, shard_id);
+ if (new_holder == MARUFS_ME_HOLDER_NONE) {
+ me_shard_lose_holder(sh);
+ return;
+ }
+
+ struct marufs_me_slot *slot = me_slot_of(me, shard_id, new_holder - 1);
+ if (READ_CXL_LE32(slot->magic) != MARUFS_ME_SLOT_MAGIC) {
+ atomic_set(&me->active, 0);
+ return;
+ }
+
+ u64 new_seq = READ_CXL_LE64(slot->token_seq) + 1;
+ WRITE_LE32(slot->from_node, me->node_id);
+ WRITE_LE64(slot->cb_gen_at_write, new_gen);
+ WRITE_LE64(slot->token_seq, new_seq);
+ MARUFS_CXL_WMB(slot, sizeof(*slot));
+
+ if (new_holder == me->node_id) {
+ sh->last_cb_gen = new_gen;
+ sh->last_token_seq = new_seq;
+ sh->poll_last_slot_seq = new_seq;
+ me_shard_become_holder(sh);
+ } else {
+ me_shard_lose_holder(sh);
+ }
+}
+/* ── next_active: scan membership slots for next ACTIVE node ──────── */
+
+/*
+ * marufs_me_next_active - find next ACTIVE node after @current.
+ * Scans slot[current+1..] wrapping around. O(N), N <= max_nodes.
+ * Returns @current if no other active node found.
+ */
+u32 marufs_me_next_active(struct marufs_me_instance *me, u32 from)
+{
+ /* @from is an external node_id (1..max_nodes) or HOLDER_NONE.
+ * Internal array indexing is 0-based; convert, scan, convert back.
+ * HOLDER_NONE: treat as starting before slot 0 so the loop scans
+ * every slot.
+ */
+ u32 from_idx = (from == MARUFS_ME_HOLDER_NONE) ? me->max_nodes - 1 :
+ from - 1;
+
+ for (u32 i = 1; i < me->max_nodes; i++) {
+ u32 idx = (from_idx + i) % me->max_nodes;
+ struct marufs_me_membership_slot *slot =
+ me_membership_get(me, idx);
+ if (READ_CXL_LE32(slot->status) == MARUFS_ME_ACTIVE)
+ return idx + 1; /* external node_id */
+ }
+ return from;
+}
+
+/*
+ * me_leave_successor - next ACTIVE node, or HOLDER_NONE if we're alone.
+ * Cold path (leave only) — lets later joiners NONE-claim the shard.
+ */
+u32 me_leave_successor(struct marufs_me_instance *me)
+{
+ u32 succ = marufs_me_next_active(me, me->node_id);
+ return (succ == me->node_id) ? MARUFS_ME_HOLDER_NONE : succ;
+}
+
+/* ── Shared strategy primitives ───────────────────────────────────── */
+
+/*
+ * me_handle_acquire_deadline - liveness probe + optional takeover.
+ * Sample holder heartbeat → sleep local → resample. Unchanged ⇒ crash ⇒
+ * takeover. Counter-based (not heartbeat_ts) since CXL peers don't share
+ * a monotonic clock. Returns 0 if we hold token after, -ETIMEDOUT else.
+ */
+static int me_handle_acquire_deadline(struct marufs_me_instance *me,
+ u32 shard_id, struct marufs_me_shard *sh,
+ struct marufs_me_slot *my_slot,
+ struct marufs_me_cb *cb)
+{
+ u32 holder = me_cb_snapshot(cb, NULL);
+ if (!marufs_me_is_valid_node(me, holder) || holder == me->node_id)
+ return -ETIMEDOUT;
+
+ struct marufs_me_membership_slot *hs =
+ me_membership_get(me, holder - 1);
+ u64 hb_before = READ_CXL_LE64(hs->heartbeat);
+
+ /* Sleep on local clock — "elapsed" is observer-local, not cross-node. */
+ u64 probe_us = MARUFS_ME_LIVENESS_PROBE_NS / NSEC_PER_USEC;
+ usleep_range(probe_us, probe_us + probe_us / 4);
+ MARUFS_CXL_RMB(hs, sizeof(*hs));
+ u64 hb_after = READ_CXL_LE64(hs->heartbeat);
+
+ /* Re-read CB to catch races during the probe sleep:
+ * - late grant landed on us → enter CS, skip takeover.
+ * - another node took over / holder passed elsewhere → back off. */
+ u64 cb_gen_after;
+ u32 holder_after = me_cb_snapshot(cb, &cb_gen_after);
+
+ if (holder_after == me->node_id && cb_gen_after > sh->last_cb_gen) {
+ sh->last_cb_gen = cb_gen_after;
+ sh->last_token_seq = READ_CXL_LE64(my_slot->token_seq);
+ me_shard_become_holder(sh);
+ return 0;
+ }
+
+ if (holder_after != holder || hb_after != hb_before)
+ return -ETIMEDOUT; /* holder changed or original still alive */
+
+ pr_warn("me: crash detected on shard %u (holder=%u, heartbeat stuck at %llu) — taking over\n",
+ shard_id, holder, hb_before);
+ marufs_me_pass_token(me, shard_id, me->node_id);
+ return 0;
+}
+
+/*
+ * marufs_me_wait_for_token - poll doorbell until token arrives.
+ * Mirrors pass_token writer order: detect token_seq advance, snapshot cb.
+ * Promote only when holder=self AND cb_gen > last_cb_gen; else phantom.
+ */
+int marufs_me_wait_for_token(struct marufs_me_instance *me, u32 shard_id)
+{
+ struct marufs_me_shard *sh = me_shard_get(me, shard_id);
+ struct marufs_me_cb *cb = &me->cbs[shard_id];
+
+ /* Fast path. is_holder alone insufficient: pass_token writes cb=peer
+ * BEFORE clearing is_holder, so a racing acquire can see is_holder=true
+ * with cb naming a peer. Re-verify cb. */
+ if (me_shard_is_holder(sh)) {
+ MARUFS_CXL_RMB(cb, sizeof(*cb));
+ if (READ_CXL_LE32(cb->holder) == me->node_id) {
+ me_stats_wait_fast_hit(me);
+ return 0;
+ }
+ }
+
+ u64 wall_start = ktime_get_ns();
+ u64 cpu_start = me_stats_cpu_ns();
+ u64 deadline = wall_start + MARUFS_ME_ACQUIRE_TIMEOUT_NS;
+ u32 spins = 0;
+ struct marufs_me_slot *my_slot = me_my_slot(me, shard_id);
+ while (ktime_get_ns() < deadline) {
+ MARUFS_CXL_RMB(&my_slot->token_seq, sizeof(my_slot->token_seq));
+ u64 cur_seq = READ_CXL_LE64(my_slot->token_seq);
+
+ if (cur_seq != sh->last_token_seq) {
+ /* Acquire order: CB read must not reorder before slot. */
+ u64 cb_gen;
+ u32 holder = me_cb_snapshot(cb, &cb_gen);
+
+ if (holder == me->node_id && cb_gen > sh->last_cb_gen) {
+ sh->last_token_seq = cur_seq;
+ sh->last_cb_gen = cb_gen;
+ sh->poll_last_slot_seq = cur_seq;
+ me_shard_become_holder(sh);
+ me_stats_wait_done(
+ me, wall_start, cpu_start,
+ spins < MARUFS_ME_SPIN_COUNT ?
+ MARUFS_ME_WAIT_SPIN :
+ MARUFS_ME_WAIT_SLEEP);
+ return 0;
+ }
+ /* Phantom: seq advanced but CB not ours. Advance seq
+ * only; keep last_cb_gen so gen-monotonicity filter
+ * rejects stale passes. */
+ sh->last_token_seq = cur_seq;
+ }
+
+ if (++spins < MARUFS_ME_SPIN_COUNT)
+ cpu_relax();
+ else
+ usleep_range(me->poll_interval_us / 2,
+ me->poll_interval_us);
+ }
+
+ me_stats_wait_done(me, wall_start, cpu_start, MARUFS_ME_WAIT_DEADLINE);
+ return me_handle_acquire_deadline(me, shard_id, sh, my_slot, cb);
+}
+
+/*
+ * marufs_me_common_join - claim any HOLDER_NONE shards.
+ * Covers first-node (all NONE from format) and later-joiner (prior holder
+ * left NONE successor). Crashed-holder stale id → handled by takeover path.
+ */
+int marufs_me_common_join(struct marufs_me_instance *me)
+{
+ struct marufs_me_membership_slot *slot = me_my_membership_get(me);
+ WRITE_LE32(slot->status, MARUFS_ME_ACTIVE);
+ WRITE_LE32(slot->node_id, me->node_id);
+ WRITE_LE64(slot->joined_at, ktime_get_ns());
+ WRITE_LE64(slot->heartbeat, 0);
+ WRITE_LE64(slot->heartbeat_ts, ktime_get_ns());
+ WRITE_LE64(slot->pending_shards_mask, 0);
+ MARUFS_CXL_WMB(slot, sizeof(*slot));
+
+ /* Seed baselines BEFORE claim — own NONE-claim must not look phantom. */
+ for (u32 s = 0; s < me->num_shards; s++) {
+ struct marufs_me_slot *my_slot = me_my_slot(me, s);
+ u64 seq = READ_CXL_LE64(my_slot->token_seq);
+
+ struct marufs_me_shard *sh = me_shard_get(me, s);
+ sh->last_token_seq = seq;
+ sh->poll_last_slot_seq = seq;
+ (void)me_cb_snapshot(&me->cbs[s], &sh->last_cb_gen);
+ }
+
+ u32 claimed = 0;
+ for (u32 s = 0; s < me->num_shards; s++) {
+ u32 cur = me_cb_snapshot(&me->cbs[s], NULL);
+
+ if (cur == MARUFS_ME_HOLDER_NONE) {
+ marufs_me_pass_token(me, s, me->node_id);
+ claimed++;
+ } else {
+ /* Seed is_holder so poll cycles skip CB RMB on
+ * non-owned shards. Baselines seeded above. */
+ struct marufs_me_shard *sh = me_shard_get(me, s);
+ if (cur == me->node_id)
+ me_shard_become_holder(sh);
+ else
+ me_shard_lose_holder(sh);
+ }
+ }
+
+ u32 succ = marufs_me_next_active(me, me->node_id);
+ for (u32 s = 0; s < me->num_shards; s++)
+ me->shards[s].cached_successor = succ;
+
+ if (claimed == me->num_shards)
+ pr_info("me: node %u joined as first node (token holder)\n",
+ me->node_id);
+ else if (claimed)
+ pr_info("me: node %u joined ring, claimed %u vacant shard(s)\n",
+ me->node_id, claimed);
+ else
+ pr_info("me: node %u joined ring\n", me->node_id);
+
+ return 0;
+}
+
+/*
+ * marufs_me_common_leave - hand off held shards, clear membership.
+ * NONE fallback when alone lets a later joiner NONE-claim in common_join.
+ */
+void marufs_me_common_leave(struct marufs_me_instance *me)
+{
+ for (u32 s = 0; s < me->num_shards; s++) {
+ if (me_cb_snapshot(&me->cbs[s], NULL) == me->node_id)
+ marufs_me_pass_token(me, s, me_leave_successor(me));
+ me_shard_reset_holding(me_shard_get(me, s));
+ }
+
+ struct marufs_me_membership_slot *slot = me_my_membership_get(me);
+ WRITE_LE32(slot->status, MARUFS_ME_NONE);
+ WRITE_LE64(slot->joined_at, 0);
+ WRITE_LE64(slot->pending_shards_mask, 0);
+ MARUFS_CXL_WMB(slot, sizeof(*slot));
+
+ pr_info("me: node %u left ring\n", me->node_id);
+}
+
+/* ── Poll thread ──────────────────────────────────────────────────── */
+
+/*
+ * Unified poll thread: iterates all registered ME instances and
+ * calls poll_cycle() on each. Used for Global ME + all NRHT MEs.
+ */
+static int marufs_me_registry_poll_fn(void *data)
+{
+ struct marufs_sb_info *sbi = data;
+ u32 poll_us = MARUFS_ME_DEFAULT_POLL_US;
+
+ pr_info("me: registry poll thread started (node=%u)\n", sbi->node_id);
+
+ while (!kthread_should_stop()) {
+ struct marufs_me_instance *me;
+
+ usleep_range(poll_us, poll_us + poll_us / 4);
+
+ mutex_lock(&sbi->me_list_lock);
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ if (!atomic_read(&me->active))
+ continue;
+
+ u64 t0 = ktime_get_ns();
+ me->ops->poll_cycle(me);
+ atomic64_add(ktime_get_ns() - t0, &me->poll_ns_total);
+ atomic64_inc(&me->poll_cycles);
+ }
+ mutex_unlock(&sbi->me_list_lock);
+ }
+
+ pr_info("me: registry poll thread exiting (node=%u)\n", sbi->node_id);
+ return 0;
+}
+
+void marufs_me_registry_init(struct marufs_sb_info *sbi)
+{
+ INIT_LIST_HEAD(&sbi->me_list);
+ sbi->me_poll_thread = NULL;
+}
+
+int marufs_me_registry_start(struct marufs_sb_info *sbi)
+{
+ if (sbi->me_poll_thread)
+ return -EEXIST;
+
+ sbi->me_poll_thread =
+ kthread_run(marufs_me_registry_poll_fn, sbi, "marufs_me_poll");
+ if (IS_ERR(sbi->me_poll_thread)) {
+ int ret = PTR_ERR(sbi->me_poll_thread);
+
+ sbi->me_poll_thread = NULL;
+ return ret;
+ }
+ return 0;
+}
+
+void marufs_me_registry_stop(struct marufs_sb_info *sbi)
+{
+ if (!sbi->me_poll_thread)
+ return;
+
+ kthread_stop(sbi->me_poll_thread);
+ sbi->me_poll_thread = NULL;
+}
+
+void marufs_me_register(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me)
+{
+ mutex_lock(&sbi->me_list_lock);
+ list_add_tail(&me->list_node, &sbi->me_list);
+ atomic_set(&me->active, 1);
+ mutex_unlock(&sbi->me_list_lock);
+}
+
+void marufs_me_unregister(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me)
+{
+ mutex_lock(&sbi->me_list_lock);
+ atomic_set(&me->active, 0);
+ list_del_init(&me->list_node);
+ mutex_unlock(&sbi->me_list_lock);
+}
+
+void marufs_me_teardown(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me)
+{
+ if (!sbi || !me)
+ return;
+
+ /* Reformat detection: underlying CXL area was wiped by a fresh
+ * nrht_init while this instance was cached. Skip leave — CBs /
+ * membership now belong to a foreign ring.
+ */
+ struct marufs_me_header *h = me_header_get(me);
+ if (READ_CXL_LE64(h->format_generation) != me->cached_generation) {
+ pr_info("me: teardown skip leave (format_gen mismatch)\n");
+ marufs_me_invalidate(sbi, me);
+ return;
+ }
+
+ /* leave() BEFORE unregister: poll thread must keep ticking heartbeat
+ * and passing tokens during handoff. leave() clears membership last.
+ */
+ me->ops->leave(me);
+ marufs_me_invalidate(sbi, me);
+}
+
+void marufs_me_invalidate(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me)
+{
+ if (!me)
+ return;
+ marufs_me_unregister(sbi, me);
+ marufs_me_destroy(me);
+}
+
+/* ── Instance lifecycle ───────────────────────────────────────────── */
+
+/*
+ * marufs_me_create - allocate and initialize ME instance.
+ * @me_area_base: CXL virtual address of ME area start (header location)
+ * @num_shards: number of CB entries (Global ME: 1, NRHT ME: N_shard)
+ * @max_nodes: max node count
+ * @node_id: this node's ID
+ * @poll_interval_us: poll thread interval
+ * @strategy: order-driven or request-driven
+ *
+ * Parses CXL header to locate CB array, membership slots, request slots.
+ * Allocates DRAM arrays for holding/heartbeat tracking.
+ */
+struct marufs_me_instance *marufs_me_create(void *me_area_base, u32 num_shards,
+ u32 max_nodes, u32 node_id,
+ u32 poll_interval_us,
+ enum marufs_me_strategy strategy)
+{
+ struct marufs_me_instance *me = kzalloc(sizeof(*me), GFP_KERNEL);
+ if (!me)
+ return ERR_PTR(-ENOMEM);
+
+ /* CXL pointers from header offsets */
+ struct marufs_me_header *hdr = me_area_base;
+ me->header = hdr;
+ MARUFS_CXL_RMB(hdr, sizeof(*hdr));
+ me->cached_generation = READ_CXL_LE64(hdr->format_generation);
+ me->cbs = marufs_me_cb_at(me_area_base,
+ READ_CXL_LE64(hdr->cb_array_offset), 0);
+ me->membership = marufs_me_membership_at(
+ me_area_base, READ_CXL_LE64(hdr->membership_offset), 0);
+ me->slots = marufs_me_slot_at(me_area_base,
+ READ_CXL_LE64(hdr->request_offset),
+ max_nodes, 0, 0);
+
+ /* Configuration */
+ me->num_shards = num_shards;
+ me->max_nodes = max_nodes;
+ me->node_id = node_id;
+ me->me_idx = node_id - 1; /* external 1..N → internal idx 0..N-1 */
+ me->poll_interval_us = poll_interval_us;
+ me->strategy = strategy;
+ me->ops = marufs_me_get_ops(strategy);
+
+ /* Single per-shard DRAM allocation — kcalloc zeroes all fields, so
+ * atomic counters, is_holder, sequence shadows start at 0 / false.
+ */
+ me->shards = kcalloc(num_shards, sizeof(*me->shards), GFP_KERNEL);
+ if (!me->shards) {
+ marufs_me_destroy(me);
+ return ERR_PTR(-ENOMEM);
+ }
+ for (u32 s = 0; s < num_shards; s++)
+ mutex_init(&me->shards[s].local_lock);
+
+ /* Fine-grained per-CPU stats. alloc_percpu zeroes all fields. */
+ me->stats = alloc_percpu(struct marufs_me_stats_pcpu);
+ if (!me->stats) {
+ marufs_me_destroy(me);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ atomic_set(&me->active, 0);
+ INIT_LIST_HEAD(&me->list_node);
+
+ return me;
+}
+
+void marufs_me_destroy(struct marufs_me_instance *me)
+{
+ if (!me)
+ return;
+
+ /* Caller must have unregistered first; this is a safety net. */
+ WARN_ON_ONCE(!list_empty(&me->list_node));
+
+ free_percpu(me->stats);
+ kfree(me->shards);
+ kfree(me);
+}
+
+/* ── Format: initialize ME area in CXL memory ────────────────────── */
+
+/*
+ * marufs_me_format - write initial ME structures to CXL memory.
+ *
+ * Layout from me_area_base:
+ * [Header 64B] [CB array S×64B] [Membership N×64B] [Request S×N×64B]
+ */
+int marufs_me_format(void *me_area_base, u32 num_shards, u32 max_nodes,
+ u32 poll_interval_us, enum marufs_me_strategy strategy)
+{
+ /* Compute offsets — slot region is now allocated for both strategies */
+ u64 cb_off = sizeof(struct marufs_me_header);
+ u64 mem_off = cb_off + (u64)num_shards * sizeof(struct marufs_me_cb);
+ u64 slot_off =
+ mem_off +
+ (u64)max_nodes * sizeof(struct marufs_me_membership_slot);
+ u64 total = slot_off +
+ (u64)num_shards * max_nodes * sizeof(struct marufs_me_slot);
+
+ /* Write header */
+ struct marufs_me_header *hdr = me_area_base;
+ WRITE_LE32(hdr->magic, MARUFS_ME_MAGIC);
+ WRITE_LE32(hdr->version, MARUFS_ME_VERSION);
+ WRITE_LE32(hdr->strategy, strategy);
+ WRITE_LE32(hdr->num_shards, num_shards);
+ WRITE_LE32(hdr->max_nodes, max_nodes);
+ WRITE_LE32(hdr->poll_interval_us, poll_interval_us);
+ WRITE_LE64(hdr->cb_array_offset, cb_off);
+ WRITE_LE64(hdr->membership_offset, mem_off);
+ WRITE_LE64(hdr->request_offset, slot_off);
+ WRITE_LE64(hdr->total_size, total);
+ /* Bump generation so any prior cached ME instance on another sbi
+ * detects the reformat and drops its stale state on next me_get.
+ */
+ WRITE_LE64(hdr->format_generation, ktime_get_real_ns());
+ MARUFS_CXL_WMB(hdr, sizeof(*hdr));
+
+ /* Initialize CBs — holder=NONE (no valid node), state=FREE */
+ for (u32 s = 0; s < num_shards; s++) {
+ struct marufs_me_cb *cb =
+ marufs_me_cb_at(me_area_base, cb_off, s);
+
+ memset(cb, 0, sizeof(*cb));
+ WRITE_LE32(cb->magic, MARUFS_ME_CB_MAGIC);
+ WRITE_LE32(cb->holder, MARUFS_ME_HOLDER_NONE);
+ MARUFS_CXL_WMB(cb, sizeof(*cb));
+ }
+
+ /* Initialize membership slots.
+ * slot[i] is reserved for external node_id (i + 1).
+ */
+ for (u32 n = 0; n < max_nodes; n++) {
+ struct marufs_me_membership_slot *slot =
+ marufs_me_membership_at(me_area_base, mem_off, n);
+
+ WRITE_LE32(slot->magic, MARUFS_ME_MS_MAGIC);
+ WRITE_LE32(slot->status, MARUFS_ME_NONE);
+ WRITE_LE32(slot->node_id, n + 1);
+ WRITE_LE64(slot->joined_at, 0);
+ WRITE_LE64(slot->heartbeat, 0);
+ WRITE_LE64(slot->heartbeat_ts, 0);
+ WRITE_LE64(slot->pending_shards_mask, 0);
+ memset(slot->reserved, 0, sizeof(slot->reserved));
+ MARUFS_CXL_WMB(slot, sizeof(*slot));
+ }
+
+ /* Initialize per-(shard, node) slots — tag each with magic; batch
+ * the WMB since format is a cold path (one big flush > N*S small).
+ */
+ for (u32 s = 0; s < num_shards; s++) {
+ for (u32 n = 0; n < max_nodes; n++) {
+ struct marufs_me_slot *sl = marufs_me_slot_at(
+ me_area_base, slot_off, max_nodes, s, n);
+ memset(sl, 0, sizeof(*sl));
+ WRITE_LE32(sl->magic, MARUFS_ME_SLOT_MAGIC);
+ }
+ }
+
+ struct marufs_me_slot *slots_base =
+ marufs_me_slot_at(me_area_base, slot_off, max_nodes, 0, 0);
+ u64 slots_bytes =
+ (u64)num_shards * max_nodes * sizeof(struct marufs_me_slot);
+ MARUFS_CXL_WMB(slots_base, slots_bytes);
+
+ pr_info("me: formatted area (%s, shards=%u, nodes=%u, size=%llu)\n",
+ strategy == MARUFS_ME_ORDER ? "order" : "request", num_shards,
+ max_nodes, total);
+ return 0;
+}
diff --git a/marufs_kernel/src/me.h b/marufs_kernel/src/me.h
new file mode 100644
index 0000000..ae772f5
--- /dev/null
+++ b/marufs_kernel/src/me.h
@@ -0,0 +1,288 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * me.h - MARUFS Cross-node Mutual Exclusion API
+ *
+ * Strategy Pattern: common interface with pluggable implementations.
+ * - Order-driven (token ring): token circulates among ACTIVE nodes
+ * - Request-driven: holder scans request slots, grants to requester
+ *
+ * Two ME domains:
+ * - Global ME (S=1): single token for Index insert + RAT alloc
+ * - NRHT ME (S=N_shard): per-shard token, opt-in membership
+ */
+
+#ifndef _MARUFS_ME_H
+#define _MARUFS_ME_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* ── Configuration ────────────────────────────────────────────────── */
+enum marufs_me_config {
+ MARUFS_ME_MAX_NODES = MARUFS_MAX_NODE_ID,
+ MARUFS_ME_GLOBAL_SHARDS = 1, /* Global ME: single shard */
+ MARUFS_ME_GLOBAL_SHARD_ID = 0, /* Global ME: shard index */
+ MARUFS_ME_DEFAULT_POLL_US = 10, /* default poll interval (us) */
+ MARUFS_ME_SPIN_COUNT = 100, /* spins before usleep in acquire */
+};
+
+/* CXL on-disk layout (structs, magics, address helpers). */
+#include "me_layout.h"
+
+/* Typed constants — kept as #define so their width is explicit. */
+#define MARUFS_ME_ACQUIRE_TIMEOUT_NS (5ULL * NSEC_PER_SEC)
+/* Liveness probe window for second-chance crash check (observer-local).
+ * On acquire deadline, sample holder's heartbeat counter, sleep this long
+ * on local clock, resample. Unchanged counter after this window ⇒ crash.
+ * Avoids cross-node ktime_get_ns() subtraction (local monotonic clocks
+ * have per-node boot-time zero points and can't be subtracted meaningfully
+ * across nodes sharing CXL memory). 100ms = 10000× default poll interval:
+ * even pathological scheduler stalls should still produce a tick within it.
+ */
+#define MARUFS_ME_LIVENESS_PROBE_NS (100ULL * NSEC_PER_MSEC)
+
+/* ── Forward declarations ─────────────────────────────────────────── */
+
+struct marufs_me_instance;
+struct marufs_sb_info;
+
+/* ── Strategy Operations (vtable) ─────────────────────────────────── */
+
+struct marufs_me_ops {
+ /*
+ * acquire - block until ME is acquired for shard_id.
+ * Returns 0 on success, -EINTR if interrupted, -ETIMEDOUT on timeout.
+ */
+ int (*acquire)(struct marufs_me_instance *me, u32 shard_id);
+
+ /*
+ * release - release ME for shard_id. Must be called by current holder.
+ */
+ void (*release)(struct marufs_me_instance *me, u32 shard_id);
+
+ /*
+ * poll_cycle - called periodically by poll thread.
+ * Holder: heartbeat update, token pass if not holding.
+ * Non-holder: heartbeat monitoring, crash detection,
+ * pre-compute cached_successor.
+ */
+ void (*poll_cycle)(struct marufs_me_instance *me);
+
+ /*
+ * join - register this node as ACTIVE in membership slot.
+ */
+ int (*join)(struct marufs_me_instance *me);
+
+ /*
+ * leave - deregister this node. Pass held tokens to successor first.
+ */
+ void (*leave)(struct marufs_me_instance *me);
+};
+
+/* ── Per-shard DRAM state ─────────────────────────────────────────── */
+
+/*
+ * struct marufs_me_shard - per-shard local bookkeeping.
+ *
+ * Laid out as a single array in marufs_me_instance::shards[num_shards],
+ * one struct per shard. Fields touched together (hot path: holding,
+ * is_holder, last_token_seq) naturally share a cache line.
+ *
+ * holding / local_waiters : atomic counters for intra-node coordination.
+ * local_lock : intra-node serialization (sleepable).
+ * cached_successor : precomputed next ACTIVE node_id (ring).
+ * is_holder : DRAM "am I the token holder?" flag (replaces
+ * per-cycle CB hot-polling).
+ * poll_last_slot_seq : poll-thread's shadow of own slot token_seq;
+ * drives receiver-side is_holder flip on bump.
+ * last_token_seq : wait_for_token phantom filter baseline.
+ * last_cb_gen : cb->generation snapshot for stale-pass fence.
+ */
+struct marufs_me_shard {
+ atomic_t holding;
+ atomic_t local_waiters;
+ struct mutex local_lock;
+ u32 cached_successor;
+ bool is_holder;
+ u64 poll_last_slot_seq;
+ u64 last_token_seq;
+ u64 last_cb_gen;
+
+ /* Timestamp captured by acquire() just after mutex_lock; consumed by
+ * release() to record CS hold time. Valid only while the local_lock
+ * is held (single owner). Non-volatile; torn reads are impossible
+ * because only the owner reads/writes it.
+ */
+ u64 lock_hold_start_ns;
+};
+
+/* Fine-grained per-CPU stats — full layout and helpers in me_stats.h.
+ * Forward-declared here so marufs_me_instance can hold the pointer
+ * without pulling in the stats machinery.
+ */
+struct marufs_me_stats_pcpu;
+
+/* ── ME Instance (per-mount, DRAM) ────────────────────────────────── */
+
+struct marufs_me_instance {
+ /* CXL pointers */
+ struct marufs_me_header *header;
+ struct marufs_me_cb *cbs;
+ struct marufs_me_membership_slot *membership;
+ struct marufs_me_slot
+ *slots; /* per-(shard, node) slots, always allocated */
+
+ /* Configuration */
+ u32 num_shards; /* Global ME: 1, NRHT ME: N_shard */
+ u32 max_nodes;
+ u32 node_id; /* external 1..MARUFS_MAX_NODE_ID (0 = ORPHAN reserved) */
+ u32 me_idx; /* internal 0..max_nodes-1 = node_id - 1 */
+ u32 poll_interval_us;
+ enum marufs_me_strategy strategy;
+ const struct marufs_me_ops *ops;
+
+ /* Per-shard DRAM state — one struct per shard, allocated as a single
+ * contiguous array (avoids 8 separate kcalloc/kfree cycles).
+ */
+ struct marufs_me_shard *shards;
+
+ /* Poll-path cost counters exposed via /sys/fs/marufs/me_poll_stats.
+ * Incremented only from poll_cycle; app-thread traffic excluded.
+ */
+ atomic64_t poll_cycles;
+ atomic64_t poll_ns_total;
+ atomic64_t poll_rmb_cb;
+ atomic64_t poll_rmb_slot;
+ atomic64_t poll_rmb_membership;
+
+ /* Fine-grained per-CPU stats (latency histos, hit buckets, per-shard).
+ * Allocated in marufs_me_create, freed in marufs_me_destroy. See
+ * struct marufs_me_stats_pcpu for field layout.
+ */
+ struct marufs_me_stats_pcpu __percpu *stats;
+
+ /* Test-only fault injection. When non-zero, the poll thread treats
+ * this ME as dead: no heartbeat tick, no grant scan, no doorbell
+ * handling. Peers' acquire deadline path then exercises the
+ * counter-based liveness probe and self-takeover end-to-end.
+ * Toggled via /sys/fs/marufs/me_freeze_heartbeat (node_id-scoped).
+ * Overhead: one atomic_read per poll cycle — negligible.
+ */
+ atomic_t debug_freeze_poll;
+
+ /* Active flag — cleared on destroy, checked by registry poll thread */
+ atomic_t active; /* 0 = shutdown, 1 = serving polls */
+
+ /* Cached CXL format_generation at create time. Compared by
+ * marufs_nrht_me_get() fast-path to detect a re-format of the
+ * underlying CXL area (RAT slot reused → new nrht_init wiped
+ * ME state) and discard this stale instance.
+ */
+ u64 cached_generation;
+
+ /* Registry link (sbi->me_list) */
+ struct list_head list_node;
+};
+
+/* Inline helpers — depend on the DRAM types defined above. */
+#include "me_inline.h"
+
+/* ── Instance lifecycle ─────────────────────────────────────────────── */
+
+struct marufs_me_instance *marufs_me_create(void *me_area_base, u32 num_shards,
+ u32 max_nodes, u32 node_id,
+ u32 poll_interval_us,
+ enum marufs_me_strategy strategy);
+void marufs_me_destroy(struct marufs_me_instance *me);
+int marufs_me_format(void *me_area_base, u32 num_shards, u32 max_nodes,
+ u32 poll_interval_us, enum marufs_me_strategy strategy);
+
+/* ── Registry (sbi-managed poll thread) ─────────────────────────────── */
+
+/*
+ * Registry API — unified poll thread per sbi.
+ *
+ * Lifecycle:
+ * mount: marufs_me_registry_init(sbi)
+ * → marufs_me_registry_start(sbi) // starts the poll thread
+ * → [for each ME] marufs_me_register(sbi, me)
+ * unmount: [for each ME] marufs_me_unregister(sbi, me)
+ * → marufs_me_registry_stop(sbi)
+ */
+void marufs_me_registry_init(struct marufs_sb_info *sbi);
+int marufs_me_registry_start(struct marufs_sb_info *sbi);
+void marufs_me_registry_stop(struct marufs_sb_info *sbi);
+
+void marufs_me_register(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me);
+void marufs_me_unregister(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me);
+
+/*
+ * marufs_me_teardown - graceful teardown on unmount.
+ * unregister + ops->leave (pass held tokens, clear membership slot) + destroy.
+ * No-op if @me is NULL.
+ */
+void marufs_me_teardown(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me);
+
+/*
+ * marufs_me_invalidate - drop a stale instance without leaving the ring.
+ * unregister + destroy only. Use when the CXL state has already been
+ * reformatted by a peer (membership slots/holder are no longer ours to
+ * release), so calling ops->leave would touch foreign state.
+ * No-op if @me is NULL.
+ */
+void marufs_me_invalidate(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me);
+
+/* ── Acquire / release primitives (me.c) ────────────────────────────── */
+
+void marufs_me_hold(struct marufs_me_instance *me, u32 shard_id);
+void marufs_me_unhold(struct marufs_me_instance *me, u32 shard_id);
+void marufs_me_pass_token(struct marufs_me_instance *me, u32 shard_id,
+ u32 new_holder);
+void marufs_me_membership_set_pending(struct marufs_me_instance *me,
+ u32 shard_id);
+void marufs_me_membership_clear_pending(struct marufs_me_instance *me,
+ u32 shard_id);
+
+/*
+ * marufs_me_wait_for_token - spin-then-sleep until cb->holder == self or
+ * deadline expires. Caller must hold local_locks[shard_id] and have
+ * incremented `holding`. Returns 0 on success, -ETIMEDOUT on deadline.
+ */
+int marufs_me_wait_for_token(struct marufs_me_instance *me, u32 shard_id);
+
+/* ── Shared strategy primitives ─────────────────────────────────────── */
+
+/*
+ * marufs_me_common_* - strategy-independent ops implementations used by
+ * both order-driven and request-driven vtables.
+ */
+int marufs_me_common_join(struct marufs_me_instance *me);
+void marufs_me_common_leave(struct marufs_me_instance *me);
+
+/* Topology / membership — definitions in me.c. */
+u32 marufs_me_next_active(struct marufs_me_instance *me, u32 from);
+u32 me_leave_successor(struct marufs_me_instance *me);
+void me_membership_tick_heartbeat(struct marufs_me_instance *me);
+
+/* ── Strategy implementations ───────────────────────────────────────── */
+
+extern const struct marufs_me_ops marufs_me_order_ops;
+extern const struct marufs_me_ops marufs_me_request_ops;
+
+static inline const struct marufs_me_ops *
+marufs_me_get_ops(enum marufs_me_strategy strategy)
+{
+ if (strategy == MARUFS_ME_REQUEST)
+ return &marufs_me_request_ops;
+ return &marufs_me_order_ops;
+}
+
+#endif /* _MARUFS_ME_H */
diff --git a/marufs_kernel/src/me_inline.h b/marufs_kernel/src/me_inline.h
new file mode 100644
index 0000000..a93f2ac
--- /dev/null
+++ b/marufs_kernel/src/me_inline.h
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * me_inline.h - Inline helpers for the MARUFS Cross-node ME.
+ *
+ * Included by me.h after the DRAM types (marufs_me_shard /
+ * marufs_me_instance) are defined — these helpers dereference instance
+ * fields and need full struct visibility. NOT a standalone header.
+ */
+
+#ifndef _MARUFS_ME_INLINE_H
+#define _MARUFS_ME_INLINE_H
+
+/* ── Shard state primitives (DRAM only) ─────────────────────────────── */
+
+/*
+ * me_shard_unhold - release-path counterpart of holding++. wmb() orders
+ * prior CXL writes (RAT, index, etc.) before the dec becomes visible to
+ * the poll thread, so the next holder sees a fully published CXL state.
+ * smp_mb() pairs with poll_cycle's smp_mb() for DRAM visibility.
+ */
+static inline void me_shard_unhold(struct marufs_me_shard *sh)
+{
+ wmb();
+ int v = atomic_dec_if_positive(&sh->holding);
+ WARN_ON_ONCE(v < 0); /* unbalanced UNHOLD — caller bug */
+ smp_mb();
+}
+
+/* Force holding=0 (leave/cleanup path only). Avoids underflow from blind dec. */
+static inline void me_shard_reset_holding(struct marufs_me_shard *sh)
+{
+ atomic_set(&sh->holding, 0);
+ smp_mb();
+}
+
+/*
+ * me_shard_become_holder - publish "we are now the token holder".
+ *
+ * Callers MUST write DRAM baselines (last_cb_gen / last_token_seq /
+ * poll_last_slot_seq) BEFORE this. smp_wmb() pairs with smp_rmb() in
+ * me_shard_is_holder: once a reader observes is_holder=true, all prior
+ * baseline stores are guaranteed visible.
+ */
+static inline void me_shard_become_holder(struct marufs_me_shard *sh)
+{
+ smp_wmb();
+ sh->is_holder = true;
+}
+
+/*
+ * me_shard_lose_holder - "we gave the token away".
+ *
+ * No barrier needed: a reader observing is_holder=false falls to the
+ * wait_for_token loop, which performs its own CB RMB to re-verify
+ * ownership.
+ */
+static inline void me_shard_lose_holder(struct marufs_me_shard *sh)
+{
+ sh->is_holder = false;
+}
+
+/*
+ * me_shard_is_holder - reader pair for me_shard_become_holder. smp_rmb
+ * ensures any baseline stores (last_cb_gen / last_token_seq /
+ * poll_last_slot_seq) published by the writer are visible once
+ * is_holder reads true.
+ */
+static inline bool me_shard_is_holder(const struct marufs_me_shard *sh)
+{
+ smp_rmb();
+ return sh->is_holder;
+}
+
+/*
+ * me_shard_passable - this node holds the token but no CS is in flight
+ * and no local thread is waiting. Used by poll/release to decide whether
+ * to forward the token instead of keeping it.
+ */
+static inline bool me_shard_passable(struct marufs_me_instance *me,
+ u32 shard_id)
+{
+ struct marufs_me_shard *sh = &me->shards[shard_id];
+ smp_mb();
+ return atomic_read(&sh->holding) == 0 &&
+ atomic_read(&sh->local_waiters) == 0;
+}
+
+/* ── Topology / validity ────────────────────────────────────────────── */
+
+/*
+ * marufs_me_is_valid_node - true iff @node_id names a real slot.
+ *
+ * Valid external node_ids are [1, max_nodes]. Excludes 0 (ORPHAN sentinel)
+ * and MARUFS_ME_HOLDER_NONE (0xFFFFFFFF, trivially > max_nodes).
+ */
+static inline bool marufs_me_is_valid_node(const struct marufs_me_instance *me,
+ u32 node_id)
+{
+ return node_id >= 1 && node_id <= me->max_nodes;
+}
+
+/* ── CXL fetch helpers ──────────────────────────────────────────────── */
+
+/* DRAM shard state — no RMB. Grouped with the CXL fetchers so callers
+ * find one site for "give me X for shard_id". */
+static inline struct marufs_me_shard *
+me_shard_get(struct marufs_me_instance *me, u32 shard_id)
+{
+ return &me->shards[shard_id];
+}
+
+/* Fetch CB for @shard_id with RMB. CXL-resident — every access must
+ * invalidate own cache to defeat stale reads from prior holder. */
+static inline struct marufs_me_cb *me_cb_get(struct marufs_me_instance *me,
+ u32 shard_id)
+{
+ struct marufs_me_cb *cb = &me->cbs[shard_id];
+ MARUFS_CXL_RMB(cb, sizeof(*cb));
+ return cb;
+}
+
+/*
+ * me_cb_snapshot - RMB CB and return (holder, generation).
+ * Pass NULL for @out_gen when only the holder is needed. A single RMB
+ * covers both fields (same CL).
+ */
+static inline u32 me_cb_snapshot(struct marufs_me_cb *cb, u64 *out_gen)
+{
+ MARUFS_CXL_RMB(cb, sizeof(*cb));
+ if (out_gen)
+ *out_gen = READ_CXL_LE64(cb->generation);
+ return READ_CXL_LE32(cb->holder);
+}
+
+/* me_cb_bump_acquire_count - holder records a successful acquire. */
+static inline void me_cb_bump_acquire_count(struct marufs_me_cb *cb)
+{
+ WRITE_LE64(cb->acquire_count, READ_CXL_LE64(cb->acquire_count) + 1);
+ MARUFS_CXL_WMB(&cb->acquire_count, sizeof(cb->acquire_count));
+}
+
+/* Per-(shard, internal_idx) slot lookup with RMB. Slot fits in one
+ * cacheline, so the full RMB is no costlier than a partial one. */
+static inline struct marufs_me_slot *me_slot_of(struct marufs_me_instance *me,
+ u32 shard_id, u32 idx)
+{
+ struct marufs_me_slot *slot =
+ &me->slots[(u64)shard_id * me->max_nodes + idx];
+ MARUFS_CXL_RMB(slot, sizeof(*slot));
+ return slot;
+}
+
+/* Self-slot shortcut (idx = me->me_idx). */
+static inline struct marufs_me_slot *me_my_slot(struct marufs_me_instance *me,
+ u32 shard_id)
+{
+ return me_slot_of(me, shard_id, me->me_idx);
+}
+
+/* Membership slot fetch with RMB. Heartbeat / pending mask / status all
+ * live here — must be re-fetched from CXL each call. */
+static inline struct marufs_me_membership_slot *
+me_membership_get(struct marufs_me_instance *me, u32 node_idx)
+{
+ struct marufs_me_membership_slot *ms = &me->membership[node_idx];
+ MARUFS_CXL_RMB(ms, sizeof(*ms));
+ return ms;
+}
+
+/* Self-membership shortcut. */
+static inline struct marufs_me_membership_slot *
+me_my_membership_get(struct marufs_me_instance *me)
+{
+ return me_membership_get(me, me->me_idx);
+}
+
+/* Header fetch with RMB. format_generation is the only frequently re-read
+ * field; callers compare it against me->cached_generation to detect peer
+ * reformat. */
+static inline struct marufs_me_header *
+me_header_get(struct marufs_me_instance *me)
+{
+ struct marufs_me_header *header = me->header;
+ MARUFS_CXL_RMB(header, sizeof(*header));
+ return header;
+}
+
+#endif /* _MARUFS_ME_INLINE_H */
diff --git a/marufs_kernel/src/me_layout.h b/marufs_kernel/src/me_layout.h
new file mode 100644
index 0000000..fdb359c
--- /dev/null
+++ b/marufs_kernel/src/me_layout.h
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * me_layout.h - On-disk layout for the MARUFS Cross-node Mutual Exclusion
+ * (ME) area.
+ *
+ * Holds CXL-resident structs, their identifying magic values, and pointer-
+ * arithmetic helpers used by format/create paths and bootstrap. Runtime
+ * state (DRAM types, inline helpers, public API) lives in me.h.
+ */
+
+#ifndef _MARUFS_ME_LAYOUT_H
+#define _MARUFS_ME_LAYOUT_H
+
+#include
+
+/* No-holder sentinel for cb->holder. */
+#define MARUFS_ME_HOLDER_NONE ((u32)0xFFFFFFFF)
+
+/* ── Membership slot status ─────────────────────────────────────────── */
+enum marufs_me_membership_status {
+ MARUFS_ME_NONE = 0,
+ MARUFS_ME_ACTIVE = 1,
+};
+
+/*
+ * Per-type magics — prefix each CXL-resident ME struct so writers can
+ * verify they are actually addressing the intended record type. Guards
+ * against stale instance writes after a peer reformat changed the ME
+ * area layout (cached offsets now fall inside a different struct).
+ */
+enum marufs_me_struct_magic {
+ MARUFS_ME_CB_MAGIC = 0x4352454D, /* "MERC" */
+ MARUFS_ME_MS_MAGIC = 0x534D454D, /* "MEMS" */
+ MARUFS_ME_SLOT_MAGIC = 0x4C534D45, /* "EMSL" */
+};
+
+/*
+ * ME Area Header (64B, 1 cacheline) — describes ME area layout.
+ * Stored once at the start of the ME area.
+ */
+struct marufs_me_header {
+ __le32 magic; /* 0: MARUFS_ME_MAGIC */
+ __le32 version; /* 4: format version */
+ __le32 strategy; /* 8: MARUFS_ME_ORDER / MARUFS_ME_REQUEST */
+ __le32 num_shards; /* 12: CB count (Global ME: 1, NRHT ME: N_shard) */
+ __le32 max_nodes; /* 16: max nodes supported */
+ __le32 poll_interval_us; /* 20: polling interval in microseconds */
+ __le64 cb_array_offset; /* 24: CB array offset from ME area start */
+ __le64 membership_offset; /* 32: membership slot array offset */
+ __le64 request_offset; /* 40: request slot array (0 if order-driven) */
+ __le64 total_size; /* 48: total ME area size in bytes */
+ __le64 format_generation; /* 56: bumped on every marufs_me_format() —
+ * used to invalidate per-sbi cached ME
+ * instances when the underlying CXL area
+ * has been reformatted (e.g. nrht_init
+ * called again on a reused RAT slot). */
+} __attribute__((packed));
+
+/*
+ * ME Control Block (64B, 1 cacheline) — per-shard mutual exclusion state.
+ * Read-mostly; written only on holder transition. Hot path polling happens
+ * on per-node slot doorbell, not here.
+ */
+struct marufs_me_cb {
+ __le32 magic; /* 0: MARUFS_ME_CB_MAGIC */
+ __le32 holder; /* 4: current holder node_id (MARUFS_ME_HOLDER_NONE if vacant) */
+ __le32 state; /* 8: ME_FREE(0) / ME_HELD(1) / ME_RELEASING(2) */
+ __le32 _pad; /* 12 */
+ __le64 generation; /* 16: monotonic, bumped on every holder change */
+ __le64 acquire_count; /* 24: total acquisitions (stats) */
+ __u8 reserved[32]; /* 32: pad to 64B */
+} __attribute__((packed));
+
+/*
+ * ME Membership Slot (64B, 1 cacheline) — per-node membership + liveness.
+ *
+ * Each node writes ONLY its own slot. Holder scans slot[holder+1..] to
+ * find next ACTIVE node for token pass. Heartbeat lives here (distributed
+ * per-node) instead of on the shared CB, so liveness ticks don't
+ * invalidate a shared hot CL.
+ */
+struct marufs_me_membership_slot {
+ __le32 magic; /* 0: MARUFS_ME_MS_MAGIC */
+ __le32 status; /* 4: MARUFS_ME_NONE / MARUFS_ME_ACTIVE */
+ __le32 node_id; /* 8: self node_id (validation) */
+ __le32 _pad; /* 12 */
+ __le64 joined_at; /* 16: ktime_get_ns() at status change */
+ __le64 heartbeat; /* 24: self-tick counter (poll thread advances) */
+ __le64 heartbeat_ts; /* 32: ktime_get_ns() at last tick */
+ /* 40: per-shard pending bitmap — bit s set ⇒ this node has a hand
+ * up on shard s. Single-writer (owning node). Holder ORs peers' masks
+ * to skip scans. Width caps at MARUFS_NRHT_MAX_NUM_SHARDS (64).
+ */
+ __le64 pending_shards_mask; /* 40 */
+ __u8 reserved[16]; /* 48: pad to 64B */
+} __attribute__((packed));
+
+/*
+ * ME Slot (64B, 1 cacheline) — per-(shard, node) slot. Interpretation
+ * depends on strategy:
+ *
+ * Order-driven (doorbell):
+ * Single-writer: current token holder. Owner NEVER writes its own slot.
+ * Holder bumps @token_seq + snapshots @cb_gen_at_write when passing the
+ * token; owner polls @token_seq locally and enters CS on change.
+ *
+ * Request-driven (hand-raise):
+ * Writer ownership is split by @requesting value:
+ * requesting == 0 → owner may write (sets requesting=1 to raise hand)
+ * requesting == 1 → holder may write (sets granted_at, flips to 0)
+ * Token handoff to the granted node is ALSO rung via @token_seq so the
+ * granted node polls a per-node CL instead of the shared CB.
+ *
+ * Layout: slots[shard_id * max_nodes + internal_idx]
+ */
+struct marufs_me_slot {
+ __le32 magic; /* 0: MARUFS_ME_SLOT_MAGIC */
+ __le32 from_node; /* 4: last writer node_id (observability) */
+
+ /* Order-driven doorbell (holder-written) */
+ __le64 token_seq; /* 8: monotonic; owner polls this */
+ __le64 cb_gen_at_write; /* 16: cb->generation snapshot at pass time */
+
+ /* Request-driven hand-raise (writer-split by @requesting) */
+ __le32 requesting; /* 24: 0=idle (self writes), 1=requesting (holder writes) */
+ __le32 sequence; /* 28: monotonic seq# (stale request fencing) */
+ __le64 requested_at; /* 32: self writes when raising hand */
+ __le64 granted_at; /* 40: holder writes on grant */
+
+ __u8 reserved[16]; /* 48: pad to 64B */
+} __attribute__((packed));
+
+/* ── CXL address helpers ────────────────────────────────────────────── */
+
+/*
+ * Compute CXL addresses of CB / membership / slot entries from
+ * (me_area_base, offset, index). Return void* so callers can cast to the
+ * typed pointer once. Used during format/create (before typed arrays are
+ * cached in the instance) and from nrht bootstrap.
+ */
+static inline void *marufs_me_cb_at(void *me_area_base, u64 cb_off, u32 s)
+{
+ return (u8 *)me_area_base + cb_off +
+ (u64)s * sizeof(struct marufs_me_cb);
+}
+
+static inline void *marufs_me_membership_at(void *me_area_base, u64 mem_off,
+ u32 n)
+{
+ return (u8 *)me_area_base + mem_off +
+ (u64)n * sizeof(struct marufs_me_membership_slot);
+}
+
+static inline void *marufs_me_slot_at(void *me_area_base, u64 slot_off,
+ u32 max_nodes, u32 s, u32 n)
+{
+ return (u8 *)me_area_base + slot_off +
+ ((u64)s * max_nodes + n) * sizeof(struct marufs_me_slot);
+}
+
+#endif /* _MARUFS_ME_LAYOUT_H */
diff --git a/marufs_kernel/src/me_order.c b/marufs_kernel/src/me_order.c
new file mode 100644
index 0000000..f5998f5
--- /dev/null
+++ b/marufs_kernel/src/me_order.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * me_order.c - Order-driven (token ring) ME strategy
+ *
+ * Token circulates among ACTIVE nodes in ring order.
+ *
+ * Poll thread roles:
+ * Holder: increment heartbeat, pass token if not holding.
+ * Non-holder: monitor heartbeat for crash detection,
+ * pre-compute cached_successor for O(0) token pass.
+ *
+ * Acquire/release:
+ * acquire() sets holding[shard]=true, waits for token arrival.
+ * release() sets holding[shard]=false, poll thread passes token.
+ */
+
+#include
+#include
+
+#include "marufs.h"
+#include "me.h"
+#include "me_stats.h"
+
+/* ── Poll cycle ───────────────────────────────────────────────────── */
+
+static void order_poll_cycle(struct marufs_me_instance *me)
+{
+ /* Fault injection: simulate a crashed node — skip the entire cycle
+ * (no heartbeat, no grant, no doorbell handling).
+ */
+ if (atomic_read(&me->debug_freeze_poll))
+ return;
+
+ bool ticked_hb = false;
+ u64 t0 = ktime_get_ns();
+ u32 successor = marufs_me_next_active(me, me->node_id);
+ u64 t1 = ktime_get_ns();
+ u64 scan_ns_total = 0;
+
+ for (u32 s = 0; s < me->num_shards; s++) {
+ /* Receiver doorbell: bump ⇒ peer passed token. */
+ struct marufs_me_slot *my_slot = me_my_slot(me, s);
+ u64 cur_seq = READ_CXL_LE64(my_slot->token_seq);
+
+ struct marufs_me_shard *sh = me_shard_get(me, s);
+ sh->cached_successor = successor;
+ if (cur_seq != sh->poll_last_slot_seq) {
+ sh->poll_last_slot_seq = cur_seq;
+ me_shard_become_holder(sh);
+ }
+
+ if (!sh->is_holder)
+ continue;
+
+ if (!ticked_hb) {
+ me_membership_tick_heartbeat(me);
+ ticked_hb = true;
+ }
+ if (me_shard_passable(me, s) && successor != me->node_id) {
+ u64 ts0 = ktime_get_ns();
+ marufs_me_pass_token(me, s, successor);
+ scan_ns_total += ktime_get_ns() - ts0;
+ }
+ }
+
+ u64 t2 = ktime_get_ns();
+ struct marufs_me_stats_pcpu *st = this_cpu_ptr(me->stats);
+ st->poll_ns_membership += t1 - t0;
+ st->poll_ns_scan += scan_ns_total;
+ st->poll_ns_doorbell += (t2 - t1) - scan_ns_total;
+}
+
+/* ── Acquire: wait for token arrival ──────────────────────────────── */
+
+static int order_acquire(struct marufs_me_instance *me, u32 shard_id)
+{
+ marufs_me_hold(me, shard_id);
+
+ int ret = marufs_me_wait_for_token(me, shard_id);
+ if (ret == 0) {
+ me_cb_bump_acquire_count(me_cb_get(me, shard_id));
+ return 0;
+ }
+
+ marufs_me_unhold(me, shard_id);
+ return ret;
+}
+
+/* ── Release: pass token directly if idle ─────────────────────────── */
+/*
+ * Direct pass bypasses the poll thread — rapid acquire/release cycles
+ * would otherwise starve other nodes (poll can't catch the brief idle
+ * window). Keep the token if local waiters exist to avoid cross-node
+ * ping-pong when one node dominates this shard.
+ */
+static void order_release(struct marufs_me_instance *me, u32 shard_id)
+{
+ struct marufs_me_shard *sh = me_shard_get(me, shard_id);
+ me_shard_unhold(sh);
+
+ if (me_shard_passable(me, shard_id)) {
+ u32 succ = sh->cached_successor;
+
+ if (succ == me->node_id) {
+ succ = marufs_me_next_active(me, me->node_id);
+ sh->cached_successor = succ;
+ }
+ if (succ != me->node_id)
+ marufs_me_pass_token(me, shard_id, succ);
+ }
+
+ me_stats_lock_released(me, sh);
+ mutex_unlock(&sh->local_lock);
+}
+
+/* ── Leave: token-gated cleanup ───────────────────────────────────── */
+
+/*
+ * order_leave - own the token on each shard (if possible), then pass.
+ *
+ * Success path: acquire forces sole-writer status for this node's slot,
+ * so handoff races only with our own code.
+ * Failure path: CB may still list us as holder (spurious timeout / ring
+ * stuck). If so we rewrite CB directly — the generation bump makes any
+ * in-flight doorbell from peers be discarded as stale.
+ */
+static void order_leave(struct marufs_me_instance *me)
+{
+ for (u32 s = 0; s < me->num_shards; s++) {
+ int ret = me->ops->acquire(me, s);
+ bool acquired = (ret == 0);
+ u32 holder;
+
+ if (acquired)
+ holder = me->node_id;
+ else
+ holder = me_cb_snapshot(&me->cbs[s], NULL);
+
+ if (holder == me->node_id)
+ marufs_me_pass_token(me, s, me_leave_successor(me));
+
+ if (acquired)
+ marufs_me_unhold(me, s);
+
+ me_shard_reset_holding(me_shard_get(me, s));
+ }
+
+ /* Clear membership last — after this, no peer will target our slot. */
+ struct marufs_me_membership_slot *slot = me_my_membership_get(me);
+ WRITE_LE32(slot->status, MARUFS_ME_NONE);
+ WRITE_LE64(slot->joined_at, 0);
+ MARUFS_CXL_WMB(slot, sizeof(*slot));
+
+ pr_info("me: node %u left ring (order-driven)\n", me->node_id);
+}
+
+const struct marufs_me_ops marufs_me_order_ops = {
+ .acquire = order_acquire,
+ .release = order_release,
+ .poll_cycle = order_poll_cycle,
+ .join = marufs_me_common_join,
+ .leave = order_leave,
+};
diff --git a/marufs_kernel/src/me_request.c b/marufs_kernel/src/me_request.c
new file mode 100644
index 0000000..b0ac54d
--- /dev/null
+++ b/marufs_kernel/src/me_request.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * me_request.c - Request-driven ME strategy
+ *
+ * Nodes signal lock demand via per-(shard, node) request slots.
+ * Holder scans request slots and grants to a requester by
+ * transferring the token (writing holder=requester in CB).
+ *
+ * When no requests are pending, the holder retains the token
+ * (unlike order-driven which always circulates).
+ *
+ * Request slot layout: slots[shard_id * max_nodes + node_id]
+ */
+
+#include
+#include
+
+#include "marufs.h"
+#include "me.h"
+#include "me_stats.h"
+
+/* ── Helpers ──────────────────────────────────────────────────────── */
+
+/*
+ * request_pass_token - if @node_idx is requesting, grant + pass token to it.
+ *
+ * Common tail of full-scan and masked-scan grant loops. Returns true iff
+ * the token was actually transferred.
+ */
+static bool request_pass_token(struct marufs_me_instance *me, u32 shard_id,
+ u32 node_idx)
+{
+ struct marufs_me_slot *rs = me_slot_of(me, shard_id, node_idx);
+ if (READ_CXL_LE32(rs->requesting)) {
+ u64 now = ktime_get_ns();
+ u64 requested_at = READ_CXL_LE64(rs->requested_at);
+ WRITE_LE64(rs->granted_at, now);
+ MARUFS_CXL_WMB(&rs->granted_at, sizeof(rs->granted_at));
+ me_stats_record_grant_age(me, now, requested_at);
+ marufs_me_pass_token(me, shard_id, node_idx + 1);
+ return true;
+ }
+ return false;
+}
+/*
+ * request_scan_and_grant - holder scans request slots, grants to first
+ * requester. Scan starts at (me_idx+1) wrapping — round-robin fairness
+ * prevents low-id starvation under sustained contention.
+ * Caller must be current holder, idle (hold counter 0).
+ * Returns true iff token was granted.
+ *
+ * Full scan variant — used by release path where mask pre-collection has
+ * no amortization benefit.
+ */
+static bool request_scan_and_grant(struct marufs_me_instance *me, u32 shard_id)
+{
+ for (u32 i = 1; i < me->max_nodes; i++) {
+ u32 node_idx = (me->me_idx + i) % me->max_nodes;
+ if (request_pass_token(me, shard_id, node_idx)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
+ * request_scan_and_grant_masked - filtered scan using pre-collected
+ * pending-mask snapshots from peers' membership CLs. Skips nodes whose
+ * bit for @shard_id is clear, avoiding the slot RMB entirely.
+ *
+ * Invariant: a set bit was written AFTER slot.requesting=1 with WMB, so
+ * reading requesting==0 on a node with bit set can only mean the peer
+ * just cleared its hand (post-CS) — benign, continue scan.
+ */
+static bool request_scan_and_grant_masked(struct marufs_me_instance *me,
+ u32 shard_id, const u64 *node_pending)
+{
+ u64 bit = 1ULL << shard_id;
+
+ for (u32 i = 1; i < me->max_nodes; i++) {
+ u32 node_idx = (me->me_idx + i) % me->max_nodes;
+ if (!(node_pending[node_idx] & bit))
+ continue;
+
+ if (request_pass_token(me, shard_id, node_idx)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline void request_clear_own(struct marufs_me_instance *me,
+ u32 shard_id)
+{
+ struct marufs_me_slot *rs = me_my_slot(me, shard_id);
+ WRITE_LE32(rs->requesting, 0);
+ MARUFS_CXL_WMB(rs, sizeof(*rs));
+ marufs_me_membership_clear_pending(me, shard_id);
+}
+
+/* ── Poll cycle ───────────────────────────────────────────────────── */
+
+static void request_poll_cycle(struct marufs_me_instance *me)
+{
+ /* Fault injection: simulate a crashed node — skip the entire cycle
+ * (no heartbeat, no grant, no doorbell handling).
+ */
+ if (atomic_read(&me->debug_freeze_poll))
+ return;
+
+ bool ticked_hb = false;
+ u64 node_pending[MARUFS_ME_MAX_NODES] = { 0 };
+ u64 peers_pending = 0;
+ u32 successor = me->node_id;
+ bool successor_found = false;
+
+ u64 t0 = ktime_get_ns();
+
+ /* Single membership pass: gather pending masks + pick round-robin
+ * successor. S×N → N membership RMBs/cycle vs. per-shard next_active.
+ */
+ for (u32 i = 1; i <= me->max_nodes; i++) {
+ u32 idx = (me->me_idx + i) % me->max_nodes;
+ struct marufs_me_membership_slot *ms =
+ me_membership_get(me, idx);
+
+ if (READ_CXL_LE32(ms->status) != MARUFS_ME_ACTIVE)
+ continue;
+
+ node_pending[idx] = READ_CXL_LE64(ms->pending_shards_mask);
+ if (idx == me->me_idx)
+ continue;
+
+ peers_pending |= node_pending[idx];
+ if (!successor_found) {
+ successor = idx + 1;
+ successor_found = true;
+ }
+ }
+
+ u64 t1 = ktime_get_ns();
+ u64 scan_ns_total = 0;
+
+ for (u32 s = 0; s < me->num_shards; s++) {
+ struct marufs_me_shard *sh = me_shard_get(me, s);
+ sh->cached_successor = successor;
+
+ /* Receiver doorbell: bump ⇒ peer granted token. Use the
+ * slot's cb_gen_at_write (already RMB'd via me_my_slot) to
+ * verify freshness — avoids hammering the shared CB cacheline. */
+ struct marufs_me_slot *my_slot = me_my_slot(me, s);
+ u64 cur_seq = READ_CXL_LE64(my_slot->token_seq);
+ if (cur_seq != sh->poll_last_slot_seq) {
+ sh->poll_last_slot_seq = cur_seq;
+ me_shard_become_holder(sh);
+ }
+
+ if (!sh->is_holder)
+ continue;
+
+ /* Holder path: tick heartbeat once, grant if passable. */
+ if (!ticked_hb) {
+ me_membership_tick_heartbeat(me);
+ ticked_hb = true;
+ }
+ if (me_shard_passable(me, s) && (peers_pending & (1ULL << s))) {
+ u64 ts0 = ktime_get_ns();
+ request_scan_and_grant_masked(me, s, node_pending);
+ scan_ns_total += ktime_get_ns() - ts0;
+ }
+ }
+
+ u64 t2 = ktime_get_ns();
+ struct marufs_me_stats_pcpu *st = this_cpu_ptr(me->stats);
+ st->poll_ns_membership += t1 - t0;
+ st->poll_ns_scan += scan_ns_total;
+ st->poll_ns_doorbell += (t2 - t1) - scan_ns_total;
+}
+
+/* ── Acquire: write request slot, wait for grant ──────────────────── */
+
+static int request_acquire(struct marufs_me_instance *me, u32 shard_id)
+{
+ marufs_me_hold(me, shard_id);
+
+ struct marufs_me_cb *cb = me_cb_get(me, shard_id);
+ if (READ_CXL_LE32(cb->holder) == me->node_id) {
+ me_cb_bump_acquire_count(cb);
+ return 0;
+ }
+
+ /* Raise hand, wait for grant. */
+ struct marufs_me_slot *rs = me_my_slot(me, shard_id);
+ WRITE_LE32(rs->sequence, READ_CXL_LE32(rs->sequence) + 1);
+ WRITE_LE32(rs->requesting, 1);
+ WRITE_LE64(rs->requested_at, ktime_get_ns());
+ MARUFS_CXL_WMB(rs, sizeof(*rs));
+ marufs_me_membership_set_pending(me, shard_id);
+
+ int ret = marufs_me_wait_for_token(me, shard_id);
+ request_clear_own(me, shard_id);
+
+ if (ret == 0) {
+ me_cb_bump_acquire_count(cb);
+ return 0;
+ }
+
+ marufs_me_unhold(me, shard_id);
+ return ret;
+}
+
+/* ── Release ──────────────────────────────────────────────────────── */
+
+static void request_release(struct marufs_me_instance *me, u32 shard_id)
+{
+ /* Cannot use marufs_me_unhold(): scan_and_grant must execute between
+ * unhold (so passable check sees holding=0) and mutex_unlock (so a
+ * concurrent local acquire doesn't interleave). */
+ struct marufs_me_shard *sh = me_shard_get(me, shard_id);
+ me_shard_unhold(sh);
+
+ if (me_shard_passable(me, shard_id))
+ request_scan_and_grant(me, shard_id);
+
+ me_stats_lock_released(me, sh);
+ mutex_unlock(&sh->local_lock);
+}
+
+const struct marufs_me_ops marufs_me_request_ops = {
+ .acquire = request_acquire,
+ .release = request_release,
+ .poll_cycle = request_poll_cycle,
+ .join = marufs_me_common_join,
+ .leave = marufs_me_common_leave,
+};
diff --git a/marufs_kernel/src/me_stats.h b/marufs_kernel/src/me_stats.h
new file mode 100644
index 0000000..038257e
--- /dev/null
+++ b/marufs_kernel/src/me_stats.h
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * me_stats.h - MARUFS ME fine-grained performance counters
+ *
+ * All per-CPU stats counters and their accessor helpers live here.
+ * Kept separate from me.h to stop the core ME header from ballooning
+ * as the instrumentation surface grows.
+ *
+ * Layering:
+ * me.h — core layout, forward-declares struct marufs_me_stats_pcpu
+ * and holds only the per-CPU pointer on the instance.
+ * me_stats.h — defines struct marufs_me_stats_pcpu and all helpers,
+ * includes me.h for struct marufs_me_shard/instance.
+ * TU files — include "me_stats.h" wherever they read/update stats.
+ *
+ * Overhead:
+ * Updates are non-atomic (`this_cpu_ptr` + plain `++`). A preempt /
+ * migration during update can lose or double-count a single event,
+ * which is acceptable for statistics. Sysfs readers aggregate over
+ * `for_each_possible_cpu`.
+ *
+ * Bucketing:
+ * Latency histograms use log2(ns) bucketing (12 buckets, <128ns ..
+ * >=128ms). Coarse enough for hot paths (fls64 + index), fine enough
+ * to separate p50/p99 regimes.
+ */
+
+#ifndef _MARUFS_ME_STATS_H
+#define _MARUFS_ME_STATS_H
+
+#include
+#include
+#include
+#include
+#include
+
+#include "me.h"
+
+/* ── Latency histogram layout ──────────────────────────────────────── */
+
+/*
+ * 12 buckets covers [<128ns] .. [>=128ms]. Bucket i holds counts where
+ * ns is in [2^(i+6), 2^(i+7)); bucket 0 catches all ns < 128, bucket 11
+ * saturates for ns >= 128ms.
+ */
+#define MARUFS_ME_LAT_BUCKETS 12
+#define MARUFS_ME_LAT_BUCKET_BASE_SHIFT 7 /* first bucket top = 2^7 = 128 */
+
+/* ── Per-CPU stats struct ──────────────────────────────────────────── */
+
+struct marufs_me_stats_pcpu {
+ /* wait_for_token */
+ u64 wait_count;
+ u64 wait_wall_ns;
+ u64 wait_cpu_ns;
+ u64 wait_spin_hit;
+ u64 wait_sleep_hit;
+ u64 wait_deadline_hit;
+ u64 wait_fast_hit; /* ME_IS_HOLDER early-return (no token wait) */
+ u64 wait_lat_buckets[MARUFS_ME_LAT_BUCKETS];
+
+ /* poll_cycle phase breakdown (ns per invocation, summed) */
+ u64 poll_ns_membership;
+ u64 poll_ns_doorbell;
+ u64 poll_ns_scan;
+
+ /* CS lock hold time (mutex_lock .. mutex_unlock span) */
+ u64 lock_hold_count;
+ u64 lock_hold_ns_total;
+ u64 lock_hold_buckets[MARUFS_ME_LAT_BUCKETS];
+
+ /* Request-mode grant age (granted_at - requested_at) */
+ u64 grant_age_count;
+ u64 grant_age_buckets[MARUFS_ME_LAT_BUCKETS];
+
+ /* Per-shard acquire count (hotspot detection). Capped to
+ * MARUFS_NRHT_MAX_NUM_SHARDS; larger shard_ids fold into the last
+ * bin rather than overflow.
+ */
+ u64 per_shard_acquire[MARUFS_NRHT_MAX_NUM_SHARDS];
+};
+
+enum marufs_me_wait_hit {
+ MARUFS_ME_WAIT_SPIN = 0,
+ MARUFS_ME_WAIT_SLEEP,
+ MARUFS_ME_WAIT_DEADLINE,
+};
+
+/*
+ * me_stats_cpu_ns - return current task's cumulative on-CPU ns.
+ *
+ * Reads struct sched_entity::sum_exec_runtime directly, which is updated
+ * at each scheduler tick (HZ granularity — ~1ms on typical configs).
+ * The proper accessor `task_sched_runtime()` forces a refresh but is not
+ * exported to modules, so we accept the tick-bounded staleness; the
+ * error caps at one tick per wait window, which is negligible compared
+ * to acquire deadlines measured in µs .. s.
+ */
+static inline u64 me_stats_cpu_ns(void)
+{
+ return current->se.sum_exec_runtime;
+}
+
+/* ── Bucket indexing ───────────────────────────────────────────────── */
+
+/*
+ * me_stats_lat_bucket - map ns → log2 bucket index in [0, nbuckets-1].
+ * Bucket 0 covers ns < 2^(BASE_SHIFT) (i.e., < 128ns); bucket n-1
+ * saturates at the top.
+ */
+static inline u32 me_stats_lat_bucket(u64 ns)
+{
+ if (ns < (1ULL << MARUFS_ME_LAT_BUCKET_BASE_SHIFT))
+ return 0;
+ u32 b = fls64(ns) - MARUFS_ME_LAT_BUCKET_BASE_SHIFT;
+ return min_t(u32, b, MARUFS_ME_LAT_BUCKETS - 1);
+}
+
+/* ── Accessor helpers ──────────────────────────────────────────────── */
+
+/*
+ * me_stats_wait_fast_hit - wait_for_token early exit via ME_IS_HOLDER.
+ * Called before any token-wait work; paired with me_stats_wait_done
+ * which handles the full-wait exit paths. Tracking the split reveals
+ * how often the intra-node fast path avoids ME traffic entirely.
+ */
+static inline void me_stats_wait_fast_hit(struct marufs_me_instance *me)
+{
+ struct marufs_me_stats_pcpu *st = this_cpu_ptr(me->stats);
+ st->wait_fast_hit++;
+}
+
+/*
+ * me_stats_wait_done - common exit accounting for wait_for_token.
+ * Records wall + on-CPU duration, slots the wall delta into the log2
+ * bucket, and bumps the phase counter for @hit.
+ */
+static inline void me_stats_wait_done(struct marufs_me_instance *me,
+ u64 wall_start, u64 cpu_start,
+ enum marufs_me_wait_hit hit)
+{
+ u64 wall_ns = ktime_get_ns() - wall_start;
+ u64 cpu_ns = me_stats_cpu_ns() - cpu_start;
+
+ /* Clamp cpu_ns to wall_ns: sum_exec_runtime updates at scheduler
+ * tick boundaries (~1ms) so a tick firing during a short wait may
+ * add pre-wait CS time into the measured delta, inflating cpu_ns
+ * above wall_ns and pushing the aggregate cpu_util above 100%.
+ * Clamping gives a physically valid upper bound; samples are still
+ * biased upward for sub-tick waits but no longer nonsensical.
+ */
+ if (cpu_ns > wall_ns)
+ cpu_ns = wall_ns;
+
+ struct marufs_me_stats_pcpu *st = this_cpu_ptr(me->stats);
+
+ st->wait_count++;
+ st->wait_wall_ns += wall_ns;
+ st->wait_cpu_ns += cpu_ns;
+ st->wait_lat_buckets[me_stats_lat_bucket(wall_ns)]++;
+ switch (hit) {
+ case MARUFS_ME_WAIT_SPIN:
+ st->wait_spin_hit++;
+ break;
+ case MARUFS_ME_WAIT_SLEEP:
+ st->wait_sleep_hit++;
+ break;
+ case MARUFS_ME_WAIT_DEADLINE:
+ st->wait_deadline_hit++;
+ break;
+ }
+}
+
+/*
+ * me_stats_lock_acquired - stash ktime just after mutex_lock on the
+ * shard struct. Paired with me_stats_lock_released. Valid only while
+ * the shard's local_lock is held (single owner).
+ */
+static inline void me_stats_lock_acquired(struct marufs_me_shard *sh)
+{
+ sh->lock_hold_start_ns = ktime_get_ns();
+}
+
+/*
+ * me_stats_lock_released - consume the paired timestamp and contribute
+ * a lock-hold-time sample. Must be called while still holding the
+ * lock, right before mutex_unlock.
+ */
+static inline void me_stats_lock_released(struct marufs_me_instance *me,
+ struct marufs_me_shard *sh)
+{
+ u64 hold_ns = ktime_get_ns() - sh->lock_hold_start_ns;
+ struct marufs_me_stats_pcpu *st = this_cpu_ptr(me->stats);
+
+ st->lock_hold_count++;
+ st->lock_hold_ns_total += hold_ns;
+ st->lock_hold_buckets[me_stats_lat_bucket(hold_ns)]++;
+}
+
+/*
+ * me_stats_bump_shard_acquire - per-shard acquire hotspot counter.
+ * Capped at MARUFS_NRHT_MAX_NUM_SHARDS - 1; over-sized shard_ids fold
+ * into the last bin rather than overflow.
+ */
+static inline void me_stats_bump_shard_acquire(struct marufs_me_instance *me,
+ u32 shard_id)
+{
+ struct marufs_me_stats_pcpu *st = this_cpu_ptr(me->stats);
+ u32 idx = min_t(u32, shard_id, MARUFS_NRHT_MAX_NUM_SHARDS - 1);
+ st->per_shard_acquire[idx]++;
+}
+
+/*
+ * me_stats_record_grant_age - request-mode grant path. @requested_at
+ * is read from the requester's slot under RMB; @now is the ktime
+ * captured at grant time.
+ */
+static inline void me_stats_record_grant_age(struct marufs_me_instance *me,
+ u64 now, u64 requested_at)
+{
+ if (now <= requested_at)
+ return; /* clock skew or empty slot — skip */
+ u64 age_ns = now - requested_at;
+ struct marufs_me_stats_pcpu *st = this_cpu_ptr(me->stats);
+
+ st->grant_age_count++;
+ st->grant_age_buckets[me_stats_lat_bucket(age_ns)]++;
+}
+
+#endif /* _MARUFS_ME_STATS_H */
diff --git a/marufs_kernel/src/nrht.c b/marufs_kernel/src/nrht.c
new file mode 100644
index 0000000..b0a0726
--- /dev/null
+++ b/marufs_kernel/src/nrht.c
@@ -0,0 +1,1182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * nrht.c - MARUFS Independent Name-Ref Hash Table (NRHT)
+ *
+ * NRHT is an independent CXL file that maps names to (offset, region_id)
+ * pairs. Unlike the global index where name-refs and region files share
+ * the same shards, each NRHT file has its own dedicated hash table with
+ * clean lifecycle management.
+ *
+ * Physical layout (within region data area):
+ * [NRHT Header 64B] [Shard Headers N×64B] [Shard 0 Data] [Shard 1 Data] ...
+ *
+ * Per-shard data:
+ * [Bucket Array (CL-aligned)] [Entry Array (128B each)]
+ *
+ * Each entry is 128B (2 CL):
+ * CL0 (bytes 0-63): hot path — state, next, hash, offset, region_id
+ * CL1 (bytes 64-127): cold path — name (accessed only on hash match)
+ *
+ * Entry state machine (same as global index):
+ * EMPTY --CAS--> INSERTING --WMB--> VALID --CAS--> TOMBSTONE
+ *
+ * Stale INSERTING detection: node_id + created_at timeout.
+ * Same-node or uninitialized (node==0) entries are reclaimed after timeout.
+ * Other nodes' entries are skipped — their GC handles them.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "nrht_stats.h"
+
+/* ============================================================================
+ * Shard view — transient CXL pointer set for one shard
+ * ============================================================================ */
+
+struct nrht_shard_ctx {
+ struct marufs_nrht_shard_header *header;
+ struct marufs_nrht_entry *entries;
+ u32 *bucket_head;
+ u32 num_entries;
+ u32 shard_id; /* needed for ME acquire/release */
+ /* Direct handle to sbi-level per-CPU stats — avoids threading
+ * sbi through just for instrumentation. Set once in ctx init.
+ */
+ struct marufs_nrht_stats_pcpu __percpu *stats;
+};
+
+/* ============================================================================
+ * Low-level helpers
+ * ============================================================================ */
+
+static inline struct marufs_nrht_shard_header *
+nrht_shard_header(void *nrht_base, u32 shard_id)
+{
+ struct marufs_nrht_shard_header *sh =
+ (struct marufs_nrht_shard_header
+ *)((char *)nrht_base +
+ sizeof(struct marufs_nrht_header) +
+ (u64)shard_id * sizeof(*sh));
+ MARUFS_CXL_RMB(sh, sizeof(*sh));
+ return sh;
+}
+
+/*
+ * nrht_name_matches - check if entry matches (hash, name).
+ * Caller must have issued RMB on both CL0 and CL1 before calling.
+ */
+static inline bool nrht_name_matches(struct marufs_nrht_entry *e, u64 hash,
+ const char *name, size_t namelen)
+{
+ MARUFS_CXL_RMB(e->name, sizeof(e->name));
+ if (READ_CXL_LE64(e->name_hash) != hash)
+ return false;
+ if (strncmp(e->name, name, namelen) != 0)
+ return false;
+ if (namelen < sizeof(e->name) && e->name[namelen] != '\0')
+ return false;
+ return true;
+}
+
+/*
+ * 1 - pure staleness check for INSERTING entry.
+ *
+ * Only handles the clear-cut case: same-node + created_at > 0 + timeout.
+ * No side effects — caller handles CAS reclaim and chain unlink.
+ * Indeterminate cases (node==0, created_at==0) return 0; the GC sweep
+ * in marufs_nrht_gc_sweep_all() tracks them via marufs_gc_track_orphan().
+ *
+ * Returns:
+ * 1 = stale, caller should reclaim
+ * 0 = not stale or indeterminate (orphan/timestamp not visible)
+ * -1 = not this node's entry
+ */
+static int nrht_is_stale(struct marufs_sb_info *sbi,
+ struct marufs_nrht_entry *e)
+{
+ u32 ins_node = READ_CXL_LE32(e->inserter_node);
+ if (ins_node == 0)
+ return marufs_is_admin_node(sbi) ?
+ 0 :
+ -1; /* Only admin node tracks orphans */
+
+ if (ins_node != sbi->node_id)
+ return -1;
+
+ u64 created_at = READ_CXL_LE64(e->created_at);
+ if (created_at == 0)
+ return 0; /* timestamp not yet visible — tracked by GC */
+
+ u64 now = ktime_get_real_ns();
+ if (now <= created_at || (now - created_at) <= MARUFS_STALE_TIMEOUT_NS)
+ return 0;
+
+ return 1;
+}
+
+/* CAS current state → INSERTING and stamp inserter identity for GC. */
+static inline bool nrht_claim_entry(struct marufs_sb_info *sbi,
+ struct marufs_nrht_entry *e)
+{
+ u32 st = READ_LE32(e->state);
+ if (st != MARUFS_ENTRY_EMPTY && st != MARUFS_ENTRY_TOMBSTONE)
+ return false;
+ if (marufs_le32_cas(&e->state, st, MARUFS_ENTRY_INSERTING) != st)
+ return false;
+
+ WRITE_LE64(e->created_at, ktime_get_real_ns());
+ WRITE_LE32(e->inserter_node, sbi->node_id);
+ return true;
+}
+
+/* ============================================================================
+ * Header / shard resolution
+ * ============================================================================ */
+
+static struct marufs_nrht_header *nrht_get_header(struct marufs_sb_info *sbi,
+ u32 nrht_region_id)
+{
+ struct marufs_rat_entry *rat_e =
+ marufs_rat_entry_get(sbi, nrht_region_id);
+ if (!rat_e)
+ return NULL;
+
+ if (READ_CXL_LE32(rat_e->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ return NULL;
+
+ u64 phys_offset = READ_CXL_LE64(rat_e->phys_offset);
+ u64 region_size = READ_CXL_LE64(rat_e->size);
+ struct marufs_nrht_header *hdr = marufs_dax_ptr(sbi, phys_offset);
+ if (phys_offset == 0 || region_size < sizeof(*hdr) || !hdr ||
+ !marufs_dax_range_valid(sbi, phys_offset, sizeof(*hdr)))
+ return NULL;
+
+ MARUFS_CXL_RMB(hdr, sizeof(*hdr));
+ if (READ_CXL_LE32(hdr->magic) != MARUFS_NRHT_MAGIC ||
+ READ_CXL_LE32(hdr->version) != MARUFS_NRHT_VERSION ||
+ READ_CXL_LE64(hdr->table_size) > region_size) {
+ pr_err("nrht: invalid header (magic=0x%x ver=%u table_size=%llu region=%llu)\n",
+ READ_CXL_LE32(hdr->magic), READ_CXL_LE32(hdr->version),
+ READ_CXL_LE64(hdr->table_size), region_size);
+ return NULL;
+ }
+
+ return hdr;
+}
+
+static int nrht_get_shard_ctx(struct marufs_sb_info *sbi,
+ struct marufs_nrht_header *nrht, u32 shard_id,
+ struct nrht_shard_ctx *out)
+{
+ struct marufs_nrht_shard_header *sh = nrht_shard_header(nrht, shard_id);
+
+ out->header = sh;
+ out->stats = sbi->nrht_stats;
+ out->num_entries = READ_CXL_LE32(sh->num_entries);
+
+ u32 num_buckets = READ_CXL_LE32(sh->num_buckets);
+ if (num_buckets == 0 || out->num_entries == 0 ||
+ out->num_entries >
+ MARUFS_NRHT_MAX_ENTRIES * MARUFS_NRHT_MAX_NUM_SHARDS) {
+ pr_err("nrht: shard %u invalid (buckets=%u entries=%u)\n",
+ shard_id, num_buckets, out->num_entries);
+ return -EIO;
+ }
+
+ u64 entry_off = READ_CXL_LE64(sh->entry_array_offset);
+ u64 entry_size =
+ (u64)out->num_entries * sizeof(struct marufs_nrht_entry);
+
+ if (entry_off == 0 ||
+ !marufs_dax_range_valid(sbi, entry_off, entry_size)) {
+ pr_err("nrht: shard %u invalid entry offset 0x%llx\n", shard_id,
+ entry_off);
+ return -EIO;
+ }
+
+ out->entries = marufs_dax_ptr(sbi, entry_off);
+ return 0;
+}
+
+/*
+ * nrht_resolve_bucket - common prologue for all NRHT operations.
+ * Resolves header, selects shard, populates ctx including bucket_head.
+ * Auto-computes hash if name_hash is 0.
+ */
+static int nrht_resolve_bucket(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 *name_hash,
+ struct nrht_shard_ctx *ctx)
+{
+ if (*name_hash == 0)
+ *name_hash = marufs_hash_name(name, namelen);
+
+ struct marufs_nrht_header *nrht = nrht_get_header(sbi, nrht_region_id);
+ if (!nrht)
+ return -ENOENT;
+
+ u32 shard_mask = READ_CXL_LE32(nrht->num_shards) - 1;
+ u32 shard_id = marufs_shard_idx(*name_hash, shard_mask);
+ int ret = nrht_get_shard_ctx(sbi, nrht, shard_id, ctx);
+ if (ret)
+ return ret;
+ ctx->shard_id = shard_id;
+
+ /* Resolve bucket head pointer */
+ u32 num_buckets = READ_CXL_LE32(ctx->header->num_buckets);
+ u64 bucket_off = READ_CXL_LE64(ctx->header->bucket_array_offset);
+ u64 bucket_size = marufs_align_up((u64)num_buckets * sizeof(u32), 64);
+ if (bucket_off == 0 ||
+ !marufs_dax_range_valid(sbi, bucket_off, bucket_size))
+ return -EIO;
+
+ u32 *buckets = marufs_dax_ptr(sbi, bucket_off);
+ u32 bucket_idx = marufs_bucket_idx(*name_hash, num_buckets - 1);
+ ctx->bucket_head = &buckets[bucket_idx];
+
+ return 0;
+}
+
+/* ============================================================================
+ * Chain walk operations
+ * ============================================================================ */
+
+/*
+ * nrht_check_duplicate - walk bucket chain, check for duplicate name.
+ *
+ * Records first dead entry (TOMBSTONE or EMPTY) as in-place reuse candidate.
+ * Additional dead entries are inline-unlinked and reclaimed to EMPTY.
+ * Consistent with index.c check_duplicate pattern.
+ *
+ * @reuse_idx: output — first reusable entry index, or BUCKET_END.
+ * Return: 0 if no duplicate, -EEXIST if exists, -EIO on chain cycle.
+ */
+static int nrht_check_duplicate(struct marufs_sb_info *sbi,
+ struct nrht_shard_ctx *ctx, u64 hash,
+ const char *name, size_t namelen,
+ u32 *reuse_idx)
+{
+ u32 *prev_next = ctx->bucket_head;
+ MARUFS_CXL_RMB(prev_next, sizeof(*prev_next));
+ u32 cur = READ_CXL_LE32(*prev_next);
+ *reuse_idx = MARUFS_BUCKET_END;
+ u32 steps = 0;
+
+ while (cur != MARUFS_BUCKET_END && cur < ctx->num_entries) {
+ if (++steps > ctx->num_entries) {
+ pr_err("nrht: chain cycle detected\n");
+ return -EIO;
+ }
+
+ struct marufs_nrht_entry *e = &ctx->entries[cur];
+ MARUFS_CXL_RMB(e, 64);
+ u32 st = READ_CXL_LE32(e->state);
+ u32 next = READ_CXL_LE32(e->next_in_bucket);
+
+ /* Dead entry (TOMBSTONE or EMPTY): reuse first, unlink rest */
+ if (st == MARUFS_ENTRY_TOMBSTONE || st == MARUFS_ENTRY_EMPTY) {
+ if (*reuse_idx == MARUFS_BUCKET_END) {
+ *reuse_idx = cur;
+ prev_next = (u32 *)&e->next_in_bucket;
+ } else {
+ if (marufs_le32_cas(prev_next, cur, next) ==
+ cur)
+ marufs_le32_cas(&e->state, st,
+ MARUFS_ENTRY_EMPTY);
+ }
+ cur = next;
+ continue;
+ }
+
+ if (st == MARUFS_ENTRY_VALID &&
+ READ_CXL_LE64(e->name_hash) == hash &&
+ nrht_name_matches(e, hash, name, namelen))
+ return -EEXIST;
+
+ prev_next = (u32 *)&e->next_in_bucket;
+ cur = next;
+ }
+
+ return 0;
+}
+
+/*
+ * nrht_find_chain - walk bucket chain to find a VALID entry by name.
+ * Returns entry pointer and sets *out_idx, *out_prev_next. NULL if not found.
+ */
+static struct marufs_nrht_entry *nrht_find_chain(struct nrht_shard_ctx *ctx,
+ u64 hash, const char *name,
+ size_t namelen, u32 *out_idx,
+ u32 **out_prev_next)
+{
+ u32 *prev_next = ctx->bucket_head;
+ MARUFS_CXL_RMB(prev_next, sizeof(*prev_next));
+ u32 cur = READ_CXL_LE32(*prev_next);
+ u32 steps = 0;
+
+ while (cur != MARUFS_BUCKET_END && cur < ctx->num_entries) {
+ if (++steps > ctx->num_entries) {
+ pr_err("nrht: chain cycle detected\n");
+ nrht_stats_record_chain_depth(ctx->stats, steps);
+ return NULL;
+ }
+
+ struct marufs_nrht_entry *e = &ctx->entries[cur];
+ MARUFS_CXL_RMB(e, 64);
+ u32 next = READ_CXL_LE32(e->next_in_bucket);
+ u32 state = READ_CXL_LE32(e->state);
+
+ if (next != MARUFS_BUCKET_END && next < ctx->num_entries)
+ prefetch(&ctx->entries[next]);
+
+ if (state == MARUFS_ENTRY_VALID) {
+ if (nrht_name_matches(e, hash, name, namelen)) {
+ if (out_idx)
+ *out_idx = cur;
+ if (out_prev_next)
+ *out_prev_next = prev_next;
+ nrht_stats_record_chain_depth(ctx->stats,
+ steps);
+ return e;
+ }
+ }
+
+ prev_next = (u32 *)&e->next_in_bucket;
+ cur = next;
+ }
+
+ nrht_stats_record_chain_depth(ctx->stats, steps);
+ return NULL;
+}
+
+/* ============================================================================
+ * Slot acquisition + link + publish
+ * ============================================================================ */
+
+/*
+ * marufs_nrht_me_get - lazily create/return per-NRHT ME instance for this node.
+ * The first access by this node creates the instance, joins membership,
+ * and registers with the sbi-level poll thread.
+ */
+static struct marufs_me_instance *marufs_nrht_me_get(struct marufs_sb_info *sbi,
+ u32 nrht_region_id)
+{
+ struct marufs_me_instance *nme;
+ struct marufs_rat_entry *rat_e;
+ void *base;
+ struct marufs_nrht_header *hdr;
+ void *me_base;
+ u32 num_shards;
+ int ret;
+
+ if (nrht_region_id >= MARUFS_MAX_RAT_ENTRIES)
+ return ERR_PTR(-EINVAL);
+
+ /* Fast path: already initialized AND CXL format generation matches.
+ * If the underlying CXL ME area has been reformatted (e.g. a peer
+ * mount re-ran nrht_init on a recycled RAT slot), our cached
+ * instance is stale and must be dropped so we re-join the fresh ring.
+ */
+ nme = READ_ONCE(sbi->nrht_me[nrht_region_id]);
+ if (nme) {
+ struct marufs_me_header *h = me_header_get(nme);
+ if (READ_CXL_LE64(h->format_generation) ==
+ nme->cached_generation)
+ return nme;
+ }
+
+ mutex_lock(&sbi->nrht_me_lock);
+ nme = sbi->nrht_me[nrht_region_id];
+ if (nme) {
+ struct marufs_me_header *h = me_header_get(nme);
+ if (READ_CXL_LE64(h->format_generation) ==
+ nme->cached_generation)
+ goto out; /* fresh cached, return as-is */
+
+ /* Stale: drop it and fall through to recreate. */
+ pr_info("nrht: ME instance for region %u stale (gen changed), reinit\n",
+ nrht_region_id);
+ sbi->nrht_me[nrht_region_id] = NULL;
+ marufs_me_invalidate(sbi, nme);
+ nme = NULL;
+ }
+
+ /* Resolve NRHT base + ME area */
+ rat_e = marufs_rat_entry_get(sbi, nrht_region_id);
+ if (!rat_e ||
+ READ_CXL_LE32(rat_e->state) != MARUFS_RAT_ENTRY_ALLOCATED) {
+ nme = ERR_PTR(-ENOENT);
+ goto out;
+ }
+ base = marufs_dax_ptr(sbi, READ_CXL_LE64(rat_e->phys_offset));
+ if (!base) {
+ nme = ERR_PTR(-EIO);
+ goto out;
+ }
+ hdr = base;
+ MARUFS_CXL_RMB(hdr, sizeof(*hdr));
+ num_shards = READ_CXL_LE32(hdr->num_shards);
+ me_base = (char *)base + sizeof(struct marufs_nrht_header) +
+ (u64)num_shards * sizeof(struct marufs_nrht_shard_header);
+
+ /* Read strategy from ME header (formatted during nrht_init) */
+ struct marufs_me_header *me_hdr = me_base;
+ MARUFS_CXL_RMB(me_hdr, sizeof(*me_hdr));
+ enum marufs_me_strategy strat = READ_CXL_LE32(me_hdr->strategy);
+
+ nme = marufs_me_create(me_base, num_shards, MARUFS_ME_MAX_NODES,
+ sbi->node_id, MARUFS_ME_DEFAULT_POLL_US, strat);
+ if (IS_ERR(nme))
+ goto out;
+
+ ret = nme->ops->join(nme);
+ if (ret) {
+ marufs_me_destroy(nme);
+ nme = ERR_PTR(ret);
+ goto out;
+ }
+
+ marufs_me_register(sbi, nme);
+
+ /* Publish via WRITE_ONCE; fast path uses READ_ONCE */
+ WRITE_ONCE(sbi->nrht_me[nrht_region_id], nme);
+
+ pr_info("nrht: ME instance created for region %u (strategy=%s, shards=%u)\n",
+ nrht_region_id, strat == MARUFS_ME_ORDER ? "order" : "request",
+ num_shards);
+
+out:
+ mutex_unlock(&sbi->nrht_me_lock);
+ return nme;
+}
+
+/*
+ * Caller MUST hold the NRHT ME shard lock — no other mutator can touch
+ * bucket_head concurrently:
+ * - link_to_bucket / check_duplicate unlink / post_insert_dedup: all under
+ * the same ME shard lock in the insert path (this thread).
+ * - lookup: read-only on bucket chain.
+ * - delete: only CAS-es entry->state, does not touch bucket_head / next_in_bucket.
+ * - gc: does not touch bucket structure.
+ * So the CAS on bucket_head can be a plain WRITE.
+ */
+static int nrht_link_to_bucket(struct marufs_nrht_entry *entry, u32 entry_idx,
+ u32 *bucket_head)
+{
+ MARUFS_CXL_RMB(bucket_head, sizeof(*bucket_head));
+ u32 old_head = READ_CXL_LE32(*bucket_head);
+
+ if (unlikely(old_head == entry_idx)) {
+ /* Already the bucket head (stale link from previous life).
+ * Chain successor is intact — just skip linking. */
+ return 0;
+ }
+
+ WRITE_LE32(entry->next_in_bucket, old_head);
+ MARUFS_CXL_WMB(entry, 64);
+
+ WRITE_LE32(*bucket_head, entry_idx);
+ MARUFS_CXL_WMB(bucket_head, sizeof(*bucket_head));
+ return 0;
+}
+
+/*
+ * nrht_post_insert_dedup - detect duplicate under shard lock.
+ * Called after link_to_bucket while holding the shard lock. Walks the
+ * bucket chain looking for another VALID entry with the same name.
+ * If found, returns -EEXIST so the caller can tombstone and release.
+ */
+static int nrht_post_insert_dedup(struct nrht_shard_ctx *ctx,
+ struct marufs_nrht_entry *entry,
+ u32 entry_idx, u64 hash, const char *name,
+ size_t namelen)
+{
+ u32 *head = ctx->bucket_head;
+ MARUFS_CXL_RMB(head, sizeof(*head));
+ u32 cur = READ_CXL_LE32(*head);
+ u32 steps = 0;
+
+ while (cur != MARUFS_BUCKET_END && cur < ctx->num_entries) {
+ if (++steps > ctx->num_entries) {
+ pr_err("nrht: chain cycle detected\n");
+ return -EIO;
+ }
+
+ struct marufs_nrht_entry *e = &ctx->entries[cur];
+ MARUFS_CXL_RMB(e, 64);
+ u32 next = READ_CXL_LE32(e->next_in_bucket);
+
+ if (entry_idx == cur) {
+ cur = next;
+ continue;
+ }
+
+ if (next != MARUFS_BUCKET_END && next < ctx->num_entries)
+ prefetch(&ctx->entries[next]);
+
+ u32 state = READ_CXL_LE32(e->state);
+ if (state == MARUFS_ENTRY_VALID &&
+ nrht_name_matches(e, hash, name, namelen)) {
+ return -EEXIST;
+ }
+
+ cur = next;
+ }
+
+ return 0;
+}
+
+/* ============================================================================
+ * Public API
+ * ============================================================================ */
+int marufs_nrht_init(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ u32 max_entries, u32 num_shards, u32 num_buckets,
+ enum marufs_me_strategy me_strategy)
+{
+ struct marufs_rat_entry *rat_e =
+ marufs_rat_entry_get(sbi, nrht_region_id);
+ if (!rat_e)
+ return -ENOENT;
+ if (READ_CXL_LE32(rat_e->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ return -EINVAL;
+
+ /* ── Step 1: validate params + compute total size ────────────── */
+ if (num_shards == 0)
+ num_shards = MARUFS_NRHT_DEFAULT_NUM_SHARDS;
+ if (max_entries == 0)
+ max_entries = MARUFS_NRHT_DEFAULT_NUM_SHARDS *
+ MARUFS_NRHT_DEFAULT_ENTRIES;
+
+ if (!is_power_of_2(num_shards) ||
+ (num_shards > MARUFS_NRHT_MAX_NUM_SHARDS))
+ return -EINVAL;
+
+ if (!is_power_of_2(max_entries) ||
+ (max_entries >
+ MARUFS_NRHT_MAX_ENTRIES * MARUFS_NRHT_MAX_NUM_SHARDS)) {
+ pr_err("nrht_init: max_entries %u exceeds limit\n",
+ max_entries);
+ return -EINVAL;
+ }
+
+ u32 entries_per_shard = max_entries / num_shards;
+ if (entries_per_shard == 0)
+ return -EINVAL;
+
+ u32 buckets_per_shard;
+ if (num_buckets != 0) {
+ buckets_per_shard = num_buckets / num_shards;
+ if (buckets_per_shard == 0)
+ buckets_per_shard = 1;
+ } else {
+ buckets_per_shard =
+ entries_per_shard / MARUFS_NRHT_DEFAULT_LOAD_FACTOR;
+ if (buckets_per_shard == 0)
+ buckets_per_shard = 1;
+ }
+ if (buckets_per_shard > MARUFS_NRHT_MAX_ENTRIES)
+ return -EINVAL;
+ buckets_per_shard = roundup_pow_of_two(buckets_per_shard);
+ if (buckets_per_shard == 0 ||
+ buckets_per_shard > MARUFS_NRHT_MAX_ENTRIES)
+ return -EINVAL;
+
+ u64 bucket_array_size =
+ marufs_align_up((u64)buckets_per_shard * sizeof(u32), 64);
+ u64 per_shard_size =
+ bucket_array_size +
+ (u64)entries_per_shard * sizeof(struct marufs_nrht_entry);
+ u64 shard_headers_end =
+ sizeof(struct marufs_nrht_header) +
+ (u64)num_shards * sizeof(struct marufs_nrht_shard_header);
+
+ /* NRHT ME area placed between shard headers and shard data.
+ * Always sized as REQUEST to support both strategies without re-format.
+ */
+ u64 me_area_size = marufs_me_area_size(num_shards, MARUFS_ME_MAX_NODES);
+ u64 me_area_offset = shard_headers_end;
+ u64 shard_data_start = me_area_offset + me_area_size;
+
+ u64 total_needed = shard_data_start + (u64)num_shards * per_shard_size;
+
+ /* ── Step 2: allocate physical memory if not yet done ────────── */
+ u64 phys_offset = READ_CXL_LE64(rat_e->phys_offset);
+ u64 region_size = READ_CXL_LE64(rat_e->size);
+ bool freshly_allocated = false;
+ if (phys_offset == 0 || region_size == 0) {
+ /* No ftruncate — allocate from init directly */
+ int ret = marufs_region_init(sbi, nrht_region_id, total_needed);
+ if (ret) {
+ pr_err("nrht_init: region_init failed: %d\n", ret);
+ return ret;
+ }
+
+ /* Re-read RAT entry after allocation */
+ rat_e = marufs_rat_entry_get(sbi, nrht_region_id);
+ if (!rat_e)
+ return -EIO;
+
+ phys_offset = READ_CXL_LE64(rat_e->phys_offset);
+ region_size = READ_CXL_LE64(rat_e->size);
+ freshly_allocated = true;
+ }
+
+ if (total_needed > region_size) {
+ pr_err("nrht_init: need %llu bytes, have %llu\n", total_needed,
+ region_size);
+ return -ENOSPC;
+ }
+
+ /* Register in DRAM bitmap for fast GC enumeration */
+ set_bit(nrht_region_id, sbi->gc_nrht_bitmap);
+
+ /* ── Step 3: resolve base pointer + double-init check ────────── */
+ if (!marufs_dax_range_valid(sbi, phys_offset, region_size))
+ return -EINVAL;
+
+ void *base = marufs_dax_ptr(sbi, phys_offset);
+ if (!base)
+ return -EINVAL;
+
+ /* Double-init protection: check the RAT entry's region_type rather than
+ * probing physical data for magic. rat_entry_reset() zeroes region_type
+ * to MARUFS_REGION_DATA on every alloc/free, so REGION_NRHT here means
+ * nrht_init already ran on THIS lifecycle — genuine double-init.
+ * Stale NRHT magic in recycled CXL physical space is safely ignored.
+ * NOTE: region_type is set to NRHT *after* format (below), so this
+ * check only triggers on a second call, not the first. */
+ if (!freshly_allocated &&
+ READ_CXL_LE32(rat_e->region_type) == MARUFS_REGION_NRHT) {
+ pr_err("nrht_init: region %u already formatted\n",
+ nrht_region_id);
+ return -EEXIST;
+ }
+
+ /* Invalidate any stale per-node ME instance cached for this rat_id.
+ * Happens when a RAT entry is freed and later reallocated for a new
+ * NRHT file — without this, sbi->nrht_me[rat_id] points to a stale
+ * instance whose DRAM state (holding, heartbeat, cached_successor)
+ * does not match the freshly-formatted CXL area.
+ */
+ mutex_lock(&sbi->nrht_me_lock);
+ struct marufs_me_instance *stale = sbi->nrht_me[nrht_region_id];
+ sbi->nrht_me[nrht_region_id] = NULL;
+ marufs_me_invalidate(sbi, stale);
+ mutex_unlock(&sbi->nrht_me_lock);
+
+ /* Zero + format */
+ memset(base, 0, (size_t)total_needed);
+ MARUFS_CXL_WMB(base, total_needed);
+
+ struct marufs_nrht_header *hdr = base;
+ WRITE_LE32(hdr->magic, MARUFS_NRHT_MAGIC);
+ WRITE_LE32(hdr->version, MARUFS_NRHT_VERSION);
+ WRITE_LE32(hdr->num_shards, num_shards);
+ WRITE_LE32(hdr->buckets_per_shard, buckets_per_shard);
+ WRITE_LE32(hdr->entries_per_shard, entries_per_shard);
+ WRITE_LE32(hdr->owner_region_id, nrht_region_id);
+ WRITE_LE64(hdr->table_size, total_needed);
+ MARUFS_CXL_WMB(hdr, sizeof(*hdr));
+
+ /* Format NRHT ME area (between shard headers and shard data) */
+ int me_ret = marufs_me_format((char *)base + me_area_offset, num_shards,
+ MARUFS_ME_MAX_NODES,
+ MARUFS_ME_DEFAULT_POLL_US, me_strategy);
+ if (me_ret) {
+ pr_err("nrht_init: me_format failed: %d\n", me_ret);
+ return me_ret;
+ }
+
+ /* Make the initiator the first holder and ACTIVE member
+ * atomically from any peer's perspective — peers that race
+ * through me_get will see a non-empty ring and will not
+ * self-elect as first_node. This removes the start-up window
+ * where holder=NONE would be visible.
+ */
+ char *me_base = (char *)base + me_area_offset;
+ struct marufs_me_header *mh = (struct marufs_me_header *)me_base;
+ MARUFS_CXL_RMB(mh, sizeof(*mh));
+ u64 cb_off = READ_CXL_LE64(mh->cb_array_offset);
+ u64 mem_off = READ_CXL_LE64(mh->membership_offset);
+ u64 slot_off = READ_CXL_LE64(mh->request_offset);
+ u32 mnodes = READ_CXL_LE32(mh->max_nodes);
+
+ for (u32 s = 0; s < num_shards; s++) {
+ struct marufs_me_cb *cb = marufs_me_cb_at(me_base, cb_off, s);
+ u64 new_gen = READ_CXL_LE64(cb->generation) + 1;
+ WRITE_LE32(cb->holder, sbi->node_id);
+ WRITE_LE64(cb->generation, new_gen);
+ MARUFS_CXL_WMB(cb, sizeof(*cb));
+
+ /* Ring the initiator's doorbell so the first acquire's
+ * wait_for_token fast path sees a fresh seq + generation.
+ */
+ struct marufs_me_slot *ms = marufs_me_slot_at(
+ me_base, slot_off, mnodes, s, sbi->node_id - 1);
+ WRITE_LE32(ms->from_node, sbi->node_id);
+ WRITE_LE64(ms->cb_gen_at_write, new_gen);
+ WRITE_LE64(ms->token_seq, READ_CXL_LE64(ms->token_seq) + 1);
+ MARUFS_CXL_WMB(ms, sizeof(*ms));
+ }
+ /* slot[i] is for external node_id (i+1); index by (node_id - 1). */
+ struct marufs_me_membership_slot *my_slot =
+ marufs_me_membership_at(me_base, mem_off, sbi->node_id - 1);
+ WRITE_LE32(my_slot->status, MARUFS_ME_ACTIVE);
+ WRITE_LE32(my_slot->node_id, sbi->node_id);
+ WRITE_LE64(my_slot->joined_at, ktime_get_ns());
+ WRITE_LE64(my_slot->heartbeat, 0);
+ WRITE_LE64(my_slot->heartbeat_ts, ktime_get_ns());
+ MARUFS_CXL_WMB(my_slot, sizeof(*my_slot));
+
+ u64 offset = shard_data_start;
+ for (u32 s = 0; s < num_shards; s++) {
+ struct marufs_nrht_shard_header *sh =
+ nrht_shard_header(base, s);
+ if (!sh)
+ return -EINVAL;
+
+ u64 shard_bucket_off = phys_offset + offset;
+ u64 shard_entry_off = shard_bucket_off + bucket_array_size;
+
+ WRITE_LE32(sh->num_entries, entries_per_shard);
+ WRITE_LE32(sh->num_buckets, buckets_per_shard);
+ WRITE_LE64(sh->bucket_array_offset, shard_bucket_off);
+ WRITE_LE64(sh->entry_array_offset, shard_entry_off);
+ MARUFS_CXL_WMB(sh, sizeof(*sh));
+
+ u32 *bkts = (u32 *)((char *)base + offset);
+ for (u32 i = 0; i < buckets_per_shard; i++)
+ WRITE_LE32(bkts[i], MARUFS_BUCKET_END);
+ MARUFS_CXL_WMB(bkts, bucket_array_size);
+
+ offset += per_shard_size;
+ }
+
+ /* Tag region as NRHT for GC discovery and double-init protection.
+ * Must be AFTER format so the double-init check above doesn't
+ * see our own write on the first call. */
+ WRITE_LE32(rat_e->region_type, MARUFS_REGION_NRHT);
+ MARUFS_CXL_WMB(rat_e, 64);
+
+ pr_info("nrht_init: region %u, %u shards, %u entries/shard, %u buckets/shard, %llu bytes\n",
+ nrht_region_id, num_shards, entries_per_shard,
+ buckets_per_shard, total_needed);
+
+ /* Join the initiating node to the freshly-formatted ring so the
+ * first insert (on any node) has at least one ACTIVE member and
+ * the initial token holder is a real node_id — not left as NONE.
+ * Without this, insert from a peer mount would race through
+ * first_node detection on an empty ring, which on a very short
+ * retry window can miss a concurrent joiner. Also ensures crash
+ * recovery (next_active) has a candidate.
+ */
+ {
+ struct marufs_me_instance *nme =
+ marufs_nrht_me_get(sbi, nrht_region_id);
+ if (IS_ERR(nme))
+ pr_warn("nrht_init: initiator join failed for region %u: %ld\n",
+ nrht_region_id, PTR_ERR(nme));
+ }
+
+ return 0;
+}
+
+int marufs_nrht_join(struct marufs_sb_info *sbi, u32 nrht_region_id)
+{
+ struct marufs_me_instance *nme =
+ marufs_nrht_me_get(sbi, nrht_region_id);
+
+ return IS_ERR(nme) ? PTR_ERR(nme) : 0;
+}
+
+int marufs_nrht_insert(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u64 offset, u32 target_region_id)
+{
+ if (!sbi || !name)
+ return -EINVAL;
+ if (namelen == 0 || namelen > MARUFS_NAME_MAX)
+ return -ENAMETOOLONG;
+
+ /* Validate target region */
+ struct marufs_rat_entry *tr =
+ marufs_rat_entry_get(sbi, target_region_id);
+ if (!tr || READ_CXL_LE32(tr->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ return -EINVAL;
+
+ u64 target_size = READ_CXL_LE64(tr->size);
+ if (target_size > 0 && offset >= target_size)
+ return -EINVAL;
+
+ struct nrht_shard_ctx ctx;
+ int ret = nrht_resolve_bucket(sbi, nrht_region_id, name, namelen,
+ &name_hash, &ctx);
+ if (ret)
+ return ret;
+
+ /* Step 1: duplicate check (also inline-unlinks TOMBSTONE, records EMPTY reuse) */
+ u32 chain_reuse_idx = MARUFS_BUCKET_END;
+ ret = nrht_check_duplicate(sbi, &ctx, name_hash, name, namelen,
+ &chain_reuse_idx);
+ if (ret)
+ return ret;
+
+ /* Step 2a: reuse dead chain entry — TOMBSTONE or EMPTY (skip flat scan + link) */
+ bool reused_chain_entry = false;
+ u32 entry_idx;
+ struct marufs_nrht_entry *entry = NULL;
+ if (chain_reuse_idx != MARUFS_BUCKET_END) {
+ struct marufs_nrht_entry *e = &ctx.entries[chain_reuse_idx];
+ if (nrht_claim_entry(sbi, e)) {
+ entry_idx = chain_reuse_idx;
+ entry = e;
+ reused_chain_entry = true;
+ }
+ }
+
+ /* Step 2b: flat scan for EMPTY (start from free_hint) */
+ if (!entry) {
+ MARUFS_CXL_RMB(&ctx.header->free_hint,
+ sizeof(ctx.header->free_hint));
+ u32 hint = READ_CXL_LE32(ctx.header->free_hint);
+ if (hint >= ctx.num_entries)
+ hint = 0;
+
+ for (u32 scan = 0; scan < ctx.num_entries; scan++) {
+ u32 idx = hint + scan;
+ if (idx >= ctx.num_entries)
+ idx -= ctx.num_entries;
+
+ struct marufs_nrht_entry *e = &ctx.entries[idx];
+ MARUFS_CXL_RMB(e, 64);
+
+ if (READ_CXL_LE32(e->state) != MARUFS_ENTRY_EMPTY)
+ continue;
+
+ if (nrht_claim_entry(sbi, e)) {
+ entry_idx = idx;
+ entry = e;
+ WRITE_LE32(e->next_in_bucket,
+ MARUFS_BUCKET_END);
+ /* Advance hint past claimed entry */
+ u32 next_hint = idx + 1;
+ if (next_hint >= ctx.num_entries)
+ next_hint = 0;
+ WRITE_LE32(ctx.header->free_hint, next_hint);
+ MARUFS_CXL_WMB(&ctx.header->free_hint,
+ sizeof(ctx.header->free_hint));
+ break;
+ }
+ }
+ }
+
+ if (!entry)
+ return -ENOSPC;
+
+ /* Step 3: fill fields (both CL0 and CL1) */
+ WRITE_LE64(entry->name_hash, name_hash);
+ WRITE_LE64(entry->offset, offset);
+ WRITE_LE32(entry->target_region_id, target_region_id);
+ WRITE_LE32(entry->inserter_node, sbi->node_id);
+ WRITE_LE64(entry->created_at, ktime_get_real_ns());
+ /* Reset user counters when claiming a fresh/reused slot. */
+ WRITE_LE32(entry->ref_count, 0);
+ WRITE_LE32(entry->pin_count, 0);
+
+ size_t copy_len = min(namelen, sizeof(entry->name) - 1);
+ memset(entry->name, 0, sizeof(entry->name));
+ memcpy(entry->name, name, copy_len);
+
+ MARUFS_CXL_WMB(entry, sizeof(*entry)); /* flush both CL0 + CL1 */
+
+ /* Step 4: INSERTING → TENTATIVE (visible in chain but not yet VALID) */
+ WRITE_LE32(entry->state, MARUFS_ENTRY_TENTATIVE);
+ MARUFS_CXL_WMB(entry, 64);
+
+ /* Step 5: acquire NRHT ME token — cross-node shard lock (FT via heartbeat) */
+ struct marufs_me_instance *nme =
+ marufs_nrht_me_get(sbi, nrht_region_id);
+ if (IS_ERR(nme)) {
+ WRITE_LE32(entry->state, MARUFS_ENTRY_TOMBSTONE);
+ MARUFS_CXL_WMB(entry, 64);
+ return PTR_ERR(nme);
+ }
+ ret = nme->ops->acquire(nme, ctx.shard_id);
+ if (ret) {
+ WRITE_LE32(entry->state, MARUFS_ENTRY_TOMBSTONE);
+ MARUFS_CXL_WMB(entry, 64);
+ return ret;
+ }
+
+ /* Step 6: link to bucket (skip if reused — already in chain) */
+ if (!reused_chain_entry) {
+ ret = nrht_link_to_bucket(entry, entry_idx, ctx.bucket_head);
+ if (ret) {
+ WRITE_LE32(entry->state, MARUFS_ENTRY_TOMBSTONE);
+ MARUFS_CXL_WMB(entry, 64);
+ goto unlock;
+ }
+ }
+
+ /* Step 7: post-insert dedup — concurrent inserts resolved here */
+ ret = nrht_post_insert_dedup(&ctx, entry, entry_idx, name_hash, name,
+ namelen);
+ if (ret) {
+ WRITE_LE32(entry->state, MARUFS_ENTRY_TOMBSTONE);
+ MARUFS_CXL_WMB(entry, 64);
+ goto unlock;
+ }
+
+ /* Step 8: TENTATIVE → VALID (entry is now queryable) */
+ WRITE_LE32(entry->state, MARUFS_ENTRY_VALID);
+ MARUFS_CXL_WMB(entry, 64);
+
+unlock:
+ nme->ops->release(nme, ctx.shard_id);
+ return ret;
+}
+
+/*
+ * nrht_modify_count - shared ME-protected RMW of an entry counter field.
+ *
+ * @field_offset: byte offset of __le32 counter within marufs_nrht_entry
+ * (offsetof ref_count / pin_count).
+ * @delta: +1 (inc) or -1 (dec).
+ *
+ * Acquires the NRHT shard ME for (region, name), locates a VALID entry,
+ * applies the bounded RMW, and releases. The ME shard lock guards against
+ * concurrent dec-from-zero / inc-from-UINT32_MAX races on the same entry.
+ */
+static int nrht_modify_count(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ size_t field_offset, int delta, u32 *out_count)
+{
+ if (!sbi || !name || !out_count)
+ return -EINVAL;
+ if (namelen == 0 || namelen > MARUFS_NAME_MAX)
+ return -ENOENT;
+
+ struct nrht_shard_ctx ctx;
+ int ret = nrht_resolve_bucket(sbi, nrht_region_id, name, namelen,
+ &name_hash, &ctx);
+ if (ret)
+ return ret;
+
+ struct marufs_me_instance *nme =
+ marufs_nrht_me_get(sbi, nrht_region_id);
+ if (IS_ERR(nme))
+ return PTR_ERR(nme);
+
+ ret = nme->ops->acquire(nme, ctx.shard_id);
+ if (ret)
+ return ret;
+
+ struct marufs_nrht_entry *e =
+ nrht_find_chain(&ctx, name_hash, name, namelen, NULL, NULL);
+ if (!e) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ __le32 *counter = (__le32 *)((u8 *)e + field_offset);
+ MARUFS_CXL_RMB(counter, sizeof(*counter));
+ u32 cur = READ_CXL_LE32(*counter);
+
+ if (delta > 0) {
+ if (cur == U32_MAX) {
+ ret = -EOVERFLOW;
+ goto unlock;
+ }
+ cur += 1;
+ } else {
+ if (cur == 0) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+ cur -= 1;
+ }
+
+ WRITE_LE32(*counter, cur);
+ MARUFS_CXL_WMB(counter, sizeof(*counter));
+ *out_count = cur;
+ ret = 0;
+
+unlock:
+ nme->ops->release(nme, ctx.shard_id);
+ return ret;
+}
+
+int marufs_nrht_ref_inc(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count)
+{
+ return nrht_modify_count(sbi, nrht_region_id, name, namelen, name_hash,
+ offsetof(struct marufs_nrht_entry, ref_count),
+ +1, out_count);
+}
+
+int marufs_nrht_ref_dec(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count)
+{
+ return nrht_modify_count(sbi, nrht_region_id, name, namelen, name_hash,
+ offsetof(struct marufs_nrht_entry, ref_count),
+ -1, out_count);
+}
+
+int marufs_nrht_pin_inc(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count)
+{
+ return nrht_modify_count(sbi, nrht_region_id, name, namelen, name_hash,
+ offsetof(struct marufs_nrht_entry, pin_count),
+ +1, out_count);
+}
+
+int marufs_nrht_pin_dec(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count)
+{
+ return nrht_modify_count(sbi, nrht_region_id, name, namelen, name_hash,
+ offsetof(struct marufs_nrht_entry, pin_count),
+ -1, out_count);
+}
+
+int marufs_nrht_lookup(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u64 *out_offset, u32 *out_target_region_id,
+ u32 *out_ref_count, u32 *out_pin_count)
+{
+ if (!sbi || !name || !out_offset || !out_target_region_id)
+ return -EINVAL;
+ if (namelen == 0 || namelen > MARUFS_NAME_MAX)
+ return -ENOENT;
+
+ struct nrht_shard_ctx ctx;
+ int ret = nrht_resolve_bucket(sbi, nrht_region_id, name, namelen,
+ &name_hash, &ctx);
+ if (ret)
+ return ret;
+
+ struct marufs_nrht_entry *e =
+ nrht_find_chain(&ctx, name_hash, name, namelen, NULL, NULL);
+ if (!e)
+ return -ENOENT;
+
+ *out_offset = READ_CXL_LE64(e->offset);
+ *out_target_region_id = READ_CXL_LE32(e->target_region_id);
+ if (out_ref_count)
+ *out_ref_count = READ_CXL_LE32(e->ref_count);
+ if (out_pin_count)
+ *out_pin_count = READ_CXL_LE32(e->pin_count);
+ return 0;
+}
+
+int marufs_nrht_delete(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash)
+{
+ if (!sbi || !name)
+ return -EINVAL;
+ if (namelen == 0 || namelen > MARUFS_NAME_MAX)
+ return -ENOENT;
+
+ struct nrht_shard_ctx ctx;
+ int ret = nrht_resolve_bucket(sbi, nrht_region_id, name, namelen,
+ &name_hash, &ctx);
+ if (ret)
+ return ret;
+
+ u32 entry_idx;
+ struct marufs_nrht_entry *e = nrht_find_chain(
+ &ctx, name_hash, name, namelen, &entry_idx, NULL);
+ if (!e)
+ return -ENOENT;
+
+ /* CAS VALID → TOMBSTONE (logical delete, stays in chain for reuse) */
+ if (marufs_le32_cas(&e->state, MARUFS_ENTRY_VALID,
+ MARUFS_ENTRY_TOMBSTONE) != MARUFS_ENTRY_VALID)
+ return -ENOENT;
+ /* CAS includes implicit full barrier — no extra WMB needed */
+ pr_debug("nrht: deleted '%.*s'\n", (int)namelen, name);
+ return 0;
+}
+
+/* ============================================================================
+ * GC support — called from gc.c Phase 4
+ * ============================================================================ */
+
+/*
+ * marufs_nrht_gc_sweep_all - sweep stale INSERTING entries across all NRHT regions
+ * @sbi: superblock info
+ *
+ * Iterates DRAM bitmap of NRHT regions, sweeping ~25% of shards per cycle
+ * (round-robin via gc_epoch). Same-node stale entries are reclaimed to EMPTY;
+ * indeterminate cases (node==0, created_at==0) are tracked in the DRAM
+ * orphan array for two-stage reclaim.
+ *
+ * Returns total number of reclaimed entries.
+ */
+int marufs_nrht_gc_sweep_all(struct marufs_sb_info *sbi)
+{
+ int reclaimed = 0;
+ u32 epoch = (u32)atomic_read(&sbi->gc_epoch);
+ u32 region_id;
+
+ for_each_set_bit(region_id, sbi->gc_nrht_bitmap,
+ MARUFS_MAX_RAT_ENTRIES) {
+ struct marufs_nrht_header *nrht =
+ nrht_get_header(sbi, region_id);
+ if (!nrht) {
+ /* Region gone — clear stale bitmap bit */
+ clear_bit(region_id, sbi->gc_nrht_bitmap);
+ continue;
+ }
+
+ u32 num_shards = READ_CXL_LE32(nrht->num_shards);
+ u32 shards_per_cycle = max(num_shards / 4, 1U);
+ u32 start = (epoch * shards_per_cycle) % num_shards;
+
+ for (u32 si = 0; si < shards_per_cycle; si++) {
+ u32 s = (start + si) % num_shards;
+ struct nrht_shard_ctx ctx;
+ if (nrht_get_shard_ctx(sbi, nrht, s, &ctx))
+ continue;
+
+ for (u32 j = 0; j < ctx.num_entries; j++) {
+ struct marufs_nrht_entry *e = &ctx.entries[j];
+ MARUFS_CXL_RMB(e, 64);
+
+ if (READ_CXL_LE32(e->state) !=
+ MARUFS_ENTRY_INSERTING)
+ continue;
+
+ int ret = nrht_is_stale(sbi, e);
+ if (ret == 1) {
+ WRITE_LE64(e->name_hash, 0);
+ MARUFS_CXL_WMB(e, 64);
+ if (marufs_le32_cas(
+ &e->state,
+ MARUFS_ENTRY_INSERTING,
+ MARUFS_ENTRY_TOMBSTONE) ==
+ MARUFS_ENTRY_INSERTING)
+ reclaimed++;
+ } else if (ret == 0) {
+ marufs_gc_track_orphan(
+ sbi, e, MARUFS_ORPHAN_NRHT);
+ }
+ }
+ }
+ }
+
+ if (reclaimed > 0)
+ pr_debug("nrht gc: reclaimed %d stale INSERTING entries\n",
+ reclaimed);
+
+ return reclaimed;
+}
diff --git a/marufs_kernel/src/nrht.h b/marufs_kernel/src/nrht.h
new file mode 100644
index 0000000..0ccef7e
--- /dev/null
+++ b/marufs_kernel/src/nrht.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * nrht.h - Independent Name-Ref Hash Table entry points.
+ *
+ * Per-region hash table for name -> (region_id, offset) bindings.
+ * Multi-region NRHT chaining handled via marufs_nrht_join.
+ */
+
+#ifndef _MARUFS_NRHT_H
+#define _MARUFS_NRHT_H
+
+#include
+
+#include "me.h" /* enum marufs_me_strategy */
+
+struct marufs_sb_info;
+
+int marufs_nrht_init(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ u32 max_entries, u32 num_shards, u32 num_buckets,
+ enum marufs_me_strategy me_strategy);
+
+/*
+ * marufs_nrht_join - explicit pre-warm: create this sbi's NRHT ME instance
+ * and join the ring for @nrht_region_id. Idempotent (cached on re-call).
+ * Backup path is lazy-init on first insert.
+ */
+int marufs_nrht_join(struct marufs_sb_info *sbi, u32 nrht_region_id);
+
+int marufs_nrht_insert(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u64 offset, u32 target_region_id);
+/*
+ * marufs_nrht_lookup - find VALID entry by name.
+ * @out_ref_count, @out_pin_count: optional (NULL ok) — receive snapshot
+ * of user-managed ref/pin counts for the entry.
+ */
+int marufs_nrht_lookup(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u64 *out_offset, u32 *out_target_region_id,
+ u32 *out_ref_count, u32 *out_pin_count);
+int marufs_nrht_delete(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash);
+
+/*
+ * Per-entry ref/pin counter ops. Each acquires the NRHT shard ME for the
+ * resolved (region, name), looks up a VALID entry, RMWs the counter, and
+ * releases. @out_count receives the post-op value.
+ *
+ * Returns 0 on success, -ENOENT if entry missing/non-VALID,
+ * -EINVAL on dec-from-zero, -EOVERFLOW on inc-from-UINT32_MAX.
+ */
+
+typedef int (*nrht_refcnt_op_t)(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count);
+
+int marufs_nrht_ref_inc(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count);
+int marufs_nrht_ref_dec(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count);
+int marufs_nrht_pin_inc(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count);
+int marufs_nrht_pin_dec(struct marufs_sb_info *sbi, u32 nrht_region_id,
+ const char *name, size_t namelen, u64 name_hash,
+ u32 *out_count);
+
+int marufs_nrht_gc_sweep_all(struct marufs_sb_info *sbi);
+
+#endif /* _MARUFS_NRHT_H */
diff --git a/marufs_kernel/src/nrht_stats.h b/marufs_kernel/src/nrht_stats.h
new file mode 100644
index 0000000..489cf1a
--- /dev/null
+++ b/marufs_kernel/src/nrht_stats.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * nrht_stats.h - MARUFS NRHT per-CPU performance counters
+ *
+ * Currently tracks bucket-chain walk depth observed by nrht_find_chain.
+ * The counters live on struct marufs_sb_info (not per-region) because
+ * chain depth is a property of the hash distribution over the NRHT as a
+ * whole; aggregating across regions hides nothing for the dominant user
+ * (single large NRHT per mount) and keeps allocation simple.
+ *
+ * Overhead model: non-atomic per-CPU updates (see me_stats.h header for
+ * the same trade-off). Sysfs reads sum across for_each_possible_cpu.
+ */
+
+#ifndef _MARUFS_NRHT_STATS_H
+#define _MARUFS_NRHT_STATS_H
+
+#include
+#include
+#include
+
+/*
+ * 8 buckets covers depth [0] .. [>=128]. Bucket i holds counts where
+ * depth is in [2^(i-1), 2^i); bucket 0 is depth 0/1, bucket 7 saturates
+ * for depth >= 128. In practice >16 is already pathological — the tail
+ * bucket exists mostly to make overflow visible rather than silent.
+ */
+#define MARUFS_NRHT_DEPTH_BUCKETS 8
+
+struct marufs_nrht_stats_pcpu {
+ u64 find_chain_count;
+ u64 find_chain_steps_total;
+ u64 chain_depth_buckets[MARUFS_NRHT_DEPTH_BUCKETS];
+};
+
+/*
+ * nrht_stats_depth_bucket - map walk depth → bucket in [0, 7].
+ * depth==0 lands in bucket 0 alongside depth 1 (fls(0) returns 0).
+ */
+static inline u32 nrht_stats_depth_bucket(u32 depth)
+{
+ if (depth <= 1)
+ return 0;
+ u32 b = fls(depth); /* depth=2→2, =3→2, =4→3, =128→8 */
+ return min_t(u32, b, MARUFS_NRHT_DEPTH_BUCKETS - 1);
+}
+
+/*
+ * nrht_stats_record_chain_depth - add a sample after a bucket-chain
+ * walk. Call from nrht_find_chain regardless of hit/miss. @stats may
+ * be NULL before sbi init completes; caller doesn't need to guard.
+ */
+static inline void
+nrht_stats_record_chain_depth(struct marufs_nrht_stats_pcpu __percpu *stats,
+ u32 steps)
+{
+ if (!stats)
+ return;
+
+ struct marufs_nrht_stats_pcpu *st = this_cpu_ptr(stats);
+ st->find_chain_count++;
+ st->find_chain_steps_total += steps;
+ st->chain_depth_buckets[nrht_stats_depth_bucket(steps)]++;
+}
+
+#endif /* _MARUFS_NRHT_STATS_H */
diff --git a/marufs_kernel/src/region.c b/marufs_kernel/src/region.c
new file mode 100644
index 0000000..d418eb0
--- /dev/null
+++ b/marufs_kernel/src/region.c
@@ -0,0 +1,372 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * region.c - MARUFS Region Allocator and Offset Name Management
+ *
+ * Two-phase region allocation:
+ * 1. open(O_CREAT): marufs_rat_alloc_entry() reserves a RAT entry (size=0)
+ * 2. ftruncate(N): marufs_region_init() finds contiguous space, inits header
+ *
+ * Offset name management:
+ * Applications can name offsets within the data area via ioctl.
+ * Names are stored in a per-region hash index (up to 8160 entries, 128-byte names).
+ * Hash-based bucket chain provides O(1) lookup (same pattern as global index).
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "acl.h"
+#include "marufs.h"
+#include "me.h"
+
+/* ============================================================================
+ * RAT (Region Allocation Table) management
+ * ============================================================================ */
+
+/* Temporary span for sorted gap search */
+struct marufs_region_span {
+ u64 offset;
+ u64 end;
+};
+
+/*
+ * marufs_find_contiguous_space - find contiguous free space in device
+ * @sbi: superblock info
+ * @size: requested size (bytes, must be 2MB aligned)
+ * @out_offset: output offset from device start
+ *
+ * Collects allocated RAT entries, sorts by offset, then does a single
+ * linear scan through the gaps. O(n log n) vs previous O(n²).
+ *
+ * Returns 0 on success, -ENOSPC if no space available.
+ */
+static int marufs_find_contiguous_space(struct marufs_sb_info *sbi, u64 size,
+ u64 *out_offset)
+{
+ struct marufs_rat *rat = marufs_rat_get(sbi);
+ struct marufs_region_span *spans;
+ u32 count = 0;
+ u64 regions_start;
+ u64 device_size;
+ u64 candidate;
+ u32 i, j;
+ int ret;
+
+ if (!rat || !out_offset)
+ return -EINVAL;
+
+ spans = kmalloc_array(MARUFS_MAX_RAT_ENTRIES, sizeof(*spans),
+ GFP_KERNEL);
+ if (!spans)
+ return -ENOMEM;
+
+ regions_start = READ_CXL_LE64(rat->regions_start);
+ device_size = READ_CXL_LE64(rat->device_size);
+
+ pr_debug(
+ "find_contiguous_space: size=%llu, regions_start=0x%llx, device_size=%llu\n",
+ size, regions_start, device_size);
+
+ /* Collect all allocated regions with valid physical placement */
+ for (i = 0; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ u64 entry_offset, entry_size;
+ struct marufs_rat_entry *entry = marufs_rat_entry_get(sbi, i);
+ if (!entry) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (READ_CXL_LE32(entry->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ continue;
+
+ entry_offset = READ_CXL_LE64(entry->phys_offset);
+ entry_size = READ_CXL_LE64(entry->size);
+ if (entry_offset == 0 || entry_size == 0)
+ continue;
+
+ /* Skip corrupted entries (overflow protection) */
+ if (entry_size > device_size ||
+ entry_offset > device_size - entry_size)
+ continue;
+
+ spans[count].offset = entry_offset;
+ spans[count].end = entry_offset + entry_size;
+ count++;
+ }
+
+ /* Insertion sort by offset (max 256 elements, cache-friendly) */
+ for (i = 1; i < count; i++) {
+ struct marufs_region_span tmp = spans[i];
+
+ j = i;
+ while (j > 0 && spans[j - 1].offset > tmp.offset) {
+ spans[j] = spans[j - 1];
+ j--;
+ }
+ spans[j] = tmp;
+ }
+
+ /* Single linear scan through sorted spans for first gap >= size */
+ candidate = regions_start;
+ for (i = 0; i < count; i++) {
+ if (candidate + size <= spans[i].offset) {
+ *out_offset = candidate;
+ pr_debug(
+ "found contiguous space at offset 0x%llx size %llu\n",
+ candidate, size);
+ ret = 0;
+ goto out;
+ }
+
+ if (spans[i].end > candidate)
+ candidate =
+ marufs_align_up(spans[i].end, MARUFS_ALIGN_2MB);
+ }
+
+ /* Check space after last region */
+ if (candidate + size <= device_size) {
+ *out_offset = candidate;
+ pr_debug("found contiguous space at offset 0x%llx size %llu\n",
+ candidate, size);
+ ret = 0;
+ goto out;
+ }
+
+ pr_err("no contiguous space for size %llu\n", size);
+ ret = -ENOSPC;
+
+out:
+ kfree(spans);
+ return ret;
+}
+
+/*
+ * marufs_rat_entry_reset - zero all mutable fields and optionally fill new values
+ * @entry: RAT entry (caller holds exclusive state: ALLOCATING or pre-FREE)
+ * @sbi: superblock info (NULL for clear-only)
+ * @name: filename (NULL for clear-only)
+ * @name_len: filename length
+ * @offset: physical offset
+ * @size: region size
+ */
+static void marufs_rat_entry_reset(struct marufs_rat_entry *entry,
+ struct marufs_sb_info *sbi, const char *name,
+ size_t name_len, u64 offset, u64 size)
+{
+ /* Zero everything, then restore state */
+ u32 saved_state = READ_CXL_LE32(entry->state);
+ memset(entry, 0, sizeof(*entry));
+ WRITE_LE32(entry->state, saved_state);
+
+ if (sbi && name) {
+ u64 now = ktime_get_real_ns();
+
+ memcpy(entry->name, name, name_len);
+ WRITE_LE64(entry->phys_offset, offset);
+ WRITE_LE64(entry->size, size);
+ WRITE_LE16(entry->owner_node_id, sbi->node_id);
+ WRITE_LE32(entry->owner_pid, current->pid);
+ WRITE_LE64(entry->owner_birth_time,
+ ktime_to_ns(current->start_boottime));
+
+ u64 exe_ino = 0;
+ u32 exe_dev = 0;
+ (void)marufs_get_exe_id(&exe_ino, &exe_dev);
+ WRITE_LE64(entry->owner_exe_inode_ino, exe_ino);
+ WRITE_LE32(entry->owner_exe_inode_dev, exe_dev);
+
+ WRITE_LE64(entry->alloc_time, now);
+ WRITE_LE64(entry->modified_at, now);
+ }
+
+ MARUFS_CXL_WMB(entry, sizeof(*entry));
+}
+
+/*
+ * marufs_rat_alloc_entry - allocate a RAT entry
+ * @sbi: superblock info
+ * @name: region file name
+ * @size: region size (bytes), 0 for reservation mode
+ * @offset: physical offset in device, 0 for reservation mode
+ * @out_rat_entry_id: output RAT entry ID
+ *
+ * Finds a free RAT entry and initializes it.
+ * When size=0 and offset=0, this is a reservation (two-phase create).
+ * Returns 0 on success, negative error code on failure.
+ */
+int marufs_rat_alloc_entry(struct marufs_sb_info *sbi, const char *name,
+ u64 size, u64 offset, u32 *out_rat_entry_id)
+{
+ u32 i;
+ size_t name_len;
+
+ if (!sbi || !name || !out_rat_entry_id)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ if (name_len > MARUFS_NAME_MAX)
+ return -ENAMETOOLONG;
+
+ /* Find free entry */
+ for (i = 0; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ u32 old_state;
+ struct marufs_rat_entry *entry = marufs_rat_entry_get(sbi, i);
+ if (!entry)
+ return -EIO;
+
+ old_state = READ_CXL_LE32(entry->state);
+ if (old_state != MARUFS_RAT_ENTRY_FREE)
+ continue;
+
+ /* Try to claim this entry with CAS: FREE → ALLOCATING */
+ old_state = marufs_le32_cas(&entry->state,
+ MARUFS_RAT_ENTRY_FREE,
+ MARUFS_RAT_ENTRY_ALLOCATING);
+ if (old_state != MARUFS_RAT_ENTRY_FREE)
+ continue; /* Lost race, try next */
+
+ /* Initialize fields while in ALLOCATING state (invisible to others) */
+ marufs_rat_entry_reset(entry, sbi, name, name_len, offset,
+ size);
+
+ /* Publish: ALLOCATING → ALLOCATED (fields now valid) */
+ WRITE_LE32(entry->state, MARUFS_RAT_ENTRY_ALLOCATED);
+ MARUFS_CXL_WMB(entry, 64);
+
+ *out_rat_entry_id = i;
+ pr_debug(
+ "allocated RAT entry %u for '%s' at offset %llu size %llu\n",
+ i, name, offset, size);
+ return 0;
+ }
+
+ pr_err("no free RAT entries\n");
+ return -ENOSPC;
+}
+
+/*
+ * marufs_rat_free_entry - free a RAT entry
+ * @entry: RAT entry pointer (caller must validate)
+ *
+ * Accepts entry in ALLOCATED, DELETING, or ALLOCATING state.
+ * Clears metadata fields, then CAS transitions to FREE.
+ */
+void marufs_rat_free_entry(struct marufs_rat_entry *entry)
+{
+ u32 cur_state;
+
+ if (!entry)
+ return;
+
+ /* Clear entry fields BEFORE state transition (H-S1: prevent reuse race) */
+ marufs_rat_entry_reset(entry, NULL, NULL, 0, 0, 0);
+
+ /* CAS to FREE: try DELETING first, then ALLOCATED, then ALLOCATING (rollback) */
+ cur_state = marufs_le32_cas(&entry->state, MARUFS_RAT_ENTRY_DELETING,
+ MARUFS_RAT_ENTRY_FREE);
+ if (cur_state != MARUFS_RAT_ENTRY_DELETING) {
+ cur_state = marufs_le32_cas(&entry->state,
+ MARUFS_RAT_ENTRY_ALLOCATED,
+ MARUFS_RAT_ENTRY_FREE);
+ if (cur_state != MARUFS_RAT_ENTRY_ALLOCATED)
+ marufs_le32_cas(&entry->state,
+ MARUFS_RAT_ENTRY_ALLOCATING,
+ MARUFS_RAT_ENTRY_FREE);
+ }
+}
+
+/* ============================================================================
+ * Region initialization (called from ftruncate path)
+ * ============================================================================ */
+
+/*
+ * marufs_region_init - allocate physical space and initialize region header
+ * @sbi: superblock info
+ * @rat_entry_id: RAT entry ID (must be pre-allocated in reservation mode)
+ * @data_size: user-requested data size in bytes
+ *
+ * Called from marufs_setattr() when ftruncate sets file size for the first time.
+ * Finds contiguous space, initializes region header with name table,
+ * and updates the RAT entry with physical offset/size.
+ *
+ * Region layout (v2: header in pool, data starts at phys_offset):
+ * [Data area (data_size, 2MB aligned)]
+ * Total region_size = align_2MB(data_size)
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int marufs_region_init(struct marufs_sb_info *sbi, u32 rat_entry_id,
+ u64 data_size)
+{
+ struct marufs_rat_entry *entry =
+ marufs_rat_entry_get(sbi, rat_entry_id);
+ if (!entry || data_size == 0)
+ return -EINVAL;
+
+ /* Verify entry is allocated and not yet initialized */
+ if (READ_CXL_LE32(entry->state) != MARUFS_RAT_ENTRY_ALLOCATED)
+ return -EINVAL;
+ if (READ_CXL_LE64(entry->phys_offset) != 0)
+ return -EEXIST; /* Already initialized */
+
+ /* Reject obviously oversized requests */
+ if (data_size > sbi->total_size)
+ return -ENOSPC;
+
+ /* Align region size to 2MB boundary */
+ u64 region_size = marufs_align_up(data_size, MARUFS_ALIGN_2MB);
+ if (region_size > sbi->total_size)
+ return -ENOSPC;
+
+ /* Global ME gives cross-node + intra-node exclusion (local_locks). */
+ int ret = sbi->me->ops->acquire(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+ if (ret) {
+ pr_err("region_init: ME acquire failed: %d\n", ret);
+ return ret;
+ }
+
+ /* ── Critical section (ME held) ───────────────────────────── */
+
+ /* Find contiguous free space */
+ u64 region_offset;
+ ret = marufs_find_contiguous_space(sbi, region_size, ®ion_offset);
+ if (ret) {
+ pr_err("no contiguous space for region (size=%llu)\n",
+ region_size);
+ goto unlock;
+ }
+
+ /* Validate region fits in DAX mapping */
+ if (!marufs_dax_range_valid(sbi, region_offset, region_size)) {
+ pr_err("region_init: region 0x%llx+%llu exceeds DAX mapping\n",
+ region_offset, region_size);
+ ret = -ENOSPC;
+ goto unlock;
+ }
+
+ /* Commit: write phys_offset + size to RAT entry */
+ WRITE_LE64(entry->phys_offset, region_offset);
+ WRITE_LE64(entry->size, region_size);
+ MARUFS_CXL_WMB(entry, 64);
+
+unlock:
+ sbi->me->ops->release(sbi->me, MARUFS_ME_GLOBAL_SHARD_ID);
+
+ if (ret) {
+ pr_err("region_init failed for rat_entry=%u (err=%d)\n",
+ rat_entry_id, ret);
+ return ret;
+ }
+
+ pr_debug(
+ "region_init rat=%u region_offset=0x%llx region_size=%llu data_size=%llu\n",
+ rat_entry_id, region_offset, region_size, data_size);
+
+ return 0;
+}
diff --git a/marufs_kernel/src/region.h b/marufs_kernel/src/region.h
new file mode 100644
index 0000000..465de20
--- /dev/null
+++ b/marufs_kernel/src/region.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * region.h - RAT (Region Allocation Table) and region init entry points.
+ */
+
+#ifndef _MARUFS_REGION_H
+#define _MARUFS_REGION_H
+
+#include
+
+struct marufs_sb_info;
+struct marufs_rat_entry;
+
+int marufs_rat_alloc_entry(struct marufs_sb_info *sbi, const char *name,
+ u64 size, u64 offset, u32 *out_rat_entry_id);
+void marufs_rat_free_entry(struct marufs_rat_entry *entry);
+
+/* Region initialization (called from ftruncate path) */
+int marufs_region_init(struct marufs_sb_info *sbi, u32 rat_entry_id,
+ u64 data_size);
+
+#endif /* _MARUFS_REGION_H */
diff --git a/marufs_kernel/src/super.c b/marufs_kernel/src/super.c
new file mode 100644
index 0000000..56feb79
--- /dev/null
+++ b/marufs_kernel/src/super.c
@@ -0,0 +1,1464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * super.c - MARUFS Partitioned Global Index superblock and mount
+ *
+ * Architecture:
+ * - Global Superblock (4 KB) at offset 0
+ * - Shard Table (num_shards x 64B headers)
+ * - Index Pool (per-shard bucket + entry arrays)
+ * - Regions (per-node data: header + bitmap + slots)
+ *
+ * All nodes access the global index via CAS operations on CXL shared memory.
+ * No GCS (Global Chunk Server) is required.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "compat.h"
+#include "marufs.h"
+#include "me.h"
+#include "nrht_stats.h"
+
+/* Module parameter: node_id (must be > 0) */
+int marufs_node_id = 1;
+module_param_named(node_id, marufs_node_id, int, 0444);
+MODULE_PARM_DESC(node_id,
+ "Node ID for multi-node access control (must be > 0)");
+
+/* Free per-shard DRAM resources */
+static void marufs_free_shard_resources(struct marufs_sb_info *sbi)
+{
+ kvfree(sbi->shard_cache);
+ sbi->shard_cache = NULL;
+}
+
+/* Mount option parsing */
+enum MARUFS_MOUNT_OPTION {
+ OPTION_NODE_ID,
+ OPTION_DAXDEV,
+ OPTION_FORMAT,
+ OPTION_ME_STRATEGY,
+ OPTION_ERROR,
+};
+
+static const match_table_t marufs_tokens = {
+ { OPTION_NODE_ID, "node_id=%d" },
+ { OPTION_DAXDEV, "daxdev=%s" },
+ { OPTION_FORMAT, "format" },
+ { OPTION_ME_STRATEGY, "me_strategy=%s" },
+ { OPTION_ERROR, NULL },
+};
+
+struct marufs_mount_opts {
+ int node_id; /* 0 = auto-mount (no node_id= given); ≥1 = manual */
+ char daxdev[128];
+ bool format; /* in-kernel format on mount */
+ enum marufs_me_strategy me_strategy; /* order or request */
+};
+
+static int marufs_parse_options(char *options, struct marufs_mount_opts *opts)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *p;
+ int token;
+ int option;
+
+ /* Default: 0 = auto-mount (no node_id= on command line) */
+ opts->node_id = 0;
+ opts->daxdev[0] = '\0';
+ opts->format = false;
+ opts->me_strategy = MARUFS_ME_REQUEST; /* default: demand-driven */
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, marufs_tokens, args);
+ switch (token) {
+ case OPTION_NODE_ID:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ if (option <= 0) {
+ pr_err("node_id must be > 0 (got %d)\n",
+ option);
+ return -EINVAL;
+ }
+ opts->node_id = option;
+ break;
+ case OPTION_DAXDEV:
+ match_strlcpy(opts->daxdev, &args[0],
+ sizeof(opts->daxdev));
+ break;
+ case OPTION_FORMAT:
+ opts->format = true;
+ break;
+ case OPTION_ME_STRATEGY: {
+ char str[16];
+
+ match_strlcpy(str, &args[0], sizeof(str));
+ if (strcmp(str, "order") == 0)
+ opts->me_strategy = MARUFS_ME_ORDER;
+ else if (strcmp(str, "request") == 0)
+ opts->me_strategy = MARUFS_ME_REQUEST;
+ else {
+ pr_err("invalid me_strategy: %s (use 'order' or 'request')\n",
+ str);
+ return -EINVAL;
+ }
+ break;
+ }
+ default:
+ pr_err("unrecognized mount option: %s\n", p);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/* inode cache */
+static struct kmem_cache *marufs_inode_cachep;
+
+/* ============================================================================
+ * inode cache management
+ * ============================================================================ */
+
+static struct inode *marufs_alloc_inode(struct super_block *sb)
+{
+ struct marufs_inode_info *xi;
+
+ xi = kmem_cache_alloc(marufs_inode_cachep, GFP_KERNEL);
+ if (!xi) {
+ return NULL;
+ }
+
+ marufs_inode_info_init(xi);
+
+ return &xi->vfs_inode;
+}
+
+static void marufs_free_inode(struct inode *inode)
+{
+ kmem_cache_free(marufs_inode_cachep, marufs_inode_get(inode));
+}
+
+/* ============================================================================
+ * Filesystem statistics
+ * ============================================================================ */
+
+static int marufs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct marufs_sb_info *sbi = marufs_sb_get(sb);
+ u64 used_size = 0;
+ u32 used_entries = 0;
+ u32 i;
+
+ /* RAT: compute used size and entry count */
+ for (i = 0; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ struct marufs_rat_entry *entry = marufs_rat_entry_get(sbi, i);
+ if (!entry)
+ return -EIO;
+
+ u32 state = READ_LE32(entry->state);
+ if (state == MARUFS_RAT_ENTRY_ALLOCATED ||
+ state == MARUFS_RAT_ENTRY_DELETING) {
+ used_size += READ_LE64(entry->size);
+ used_entries++;
+ }
+ }
+
+ buf->f_files = MARUFS_MAX_RAT_ENTRIES;
+ buf->f_ffree = MARUFS_MAX_RAT_ENTRIES - used_entries;
+
+ buf->f_type = MARUFS_MAGIC;
+ buf->f_bsize = PAGE_SIZE;
+ buf->f_blocks = sbi->total_size / PAGE_SIZE;
+ if (used_size > sbi->total_size)
+ buf->f_bfree = 0;
+ else
+ buf->f_bfree = (sbi->total_size - used_size) / PAGE_SIZE;
+ buf->f_bavail = buf->f_bfree;
+ buf->f_namelen = MARUFS_NAME_MAX;
+
+ return 0;
+}
+
+/* ============================================================================
+ * Superblock operations table
+ * ============================================================================ */
+
+static int marufs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct marufs_sb_info *sbi = root->d_sb->s_fs_info;
+
+ seq_printf(m, ",node_id=%u", sbi->node_id);
+
+ if (sbi->daxdev_path[0])
+ seq_printf(m, ",daxdev=%s", sbi->daxdev_path);
+ else if (sbi->dax_base != NULL)
+ seq_puts(m, ",dax");
+
+ return 0;
+}
+
+static const struct super_operations marufs_sops = {
+ .alloc_inode = marufs_alloc_inode,
+ .free_inode = marufs_free_inode,
+ .write_inode = marufs_write_inode,
+ .evict_inode = marufs_evict_inode,
+ .drop_inode = generic_delete_inode,
+ .statfs = marufs_statfs,
+ .show_options = marufs_show_options,
+};
+
+/* ============================================================================
+ * Dentry operations - dcache invalidation for cross-node visibility
+ * ============================================================================
+ *
+ * Since MARUFS is based on CXL shared memory, other nodes can create/delete files.
+ * Always fail revalidation so VFS re-calls lookup for fresh data.
+ */
+
+static int marufs_d_revalidate(MARUFS_D_REVALIDATE_ARGS)
+{
+ /* Always return 0 -> VFS re-calls lookup -> read fresh data from CXL */
+ return 0;
+}
+
+static const struct dentry_operations marufs_dentry_ops = {
+ .d_revalidate = marufs_d_revalidate,
+};
+
+/* ============================================================================
+ * Unified DAX abstraction layer
+ * ============================================================================
+ *
+ * marufs_dax_acquire() / marufs_dax_release()
+ *
+ * DEV_DAX (character device) produces:
+ * sbi->dax_base = mapped memory pointer
+ * sbi->total_size = total size
+ *
+ * After that, all filesystem logic references only sbi->dax_base.
+ */
+
+/* Helper: read u64 value from sysfs */
+static int marufs_read_sysfs_u64(const char *path, u64 *out)
+{
+ struct file *f;
+ char buf[64];
+ loff_t pos = 0;
+ ssize_t len;
+
+ f = filp_open(path, O_RDONLY, 0);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+
+ memset(buf, 0, sizeof(buf));
+ len = kernel_read(f, buf, sizeof(buf) - 1, &pos);
+ filp_close(f, NULL);
+
+ if (len <= 0)
+ return -EIO;
+
+ buf[len] = '\0';
+ return kstrtoull(buf, 0, out) < 0 ? -EINVAL : 0;
+}
+
+/*
+ * marufs_dax_acquire - DEV_DAX memory acquisition via memremap.
+ *
+ * Reads phys addr/size from sysfs, memremap()s, opens the device file
+ * for mmap delegation. Reads sbi->daxdev_path (set by caller).
+ * After return, sbi->dax_base / phys_base / total_size / dax_filp populated.
+ */
+static int marufs_dax_acquire(struct marufs_sb_info *sbi)
+{
+ const char *devpath = sbi->daxdev_path;
+ const char *devname;
+ char sysfs_path[256];
+ u64 phys_addr, dev_size;
+ void *mapped;
+ int ret;
+
+ devname = strrchr(devpath, '/');
+ devname = devname ? devname + 1 : devpath;
+
+ pr_debug("acquiring DEV_DAX device %s via memremap\n", devname);
+
+ snprintf(sysfs_path, sizeof(sysfs_path),
+ "/sys/bus/dax/devices/%s/resource", devname);
+ ret = marufs_read_sysfs_u64(sysfs_path, &phys_addr);
+ if (ret) {
+ pr_err("cannot read resource for %s (%d)\n", devname, ret);
+ return ret;
+ }
+
+ snprintf(sysfs_path, sizeof(sysfs_path), "/sys/bus/dax/devices/%s/size",
+ devname);
+ ret = marufs_read_sysfs_u64(sysfs_path, &dev_size);
+ if (ret || dev_size == 0) {
+ pr_err("cannot read size for %s (%d)\n", devname, ret);
+ return ret ? ret : -EINVAL;
+ }
+
+ pr_debug("%s phys=0x%llx size=%llu (%llu MB)\n", devname, phys_addr,
+ dev_size, dev_size >> 20);
+
+ mapped = memremap(phys_addr, dev_size, MEMREMAP_WB);
+ if (!mapped) {
+ pr_err("memremap failed for %s\n", devname);
+ return -ENOMEM;
+ }
+
+ sbi->dax_base = mapped;
+ sbi->phys_base = phys_addr; /* Store physical address for DAX mmap */
+ sbi->dax_nr_pages = dev_size >> PAGE_SHIFT;
+ sbi->total_size = dev_size;
+
+ /*
+ * Detect ZONE_DEVICE struct pages
+ *
+ * If the device_dax driver has already created ZONE_DEVICE struct pages
+ * via devm_memremap_pages(), use VM_MIXEDMAP + vmf_insert_mixed() path
+ * so get_user_pages() works. This enables GPU DMA (cudaMemcpy etc.)
+ * to transfer directly from CXL memory without a bounce buffer.
+ */
+ sbi->has_struct_pages = pfn_valid(phys_addr >> PAGE_SHIFT);
+ if (sbi->has_struct_pages)
+ pr_debug(
+ "DEV_DAX %s - ZONE_DEVICE pages detected, VM_MIXEDMAP enabled (GPU direct DMA capable)\n",
+ devname);
+ else
+ pr_debug(
+ "DEV_DAX %s - no struct pages, using VM_PFNMAP (GPU DMA via bounce buffer)\n",
+ devname);
+
+ /* Open DAX device file for mmap delegation to device_dax driver.
+ * This allows NVIDIA driver to recognize the VMA as device_dax-backed,
+ * enabling cudaHostRegister on CXL memory. */
+ sbi->dax_filp = filp_open(devpath, O_RDWR, 0);
+ if (IS_ERR(sbi->dax_filp)) {
+ pr_warn("failed to open DAX device %s for mmap: %ld\n", devpath,
+ PTR_ERR(sbi->dax_filp));
+ sbi->dax_filp = NULL;
+ }
+
+ pr_debug("DEV_DAX %s acquired (%llu bytes, mapped at %p)\n", devname,
+ dev_size, mapped);
+ return 0;
+}
+
+/*
+ * marufs_gsb_checksum - CRC32 over immutable superblock fields.
+ *
+ * Covers: magic, version, total_size, shard_table_offset, rat_offset,
+ * num_shards, buckets_per_shard, entries_per_shard.
+ * Skips: checksum (self), reserved.
+ */
+static u32 marufs_gsb_checksum(const struct marufs_superblock *gsb)
+{
+ return crc32(~0, (const u8 *)gsb,
+ offsetof(struct marufs_superblock, checksum));
+}
+
+/*
+ * marufs_format_device - in-kernel format for DEV_DAX mode.
+ *
+ * Initialises filesystem metadata directly on the mapped memory.
+ * Layout: superblock + shard table + bucket/entry arrays + RAT.
+ */
+static int marufs_format_device(struct marufs_sb_info *sbi)
+{
+ void *base = sbi->dax_base;
+ u64 total = sbi->total_size;
+ u32 num_shards = MARUFS_REGION_NUM_SHARDS;
+ u32 entries_per_shard = MARUFS_REGION_ENTRIES_PER_SHARD;
+ u32 buckets_per_shard = MARUFS_REGION_BUCKETS_PER_SHARD;
+ u32 i;
+ u32 *bucket_base;
+
+ pr_debug("formatting DEV_DAX device (%llu bytes, %llu MB)\n",
+ total, total >> 20);
+
+ /* --- Layout calculation --- */
+ u64 bucket_array_start = MARUFS_INDEX_BUCKET_OFFSET;
+ u64 total_bucket_bytes =
+ (u64)num_shards * buckets_per_shard * sizeof(u32);
+ u64 entry_array_start = bucket_array_start + total_bucket_bytes;
+ u64 total_entry_bytes =
+ (u64)num_shards * entries_per_shard * MARUFS_INDEX_ENTRY_SIZE;
+ u64 index_pool_end = entry_array_start + total_entry_bytes;
+ u64 rat_offset = index_pool_end;
+ u64 rat_size = sizeof(struct marufs_rat);
+ u64 regions_start =
+ marufs_align_up(rat_offset + rat_size, MARUFS_ALIGN_2MB);
+
+ if (regions_start >= total) {
+ pr_err("device too small for metadata (%llu < %llu)\n", total,
+ regions_start);
+ return -ENOSPC;
+ }
+
+ if (entry_array_start != MARUFS_INDEX_ENTRY_OFFSET ||
+ rat_offset != MARUFS_RAT_OFFSET ||
+ regions_start != MARUFS_REGION_OFFSET) {
+ pr_err("FORMAT FAILED — layout mismatched\n");
+ return -EIO;
+ }
+
+ /* --- Zero the metadata area (skip bootstrap area) ---
+ *
+ * Bootstrap area [MARUFS_BOOTSTRAP_AREA_OFFSET, MARUFS_BOOTSTRAP_AREA_OFFSET +
+ * MARUFS_BOOTSTRAP_AREA_SIZE) is preserved so concurrent auto-mount
+ * claimants keep their slot state across a formatter restart.
+ * Zero [0 .. GSB_SIZE) and [shard_table_offset .. regions_start).
+ */
+ memset(base, 0, MARUFS_GSB_SIZE);
+ memset((char *)base + MARUFS_SHARD_TABLE_OFFSET, 0,
+ regions_start - MARUFS_SHARD_TABLE_OFFSET);
+ MARUFS_CXL_WMB(base, regions_start);
+
+ /* --- Step 1: Superblock (magic written LAST as format-complete fence) ---
+ *
+ * sbi->gsb is already cached by marufs_sbi_init_layout_ptrs() at mount
+ * start, so marufs_gsb_get() returns a valid pointer here.
+ */
+ struct marufs_superblock *gsb = marufs_gsb_get(sbi);
+ if (!gsb)
+ return -EINVAL;
+
+ /* magic intentionally deferred to end of format */
+ WRITE_LE32(gsb->version, MARUFS_VERSION);
+ WRITE_LE64(gsb->total_size, total);
+ WRITE_LE32(gsb->num_shards, num_shards);
+ WRITE_LE32(gsb->buckets_per_shard, buckets_per_shard);
+ WRITE_LE32(gsb->entries_per_shard, entries_per_shard);
+ WRITE_LE64(gsb->shard_table_offset, MARUFS_SHARD_TABLE_OFFSET);
+ WRITE_LE64(gsb->rat_offset, rat_offset);
+ /* --- Step 2: Shard table --- */
+ for (i = 0; i < num_shards; i++) {
+ struct marufs_shard_header *sh =
+ marufs_shard_header_get(sbi, i);
+ if (!sh)
+ return -EINVAL;
+
+ WRITE_LE32(sh->magic, MARUFS_SHARD_MAGIC);
+ WRITE_LE32(sh->shard_id, i);
+ WRITE_LE32(sh->num_buckets, buckets_per_shard);
+ WRITE_LE32(sh->num_entries, entries_per_shard);
+ WRITE_LE64(sh->bucket_array_offset,
+ bucket_array_start +
+ (u64)i * buckets_per_shard * sizeof(u32));
+ WRITE_LE64(sh->entry_array_offset,
+ entry_array_start + (u64)i * entries_per_shard *
+ MARUFS_INDEX_ENTRY_SIZE);
+ }
+
+ /* --- Step 3: Bucket arrays (all MARUFS_BUCKET_END) --- */
+ bucket_base = (u32 *)((char *)base + bucket_array_start);
+ for (i = 0; i < num_shards * buckets_per_shard; i++)
+ WRITE_LE32(bucket_base[i], MARUFS_BUCKET_END);
+
+ /* --- Step 4: Entry arrays already zeroed (ENTRY_EMPTY = 0) --- */
+
+ /* --- Step 5: RAT --- */
+ struct marufs_rat *rat = marufs_rat_get(sbi);
+ if (!rat)
+ return -EINVAL;
+
+ WRITE_LE32(rat->magic, MARUFS_RAT_MAGIC);
+ WRITE_LE32(rat->version, 1);
+ WRITE_LE32(rat->num_entries, 0);
+ WRITE_LE32(rat->max_entries, MARUFS_MAX_RAT_ENTRIES);
+ WRITE_LE64(rat->device_size, total);
+ WRITE_LE64(rat->rat_offset, rat_offset);
+ WRITE_LE64(rat->regions_start, regions_start);
+ WRITE_LE64(rat->total_allocated, 0);
+ WRITE_LE64(rat->total_free, total - regions_start);
+
+ /* --- Step 6: ME (Mutual Exclusion) area ---
+ * Size AND format use REQUEST strategy so request slots fit
+ * inside the reserved area and either strategy can be selected
+ * at runtime without re-formatting. ORDER sizing would leave
+ * format to overflow into the RAT region.
+ */
+ u64 me_offset = MARUFS_ME_AREA_OFFSET;
+ u64 me_size = marufs_me_area_size(MARUFS_ME_GLOBAL_SHARDS,
+ MARUFS_ME_MAX_NODES);
+ if (me_offset + me_size > regions_start) {
+ pr_err("no space for ME area (need %llu at 0x%llx, regions@0x%llx)\n",
+ me_size, me_offset, regions_start);
+ return -ENOSPC;
+ }
+
+ int ret = marufs_me_format((char *)base + me_offset,
+ MARUFS_ME_GLOBAL_SHARDS, MARUFS_ME_MAX_NODES,
+ MARUFS_ME_DEFAULT_POLL_US,
+ MARUFS_ME_REQUEST);
+ if (ret) {
+ pr_err("ME format failed: %d\n", ret);
+ return ret;
+ }
+
+ WRITE_LE64(gsb->me_area_offset, me_offset);
+
+ /* Full barrier to ensure all metadata except magic is flushed. */
+ MARUFS_CXL_WMB(base, regions_start);
+
+ /* --- Verify format: read back critical fields --- */
+ /* Check the first shard */
+ struct marufs_shard_header *vsh =
+ (struct marufs_shard_header *)((char *)base +
+ MARUFS_SHARD_TABLE_OFFSET);
+ u32 v_magic = READ_CXL_LE32(vsh->magic);
+ u32 v_buckets = READ_CXL_LE32(vsh->num_buckets);
+ u32 v_entries = READ_CXL_LE32(vsh->num_entries);
+ pr_debug(
+ "format verify the first shard: magic=0x%x buckets=%u entries=%u\n",
+ v_magic, v_buckets, v_entries);
+
+ /* Check the last shard (previously corrupted) */
+ vsh = (struct marufs_shard_header *)((char *)base +
+ MARUFS_SHARD_TABLE_OFFSET +
+ (MARUFS_REGION_NUM_SHARDS - 1) *
+ MARUFS_SHARD_HEADER_SIZE);
+ v_magic = READ_CXL_LE32(vsh->magic);
+ v_buckets = READ_CXL_LE32(vsh->num_buckets);
+ v_entries = READ_CXL_LE32(vsh->num_entries);
+ pr_debug(
+ "format verify the last shard: magic=0x%x buckets=%u entries=%u\n",
+ v_magic, v_buckets, v_entries);
+
+ /* Check RAT magic and first entry state */
+ u32 v_rat_magic = READ_CXL_LE32(rat->magic);
+ u32 v_rat_state0 = READ_CXL_LE32(rat->entries[0].state);
+ pr_debug("format verify RAT: magic=0x%x entry[0].state=%u\n",
+ v_rat_magic, v_rat_state0);
+
+ if (v_magic != MARUFS_SHARD_MAGIC || v_buckets != buckets_per_shard) {
+ pr_err("FORMAT VERIFICATION FAILED — WC memory not flushed\n");
+ return -EIO;
+ }
+
+ /*
+ * Write GSB magic LAST as the format-complete fence.
+ * Joiners poll for this magic to know format is done.
+ */
+ WRITE_LE32(gsb->magic, MARUFS_MAGIC);
+ u32 want_csum = marufs_gsb_checksum(gsb);
+ WRITE_LE32(gsb->checksum, want_csum);
+ MARUFS_CXL_WMB(gsb, sizeof(*gsb));
+
+ pr_info("format complete (shards=%u, entries/shard=%u, rat@0x%llx, regions@0x%llx)\n",
+ num_shards, entries_per_shard, rat_offset, regions_start);
+ return 0;
+}
+
+/* marufs_dax_release - unmap DEV_DAX memory. */
+static void marufs_dax_release(struct marufs_sb_info *sbi)
+{
+ if (sbi->dax_base)
+ memunmap(sbi->dax_base);
+ sbi->dax_base = NULL;
+}
+
+/* ============================================================================
+ * Read and validate Global Superblock
+ * ============================================================================ */
+
+static int marufs_read_superblock(struct marufs_sb_info *sbi,
+ struct super_block *sb, int silent)
+{
+ struct marufs_superblock *gsb = marufs_gsb_get(sbi);
+
+ if (!gsb)
+ return -EINVAL;
+
+ /* Validate magic */
+ u32 magic = READ_CXL_LE32(gsb->magic);
+ if (magic != MARUFS_MAGIC) {
+ if (!silent)
+ pr_err("invalid magic 0x%x (expected 0x%x)\n", magic,
+ MARUFS_MAGIC);
+ return -EINVAL;
+ }
+
+ /* Validate version (v2 required: region header pool layout) */
+ u32 version = READ_CXL_LE32(gsb->version);
+ if (version != MARUFS_VERSION) {
+ if (!silent)
+ pr_err("unsupported version %u (expected %u, reformat required)\n",
+ version, MARUFS_VERSION);
+ return -EINVAL;
+ }
+
+ /* Validate CRC32 (immutable fields only, excludes checksum/reserved) */
+ u32 stored = READ_CXL_LE32(gsb->checksum);
+ u32 computed = marufs_gsb_checksum(gsb);
+ if (stored != computed) {
+ if (!silent)
+ pr_err("superblock checksum mismatch (stored=0x%x, computed=0x%x)\n",
+ stored, computed);
+ return -EINVAL;
+ }
+
+ /* Copy geometry fields to sbi */
+ sbi->total_size = READ_CXL_LE64(gsb->total_size);
+ sbi->num_shards = READ_CXL_LE32(gsb->num_shards);
+ if (sbi->num_shards == 0 || !is_power_of_2(sbi->num_shards)) {
+ pr_err("num_shards %u is not a power of 2\n", sbi->num_shards);
+ return -EINVAL;
+ }
+ sbi->shard_mask = sbi->num_shards - 1;
+ sbi->buckets_per_shard = READ_CXL_LE32(gsb->buckets_per_shard);
+ if (sbi->buckets_per_shard == 0 ||
+ !is_power_of_2(sbi->buckets_per_shard)) {
+ pr_err("buckets_per_shard %u is not a power of 2\n",
+ sbi->buckets_per_shard);
+ return -EINVAL;
+ }
+ sbi->bucket_mask = sbi->buckets_per_shard - 1;
+ sbi->entries_per_shard = READ_CXL_LE32(gsb->entries_per_shard);
+
+ pr_debug("superblock validated\n");
+ pr_debug(" total_size=%llu, shards=%u, entries/shard=%u\n",
+ sbi->total_size, sbi->num_shards, sbi->entries_per_shard);
+
+ return 0;
+}
+
+/* ============================================================================
+ * Initialize Shard Table
+ * ============================================================================ */
+
+static int marufs_init_shard_table(struct marufs_sb_info *sbi)
+{
+ /* Validate shard_table_offset within DAX range */
+ u64 shard_table_offset = READ_CXL_LE64(sbi->gsb->shard_table_offset);
+ if (!marufs_dax_range_valid(sbi, shard_table_offset,
+ (u64)sbi->num_shards *
+ MARUFS_SHARD_HEADER_SIZE)) {
+ pr_err("shard_table_offset 0x%llx out of range\n",
+ shard_table_offset);
+ return -EINVAL;
+ }
+
+ WARN_ON_ONCE(shard_table_offset != MARUFS_SHARD_TABLE_OFFSET);
+
+ /* shard_cache + header pointers already populated by
+ * marufs_sbi_init_layout_ptrs at mount start. Here we validate each
+ * header and fill the buckets/entries pointers. */
+ for (u32 i = 0; i < sbi->num_shards; i++) {
+ struct marufs_shard_header *sh =
+ marufs_shard_header_get(sbi, i);
+ if (!sh) {
+ pr_err("shard_cache header NULL at %u\n", i);
+ return -EINVAL;
+ }
+
+ if (READ_CXL_LE32(sh->magic) != MARUFS_SHARD_MAGIC) {
+ pr_err("bad shard magic at %u (0x%x)\n", i,
+ READ_CXL_LE32(sh->magic));
+ marufs_free_shard_resources(sbi);
+ return -EINVAL;
+ }
+ if (READ_CXL_LE32(sh->shard_id) != i) {
+ pr_err("shard_id mismatch at %u (got %u)\n", i,
+ READ_CXL_LE32(sh->shard_id));
+ marufs_free_shard_resources(sbi);
+ return -EINVAL;
+ }
+
+ u32 nb = READ_LE32(sh->num_buckets);
+ u32 ne = READ_LE32(sh->num_entries);
+ if (!marufs_shard_geometry_valid(nb, ne)) {
+ pr_err("shard %u geometry invalid: buckets=%u entries=%u (expected %u)\n",
+ i, nb, ne, sbi->entries_per_shard);
+ marufs_free_shard_resources(sbi);
+ return -EINVAL;
+ }
+
+ u64 boff = READ_CXL_LE64(sh->bucket_array_offset);
+ u64 entry_off = READ_CXL_LE64(sh->entry_array_offset);
+
+ /* Validate offsets + array sizes within device bounds */
+ if (!marufs_dax_range_valid(sbi, boff, (u64)nb * sizeof(u32)) ||
+ !marufs_dax_range_valid(sbi, entry_off,
+ (u64)ne *
+ MARUFS_INDEX_ENTRY_SIZE)) {
+ pr_err("shard %u: offset out of bounds "
+ "(bucket=%llu entry=%llu total=%llu)\n",
+ i, boff, entry_off, sbi->total_size);
+ marufs_free_shard_resources(sbi);
+ return -EINVAL;
+ }
+
+ sbi->shard_cache[i].buckets = marufs_dax_ptr(sbi, boff);
+ sbi->shard_cache[i].entries = marufs_dax_ptr(sbi, entry_off);
+ }
+
+ pr_debug(
+ "shard table initialized (%u shards validated, RAM counters + shard cache allocated)\n",
+ sbi->num_shards);
+
+ return 0;
+}
+
+/* ============================================================================
+ * Load and validate RAT (Region Allocation Table)
+ * ============================================================================ */
+
+static int marufs_load_rat(struct marufs_sb_info *sbi)
+{
+ /* Get RAT offset from superblock */
+ u64 rat_offset = READ_CXL_LE64(sbi->gsb->rat_offset);
+ if (rat_offset == 0) {
+ pr_err("RAT not present! rat_offset=0 (filesystem too old or corrupted)\n");
+ return -EINVAL;
+ }
+
+ /* Validate rat_offset within DAX range */
+ if (!marufs_dax_range_valid(sbi, rat_offset,
+ sizeof(struct marufs_rat))) {
+ pr_err("rat_offset 0x%llx out of range\n", rat_offset);
+ return -EINVAL;
+ }
+
+ WARN_ON_ONCE(rat_offset != MARUFS_RAT_OFFSET);
+
+ /* sbi->rat already cached by marufs_sbi_init_layout_ptrs at mount start. */
+ struct marufs_rat *rat = marufs_rat_get(sbi);
+ if (!rat)
+ return -EINVAL;
+
+ /* Validate RAT magic */
+ u32 magic = READ_CXL_LE32(rat->magic);
+ if (magic != MARUFS_RAT_MAGIC) {
+ pr_err("invalid RAT magic 0x%x (expected 0x%x) at offset 0x%llx\n",
+ magic, MARUFS_RAT_MAGIC, rat_offset);
+ return -EINVAL;
+ }
+
+ /* Validate RAT version */
+ u32 version = READ_CXL_LE32(rat->version);
+ if (version != 1) {
+ pr_err("unsupported RAT version %u (expected 1)\n", version);
+ return -EINVAL;
+ }
+
+ pr_debug("RAT loaded at offset 0x%llx, RAT pointer=%p\n", rat_offset,
+ rat);
+ pr_debug(" device_size=%llu, regions_start=0x%llx\n",
+ READ_CXL_LE64(rat->device_size),
+ READ_CXL_LE64(rat->regions_start));
+ pr_debug(" num_entries=%u/%u\n", READ_CXL_LE32(rat->num_entries),
+ READ_CXL_LE32(rat->max_entries));
+ pr_debug(" total_allocated=%llu, total_free=%llu\n",
+ READ_CXL_LE64(rat->total_allocated),
+ READ_CXL_LE64(rat->total_free));
+
+ return 0;
+}
+
+/* ============================================================================
+ * Create root directory inode
+ * ============================================================================ */
+
+static struct inode *marufs_make_root_inode(struct super_block *sb)
+{
+ struct inode *inode;
+ struct marufs_inode_info *xi;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ inode->i_ino = MARUFS_ROOT_INO;
+ inode->i_mode = S_IFDIR | MARUFS_ROOT_DIR_MODE;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+
+ inode_set_atime_to_ts(inode, current_time(inode));
+ inode_set_mtime_to_ts(inode, current_time(inode));
+ inode_set_ctime_to_ts(inode, current_time(inode));
+
+ inode->i_op = &marufs_dir_inode_ops;
+ inode->i_fop = &marufs_dir_ops;
+
+ set_nlink(inode, 2);
+
+ xi = marufs_inode_get(inode);
+ xi->region_id = 0;
+ xi->entry_idx = 0;
+ xi->shard_id = 0;
+
+ return inode;
+}
+
+/* ============================================================================
+ * Common mount handling for DEV_DAX
+ * ============================================================================ */
+
+static int marufs_fill_super_common(struct super_block *sb,
+ struct marufs_sb_info *sbi, int silent,
+ enum marufs_me_strategy me_strategy)
+{
+ struct inode *root_inode;
+ struct dentry *root_dentry;
+ int ret;
+
+ /* Step 1: Read superblock */
+ ret = marufs_read_superblock(sbi, sb, silent);
+ if (ret) {
+ pr_err("failed to read superblock\n");
+ pr_err("mount with 'format' option to initialize: mount -t %s -o daxdev=...,node_id=%d,format none /mnt/...\n",
+ MARUFS_MODULE_NAME, sbi->node_id);
+ return ret;
+ }
+
+ /* Step 2: Initialize shard table */
+ ret = marufs_init_shard_table(sbi);
+ if (ret) {
+ pr_err("failed to initialize shard table\n");
+ goto err_free_gsb;
+ }
+
+ /* Step 2.5: Load RAT (Region Allocation Table) */
+ ret = marufs_load_rat(sbi);
+ if (ret) {
+ pr_err("failed to load RAT\n");
+ goto err_free_gsb;
+ }
+
+ /* Step 3: RAT mode - no fixed region initialization needed */
+ /* RAT entries are allocated on-demand during file creation */
+
+ /* Step 4: Configure VFS superblock */
+ sb->s_magic = MARUFS_MAGIC;
+ sb->s_blocksize = MARUFS_VFS_BLOCK_SIZE;
+ sb->s_blocksize_bits = MARUFS_VFS_BLOCK_SIZE_BITS;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &marufs_sops;
+ MARUFS_SET_D_OP(sb, &marufs_dentry_ops);
+ sb->s_time_gran = 1;
+
+ /* Step 5: Create root inode */
+ root_inode = marufs_make_root_inode(sb);
+ if (IS_ERR(root_inode)) {
+ ret = PTR_ERR(root_inode);
+ goto err_free_gsb;
+ }
+
+ root_dentry = d_make_root(root_inode);
+ if (!root_dentry) {
+ ret = -ENOMEM;
+ goto err_free_gsb;
+ }
+
+ sb->s_root = root_dentry;
+
+ /* Step 6: Initialize cache */
+ ret = marufs_cache_init(sbi);
+ if (ret)
+ pr_warn("failed to initialize entry cache: %d\n", ret);
+
+ /* Step 7: Register sysfs */
+ ret = marufs_sysfs_register(sbi);
+ if (ret)
+ pr_warn("failed to register sysfs: %d\n", ret);
+
+ /* Step 7.5: Initialize Global ME */
+ u64 me_offset = READ_CXL_LE64(sbi->gsb->me_area_offset);
+ if (me_offset != MARUFS_ME_AREA_OFFSET) {
+ pr_err("ME area offset mismatch (got 0x%llx, expected 0x%x). Re-format required.\n",
+ me_offset, MARUFS_ME_AREA_OFFSET);
+ ret = -EINVAL;
+ goto err_free_gsb;
+ }
+
+ /* Start the unified ME poll thread (serves Global ME + all NRHT MEs) */
+ marufs_me_registry_init(sbi);
+ ret = marufs_me_registry_start(sbi);
+ if (ret) {
+ pr_err("ME registry start failed: %d\n", ret);
+ goto err_free_gsb;
+ }
+
+ /* sbi->me_area already cached by marufs_sbi_init_layout_ptrs at mount start. */
+ sbi->me = marufs_me_create(sbi->me_area, MARUFS_ME_GLOBAL_SHARDS,
+ MARUFS_ME_MAX_NODES, sbi->node_id,
+ MARUFS_ME_DEFAULT_POLL_US, me_strategy);
+ if (IS_ERR(sbi->me)) {
+ pr_err("ME create failed: %ld\n", PTR_ERR(sbi->me));
+ sbi->me = NULL;
+ ret = -ENOMEM;
+ marufs_me_registry_stop(sbi);
+ goto err_free_gsb;
+ }
+ pr_info("ME strategy: %s\n",
+ sbi->me->strategy == MARUFS_ME_ORDER ? "order" : "request");
+
+ ret = sbi->me->ops->join(sbi->me);
+ if (ret) {
+ pr_err("ME join failed: %d\n", ret);
+ marufs_me_destroy(sbi->me);
+ sbi->me = NULL;
+ marufs_me_registry_stop(sbi);
+ goto err_free_gsb;
+ }
+
+ /* Register with unified poll thread */
+ marufs_me_register(sbi, sbi->me);
+
+ /* Step 8: Start background GC thread */
+ ret = marufs_gc_start(sbi);
+ if (ret)
+ pr_warn("failed to start gc thread: %d\n", ret);
+
+ pr_info("filesystem mounted (node=%u, shards=%u)\n", sbi->node_id,
+ sbi->num_shards);
+ return 0;
+
+err_free_gsb:
+ /* Free shard_cache and per-shard DRAM arrays allocated before RAT loading */
+ marufs_free_shard_resources(sbi);
+
+ return ret;
+}
+
+/* ============================================================================
+ * Auto-mount helpers (called from marufs_fill_super)
+ * ============================================================================ */
+
+/*
+ * marufs_sbi_init_layout_ptrs - cache all on-disk layout pointers.
+ *
+ * Each cached pointer references a region at a compile-time fixed offset
+ * (GSB at 0, bootstrap area at MARUFS_BOOTSTRAP_AREA_OFFSET, RAT at
+ * MARUFS_RAT_OFFSET, ME area at MARUFS_ME_AREA_OFFSET). All four pointers
+ * are valid as soon as sbi->dax_base is set — even before format or
+ * read_superblock has populated the underlying memory. Callers that
+ * dereference the pointers must still ensure the content is initialised
+ * (e.g. format completed, read_superblock validated).
+ *
+ * Call once in marufs_fill_super after marufs_dax_acquire succeeds.
+ */
+static int marufs_sbi_init_layout_ptrs(struct marufs_sb_info *sbi)
+{
+ u32 i;
+
+ sbi->gsb = marufs_dax_ptr(sbi, MARUFS_GSB_OFFSET);
+ sbi->bootstrap_slots =
+ marufs_dax_ptr(sbi, MARUFS_BOOTSTRAP_AREA_OFFSET);
+ sbi->rat = marufs_dax_ptr(sbi, MARUFS_RAT_OFFSET);
+ sbi->me_area = marufs_dax_ptr(sbi, MARUFS_ME_AREA_OFFSET);
+
+ /* Allocate per-shard cache + populate header pointers (offset-derived).
+ * buckets/entries fields are filled later by marufs_init_shard_table
+ * after format/read_superblock validates shard content. */
+ sbi->shard_cache = kvmalloc_array(MARUFS_REGION_NUM_SHARDS,
+ sizeof(struct marufs_shard_cache),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!sbi->shard_cache)
+ return -ENOMEM;
+
+ for (i = 0; i < MARUFS_REGION_NUM_SHARDS; i++) {
+ u64 off = MARUFS_SHARD_TABLE_OFFSET +
+ (u64)i * MARUFS_SHARD_HEADER_SIZE;
+ sbi->shard_cache[i].header = marufs_dax_ptr(sbi, off);
+ }
+ return 0;
+}
+
+/*
+ * marufs_check_needs_format - determine whether the device requires formatting.
+ *
+ * Check GSB magic and checksum; invalid means format needed.
+ */
+static bool marufs_check_needs_format(struct marufs_sb_info *sbi)
+{
+ struct marufs_superblock *gsb = marufs_gsb_get(sbi);
+ if (!gsb || READ_CXL_LE32(gsb->magic) != MARUFS_MAGIC)
+ return true;
+
+ /* Magic OK — validate checksum; stale layout leaves magic valid but
+ * checksum wrong, so reformat rather than failing the mount.
+ */
+ u32 stored = READ_CXL_LE32(gsb->checksum);
+ u32 computed = marufs_gsb_checksum(gsb);
+ if (stored != computed) {
+ pr_warn("bootstrap: GSB checksum mismatch (stored=0x%x computed=0x%x) — reformatting\n",
+ stored, computed);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * marufs_run_formatter - formatter path for slot[0] on an unformatted device.
+ *
+ * Sets slot[0]=FORMATTING, optionally injects stuck state (chaos test), then
+ * calls marufs_format_device() and promotes slot[0] to CLAIMED on success.
+ *
+ * Return values:
+ * 0 success; slot[0]=CLAIMED, GSB magic written.
+ * -EOWNERDEAD inject_stuck path: slot[0] LEFT at FORMATTING for joiners to
+ * detect via timeout. Caller must NOT release the bootstrap
+ * slot but MUST run normal sbi teardown (shard_cache, nrht_stats
+ * etc.) — route to err_release_dax (which skips bs_release).
+ * other real format error; slot released, caller may goto err_release_dax.
+ */
+/*
+ * marufs_format_and_promote - format the device then promote slot[0] to CLAIMED.
+ *
+ * Shared between formatter election (marufs_run_formatter) and joiner-side
+ * stuck recovery (marufs_run_joiner). Caller must already own slot[0] at
+ * MARUFS_BS_FORMATTING. On format failure, releases the bootstrap slot.
+ */
+static int marufs_format_and_promote(struct marufs_sb_info *sbi)
+{
+ int ret = marufs_format_device(sbi);
+ if (ret) {
+ pr_err("bootstrap: format failed: %d\n", ret);
+ marufs_bootstrap_release(sbi);
+ return ret;
+ }
+ /* GSB magic was written inside format_device; promote slot */
+ marufs_bootstrap_promote_claimed(sbi);
+ return 0;
+}
+
+static int marufs_run_formatter(struct marufs_sb_info *sbi,
+ struct super_block *sb)
+{
+ pr_info("bootstrap: formatter elected (slot=0, node_id=%u)\n",
+ sbi->node_id);
+
+ /* Signal formatting in progress to joiners */
+ marufs_bootstrap_set_status(sbi, 0, MARUFS_BS_FORMATTING);
+
+ if (marufs_bootstrap_should_inject_stuck()) {
+ pr_info("bootstrap: DEBUG stuck-formatter injection active — leaving slot[0]=FORMATTING, aborting mount (test-only)\n");
+ return -EOWNERDEAD;
+ }
+
+ return marufs_format_and_promote(sbi);
+}
+
+/*
+ * marufs_run_joiner - joiner path: wait for formatter, handle stuck recovery.
+ *
+ * Returns 0 on success (GSB magic visible or recovery format complete).
+ * Returns -EAGAIN if we lost the steal race — caller must release its joiner
+ * slot and retry from marufs_bootstrap_claim.
+ * Returns other negative on unrecoverable error; slot released by callee.
+ */
+static int marufs_run_joiner(struct marufs_sb_info *sbi)
+{
+ int ret;
+
+ pr_info("bootstrap: joiner waiting for format (slot=%d, node_id=%u)\n",
+ sbi->bootstrap_slot_idx, sbi->node_id);
+
+ ret = marufs_bootstrap_wait_for_format(sbi);
+ if (ret == 0)
+ return 0;
+
+ if (ret != -EAGAIN) {
+ pr_err("bootstrap: wait_for_format error: %d\n", ret);
+ marufs_bootstrap_release(sbi);
+ return ret;
+ }
+
+ /* Formatter stuck — attempt steal + re-format */
+ ret = marufs_bootstrap_steal_stuck_slot0(sbi);
+ if (ret == -EAGAIN) {
+ /* Another node won the steal; caller releases joiner slot and
+ * retries from scratch. */
+ pr_info("bootstrap: steal lost, retrying mount\n");
+ return -EAGAIN;
+ }
+ if (ret) {
+ pr_err("bootstrap: steal failed: %d\n", ret);
+ return ret;
+ }
+
+ /* We now own slot[0] at FORMATTING; re-format and promote */
+ sbi->node_id = 1;
+ return marufs_format_and_promote(sbi);
+}
+
+/*
+ * marufs_auto_mount - top-level auto-mount: claim → format-or-wait.
+ *
+ * Handles the full claim/format/wait loop. On success returns 0 with sbi
+ * fully populated. On failure returns negative errno; bootstrap slot is
+ * released for normal errors but LEFT at FORMATTING for the inject_stuck
+ * test path (-EOWNERDEAD). In all error cases caller routes to
+ * err_release_dax for normal sbi teardown.
+ */
+static int marufs_auto_mount(struct marufs_sb_info *sbi, struct super_block *sb,
+ const struct marufs_mount_opts *opts)
+{
+ /* Initialise bootstrap area (no-op on well-formed devices) */
+ marufs_bootstrap_init_area(sbi->dax_base);
+
+ /*
+ * Claim → format/wait loop.
+ *
+ * Two retry triggers:
+ * (a) bootstrap_claim returns -EAGAIN: lost the per-slot write race;
+ * another node won this slot, retry the whole scan.
+ * (b) run_joiner returns -EAGAIN: detected a stuck formatter and
+ * lost the steal race for slot[0]; release our joiner slot and
+ * retry the claim from scratch.
+ *
+ * Bounded by EBUSY (table full) or other fatal errors propagating up.
+ */
+ int slot_idx = -1;
+ for (;;) {
+ int ret = marufs_bootstrap_claim(sbi, &slot_idx);
+ if (ret == -EAGAIN) {
+ pr_info("bootstrap: claim race lost, retrying\n");
+ continue;
+ }
+ if (ret) {
+ pr_err("bootstrap: claim failed: %d\n", ret);
+ return ret;
+ }
+
+ sbi->node_id = slot_idx + 1;
+ pr_info("bootstrap: node_id=%u (slot %d)\n", sbi->node_id,
+ slot_idx);
+
+ if (slot_idx == 0 && marufs_check_needs_format(sbi)) {
+ ret = marufs_run_formatter(sbi, sb);
+ /* -EAGAIN means inject_stuck: sbi already freed,
+ * propagate. */
+ if (ret)
+ return ret;
+ } else if (slot_idx != 0) {
+ ret = marufs_run_joiner(sbi);
+ if (ret == -EAGAIN) {
+ /* Lost steal race: release joiner slot, retry */
+ marufs_bootstrap_release(sbi);
+ continue;
+ }
+ if (ret)
+ return ret;
+ } else {
+ /* slot_idx == 0 && !needs_format: device already
+ * formatted, claimed a free slot[0] after reclaim.
+ * Ensure status is CLAIMED (may have been EMPTY). */
+ marufs_bootstrap_promote_claimed(sbi);
+ }
+
+ return 0;
+ }
+}
+
+/*
+ * marufs_manual_mount - manual node_id= path (legacy explicit mount).
+ *
+ * Optionally formats, then claims the explicit slot.
+ * Returns 0 on success, negative on error (DAX not released — caller handles).
+ */
+static int marufs_manual_mount(struct marufs_sb_info *sbi,
+ struct super_block *sb,
+ const struct marufs_mount_opts *opts)
+{
+ int ret;
+
+ if (opts->format) {
+ ret = marufs_format_device(sbi);
+ if (ret) {
+ pr_err("in-kernel format failed\n");
+ return ret;
+ }
+ }
+
+ /* Try to claim bootstrap slot for this node_id.
+ * Best-effort: if bootstrap area not yet initialised on an old
+ * image, the claim will see magic==0 and succeed silently.
+ */
+ ret = marufs_bootstrap_claim_explicit(sbi, opts->node_id);
+ if (ret == -EBUSY) {
+ pr_err("bootstrap: slot for node_id=%d already active\n",
+ opts->node_id);
+ return ret;
+ } else if (ret) {
+ /* Other errors (EAGAIN, EINVAL) — warn but continue */
+ pr_warn("bootstrap: explicit claim for node_id=%d: %d\n",
+ opts->node_id, ret);
+ }
+ return 0;
+}
+
+/* ============================================================================
+ * Mount handling - marufs_fill_super (DEV_DAX)
+ * ============================================================================ */
+
+int marufs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ /* Parse mount options */
+ struct marufs_mount_opts opts;
+ int ret = marufs_parse_options((char *)data, &opts);
+ if (ret)
+ return ret;
+
+ /* Allocate marufs_sb_info */
+ struct marufs_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ sb->s_fs_info = sbi;
+ sbi->bootstrap_slot_idx = -1; /* not yet claimed */
+ mutex_init(&sbi->vm_ops_lock);
+ mutex_init(&sbi->me_list_lock);
+ mutex_init(&sbi->nrht_me_lock);
+
+ /* Validate node_id only for explicit manual mounts (node_id ≥ 1).
+ * Auto-mount path (opts.node_id == 0) assigns node_id from bootstrap claim.
+ */
+ if (opts.node_id != 0) {
+ if (opts.node_id > MARUFS_MAX_NODE_ID) {
+ pr_err("node_id=%d invalid (valid range: 1..%d)\n",
+ opts.node_id, MARUFS_MAX_NODE_ID);
+ kfree(sbi);
+ return -EINVAL;
+ }
+ sbi->node_id = opts.node_id;
+ }
+ /* Auto-mount: node_id assigned after bootstrap claim below */
+
+ /* Fine-grained NRHT per-CPU stats (bucket-chain depth). alloc_percpu
+ * zeroes every field; recorders check for NULL for safety.
+ */
+ sbi->nrht_stats = alloc_percpu(struct marufs_nrht_stats_pcpu);
+ if (!sbi->nrht_stats) {
+ kfree(sbi);
+ return -ENOMEM;
+ }
+
+ /* DEV_DAX path */
+ if (!opts.daxdev[0]) {
+ pr_err("must specify daxdev= mount option\n");
+ ret = -EINVAL;
+ goto err_free_sbi;
+ }
+ strscpy(sbi->daxdev_path, opts.daxdev, sizeof(sbi->daxdev_path));
+ pr_debug("fill_super DEV_DAX %s (node_id=%d)\n", opts.daxdev,
+ opts.node_id);
+
+ ret = marufs_dax_acquire(sbi);
+ if (ret) {
+ pr_err("DEV_DAX acquisition failed for %s\n", opts.daxdev);
+ goto err_free_sbi;
+ }
+
+ /* Cache all on-disk layout pointers (GSB, bootstrap, RAT, ME area, shard headers). */
+ ret = marufs_sbi_init_layout_ptrs(sbi);
+ if (ret) {
+ pr_err("layout pointer init failed: %d\n", ret);
+ goto err_release_dax;
+ }
+
+ if (opts.node_id == 0)
+ ret = marufs_auto_mount(sbi, sb, &opts);
+ else
+ ret = marufs_manual_mount(sbi, sb, &opts);
+ if (ret)
+ goto err_release_dax;
+
+ /* Seed admin role cache before fill_super_common (GC starts inside). */
+ sbi->cached_admin_node_id = marufs_current_admin_node_id(sbi);
+
+ /* Common handling */
+ ret = marufs_fill_super_common(sb, sbi, silent, opts.me_strategy);
+ if (ret)
+ goto err_bs_release;
+
+ return 0;
+
+err_bs_release:
+ marufs_bootstrap_release(sbi);
+err_release_dax:
+ marufs_dax_release(sbi);
+err_free_sbi:
+ marufs_free_shard_resources(sbi);
+ free_percpu(sbi->nrht_stats);
+ kfree(sbi);
+ sb->s_fs_info = NULL;
+ return ret;
+}
+
+/* ============================================================================
+ * Mount/unmount callbacks
+ * ============================================================================ */
+
+static struct dentry *marufs_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ /* DEV_DAX uses mount_nodev — no block device */
+ return mount_nodev(fs_type, flags, data, marufs_fill_super);
+}
+
+static void marufs_kill_sb(struct super_block *sb)
+{
+ struct marufs_sb_info *sbi = marufs_sb_get(sb);
+
+ if (sbi) {
+ /* Bootstrap: release slot (CLAIMED → EMPTY) */
+ marufs_bootstrap_release(sbi);
+
+ /* Teardown all NRHT ME instances (opt-in per rat_entry) */
+ for (u32 i = 0; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ marufs_me_teardown(sbi, sbi->nrht_me[i]);
+ sbi->nrht_me[i] = NULL;
+ }
+
+ /* Teardown Global ME (must happen before GC stop and dax_release) */
+ marufs_me_teardown(sbi, sbi->me);
+ sbi->me = NULL;
+
+ /* Stop the unified registry poll thread */
+ marufs_me_registry_stop(sbi);
+
+ /* Stop background GC thread */
+ marufs_gc_stop(sbi);
+
+ /* RAT mode: regions persist across mounts, GC handles cleanup */
+
+ /* Unregister sysfs */
+ marufs_sysfs_unregister(sbi);
+
+ /* Destroy entry cache */
+ marufs_cache_destroy(sbi);
+
+ /*
+ * Release non-DAX buffers. In DAX mode, shard_table and gsb
+ * point into CXL memory -- nothing to free.
+ */
+
+ /* Free per-shard RAM hint, counters, and shard cache */
+ marufs_free_shard_resources(sbi);
+
+ /* Close DAX device file (mmap delegation) */
+ if (sbi->dax_filp) {
+ filp_close(sbi->dax_filp, NULL);
+ sbi->dax_filp = NULL;
+ }
+
+ /* Release DAX mapping */
+ marufs_dax_release(sbi);
+
+ free_percpu(sbi->nrht_stats);
+ kfree(sbi);
+ }
+
+ kill_anon_super(sb);
+
+ pr_info("filesystem unmounted\n");
+}
+
+/* ============================================================================
+ * Filesystem type registration
+ * ============================================================================ */
+
+static struct file_system_type marufs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = MARUFS_MODULE_NAME,
+ .mount = marufs_mount,
+ .kill_sb = marufs_kill_sb,
+ .fs_flags = 0, /* No FS_REQUIRES_DEV - DEV_DAX support (mount_nodev) */
+};
+
+/* ============================================================================
+ * Inode cache initialization
+ * ============================================================================ */
+
+static void marufs_inode_init_once(void *obj)
+{
+ struct marufs_inode_info *xi = obj;
+ inode_init_once(&xi->vfs_inode);
+}
+
+/* ============================================================================
+ * Module init/exit
+ * ============================================================================ */
+
+static int __init marufs_init(void)
+{
+ int ret;
+
+ marufs_inode_cachep = kmem_cache_create(
+ "marufs_inode_cache", sizeof(struct marufs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT | MARUFS_SLAB_MEM_SPREAD,
+ marufs_inode_init_once);
+ if (!marufs_inode_cachep) {
+ return -ENOMEM;
+ }
+
+ ret = register_filesystem(&marufs_fs_type);
+ if (ret) {
+ kmem_cache_destroy(marufs_inode_cachep);
+ return ret;
+ }
+
+ /* Initialize sysfs interface */
+ ret = marufs_sysfs_init();
+ if (ret) {
+ pr_err("failed to initialize sysfs\n");
+ unregister_filesystem(&marufs_fs_type);
+ kmem_cache_destroy(marufs_inode_cachep);
+ return ret;
+ }
+
+ pr_info("Partitioned Global Index filesystem loaded (node_id=%d)\n",
+ marufs_node_id);
+
+ return 0;
+}
+
+static void __exit marufs_exit(void)
+{
+ pr_info("unloading module\n");
+
+ /* Clean up sysfs interface */
+ marufs_sysfs_exit();
+
+ unregister_filesystem(&marufs_fs_type);
+ rcu_barrier();
+ kmem_cache_destroy(marufs_inode_cachep);
+
+ pr_info("module unloaded\n");
+}
+
+module_init(marufs_init);
+module_exit(marufs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("XCMP Team");
+MODULE_DESCRIPTION(
+ "MARUFS - Partitioned Global Index filesystem for CXL shared memory");
+MODULE_VERSION("1.0");
diff --git a/marufs_kernel/src/super.h b/marufs_kernel/src/super.h
new file mode 100644
index 0000000..2363e3a
--- /dev/null
+++ b/marufs_kernel/src/super.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * super.h - Superblock fill / VFS mount entry point.
+ */
+
+#ifndef _MARUFS_SUPER_H
+#define _MARUFS_SUPER_H
+
+struct super_block;
+
+int marufs_fill_super(struct super_block *sb, void *data, int silent);
+
+#endif /* _MARUFS_SUPER_H */
diff --git a/marufs_kernel/src/sysfs.c b/marufs_kernel/src/sysfs.c
new file mode 100644
index 0000000..ce68446
--- /dev/null
+++ b/marufs_kernel/src/sysfs.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysfs.c - MARUFS sysfs interface (core).
+ *
+ * Owns the registered-sbi list and the umbrella attribute groups.
+ * Domain-specific attributes are split out:
+ * sysfs_me.c - ME inspection + per-CPU stats
+ * sysfs_gc.c - GC monitoring (deleg_info, gc_status)
+ * sysfs_nrht.c - NRHT chain-depth histogram
+ * sysfs_debug.c - fault injection + manual GC control (debug group)
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "sysfs_internal.h"
+#include "sysfs_me.h"
+#include "sysfs_gc.h"
+#include "sysfs_nrht.h"
+#include "sysfs_debug.h"
+
+static struct kobject *marufs_kobj;
+
+/* Shared with split sysfs_*.c files via sysfs_internal.h. */
+struct marufs_sb_info *marufs_sysfs_sbi_list[MARUFS_MAX_MOUNTS];
+DEFINE_MUTEX(marufs_sysfs_lock); /* Protects sbi_list access */
+
+struct marufs_sb_info *marufs_sysfs_get_sbi(void)
+{
+ int i;
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (marufs_sysfs_sbi_list[i])
+ return marufs_sysfs_sbi_list[i];
+ }
+ return NULL;
+}
+
+struct marufs_sb_info *marufs_sysfs_find_by_node(u32 node_id)
+{
+ int i;
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (marufs_sysfs_sbi_list[i] &&
+ marufs_sysfs_sbi_list[i]->node_id == node_id)
+ return marufs_sysfs_sbi_list[i];
+ }
+ return NULL;
+}
+
+/* /sys/fs/marufs/version */
+static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", MARUFS_VERSION);
+}
+static struct kobj_attribute version_attr = __ATTR_RO(version);
+
+/* /sys/fs/marufs/region_info */
+static ssize_t region_info_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct marufs_sb_info *sbi;
+ int len = 0;
+ u32 i;
+
+ mutex_lock(&marufs_sysfs_lock);
+ sbi = marufs_sysfs_get_sbi();
+ if (!sbi) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ }
+
+ /* RAT mode: show RAT entries */
+ len += sysfs_emit_at(
+ buf, len, "RAT_Entry\tNode\tPID\tState\tSize\tOffset\tName\n");
+
+ for (i = 0; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ u32 state, owner_node, owner_pid;
+ u64 size, offset;
+ const char *state_str;
+ char name[MARUFS_NAME_MAX + 1];
+ struct marufs_rat_entry *entry = marufs_rat_entry_get(sbi, i);
+ if (!entry)
+ continue;
+
+ state = READ_LE32(entry->state);
+ if (state != MARUFS_RAT_ENTRY_ALLOCATED)
+ continue;
+
+ owner_node = READ_LE16(entry->owner_node_id);
+ owner_pid = READ_LE32(entry->owner_pid);
+ size = READ_LE64(entry->size);
+ offset = READ_LE64(entry->phys_offset);
+
+ /* Copy name safely */
+ memcpy(name, entry->name, MARUFS_NAME_MAX);
+ name[MARUFS_NAME_MAX] = '\0';
+
+ state_str = "ALLOCATED";
+
+ len += sysfs_emit_at(buf, len,
+ "%u\t%u\t%u\t%s\t%llu\t0x%llx\t%s\n", i,
+ owner_node, owner_pid, state_str, size,
+ offset, name);
+ }
+
+ mutex_unlock(&marufs_sysfs_lock);
+ return len;
+}
+static struct kobj_attribute region_info_attr = __ATTR_RO(region_info);
+
+/* /sys/fs/marufs/perm_info */
+static ssize_t perm_info_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct marufs_sb_info *sbi;
+ int len = 0;
+ u32 i;
+
+ mutex_lock(&marufs_sysfs_lock);
+ sbi = marufs_sysfs_get_sbi();
+ if (!sbi) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ }
+
+ if (!sbi->rat) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return sysfs_emit(buf, "No RAT\n");
+ }
+
+ len += sysfs_emit_at(buf, len, "RAT_Entry\tDefault\tDelegations\n");
+
+ MARUFS_CXL_RMB(sbi->rat, sizeof(*sbi->rat));
+ for (i = 0; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ u32 state, default_perms, num_deleg;
+ struct marufs_rat_entry *entry = marufs_rat_entry_get(sbi, i);
+ if (!entry)
+ continue;
+
+ state = READ_LE32(entry->state);
+ if (state != MARUFS_RAT_ENTRY_ALLOCATED)
+ continue;
+
+ /* Invalidate CL2 (default_perms, owner_node_id, deleg_num_entries, ...) */
+ MARUFS_CXL_RMB(&entry->default_perms, 64);
+ default_perms = READ_CXL_LE16(entry->default_perms);
+ num_deleg = READ_CXL_LE16(entry->deleg_num_entries);
+ len += sysfs_emit_at(buf, len, "%u\t0x%04x\t%u\n", i,
+ default_perms, num_deleg);
+ }
+
+ mutex_unlock(&marufs_sysfs_lock);
+ return len;
+}
+static struct kobj_attribute perm_info_attr = __ATTR_RO(perm_info);
+
+static struct attribute *marufs_attrs[] = {
+ &version_attr.attr,
+ ®ion_info_attr.attr,
+ &perm_info_attr.attr,
+ &deleg_info_attr.attr,
+ &gc_status_attr.attr,
+ &me_info_attr.attr,
+ &me_poll_stats_attr.attr,
+ &me_fine_stats_attr.attr,
+ &me_per_shard_acquire_attr.attr,
+ &me_poll_thread_cpu_attr.attr,
+ &nrht_chain_depth_attr.attr,
+ NULL,
+};
+
+static struct attribute_group marufs_attr_group = {
+ .attrs = marufs_attrs,
+};
+
+static struct attribute *marufs_debug_attrs[] = {
+ &me_freeze_heartbeat_attr.attr,
+ &me_sync_is_holder_attr.attr,
+ &gc_trigger_attr.attr,
+ &gc_stop_attr.attr,
+ &gc_pause_attr.attr,
+ &gc_restart_attr.attr,
+ &bootstrap_dump_attr.attr,
+ NULL,
+};
+
+static struct attribute_group marufs_debug_attr_group = {
+ .name = "debug",
+ .attrs = marufs_debug_attrs,
+};
+
+int marufs_sysfs_init(void)
+{
+ marufs_kobj = kobject_create_and_add(MARUFS_MODULE_NAME, fs_kobj);
+ if (!marufs_kobj)
+ return -ENOMEM;
+ int ret = sysfs_create_group(marufs_kobj, &marufs_attr_group);
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_group(marufs_kobj, &marufs_debug_attr_group);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+void marufs_sysfs_exit(void)
+{
+ if (marufs_kobj) {
+ sysfs_remove_group(marufs_kobj, &marufs_attr_group);
+ sysfs_remove_group(marufs_kobj, &marufs_debug_attr_group);
+ kobject_put(marufs_kobj);
+ marufs_kobj = NULL;
+ }
+}
+
+int marufs_sysfs_register(struct marufs_sb_info *sbi)
+{
+ int i;
+ mutex_lock(&marufs_sysfs_lock);
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (!marufs_sysfs_sbi_list[i]) {
+ marufs_sysfs_sbi_list[i] = sbi;
+ mutex_unlock(&marufs_sysfs_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ pr_warn("sysfs: max %d mounts reached, sysfs may not track this mount\n",
+ MARUFS_MAX_MOUNTS);
+ return 0; /* Non-fatal: mount still succeeds */
+}
+
+void marufs_sysfs_unregister(struct marufs_sb_info *sbi)
+{
+ int i;
+ mutex_lock(&marufs_sysfs_lock);
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (marufs_sysfs_sbi_list[i] == sbi) {
+ marufs_sysfs_sbi_list[i] = NULL;
+ mutex_unlock(&marufs_sysfs_lock);
+ return;
+ }
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+}
diff --git a/marufs_kernel/src/sysfs_debug.c b/marufs_kernel/src/sysfs_debug.c
new file mode 100644
index 0000000..444afd4
--- /dev/null
+++ b/marufs_kernel/src/sysfs_debug.c
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysfs_debug.c - MARUFS test-only sysfs attrs (fault injection for ME
+ * crash-detection tests and bootstrap chaos tests). Not for production use.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "me.h"
+#include "sysfs_debug.h"
+#include "sysfs_internal.h"
+
+/*
+ * me_freeze_heartbeat - fault injection: simulate a crashed node.
+ *
+ * Read: one line per registered mount — "node= frozen=<0|1>".
+ * Write format: " <0|1>".
+ * Sets the debug_freeze_poll flag on every ME instance owned by any
+ * sbi whose node_id matches . A non-zero value makes that
+ * node's poll thread skip the entire poll cycle (heartbeat, grant,
+ * doorbell), so from peers' point of view the node looks crashed —
+ * their wait_for_token deadline path will exercise the liveness
+ * probe and self-takeover. Zero re-enables normal operation.
+ *
+ * Scoping by node_id (not a module-global flag) lets local multi-mount
+ * setups freeze one node without killing all peers on the same machine.
+ */
+static ssize_t me_freeze_heartbeat_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ ssize_t n = 0;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi)
+ continue;
+
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+ int frozen = 0;
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ frozen = atomic_read(&me->debug_freeze_poll);
+ break; /* uniform across sbi's MEs */
+ }
+ mutex_unlock(&sbi->me_list_lock);
+
+ n += scnprintf(buf + n, PAGE_SIZE - n, "node=%u frozen=%d\n",
+ sbi->node_id, frozen);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ return n;
+}
+
+static ssize_t me_freeze_heartbeat_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int node_id, val;
+ if (sscanf(buf, "%u %u", &node_id, &val) != 2)
+ return -EINVAL;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi || sbi->node_id != node_id)
+ continue;
+
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+ list_for_each_entry(me, &sbi->me_list, list_node)
+ atomic_set(&me->debug_freeze_poll, !!val);
+ mutex_unlock(&sbi->me_list_lock);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ return count;
+}
+
+struct kobj_attribute me_freeze_heartbeat_attr =
+ __ATTR(me_freeze_heartbeat, 0600, me_freeze_heartbeat_show,
+ me_freeze_heartbeat_store);
+
+/*
+ * me_sync_is_holder - test-only: reset DRAM is_holder flags to match CB.
+ *
+ * Writing any value walks every registered mount's MEs and per-shard
+ * forces `sh->is_holder = (cb.holder == me->node_id)`. Used between
+ * fault-injection tests to purge stale flags left by takeover races
+ * (peer reclaimed CB while our DRAM flag never got updated because
+ * poll was frozen). Production relies on the poll thread keeping
+ * DRAM in sync via grant/receive paths, so this knob is test-only.
+ */
+static ssize_t me_sync_is_holder_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi)
+ continue;
+
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ for (u32 s = 0; s < me->num_shards; s++) {
+ struct marufs_me_shard *sh =
+ me_shard_get(me, s);
+ u32 h = me_cb_snapshot(&me->cbs[s], NULL);
+ if (h == me->node_id)
+ me_shard_become_holder(sh);
+ else
+ me_shard_lose_holder(sh);
+ }
+ }
+ mutex_unlock(&sbi->me_list_lock);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ return count;
+}
+
+struct kobj_attribute me_sync_is_holder_attr =
+ __ATTR(me_sync_is_holder, 0200, NULL, me_sync_is_holder_store);
+
+/*
+ * GC trigger - write any value to trigger manual GC on all local mounts.
+ * Each mount's GC sweeps its own node's dead entries. Debug-only:
+ * normal operation relies on the per-sbi GC kthread.
+ */
+static ssize_t gc_trigger_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ int i;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (marufs_sysfs_sbi_list[i])
+ marufs_gc_reclaim_dead_regions(
+ marufs_sysfs_sbi_list[i]);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ return count;
+}
+
+struct kobj_attribute gc_trigger_attr =
+ __ATTR(gc_trigger, 0200, NULL, gc_trigger_store);
+
+/*
+ * GC stop - write node_id to stop specific node's GC, "all" to stop all.
+ * echo 1 > gc_stop # stop node_id=1
+ * echo all > gc_stop # stop all
+ */
+static ssize_t gc_stop_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&marufs_sysfs_lock);
+ if (strncmp(buf, "all", 3) == 0) {
+ int i;
+
+ pr_info("emergency GC stop (all) via sysfs\n");
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (marufs_sysfs_sbi_list[i])
+ marufs_gc_stop(marufs_sysfs_sbi_list[i]);
+ }
+ } else {
+ u32 node_id;
+ struct marufs_sb_info *sbi;
+
+ if (kstrtou32(buf, 0, &node_id)) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return -EINVAL;
+ }
+
+ sbi = marufs_sysfs_find_by_node(node_id);
+ if (!sbi) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return -ENOENT;
+ }
+
+ pr_info("GC stop for node %u via sysfs\n", node_id);
+ marufs_gc_stop(sbi);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ return count;
+}
+
+struct kobj_attribute gc_stop_attr = __ATTR(gc_stop, 0200, NULL, gc_stop_store);
+
+/*
+ * GC pause - temporarily pause GC without killing thread.
+ * Read: shows pause state per node (e.g., "node1:0 node2:1")
+ * Write: "1" or "0" for all, "node_id:1" or "node_id:0" for specific node.
+ * echo 1 > gc_pause # pause all
+ * echo 0 > gc_pause # resume all
+ * echo 1:1 > gc_pause # pause node_id=1
+ * echo 2:0 > gc_pause # resume node_id=2
+ */
+static ssize_t gc_pause_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int len = 0;
+ int i;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (marufs_sysfs_sbi_list[i])
+ len += sysfs_emit_at(
+ buf, len, "node%u:%d ",
+ marufs_sysfs_sbi_list[i]->node_id,
+ atomic_read(
+ &marufs_sysfs_sbi_list[i]->gc_paused));
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ if (len == 0)
+ return sysfs_emit(buf, "0\n");
+
+ len += sysfs_emit_at(buf, len, "\n");
+ return len;
+}
+
+static ssize_t gc_pause_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ u32 node_id;
+ int val;
+ const char *colon;
+
+ colon = strchr(buf, ':');
+ if (colon) {
+ /* "node_id:0" or "node_id:1" format */
+ struct marufs_sb_info *sbi;
+
+ if (kstrtou32(buf, 0, &node_id))
+ return -EINVAL;
+ if (kstrtoint(colon + 1, 0, &val))
+ return -EINVAL;
+
+ mutex_lock(&marufs_sysfs_lock);
+ sbi = marufs_sysfs_find_by_node(node_id);
+ if (!sbi) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return -ENOENT;
+ }
+
+ atomic_set(&sbi->gc_paused, val ? 1 : 0);
+ mutex_unlock(&marufs_sysfs_lock);
+ pr_debug("GC %s for node %u via sysfs\n",
+ val ? "paused" : "resumed", node_id);
+ } else {
+ /* Plain "0" or "1" — apply to all */
+ bool pause;
+ int i;
+
+ if (kstrtobool(buf, &pause))
+ return -EINVAL;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (marufs_sysfs_sbi_list[i])
+ atomic_set(&marufs_sysfs_sbi_list[i]->gc_paused,
+ pause ? 1 : 0);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ pr_debug("GC %s (all) via sysfs\n",
+ pause ? "paused" : "resumed");
+ }
+
+ return count;
+}
+
+struct kobj_attribute gc_pause_attr =
+ __ATTR(gc_pause, 0600, gc_pause_show, gc_pause_store);
+
+/*
+ * GC restart - restart dead GC threads.
+ * echo 1 > gc_restart # restart node_id=1
+ * echo all > gc_restart # restart all dead threads
+ */
+static ssize_t gc_restart_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&marufs_sysfs_lock);
+ if (strncmp(buf, "all", 3) == 0) {
+ int i;
+
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ if (marufs_sysfs_sbi_list[i])
+ marufs_gc_restart(marufs_sysfs_sbi_list[i]);
+ }
+ } else {
+ u32 node_id;
+ struct marufs_sb_info *sbi;
+
+ if (kstrtou32(buf, 0, &node_id)) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return -EINVAL;
+ }
+
+ sbi = marufs_sysfs_find_by_node(node_id);
+ if (!sbi) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return -ENODEV;
+ }
+
+ marufs_gc_restart(sbi);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ return count;
+}
+
+struct kobj_attribute gc_restart_attr =
+ __ATTR(gc_restart, 0200, NULL, gc_restart_store);
+
+/*
+ * bootstrap_inject_stuck_formatter is now a module param
+ * (/sys/module//parameters/bootstrap_inject_stuck_formatter).
+ * It must be set BEFORE mounting the formatter node, so a per-sbi sysfs
+ * attribute (which only exists after sb construction) is too late.
+ * The old per-sbi implementation has been removed.
+ */
+
+/*
+ * bootstrap_dump - read-only: dump bootstrap slot table for all mounts.
+ *
+ * Read: one line per slot per mount.
+ */
+static ssize_t bootstrap_dump_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ ssize_t n = 0;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi)
+ continue;
+ n += scnprintf(buf + n, PAGE_SIZE - n,
+ "=== mount node=%u ===\n", sbi->node_id);
+ if (n < PAGE_SIZE - 1)
+ n += marufs_bootstrap_dump_slots(sbi, buf + n,
+ PAGE_SIZE - n);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ return n;
+}
+
+struct kobj_attribute bootstrap_dump_attr =
+ __ATTR(bootstrap_dump, 0400, bootstrap_dump_show, NULL);
diff --git a/marufs_kernel/src/sysfs_debug.h b/marufs_kernel/src/sysfs_debug.h
new file mode 100644
index 0000000..3b38f24
--- /dev/null
+++ b/marufs_kernel/src/sysfs_debug.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sysfs_debug.h - Debug / test-only sysfs attributes for MARUFS.
+ *
+ * Two categories:
+ * 1. Fault injection (me_freeze_heartbeat, me_sync_is_holder) for
+ * reproducible ME crash-detection tests.
+ * 2. Manual GC control (gc_trigger, gc_stop, gc_pause, gc_restart) —
+ * not used in steady-state operation; reserved for operators
+ * forcing GC sweeps or pausing reaping during diagnostics.
+ *
+ * Read-only GC monitoring (gc_status) and delegation inspection
+ * (deleg_info) live in sysfs_gc.c — those are production-safe.
+ */
+
+#ifndef _MARUFS_SYSFS_DEBUG_H
+#define _MARUFS_SYSFS_DEBUG_H
+
+#include
+
+/* Fault injection attrs. */
+extern struct kobj_attribute me_freeze_heartbeat_attr;
+extern struct kobj_attribute me_sync_is_holder_attr;
+
+/* Manual GC control attrs. */
+extern struct kobj_attribute gc_trigger_attr;
+extern struct kobj_attribute gc_stop_attr;
+extern struct kobj_attribute gc_pause_attr;
+extern struct kobj_attribute gc_restart_attr;
+
+/* Bootstrap dump attr (inject_stuck is now a module param, not sysfs). */
+extern struct kobj_attribute bootstrap_dump_attr;
+
+#endif /* _MARUFS_SYSFS_DEBUG_H */
diff --git a/marufs_kernel/src/sysfs_gc.c b/marufs_kernel/src/sysfs_gc.c
new file mode 100644
index 0000000..1121dfc
--- /dev/null
+++ b/marufs_kernel/src/sysfs_gc.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysfs_gc.c - GC monitoring sysfs attributes (deleg_info, gc_status).
+ *
+ * Manual GC control (trigger/stop/pause/restart) is in sysfs_debug.c.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "sysfs_internal.h"
+#include "sysfs_gc.h"
+
+/*
+ * /sys/fs/marufs/deleg_info - per-region delegation detail
+ * Write a region_id (RAT index), then read back all delegation entries.
+ */
+static u32 deleg_info_region_id;
+
+static ssize_t deleg_info_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int len = 0;
+
+ mutex_lock(&marufs_sysfs_lock);
+ struct marufs_sb_info *sbi = marufs_sysfs_get_sbi();
+ if (!sbi) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ }
+
+ u32 rid = deleg_info_region_id;
+ struct marufs_rat_entry *entry = marufs_rat_entry_get(sbi, rid);
+ if (!entry) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return sysfs_emit(buf, "Invalid region_id %u\n", rid);
+ }
+
+ u32 state = READ_LE32(entry->state);
+ if (state != MARUFS_RAT_ENTRY_ALLOCATED) {
+ mutex_unlock(&marufs_sysfs_lock);
+ return sysfs_emit(buf, "region %u not ALLOCATED (state=%u)\n",
+ rid, state);
+ }
+
+ char name[MARUFS_NAME_MAX + 1];
+ memcpy(name, entry->name, MARUFS_NAME_MAX);
+ name[MARUFS_NAME_MAX] = '\0';
+ len += sysfs_emit_at(buf, len, "region: %u name: %s\n", rid, name);
+
+ for (u32 i = 0; i < MARUFS_DELEG_MAX_ENTRIES; i++) {
+ struct marufs_deleg_entry *de = &entry->deleg_entries[i];
+ u32 ds = READ_CXL_LE32(de->state);
+
+ if (ds == MARUFS_DELEG_EMPTY)
+ continue;
+
+ MARUFS_CXL_RMB(de, sizeof(*de));
+ len += sysfs_emit_at(
+ buf, len,
+ " deleg[%u]: state=%u node=%u pid=%u perms=0x%x birth_time=%llu\n",
+ i, ds, READ_CXL_LE32(de->node_id),
+ READ_CXL_LE32(de->pid), READ_CXL_LE32(de->perms),
+ READ_CXL_LE64(de->birth_time));
+ }
+
+ mutex_unlock(&marufs_sysfs_lock);
+ return len;
+}
+
+static ssize_t deleg_info_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ u32 rid;
+ if (kstrtou32(buf, 0, &rid))
+ return -EINVAL;
+ if (rid >= MARUFS_MAX_RAT_ENTRIES)
+ return -EINVAL;
+
+ deleg_info_region_id = rid;
+ return count;
+}
+
+struct kobj_attribute deleg_info_attr =
+ __ATTR(deleg_info, 0644, deleg_info_show, deleg_info_store);
+
+/*
+ * GC status - show per-node GC thread liveness and epoch counter.
+ * cat gc_status => "node1:alive epoch=42 node2:dead epoch=0"
+ */
+static ssize_t gc_status_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int len = 0;
+ int i;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi)
+ continue;
+
+ {
+ bool has_thread = sbi->gc_thread != NULL;
+ int epoch = atomic_read(&sbi->gc_epoch);
+
+ len += sysfs_emit_at(buf, len, "node%u:%s epoch=%d ",
+ sbi->node_id,
+ has_thread ? "running" : "stopped",
+ epoch);
+ }
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ if (len == 0)
+ return sysfs_emit(buf, "no mounts\n");
+
+ len += sysfs_emit_at(buf, len, "\n");
+ return len;
+}
+
+struct kobj_attribute gc_status_attr =
+ __ATTR(gc_status, 0444, gc_status_show, NULL);
diff --git a/marufs_kernel/src/sysfs_gc.h b/marufs_kernel/src/sysfs_gc.h
new file mode 100644
index 0000000..573a8d1
--- /dev/null
+++ b/marufs_kernel/src/sysfs_gc.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sysfs_gc.h - GC monitoring sysfs attributes.
+ *
+ * Read-only / config-only knobs that surface GC state for operators:
+ * deleg_info - per-region delegation table dump
+ * gc_status - per-node GC thread liveness + epoch counter
+ *
+ * Manual GC control (trigger/stop/pause/restart) lives in sysfs_debug.c.
+ */
+
+#ifndef _MARUFS_SYSFS_GC_H
+#define _MARUFS_SYSFS_GC_H
+
+#include
+
+extern struct kobj_attribute deleg_info_attr;
+extern struct kobj_attribute gc_status_attr;
+
+#endif /* _MARUFS_SYSFS_GC_H */
diff --git a/marufs_kernel/src/sysfs_internal.h b/marufs_kernel/src/sysfs_internal.h
new file mode 100644
index 0000000..a999c4d
--- /dev/null
+++ b/marufs_kernel/src/sysfs_internal.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sysfs_internal.h - shared state and helpers across MARUFS sysfs sources.
+ *
+ * sysfs.c owns the registered-sbi list and the lock that protects it.
+ * Domain-split files (sysfs_me.c, sysfs_gc.c, sysfs_nrht.c, sysfs_debug.c)
+ * include this header to access them without a forest of duplicated
+ * extern declarations.
+ */
+
+#ifndef _MARUFS_SYSFS_INTERNAL_H
+#define _MARUFS_SYSFS_INTERNAL_H
+
+#include
+
+#include "marufs.h"
+
+#define MARUFS_MAX_MOUNTS MARUFS_MAX_NODE_ID
+
+extern struct marufs_sb_info *marufs_sysfs_sbi_list[MARUFS_MAX_MOUNTS];
+extern struct mutex marufs_sysfs_lock;
+
+/* Return first registered sbi (any is fine for shared-CXL reads). */
+struct marufs_sb_info *marufs_sysfs_get_sbi(void);
+
+/* Find sbi by node_id; NULL if not registered. */
+struct marufs_sb_info *marufs_sysfs_find_by_node(u32 node_id);
+
+#endif /* _MARUFS_SYSFS_INTERNAL_H */
diff --git a/marufs_kernel/src/sysfs_me.c b/marufs_kernel/src/sysfs_me.c
new file mode 100644
index 0000000..8482b40
--- /dev/null
+++ b/marufs_kernel/src/sysfs_me.c
@@ -0,0 +1,617 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysfs_me.c - ME sysfs attributes (state inspection + per-CPU stats).
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "me.h"
+#include "me_stats.h"
+#include "sysfs_internal.h"
+#include "sysfs_me.h"
+
+/*
+ * /sys/fs/marufs/me_info - dump ME (Mutual Exclusion) state for debugging.
+ *
+ * Write a tag to select scope, then read:
+ * echo global > me_info # only Global ME
+ * echo > me_info # only NRHT ME for region
+ * echo all > me_info # all registered ME instances (default)
+ *
+ * Output per ME instance:
+ * [] strategy= shards= max_nodes= local_node=
+ * members:
+ * shard : holder= state= hb=
+ * gen= acq= holding= waiters=
+ * cached_succ= last_hb_obs=
+ */
+#define MARUFS_ME_INFO_ALL ((u32) - 1)
+#define MARUFS_ME_INFO_GLOB ((u32) - 2)
+static u32 me_info_filter = MARUFS_ME_INFO_ALL;
+
+static const char *me_state_name(u32 s)
+{
+ switch (s) {
+ case 0:
+ return "FREE";
+ case 1:
+ return "HELD";
+ case 2:
+ return "RELEASING";
+ default:
+ return "?";
+ }
+}
+
+static const char *me_format_tag(struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me, char *buf,
+ size_t buflen)
+{
+ if (me == sbi->me) {
+ snprintf(buf, buflen, "global");
+ return buf;
+ }
+ for (u32 i = 0; i < MARUFS_MAX_RAT_ENTRIES; i++) {
+ if (sbi->nrht_me[i] == me) {
+ snprintf(buf, buflen, "nrht[%u]", i);
+ return buf;
+ }
+ }
+ snprintf(buf, buflen, "unknown");
+ return buf;
+}
+
+static int me_emit_instance(char *buf, int len, struct marufs_sb_info *sbi,
+ struct marufs_me_instance *me)
+{
+ char tag[24];
+ me_format_tag(sbi, me, tag, sizeof(tag));
+
+ MARUFS_CXL_RMB(me->header, sizeof(*me->header));
+ len += sysfs_emit_at(
+ buf, len,
+ "[%s] strategy=%s shards=%u max_nodes=%u local_node=%u active=%d\n",
+ tag, me->strategy == MARUFS_ME_REQUEST ? "request" : "order",
+ me->num_shards, me->max_nodes, me->node_id,
+ atomic_read(&me->active));
+
+ /* Membership: list ACTIVE nodes */
+ len += sysfs_emit_at(buf, len, " members:");
+ int first = 1;
+ /* slot[i] is for external node_id (i + 1) */
+ for (u32 n = 0; n < me->max_nodes; n++) {
+ struct marufs_me_membership_slot *ms = me_membership_get(me, n);
+ u32 status = READ_CXL_LE32(ms->status);
+ if (status == MARUFS_ME_ACTIVE) {
+ len += sysfs_emit_at(buf, len, "%s%u",
+ first ? " " : ",", n + 1);
+ first = 0;
+ }
+ }
+ if (first)
+ len += sysfs_emit_at(buf, len, " (none)");
+ len += sysfs_emit_at(buf, len, "\n");
+
+ for (u32 s = 0; s < me->num_shards; s++) {
+ struct marufs_me_cb *cb = me_cb_get(me, s);
+
+ u32 holder = READ_CXL_LE32(cb->holder);
+ u32 state = READ_CXL_LE32(cb->state);
+ u64 gen = READ_CXL_LE64(cb->generation);
+ u64 acq = READ_CXL_LE64(cb->acquire_count);
+
+ /* Heartbeat now lives per-node on the holder's membership slot */
+ u64 hb = 0;
+ if (marufs_me_is_valid_node(me, holder)) {
+ struct marufs_me_membership_slot *hms =
+ me_membership_get(me, holder - 1);
+ hb = READ_CXL_LE64(hms->heartbeat);
+ }
+
+ char hbuf[16];
+ if (holder == (u32)0xFF || holder == (u32)-1)
+ snprintf(hbuf, sizeof(hbuf), "NONE");
+ else
+ snprintf(hbuf, sizeof(hbuf), "%u", holder);
+
+ struct marufs_me_shard *sh = me->shards ? &me->shards[s] : NULL;
+
+ len += sysfs_emit_at(
+ buf, len,
+ " shard %u: holder=%s state=%s hb=%llu gen=%llu acq=%llu holding=%d waiters=%d cached_succ=%u is_holder=%d\n",
+ s, hbuf, me_state_name(state), hb, gen, acq,
+ sh ? atomic_read(&sh->holding) : 0,
+ sh ? atomic_read(&sh->local_waiters) : 0,
+ sh ? sh->cached_successor : 0, sh ? sh->is_holder : 0);
+
+ /* Own doorbell slot — surfaces token-pass state for debugging
+ * (who rang, which seq, what gen). last_* are DRAM baselines
+ * used by wait_for_token's phantom filter.
+ */
+ if (me->slots) {
+ struct marufs_me_slot *ms = me_my_slot(me, s);
+ u32 from = READ_CXL_LE32(ms->from_node);
+ u64 tseq = READ_CXL_LE64(ms->token_seq);
+ u64 cgaw = READ_CXL_LE64(ms->cb_gen_at_write);
+ u32 req = READ_CXL_LE32(ms->requesting);
+ u32 rseq = READ_CXL_LE32(ms->sequence);
+ u64 rat = READ_CXL_LE64(ms->requested_at);
+ u64 gat = READ_CXL_LE64(ms->granted_at);
+ u64 last_seq = sh ? sh->last_token_seq : 0;
+ u64 last_gen = sh ? sh->last_cb_gen : 0;
+
+ len += sysfs_emit_at(
+ buf, len,
+ " my_slot: from=%u seq=%llu cb_gen_at_write=%llu last_seq=%llu last_gen=%llu req=%u rseq=%u req_at=%llu grant_at=%llu\n",
+ from, tseq, cgaw, last_seq, last_gen, req, rseq,
+ rat, gat);
+ }
+
+ /* Guard against PAGE_SIZE overflow */
+ if (len > PAGE_SIZE - 512) {
+ len += sysfs_emit_at(buf, len, " ... (truncated)\n");
+ return len;
+ }
+ }
+ return len;
+}
+
+static ssize_t me_info_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ mutex_lock(&marufs_sysfs_lock);
+
+ u32 filter = me_info_filter;
+ int len = sysfs_emit_at(buf, 0, "filter=");
+ if (filter == MARUFS_ME_INFO_ALL)
+ len += sysfs_emit_at(buf, len, "all\n");
+ else if (filter == MARUFS_ME_INFO_GLOB)
+ len += sysfs_emit_at(buf, len, "global\n");
+ else
+ len += sysfs_emit_at(buf, len, "nrht[%u]\n", filter);
+
+ /* Iterate ALL registered sbis — NRHT ME instances live only in the
+ * sbi whose node_id first triggered nrht_me_get. Without this loop,
+ * reading from a mount that hasn't done NRHT ops hides those MEs.
+ */
+ int any_sbi = 0;
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi)
+ continue;
+ any_sbi = 1;
+
+ len += sysfs_emit_at(buf, len, "== node %u ==\n", sbi->node_id);
+
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ if (filter == MARUFS_ME_INFO_GLOB && me != sbi->me)
+ continue;
+ if (filter != MARUFS_ME_INFO_ALL &&
+ filter != MARUFS_ME_INFO_GLOB) {
+ if (filter >= MARUFS_MAX_RAT_ENTRIES ||
+ sbi->nrht_me[filter] != me)
+ continue;
+ }
+ len = me_emit_instance(buf, len, sbi, me);
+ if (len > PAGE_SIZE - 256)
+ break;
+ }
+ mutex_unlock(&sbi->me_list_lock);
+ if (len > PAGE_SIZE - 256)
+ break;
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ if (!any_sbi)
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ if (len == 0)
+ return sysfs_emit(buf, "no ME instances\n");
+ return len;
+}
+
+static ssize_t me_info_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 rid;
+
+ if (sysfs_streq(buf, "all"))
+ me_info_filter = MARUFS_ME_INFO_ALL;
+ else if (sysfs_streq(buf, "global"))
+ me_info_filter = MARUFS_ME_INFO_GLOB;
+ else if (!kstrtou32(buf, 0, &rid) && rid < MARUFS_MAX_RAT_ENTRIES)
+ me_info_filter = rid;
+ else
+ return -EINVAL;
+ return count;
+}
+
+struct kobj_attribute me_info_attr =
+ __ATTR(me_info, 0644, me_info_show, me_info_store);
+
+/*
+ * /sys/fs/marufs/me_poll_stats - per-ME poll-thread cost counters.
+ *
+ * Shows cumulative RMB counts (CB / slot / membership), poll-cycle
+ * invocations, and wall-clock ns spent inside ops->poll_cycle(). Useful
+ * for quantifying polling overhead and validating optimizations that
+ * reduce CXL traffic (e.g. pending-mask scan skip).
+ *
+ * Write any value → reset all counters across every registered ME.
+ */
+static ssize_t me_poll_stats_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int len = 0;
+ int any_sbi = 0;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi)
+ continue;
+
+ any_sbi = 1;
+ len += sysfs_emit_at(buf, len, "== node %u ==\n", sbi->node_id);
+
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ u64 cycles = atomic64_read(&me->poll_cycles);
+ u64 ns = atomic64_read(&me->poll_ns_total);
+ u64 rmb_cb = atomic64_read(&me->poll_rmb_cb);
+ u64 rmb_slot = atomic64_read(&me->poll_rmb_slot);
+ u64 rmb_mem = atomic64_read(&me->poll_rmb_membership);
+ u64 avg_ns = cycles ? ns / cycles : 0;
+ const char *tag = (me == sbi->me) ? "global" : "nrht";
+
+ len += sysfs_emit_at(
+ buf, len,
+ " %s shards=%u strategy=%s cycles=%llu ns_total=%llu ns_avg=%llu rmb_cb=%llu rmb_slot=%llu rmb_membership=%llu\n",
+ tag, me->num_shards,
+ me->strategy == MARUFS_ME_ORDER ? "order" :
+ "request",
+ cycles, ns, avg_ns, rmb_cb, rmb_slot, rmb_mem);
+ if (len > PAGE_SIZE - 256)
+ break;
+ }
+ mutex_unlock(&sbi->me_list_lock);
+ if (len > PAGE_SIZE - 256)
+ break;
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ if (!any_sbi)
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ if (len == 0)
+ return sysfs_emit(buf, "no ME instances\n");
+ return len;
+}
+
+/*
+ * me_poll_stats_store - reset handler. Writing ANY value (content ignored)
+ * zeros every poll counter across every registered ME on every mount.
+ * Intended for benchmark harnesses that delta-measure a timed window —
+ * reset immediately before the run, read immediately after.
+ */
+static ssize_t me_poll_stats_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+
+ if (!sbi)
+ continue;
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ atomic64_set(&me->poll_cycles, 0);
+ atomic64_set(&me->poll_ns_total, 0);
+ atomic64_set(&me->poll_rmb_cb, 0);
+ atomic64_set(&me->poll_rmb_slot, 0);
+ atomic64_set(&me->poll_rmb_membership, 0);
+ }
+ mutex_unlock(&sbi->me_list_lock);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ return count;
+}
+
+struct kobj_attribute me_poll_stats_attr =
+ __ATTR(me_poll_stats, 0644, me_poll_stats_show, me_poll_stats_store);
+
+/*
+ * Aggregate a per-CPU struct marufs_me_stats_pcpu into a zero-initialised
+ * output. Plain field-by-field sum across for_each_possible_cpu; cost
+ * scales with CPU count but runs only on sysfs read.
+ */
+static void me_aggregate_stats(struct marufs_me_instance *me,
+ struct marufs_me_stats_pcpu *out)
+{
+ int cpu;
+
+ memset(out, 0, sizeof(*out));
+ if (!me->stats)
+ return;
+ for_each_possible_cpu(cpu) {
+ struct marufs_me_stats_pcpu *p = per_cpu_ptr(me->stats, cpu);
+
+ out->wait_count += p->wait_count;
+ out->wait_wall_ns += p->wait_wall_ns;
+ out->wait_cpu_ns += p->wait_cpu_ns;
+ out->wait_spin_hit += p->wait_spin_hit;
+ out->wait_sleep_hit += p->wait_sleep_hit;
+ out->wait_deadline_hit += p->wait_deadline_hit;
+ out->wait_fast_hit += p->wait_fast_hit;
+ out->poll_ns_membership += p->poll_ns_membership;
+ out->poll_ns_doorbell += p->poll_ns_doorbell;
+ out->poll_ns_scan += p->poll_ns_scan;
+ out->lock_hold_count += p->lock_hold_count;
+ out->lock_hold_ns_total += p->lock_hold_ns_total;
+ out->grant_age_count += p->grant_age_count;
+ for (int b = 0; b < MARUFS_ME_LAT_BUCKETS; b++) {
+ out->wait_lat_buckets[b] += p->wait_lat_buckets[b];
+ out->lock_hold_buckets[b] += p->lock_hold_buckets[b];
+ out->grant_age_buckets[b] += p->grant_age_buckets[b];
+ }
+ for (int s = 0; s < MARUFS_NRHT_MAX_NUM_SHARDS; s++)
+ out->per_shard_acquire[s] += p->per_shard_acquire[s];
+ }
+}
+
+/*
+ * me_emit_buckets - shared helper for log2(ns) bucket rows.
+ * Prints " =a/b/c/..." where each slot is one bucket count.
+ */
+static int me_emit_buckets(char *buf, int len, const char *name,
+ const u64 *buckets, int nbuckets)
+{
+ len += sysfs_emit_at(buf, len, " %s=[", name);
+ for (int i = 0; i < nbuckets; i++)
+ len += sysfs_emit_at(buf, len, "%s%llu", i ? "," : "",
+ buckets[i]);
+ len += sysfs_emit_at(buf, len, "]\n");
+ return len;
+}
+
+/*
+ * /sys/fs/marufs/me_fine_stats - per-ME fine-grained counters.
+ *
+ * Aggregates per-CPU counters across all CPUs and emits a human-readable
+ * dump per registered ME instance. Covers:
+ * - wait_for_token: count, wall+cpu ns, hit phase split, lat histogram
+ * - poll_cycle phase breakdown (membership/doorbell/scan ns)
+ * - lock hold time (sum + histogram)
+ * - grant age histogram (request-mode)
+ *
+ * Per-shard acquire distribution and NRHT chain depth are exposed via
+ * dedicated nodes (me_per_shard_acquire, nrht_chain_depth).
+ *
+ * Write any value → zero all per-CPU counters across every ME.
+ */
+static ssize_t me_fine_stats_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int len = 0;
+ int any_sbi = 0;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi)
+ continue;
+
+ any_sbi = 1;
+ len += sysfs_emit_at(buf, len, "== node %u ==\n", sbi->node_id);
+
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ struct marufs_me_stats_pcpu agg;
+ me_aggregate_stats(me, &agg);
+
+ const char *tag = (me == sbi->me) ? "global" : "nrht";
+
+ /* Raw totals — consumers (bench harness) diff between
+ * reset + read and compute avg/util themselves. One
+ * key=value per line keeps parsing trivial.
+ */
+ len += sysfs_emit_at(buf, len,
+ " %s shards=%u strategy=%s\n",
+ tag, me->num_shards,
+ me->strategy == MARUFS_ME_ORDER ?
+ "order" :
+ "request");
+ len += sysfs_emit_at(
+ buf, len,
+ " wait count=%llu wall_ns=%llu cpu_ns=%llu\n",
+ agg.wait_count, agg.wait_wall_ns,
+ agg.wait_cpu_ns);
+ len += sysfs_emit_at(
+ buf, len,
+ " wait_hit spin=%llu sleep=%llu deadline=%llu fast=%llu\n",
+ agg.wait_spin_hit, agg.wait_sleep_hit,
+ agg.wait_deadline_hit, agg.wait_fast_hit);
+ len = me_emit_buckets(buf, len, "wait_lat_buckets",
+ agg.wait_lat_buckets,
+ MARUFS_ME_LAT_BUCKETS);
+ len += sysfs_emit_at(
+ buf, len,
+ " poll_ns mem=%llu doorbell=%llu scan=%llu\n",
+ agg.poll_ns_membership, agg.poll_ns_doorbell,
+ agg.poll_ns_scan);
+ len += sysfs_emit_at(
+ buf, len,
+ " lock_hold count=%llu ns_total=%llu\n",
+ agg.lock_hold_count, agg.lock_hold_ns_total);
+ len = me_emit_buckets(buf, len, "lock_hold_buckets",
+ agg.lock_hold_buckets,
+ MARUFS_ME_LAT_BUCKETS);
+ len += sysfs_emit_at(buf, len,
+ " grant_age count=%llu\n",
+ agg.grant_age_count);
+ len = me_emit_buckets(buf, len, "grant_age_buckets",
+ agg.grant_age_buckets,
+ MARUFS_ME_LAT_BUCKETS);
+
+ if (len > PAGE_SIZE - 512)
+ break;
+ }
+ mutex_unlock(&sbi->me_list_lock);
+ if (len > PAGE_SIZE - 512)
+ break;
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ if (!any_sbi)
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ if (len == 0)
+ return sysfs_emit(buf, "no ME instances\n");
+ return len;
+}
+
+static ssize_t me_fine_stats_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ int cpu;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+
+ if (!sbi)
+ continue;
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ if (!me->stats)
+ continue;
+ for_each_possible_cpu(cpu) {
+ memset(per_cpu_ptr(me->stats, cpu), 0,
+ sizeof(struct marufs_me_stats_pcpu));
+ }
+ }
+ mutex_unlock(&sbi->me_list_lock);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ return count;
+}
+
+struct kobj_attribute me_fine_stats_attr =
+ __ATTR(me_fine_stats, 0644, me_fine_stats_show, me_fine_stats_store);
+
+/*
+ * /sys/fs/marufs/me_per_shard_acquire - per-shard acquire hotspot.
+ *
+ * Exposed as "shard=count" lines, one per shard. Long output when
+ * num_shards is high; skip emitting zero-count shards to keep it tight.
+ *
+ * Reset handled by writing to me_fine_stats (shared per-CPU struct).
+ */
+static ssize_t me_per_shard_acquire_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int len = 0;
+ int any_sbi = 0;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi)
+ continue;
+
+ any_sbi = 1;
+ len += sysfs_emit_at(buf, len, "== node %u ==\n", sbi->node_id);
+
+ mutex_lock(&sbi->me_list_lock);
+ struct marufs_me_instance *me;
+
+ list_for_each_entry(me, &sbi->me_list, list_node) {
+ struct marufs_me_stats_pcpu agg;
+ me_aggregate_stats(me, &agg);
+
+ const char *tag = (me == sbi->me) ? "global" : "nrht";
+ len += sysfs_emit_at(buf, len, " %s shards=%u\n", tag,
+ me->num_shards);
+ u32 cap = min_t(u32, me->num_shards,
+ MARUFS_NRHT_MAX_NUM_SHARDS);
+ for (u32 s = 0; s < cap; s++) {
+ if (!agg.per_shard_acquire[s])
+ continue;
+ len += sysfs_emit_at(
+ buf, len, " shard=%u count=%llu\n",
+ s, agg.per_shard_acquire[s]);
+ if (len > PAGE_SIZE - 128)
+ break;
+ }
+ }
+ mutex_unlock(&sbi->me_list_lock);
+ if (len > PAGE_SIZE - 128)
+ break;
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ if (!any_sbi)
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ return len;
+}
+
+struct kobj_attribute me_per_shard_acquire_attr =
+ __ATTR(me_per_shard_acquire, 0444, me_per_shard_acquire_show, NULL);
+
+/*
+ * /sys/fs/marufs/me_poll_thread_cpu - cumulative on-CPU time of the
+ * per-sbi ME poll kthread.
+ *
+ * Emits one line per sbi:
+ * node= cpu_ns= wall_ns=
+ *
+ * Consumer (bench harness) reads before and after a timed window;
+ * delta(cpu_ns) / delta(wall_ns) = per-CPU utilization of the poll
+ * thread (not wait-relative like me_fine_stats::cpu_ns). The cumulative
+ * `sum_exec_runtime` counter can't be reset, so diff-based sampling is
+ * mandatory. Read-only.
+ */
+static ssize_t me_poll_thread_cpu_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int len = 0;
+ int any_sbi = 0;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi || !sbi->me_poll_thread)
+ continue;
+
+ any_sbi = 1;
+ u64 cpu_ns = sbi->me_poll_thread->se.sum_exec_runtime;
+ u64 wall_ns = ktime_get_ns();
+
+ len += sysfs_emit_at(buf, len,
+ "node=%u cpu_ns=%llu wall_ns=%llu\n",
+ sbi->node_id, cpu_ns, wall_ns);
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ if (!any_sbi)
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ return len;
+}
+
+struct kobj_attribute me_poll_thread_cpu_attr =
+ __ATTR(me_poll_thread_cpu, 0444, me_poll_thread_cpu_show, NULL);
diff --git a/marufs_kernel/src/sysfs_me.h b/marufs_kernel/src/sysfs_me.h
new file mode 100644
index 0000000..5542eb3
--- /dev/null
+++ b/marufs_kernel/src/sysfs_me.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sysfs_me.h - ME (Mutual Exclusion) sysfs attributes.
+ *
+ * Surfaces ME state inspection (me_info) and per-CPU stats counters
+ * (me_poll_stats, me_fine_stats, me_per_shard_acquire, me_poll_thread_cpu).
+ * Production-safe — read-mostly, write paths are reset-only.
+ */
+
+#ifndef _MARUFS_SYSFS_ME_H
+#define _MARUFS_SYSFS_ME_H
+
+#include
+
+extern struct kobj_attribute me_info_attr;
+extern struct kobj_attribute me_poll_stats_attr;
+extern struct kobj_attribute me_fine_stats_attr;
+extern struct kobj_attribute me_per_shard_acquire_attr;
+extern struct kobj_attribute me_poll_thread_cpu_attr;
+
+#endif /* _MARUFS_SYSFS_ME_H */
diff --git a/marufs_kernel/src/sysfs_nrht.c b/marufs_kernel/src/sysfs_nrht.c
new file mode 100644
index 0000000..761ae2d
--- /dev/null
+++ b/marufs_kernel/src/sysfs_nrht.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysfs_nrht.c - NRHT (Non-Resident Hash Table) sysfs attributes.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "marufs.h"
+#include "nrht_stats.h"
+#include "sysfs_internal.h"
+#include "sysfs_nrht.h"
+
+/*
+ * /sys/fs/marufs/nrht_chain_depth - NRHT bucket-chain walk histogram.
+ *
+ * Write any value → reset across every sbi.
+ */
+static ssize_t nrht_chain_depth_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int len = 0;
+ int any_sbi = 0;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+ if (!sbi || !sbi->nrht_stats)
+ continue;
+
+ any_sbi = 1;
+ struct marufs_nrht_stats_pcpu agg = { 0 };
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct marufs_nrht_stats_pcpu *p =
+ per_cpu_ptr(sbi->nrht_stats, cpu);
+
+ agg.find_chain_count += p->find_chain_count;
+ agg.find_chain_steps_total += p->find_chain_steps_total;
+ for (int b = 0; b < MARUFS_NRHT_DEPTH_BUCKETS; b++)
+ agg.chain_depth_buckets[b] +=
+ p->chain_depth_buckets[b];
+ }
+
+ u64 avg_steps = agg.find_chain_count ?
+ agg.find_chain_steps_total /
+ agg.find_chain_count :
+ 0;
+ len += sysfs_emit_at(
+ buf, len,
+ "node=%u find_chain count=%llu steps_total=%llu avg=%llu\n",
+ sbi->node_id, agg.find_chain_count,
+ agg.find_chain_steps_total, avg_steps);
+ len += sysfs_emit_at(buf, len, " depth_log2=[");
+ for (int b = 0; b < MARUFS_NRHT_DEPTH_BUCKETS; b++)
+ len += sysfs_emit_at(buf, len, "%s%llu", b ? "," : "",
+ agg.chain_depth_buckets[b]);
+ len += sysfs_emit_at(buf, len, "]\n");
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+
+ if (!any_sbi)
+ return sysfs_emit(buf, "No filesystem mounted\n");
+ return len;
+}
+
+static ssize_t nrht_chain_depth_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int cpu;
+
+ mutex_lock(&marufs_sysfs_lock);
+ for (int i = 0; i < MARUFS_MAX_MOUNTS; i++) {
+ struct marufs_sb_info *sbi = marufs_sysfs_sbi_list[i];
+
+ if (!sbi || !sbi->nrht_stats)
+ continue;
+ for_each_possible_cpu(cpu) {
+ memset(per_cpu_ptr(sbi->nrht_stats, cpu), 0,
+ sizeof(struct marufs_nrht_stats_pcpu));
+ }
+ }
+ mutex_unlock(&marufs_sysfs_lock);
+ return count;
+}
+
+struct kobj_attribute nrht_chain_depth_attr = __ATTR(
+ nrht_chain_depth, 0644, nrht_chain_depth_show, nrht_chain_depth_store);
diff --git a/marufs_kernel/src/sysfs_nrht.h b/marufs_kernel/src/sysfs_nrht.h
new file mode 100644
index 0000000..7353df6
--- /dev/null
+++ b/marufs_kernel/src/sysfs_nrht.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sysfs_nrht.h - NRHT (Non-Resident Hash Table) sysfs attributes.
+ */
+
+#ifndef _MARUFS_SYSFS_NRHT_H
+#define _MARUFS_SYSFS_NRHT_H
+
+#include
+
+extern struct kobj_attribute nrht_chain_depth_attr;
+
+#endif /* _MARUFS_SYSFS_NRHT_H */
diff --git a/marufs_kernel/tests/bench_name_ref.c b/marufs_kernel/tests/bench_name_ref.c
new file mode 100644
index 0000000..f0bea2e
--- /dev/null
+++ b/marufs_kernel/tests/bench_name_ref.c
@@ -0,0 +1,719 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * bench_name_ref.c - Microbenchmark for MARUFS name-ref ioctl operations
+ *
+ * Measures per-operation latency (ns) with statistics:
+ * mean, median (p50), p99, p999, min, max
+ *
+ * Modes:
+ * (default) Sequential access — best-case, cache-warm
+ * --shuffle Random access order — defeats spatial locality / prefetch
+ * --prefill N Pre-populate N entries before benchmark — longer chains,
+ * larger working set (exceeds L3 if N > ~200K)
+ *
+ * Usage:
+ * bench_name_ref [options]
+ *
+ * Options:
+ * -n Number of iterations (default: 1000)
+ * -s Region size in MB (default: 64)
+ * --shuffle Randomize access order
+ * --prefill Pre-populate N background entries
+ *
+ * Examples:
+ * bench_name_ref /mnt/marufs
+ * bench_name_ref /mnt/marufs -n 5000 --shuffle
+ * bench_name_ref /mnt/marufs -n 1000 --shuffle --prefill 50000
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "../include/marufs_uapi.h"
+
+#define BATCH_SIZE 32
+#define SLOT_SIZE 64
+
+/* ── Timing ──────────────────────────────────────────────────────── */
+
+static inline long long now_ns(void)
+{
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+/* ── Statistics ──────────────────────────────────────────────────── */
+
+static int cmp_ll(const void *a, const void *b)
+{
+ long long va = *(const long long *)a;
+ long long vb = *(const long long *)b;
+ return (va > vb) - (va < vb);
+}
+
+struct stats {
+ long long mean, median, p99, p999, min, max;
+};
+
+static struct stats compute_stats(long long *samples, int n)
+{
+ struct stats s = {0};
+ long long sum = 0;
+
+ if (n <= 0)
+ return s;
+
+ qsort(samples, (size_t)n, sizeof(long long), cmp_ll);
+
+ for (int i = 0; i < n; i++)
+ sum += samples[i];
+
+ s.mean = sum / n;
+ s.median = samples[n / 2];
+ s.p99 = samples[(int)((double)(n - 1) * 0.99)];
+ s.p999 = samples[(int)((double)(n - 1) * 0.999)];
+ s.min = samples[0];
+ s.max = samples[n - 1];
+ return s;
+}
+
+static void print_header(void)
+{
+ printf(" %-40s %5s %7s %7s %7s %7s %7s %7s\n",
+ "operation", "n", "mean", "p50", "p99", "p999", "min", "max");
+ printf(" %-40s %5s %7s %7s %7s %7s %7s %7s\n",
+ "", "", "(ns)", "(ns)", "(ns)", "(ns)", "(ns)", "(ns)");
+}
+
+static void print_stats(const char *label, struct stats *s, int n)
+{
+ printf(" %-40s %5d %7lld %7lld %7lld %7lld %7lld %7lld\n",
+ label, n, s->mean, s->median, s->p99, s->p999, s->min, s->max);
+}
+
+/* ── Shuffle (Fisher-Yates) ──────────────────────────────────────── */
+
+static void shuffle_int(int *arr, int n)
+{
+ for (int i = n - 1; i > 0; i--) {
+ int j = rand() % (i + 1);
+ int tmp = arr[i];
+ arr[i] = arr[j];
+ arr[j] = tmp;
+ }
+}
+
+static int *make_order(int n, int do_shuffle)
+{
+ int *order = malloc((size_t)n * sizeof(int));
+ if (!order) return NULL;
+ for (int i = 0; i < n; i++)
+ order[i] = i;
+ if (do_shuffle)
+ shuffle_int(order, n);
+ return order;
+}
+
+/* Helper: create + init NRHT file, returns fd or -1 */
+static int create_nrht(const char *mount_path, int pid, __u32 me_strategy)
+{
+ char path[512];
+ struct marufs_nrht_init_req ninit;
+ int nfd, ret;
+
+ snprintf(path, sizeof(path), "%s/bench_nrht_%d", mount_path, pid);
+ unlink(path);
+ nfd = open(path, O_CREAT | O_RDWR | O_CLOEXEC, 0644);
+ if (nfd < 0) return -1;
+
+ memset(&ninit, 0, sizeof(ninit));
+ ninit.max_entries = 0; /* default 8192 */
+ ninit.me_strategy = me_strategy;
+ ret = ioctl(nfd, MARUFS_IOC_NRHT_INIT, &ninit);
+ if (ret != 0) { close(nfd); return -1; }
+ return nfd;
+}
+
+static const char *strategy_name(__u32 s)
+{
+ return s == MARUFS_ME_REQUEST ? "request" : "order";
+}
+
+/* ── Prefill: populate background entries for cache pressure ─────── */
+
+static void prefill(int nrht_fd, int region_fd, int pid, int count)
+{
+ struct marufs_name_offset_req bent[BATCH_SIZE];
+ struct marufs_batch_name_offset_req breq;
+ int done = 0;
+
+ printf(" prefilling %d background entries...", count);
+ fflush(stdout);
+
+ while (done < count) {
+ int batch = BATCH_SIZE;
+ if (done + batch > count)
+ batch = count - done;
+
+ for (int j = 0; j < batch; j++) {
+ memset(&bent[j], 0, sizeof(bent[j]));
+ snprintf(bent[j].name, sizeof(bent[j].name),
+ "pf_%d_%d", pid, done + j);
+ bent[j].offset = (__u64)(done + j) * SLOT_SIZE;
+ bent[j].target_region_fd = region_fd;
+ }
+
+ memset(&breq, 0, sizeof(breq));
+ breq.count = (__u32)batch;
+ breq.entries = (__u64)(unsigned long)bent;
+ ioctl(nrht_fd, MARUFS_IOC_BATCH_NAME_OFFSET, &breq);
+ done += batch;
+ }
+ printf(" done\n");
+}
+
+static void prefill_cleanup(int nrht_fd, int pid, int count)
+{
+ for (int i = 0; i < count; i++) {
+ struct marufs_name_offset_req req = {0};
+ snprintf(req.name, sizeof(req.name), "pf_%d_%d", pid, i);
+ ioctl(nrht_fd, MARUFS_IOC_CLEAR_NAME, &req);
+ }
+}
+
+/* ── Warmup ──────────────────────────────────────────────────────── */
+
+static void warmup(int nrht_fd, int region_fd, int pid)
+{
+ for (int i = 0; i < 16; i++) {
+ struct marufs_name_offset_req req = {0};
+ snprintf(req.name, sizeof(req.name), "_w_%d_%d", pid, i);
+ req.target_region_fd = region_fd;
+ ioctl(nrht_fd, MARUFS_IOC_NAME_OFFSET, &req);
+ }
+ for (int i = 0; i < 16; i++) {
+ struct marufs_find_name_req freq = {0};
+ snprintf(freq.name, sizeof(freq.name), "_w_%d_%d", pid, i);
+ ioctl(nrht_fd, MARUFS_IOC_FIND_NAME, &freq);
+ }
+ for (int i = 0; i < 16; i++) {
+ struct marufs_name_offset_req req = {0};
+ snprintf(req.name, sizeof(req.name), "_w_%d_%d", pid, i);
+ ioctl(nrht_fd, MARUFS_IOC_CLEAR_NAME, &req);
+ }
+}
+
+/* ── Cleanup helpers ─────────────────────────────────────────────── */
+
+static void cleanup_single(int nrht_fd, int pid, int n, const char *pfx)
+{
+ for (int i = 0; i < n; i++) {
+ struct marufs_name_offset_req req = {0};
+ snprintf(req.name, sizeof(req.name), "%s_%d_%d", pfx, pid, i);
+ ioctl(nrht_fd, MARUFS_IOC_CLEAR_NAME, &req);
+ }
+}
+
+static void cleanup_batch(int nrht_fd, int pid, int n, const char *pfx)
+{
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < BATCH_SIZE; j++) {
+ struct marufs_name_offset_req req = {0};
+ snprintf(req.name, sizeof(req.name),
+ "%s_%d_%d_%d", pfx, pid, i, j);
+ ioctl(nrht_fd, MARUFS_IOC_CLEAR_NAME, &req);
+ }
+ }
+}
+
+/* ── Benchmarks (single) ────────────────────────────────────────── */
+
+static struct stats bench_single_insert(int nrht_fd, int region_fd, int pid,
+ long long *s, int n, int *order)
+{
+ for (int i = 0; i < n; i++) {
+ int idx = order[i];
+ struct marufs_name_offset_req req = {0};
+ long long t0;
+
+ t0 = now_ns();
+ snprintf(req.name, sizeof(req.name), "si_%d_%d", pid, idx);
+ req.offset = (__u64)idx * SLOT_SIZE;
+ req.target_region_fd = region_fd;
+
+ ioctl(nrht_fd, MARUFS_IOC_NAME_OFFSET, &req);
+ s[i] = now_ns() - t0;
+ }
+ return compute_stats(s, n);
+}
+
+static struct stats bench_single_find(int nrht_fd, int pid,
+ long long *s, int n, int *order)
+{
+ for (int i = 0; i < n; i++) {
+ int idx = order[i];
+ struct marufs_find_name_req freq = {0};
+ long long t0;
+
+ t0 = now_ns();
+ snprintf(freq.name, sizeof(freq.name), "si_%d_%d", pid, idx);
+
+ ioctl(nrht_fd, MARUFS_IOC_FIND_NAME, &freq);
+ s[i] = now_ns() - t0;
+ }
+ return compute_stats(s, n);
+}
+
+static struct stats bench_single_clear(int nrht_fd, int pid,
+ long long *s, int n, int *order)
+{
+ for (int i = 0; i < n; i++) {
+ int idx = order[i];
+ struct marufs_name_offset_req req = {0};
+ long long t0;
+
+ t0 = now_ns();
+ snprintf(req.name, sizeof(req.name), "si_%d_%d", pid, idx);
+
+ ioctl(nrht_fd, MARUFS_IOC_CLEAR_NAME, &req);
+ s[i] = now_ns() - t0;
+ }
+ return compute_stats(s, n);
+}
+
+/* ── Benchmarks (batch) ──────────────────────────────────────────── */
+
+static struct stats bench_batch_insert(int nrht_fd, int region_fd, int pid,
+ long long *s, int n, int *order)
+{
+ struct marufs_name_offset_req bent[BATCH_SIZE];
+ struct marufs_batch_name_offset_req breq;
+
+ for (int i = 0; i < n; i++) {
+ int idx = order[i];
+ long long t0;
+
+ t0 = now_ns();
+ for (int j = 0; j < BATCH_SIZE; j++) {
+ memset(&bent[j], 0, sizeof(bent[j]));
+ snprintf(bent[j].name, sizeof(bent[j].name),
+ "bi_%d_%d_%d", pid, idx, j);
+ bent[j].offset =
+ (__u64)(idx * BATCH_SIZE + j) * SLOT_SIZE;
+ bent[j].target_region_fd = region_fd;
+ }
+
+ memset(&breq, 0, sizeof(breq));
+ breq.count = BATCH_SIZE;
+ breq.entries = (__u64)(unsigned long)bent;
+
+ ioctl(nrht_fd, MARUFS_IOC_BATCH_NAME_OFFSET, &breq);
+ s[i] = now_ns() - t0;
+ }
+ return compute_stats(s, n);
+}
+
+static struct stats bench_batch_find(int nrht_fd, int pid,
+ long long *s, int n, int *order)
+{
+ struct marufs_find_name_req bent[BATCH_SIZE];
+ struct marufs_batch_find_req breq;
+
+ for (int i = 0; i < n; i++) {
+ int idx = order[i];
+ long long t0;
+
+ t0 = now_ns();
+ for (int j = 0; j < BATCH_SIZE; j++) {
+ memset(&bent[j], 0, sizeof(bent[j]));
+ snprintf(bent[j].name, sizeof(bent[j].name),
+ "bi_%d_%d_%d", pid, idx, j);
+ }
+
+ memset(&breq, 0, sizeof(breq));
+ breq.count = BATCH_SIZE;
+ breq.entries = (__u64)(unsigned long)bent;
+
+ ioctl(nrht_fd, MARUFS_IOC_BATCH_FIND_NAME, &breq);
+ s[i] = now_ns() - t0;
+ }
+ return compute_stats(s, n);
+}
+
+/* ── Main ────────────────────────────────────────────────────────── */
+
+int main(int argc, char **argv)
+{
+ const char *mount_path = NULL;
+ int iters = 1000;
+ int size_mb = 64;
+ int do_shuffle = 0;
+ int prefill_count = 0;
+ __u32 me_strategy = MARUFS_ME_REQUEST;
+ char filepath[512];
+ char nrht_path[512];
+ int fd, nrht_fd, pid = (int)getpid();
+ long long *samples;
+ struct stats st, st_amort;
+ int *order;
+
+ /* Parse args */
+ for (int i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "--shuffle") == 0) {
+ do_shuffle = 1;
+ } else if (strcmp(argv[i], "--prefill") == 0 && i + 1 < argc) {
+ prefill_count = atoi(argv[++i]);
+ } else if (strcmp(argv[i], "-n") == 0 && i + 1 < argc) {
+ iters = atoi(argv[++i]);
+ } else if (strcmp(argv[i], "-s") == 0 && i + 1 < argc) {
+ size_mb = atoi(argv[++i]);
+ } else if (strcmp(argv[i], "--strategy") == 0 && i + 1 < argc) {
+ const char *v = argv[++i];
+ if (!strcmp(v, "order"))
+ me_strategy = MARUFS_ME_ORDER;
+ else if (!strcmp(v, "request"))
+ me_strategy = MARUFS_ME_REQUEST;
+ else {
+ fprintf(stderr,
+ "invalid --strategy '%s' (order|request)\n",
+ v);
+ return 1;
+ }
+ } else if (argv[i][0] != '-' && !mount_path) {
+ mount_path = argv[i];
+ } else {
+ fprintf(stderr,
+ "Usage: %s [-n iters] [-s size_mb] "
+ "[--shuffle] [--prefill N] "
+ "[--strategy order|request]\n", argv[0]);
+ return 1;
+ }
+ }
+
+ if (!mount_path) {
+ fprintf(stderr,
+ "Usage: %s [-n iters] [-s size_mb] "
+ "[--shuffle] [--prefill N] "
+ "[--strategy order|request]\n", argv[0]);
+ return 1;
+ }
+
+ if (iters <= 0 || iters > 100000) {
+ fprintf(stderr, "iterations must be 1..100000\n");
+ return 1;
+ }
+
+ srand((unsigned)pid ^ (unsigned)time(NULL));
+
+ /* Create region file */
+ snprintf(filepath, sizeof(filepath), "%s/bench_%d", mount_path, pid);
+ fd = open(filepath, O_CREAT | O_RDWR | O_CLOEXEC, 0644);
+ if (fd < 0) { perror("open"); return 1; }
+ if (ftruncate(fd, (__off_t)size_mb * 1024 * 1024) < 0) {
+ perror("ftruncate");
+ close(fd); unlink(filepath); return 1;
+ }
+
+ /* Create NRHT file */
+ snprintf(nrht_path, sizeof(nrht_path), "%s/bench_nrht_%d", mount_path, pid);
+ nrht_fd = create_nrht(mount_path, pid, me_strategy);
+ if (nrht_fd < 0) {
+ perror("NRHT init");
+ close(fd); unlink(filepath); return 1;
+ }
+
+ samples = calloc((size_t)iters, sizeof(long long));
+ if (!samples) {
+ close(nrht_fd); unlink(nrht_path);
+ close(fd); unlink(filepath); return 1;
+ }
+
+ printf("=== MARUFS name-ref ioctl microbenchmark ===\n");
+ printf("mount=%s region=%dMB iterations=%d batch=%d pid=%d\n",
+ mount_path, size_mb, iters, BATCH_SIZE, pid);
+ printf("mode=%s prefill=%d me_strategy=%s\n\n",
+ do_shuffle ? "SHUFFLE (random)" : "SEQUENTIAL (cache-warm)",
+ prefill_count, strategy_name(me_strategy));
+
+ warmup(nrht_fd, fd, pid);
+
+ /* Prefill background entries for cache pressure */
+ if (prefill_count > 0)
+ prefill(nrht_fd, fd, pid, prefill_count);
+
+ /* ── [1] Single operations ───────────────────────────────── */
+
+ printf("\n[1] Single operations (ns/call)\n");
+ print_header();
+
+ order = make_order(iters, do_shuffle);
+ if (!order) {
+ free(samples);
+ close(nrht_fd); unlink(nrht_path);
+ close(fd); unlink(filepath); return 1;
+ }
+
+ st = bench_single_insert(nrht_fd, fd, pid, samples, iters, order);
+ print_stats("NAME_OFFSET insert", &st, iters);
+
+ if (do_shuffle) shuffle_int(order, iters);
+ st = bench_single_find(nrht_fd, pid, samples, iters, order);
+ print_stats("FIND_NAME lookup", &st, iters);
+
+ if (do_shuffle) shuffle_int(order, iters);
+ st = bench_single_clear(nrht_fd, pid, samples, iters, order);
+ print_stats("CLEAR_NAME delete", &st, iters);
+
+ free(order);
+
+ /* ── [2] Batch operations (bs=32) ────────────────────────── */
+
+ printf("\n[2] Batch operations, batch_size=%d (ns/call)\n", BATCH_SIZE);
+ print_header();
+
+ order = make_order(iters, do_shuffle);
+ if (!order) {
+ free(samples);
+ close(nrht_fd); unlink(nrht_path);
+ close(fd); unlink(filepath); return 1;
+ }
+
+ st = bench_batch_insert(nrht_fd, fd, pid, samples, iters, order);
+ print_stats("BATCH_NAME_OFFSET insert", &st, iters);
+
+ if (do_shuffle) shuffle_int(order, iters);
+ st = bench_batch_find(nrht_fd, pid, samples, iters, order);
+ print_stats("BATCH_FIND_NAME lookup", &st, iters);
+
+ free(order);
+
+ /* ── [3] Amortized per-entry cost ────────────────────────── */
+
+ printf("\n[3] Amortized per-entry cost (ns/entry, batch=%d vs single)\n",
+ BATCH_SIZE);
+ print_header();
+
+ cleanup_batch(nrht_fd, pid, iters, "bi");
+
+ order = make_order(iters, do_shuffle);
+ if (!order) {
+ free(samples);
+ close(nrht_fd); unlink(nrht_path);
+ close(fd); unlink(filepath); return 1;
+ }
+
+ st = bench_batch_insert(nrht_fd, fd, pid, samples, iters, order);
+ st_amort = st;
+ st_amort.mean /= BATCH_SIZE;
+ st_amort.median /= BATCH_SIZE;
+ st_amort.p99 /= BATCH_SIZE;
+ st_amort.p999 /= BATCH_SIZE;
+ st_amort.min /= BATCH_SIZE;
+ st_amort.max /= BATCH_SIZE;
+ print_stats("batch insert (per entry)", &st_amort, iters);
+
+ cleanup_batch(nrht_fd, pid, iters, "bi");
+ cleanup_single(nrht_fd, pid, iters, "si");
+
+ if (do_shuffle) shuffle_int(order, iters);
+ st = bench_single_insert(nrht_fd, fd, pid, samples, iters, order);
+ print_stats("single insert (per entry)", &st, iters);
+
+ if (st_amort.mean > 0) {
+ printf("\n -> batch amortized speedup: %.1fx\n",
+ (double)st.mean / (double)st_amort.mean);
+ }
+
+ free(order);
+
+ /* ── [4] Throughput ──────────────────────────────────────── */
+
+ printf("\n[4] Throughput\n");
+
+ cleanup_single(nrht_fd, pid, iters, "si");
+
+ {
+ long long t0, elapsed;
+ double ops;
+ int *torder = make_order(iters, do_shuffle);
+ if (!torder) {
+ free(samples);
+ close(nrht_fd); unlink(nrht_path);
+ close(fd); unlink(filepath); return 1;
+ }
+
+ /* Single insert */
+ t0 = now_ns();
+ for (int i = 0; i < iters; i++) {
+ int idx = torder[i];
+ struct marufs_name_offset_req req = {0};
+ long long ti = now_ns();
+ snprintf(req.name, sizeof(req.name),
+ "tp_%d_%d", pid, idx);
+ req.offset = (__u64)idx * SLOT_SIZE;
+ req.target_region_fd = fd;
+ ioctl(nrht_fd, MARUFS_IOC_NAME_OFFSET, &req);
+ samples[i] = now_ns() - ti;
+ }
+ elapsed = now_ns() - t0;
+ ops = (double)iters / ((double)elapsed / 1e9);
+ st = compute_stats(samples, iters);
+ printf(" NAME_OFFSET insert: %8.0f ops/sec p50=%lld p99=%lld p999=%lld ns\n",
+ ops, st.median, st.p99, st.p999);
+
+ /* Single lookup */
+ if (do_shuffle) shuffle_int(torder, iters);
+ t0 = now_ns();
+ for (int i = 0; i < iters; i++) {
+ int idx = torder[i];
+ struct marufs_find_name_req freq = {0};
+ long long ti = now_ns();
+ snprintf(freq.name, sizeof(freq.name),
+ "tp_%d_%d", pid, idx);
+ ioctl(nrht_fd, MARUFS_IOC_FIND_NAME, &freq);
+ samples[i] = now_ns() - ti;
+ }
+ elapsed = now_ns() - t0;
+ ops = (double)iters / ((double)elapsed / 1e9);
+ st = compute_stats(samples, iters);
+ printf(" FIND_NAME lookup: %8.0f ops/sec p50=%lld p99=%lld p999=%lld ns\n",
+ ops, st.median, st.p99, st.p999);
+
+ cleanup_single(nrht_fd, pid, iters, "tp");
+
+ /* Batch insert (entries/sec) */
+ {
+ struct marufs_name_offset_req bent[BATCH_SIZE];
+ struct marufs_batch_name_offset_req breq;
+
+ if (do_shuffle) shuffle_int(torder, iters);
+ t0 = now_ns();
+ for (int i = 0; i < iters; i++) {
+ int idx = torder[i];
+ long long ti = now_ns();
+ for (int j = 0; j < BATCH_SIZE; j++) {
+ memset(&bent[j], 0, sizeof(bent[j]));
+ snprintf(bent[j].name,
+ sizeof(bent[j].name),
+ "btp_%d_%d_%d", pid, idx, j);
+ bent[j].offset =
+ (__u64)(idx * BATCH_SIZE + j) *
+ SLOT_SIZE;
+ bent[j].target_region_fd = fd;
+ }
+ memset(&breq, 0, sizeof(breq));
+ breq.count = BATCH_SIZE;
+ breq.entries = (__u64)(unsigned long)bent;
+ ioctl(nrht_fd, MARUFS_IOC_BATCH_NAME_OFFSET, &breq);
+ samples[i] = now_ns() - ti;
+ }
+ elapsed = now_ns() - t0;
+ ops = (double)(iters * BATCH_SIZE) /
+ ((double)elapsed / 1e9);
+ st = compute_stats(samples, iters);
+ printf(" BATCH_NAME_OFFSET bs=32: %8.0f entries/sec "
+ "p50=%lld p99=%lld p999=%lld ns/batch\n",
+ ops, st.median, st.p99, st.p999);
+ printf(" %8s "
+ "p50=%lld p99=%lld p999=%lld ns/entry\n",
+ "", st.median / BATCH_SIZE,
+ st.p99 / BATCH_SIZE, st.p999 / BATCH_SIZE);
+
+ cleanup_batch(nrht_fd, pid, iters, "btp");
+ }
+
+ /* Batch lookup (entries/sec) */
+ {
+ struct marufs_name_offset_req bent[BATCH_SIZE];
+ struct marufs_batch_name_offset_req breq;
+
+ if (do_shuffle) shuffle_int(torder, iters);
+ for (int i = 0; i < iters; i++) {
+ int idx = torder[i];
+ for (int j = 0; j < BATCH_SIZE; j++) {
+ memset(&bent[j], 0, sizeof(bent[j]));
+ snprintf(bent[j].name,
+ sizeof(bent[j].name),
+ "bfl_%d_%d_%d", pid, idx, j);
+ bent[j].offset =
+ (__u64)(idx * BATCH_SIZE + j) *
+ SLOT_SIZE;
+ bent[j].target_region_fd = fd;
+ }
+ memset(&breq, 0, sizeof(breq));
+ breq.count = BATCH_SIZE;
+ breq.entries = (__u64)(unsigned long)bent;
+ ioctl(nrht_fd, MARUFS_IOC_BATCH_NAME_OFFSET, &breq);
+ }
+
+ {
+ struct marufs_find_name_req fbent[BATCH_SIZE];
+ struct marufs_batch_find_req fbreq;
+
+ if (do_shuffle) shuffle_int(torder, iters);
+ t0 = now_ns();
+ for (int i = 0; i < iters; i++) {
+ int idx = torder[i];
+ long long ti = now_ns();
+ for (int j = 0; j < BATCH_SIZE; j++) {
+ memset(&fbent[j], 0,
+ sizeof(fbent[j]));
+ snprintf(fbent[j].name,
+ sizeof(fbent[j].name),
+ "bfl_%d_%d_%d",
+ pid, idx, j);
+ }
+ fbreq.count = BATCH_SIZE;
+ fbreq.entries =
+ (__u64)(unsigned long)fbent;
+ ioctl(nrht_fd, MARUFS_IOC_BATCH_FIND_NAME,
+ &fbreq);
+ samples[i] = now_ns() - ti;
+ }
+ elapsed = now_ns() - t0;
+ ops = (double)(iters * BATCH_SIZE) /
+ ((double)elapsed / 1e9);
+ st = compute_stats(samples, iters);
+ printf(" BATCH_FIND_NAME bs=32: "
+ "%8.0f entries/sec "
+ "p50=%lld p99=%lld p999=%lld ns/batch\n",
+ ops, st.median, st.p99, st.p999);
+ printf(" "
+ "%8s "
+ "p50=%lld p99=%lld p999=%lld ns/entry\n",
+ "", st.median / BATCH_SIZE,
+ st.p99 / BATCH_SIZE,
+ st.p999 / BATCH_SIZE);
+ }
+
+ cleanup_batch(nrht_fd, pid, iters, "bfl");
+ }
+
+ free(torder);
+ }
+
+ /* Cleanup */
+ if (prefill_count > 0) {
+ printf("\n cleaning up prefill entries...");
+ fflush(stdout);
+ prefill_cleanup(nrht_fd, pid, prefill_count);
+ printf(" done\n");
+ }
+
+ free(samples);
+ close(nrht_fd);
+ unlink(nrht_path);
+ close(fd);
+ unlink(filepath);
+
+ printf("\nDone.\n");
+ return 0;
+}
diff --git a/marufs_kernel/tests/dax_zero.c b/marufs_kernel/tests/dax_zero.c
new file mode 100644
index 0000000..3bd6842
--- /dev/null
+++ b/marufs_kernel/tests/dax_zero.c
@@ -0,0 +1,46 @@
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int main(int argc, char *argv[])
+{
+ int fd;
+ void *addr;
+ off_t size;
+
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s \n",
+ argv[0]);
+ return 1;
+ }
+
+ fd = open(argv[1], O_RDWR);
+ if (fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ size = strtoull(argv[2], NULL, 0);
+ if (size <= 0) {
+ fprintf(stderr, "Invalid size\n");
+ return 1;
+ }
+
+ addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ return 1;
+ }
+
+ memset(addr, 0, size);
+
+ munmap(addr, size);
+ close(fd);
+
+ return 0;
+}
\ No newline at end of file
diff --git a/marufs_kernel/tests/monitor_me.sh b/marufs_kernel/tests/monitor_me.sh
new file mode 100755
index 0000000..17be56c
--- /dev/null
+++ b/marufs_kernel/tests/monitor_me.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# monitor_me.sh — periodic dump of /sys/fs/marufs/me_info
+#
+# Usage:
+# sudo ./monitor_me.sh # all ME, 200ms interval
+# sudo ./monitor_me.sh 0.05 # all ME, 50ms interval
+# sudo ./monitor_me.sh 0.1 global # only Global ME
+# sudo ./monitor_me.sh 0.1 0 # only NRHT region 0
+# sudo ./monitor_me.sh 0.1 all timestamp # prefix every line with timestamp
+# sudo ./monitor_me.sh 0.1 all diff # only print when output changes
+#
+# Requires marufs.ko with me_info sysfs (sysfs.c).
+
+set -u
+
+SYS=/sys/fs/marufs/me_info
+INTERVAL="${1:-0.2}"
+FILTER="${2:-all}"
+MODE="${3:-plain}" # plain | timestamp | diff
+
+if [ ! -e "$SYS" ]; then
+ echo "ERROR: $SYS missing — rebuild + reinstall marufs.ko" >&2
+ exit 1
+fi
+
+# Select filter (global | | all)
+echo "$FILTER" > "$SYS" || { echo "ERROR: failed to set filter '$FILTER'"; exit 1; }
+
+prev=""
+while :; do
+ ts="$(date '+%H:%M:%S.%3N')"
+ cur="$(cat "$SYS")"
+
+ case "$MODE" in
+ diff)
+ if [ "$cur" != "$prev" ]; then
+ printf '===== %s =====\n%s\n' "$ts" "$cur"
+ prev="$cur"
+ fi
+ ;;
+ timestamp)
+ printf '%s\n' "$cur" | sed "s|^|$ts |"
+ echo "---"
+ ;;
+ plain|*)
+ clear 2>/dev/null || true
+ printf '===== %s (filter=%s) =====\n' "$ts" "$FILTER"
+ printf '%s\n' "$cur"
+ ;;
+ esac
+ sleep "$INTERVAL"
+done
diff --git a/marufs_kernel/tests/setup_local_multinode.sh b/marufs_kernel/tests/setup_local_multinode.sh
new file mode 100755
index 0000000..582805c
--- /dev/null
+++ b/marufs_kernel/tests/setup_local_multinode.sh
@@ -0,0 +1,453 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# setup_local_multinode.sh - MARUFS Local Multi-Node Environment Setup
+#
+# Builds MARUFS, formats a DEV_DAX device with a shared region pool,
+# and mounts it twice with different node_ids for multi-node testing.
+#
+# Usage:
+# sudo ./setup_local_multinode.sh # Default: /dev/dax6.0
+# sudo ./setup_local_multinode.sh --device /dev/dax0.0 # Custom device
+# sudo ./setup_local_multinode.sh --daxheap # DAXHEAP mode (WC mmap)
+# sudo ./setup_local_multinode.sh --teardown # Unmount + unload
+# sudo ./setup_local_multinode.sh --status # Show current state
+#
+# After setup:
+# Node 0: /mnt/marufs (read-write via shared pool allocation)
+# Node 1: /mnt/marufs2 (read-write via shared pool allocation)
+# Both nodes can read all files and write to dynamically allocated regions.
+
+set -euo pipefail
+
+# --- Configuration (override via environment or flags) ---
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+MODULE_NAME="${MARUFS_MODULE_NAME:-marufs}" # Filesystem type and module name
+DAX_DEVICE="${MARUFS_DAX_DEVICE:-/dev/dax6.0}"
+NUM_MOUNTS="${MARUFS_NUM_MOUNTS:-2}" # Number of simulated nodes (1..8)
+# Per-index defaults: MARUFS_MOUNT_=... / MARUFS_NODE_=... override below.
+# Mount 0 uses /mnt/, mount i>0 uses /mnt/(i+1). Node IDs default to i+1.
+MOUNT_POINT_0="${MARUFS_MOUNT_0:-/mnt/${MODULE_NAME}}"
+MOUNT_POINT_1="${MARUFS_MOUNT_1:-/mnt/${MODULE_NAME}2}"
+NODE_ID_0="${MARUFS_NODE_0:-1}"
+NODE_ID_1="${MARUFS_NODE_1:-2}"
+BUILD_DIR="${MARUFS_BUILD_DIR:-$PROJECT_DIR/build}"
+NUM_SHARDS="${MARUFS_NUM_SHARDS:-64}"
+NUM_REGIONS="${MARUFS_NUM_REGIONS:-4}"
+REGION_OWNERS="${MARUFS_REGION_OWNERS:-}" # auto-set below if empty
+CHMOD_MODE="${MARUFS_CHMOD:-1777}" # permissions for mount points
+ME_STRATEGY="${MARUFS_ME_STRATEGY:-request}" # ME strategy: order or request
+LEGACY_NODE_ID="${MARUFS_LEGACY_NODE_ID:-false}" # true = pass node_id= explicitly (old style)
+
+# DAXHEAP configuration
+USE_DAXHEAP="${MARUFS_DAXHEAP:-false}"
+DAXHEAP_DIR="${MARUFS_DAXHEAP_DIR:-}"
+DAXHEAP_MODULE="${DAXHEAP_DIR}/kernel/core/daxheap.ko"
+DAXHEAP_SIZE="${MARUFS_DAXHEAP_SIZE:-192G}" # Allocation size from daxheap buffer
+
+# Derived paths
+MODULE_PATH="$BUILD_DIR/${MODULE_NAME}.ko"
+
+# --- Colors ---
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+log_info() { echo -e "${CYAN}[INFO]${NC} [$MODULE_NAME] $1"; }
+log_success() { echo -e "${GREEN}[ OK ]${NC} [$MODULE_NAME] $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} [$MODULE_NAME] $1"; }
+log_error() { echo -e "${RED}[ERR ]${NC} [$MODULE_NAME] $1"; }
+
+# --- Helper: get device size in bytes from sysfs ---
+get_dax_size_bytes() {
+ local dev_name
+ dev_name=$(basename "$DAX_DEVICE")
+ local size_file="/sys/bus/dax/devices/${dev_name}/size"
+ if [ -f "$size_file" ]; then
+ cat "$size_file"
+ else
+ echo "0"
+ fi
+}
+
+# --- Argument Parsing ---
+ACTION="setup"
+SKIP_BUILD=false
+
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --device) DAX_DEVICE="$2"; shift 2 ;;
+ --mount-0) MOUNT_POINT_0="$2"; shift 2 ;;
+ --mount-1) MOUNT_POINT_1="$2"; shift 2 ;;
+ --node-0) NODE_ID_0="$2"; shift 2 ;;
+ --node-1) NODE_ID_1="$2"; shift 2 ;;
+ --num-mounts) NUM_MOUNTS="$2"; shift 2 ;;
+ --num-regions) NUM_REGIONS="$2"; shift 2 ;;
+ --num-shards) NUM_SHARDS="$2"; shift 2 ;;
+ --daxheap) USE_DAXHEAP=true; shift ;;
+ --daxheap-dir) DAXHEAP_DIR="$2"; DAXHEAP_MODULE="${DAXHEAP_DIR}/kernel/core/daxheap.ko"; shift 2 ;;
+ --daxheap-size) DAXHEAP_SIZE="$2"; shift 2 ;;
+ --skip-build) SKIP_BUILD=true; shift ;;
+ --me-strategy) ME_STRATEGY="$2"; shift 2 ;;
+ --legacy) LEGACY_NODE_ID=true; shift ;;
+ --teardown) ACTION="teardown"; shift ;;
+ --status) ACTION="status"; shift ;;
+ --help|-h)
+ cat <<'EOF'
+Usage: sudo ./setup_local_multinode.sh [OPTIONS]
+
+Actions:
+ (default) Build, format, mount (full setup)
+ --teardown Unmount all and unload module
+ --status Show current state
+
+Options:
+ --device DEV DAX device (default: /dev/dax6.0)
+ --daxheap Use DAXHEAP mode (WC mmap for GPU high-bandwidth)
+ --daxheap-dir DIR daxheap source directory (required when --daxheap)
+ --daxheap-size SZ daxheap allocation size (default: 100G)
+ --num-mounts N Number of mount points / simulated nodes, 1..8 (default: 2)
+ --mount-0 PATH Node 0 mount point (default: /mnt/marufs)
+ --mount-1 PATH Node 1 mount point (default: /mnt/marufs2)
+ (use MARUFS_MOUNT_=... env var for indices >= 2)
+ --node-0 ID Node 0 ID (default: 1)
+ --node-1 ID Node 1 ID (default: 2)
+ (use MARUFS_NODE_=... env var for indices >= 2)
+ --num-regions N Number of regions (default: 4)
+ --num-shards N Number of shards (default: 64)
+ --skip-build Skip build step (use existing binaries)
+ --me-strategy S ME strategy: order (default) or request
+
+Environment:
+ MARUFS_DAX_DEVICE, MARUFS_MOUNT_0, MARUFS_MOUNT_1, MARUFS_NODE_0, MARUFS_NODE_1,
+ MARUFS_BUILD_DIR, MARUFS_NUM_SHARDS, MARUFS_NUM_REGIONS, MARUFS_CHMOD,
+ MARUFS_DAXHEAP (true/false), MARUFS_DAXHEAP_DIR
+
+Examples:
+ sudo ./setup_local_multinode.sh
+ sudo ./setup_local_multinode.sh --daxheap
+ sudo ./setup_local_multinode.sh --device /dev/dax0.0 --num-regions 2
+ sudo ./setup_local_multinode.sh --teardown
+EOF
+ exit 0
+ ;;
+ *)
+ log_error "Unknown option: $1"
+ exit 1
+ ;;
+ esac
+done
+
+# --- Validate + expand NUM_MOUNTS into MOUNT_POINTS[] / NODE_IDS[] ---
+if ! [[ "$NUM_MOUNTS" =~ ^[1-8]$ ]]; then
+ log_error "--num-mounts must be 1..8 (got: $NUM_MOUNTS)"
+ exit 1
+fi
+
+# Default mount path for index i: /mnt/ (i=0) | /mnt/(i+1) (i>0)
+default_mount_for_idx() {
+ local i=$1
+ if [ "$i" -eq 0 ]; then
+ echo "/mnt/${MODULE_NAME}"
+ else
+ echo "/mnt/${MODULE_NAME}$((i + 1))"
+ fi
+}
+
+MOUNT_POINTS=()
+NODE_IDS=()
+for ((i = 0; i < NUM_MOUNTS; i++)); do
+ # MARUFS_MOUNT_ / MARUFS_NODE_ env overrides, else defaults.
+ m_var="MARUFS_MOUNT_$i"
+ n_var="MARUFS_NODE_$i"
+ default_mp=$(default_mount_for_idx "$i")
+ default_nid=$((i + 1))
+
+ # Preserve --mount-0/--mount-1/--node-0/--node-1 CLI flags for back-compat.
+ if [ "$i" -eq 0 ]; then default_mp="$MOUNT_POINT_0"; default_nid="$NODE_ID_0"; fi
+ if [ "$i" -eq 1 ]; then default_mp="$MOUNT_POINT_1"; default_nid="$NODE_ID_1"; fi
+
+ MOUNT_POINTS[i]="${!m_var:-$default_mp}"
+ NODE_IDS[i]="${!n_var:-$default_nid}"
+done
+
+# Note: REGION_OWNERS variable is legacy and no longer used
+# Regions are now dynamically allocated from a shared pool
+if [ -z "$REGION_OWNERS" ]; then
+ REGION_OWNERS=$(IFS=,; echo "${NODE_IDS[*]}")
+fi
+
+# ============================================================================
+# Status
+# ============================================================================
+do_status() {
+ echo "============================================"
+ echo " ${MODULE_NAME} Environment Status"
+ echo "============================================"
+
+ # daxheap module
+ echo -n " daxheap: "
+ if grep -q "^daxheap " /proc/modules; then
+ echo -e "${GREEN}loaded${NC}"
+ else
+ echo -e "${YELLOW}not loaded${NC}"
+ fi
+
+ # Module
+ echo -n " Module: "
+ if grep -q "^${MODULE_NAME} " /proc/modules; then
+ echo -e "${GREEN}loaded${NC}"
+ local node_id_file="/sys/fs/${MODULE_NAME}/node_id"
+ [ -f "$node_id_file" ] && echo " node_id=$(cat $node_id_file)"
+ else
+ echo -e "${YELLOW}not loaded${NC}"
+ fi
+
+ # Mounts
+ for ((i = 0; i < NUM_MOUNTS; i++)); do
+ local mp="${MOUNT_POINTS[i]}"
+ echo -n " Mount $i: "
+ if mount | grep -q "$mp type ${MODULE_NAME}"; then
+ local count=$(ls -1 "$mp" 2>/dev/null | wc -l)
+ echo -e "${GREEN}$mp${NC} ($count files)"
+ else
+ echo -e "${YELLOW}not mounted${NC} ($mp)"
+ fi
+ done
+
+ # Device — detect from mount if available, fallback to DAX_DEVICE
+ local actual_dev="$DAX_DEVICE"
+ local mounted_opts
+ mounted_opts=$(mount | grep "type ${MODULE_NAME}" | head -1 | sed -n 's/.*daxdev=\([^,)]*\).*/\1/p')
+ if [ -n "$mounted_opts" ]; then
+ actual_dev="$mounted_opts"
+ fi
+ echo -n " Device: "
+ if [ -e "$actual_dev" ]; then
+ echo -e "${GREEN}$actual_dev${NC}"
+ local size_file="/sys/bus/dax/devices/$(basename $actual_dev)/size"
+ if [ -f "$size_file" ]; then
+ local size_bytes=$(cat "$size_file")
+ local size_gb=$(awk "BEGIN {printf \"%.1f\", $size_bytes/1024/1024/1024}")
+ echo " size=${size_gb} GB"
+ fi
+ else
+ echo -e "${RED}$actual_dev not found${NC}"
+ fi
+
+ # Filesystem stats (based on mount 0)
+ if mount | grep -q "${MOUNT_POINTS[0]} type ${MODULE_NAME}"; then
+ echo ""
+ echo " Filesystem:"
+ df -h "${MOUNT_POINTS[0]}" 2>/dev/null | tail -1 | awk '{printf " Size: %s Used: %s Avail: %s Use%%: %s\n", $2, $3, $4, $5}'
+ fi
+
+ echo "============================================"
+}
+
+# ============================================================================
+# Teardown
+# ============================================================================
+do_teardown() {
+ echo "============================================"
+ echo " ${MODULE_NAME} Environment Teardown"
+ echo "============================================"
+
+ # Delegate unmount + unload to uninstall.sh
+ MARUFS_MODULE_NAME="$MODULE_NAME" "$PROJECT_DIR/uninstall.sh" --force
+
+ # Note: daxheap module is NOT unloaded here — managed externally
+
+ echo ""
+ log_success "Teardown complete"
+}
+
+# ============================================================================
+# Setup
+# ============================================================================
+do_setup() {
+ local mode_label="DEV_DAX"
+ if [ "$USE_DAXHEAP" = true ]; then
+ mode_label="DAXHEAP (WC mmap)"
+ fi
+
+ echo "============================================"
+ echo " ${MODULE_NAME} Local Multi-Node Setup"
+ echo "============================================"
+ echo " Mode: $mode_label"
+ echo " Device: $DAX_DEVICE"
+ echo " Mounts: $NUM_MOUNTS"
+ for ((i = 0; i < NUM_MOUNTS; i++)); do
+ printf " Node %d: %s (node_id=%s)\n" \
+ "$i" "${MOUNT_POINTS[i]}" "${NODE_IDS[i]}"
+ done
+ echo " Regions: $NUM_REGIONS (shared pool with dynamic allocation)"
+ echo " Shards: $NUM_SHARDS"
+ if [ "$USE_DAXHEAP" = true ]; then
+ echo " daxheap dir: $DAXHEAP_DIR"
+ fi
+ echo "============================================"
+ echo ""
+
+ # --- Pre-flight checks ---
+ log_info "Pre-flight checks..."
+
+ if [ "$(id -u)" -ne 0 ]; then
+ log_error "Must run as root (sudo)"
+ exit 1
+ fi
+
+ if [ "$USE_DAXHEAP" = false ] && [ ! -e "$DAX_DEVICE" ]; then
+ log_error "Device not found: $DAX_DEVICE"
+ exit 1
+ fi
+
+ if [ "$USE_DAXHEAP" = true ] && [ ! -f "$DAXHEAP_MODULE" ]; then
+ log_error "daxheap module not found: $DAXHEAP_MODULE"
+ log_error "Build daxheap first: cd $DAXHEAP_DIR/kernel/core && make"
+ exit 1
+ fi
+
+ log_success "Pre-flight OK"
+ echo ""
+
+ # --- Step 1-3: Build + Clean + Load via install.sh ---
+ log_info "Step 1-3/5: Build, clean, and load module via install.sh..."
+
+ local install_args="--node-id $NODE_ID_0 --build-dir $BUILD_DIR --module-name $MODULE_NAME"
+ if [ "$SKIP_BUILD" = true ]; then
+ install_args="$install_args --skip-build"
+ fi
+ if [ "$USE_DAXHEAP" = true ]; then
+ install_args="$install_args --daxheap --daxheap-dir $DAXHEAP_DIR"
+ fi
+
+ # Clean existing state first via uninstall.sh
+ MARUFS_MODULE_NAME="$MODULE_NAME" "$PROJECT_DIR/uninstall.sh" --force 2>/dev/null || true
+
+ # Load daxheap before MARUFS if needed
+ if [ "$USE_DAXHEAP" = true ]; then
+ if grep -q "^daxheap " /proc/modules; then
+ log_info " daxheap already loaded, skipping insmod"
+ else
+ log_info " Loading daxheap (dax_device=$DAX_DEVICE, host_id=$NODE_ID_0)..."
+ insmod "$DAXHEAP_MODULE" dax_device="$DAX_DEVICE" host_id="$NODE_ID_0"
+ sleep 1
+ if ! grep -q "^daxheap " /proc/modules; then
+ log_error "daxheap module load failed"
+ exit 1
+ fi
+ log_success "daxheap loaded"
+ fi
+ fi
+
+ # Build + load MARUFS module (no mount — we do dual mount below)
+ "$PROJECT_DIR/install.sh" $install_args
+ if [ $? -ne 0 ]; then
+ log_error "install.sh failed"
+ exit 1
+ fi
+
+ log_success "Module ready"
+
+ # --- Step 4: Mount (first mount formats; rest attach) ---
+ log_info "Step 4/4: Mounting $NUM_MOUNTS nodes..."
+
+ for ((i = 0; i < NUM_MOUNTS; i++)); do
+ mkdir -p "${MOUNT_POINTS[i]}"
+ done
+
+ if [ "$USE_DAXHEAP" = true ]; then
+ # DAXHEAP: primary allocates the shared buffer, secondaries attach via bufid.
+ local primary_opts="daxheap=${DAXHEAP_SIZE}"
+ [ "$LEGACY_NODE_ID" = true ] && primary_opts="${primary_opts},node_id=${NODE_IDS[0]}"
+ log_info " DAXHEAP primary mount (buffer size: $DAXHEAP_SIZE, opts: $primary_opts)"
+ mount -t "${MODULE_NAME}" -o "$primary_opts" none "${MOUNT_POINTS[0]}"
+ chmod "$CHMOD_MODE" "${MOUNT_POINTS[0]}"
+ log_success "-> ${MOUNT_POINTS[0]} (DAXHEAP primary, ${DAXHEAP_SIZE})"
+
+ # Read buf_id from sysfs (published by primary mount)
+ local bufid_file="/sys/fs/${MODULE_NAME}/daxheap_bufid"
+ if [ ! -f "$bufid_file" ]; then
+ log_error "buf_id sysfs not found: $bufid_file"
+ exit 1
+ fi
+ local bufid
+ bufid=$(cat "$bufid_file")
+ if [ "$bufid" = "0x0" ] || [ -z "$bufid" ]; then
+ log_error "Primary mount did not publish a valid buf_id"
+ exit 1
+ fi
+
+ for ((i = 1; i < NUM_MOUNTS; i++)); do
+ local sec_opts="daxheap_bufid=${bufid}"
+ [ "$LEGACY_NODE_ID" = true ] && sec_opts="${sec_opts},node_id=${NODE_IDS[i]}"
+ log_info " DAXHEAP secondary mount $i (bufid=${bufid})"
+ mount -t "${MODULE_NAME}" -o "$sec_opts" none "${MOUNT_POINTS[i]}"
+ chmod "$CHMOD_MODE" "${MOUNT_POINTS[i]}"
+ log_success "-> ${MOUNT_POINTS[i]} (DAXHEAP secondary, bufid=${bufid})"
+ done
+ else
+ # DEV_DAX auto-mount: first mounter auto-elects formatter via bootstrap.
+ # Pass me_strategy; omit node_id= for auto-mount (pass it for --legacy).
+ local base_opts="daxdev=${DAX_DEVICE},me_strategy=${ME_STRATEGY}"
+ if [ "$LEGACY_NODE_ID" = true ]; then
+ # Legacy: explicit node_id + format flag on first mount
+ mount -t "${MODULE_NAME}" \
+ -o "${base_opts},node_id=${NODE_IDS[0]},format" \
+ none "${MOUNT_POINTS[0]}"
+ chmod "$CHMOD_MODE" "${MOUNT_POINTS[0]}"
+ log_success "-> ${MOUNT_POINTS[0]} (legacy format, node_id=${NODE_IDS[0]})"
+ for ((i = 1; i < NUM_MOUNTS; i++)); do
+ mount -t "${MODULE_NAME}" \
+ -o "${base_opts},node_id=${NODE_IDS[i]}" \
+ none "${MOUNT_POINTS[i]}"
+ chmod "$CHMOD_MODE" "${MOUNT_POINTS[i]}"
+ log_success "-> ${MOUNT_POINTS[i]} (legacy, node_id=${NODE_IDS[i]})"
+ done
+ else
+ # Auto-mount: no node_id= — bootstrap election assigns node_ids.
+ for ((i = 0; i < NUM_MOUNTS; i++)); do
+ log_info " Auto-mount $i -> ${MOUNT_POINTS[i]}"
+ mount -t "${MODULE_NAME}" \
+ -o "${base_opts}" \
+ none "${MOUNT_POINTS[i]}"
+ chmod "$CHMOD_MODE" "${MOUNT_POINTS[i]}"
+ log_success "-> ${MOUNT_POINTS[i]} (auto-mount)"
+ done
+ fi
+ fi
+
+ # --- Done ---
+ echo ""
+ echo "============================================"
+ echo -e " ${GREEN}Setup Complete!${NC} ($mode_label)"
+ echo "============================================"
+ echo ""
+ echo " Quick test:"
+ echo " touch ${MOUNT_POINTS[0]}/hello.txt # Node ${NODE_IDS[0]} creates"
+ echo " ls ${MOUNT_POINTS[1]:-${MOUNT_POINTS[0]}}/ # Cross-node visibility"
+ echo ""
+ echo " Run tests:"
+ if [ "$USE_DAXHEAP" = true ]; then
+ echo " ./tests/test_local_multinode.sh --daxheap --skip-setup --no-cleanup"
+ else
+ echo " ./tests/test_local_multinode.sh --skip-setup --no-cleanup"
+ fi
+ echo ""
+ echo " Teardown:"
+ echo " sudo ./tests/setup_local_multinode.sh --teardown"
+ echo ""
+}
+
+# ============================================================================
+# Main
+# ============================================================================
+case "$ACTION" in
+ setup) do_setup ;;
+ teardown) do_teardown ;;
+ status) do_status ;;
+esac
diff --git a/marufs_kernel/tests/test_bootstrap_chaos.sh b/marufs_kernel/tests/test_bootstrap_chaos.sh
new file mode 100755
index 0000000..b57a60c
--- /dev/null
+++ b/marufs_kernel/tests/test_bootstrap_chaos.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# test_bootstrap_chaos.sh - Real bootstrap chaos: stuck recovery + concurrent mount race.
+#
+# Standalone test — manages module load/unload and mount lifecycle on its own.
+# Does NOT depend on setup_local_multinode.sh.
+#
+# Tests:
+# T1 stuck-formatter recovery
+# Set bootstrap_inject_stuck_formatter=1, mount node A.
+# A becomes formatter but skips GSB-magic write; slot[0] stays FORMATTING.
+# Mount node B with format_timeout_ms=2000; B waits, detects stuck,
+# steals slot[0], re-formats, succeeds.
+# Verify: B's mount succeeded and slot[0]=CLAIMED.
+#
+# T2 concurrent mount race (N nodes mount simultaneously)
+# Spawn N mounts in parallel. All must succeed (last one becomes
+# formatter if needed; others retry or become joiners).
+# Verify: N CLAIMED slots with unique node_ids.
+#
+# T3 serial mount → umount → remount (slot reuse)
+# Mount A; verify CLAIMED; umount A.
+# Dump: slot[0] EMPTY.
+# Mount A again; verify CLAIMED.
+#
+# Usage:
+# sudo MARUFS_DAX_DEVICE=/dev/dax6.0 ./tests/test_bootstrap_chaos.sh
+# sudo ./tests/test_bootstrap_chaos.sh --keep # keep mounts on failure
+#
+# Environment overrides:
+# MARUFS_DAX_DEVICE block device (default /dev/dax6.0)
+# MARUFS_MODULE_NAME filesystem type name (default marufs)
+# MARUFS_MODULE_KO path to marufs.ko (default ../build/marufs.ko)
+# MARUFS_T2_NODES number of concurrent mounts for T2 (default 4)
+
+set -uo pipefail
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+DAX_DEVICE="${MARUFS_DAX_DEVICE:-/dev/dax2.0}"
+MODULE_NAME="${MARUFS_MODULE_NAME:-marufs}"
+MODULE_KO="${MARUFS_MODULE_KO:-${PROJECT_DIR}/build/${MODULE_NAME}.ko}"
+MNT_BASE="/mnt/${MODULE_NAME}_chaos"
+NUM_NODES_T2="${MARUFS_T2_NODES:-8}"
+KEEP_ON_FAIL=false
+ME_STRATEGY="${MARUFS_ME_STRATEGY:-request}"
+
+# Sysfs paths (module params — writable before any mount)
+PARAM_BASE="/sys/module/${MODULE_NAME}/parameters"
+DUMP_BS="/sys/fs/${MODULE_NAME}/debug/bootstrap_dump"
+
+# Tmp dir for parallel mount rc files
+TMPDIR_RC="$(mktemp -d /tmp/marufs_chaos_XXXXXX)"
+trap 'rm -rf "$TMPDIR_RC"' EXIT
+
+PASS=0
+FAIL=0
+
+# ---------------------------------------------------------------------------
+# Output helpers
+# ---------------------------------------------------------------------------
+info() { echo "[chaos] $*"; }
+pass() { echo " PASS: $1"; PASS=$((PASS + 1)); }
+fail() { echo " FAIL: $1" >&2; FAIL=$((FAIL + 1)); }
+die() { echo "FATAL: $*" >&2; exit 1; }
+
+# ---------------------------------------------------------------------------
+# Pre-flight
+# ---------------------------------------------------------------------------
+[[ $EUID -eq 0 ]] || die "must run as root"
+[[ -c "$DAX_DEVICE" ]] || die "DAX device not found: $DAX_DEVICE"
+[[ -f "$MODULE_KO" ]] || die "marufs.ko not found: $MODULE_KO (build first with make)"
+
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ --keep) KEEP_ON_FAIL=true; shift ;;
+ *) die "unknown option: $1" ;;
+ esac
+done
+
+# ---------------------------------------------------------------------------
+# Module lifecycle helpers
+# ---------------------------------------------------------------------------
+module_loaded() { grep -q "^${MODULE_NAME} " /proc/modules 2>/dev/null; }
+
+load_module() {
+ if module_loaded; then
+ info "module already loaded — unloading first"
+ unload_module
+ fi
+ info "loading $MODULE_KO"
+ insmod "$MODULE_KO" || die "insmod failed"
+ # Verify module param path is accessible
+ local deadline=$((SECONDS + 5))
+ while [[ ! -d "$PARAM_BASE" ]] && [[ $SECONDS -lt $deadline ]]; do
+ sleep 0.2
+ done
+ [[ -d "$PARAM_BASE" ]] || die "module param dir not found: $PARAM_BASE"
+}
+
+unload_module() {
+ # Unmount all chaos mounts first
+ local mp
+ for mp in "${MNT_BASE}"_*; do
+ [[ -d "$mp" ]] || continue
+ if mountpoint -q "$mp" 2>/dev/null; then
+ umount "$mp" 2>/dev/null || umount -l "$mp" 2>/dev/null || true
+ fi
+ done
+ # Also catch any plain MNT_BASE mounts
+ if mountpoint -q "$MNT_BASE" 2>/dev/null; then
+ umount "$MNT_BASE" 2>/dev/null || umount -l "$MNT_BASE" 2>/dev/null || true
+ fi
+ if module_loaded; then
+ # Try several rmmod attempts with backoff — sb teardown / kthreads
+ # holding refs can take a moment after umount returns.
+ local i
+ for i in 1 2 3 4 5 6 7 8 9 10; do
+ module_loaded || break
+ rmmod "$MODULE_NAME" 2>/dev/null && break
+ sleep 0.5
+ done
+ if module_loaded; then
+ info "warning: module still loaded after 5s; lsmod shows:"
+ lsmod | grep "^${MODULE_NAME}" | sed 's/^/ /'
+ info "skipping further unload — manual rmmod may be needed"
+ fi
+ fi
+}
+
+# ---------------------------------------------------------------------------
+# Mount helpers
+# ---------------------------------------------------------------------------
+mount_node() {
+ # mount_node [extra_opts] [timeout_seconds]
+ # mounts at ${MNT_BASE}_${mnt_suffix}
+ # Use unique source per call to avoid util-linux "already mounted" check
+ # that triggers when source string ("none") matches /sys/fs/pstore etc.
+ local suffix="$1"
+ local extra="${2:-}"
+ local t_seconds="${3:-}"
+ local mp="${MNT_BASE}_${suffix}"
+ local src="marufs_${suffix}"
+ mkdir -p "$mp"
+ local opts="daxdev=${DAX_DEVICE},me_strategy=${ME_STRATEGY}"
+ [[ -n "$extra" ]] && opts="${opts},${extra}"
+ if [[ -n "$t_seconds" ]]; then
+ timeout "$t_seconds" mount -t "${MODULE_NAME}" -o "$opts" "$src" "$mp"
+ else
+ mount -t "${MODULE_NAME}" -o "$opts" "$src" "$mp"
+ fi
+}
+
+umount_node() {
+ local suffix="$1"
+ local mp="${MNT_BASE}_${suffix}"
+ if mountpoint -q "$mp" 2>/dev/null; then
+ umount "$mp" || umount -l "$mp" || true
+ fi
+ rmdir "$mp" 2>/dev/null || true
+}
+
+# ---------------------------------------------------------------------------
+# Module param helpers
+# ---------------------------------------------------------------------------
+set_param() {
+ local name="$1" val="$2"
+ local path="${PARAM_BASE}/${name}"
+ [[ -w "$path" ]] || die "module param not writable: $path"
+ echo "$val" > "$path"
+}
+
+get_param() {
+ local name="$1"
+ cat "${PARAM_BASE}/${name}" 2>/dev/null || echo ""
+}
+
+# ---------------------------------------------------------------------------
+# Slot dump helpers
+# ---------------------------------------------------------------------------
+dump_slots() {
+ # Print only the slot lines from the first mount section (node=1).
+ # Works whether there is one mount or many. awk is robust against
+ # missing-next-section (single-mount) scenarios that broke the pure
+ # grep-pipe approach with pipefail.
+ if [[ -r "$DUMP_BS" ]]; then
+ awk '
+ /^=== mount node=1 ===/ { in_section = 1; next }
+ /^=== mount node=/ { in_section = 0; next }
+ in_section
+ ' "$DUMP_BS"
+ else
+ echo "(bootstrap_dump sysfs unavailable — no active mounts)"
+ fi
+}
+
+count_claimed() {
+ dump_slots | grep -c "status=CLAIMED" || true
+}
+
+slot0_status() {
+ dump_slots | grep '^slot\[0\]' | grep -oE "status=[A-Z]+" | cut -d= -f2 || echo "?"
+}
+
+# dmesg scan: look for formatter/joiner election log lines added in super.c
+dmesg_formatter_count() {
+ dmesg | grep -c "bootstrap: formatter elected" 2>/dev/null || true
+}
+dmesg_joiner_count() {
+ dmesg | grep -c "bootstrap: joiner waiting for format" 2>/dev/null || true
+}
+
+# ---------------------------------------------------------------------------
+# Cleanup on exit (unless --keep)
+# ---------------------------------------------------------------------------
+cleanup() {
+ local rc=$?
+ if [[ $rc -ne 0 ]] && [[ "$KEEP_ON_FAIL" == true ]]; then
+ info "Keeping mounts for inspection (--keep). Module: $MODULE_NAME"
+ info "Dump: cat $DUMP_BS"
+ info "dmesg | grep bootstrap"
+ return
+ fi
+ unload_module 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# ---------------------------------------------------------------------------
+# Device-wipe helper (used between tests to start from fresh state)
+# ---------------------------------------------------------------------------
+DAX_ZERO="${SCRIPT_DIR}/dax_zero"
+wipe_device() {
+ if [[ -x "$DAX_ZERO" ]]; then
+ info "wiping $DAX_DEVICE for fresh state"
+ "$DAX_ZERO" "$DAX_DEVICE" 2097152 >/dev/null || die "dax_zero failed"
+ else
+ info "warning: $DAX_ZERO not found; skipping wipe"
+ fi
+}
+
+# ---------------------------------------------------------------------------
+# T1: stuck-formatter recovery
+# ---------------------------------------------------------------------------
+info "================================================================"
+info "T1: stuck-formatter recovery (inject_stuck + steal path)"
+info "================================================================"
+
+wipe_device
+load_module
+
+# Shorten stuck-detection timeout for fast test
+set_param "bootstrap_format_timeout_ms" "2000"
+
+# Activate fault injection BEFORE node A mounts (module param readable during fill_super)
+set_param "bootstrap_inject_stuck_formatter" "1"
+info "T1: inject_stuck=1, format_timeout_ms=2000"
+
+# Node A mounts — wins slot[0], writes status=FORMATTING, then inject path
+# returns -EAGAIN deliberately (mount fails) but bootstrap_release is skipped,
+# so slot[0] stays at FORMATTING for node B to detect.
+info "T1: mounting node A (mount expected to FAIL with -EAGAIN; slot[0] kept at FORMATTING)"
+if mount_node "a" 2>/dev/null; then
+ fail "T1: node A mount succeeded (expected -EAGAIN injection)"
+ exit 1
+else
+ info "T1: node A mount failed as expected (inject_stuck path)"
+fi
+
+# Confirm slot[0] is FORMATTING via dmesg (bootstrap_dump sysfs requires a live sb)
+if dmesg | tail -n 50 | grep -q "stuck-formatter injection active"; then
+ info "T1: dmesg confirms inject_stuck path executed"
+else
+ fail "T1: dmesg missing inject_stuck log line"
+ exit 1
+fi
+
+# Clear inject flag so node B formats normally after stealing
+set_param "bootstrap_inject_stuck_formatter" "0"
+info "T1: inject_stuck cleared for node B"
+
+# Record dmesg baseline
+dmesg_before=$(dmesg | wc -l)
+
+# Node B mounts — should wait ~2s, detect stuck, steal slot[0], re-format
+info "T1: mounting node B (joiner → should detect stuck → steal → format)"
+if mount_node "b" "" 15; then
+ pass "T1: node B mount succeeded after stuck-formatter recovery"
+else
+ fail "T1: node B mount timed out or failed (steal path broken)"
+fi
+
+# Verify slot[0] is CLAIMED (formatter stole and promoted)
+sleep 0.3 # let sysfs registration settle
+s0=$(slot0_status)
+if [[ "$s0" == "CLAIMED" ]]; then
+ pass "T1: slot[0] = CLAIMED after recovery"
+else
+ echo " DEBUG: DUMP_BS=$DUMP_BS readable=$( [[ -r "$DUMP_BS" ]] && echo yes || echo no )"
+ echo " DEBUG: ls /sys/fs/${MODULE_NAME}/debug/:"
+ ls -la "/sys/fs/${MODULE_NAME}/debug/" 2>&1 | sed 's/^/ /'
+ echo " DEBUG: raw dump (between markers):"
+ echo " >>>>"
+ cat "$DUMP_BS" 2>&1 | sed 's/^/ /'
+ echo " <<<<"
+ fail "T1: slot[0] = $s0 (expected CLAIMED)"
+fi
+
+# Verify dmesg shows steal path
+if dmesg | tail -n +"$dmesg_before" | grep -q "stole slot\[0\]"; then
+ pass "T1: dmesg confirms slot[0] steal"
+else
+ fail "T1: dmesg missing 'stole slot[0]' log (steal path not exercised?)"
+fi
+
+# Cleanup T1 (node A's mount failed by design, no umount needed)
+umount_node "b"
+unload_module
+set +e; rmdir "${MNT_BASE}_a" "${MNT_BASE}_b" 2>/dev/null; set -e
+
+# ---------------------------------------------------------------------------
+# T2: concurrent mount race — N nodes mount simultaneously, M iterations
+#
+# Stress design (single-host limitations acknowledged):
+# 1. File barrier so all N subshells reach mount() syscall together
+# (otherwise fork serialization makes 1st claim before 8th is even forked).
+# 2. Per-thread random microsecond jitter post-barrier so different
+# iterations probe different race phasings.
+# 3. Loop M iterations. Each iteration: wipe device → load → barrier-release
+# → wait → verify → unmount → unload. Aggregates stats across iterations.
+# 4. Verifies: every iteration produces N unique CLAIMED slots, exactly 1
+# formatter elected, no mount returns non-zero.
+# ---------------------------------------------------------------------------
+NUM_ITERATIONS_T2="${MARUFS_T2_ITERS:-10}"
+info "================================================================"
+info "T2: concurrent mount race ($NUM_NODES_T2 nodes × $NUM_ITERATIONS_T2 iterations)"
+info "================================================================"
+
+DAX_ZERO="${SCRIPT_DIR}/dax_zero"
+if [[ ! -x "$DAX_ZERO" ]]; then
+ fail "T2: $DAX_ZERO not found — cannot wipe between iterations"
+ exit 1
+fi
+
+# Helper: count dmesg occurrences of pattern since baseline line number
+dmesg_count_since() {
+ local baseline="$1" pattern="$2"
+ dmesg | tail -n +"$baseline" | grep -c "$pattern" || true
+}
+
+# Helper: extract a field from all CLAIMED slots in dump (e.g., node_id, token)
+extract_claimed_field() {
+ local field="$1"
+ dump_slots | grep "status=CLAIMED" | grep -oE "${field}=[0-9a-fx]+"
+}
+
+# Spawn N concurrent mounts via barrier file; populates rc files in $TMPDIR_RC
+spawn_concurrent_mounts() {
+ local n_mounts="$1" barrier_file="$2"
+ local n mp src
+ rm -f "$barrier_file" "${TMPDIR_RC}"/rc_*
+ for ((n = 1; n <= n_mounts; n++)); do
+ mp="${MNT_BASE}_t2_${n}"
+ src="marufs_t2_${n}"
+ mkdir -p "$mp"
+ (
+ # Spin-wait barrier — minimal cost, releases all simultaneously
+ while [[ ! -e "$barrier_file" ]]; do : ; done
+ # Random microsecond jitter (0-2ms) to scatter entries within
+ # the same kernel scheduler tick — reorders thread arrival
+ # at bootstrap_claim's CAS-less write+reread.
+ local jitter=$((RANDOM % 2000))
+ usleep "$jitter" 2>/dev/null || sleep 0
+ mount -t "${MODULE_NAME}" \
+ -o "daxdev=${DAX_DEVICE},me_strategy=${ME_STRATEGY}" \
+ "$src" "$mp" 2>/dev/null
+ echo $? > "${TMPDIR_RC}/rc_${n}"
+ ) &
+ done
+ # Give subshells time to reach the barrier (200ms) then release them
+ sleep 0.2
+ touch "$barrier_file"
+ wait
+}
+
+# Tally rc files; sets iter_ok and iter_fail in caller scope
+tally_rc_files() {
+ local n_mounts="$1"
+ local n rc_file
+ iter_ok=0
+ iter_fail=0
+ for ((n = 1; n <= n_mounts; n++)); do
+ rc_file="${TMPDIR_RC}/rc_${n}"
+ if [[ -f "$rc_file" ]] && [[ "$(cat "$rc_file")" -eq 0 ]]; then
+ iter_ok=$((iter_ok + 1))
+ else
+ iter_fail=$((iter_fail + 1))
+ fi
+ done
+}
+
+# Verify per-iteration invariants; updates t2_iter_failures global
+verify_iter_invariants() {
+ local it="$1" iter_ok="$2" dmesg_baseline="$3"
+
+ # Mount count
+ if [[ $iter_ok -ne $NUM_NODES_T2 ]]; then
+ info " iter $it: $iter_ok/$NUM_NODES_T2 mounts succeeded"
+ t2_iter_failures=$((t2_iter_failures + 1))
+ fi
+
+ # CLAIMED slot count
+ local claimed
+ claimed=$(count_claimed)
+ if [[ "$claimed" -ne "$NUM_NODES_T2" ]]; then
+ info " iter $it: $claimed CLAIMED slots (expected $NUM_NODES_T2)"
+ t2_iter_failures=$((t2_iter_failures + 1))
+ fi
+
+ # Unique node_ids
+ local all_ids uniq_ids
+ all_ids=$(extract_claimed_field "node_id" | sort)
+ uniq_ids=$(echo "$all_ids" | sort -u)
+ if [[ "$all_ids" != "$uniq_ids" ]]; then
+ info " iter $it: duplicate node_ids: $all_ids"
+ t2_iter_failures=$((t2_iter_failures + 1))
+ fi
+
+ # Token uniqueness
+ local all_toks uniq_toks n_toks n_uniq_toks
+ all_toks=$(extract_claimed_field "token" | sort)
+ uniq_toks=$(echo "$all_toks" | sort -u)
+ n_toks=$(echo "$all_toks" | wc -l)
+ n_uniq_toks=$(echo "$uniq_toks" | wc -l)
+ if [[ "$n_toks" -ne "$n_uniq_toks" ]] || [[ "$n_toks" -ne "$NUM_NODES_T2" ]]; then
+ info " iter $it: token count=$n_toks unique=$n_uniq_toks (expect $NUM_NODES_T2 unique)"
+ t2_iter_failures=$((t2_iter_failures + 1))
+ fi
+
+ # dmesg claim trace count
+ local iter_claims n_iter_claims
+ iter_claims=$(dmesg | tail -n +"$dmesg_baseline" \
+ | grep -oE "bootstrap: node_id=[0-9]+ \(slot [0-9]+\)" | sort -u)
+ n_iter_claims=$(echo "$iter_claims" | grep -c . || true)
+ if [[ "$n_iter_claims" -ne "$iter_ok" ]]; then
+ info " iter $it: dmesg claim lines=$n_iter_claims, expected=$iter_ok"
+ t2_iter_failures=$((t2_iter_failures + 1))
+ fi
+}
+
+# Run a single T2 iteration; updates t2_total_* globals
+run_t2_iteration() {
+ local it="$1"
+ info "T2: --- iteration $it/$NUM_ITERATIONS_T2 ---"
+
+ "$DAX_ZERO" "$DAX_DEVICE" 2097152 >/dev/null || die "dax_zero failed"
+ load_module
+ set_param "bootstrap_format_timeout_ms" "30000"
+ # Widen the race window between free-slot scan and CLAIMED write so
+ # concurrent mounters actually collide on the same slot. Without this,
+ # the race window is microseconds and 8 contending mounts almost never
+ # observe the same free slot — the CAS-less claim path is never exercised.
+ set_param "bootstrap_debug_pre_write_delay_us" "${MARUFS_T2_DELAY_US:-5000}"
+
+ local dmesg_baseline
+ dmesg_baseline=$(dmesg | wc -l)
+ local barrier_file="${TMPDIR_RC}/barrier_${it}"
+
+ spawn_concurrent_mounts "$NUM_NODES_T2" "$barrier_file"
+
+ local iter_ok iter_fail
+ tally_rc_files "$NUM_NODES_T2"
+
+ t2_total_ok=$((t2_total_ok + iter_ok))
+ t2_total_fail=$((t2_total_fail + iter_fail))
+
+ verify_iter_invariants "$it" "$iter_ok" "$dmesg_baseline"
+
+ local iter_fmt iter_steal iter_race claimed
+ claimed=$(count_claimed)
+ iter_fmt=$(dmesg_count_since "$dmesg_baseline" "bootstrap: formatter elected")
+ iter_steal=$(dmesg_count_since "$dmesg_baseline" "stole slot")
+ iter_race=$(dmesg_count_since "$dmesg_baseline" "claim race lost")
+ t2_total_formatter=$((t2_total_formatter + iter_fmt))
+ t2_total_steal_attempts=$((t2_total_steal_attempts + iter_steal))
+
+ info " iter $it: ok=$iter_ok fail=$iter_fail claimed=$claimed formatter=$iter_fmt steal=$iter_steal race-lost=$iter_race"
+
+ local n
+ for ((n = 1; n <= NUM_NODES_T2; n++)); do
+ umount_node "t2_${n}"
+ done
+ unload_module
+}
+
+t2_total_ok=0
+t2_total_fail=0
+t2_total_formatter=0
+t2_total_steal_attempts=0
+t2_iter_failures=0
+
+for ((it = 1; it <= NUM_ITERATIONS_T2; it++)); do
+ run_t2_iteration "$it"
+done
+
+# Aggregate verdict
+expected_total=$((NUM_NODES_T2 * NUM_ITERATIONS_T2))
+if [[ $t2_total_ok -eq $expected_total ]]; then
+ pass "T2: all $expected_total mounts succeeded across $NUM_ITERATIONS_T2 iterations"
+else
+ fail "T2: $t2_total_ok/$expected_total mounts succeeded ($t2_total_fail failed)"
+fi
+
+if [[ $t2_total_formatter -eq $NUM_ITERATIONS_T2 ]]; then
+ pass "T2: exactly 1 formatter per iteration (total $t2_total_formatter)"
+else
+ fail "T2: expected $NUM_ITERATIONS_T2 formatter elections, got $t2_total_formatter"
+fi
+
+if [[ $t2_iter_failures -eq 0 ]]; then
+ pass "T2: every iteration passed all sanity checks"
+else
+ fail "T2: $t2_iter_failures iterations had sanity-check failures"
+fi
+
+info "T2: aggregate stats — total ok=$t2_total_ok fail=$t2_total_fail formatter=$t2_total_formatter steal_attempts=$t2_total_steal_attempts"
+
+# ---------------------------------------------------------------------------
+# T3: serial mount → umount → remount (slot reuse sanity)
+# ---------------------------------------------------------------------------
+info "================================================================"
+info "T3: serial mount → umount → remount (slot reuse)"
+info "================================================================"
+
+load_module
+set_param "bootstrap_format_timeout_ms" "30000"
+
+info "T3: first mount"
+if mount_node "t3"; then
+ pass "T3: first mount succeeded"
+else
+ fail "T3: first mount failed"
+fi
+
+s0=$(slot0_status)
+if [[ "$s0" == "CLAIMED" ]]; then
+ pass "T3: slot[0] = CLAIMED after first mount"
+else
+ fail "T3: slot[0] = $s0 after first mount (expected CLAIMED)"
+fi
+
+info "T3: unmounting"
+umount_node "t3"
+
+# Slot[0] should now be EMPTY (graceful umount path)
+# Need at least one mount active for dump sysfs to work; mount briefly again
+# Actually slot status is in CXL memory; check via re-mount logging
+# Instead: remount and check dmesg — if slot reused it won't re-format
+dmesg_before_t3=$(dmesg | wc -l)
+
+info "T3: remount (should reuse EMPTY slot without re-formatting)"
+if mount_node "t3"; then
+ pass "T3: remount succeeded"
+else
+ fail "T3: remount failed"
+fi
+
+s0_after=$(slot0_status)
+if [[ "$s0_after" == "CLAIMED" ]]; then
+ pass "T3: slot[0] = CLAIMED after remount"
+else
+ fail "T3: slot[0] = $s0_after after remount (expected CLAIMED)"
+fi
+
+# Verify no second format happened (formatter elected log would appear again only if format needed)
+fmt_after=$(dmesg | tail -n +"$dmesg_before_t3" | grep -c "bootstrap: formatter elected" || true)
+# Second mount on already-formatted device: slot[0] EMPTY → claim succeeds,
+# needs_format=false → no formatter elected log (slot_idx=0 && !needs_format branch).
+if [[ "$fmt_after" -eq 0 ]]; then
+ pass "T3: no re-format on remount of existing device"
+else
+ fail "T3: unexpected re-format on remount ($fmt_after formatter-elected events)"
+fi
+
+umount_node "t3"
+unload_module
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo ""
+echo "Bootstrap chaos: PASS=$PASS FAIL=$FAIL"
+if [[ $FAIL -eq 0 ]]; then
+ echo "All bootstrap chaos tests PASSED."
+ exit 0
+else
+ echo "Some bootstrap chaos tests FAILED."
+ exit 1
+fi
diff --git a/marufs_kernel/tests/test_chown_race.c b/marufs_kernel/tests/test_chown_race.c
new file mode 100644
index 0000000..e968200
--- /dev/null
+++ b/marufs_kernel/tests/test_chown_race.c
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * test_chown_race.c - CHOWN concurrency and timing attack tests
+ *
+ * Validates that the CAS-based atomic ownership transfer in MARUFS_IOC_CHOWN
+ * correctly handles race conditions, concurrent attacks, and GC interactions.
+ *
+ * Usage:
+ * test_chown_race