multikernel · SiyuanSun0736 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/BUILTINS.md b/BUILTINS.md
@@ -98,7 +98,7 @@ fn main() -> i32 {
     - `flags`: Attachment flags (context-dependent)
 - Perf event form:
     - `handle`: Program handle returned from `load()`
-    - `opts`: `perf_options` value — only `perf_type` and `perf_config` are required; all other fields have defaults
+    - `opts`: `perf_options` value — only `perf_type` and `perf_config` are required; all other fields have defaults, including no grouping (`group` defaults to an invalid attachment and `group_fd` defaults to `-1`)
     - `flags`: Must be `0` for perf attaches; nonzero values are rejected
 
 **Return Value:**
@@ -120,8 +120,20 @@ var perf_att = attach(perf_prog, perf_options { perf_type: perf_type_hardware, p
 var count = read(perf_att)
 detach(perf_att)
 detach(perf_prog)
+
+// Grouped perf events: branch joins the group led by cache. Adding a member restarts the whole group from zero.
+var cache = attach(perf_prog, perf_options { perf_type: perf_type_hardware, perf_config: cache_misses }, 0)
+var branch = attach(perf_prog, perf_options {
+    perf_type: perf_type_hardware,
+    perf_config: branch_misses,
+    group: cache,
+}, 0)
+detach(branch)
+detach(cache)
 ```
 
+Grouped events are scheduled as one atomic PMU unit. Separate events and separate groups may be multiplexed, but members inside one group cannot be independently multiplexed. Statically visible groups that need more PMU counter slots than the target limit are rejected at compile time; override the detected/default limit with `KERNELSCRIPT_PERF_GROUP_MAX_EVENTS` when compiling for a different target.
+
 **Context-specific implementations:**
 - **eBPF:** Not available
 - **Userspace:** Uses `attach_bpf_program_by_fd` for standard targets and `ks_attach_perf_event` for perf events
@@ -163,15 +175,60 @@ detach(prog)  // Clean up
 **Variadic:** No
 **Context:** Userspace only
 
-**Description:** Read the current hardware/software counter value from a perf attachment.
+**Description:** Read the current hardware/software counter value from a perf attachment. If the kernel multiplexed the event, the value is scaled with `time_enabled / time_running`.
 
 **Parameters:**
 - `handle`: Perf attachment returned from `attach(handle, perf_options, flags)`
 
 **Return Value:**
-- Returns the raw 64-bit counter value on success
+- Returns the raw 64-bit counter value when no multiplexing occurred
+- Returns a scaled value when `time_running < time_enabled`
 - Returns `-1` on invalid/stale attachment or read failure
 - Reads use the attachment's `perf_fd` directly; the internal token detects copied handles used after detach.
+- Use `read_group(leader)` when you need a same-time group snapshot.
+
+---
+
+#### `read_raw(handle)`
+**Signature:** `read_raw(handle: PerfAttachment) -> i64`
+**Variadic:** No
+**Context:** Userspace only
+
+**Description:** Read the unscaled raw hardware/software counter value from a perf attachment.
+
+**Return Value:**
+- Returns the raw counter value
+- Returns `-1` on invalid/stale attachment or read failure
+
+---
+
+#### `read_details(handle)`
+**Signature:** `read_details(handle: PerfAttachment) -> PerfReadDetails`
+**Variadic:** No
+**Context:** Userspace only
+
+**Description:** Read raw, scaled, `time_enabled`, and `time_running` details for a perf attachment.
+
+**Return Value:**
+- `raw`: unscaled counter value
+- `scaled`: multiplex-corrected value, or `-1` on timing/read error
+- `time_enabled`: perf enabled time
+- `time_running`: perf running time
+
+---
+
+#### `read_group(leader)`
+**Signature:** `read_group(leader: PerfAttachment) -> PerfGroupRead`
+**Variadic:** No
+**Context:** Userspace only
+
+**Description:** Read a same-time snapshot from a perf event group leader. This enables `PERF_FORMAT_GROUP | PERF_FORMAT_ID` in generated perf events.
+
+**Return Value:**
+- `count`: number of entries returned, capped at 16
+- `values`: multiplex-scaled values from the snapshot
+- `ids`: perf event IDs for the returned values
+- `time_enabled` / `time_running`: timing fields used for scaling
 
 ---
 

diff --git a/README.md b/README.md
@@ -306,7 +306,7 @@ fn on_branch_miss(ctx: *bpf_perf_event_data) -> i32 {
 fn main() -> i32 {
     var prog = load(on_branch_miss)
 
-    // Minimal form — defaults: pid=-1 (all procs), cpu=0,
+    // Minimal form — defaults: pid=-1 (all procs), cpu=0, no group,
     // period=1_000_000, wakeup=1; perf attach flags must be 0
     var att = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses }, 0)
     var count = read(att)
@@ -318,6 +318,22 @@ fn main() -> i32 {
 }
 ```
 
+Perf events can share a kernel scheduling group by passing the leader attachment directly with `group`.
+The lower-level `group_fd: cache.perf_fd` form is still supported for compatibility:
+
+```kernelscript
+var cache = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: cache_misses }, 0)
+var branch = attach(prog, perf_options {
+    perf_type: perf_type_hardware,
+    perf_config: branch_misses,
+    group: cache,
+}, 0)
+```
+
+Adding a member restarts the whole group from zero. Detaching a leader cascades to any live members. A group competes for PMU counters as one atomic unit: different groups can be multiplexed over time, but members inside one group are not independently multiplexed. For statically visible groups, the compiler rejects groups that need more PMU counter slots than the target limit. The limit is read from known sysfs PMU caps when available, defaults to 4, and can be overridden with `KERNELSCRIPT_PERF_GROUP_MAX_EVENTS`.
+
+`read(att)` returns a multiplex-scaled count when the kernel reports `time_running < time_enabled`. Use `read_raw(att)` for the raw value, `read_details(att)` for raw/scaled/timing details, and `read_group(leader)` for a same-time group snapshot.
+
 **Available `perf_type` values:**
 
 | Enum value | Hardware/software event |

diff --git a/SPEC.md b/SPEC.md
@@ -461,7 +461,7 @@ fn main() -> i32 {
     var prog = load(my_handler)
 
     // Only perf_type + perf_config are required; all other fields use language-level defaults:
-    // pid=-1, cpu=0, period=1_000_000, wakeup=1, inherit/exclude_*=false
+    // pid=-1, cpu=0, no group, period=1_000_000, wakeup=1, inherit/exclude_*=false
     var misses = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses }, 0)
 
     // Override specific fields as needed:
@@ -473,8 +473,19 @@ fn main() -> i32 {
         exclude_kernel: true,
     }, 0)
 
-    print("misses=%lld cache=%lld", read(misses), read(cache))
+    // Put branch misses in cache's perf event group. Adding a member restarts
+    // the whole group from zero. The lower-level group_fd: cache.perf_fd form
+    // is still accepted.
+    var branch = attach(prog, perf_options {
+        perf_type: perf_type_hardware,
+        perf_config: branch_misses,
+        group: cache,
+    }, 0)
 
+    print("misses=%lld cache=%lld branch=%lld", read(misses), read(cache), read(branch))
+    var snapshot = read_group(cache)
+
+    detach(branch)
     detach(cache)  // IOC_DISABLE → bpf_link__destroy → close(perf_fd)
     detach(misses)
     detach(prog)
@@ -490,6 +501,8 @@ fn main() -> i32 {
 | `perf_config` | `u64` | *(required)* | `perf_event_attr.config` value for that type |
 | `pid` | `i32` | `-1` | -1 = all processes; ≥0 = specific PID |
 | `cpu` | `i32` | `0` | ≥0 = specific CPU; -1 = any CPU (pid must be ≥0) |
+| `group_fd` | `i32` | `-1` | -1 = standalone event; ≥0 = perf group leader fd |
+| `group` | `PerfAttachment` | invalid attachment | Preferred high-level group leader attachment |
 | `period` | `u64` | `1000000` | Sample after this many events |
 | `wakeup` | `u32` | `1` | Wake userspace after N samples |
 | `inherit` | `bool` | `false` | Inherit to forked children |
@@ -538,16 +551,39 @@ For event families with a richer config space, such as `perf_type_hw_cache`, pro
 |---|---|---|
 | `ks_open_perf_event` | `int (ks_perf_options)` | Calls `perf_event_open(2)`, returns fd |
 | `ks_attach_perf_event` | `PerfAttachment (int prog_fd, ks_perf_options, int flags)` | Full open-reset-attach-enable lifecycle |
-| `ks_read_perf_count` | `int64_t (int perf_fd)` | Reads current 64-bit counter via `read()` |
+| `ks_read_perf_count` | `int64_t (int perf_fd)` | Reads current counter and applies multiplex scaling when needed |
 | `ks_perf_attachment_read` | `int64_t (PerfAttachment)` | Direct fd read through the attachment value with stale-handle detection |
+| `ks_perf_attachment_read_raw` | `int64_t (PerfAttachment)` | Direct raw counter read with stale-handle detection |
+| `ks_perf_attachment_read_details` | `PerfReadDetails (PerfAttachment)` | Returns raw, scaled, `time_enabled`, and `time_running` |
+| `ks_perf_attachment_read_group` | `PerfGroupRead (PerfAttachment)` | Reads a same-time group snapshot from a leader attachment |
 
-**Attach sequence (compiler-generated, inside `ks_attach_perf_event`):**
+**Attach sequence for standalone events (compiler-generated, inside `ks_attach_perf_event`):**
 1. `ks_attr.attr.disabled = 1` — open counter without starting it  
-2. `syscall(SYS_perf_event_open, ...)` → `perf_fd`  
+2. `syscall(SYS_perf_event_open, ..., group_fd=-1, ...)` → `perf_fd`  
 3. `ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0)` — zero the counter  
 4. `bpf_program__attach_perf_event(prog, perf_fd)` — link BPF program  
 5. `ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0)` — **start counting**  
 
+**Perf event groups:**
+- `group: leader_attachment` is the preferred way to join a perf group.
+- `group_fd >= 0` opens the new event as a member of that leader fd.
+- Group members are opened disabled, linked to the BPF program, then the leader is disabled, reset, and enabled with `PERF_IOC_FLAG_GROUP`.
+- Adding a member to an already running group restarts the whole group from zero.
+- A group is scheduled as an atomic PMU unit. Separate events and separate groups may be multiplexed; members inside one group are not independently multiplexed. If a statically visible group needs more PMU counter slots than the target limit, compilation fails.
+- The compile-time group limit uses known sysfs PMU caps when available, falls back to `4`, and can be overridden with `KERNELSCRIPT_PERF_GROUP_MAX_EVENTS`.
+- `perf_type_software` and `perf_type_tracepoint` do not consume PMU counter slots for this check; static hardware/raw/cache/breakpoint events consume one slot, and dynamic `perf_type` values are conservatively counted as one slot.
+- Detaching a member is allowed. Detaching a leader cascades to any live members.
+- `read_group(leader)` enables `PERF_FORMAT_GROUP | PERF_FORMAT_ID` and returns up to 16 same-time group values plus perf IDs and timing fields.
+
+**Counter reads:**
+- Generated perf events request `PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING`.
+- `read(att)` returns the raw value when `time_enabled == time_running`.
+- If multiplexing occurred, `read(att)` returns `value * time_enabled / time_running` using a 128-bit intermediate.
+- If `time_running == 0`, `read(att)` reports an error and returns `-1`.
+- `read_raw(att)` returns the unscaled raw counter.
+- `read_details(att)` returns raw, scaled, `time_enabled`, and `time_running`.
+- `read_group(leader)` returns a snapshot struct; group `values[]` are scaled using the snapshot timing fields.
+
 **Detach sequence (compiler-generated):**
 1. `ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0)` — stop counting  
 2. `bpf_link__destroy(link)` — unlink BPF program  
@@ -559,7 +595,8 @@ For event families with a richer config space, such as `perf_type_hw_cache`, pro
 - Returns a first-class `PerfAttachment` value for perf attaches so one program can hold multiple live counters
 - `PerfAttachment` carries `perf_fd` plus an internal generation token; `read(attachment)` avoids global attachment-list scans and rejects copied handles after detach
 - Exposes omitted `perf_options` fields as language-level defaults (partial struct literal)
-- Validates `pid ≥ -1`, `cpu ≥ -1`, and rejects `pid == -1 && cpu == -1` at runtime
+- Validates `pid ≥ -1`, `cpu ≥ -1`, `group_fd ≥ -1`, and rejects `pid == -1 && cpu == -1` at runtime
+- Treats `group` as valid only when it carries a live `PerfAttachment` generation token; otherwise `group_fd` controls grouping
 - Emits `PERF_FLAG_FD_CLOEXEC` for safe fd inheritance
 - BPF program section is `SEC("perf_event")`
 

diff --git a/examples/perf_cache_miss.ks b/examples/perf_cache_miss.ks
@@ -11,19 +11,48 @@ fn on_cache_miss(ctx: *bpf_perf_event_data) -> i32 {
 fn main() -> i32 {
     var prog = load(on_cache_miss)
 
-    // Only perf_type + perf_config are required; pid, cpu, period, wakeup and flag fields
+    // Only perf_type + perf_config are required; pid, cpu, group/group_fd, period, wakeup and flag fields
     // default to: pid=-1 (all procs), cpu=0, period=1_000_000, wakeup=1,
-    // inherit/exclude_kernel/exclude_user=false.
+    // no group, inherit/exclude_kernel/exclude_user=false.
     var cache = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: cache_misses, period: 10000000, inherit: true }, 0)
-    var branch = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses, period: 10000000, inherit: true }, 0)
+    // branch joins cache's perf event group. Adding a member restarts the whole group from zero.
+    var branch = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses, period: 10000000, inherit: true, group: cache }, 0)
     print("Cache-miss and branch-miss perf_event demo attached")
     var cache_count = read(cache)
     print("Cache-miss count: %lld", cache_count)
     var branch_count = read(branch)
     print("Branch-miss count: %lld", branch_count)
+
+    var prev = read_details(cache)
+    // Simulate workload with cache misses and branch misses.
+    var x = 0
+    var i = 0
+    for (i in 0..10000000) {
+        if (i % 100 == 0) {
+            x = x + 1
+        } else {
+            x = x * 2
+        }
+    }
+    var cur = read_details(cache)
+    var delta = cur.scaled - prev.scaled
+    var dt_ns = cur.time_enabled - prev.time_enabled
+    if (dt_ns > 0) {
+        var per_sec = (delta * 1000000000) / dt_ns
+        print("Cache misses/sec: %lld", per_sec)
+    }
+
+    var snapshot = read_group(cache)
+    print("Grouped snapshot entries: %u", snapshot.count)
+
+    var snapshot_index = 0
+    while (snapshot_index < snapshot.count) {
+        print("id=%llu value=%lld", snapshot.ids[snapshot_index], snapshot.values[snapshot_index])
+        snapshot_index = snapshot_index + 1
+    }
 
-    detach(cache)
     detach(branch)
+    detach(cache)
     detach(prog)
     print("Cache-miss and branch-miss perf_event demo detached")
     return 0

diff --git a/examples/perf_page_fault.ks b/examples/perf_page_fault.ks
@@ -14,20 +14,26 @@ fn main() -> i32 {
     // pid: 0 = current process, cpu: -1 = any CPU (standard per-process monitoring).
     // page_faults (PERF_COUNT_SW_PAGE_FAULTS) is the most reliable software event:
     // every heap/stack allocation triggers minor page faults, no scheduler dependency.
-    var att = attach(prog, perf_options { perf_type: perf_type_software, perf_config: page_faults, pid: 0, cpu: -1, period: 1 }, 0)
-    print("Page-fault perf_event demo attached")
+    var page = attach(prog, perf_options { perf_type: perf_type_software, perf_config: page_faults, pid: 0, cpu: -1, period: 1 }, 0)
+    // branch is a separate hardware event attached without a group (neither group nor group_fd is set).
+    var branch = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses, period: 10000000, inherit: true}, 0)
+
+    print("perf_event demo attached")
 
     // Repeatedly increment a counter; stack/heap activity will generate page faults.
     var x: i64 = 0
     for (i in 0..10000000) {
         x = x + 1
     }
 
-    var count = read(att)
-    print("Page-fault count: %lld", count)
+    var page_fault_count = read(page)
+    print("Page-fault count: %lld", page_fault_count)
+    var branch_count = read(branch)
+    print("Branch-miss count: %lld", branch_count)
 
-    detach(att)
-    print("Page-fault perf_event demo detached")
+    detach(page)
+    detach(branch)
+    print("perf_event demo detached")
     detach(prog)
     return 0
 }
diff --git a/src/ir_generator.ml b/src/ir_generator.ml
@@ -877,7 +877,7 @@ let rec lower_expression ctx (expr : Ast.expr) =
                 emit_variable_decl_val ctx ptr_val ptr_val.val_type (Some ptr_expr) expr.expr_pos;
 
                 (* result = *ptr *)
-                let load_expr = make_ir_expr (IRValue ptr_val) element_type expr.expr_pos in
+                let load_expr = make_ir_expr (IRUnOp (IRDeref, ptr_val)) element_type expr.expr_pos in
                 emit_variable_decl_val ctx result_val element_type (Some load_expr) expr.expr_pos);
 
            result_val)
@@ -3572,4 +3572,4 @@ let generate_ir ?(use_type_annotations=false) ast symbol_table source_name =
   with
   | exn ->
       Printf.eprintf "IR generation failed: %s\n" (Printexc.to_string exn);
-      raise exn
+      raise exn