diff --git a/compiler/src/main/java/com/dylibso/chicory/compiler/internal/Compiler.java b/compiler/src/main/java/com/dylibso/chicory/compiler/internal/Compiler.java
index b432af314..96e393d05 100644
--- a/compiler/src/main/java/com/dylibso/chicory/compiler/internal/Compiler.java
+++ b/compiler/src/main/java/com/dylibso/chicory/compiler/internal/Compiler.java
@@ -100,7 +100,17 @@ public final class Compiler {
     private static final MethodType MACHINE_CALL_METHOD_TYPE =
             methodType(long[].class, Instance.class, Memory.class, int.class, long[].class);
 
-    private static final int MAX_MACHINE_CALL_METHODS = 1024; // must be power of two
+    // HotSpot won't JIT-compile methods over HugeMethodLimit (default 8000 bytecode bytes).
+    // Dispatch chunk sizes are derived from this limit (override: -Dchicory.hugeMethodLimit=N).
+    private static final int HUGE_METHOD_LIMIT =
+            Integer.getInteger("chicory.hugeMethodLimit", 8000);
+    // Upper bound: tableswitch offset (4) + label + invokestatic (~5) + areturn (1) + overhead
+    private static final int ESTIMATED_BYTES_PER_DISPATCH_ENTRY = 40;
+    // Power of two; clamped to >= 1 so a tiny or negative limit cannot yield a zero chunk size.
+    private static final int MAX_DISPATCH_METHODS =
+            Integer.highestOneBit(
+                    Math.max(1, HUGE_METHOD_LIMIT / ESTIMATED_BYTES_PER_DISPATCH_ENTRY));
+    private static final int MAX_CALL_INDIRECT_METHODS = 1024; // must be power of two
     // 1024*12 was empirically determined to work for the 50K small wasm functions.
     // So lets start there and halve it until we find a size that works.
     // This should give us the biggest class size possible.
@@ -781,11 +791,11 @@ private void compileMachineCallClass() { // static implementation for Machine.call() Consumer callMethod; - if (functionTypes.size() < MAX_MACHINE_CALL_METHODS) { + if (functionTypes.size() < MAX_DISPATCH_METHODS) { callMethod = asm -> compileMachineCallInvoke(asm, 0, functionTypes.size()); } else { // Best value that worked with the 50K small wasm functions - var maxMachineCallMethods = MAX_MACHINE_CALL_METHODS << 2; + var maxMachineCallMethods = MAX_DISPATCH_METHODS << 2; maxMachineCallMethods = loadChunkedClass( functionTypes.size(), @@ -1042,7 +1052,7 @@ private void compileCallIndirect( asm.load(instance, OBJECT_TYPE); // Can we fit the impl in a single method? - if (validIds.size() <= MAX_MACHINE_CALL_METHODS) { + if (validIds.size() <= MAX_CALL_INDIRECT_METHODS) { int[] keys = validIds.stream().mapToInt(x -> x).toArray(); Label[] labels = validIds.stream().map(x -> new Label()).toArray(Label[]::new); @@ -1069,10 +1079,10 @@ private void compileCallIndirect( .appendParameterTypes(Memory.class, Instance.class, int.class); // Best value that worked with the 50K small wasm functions - var maxMachineCallMethods = MAX_MACHINE_CALL_METHODS << 2; + var maxCallIndirectMethods = MAX_CALL_INDIRECT_METHODS << 2; loadChunkedClass( functionTypes.size(), - maxMachineCallMethods, + maxCallIndirectMethods, (collector, start, end, chunkSize) -> compileExtraClass( collector, @@ -1093,8 +1103,8 @@ private void compileCallIndirect( end)); })); - assert Integer.bitCount(maxMachineCallMethods) == 1; // power of two - int shift = Integer.numberOfTrailingZeros(maxMachineCallMethods); + assert Integer.bitCount(maxCallIndirectMethods) == 1; // power of two + int shift = Integer.numberOfTrailingZeros(maxCallIndirectMethods); // switch (funcId >> shift) Label[] labels = new Label[((functionTypes.size() - 1) >> shift) + 1]; diff --git a/jmh/src/main/java/com/dylibso/chicory/bench/BenchmarkDispatchChunkSize.java 
b/jmh/src/main/java/com/dylibso/chicory/bench/BenchmarkDispatchChunkSize.java
new file mode 100644
index 000000000..6875675e4
--- /dev/null
+++ b/jmh/src/main/java/com/dylibso/chicory/bench/BenchmarkDispatchChunkSize.java
@@ -0,0 +1,92 @@
+package com.dylibso.chicory.bench;
+
+import com.dylibso.chicory.compiler.MachineFactoryCompiler;
+import com.dylibso.chicory.runtime.ExportFunction;
+import com.dylibso.chicory.runtime.Instance;
+import com.dylibso.chicory.wabt.Wat2Wasm;
+import com.dylibso.chicory.wasm.Parser;
+import java.util.concurrent.TimeUnit;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+// Measures dispatch overhead when calling an export of a large wasm module (2000 functions)
+// instantiated with the MachineFactoryCompiler machine factory.
+//
+// Run once per setting to compare with/without the HugeMethodLimit-aware chunking:
+//   java -Dchicory.hugeMethodLimit=1000000 -jar benchmarks.jar BenchmarkDispatchChunkSize
+//       (no limit)
+//   java -jar benchmarks.jar BenchmarkDispatchChunkSize
+//       (default 8KB)
+//
+// Results (JDK 25, Apple M3 Max):
+// Benchmark                            (targetFunc)  Mode  Cnt  Score  Units
+// -- hugeMethodLimit=1000000 (chunks of 1024, exceeds C2 HugeMethodLimit) --
+// BenchmarkDispatchChunkSize.dispatch             0  avgt    5  0.033  us/op
+// BenchmarkDispatchChunkSize.dispatch           999  avgt    5  0.034  us/op
+// BenchmarkDispatchChunkSize.dispatch          1999  avgt    5  0.035  us/op
+// -- default (chunks of 128, under C2 HugeMethodLimit) --
+// BenchmarkDispatchChunkSize.dispatch             0  avgt    5  0.004  us/op
+// BenchmarkDispatchChunkSize.dispatch           999  avgt    5  0.004  us/op
+// BenchmarkDispatchChunkSize.dispatch          1999  avgt    5  0.004  us/op
+// NOTE(review): the chunk sizes in the section labels are as reported with the results;
+// confirm them against the limits computed in Compiler.
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Warmup(iterations = 3, time = 2)
+@Measurement(iterations = 5, time = 3)
+@Fork(1)
+@State(Scope.Benchmark)
+public class BenchmarkDispatchChunkSize {
+
+    // Number of exported functions f0..f(N-1) in the generated module.
+    private static final int NUM_FUNCTIONS = 2000;
+
+    // Index of the export to call; each value is benchmarked separately.
+    @Param({"0", "999", "1999"})
+    private int targetFunc;
+
+    // Export selected in setup() and invoked by the benchmark.
+    private ExportFunction exportFunc;
+
+    // Builds the module from generated WAT and looks up export "f<targetFunc>".
+    @Setup
+    public void setup() {
+        byte[] moduleBytes = Wat2Wasm.parse(generateWat(NUM_FUNCTIONS));
+        Instance compiled =
+                Instance.builder(Parser.parse(moduleBytes))
+                        .withMachineFactory(MachineFactoryCompiler::compile)
+                        .build();
+        exportFunc = compiled.export("f" + targetFunc);
+    }
+
+    // Emits WAT for a module of `count` functions; function i is exported as "f<i>" and
+    // returns its i32 argument plus (i + 1).
+    private static String generateWat(int count) {
+        StringBuilder sb = new StringBuilder("(module\n");
+        for (int idx = 0; idx < count; idx++) {
+            sb.append(" (func $f").append(idx)
+                    .append(" (export \"f").append(idx).append("\")")
+                    .append(" (param i32) (result i32)\n")
+                    .append(" local.get 0\n")
+                    .append(" i32.const ").append(idx + 1).append("\n")
+                    .append(" i32.add)\n");
+        }
+        return sb.append(")\n").toString();
+    }
+
+    // Calls the selected export with a fixed argument; the result goes to the Blackhole.
+    @Benchmark
+    public void dispatch(Blackhole bh) {
+        bh.consume(exportFunc.apply(42));
+    }
+}