diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index a18a727..50f03a7 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -20,7 +20,7 @@ jobs:
         arch:
           - x64
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
         with:
           version: ${{ matrix.version }}
diff --git a/Project.toml b/Project.toml
index 23ecd8e..9b4671f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,19 +1,26 @@
 name = "CachedInterpolations"
 uuid = "b9709bfb-d23d-5560-8276-8c35c4b76823"
+version = "1.0.0"
 authors = ["Tim Holy <tim.holy@gmail.com>"]
-version = "0.1.1"
 
 [deps]
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
+Aqua = "0.8"
+Documenter = "1"
+ExplicitImports = "1"
 Interpolations = "0.14 - 1"
 StaticArrays = "1"
+Test = "1"
 julia = "1.10"
 
 [extras]
+Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
+test = ["Aqua", "Documenter", "ExplicitImports", "Test"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f54218f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,90 @@
+# CachedInterpolations.jl
+
+[![CI](https://github.com/HolyLab/CachedInterpolations.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/HolyLab/CachedInterpolations.jl/actions/workflows/CI.yml)
+[![codecov](https://codecov.io/gh/HolyLab/CachedInterpolations.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/HolyLab/CachedInterpolations.jl)
+[![version](https://juliahub.com/docs/General/CachedInterpolations/stable/version.svg)](https://juliahub.com/ui/Packages/General/CachedInterpolations)
+
+Performance-optimized quadratic B-spline interpolation for large or memory-mapped arrays.
+
+## The problem
+
+When `P` is a large array stored on disk as a memory-mapped file, evaluating
+`P[y1, y2, i1, i2]` at many `(i1, i2)` tile positions with slowly-varying
+floating-point coordinates `(y1, y2)` incurs repeated disk I/O — even when the
+same 3×3 neighborhood of `P` is accessed on consecutive iterations.
+
+## The solution
+
+`CachedInterpolations` wraps `P` in an array of lightweight
+`CachedInterpolation` objects — one per tile `(i1, i2)`. Each object caches
+the 3×3×... block of `P` centered on the most-recent evaluation point.
+When the interpolating coordinates change only slightly between calls (e.g.,
+in a gradient-descent loop), the cache is reused and disk I/O is avoided.
+
+## Installation
+
+```julia
+using Pkg
+Pkg.add("CachedInterpolations")
+```
+
+## Usage
+
+```julia
+using CachedInterpolations
+
+# A 3×2 array: two tiles of length 3 each
+A = [0.0 0.0; 1.0 2.0; 0.0 0.0]   # size (3, 2), N=1 interpolating dim, 1 tiling dim
+
+itps = cachedinterpolators(A, 1)   # returns a length-2 Array of CachedInterpolation
+
+itps[1](2.0)   # quadratic-interpolated value at position 2.0 in tile 1 → 0.75
+itps[2](2.0)   # same position in tile 2 → 1.5
+```
+
+Repeated calls near the same coordinate reuse the cached 3-element block and
+avoid re-reading from `A` (or from disk, if `A` is memory-mapped).
+
+### Centered coordinate axes
+
+Pass an `origin` tuple to shift the visible axis so that a convenient index
+(e.g., the center of the array) maps to zero:
+
+```julia
+A = reshape([0.0; 1.0; 0.0], (3, 1))
+itps = cachedinterpolators(A, 1, (2,))  # index 2 → coordinate 0
+
+axes(itps[1])    # (-1:1,)
+itps[1](0.0)     # 0.75 — the peak, now addressed as 0
+itps[1](-0.5)    # off-center
+```
+
+### Gradient evaluation
+
+`Interpolations.gradient` works on a `CachedInterpolation` with the same call
+signature as on a regular `Interpolations` interpolant:
+
+```julia
+using Interpolations
+g = Interpolations.gradient(itps[1], 0.0)   # SVector of partial derivatives
+```
+
+> **Note:** `gradient` assumes you have already called `itp(y...)` at the same
+> coordinate to populate the cache. Calling `gradient` without a prior `itp(y...)`
+> at the same location returns incorrect results.
+
+### Working with memory-mapped arrays
+
+```julia
+using Mmap
+
+# Memory-map a large on-disk array
+open("bigdata.bin") do io
+    P = Mmap.mmap(io, Array{Float64, 4}, (256, 256, 100, 50))
+    itps = cachedinterpolators(P, 2)   # first 2 dims interpolating, last 2 tiling
+
+    # Evaluate at floating-point coords across all tiles — cache avoids re-reading
+    # the same 3×3 block when coords change slowly
+    result = [itps[i, j](128.0, 128.0) for i in 1:100, j in 1:50]
+end
+```
diff --git a/src/CachedInterpolations.jl b/src/CachedInterpolations.jl
index 50410e1..66382df 100644
--- a/src/CachedInterpolations.jl
+++ b/src/CachedInterpolations.jl
@@ -1,7 +1,8 @@
 module CachedInterpolations
 
-using Interpolations, StaticArrays
+using Interpolations: Interpolations, AbstractInterpolation, BSpline, InPlace, NoInterp, OnCell, Quadratic
 using Interpolations: weightedindexes, value_weights, gradient_weights, InterpGetindex
+using StaticArrays: StaticArrays, SVector
 import Base: getindex
 
 export CachedInterpolation, cachedinterpolators
@@ -19,12 +20,12 @@ quadratic interpolation of a large multidimensional array.  The first
     end
 ```
 where `x_1`, `x_2` are floating-point indexes and `i_1`, `i_2` are
-integer indexes.  A CachedInterpolation simulates an array-of-arrays
+integer indexes.  A `CachedInterpolation` simulates an array-of-arrays
 interface for this task, while in reality using only a single
 underlying `P` array.
 
 The performance enhancement comes from caching: when `P` is bigger
-than the computer's memory, `P` may be a mmapped-file, and direct
+than the computer's memory, `P` may be a memory-mapped file, and direct
 access to values of `P` will therefore be limited by disk I/O. If one
 has a task in which all elements of `B` have to be evaluated
 repeatedly for different values of `y_1`, `y_2`, but that these values
@@ -42,17 +43,39 @@ Create an array-of-interpolating arrays with the function
 CachedInterpolations
 
 """
-A single `CachedInterpolation` represents a "movable"
-3-by-3-by... view of `P[:, :, i_1, i_2]` for a specific `(i_1,
-i_2)`. An array of these objects thus implements an array-of-arrays
-interface. Create them with `cachedinterpolators`.
+    CachedInterpolation
+
+A single interpolating tile over a tiling array. A `CachedInterpolation`
+represents a "movable" 3×3×... view of `P[:, :, i_1, i_2]` for a specific
+`(i_1, i_2)` pair. Calling `itp(y_1, y_2)` returns the scalar
+quadratic-interpolated value at position `(y_1, y_2)` (floating-point or
+integer coordinates), caching the surrounding 3×3×... block of `P` to avoid
+redundant I/O when coordinates change by only a small amount between calls.
+Integer-index access is also available via `itp[y_1, y_2]`.
+
+Create arrays of `CachedInterpolation`s with [`cachedinterpolators`](@ref).
+
+# Examples
+```jldoctest
+julia> A = reshape([0.0; 1.0; 0.0], (3, 1));
+
+julia> itps = cachedinterpolators(A, 1);
+
+julia> itp = itps[1];
+
+julia> itp(2.0)   # quadratic-interpolated peak of [0, 1, 0]
+0.75
+
+julia> itp(1.5)   # off-center
+0.5
+```
 """
 mutable struct CachedInterpolation{T, N, M, O, K} <: AbstractInterpolation{T, N, BSpline{Quadratic{InPlace}}}
     # Note: M = N+K
-    coefs::Array{T, M}   # tiled array of 3x3x... buffers
-    parent::Array{T, M}  # the overall array (`P` in the documentation above)
+    const coefs::Array{T, M}   # tiled array of 3x3x... buffers
+    const parent::Array{T, M}  # the overall array (`P` in the documentation above)
     center::NTuple{N, Int}  # rounded (y_1, y_2) of prev. eval for this tile
-    tileindex::CartesianIndex{K}
+    const tileindex::CartesianIndex{K}
 end
 
 const splitN = Base.IteratorsMD.split
@@ -65,26 +88,48 @@ Base.axes(itp::CachedInterpolation{T, N, M, O}, d) where {T, N, M, O} =
     d <= N ? axes(itp.parent, d) .- O[d] : Base.OneTo(1)
 
 """
+    cachedinterpolators(parent::Array, N)
+    cachedinterpolators(parent::Array, N, origin)
 
-`itps = cachedinterpolators(parent, N, [origin=(0,0,...)])` creates an
-array-of-CachedInterpolation arrays from an underlying `parent` array,
-where the first `N` dimensions of `parent` are interpolating.  The
-equivalent of
-```
-    parent[y_1, y_2, i_1, i_2]
-```
-becomes
-```
-    itp = itps[i_1, i_2]
-    itp[y_1, y_2]
+Create an `Array` of [`CachedInterpolation`](@ref) objects from the dense array
+`parent`, where the first `N` dimensions of `parent` are interpolating and the
+remaining dimensions are tiling. `parent` must be a dense `Array`; subtypes of
+`AbstractArray` such as `SubArray` or `SharedArray` are not supported.
+
+The expression `parent[y_1, y_2, i_1, i_2]` becomes:
+
+```julia
+itp = itps[i_1, i_2]
+itp(y_1, y_2)
 ```
-and `itp` caches the needed values of `parent[:,:,i_1,i_2]` centered
-around `y_1, y_2`.
 
-Optionally specify the `origin` argument to offset the `y_i`
-coordinates within a tile.  For example, one can mimic a
-`CenterIndexedArray` when `size(parent, d)` is odd for the first `N`
-dimensions and you supply `origin = (div(size(parent,1)+1, 2), ...)`.
+Each `itp` caches a 3×3×... block of `parent` centered on the most-recent
+evaluation point, so repeated calls with nearby coordinates avoid redundant
+memory or disk I/O (useful when `parent` is a memory-mapped file).
+
+The optional `origin` argument is a tuple of `N` integers (default: all zeros)
+that offsets the coordinate system. With a nonzero `origin`, dimension `d` is
+addressed as `y_d + origin[d]` in `parent`, shifting the user-visible axes by
+`-origin`. For example, setting `origin = (div(size(parent,d)+1, 2) for d in
+1:N)` centers the axes at zero when the interpolating dimensions have odd size.
+
+# Examples
+```jldoctest
+julia> A = reshape([0.0; 1.0; 0.0], (3, 1));
+
+julia> itps = cachedinterpolators(A, 1);
+
+julia> itps[1](2.0)   # quadratic-interpolated peak of [0, 1, 0]
+0.75
+
+julia> itps_c = cachedinterpolators(A, 1, (2,));  # origin at index 2
+
+julia> axes(itps_c[1])   # coordinates shifted; 0 now addresses the peak
+(-1:1,)
+
+julia> itps_c[1](0.0)    # peak via centered coordinates
+0.75
+```
 """
 function cachedinterpolators(parent::Array{T, M}, N, origin = ntuple(d -> 0, N)) where {T, M}
     0 <= N <= M || error("N must be between 0 and $M")
@@ -127,13 +172,6 @@ end
     return icoefs[wis...]
 end
 
-struct CoefsWrapper{N, A}
-    coefs::A
-end
-
-Base.size(itp::CoefsWrapper{N}) where {N} = splitN(size(itp.coefs), Val(N))[1]
-Base.size(itp::CoefsWrapper{N}, d) where {N} = d <= N ? size(itp.coefs, d) : 1
-
 # FIXME: this function cheats dangerously, because it does _not_
 # update the cache. This is equivalent to the assumption you've called
 # getindex for the current `(x_1, x_2, ...)` location before calling
@@ -155,10 +193,6 @@ end
 ### Potential deprecations
 
 # if AbstractInterpolation <: AbstractArray goes away, this can be deprecated
-getindex(itp::CachedInterpolation{T, N, M, O, K}, xs::Vararg{Int, N}) where {T, N, M, O, K} = itp(xs...)
-
-### Deprecations
-
-@deprecate getindex(itp::CachedInterpolation{T, N, M, O, K}, xs::Vararg{Number, N}) where {T, N, M, O, K} itp(xs...)
+getindex(itp::CachedInterpolation{T, N, M, O, K}, xs::Vararg{Integer, N}) where {T, N, M, O, K} = itp(xs...)
 
 end # module
diff --git a/test/runtests.jl b/test/runtests.jl
index 6b54245..36fd355 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,5 @@
 import CachedInterpolations
-using Interpolations, Test
+using Aqua, Documenter, ExplicitImports, Interpolations, Test
 
 @testset "CachedInterpolations" begin
     A = reshape([0; 1; 0], (3, 1))
@@ -25,6 +25,14 @@ using Interpolations, Test
 
     @test Interpolations.gradient(C[1, 2, 2], 3.2, 3.8) === Interpolations.gradient(Ai, 3.2, 3.8, 1, 2, 2)
 
+    # gradient!
+    g = zeros(2)
+    Interpolations.gradient!(g, C[1, 2, 2], 3.2, 3.8)
+    @test g ≈ collect(Interpolations.gradient(C[1, 2, 2], 3.2, 3.8))
+
+    # getindex with Integer arguments
+    @test C[1, 2, 2][3, 4] == C[1, 2, 2](3.0, 4.0)
+
     # With origin
     C = CachedInterpolations.cachedinterpolators(A, 2, (4, 4))
     @test C[1, 2, 2](-0.8, 0.8) ≈ Ai(3.2, 4.8, 1, 2, 2)
@@ -32,10 +40,36 @@ using Interpolations, Test
     @test C[1, 2, 2](-0.8, -0.2) ≈ Ai(3.2, 3.8, 1, 2, 2)
     @test Interpolations.gradient(C[1, 2, 2], -0.8, -0.2) ≈ Interpolations.gradient(Ai, 3.2, 3.8, 1, 2, 2)
 
+    # axes with origin offset
+    c_orig = C[1, 1, 1]
+    @test axes(c_orig) == (-3:3, -3:3)
+    @test axes(c_orig, 1) == -3:3
+    @test axes(c_orig, 3) == Base.OneTo(1)
+
     # Check for Float32 with Float64 indexes, since that's the
     # default mismatch case
     A = rand(Float32, 7, 7, 2, 2, 3)
     Ai = interpolate!(A, (Q, Q, NoInterp(), NoInterp(), NoInterp()))
     C = CachedInterpolations.cachedinterpolators(A, 2, (4, 4))
     @test @inferred(C[1, 2, 2](-0.8, 0.8)) ≈ Ai(3.2, 4.8, 1, 2, 2)
+
+    @testset "Doctests" begin
+        DocMeta.setdocmeta!(CachedInterpolations, :DocTestSetup, :(using CachedInterpolations); recursive=true)
+        doctest(CachedInterpolations; manual=false)
+    end
+
+    @testset "Aqua" begin
+        Aqua.test_all(CachedInterpolations)
+    end
+
+    @testset "ExplicitImports" begin
+        # weightedindexes, value_weights, gradient_weights, InterpGetindex, gradient, gradient!
+        # are non-public Interpolations internals required by this package's implementation.
+        # Base.IteratorsMD.split is a non-public Base internal with no public alternative.
+        test_explicit_imports(
+            CachedInterpolations;
+            all_explicit_imports_are_public=(; ignore=(:weightedindexes, :value_weights, :gradient_weights, :InterpGetindex)),
+            all_qualified_accesses_are_public=(; ignore=(:IteratorsMD, :gradient, :gradient!, :split, :OneTo)),
+        )
+    end
 end