Skip to content

location being interpreted as null #21

@eeholmes

Description

@eeholmes

Update: see the proposed solution by Copilot in eeholmes#2, which seems to fix this. Probably a duplicate of #3, though the Copilot solution worked with the current version of fzstd, so this is maybe a slightly different issue, or the Copilot solution simply happens to work with the current fzstd.

I am having a strange bug with one of my Icechunk stores. Notice location: null. These datasets open fine in Python, and files created in an identical way with VirtualiZarr+to_icechunk() open fine. Both Icechunk stores open fine with @earthmover/icechunk-js, but I cannot use that since it requires one to pass the credentials for the underlying data, and this is for a tool for generic opening of public Icechunk stores.

Here is the problem, which you can see from the repro.mjs below. I have tried to debug it, but this Icechunk store opens fine in Python and with @earthmover/icechunk-js, and every other Icechunk store I have tried works.

BAD ICECHUNK ChunkRef
ref: {
  index: [ 0, 0, 0 ],
  inline: null,
  offset: 50122,
  length: 69838,
  chunkId: null,
  location: null,
  checksumEtag: null,
  checksumLastModified: 1781018600
}

Here is a repro:

import * as zarr from "zarrita";
import { IcechunkStore } from "icechunk-js";

// HTTPS URL of the Icechunk repository used for this repro.
const repoUrl = "https://data.source.coop/eeholmes/cefi/nepacific-icechunk";

// Open the repository with icechunk-js, then open its root as a zarrita group.
const store = await IcechunkStore.open(repoUrl);
const root = await zarr.open(store, { kind: "group" });

/**
 * Open the array at `path` (relative to the root group) and read the single
 * element at index 0 along every dimension, logging each step.
 *
 * Any error raised while opening or reading is caught and reported on
 * stderr (name, message, stack) rather than propagated to the caller.
 *
 * @param {string} path - Array path; one leading "/" is stripped before
 *   the path is resolved against the root group.
 * @returns {Promise<void>}
 */
async function readFirstValue(path) {
  const separator = "=".repeat(80);
  console.log("\n" + separator);
  console.log("Reading", path);
  console.log(separator);

  try {
    // root.resolve() is given a path without the leading slash.
    const relativePath = path.replace(/^\//, "");
    const location = root.resolve(relativePath);
    const array = await zarr.open(location, { kind: "array" });

    console.log("metadata OK");
    console.log("shape:", array.shape);
    console.log("dtype:", array.dtype);
    console.log("dimensionNames:", array.dimensionNames);

    // One zero per dimension: selects the very first element of the array.
    const origin = array.shape.map(() => 0);
    console.log("selection:", origin);

    const firstValue = await zarr.get(array, origin);
    console.log("read OK:", firstValue);
  } catch (error) {
    console.error("read FAILED");
    console.error("name:", error?.name);
    console.error("message:", error?.message);
    console.error("stack:", error?.stack);
  }
}

console.log("Opened repo with icechunk-js:");
console.log(repoUrl);

// Read one array from each of two groups in the same repository so the
// working and failing cases can be compared side by side.
await readFirstValue("/group1/chlos");        // works
await readFirstValue("/test3/btm_htotal");    // fails

console.log("\nDone");
// Exit explicitly once both reads have been attempted.
process.exit(0);

Here is how the store that won't open was created.

# Where the data are
source_data_bucket = "s3://noaa-oar-cefi-regional-mom6-pds"
source_data_prefix = "northeast_pacific/full_domain/hindcast/daily/regrid/r20250912"
source_data_region = "us-east-1"

# Where my icechunk is
icechunk_bucket = "us-west-2.opendata.source.coop"
icechunk_prefix = "eeholmes/cefi/nepacific-icechunk"
icechunk_region = "us-west-2"

# Where my creds for my icechunk bucket are
icechunk_creds = "source-cefi-creds.json"

# Create an object-store handle for the REMOTE files.
# NOTE(review): from_url, ObjectStoreRegistry, HDFParser and icechunk are not
# imported anywhere in this snippet — presumably imported earlier; confirm.
# The source bucket is accessed without request signing (skip_signature=True).
store = from_url(source_data_bucket, region=source_data_region, skip_signature=True)
registry = ObjectStoreRegistry({source_data_bucket: store})
parser = HDFParser()

# Tell the store how to access the remote chunks
# The container is registered for the source bucket URL and uses an
# anonymous S3 store in the source data region.
config = icechunk.RepositoryConfig.default()
config.set_virtual_chunk_container(
    icechunk.VirtualChunkContainer(
        url_prefix=f"{source_data_bucket}/", # need that trailing /
        store=icechunk.s3_store(region=source_data_region, anonymous=True),
    ),
)

# Read in the json file
import json
from pathlib import Path

# Load the credentials for the bucket that holds the Icechunk repository.
with open(icechunk_creds) as f:
    source_creds = json.load(f)

# This uses info on the bucket where the icechunk store is
# NOTE(review): the region comes from the credentials file
# (source_creds["region_name"]); the icechunk_region variable defined above
# is not used in the visible snippet — confirm the two agree.
storage = icechunk.s3_storage(
    bucket=icechunk_bucket,
    prefix=icechunk_prefix,
    region=source_creds["region_name"],
    access_key_id=source_creds["aws_access_key_id"],
    secret_access_key=source_creds["aws_secret_access_key"],
    session_token=source_creds["aws_session_token"],
)

# Create store if it is empty
# NOTE(review): `except Exception` treats ANY failure of create() (not only
# "repository already exists") as a signal to open the existing repository.
try:
    repo = icechunk.Repository.create(storage, config)
    print("Created new Icechunk repo")
except Exception:
    repo = icechunk.Repository.open(storage, config=config)
    print("Opened existing Icechunk repo")

# Create a session
session = repo.writable_session(branch="main")

import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path

# Group inside the Icechunk store that the virtual dataset is written to.
# NOTE(review): this snippet writes to "test4", while the repro above reads
# "/test3/btm_htotal" — confirm which group is the failing one.
group_name = "test4"

def time_is_good(vds):
    """Report whether the ``time`` values of *vds* are unique and increasing.

    The values of ``vds.time`` are wrapped in a ``pandas.Index``; the result
    is truthy only when that index has no duplicates and is monotonically
    increasing.
    """
    time_index = pd.Index(vds.time.values)
    has_no_duplicates = time_index.is_unique
    # Mirror `a and b`: hand back the uniqueness flag itself when it is falsy.
    return time_index.is_monotonic_increasing if has_no_duplicates else has_no_duplicates

# NOTE(review): vds_list is never appended to or read in the visible snippet.
vds_list = []

# NOTE(review): vars_1 and get_urls are not defined in this snippet —
# presumably a list of variable names and a helper returning file URLs for a
# variable; confirm. Only the first URL of the fourth variable is used.
varname = vars_1[3]
var_url = get_urls(varname)[0]

print(f"Opening {varname:20s} {Path(var_url).name}")

# Open a single remote file as a virtual dataset. time, lat and lon are
# passed as loadable_variables; nv, ncrs and crs are dropped when present
# (errors="ignore" makes missing names a no-op).
vds = open_virtual_dataset(
    url=var_url,
    parser=parser,
    registry=registry,
    loadable_variables=["time", "lat", "lon"],
    decode_times=True,
).drop_vars(["nv", "ncrs", "crs"], errors="ignore")

# Write the virtual dataset into the session's store under group_name.
vds.vz.to_icechunk(
    session.store,
    group=group_name,
)

# Commit the session and report the returned snapshot id.
snapshot_id = session.commit("full-domain files")
print("Committed:", snapshot_id)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions