Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/cre-local-env-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ jobs:
uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a # v4.0.1
with:
aws-region: ${{ secrets.QA_AWS_REGION }}
role-to-assume: ${{ secrets.QA_AWS_ROLE_TO_ASSUME }}
role-to-assume: ${{ secrets.AWS_CTF_READ_ACCESS_ROLE_ARN }}
role-duration-seconds: 1800
mask-aws-account-id: true

Expand Down Expand Up @@ -151,7 +151,8 @@ jobs:
env:
DISABLE_DX_TRACKING: true
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_ECR: ${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.us-west-2.amazonaws.com
MAIN_AWS_ECR: ${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.us-west-2.amazonaws.com
SDLC_AWS_ECR: ${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.us-west-2.amazonaws.com
run: |
# Remove chip_ingress/chip_config sections since CI role lacks ECR permissions for the Atlas repo
awk '/^\[chip_ingress\.build_config\]/,/^$/{next} /^\[chip_ingress\.pull_config\]/,/^$/{next} /^\[chip_config\.build_config\]/,/^$/{next} /^\[chip_config\.pull_config\]/,/^$/{next} {print}' configs/setup.toml > configs/setup.toml.tmp && mv configs/setup.toml.tmp configs/setup.toml
Expand All @@ -165,6 +166,7 @@ jobs:
CTF_CONFIGS: "./configs/workflow-gateway-don.toml"
CTF_JD_IMAGE: "${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/job-distributor:0.22.1"
CTF_CHAINLINK_IMAGE: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/chainlink:${{ github.event_name == 'pull_request' && format('nightly-{0}-plugins', steps.set-date.outputs.date) || inputs.chainlink_image_tag }}"
CTF_CHIP_ROUTER_IMAGE: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/local-cre-chip-router:v1.0.0"
DISABLE_DX_TRACKING: "true"
CI: "true"
run: |
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/cre-regression-system-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ jobs:
# Beholder stack will be started only for the Beholder tests
CHIP_INGRESS_IMAGE: ${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/atlas-chip-ingress:da84cb72d3a160e02896247d46ab4b9806ebee2f
CHIP_CONFIG_IMAGE: ${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/atlas-chip-config:7b4e9ee68fd1c737dd3480b5a3ced0188f29b969
CTF_CHIP_ROUTER_IMAGE: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/local-cre-chip-router:v1.0.0"
BILLING_PLATFORM_SERVICE_IMAGE: ${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/billing-platform-service:v1.45.0

steps:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/cre-soak-memory-leak.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ jobs:
working-directory: system-tests/tests
env:
GITHUB_TOKEN: ${{ steps.github-token.outputs.access-token || '' }}
CTF_CHIP_ROUTER_IMAGE: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/local-cre-chip-router:v1.0.0"
run: |
gotestsum \
--jsonfile=/tmp/gotest.log \
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/cre-system-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ jobs:
env:
CTF_JD_IMAGE: "${{ secrets.AWS_ACCOUNT_ID_PROD }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/job-distributor:0.22.1"
CTF_CHAINLINK_IMAGE: "${{ steps.resolve-chainlink-image.outputs.resolved_image }}"
CTF_CHIP_ROUTER_IMAGE: "${{ secrets.QA_AWS_ACCOUNT_NUMBER }}.dkr.ecr.${{ secrets.QA_AWS_REGION }}.amazonaws.com/local-cre-chip-router:v1.0.0"
CTF_CONFIGS: ${{ matrix.tests.configs }}
CRE_VERSION: ${{ matrix.tests.cre_version }}
TEST_NAME: ${{ matrix.tests.test_name }}
Expand Down
84 changes: 50 additions & 34 deletions core/scripts/cre/environment/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Slack: #topic-local-dev-environments
- [Start Environment](#start-environment)
- [Using a pre-built Chainlink image](#using-a-pre-built-chainlink-image)
- [Beholder](#beholder)
- [Beholder vs. ChIP Test Sink](#beholder-vs-chip-test-sink-port-conflict-and-using-both-together)
- [Chip Router Topology](#chip-router-topology)
- [Storage](#storage)
- [Purging environment state](#purging-environment-state)
- [Stop Environment](#stop-environment)
Expand Down Expand Up @@ -76,7 +76,7 @@ Slack: #topic-local-dev-environments
- [Automated Hot Swapping with fswatch](#automated-hot-swapping-with-fswatch)
8. [Telemetry Configuration](#telemetry-configuration)
- [OTEL Stack (OpenTelemetry)](#otel-stack-opentelemetry)
- [Chip Ingress (Beholder)](#chip-ingress-beholder)
- [Chip Router and Beholder](#chip-router-and-beholder)
- [Expected Error Messages](#expected-error-messages)
9. [Using a Specific Docker Image for Chainlink Node](#using-a-specific-docker-image-for-chainlink-node)
10. [Using Existing EVM & P2P Keys](#using-existing-evm--p2p-keys)
Expand Down Expand Up @@ -147,8 +147,8 @@ It will compile local CRE as `local_cre`. With it installed you will be able to

# QUICKSTART
```
# e.g. AWS_ECR=<PROD_ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com
AWS_ECR=<PROD_AWS_URL> go run . env start --auto-setup
# e.g. MAIN_AWS_ECR=<main-registry> SDLC_AWS_ECR=<chip-router-registry>
MAIN_AWS_ECR=<MAIN_REGISTRY_URL> SDLC_AWS_ECR=<SDLC_REGISTRY_URL> go run . env start --auto-setup
```
> Both registry URLs have the form `<ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com`. For `MAIN_AWS_ECR`, you can find the prod account ID and region in the `[profile prod]` section of the [AWS CLI configuration guide](https://smartcontract-it.atlassian.net/wiki/spaces/INFRA/pages/1045495923/Configure+the+AWS+CLI#Configure). If for some reason you want to limit the AWS config to the bare minimum, include only the `staging-default` profile and the `cl-secure-sso` session entries.

Expand All @@ -161,11 +161,17 @@ Refer to [this document](https://docs.google.com/document/d/1HtVLv2ipx2jvU15WYOi
The environment can be set up by running `go run . env setup` inside the `core/scripts/cre/environment` folder. Its configuration is defined in the [configs/setup.toml](configs/setup.toml) file. It will make sure that:
- you have AWS CLI installed and configured
- you have GH CLI installed and authenticated
- you have required Job Distributor, Chip Ingress, and Chip Config images
- you have required Job Distributor, Chip Router, Chip Ingress, and Chip Config images

**Image Versioning:**

Docker images for Beholder services (chip-ingress, chip-config) use commit-based tags instead of mutable tags like `local-cre`. This ensures you always know which version is running and prevents hard-to-debug issues from version mismatches. The exact versions are defined in [configs/setup.toml](configs/setup.toml).
Managed CRE images use local aliases with pinned tags (commit SHAs or release versions) instead of mutable tags like `latest` or account-qualified ECR names. For example, env TOMLs use `local-cre-chip-router:<version>`, while [configs/setup.toml](configs/setup.toml) defines how that alias is built locally or pulled from ECR and retagged locally. The setup config now distinguishes between the main CRE registry (`MAIN_AWS_ECR`) and the separate Chip Router registry (`SDLC_AWS_ECR`).

`env start` treats Chip Router as required infrastructure. It resolves the effective router image in this order:
- `CTF_CHIP_ROUTER_IMAGE`, if set
- `chip_router.image` from your env TOML

If the effective image is missing locally, startup uses the same build-or-pull fallback flow as the Beholder images. The committed env TOMLs intentionally use the local alias, not the full remote ECR image.

**Plugin installation during image build:**

Expand Down Expand Up @@ -231,7 +237,7 @@ Apply this to **all** nodes in the nodeset. Nightly images are built by the [Doc

### Beholder

When environment is started with `--with-beholder` or with `-b` flag after the DON is ready we will boot up `Chip Ingress` and `Red Panda`, create a `cre` topic and download and install workflow-related protobufs from the [chainlink-protos](https://github.com/smartcontractkit/chainlink-protos/tree/main/workflows) repository.
When the environment is started with the `--with-beholder` (or `-b`) flag, then once the DON is ready we boot up real ChIP Ingress and Red Panda, create a `cre` topic, and download and install workflow-related protobufs from the [chainlink-protos](https://github.com/smartcontractkit/chainlink-protos/tree/main/workflows) repository.

Once up and running you will be able to access [CRE topic view](http://localhost:8080/topics/cre) to see workflow-emitted events. These include both standard events emitted by the Workflow Engine and custom events emitted from your workflow.

Expand All @@ -249,46 +255,47 @@ Beholder requires `chip-ingress` and `chip-config` Docker images with specific v

When starting Beholder, the system will:
- **In CI (`CI=true`)**: Skip image checks (docker-compose will pull at runtime)
- **Interactive terminal**: Auto-build missing images from sources. If build fails and `AWS_ECR` is set, you'll be offered to pull from ECR instead
- **Non-interactive (tests, scripts)**: Auto-pull from ECR if `AWS_ECR` is set, otherwise fail with instructions
- **Interactive terminal**: Auto-build missing images from sources. If build fails and the required registry env vars are set, you'll be offered to pull from ECR instead
- **Non-interactive (tests, scripts)**: Auto-pull from ECR if the required registry env vars are set, otherwise fail with instructions

To manually ensure images are available, run:
```bash
# Build from sources
go run . env setup

# Or pull from ECR (requires AWS SSO access)
AWS_ECR=<account-id>.dkr.ecr.us-west-2.amazonaws.com go run . env setup
MAIN_AWS_ECR=<main-registry> SDLC_AWS_ECR=<chip-router-registry> go run . env setup
```

#### Beholder vs. ChIP Test Sink: Port Conflict and Using Both Together
#### Chip Router Topology

Both the **real Beholder** (Chip Ingress + Red Panda) and the **ChIP Test Sink** (used by CRE system tests for assertions) bind to the same gRPC port by default (50051). Chainlink nodes are configured to send workflow telemetry to `host.docker.internal:50051`, so only one service can receive on that port at a time.
Chip Router is the single owner of ChIP ingress on `50051`. Chainlink nodes send workflow telemetry to the router, and the router fans that traffic out to downstream subscribers.

**Default behavior in tests:**
- Most CRE smoke/regression tests use the **test sink** (`t_helpers.StartChipTestSink`). The sink listens on 50051, receives CloudEvents from nodes, and runs test assertions. No Kafka/Red Panda.
- Beholder-specific tests (e.g. `Test_CRE_V2_Suite` with Cron Beholder scenario, `Test_CRE_V1_Billing_Cron_Beholder`) use **real Beholder** via `t_helpers.StartBeholder`. They start Beholder on 50051, consume from Kafka, and run assertions. The test cleanup stops Beholder so subsequent tests can use the test sink.
That means:
- test sinks no longer bind the node ingress port directly
- real ChIP / Beholder no longer owns `50051`
- both paths are treated as downstream subscribers behind the same ingress owner

**To use both together** (test assertions + Red Panda/Kafka observability):
Current local port layout:
- `50050`: Chip Router admin API
- `50051`: Chip Router ingress gRPC
- `50052`: chip-config
- `50053`: real ChIP / Beholder ingress gRPC

1. **Start Beholder on a different port** (e.g. 50052):
```bash
go run . env beholder start --grpc-port 50052
```
Or, when starting the full environment:
```bash
go run . env start --with-beholder --grpc-port 50052
```
In tests:
- sink-backed tests start an ephemeral sink and register it with Chip Router
- Beholder-backed tests start real ChIP on `50053` and register it with Chip Router

2. **Run the test sink on the default port (50051)** so it receives events from nodes. The test sink must listen on 50051 because node config is fixed to that port.
Router component output is persisted in [state/local_cre.toml](state/local_cre.toml) under `chip_router.out`. Subscriber IDs remain separate runtime artifacts because they are lifecycle bookkeeping, not part of the environment topology.

3. **Configure the test sink to forward to Beholder** by setting `UpstreamEndpoint` in the sink config. The `chiptestsink` package supports this, but `t_helpers.StartChipTestSink` does not expose it. To use both:
- Use `chiptestsink.NewServer` directly with `Config{UpstreamEndpoint: "localhost:50052", ...}` instead of `StartChipTestSink`, or
- Extend the test helper to accept an optional upstream endpoint.
To override the router image without changing committed TOMLs, set:

4. **Resulting flow:** Nodes → test sink (50051) → assertions + forward → Beholder (50052) → Kafka/Red Panda.
```bash
export CTF_CHIP_ROUTER_IMAGE=local-cre-chip-router:<tag>
```

**Summary:** Use either Beholder or the test sink alone for simplicity. Use both only when you need test assertions and Red Panda observability in the same run; then run Beholder on a non-default port and configure the sink to forward to it.
This override wins over `chip_router.image` in the env TOML.
Chip Router pulls use `SDLC_AWS_ECR`; the rest of the managed CRE images use `MAIN_AWS_ECR`.

### Storage

Expand Down Expand Up @@ -1713,19 +1720,28 @@ ctf obs u

This provides access to Grafana, Prometheus, and Loki for monitoring and log aggregation.

### Chip Ingress (Beholder)
Nodes send workflow events to `chip-ingress:50051` for workflow monitoring. Start Chip Ingress either:
### Chip Router and Beholder
Nodes send workflow events to `host.docker.internal:50051`, which is owned by Chip Router. Chip Router fans out those events to registered downstream subscribers such as test sinks and real ChIP / Beholder.

Start the full environment plus Beholder with:

**Option 1: Start with environment**
```bash
go run . env start --with-beholder
```

**Option 2: Start separately**
Or start Beholder separately after the environment is already up:

```bash
go run . env beholder start
```

Chip Router ports:
- admin: `50050`
- ingress gRPC: `50051`

Real ChIP / Beholder downstream port:
- gRPC: `50053`

### OTel Tracing Configuration

To enable OpenTelemetry (OTel) tracing for workflow engines and see traces in Tempo/Grafana, **multiple configuration toggles must be set**:
Expand Down
20 changes: 16 additions & 4 deletions core/scripts/cre/environment/configs/setup.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,19 @@ local_image = "job-distributor:0.22.1"

[job_distributor.pull_config]
local_image = "job-distributor:0.22.1"
ecr_image = "{{.ECR}}/job-distributor:0.22.1"
ecr_image = "{{.MAIN_ECR}}/job-distributor:0.22.1"

[chip_router.build_config]
repository = "https://github.com/smartcontractkit/chainlink-testing-framework"
branch = "main"
commit = "838769782600ad166f1afd2bca0de02ef4c42862"
dockerfile = "framework/components/chiprouter/Dockerfile"
docker_ctx = "framework/components/chiprouter"
local_image = "local-cre-chip-router:v1.0.1"

[chip_router.pull_config]
local_image = "local-cre-chip-router:v1.0.1"
ecr_image = "{{.SDLC_ECR}}/local-cre-chip-router:v1.0.1"

[chip_ingress.build_config]
repository = "https://github.com/smartcontractkit/atlas"
Expand All @@ -25,7 +37,7 @@ pre_run = "pushd chip-ingress && go mod vendor && popd"

[chip_ingress.pull_config]
local_image = "chip-ingress:da84cb72d3a160e02896247d46ab4b9806ebee2f"
ecr_image = "{{.ECR}}/atlas-chip-ingress:da84cb72d3a160e02896247d46ab4b9806ebee2f"
ecr_image = "{{.MAIN_ECR}}/atlas-chip-ingress:da84cb72d3a160e02896247d46ab4b9806ebee2f"

[chip_config.build_config]
repository = "https://github.com/smartcontractkit/atlas"
Expand All @@ -38,7 +50,7 @@ pre_run = "pushd chip-config && go mod vendor && popd"

[chip_config.pull_config]
local_image = "chip-config:7b4e9ee68fd1c737dd3480b5a3ced0188f29b969"
ecr_image = "{{.ECR}}/atlas-chip-config:7b4e9ee68fd1c737dd3480b5a3ced0188f29b969"
ecr_image = "{{.MAIN_ECR}}/atlas-chip-config:7b4e9ee68fd1c737dd3480b5a3ced0188f29b969"

[billing_platform_service.build_config]
repository = "https://github.com/smartcontractkit/billing-platform-service"
Expand All @@ -50,7 +62,7 @@ local_image = "billing-platform-service:local-cre"

[billing_platform_service.pull_config]
local_image = "billing-platform-service:local-cre"
ecr_image = "{{.ECR}}/billing-platform-service:1.36.1"
ecr_image = "{{.MAIN_ECR}}/billing-platform-service:1.36.1"

[observability]
repository = "https://github.com/smartcontractkit/chainlink-observability"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
3 changes: 3 additions & 0 deletions core/scripts/cre/environment/configs/workflow-don-tron.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Same as workflow-gateway-don.toml but with Aptos chain and a single Aptos capability.
# Anvil 1337: registry and gateway. Aptos: local devnet (chain_id 4). Run: env config path <this file>, then env start.

[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#
# Used by: system-tests/tests/smoke/cre/v2_grpc_source_test.go

[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# NOTE: Identical to workflow-gateway-capabilities.toml but with a vault capability config override
# to disable the new pending queue feature.
[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
chain_id = "1337"
container_name = "anvil-1337"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

[chip_router]
image = "local-cre-chip-router:v1.0.1"

[[blockchains]]
type = "anvil"
chain_id = "1337"
Expand Down
Loading
Loading