Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ Included in this repo is an alpine based Dockerfile.
- If a repository commit is specified it will be held at that commit.
- Configuring the update frequency is done via a CRON in `PLACE_LOADER_CRON` environment variable, or the `--update-cron` flag. Use [crontab guru](https://crontab.guru/) to validate your CRONs!!!

### Retry Configuration

When repository loading fails (e.g., network issues, GitHub downtime), the loader automatically retries with exponential backoff:

- `PLACE_LOADER_MAX_RETRY_ATTEMPTS`: Maximum number of retry attempts before giving up (default: `10`)
- `PLACE_LOADER_MAX_BACKOFF_SECONDS`: Maximum backoff time between retries in seconds (default: `300` / 5 minutes)

Backoff timing follows the pattern: attempt² seconds (1s, 4s, 9s, 16s...) capped at the max backoff value.

### Client

Included is a simple client that can be configured via the `PLACE_LOADER_URI` environment variable.
Expand Down
2 changes: 1 addition & 1 deletion shard.lock
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ shards:

openssl_ext:
git: https://github.com/spider-gazelle/openssl_ext.git
version: 2.8.1
version: 2.8.2

pars: # Overridden
git: https://github.com/spider-gazelle/pars.git
Expand Down
197 changes: 197 additions & 0 deletions spec/loader_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ module PlaceOS::FrontendLoader
repository = example_repository(TEST_FOLDER)
expected_path = File.join(TEST_DIR, repository.folder_name)
reset
# Clear retry tracking between tests
Loader.retry_attempts.clear
Loader.last_retry_time.clear
end

describe "#startup_finished?" do
Expand Down Expand Up @@ -185,6 +188,200 @@ module PlaceOS::FrontendLoader
repository.has_runtime_error.should be_false
repository.error_message.should be_nil
end

it "should retry with exponential backoff when only error fields are updated" do
changes = [] of PlaceOS::Model::Repository::ChangeFeed::Change(PlaceOS::Model::Repository)
changefeed = Model::Repository.changes
spawn do
changefeed.each do |change|
changes << change
end
end

Fiber.yield

loader = Loader.new
branch = "doesnt-exist"
repository.branch = branch
repository.save!
sleep 100.milliseconds

# First attempt should fail and set error fields
loader.process_resource(:created, repository).success?.should be_false
Dir.exists?(expected_path).should be_false
sleep 100.milliseconds

repository = Model::Repository.find!(repository.id.as(String))
repository.has_runtime_error.should be_true
error_msg = repository.error_message
error_msg.should_not be_nil

# Clear changes to track only the next error field update
changes.clear

# Now update only the error fields through the ORM
# This simulates what happens when the loader updates error fields after a failed clone
Model::Repository.update(repository.id, {
has_runtime_error: true,
error_message: "Updated error message",
})
sleep 200.milliseconds

# The changefeed should have received an update event
changes.size.should be >= 1
# Find the update event (filter out any other events)
update_change = changes.find(&.updated?)
update_change.should_not be_nil

# First retry should happen immediately (attempt 1, backoff = 1^2 = 1s)
result = loader.process_resource(:updated, update_change.not_nil!.value)
result.error?.should be_true # Still fails because branch doesn't exist

# Wait for any changefeed events from the load failure to be processed
sleep 300.milliseconds

# Clear changes and trigger another error-only update
changes.clear
Model::Repository.update(repository.id, {
has_runtime_error: true,
error_message: "Still failing",
})
sleep 200.milliseconds

# Verify retry tracking was set
repo_id = repository.id.not_nil!
Loader.retry_attempts[repo_id].should eq 1
Loader.last_retry_time[repo_id].should_not be_nil

# Verify no successful clone happened
Dir.exists?(expected_path).should be_false

changefeed.stop
end

it "should stop retrying after max attempts" do
changes = [] of PlaceOS::Model::Repository::ChangeFeed::Change(PlaceOS::Model::Repository)
changefeed = Model::Repository.changes
spawn do
changefeed.each do |change|
changes << change
end
end

Fiber.yield

loader = Loader.new
branch = "doesnt-exist"
repository.branch = branch
repository.save!

# First attempt should fail
loader.process_resource(:created, repository).success?.should be_false

# Simulate max retry attempts by directly setting the counter
repo_id = repository.id.not_nil!
max_attempts = Loader.settings.max_retry_attempts
Loader.retry_attempts[repo_id] = max_attempts

# Clear changes to track only the next error field update
changes.clear

# Now update only the error fields through the ORM
Model::Repository.update(repository.id, {
has_runtime_error: true,
error_message: "Max retries test",
})
sleep 200.milliseconds

# The changefeed should have received an update event
update_change = changes.find(&.updated?)
update_change.should_not be_nil

# Now when we try to process this error-only update, it should skip due to max retries
result = loader.process_resource(:updated, update_change.not_nil!.value)
result.skipped?.should be_true

# Verify it doesn't increment beyond max
Loader.retry_attempts[repo_id].should eq max_attempts

changefeed.stop
end

it "should not update database if error fields haven't changed" do
loader = Loader.new
branch = "doesnt-exist"
repository.branch = branch

# First attempt should fail
loader.process_resource(:created, repository).success?.should be_false

repository = Model::Repository.find!(repository.id.as(String))
initial_updated_at = repository.updated_at
repository.has_runtime_error.should be_true

sleep 10.milliseconds

# Second attempt with same error should not update the database
# (simulating what would happen if the error fields are already set)
repository.branch = branch
loader.process_resource(:created, repository).success?.should be_false

repository = Model::Repository.find!(repository.id.as(String))
# updated_at should not have changed since we skip updating when error fields are the same
repository.updated_at.should eq initial_updated_at
end

it "should process when non-error fields change even if error fields also change" do
changes = [] of PlaceOS::Model::Repository::ChangeFeed::Change(PlaceOS::Model::Repository)
changefeed = Model::Repository.changes
spawn do
changefeed.each do |change|
changes << change
end
end

Fiber.yield

loader = Loader.new
branch = "doesnt-exist"
repository.branch = branch
repository.save!
sleep 100.milliseconds

# First attempt should fail and set error fields
loader.process_resource(:created, repository).success?.should be_false
Dir.exists?(expected_path).should be_false
sleep 200.milliseconds

repository = Model::Repository.find!(repository.id.as(String))
repository.has_runtime_error.should be_true

# Clear changes to track only the next update
changes.clear

# Now update the branch to a valid one AND clear error fields
# This should NOT be skipped because branch is a real field change
Model::Repository.update(repository.id, {
branch: "master",
has_runtime_error: false,
error_message: nil,
})
sleep 200.milliseconds

# The changefeed should have received an update event
changes.size.should be >= 1
update_change = changes.find(&.updated?)
update_change.should_not be_nil

# When processing this update with both branch and error fields changed, it should NOT skip
result = loader.process_resource(:updated, update_change.not_nil!.value)
result.success?.should be_true

# Verify the repository was actually loaded
Dir.exists?(expected_path).should be_true

changefeed.stop
end
end
end
end
10 changes: 6 additions & 4 deletions src/constants.cr
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@ module PlaceOS::FrontendLoader
PORT = (ENV["PLACE_LOADER_PORT"]? || 3000).to_i

# settings for `./placeos-frontend-loader/loader.cr`
WWW = ENV["PLACE_LOADER_WWW"]? || "www"
CRON = ENV["PLACE_LOADER_CRON"]? || "0 * * * *"
GIT_USER = ENV["PLACE_LOADER_GIT_USER"]? || ""
GIT_PASS = ENV["PLACE_LOADER_GIT_PASS"]? || ""
WWW = ENV["PLACE_LOADER_WWW"]? || "www"
CRON = ENV["PLACE_LOADER_CRON"]? || "0 * * * *"
GIT_USER = ENV["PLACE_LOADER_GIT_USER"]? || ""
GIT_PASS = ENV["PLACE_LOADER_GIT_PASS"]? || ""
MAX_RETRY_ATTEMPTS = (ENV["PLACE_LOADER_MAX_RETRY_ATTEMPTS"]? || "10").to_i
MAX_BACKOFF_SECONDS = (ENV["PLACE_LOADER_MAX_BACKOFF_SECONDS"]? || "300").to_i

GITLAB_TOKEN = ENV["GITLAB_TOKEN"]? || ""

Expand Down
79 changes: 74 additions & 5 deletions src/placeos-frontend-loader/loader.cr
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
Log = ::Log.for(self)

Habitat.create do
setting content_directory : String = WWW

Check warning on line 17 in src/placeos-frontend-loader/loader.cr

View workflow job for this annotation

GitHub Actions / Ameba

Lint/UselessAssign

Useless assignment to variable `content_directory`
Raw output
> setting content_directory : String = WWW
          ^
setting update_crontab : String = CRON

Check warning on line 18 in src/placeos-frontend-loader/loader.cr

View workflow job for this annotation

GitHub Actions / Ameba

Lint/UselessAssign

Useless assignment to variable `update_crontab`
Raw output
> setting update_crontab : String = CRON
          ^
setting max_retry_attempts : Int32 = MAX_RETRY_ATTEMPTS

Check warning on line 19 in src/placeos-frontend-loader/loader.cr

View workflow job for this annotation

GitHub Actions / Ameba

Lint/UselessAssign

Useless assignment to variable `max_retry_attempts`
Raw output
> setting max_retry_attempts : Int32 = MAX_RETRY_ATTEMPTS
          ^
setting max_backoff_seconds : Int32 = MAX_BACKOFF_SECONDS

Check warning on line 20 in src/placeos-frontend-loader/loader.cr

View workflow job for this annotation

GitHub Actions / Ameba

Lint/UselessAssign

Useless assignment to variable `max_backoff_seconds`
Raw output
> setting max_backoff_seconds : Int32 = MAX_BACKOFF_SECONDS
          ^
end

class_getter instance : Loader do
Expand All @@ -34,6 +36,10 @@
class_getter uri_lookup : Hash(String, RepoCache) = {} of String => RepoCache
class_getter folder_lookup : Hash(String, RepoCache) = {} of String => RepoCache

# Track retry attempts for exponential backoff
class_getter retry_attempts : Hash(String, Int32) = {} of String => Int32
class_getter last_retry_time : Hash(String, Time) = {} of String => Time

def initialize(
@content_directory : String = Loader.settings.content_directory,
@update_crontab : String = Loader.settings.update_crontab,
Expand Down Expand Up @@ -129,7 +135,7 @@
end
end

def process_resource(action : Resource::Action, resource : Model::Repository) : Resource::Result

Check warning on line 138 in src/placeos-frontend-loader/loader.cr

View workflow job for this annotation

GitHub Actions / Ameba

Metrics/CyclomaticComplexity

Cyclomatic complexity too high [15/10]
Raw output
> def process_resource(action : Resource::Action, resource : Model::Repository) : Resource::Result
      ^
repository = resource

# Only consider Interface Repositories
Expand All @@ -142,17 +148,74 @@
return Resource::Result::Skipped
end

# Skip load for error indicator update actions
if (changes = repository.changed_attributes) && (changes[:has_runtime_error]? || changes[:error_message]?)
return Resource::Result::Skipped
# Implement exponential backoff for error-only updates
if action.updated? && (changes = repository.changed_attributes) && !changes.empty?
error_only_changes = changes.keys.all? { |key| key.in?(:has_runtime_error, :error_message, :updated_at) }

# Skip processing if only error fields changed
if error_only_changes
# If error flag is being cleared (recovery), skip without retry logic
unless repository.has_runtime_error
Log.debug { "#{repository.folder_name}: skipping error flag clear update" }
return Resource::Result::Skipped
end

# Error flag is still set - apply backoff for retry
repo_id = repository.id.not_nil!
attempt = Loader.retry_attempts[repo_id]? || 0

# Stop retrying after max attempts
max_attempts = Loader.settings.max_retry_attempts
if attempt >= max_attempts
Log.warn { "#{repository.folder_name}: max retry attempts (#{max_attempts}) reached, giving up" }
return Resource::Result::Skipped
end

last_retry = Loader.last_retry_time[repo_id]?

if last_retry
# Calculate backoff: min(attempt^2, max_backoff_seconds)
max_backoff = Loader.settings.max_backoff_seconds
backoff_seconds = Math.min(attempt ** 2, max_backoff)
elapsed = (Time.utc - last_retry).total_seconds

if elapsed < backoff_seconds
Log.debug { "#{repository.folder_name}: skipping retry, backoff not elapsed (#{elapsed.to_i}/#{backoff_seconds}s)" }
return Resource::Result::Skipped
end
end

# Update retry tracking
Loader.retry_attempts[repo_id] = attempt + 1
Loader.last_retry_time[repo_id] = Time.utc
max_backoff = Loader.settings.max_backoff_seconds
max_attempts = Loader.settings.max_retry_attempts
backoff = Math.min(attempt ** 2, max_backoff)
Log.info { "#{repository.folder_name}: retrying after error (attempt #{attempt + 1}/#{max_attempts}, backoff: #{backoff}s)" }
end
end

# Load the repository
Loader.load(
result = Loader.load(
repository: repository,
content_directory: @content_directory,
)

# Reset retry counter on success
if result == Resource::Result::Success
repo_id = repository.id.not_nil!
Loader.retry_attempts.delete(repo_id)
Loader.last_retry_time.delete(repo_id)
end

result
in Action::Deleted
# Clean up retry tracking on deletion
if repo_id = repository.id
Loader.retry_attempts.delete(repo_id)
Loader.last_retry_time.delete(repo_id)
end

# Unload the repository
Loader.unload(
repository: repository,
Expand Down Expand Up @@ -219,8 +282,14 @@
repo_cache.cache
end

Model::Repository.update(repository.id, {has_runtime_error: has_error, error_message: error_message}) if has_error || repository.has_runtime_error
# Only update error fields if they've actually changed to avoid triggering unnecessary events
if has_error || repository.has_runtime_error
if repository.has_runtime_error != has_error || repository.error_message != error_message
Model::Repository.update(repository.id, {has_runtime_error: has_error, error_message: error_message})
end
end
return Resource::Result::Error unless cache

# grab the required commit
content_directory = File.expand_path(content_directory)
repository_directory = File.join(content_directory, repository.folder_name)
Expand Down
Loading