diff --git a/README.md b/README.md index b545ac7..0e350e2 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,15 @@ Included in this repo is an alpine based Dockerfile. - If a repository commit is specified it will held at that commit. - Configuring the update frequency is done via a CRON in `PLACE_LOADER_CRON` environment variable, or the `--update-cron` flag. Use [crontab guru](https://crontab.guru/) to validate your CRONs!!! +### Retry Configuration + +When repository loading fails (e.g., network issues, GitHub downtime), the loader automatically retries with exponential backoff: + +- `PLACE_LOADER_MAX_RETRY_ATTEMPTS`: Maximum number of retry attempts before giving up (default: `10`) +- `PLACE_LOADER_MAX_BACKOFF_SECONDS`: Maximum backoff time between retries in seconds (default: `300` / 5 minutes) + +Backoff timing follows the pattern: attempt² seconds (1s, 4s, 9s, 16s...) capped at the max backoff value. + ### Client Included is a simple client that can be configured via the `PLACE_LOADER_URI` environment variable. diff --git a/shard.lock b/shard.lock index ff61fd8..b64f38c 100644 --- a/shard.lock +++ b/shard.lock @@ -127,7 +127,7 @@ shards: openssl_ext: git: https://github.com/spider-gazelle/openssl_ext.git - version: 2.8.1 + version: 2.8.2 pars: # Overridden git: https://github.com/spider-gazelle/pars.git diff --git a/spec/loader_spec.cr b/spec/loader_spec.cr index dcf2247..03ea53a 100644 --- a/spec/loader_spec.cr +++ b/spec/loader_spec.cr @@ -9,6 +9,9 @@ module PlaceOS::FrontendLoader repository = example_repository(TEST_FOLDER) expected_path = File.join(TEST_DIR, repository.folder_name) reset + # Clear retry tracking between tests + Loader.retry_attempts.clear + Loader.last_retry_time.clear end describe "#startup_finished?" do @@ -185,6 +188,200 @@ module PlaceOS::FrontendLoader repository.has_runtime_error.should be_false repository.error_message.should be_nil end + + it "should retry with exponential backoff when only error fields are updated" do + changes = [] of PlaceOS::Model::Repository::ChangeFeed::Change(PlaceOS::Model::Repository) + changefeed = Model::Repository.changes + spawn do + changefeed.each do |change| + changes << change + end + end + + Fiber.yield + + loader = Loader.new + branch = "doesnt-exist" + repository.branch = branch + repository.save! + sleep 100.milliseconds + + # First attempt should fail and set error fields + loader.process_resource(:created, repository).success?.should be_false + Dir.exists?(expected_path).should be_false + sleep 100.milliseconds + + repository = Model::Repository.find!(repository.id.as(String)) + repository.has_runtime_error.should be_true + error_msg = repository.error_message + error_msg.should_not be_nil + + # Clear changes to track only the next error field update + changes.clear + + # Now update only the error fields through the ORM + # This simulates what happens when the loader updates error fields after a failed clone + Model::Repository.update(repository.id, { + has_runtime_error: true, + error_message: "Updated error message", + }) + sleep 200.milliseconds + + # The changefeed should have received an update event + changes.size.should be >= 1 + # Find the update event (filter out any other events) + update_change = changes.find(&.updated?) + update_change.should_not be_nil + + # First retry should happen immediately (attempt 1, backoff = 1^2 = 1s) + result = loader.process_resource(:updated, update_change.not_nil!.value) + result.error?.should be_true # Still fails because branch doesn't exist + + # Wait for any changefeed events from the load failure to be processed + sleep 300.milliseconds + + # Clear changes and trigger another error-only update + changes.clear + Model::Repository.update(repository.id, { + has_runtime_error: true, + error_message: "Still failing", + }) + sleep 200.milliseconds + + # Verify retry tracking was set + repo_id = repository.id.not_nil! + Loader.retry_attempts[repo_id].should eq 1 + Loader.last_retry_time[repo_id].should_not be_nil + + # Verify no successful clone happened + Dir.exists?(expected_path).should be_false + + changefeed.stop + end + + it "should stop retrying after max attempts" do + changes = [] of PlaceOS::Model::Repository::ChangeFeed::Change(PlaceOS::Model::Repository) + changefeed = Model::Repository.changes + spawn do + changefeed.each do |change| + changes << change + end + end + + Fiber.yield + + loader = Loader.new + branch = "doesnt-exist" + repository.branch = branch + repository.save! + + # First attempt should fail + loader.process_resource(:created, repository).success?.should be_false + + # Simulate max retry attempts by directly setting the counter + repo_id = repository.id.not_nil! + max_attempts = Loader.settings.max_retry_attempts + Loader.retry_attempts[repo_id] = max_attempts + + # Clear changes to track only the next error field update + changes.clear + + # Now update only the error fields through the ORM + Model::Repository.update(repository.id, { + has_runtime_error: true, + error_message: "Max retries test", + }) + sleep 200.milliseconds + + # The changefeed should have received an update event + update_change = changes.find(&.updated?) + update_change.should_not be_nil + + # Now when we try to process this error-only update, it should skip due to max retries + result = loader.process_resource(:updated, update_change.not_nil!.value) + result.skipped?.should be_true + + # Verify it doesn't increment beyond max + Loader.retry_attempts[repo_id].should eq max_attempts + + changefeed.stop + end + + it "should not update database if error fields haven't changed" do + loader = Loader.new + branch = "doesnt-exist" + repository.branch = branch + + # First attempt should fail + loader.process_resource(:created, repository).success?.should be_false + + repository = Model::Repository.find!(repository.id.as(String)) + initial_updated_at = repository.updated_at + repository.has_runtime_error.should be_true + + sleep 10.milliseconds + + # Second attempt with same error should not update the database + # (simulating what would happen if the error fields are already set) + repository.branch = branch + loader.process_resource(:created, repository).success?.should be_false + + repository = Model::Repository.find!(repository.id.as(String)) + # updated_at should not have changed since we skip updating when error fields are the same + repository.updated_at.should eq initial_updated_at + end + + it "should process when non-error fields change even if error fields also change" do + changes = [] of PlaceOS::Model::Repository::ChangeFeed::Change(PlaceOS::Model::Repository) + changefeed = Model::Repository.changes + spawn do + changefeed.each do |change| + changes << change + end + end + + Fiber.yield + + loader = Loader.new + branch = "doesnt-exist" + repository.branch = branch + repository.save! + sleep 100.milliseconds + + # First attempt should fail and set error fields + loader.process_resource(:created, repository).success?.should be_false + Dir.exists?(expected_path).should be_false + sleep 200.milliseconds + + repository = Model::Repository.find!(repository.id.as(String)) + repository.has_runtime_error.should be_true + + # Clear changes to track only the next update + changes.clear + + # Now update the branch to a valid one AND clear error fields + # This should NOT be skipped because branch is a real field change + Model::Repository.update(repository.id, { + branch: "master", + has_runtime_error: false, + error_message: nil, + }) + sleep 200.milliseconds + + # The changefeed should have received an update event + changes.size.should be >= 1 + update_change = changes.find(&.updated?) + update_change.should_not be_nil + + # When processing this update with both branch and error fields changed, it should NOT skip + result = loader.process_resource(:updated, update_change.not_nil!.value) + result.success?.should be_true + + # Verify the repository was actually loaded + Dir.exists?(expected_path).should be_true + + changefeed.stop + end end end end diff --git a/src/constants.cr b/src/constants.cr index cbe3328..0b6a010 100644 --- a/src/constants.cr +++ b/src/constants.cr @@ -15,10 +15,12 @@ module PlaceOS::FrontendLoader PORT = (ENV["PLACE_LOADER_PORT"]? || 3000).to_i # settings for `./placeos-frontend-loader/loader.cr` - WWW = ENV["PLACE_LOADER_WWW"]? || "www" - CRON = ENV["PLACE_LOADER_CRON"]? || "0 * * * *" - GIT_USER = ENV["PLACE_LOADER_GIT_USER"]? || "" - GIT_PASS = ENV["PLACE_LOADER_GIT_PASS"]? || "" + WWW = ENV["PLACE_LOADER_WWW"]? || "www" + CRON = ENV["PLACE_LOADER_CRON"]? || "0 * * * *" + GIT_USER = ENV["PLACE_LOADER_GIT_USER"]? || "" + GIT_PASS = ENV["PLACE_LOADER_GIT_PASS"]? || "" + MAX_RETRY_ATTEMPTS = (ENV["PLACE_LOADER_MAX_RETRY_ATTEMPTS"]? || "10").to_i + MAX_BACKOFF_SECONDS = (ENV["PLACE_LOADER_MAX_BACKOFF_SECONDS"]? || "300").to_i GITLAB_TOKEN = ENV["GITLAB_TOKEN"]? || "" diff --git a/src/placeos-frontend-loader/loader.cr b/src/placeos-frontend-loader/loader.cr index f68a39a..704e526 100644 --- a/src/placeos-frontend-loader/loader.cr +++ b/src/placeos-frontend-loader/loader.cr @@ -16,6 +16,8 @@ module PlaceOS::FrontendLoader Habitat.create do setting content_directory : String = WWW setting update_crontab : String = CRON + setting max_retry_attempts : Int32 = MAX_RETRY_ATTEMPTS + setting max_backoff_seconds : Int32 = MAX_BACKOFF_SECONDS end class_getter instance : Loader do @@ -34,6 +36,10 @@ module PlaceOS::FrontendLoader class_getter uri_lookup : Hash(String, RepoCache) = {} of String => RepoCache class_getter folder_lookup : Hash(String, RepoCache) = {} of String => RepoCache + # Track retry attempts for exponential backoff + class_getter retry_attempts : Hash(String, Int32) = {} of String => Int32 + class_getter last_retry_time : Hash(String, Time) = {} of String => Time + def initialize( @content_directory : String = Loader.settings.content_directory, @update_crontab : String = Loader.settings.update_crontab, @@ -142,17 +148,74 @@ module PlaceOS::FrontendLoader return Resource::Result::Skipped end - # Skip load for error indicator update actions - if (changes = repository.changed_attributes) && (changes[:has_runtime_error]? || changes[:error_message]?) - return Resource::Result::Skipped + # Implement exponential backoff for error-only updates + if action.updated? && (changes = repository.changed_attributes) && !changes.empty? + error_only_changes = changes.keys.all? { |key| key.in?(:has_runtime_error, :error_message, :updated_at) } + + # Skip processing if only error fields changed + if error_only_changes + # If error flag is being cleared (recovery), skip without retry logic + unless repository.has_runtime_error + Log.debug { "#{repository.folder_name}: skipping error flag clear update" } + return Resource::Result::Skipped + end + + # Error flag is still set - apply backoff for retry + repo_id = repository.id.not_nil! + attempt = Loader.retry_attempts[repo_id]? || 0 + + # Stop retrying after max attempts + max_attempts = Loader.settings.max_retry_attempts + if attempt >= max_attempts + Log.warn { "#{repository.folder_name}: max retry attempts (#{max_attempts}) reached, giving up" } + return Resource::Result::Skipped + end + + last_retry = Loader.last_retry_time[repo_id]? + + if last_retry + # Calculate backoff: min(attempt^2, max_backoff_seconds) + max_backoff = Loader.settings.max_backoff_seconds + backoff_seconds = Math.min(attempt ** 2, max_backoff) + elapsed = (Time.utc - last_retry).total_seconds + + if elapsed < backoff_seconds + Log.debug { "#{repository.folder_name}: skipping retry, backoff not elapsed (#{elapsed.to_i}/#{backoff_seconds}s)" } + return Resource::Result::Skipped + end + end + + # Update retry tracking + Loader.retry_attempts[repo_id] = attempt + 1 + Loader.last_retry_time[repo_id] = Time.utc + max_backoff = Loader.settings.max_backoff_seconds + max_attempts = Loader.settings.max_retry_attempts + backoff = Math.min(attempt ** 2, max_backoff) + Log.info { "#{repository.folder_name}: retrying after error (attempt #{attempt + 1}/#{max_attempts}, backoff: #{backoff}s)" } + end end # Load the repository - Loader.load( + result = Loader.load( repository: repository, content_directory: @content_directory, ) + + # Reset retry counter on success + if result == Resource::Result::Success + repo_id = repository.id.not_nil! + Loader.retry_attempts.delete(repo_id) + Loader.last_retry_time.delete(repo_id) + end + + result in Action::Deleted + # Clean up retry tracking on deletion + if repo_id = repository.id + Loader.retry_attempts.delete(repo_id) + Loader.last_retry_time.delete(repo_id) + end + # Unload the repository Loader.unload( repository: repository, @@ -219,8 +282,14 @@ module PlaceOS::FrontendLoader repo_cache.cache end - Model::Repository.update(repository.id, {has_runtime_error: has_error, error_message: error_message}) if has_error || repository.has_runtime_error + # Only update error fields if they've actually changed to avoid triggering unnecessary events + if has_error || repository.has_runtime_error + if repository.has_runtime_error != has_error || repository.error_message != error_message + Model::Repository.update(repository.id, {has_runtime_error: has_error, error_message: error_message}) + end + end return Resource::Result::Error unless cache + # grab the required commit content_directory = File.expand_path(content_directory) repository_directory = File.join(content_directory, repository.folder_name)