diff --git a/go.mod b/go.mod index 5b748f52d..40d4b7872 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/BurntSushi/toml v1.6.0 github.com/alecthomas/chroma/v2 v2.27.0 github.com/bradleyfalzon/ghinstallation/v2 v2.19.0 - github.com/cloudbase/garm-provider-common v0.1.9 + github.com/cloudbase/garm-provider-common v0.1.10-0.20260627162627-e00d7529cc6f github.com/felixge/httpsnoop v1.1.0 github.com/gdamore/tcell/v2 v2.13.10 github.com/go-gormigrate/gormigrate/v2 v2.1.6 diff --git a/go.sum b/go.sum index f1df2ca78..7b279994a 100644 --- a/go.sum +++ b/go.sum @@ -25,8 +25,8 @@ github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04= github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8= github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk= github.com/clipperhouse/uax29/v2 v2.7.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM= -github.com/cloudbase/garm-provider-common v0.1.9 h1:ZL53ma/j7BgMAqW/OJ/jCnx1MUd8hLcYxUTEFb/o/e0= -github.com/cloudbase/garm-provider-common v0.1.9/go.mod h1:8tnJcLXtaMUDEUgX3MGLFEYnMpiCAKaKWPtzNnzEpAE= +github.com/cloudbase/garm-provider-common v0.1.10-0.20260627162627-e00d7529cc6f h1:IalGGcSKBGUd+KFFn2OXO6SMcODyCkSO/SMXtm4nwy8= +github.com/cloudbase/garm-provider-common v0.1.10-0.20260627162627-e00d7529cc6f/go.mod h1:i1KXJVzi5ouzbdu5BXjuA+rqk4nxIBVBEYrKReSERIU= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= diff --git a/runner/pool/pool.go b/runner/pool/pool.go index b9414bef3..b575b60e5 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -1334,7 +1334,7 @@ func (r *basePoolManager) addRunnerToPool(pool params.Pool, aditionalLabels []st } if poolInstanceCount >= int64(pool.MaxRunners) { - return fmt.Errorf("max workers (%d) reached for pool %s", pool.MaxRunners, pool.ID) + return runnerErrors.NewNoCapacityError("max workers (%d) reached for pool %s", pool.MaxRunners, pool.ID) } if err := r.AddRunner(r.ctx, pool.ID, aditionalLabels); err != nil { @@ -1412,8 +1412,6 @@ func (r *basePoolManager) retryFailedInstancesForOnePool(ctx context.Context, po g, errCtx := errgroup.WithContext(ctx) for _, instance := range existingInstances { - instance := instance - if instance.Status != commonParams.InstanceError { continue } @@ -1472,7 +1470,10 @@ func (r *basePoolManager) retryFailedInstancesForOnePool(ctx context.Context, po ctx, "queueing previously failed instance for retry", "runner_name", instance.Name) // Set instance to pending create and wait for retry. - if _, err := r.store.UpdateInstance(r.ctx, instance.Name, updateParams); err != nil { + // Use ForceUpdateInstance() here. It will ignore the instance transition from error to + // something other than a cleanup status (pending_delete/deleting). We don't really want to allow + // transitioning from error directly to "creating" otherwise. + if _, err := r.store.ForceUpdateInstance(r.ctx, instance.Name, updateParams); err != nil { slog.With(slog.Any("error", err)).ErrorContext( ctx, "failed to update runner status", "runner_name", instance.Name) @@ -2174,9 +2175,15 @@ func (r *basePoolManager) consumeQueuedJobs() error { "pool_id", pool.ID, "job_id", job.WorkflowJobID) if err := r.addRunnerToPool(pool, jobLabels); err != nil { - slog.With(slog.Any("error", err)).ErrorContext( - r.ctx, "could not add runner to pool", - "pool_id", pool.ID) + if errors.Is(err, runnerErrors.ErrNoCapacity) { + slog.With(slog.Any("error", err)).InfoContext( + r.ctx, "could not add runner to pool", + "pool_id", pool.ID) + } else { + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "could not add runner to pool", + "pool_id", pool.ID) + } continue } slog.DebugContext(r.ctx, "a new runner was added as a response to queued job", @@ -2187,7 +2194,7 @@ func (r *basePoolManager) consumeQueuedJobs() error { } if !runnerCreated { - slog.WarnContext( + slog.InfoContext( r.ctx, "could not create a runner for job; unlocking", "job_id", job.WorkflowJobID) if err := r.store.UnlockJob(r.ctx, job.WorkflowJobID, r.ID()); err != nil { diff --git a/vendor/github.com/cloudbase/garm-provider-common/errors/errors.go b/vendor/github.com/cloudbase/garm-provider-common/errors/errors.go index 76e85d9cf..9a12c4ba7 100644 --- a/vendor/github.com/cloudbase/garm-provider-common/errors/errors.go +++ b/vendor/github.com/cloudbase/garm-provider-common/errors/errors.go @@ -32,6 +32,7 @@ var ( ErrTimeout = NewTimeoutError("timed out") ErrUnprocessable = NewUnprocessableError("cannot process request") ErrNoPoolsAvailable = NewNoPoolsAvailableError("no pools available") + ErrNoCapacity = NewNoCapacityError("no capacity available") ) type baseError struct { @@ -43,7 +44,7 @@ func (b *baseError) Error() string { } // NewProviderError returns a new ProviderError -func NewProviderError(msg string, a ...interface{}) error { +func NewProviderError(msg string, a ...any) error { return &ProviderError{ baseError{ msg: fmt.Sprintf(msg, a...), @@ -66,7 +67,7 @@ func (p *ProviderError) Is(target error) bool { } // NewMissingSecretError returns a new MissingSecretError -func NewMissingSecretError(msg string, a ...interface{}) error { +func NewMissingSecretError(msg string, a ...any) error { return &MissingSecretError{ baseError{ msg: fmt.Sprintf(msg, a...), @@ -112,7 +113,7 @@ func (p *UnauthorizedError) Is(target error) bool { } // NewNotFoundError returns a new NotFoundError -func NewNotFoundError(msg string, a ...interface{}) error { +func NewNotFoundError(msg string, a ...any) error { return &NotFoundError{ baseError{ msg: fmt.Sprintf(msg, a...), @@ -158,7 +159,7 @@ func (p *DuplicateUserError) Is(target error) bool { } // NewBadRequestError returns a new BadRequestError -func NewBadRequestError(msg string, a ...interface{}) error { +func NewBadRequestError(msg string, a ...any) error { return &BadRequestError{ baseError{ msg: fmt.Sprintf(msg, a...), @@ -181,7 +182,7 @@ func (p *BadRequestError) Is(target error) bool { } // NewConflictError returns a new ConflictError -func NewConflictError(msg string, a ...interface{}) error { +func NewConflictError(msg string, a ...any) error { return &ConflictError{ baseError{ msg: fmt.Sprintf(msg, a...), @@ -204,7 +205,7 @@ func (p *ConflictError) Is(target error) bool { } // NewTimeoutError returns a new TimoutError -func NewTimeoutError(msg string, a ...interface{}) error { +func NewTimeoutError(msg string, a ...any) error { return &TimoutError{ baseError{ msg: fmt.Sprintf(msg, a...), @@ -227,15 +228,15 @@ func (p *TimoutError) Is(target error) bool { } // NewUnprocessableError returns a new UnprocessableError -func NewUnprocessableError(msg string, a ...interface{}) error { - return &TimoutError{ +func NewUnprocessableError(msg string, a ...any) error { + return &UnprocessableError{ baseError{ msg: fmt.Sprintf(msg, a...), }, } } -// TimoutError is returned when an operation times out. +// UnprocessableError is returned when a request cannot be processed. type UnprocessableError struct { baseError } @@ -249,16 +250,16 @@ func (p *UnprocessableError) Is(target error) bool { return ok } -// NewNoPoolsAvailableError returns a new UnprocessableError -func NewNoPoolsAvailableError(msg string, a ...interface{}) error { - return &TimoutError{ +// NewNoPoolsAvailableError returns a new NoPoolsAvailableError +func NewNoPoolsAvailableError(msg string, a ...any) error { + return &NoPoolsAvailableError{ baseError{ msg: fmt.Sprintf(msg, a...), }, } } -// NoPoolsAvailableError is returned when anthere are not pools available. +// NoPoolsAvailableError is returned when there are no pools available. type NoPoolsAvailableError struct { baseError } @@ -271,3 +272,26 @@ func (p *NoPoolsAvailableError) Is(target error) bool { _, ok := target.(*NoPoolsAvailableError) return ok } + +// NewNoCapacityError returns a new NoCapacityError +func NewNoCapacityError(msg string, a ...any) error { + return &NoCapacityError{ + baseError{ + msg: fmt.Sprintf(msg, a...), + }, + } +} + +// NoCapacityError is returned when there is no capacity available. +type NoCapacityError struct { + baseError +} + +func (p *NoCapacityError) Is(target error) bool { + if target == nil { + return false + } + + _, ok := target.(*NoCapacityError) + return ok +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 21e24def6..bc027c676 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -25,7 +25,7 @@ github.com/chzyer/readline # github.com/clipperhouse/uax29/v2 v2.7.0 ## explicit; go 1.18 github.com/clipperhouse/uax29/v2/graphemes -# github.com/cloudbase/garm-provider-common v0.1.9 +# github.com/cloudbase/garm-provider-common v0.1.10-0.20260627162627-e00d7529cc6f ## explicit; go 1.25.0 github.com/cloudbase/garm-provider-common/cloudconfig github.com/cloudbase/garm-provider-common/defaults