Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 36 additions & 15 deletions pkg/history/history.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ package history

import (
"encoding/json"
"io"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
)

Expand Down Expand Up @@ -208,32 +208,53 @@ func (h *History) append(message string) error {
}

func (h *History) load() error {
f, err := os.Open(h.path)
data, err := os.ReadFile(h.path)
if err != nil {
return err
}
defer f.Close()

var all []string
dec := json.NewDecoder(f)
for {
var message string
if err := dec.Decode(&message); err != nil {
if err == io.EOF {
break
}
// Count lines to pre-size the slice.
n := 0
for _, b := range data {
if b == '\n' {
n++
}
}

// Parse all lines. Each line is a JSON-encoded string (e.g. "hello").
// strconv.Unquote handles the same escape sequences as JSON and is
// much faster than json.Unmarshal for quoted strings.
all := make([]string, 0, n)
s := string(data)
for s != "" {
i := strings.IndexByte(s, '\n')
var line string
if i < 0 {
line = s
s = ""
} else {
line = s[:i]
s = s[i+1:]
}
if line == "" {
continue
}

message, err := strconv.Unquote(line)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: Silent error handling in strconv.Unquote may cause data loss

When strconv.Unquote fails on line 243, the error is silently ignored with continue. This means corrupt or malformed lines will be skipped without logging. If the history file contains invalid JSON strings due to disk corruption, application bugs, or manual editing, users will silently lose history entries. The previous implementation with json.Decoder.Decode would have failed and returned an error on the first invalid line, alerting the user.

Recommendation: Consider at minimum logging the skipped line or counting skipped entries and returning an error if too many are skipped.

if err != nil {
continue
}
all = append(all, message)
}

// Deduplicate keeping the latest occurrence of each message
seen := make(map[string]bool)
// Deduplicate keeping the latest occurrence of each message.
seen := make(map[string]struct{}, len(all))
h.Messages = make([]string, 0, len(all))
for i := len(all) - 1; i >= 0; i-- {
if seen[all[i]] {
if _, dup := seen[all[i]]; dup {
continue
}
seen[all[i]] = true
seen[all[i]] = struct{}{}
h.Messages = append(h.Messages, all[i])
}
slices.Reverse(h.Messages)
Expand Down
109 changes: 81 additions & 28 deletions pkg/modelsdev/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"os"
Expand All @@ -22,15 +23,20 @@ const (

// Store manages access to the models.dev data.
// All methods are safe for concurrent use.
//
// Use NewStore to obtain the process-wide singleton instance.
// The database is loaded on first access via GetDatabase and
// shared across all callers, avoiding redundant disk/network I/O.
type Store struct {
cacheFile string
mu sync.Mutex
db *Database
etag string // ETag from last successful fetch, used for conditional requests
}

// NewStore creates a new models.dev store.
// The database is loaded on first access via GetDatabase.
func NewStore() (*Store, error) {
// singleton holds the process-wide Store instance. It is initialised lazily
// on the first call to NewStore. All subsequent calls return the same value.
var singleton = sync.OnceValues(func() (*Store, error) {
homeDir, err := os.UserHomeDir()
if err != nil {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 HIGH: Singleton initialization failure is permanent and cannot recover

The singleton pattern using sync.OnceValues means that if os.UserHomeDir() or os.MkdirAll fails during the first call, the error is cached permanently. If the first call happens during startup when the home directory is temporarily unavailable (e.g., network mount not ready, Docker volume not attached), ALL future calls will fail with the same error even after the condition is resolved. The singleton cannot be reset, requiring a process restart.

For a long-running process, this is a severe availability issue.

Recommendation: Consider making the singleton recoverable by using a mutex-protected initialization that can retry on failure, or document this limitation clearly and ensure proper error handling at startup.

return nil, fmt.Errorf("failed to get user home directory: %w", err)
Expand All @@ -44,6 +50,15 @@ func NewStore() (*Store, error) {
return &Store{
cacheFile: filepath.Join(cacheDir, CacheFileName),
}, nil
})

// NewStore returns the process-wide singleton Store.
//
// The database is loaded lazily on the first call to GetDatabase and
// then cached in memory so that every caller shares one copy.
// The first call creates the cache directory if it does not exist.
func NewStore() (*Store, error) {
	// Delegates to the sync.OnceValues-backed initialiser: the first call
	// performs the setup, and whatever (*Store, error) pair it produces is
	// cached for the life of the process — a failed initialisation cannot
	// be retried without a restart.
	return singleton()
}

// NewDatabaseStore creates a Store pre-populated with the given database.
Expand All @@ -63,12 +78,13 @@ func (s *Store) GetDatabase(ctx context.Context) (*Database, error) {
return s.db, nil
}

db, err := loadDatabase(ctx, s.cacheFile)
db, etag, err := loadDatabase(ctx, s.cacheFile)
if err != nil {
return nil, err
}

s.db = db
s.etag = etag
return db, nil
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: Cache inconsistency if saveToCache fails after updating in-memory fields

If saveToCache fails (it is attempted inside loadDatabase, before GetDatabase assigns s.db and s.etag), the fresh data is still kept in memory while the disk cache is left stale. Calls in the current process return the in-memory db, but after a restart the old cache — with its old ETag — is loaded, so the next refresh becomes a full re-download instead of a cheap 304 revalidation.

Recommendation: Consider updating the in-memory cache only after successful disk write, or add logic to detect and recover from this inconsistency.

}

Expand Down Expand Up @@ -128,80 +144,117 @@ func (s *Store) GetModel(ctx context.Context, id string) (*Model, error) {

// loadDatabase loads the database from the local cache file or
// falls back to fetching from the models.dev API.
func loadDatabase(ctx context.Context, cacheFile string) (*Database, error) {
// It returns the database and the ETag associated with the data.
func loadDatabase(ctx context.Context, cacheFile string) (*Database, string, error) {
// Try to load from cache first
cached, err := loadFromCache(cacheFile)
if err == nil && time.Since(cached.LastRefresh) < refreshInterval {
return &cached.Database, nil
return &cached.Database, cached.ETag, nil
}

// Cache is invalid or doesn't exist, fetch from API
database, fetchErr := fetchFromAPI(ctx)
// Cache is stale or doesn't exist — try a conditional fetch with the ETag.
var etag string
if cached != nil {
etag = cached.ETag
}

database, newETag, fetchErr := fetchFromAPI(ctx, etag)
if fetchErr != nil {
// If API fetch fails, but we have cached data, use it
// If API fetch fails but we have cached data, use it regardless of age.
if cached != nil {
return &cached.Database, nil
slog.Debug("API fetch failed, using stale cache", "error", fetchErr)
return &cached.Database, cached.ETag, nil
}
return nil, "", fmt.Errorf("failed to fetch from API and no cached data available: %w", fetchErr)
}

// database is nil when the server returned 304 Not Modified.
if database == nil && cached != nil {
// Bump LastRefresh so we don't re-check until the next interval.
cached.LastRefresh = time.Now()
if saveErr := saveToCache(cacheFile, &cached.Database, cached.ETag); saveErr != nil {
slog.Warn("Failed to update cache timestamp", "error", saveErr)
}
return nil, fmt.Errorf("failed to fetch from API and no cached data available: %w", fetchErr)
return &cached.Database, cached.ETag, nil
}

// Save to cache
if err := saveToCache(cacheFile, database); err != nil {
// Log the error but don't fail the request
slog.Warn("Warning: failed to save to cache", "error", err)
// Save the fresh data to cache.
if saveErr := saveToCache(cacheFile, database, newETag); saveErr != nil {
slog.Warn("Failed to save to cache", "error", saveErr)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: No validation of cached data before returning after 304 response

When handling a 304 Not Modified response, the code returns cached.Database without validating that it is structurally sound. If the cache file was corrupted or partially written, json.Unmarshal might still succeed and produce a Database with a nil Providers map. Reads from a nil map return the zero value rather than panicking, so lookups in GetModel would presumably just report models as missing (a write to the nil map, however, would panic) — worth confirming against GetModel's implementation.

Recommendation: Add validation of the cached data structure (e.g., check that Providers is not nil) before returning it. This is good defensive programming against cache corruption.

}

return database, nil
return database, newETag, nil
}

func fetchFromAPI(ctx context.Context) (*Database, error) {
// fetchFromAPI fetches the models.dev database.
// If etag is non-empty it is sent as If-None-Match; a 304 response
// returns (nil, etag, nil) to indicate no change.
func fetchFromAPI(ctx context.Context, etag string) (*Database, string, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, ModelsDevAPIURL, http.NoBody)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
return nil, "", fmt.Errorf("failed to create request: %w", err)
}

if etag != "" {
req.Header.Set("If-None-Match", etag)
}

resp, err := (&http.Client{Timeout: 30 * time.Second}).Do(req)
if err != nil {
return nil, fmt.Errorf("failed to fetch from API: %w", err)
return nil, "", fmt.Errorf("failed to fetch from API: %w", err)
}
defer resp.Body.Close()

if resp.StatusCode == http.StatusNotModified {
slog.Debug("models.dev data not modified (304)")
return nil, etag, nil
}

if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("API returned status %d", resp.StatusCode)
return nil, "", fmt.Errorf("API returned status %d", resp.StatusCode)
}

// Read the full body then unmarshal — avoids the extra intermediate
// buffering that json.Decoder.Decode performs.
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, "", fmt.Errorf("failed to read response body: %w", err)
}

var providers map[string]Provider
if err := json.NewDecoder(resp.Body).Decode(&providers); err != nil {
return nil, fmt.Errorf("failed to decode response: %w", err)
if err := json.Unmarshal(body, &providers); err != nil {
return nil, "", fmt.Errorf("failed to decode response: %w", err)
}

newETag := resp.Header.Get("ETag")

return &Database{
Providers: providers,
UpdatedAt: time.Now(),
}, nil
}, newETag, nil
}

func loadFromCache(cacheFile string) (*CachedData, error) {
f, err := os.Open(cacheFile)
data, err := os.ReadFile(cacheFile)
if err != nil {
return nil, fmt.Errorf("failed to open cache file: %w", err)
return nil, fmt.Errorf("failed to read cache file: %w", err)
}
defer f.Close()

var cached CachedData
if err := json.NewDecoder(f).Decode(&cached); err != nil {
if err := json.Unmarshal(data, &cached); err != nil {
return nil, fmt.Errorf("failed to decode cached data: %w", err)
}

return &cached, nil
}

func saveToCache(cacheFile string, database *Database) error {
func saveToCache(cacheFile string, database *Database, etag string) error {
now := time.Now()
cached := CachedData{
Database: *database,
CachedAt: now,
LastRefresh: now,
ETag: etag,
}

data, err := json.MarshalIndent(cached, "", " ")
Expand Down
1 change: 1 addition & 0 deletions pkg/modelsdev/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,5 @@ type CachedData struct {
Database Database `json:"database"`
CachedAt time.Time `json:"cached_at"`
LastRefresh time.Time `json:"last_refresh"`
ETag string `json:"etag,omitempty"`
}
18 changes: 8 additions & 10 deletions pkg/teamloader/teamloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,9 @@ func getModelsForAgent(ctx context.Context, cfg *latest.Config, a *latest.AgentC
var models []provider.Provider
thinkingConfigured := false

// Obtain the singleton store once, outside the loop.
modelsStore, modelsStoreErr := modelsdev.NewStore()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: Singleton store initialization error is silently ignored

The modelsStore is obtained once before the loop. If modelsStoreErr is not nil, it's stored but never returned to the caller or logged. The code continues and skips the models.dev lookup when modelsStoreErr != nil. If the user hasn't configured max_tokens and modelsStoreErr is not nil, the code silently falls back to defaultMaxTokens without informing the user that the models.dev lookup failed.

This makes debugging harder when users encounter unexpected token limit behavior.

Recommendation: Log the error at an appropriate level (at least debug, preferably warn) so users can diagnose issues.


for name := range strings.SplitSeq(a.Model, ",") {
modelCfg, exists := cfg.Models[name]
isAutoModel := false
Expand All @@ -310,11 +313,7 @@ func getModelsForAgent(ctx context.Context, cfg *latest.Config, a *latest.AgentC
maxTokens := &defaultMaxTokens
if modelCfg.MaxTokens != nil {
maxTokens = modelCfg.MaxTokens
} else {
modelsStore, err := modelsdev.NewStore()
if err != nil {
return nil, false, err
}
} else if modelsStoreErr == nil {
m, err := modelsStore.GetModel(ctx, modelCfg.Provider+"/"+modelCfg.Model)
if err == nil {
maxTokens = &m.Limit.Output
Expand Down Expand Up @@ -355,6 +354,9 @@ func getModelsForAgent(ctx context.Context, cfg *latest.Config, a *latest.AgentC
func getFallbackModelsForAgent(ctx context.Context, cfg *latest.Config, a *latest.AgentConfig, runConfig *config.RuntimeConfig) ([]provider.Provider, error) {
var fallbackModels []provider.Provider

// Obtain the singleton store once, outside the loop.
modelsStore, modelsStoreErr := modelsdev.NewStore()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: Store initialization error in fallback models is never logged

In getFallbackModelsForAgent, modelsStore initialization error is silently ignored. If NewStore() fails, the error is never logged. If the user has configured fallback models that depend on models.dev for max_tokens, they'll silently get defaultMaxTokens without knowing why.

This could lead to unexpected behavior where models hit token limits earlier than expected.

Recommendation: Log the error at an appropriate level (at least debug, preferably warn) to help users diagnose issues.


for _, name := range a.GetFallbackModels() {
modelCfg, exists := cfg.Models[name]
if !exists {
Expand All @@ -371,11 +373,7 @@ func getFallbackModelsForAgent(ctx context.Context, cfg *latest.Config, a *lates
maxTokens := &defaultMaxTokens
if modelCfg.MaxTokens != nil {
maxTokens = modelCfg.MaxTokens
} else {
modelsStore, err := modelsdev.NewStore()
if err != nil {
return nil, err
}
} else if modelsStoreErr == nil {
m, err := modelsStore.GetModel(ctx, modelCfg.Provider+"/"+modelCfg.Model)
if err == nil {
maxTokens = &m.Limit.Output
Expand Down
Loading