Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .claude/settings.local.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
"Bash(grep:*)",
"Bash(git commit:*)",
"Bash(git push:*)",
"Bash(go:*)"
"Bash(go:*)",
"Bash(gh issue list:*)",
"Bash(gh issue view:*)",
"Bash(gh auth:*)",
"Bash(gh repo view:*)"
],
"deny": []
}
Expand Down
19 changes: 16 additions & 3 deletions cmd/urlmap/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ var (
jsThreshold float64
jsPoolSize int

// Cache flags
jsCacheEnabled bool
jsCacheSize int
jsCacheTTL time.Duration

// Robots.txt flags
respectRobots bool
)
Expand Down Expand Up @@ -105,6 +110,11 @@ func init() {
// Browser pool flags
rootCmd.Flags().IntVar(&jsPoolSize, "js-pool-size", 2, "Number of browser instances in the pool")

// Cache flags
rootCmd.Flags().BoolVar(&jsCacheEnabled, "js-cache", false, "Enable caching of JavaScript rendered pages")
rootCmd.Flags().IntVar(&jsCacheSize, "js-cache-size", 100, "Maximum number of cached entries")
rootCmd.Flags().DurationVar(&jsCacheTTL, "js-cache-ttl", 5*time.Minute, "Cache entry time-to-live")

// Robots.txt flags
rootCmd.Flags().BoolVar(&respectRobots, "respect-robots", false, "Respect robots.txt rules and crawl delays")

Expand Down Expand Up @@ -147,9 +157,12 @@ func runCrawl(cmd *cobra.Command, args []string) error {
UserAgent: userAgent,
Fallback: jsFallback,
AutoDetect: jsAuto || jsAutoStrict,
StrictMode: jsAutoStrict,
Threshold: jsThreshold,
PoolSize: jsPoolSize,
StrictMode: jsAutoStrict,
Threshold: jsThreshold,
PoolSize: jsPoolSize,
CacheEnabled: jsCacheEnabled,
CacheSize: jsCacheSize,
CacheTTL: jsCacheTTL,
}
}

Expand Down
75 changes: 63 additions & 12 deletions internal/client/js_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ type JSClient struct {
pool *BrowserPool
config *JSConfig
logger *slog.Logger
cache *RenderCache
}

// NewJSClient creates a new JavaScript client with the given configuration
Expand Down Expand Up @@ -40,6 +41,14 @@ func NewJSClient(config *JSConfig, logger *slog.Logger) (*JSClient, error) {
logger: logger,
}

// Create cache if enabled
if config.CacheEnabled {
client.cache = NewRenderCache(config.CacheSize, config.CacheTTL)
logger.Info("JavaScript render cache enabled",
"max_size", config.CacheSize,
"ttl", config.CacheTTL)
}

return client, nil
}

Expand All @@ -54,6 +63,30 @@ func (c *JSClient) RenderPage(ctx context.Context, targetURL string) (string, er

// Get implements a similar interface to the HTTP client for compatibility
func (c *JSClient) Get(ctx context.Context, targetURL string) (*JSResponse, error) {
// Check cache first if enabled
if c.cache != nil {
if entry, hit := c.cache.Get(targetURL); hit {
c.logger.Debug("Cache hit for URL", "url", targetURL)

// Parse URL for response metadata
parsedURL, err := url.Parse(targetURL)
if err != nil {
return nil, fmt.Errorf("failed to parse URL: %w", err)
}

return &JSResponse{
URL: targetURL,
Content: entry.Content,
Status: entry.StatusCode,
Headers: entry.Headers,
Host: parsedURL.Host,
FromCache: true,
}, nil
}
c.logger.Debug("Cache miss for URL", "url", targetURL)
}

// Not in cache, render the page
content, err := c.RenderPage(ctx, targetURL)
if err != nil {
return nil, err
Expand All @@ -65,13 +98,22 @@ func (c *JSClient) Get(ctx context.Context, targetURL string) (*JSResponse, erro
return nil, fmt.Errorf("failed to parse URL: %w", err)
}

return &JSResponse{
URL: targetURL,
Content: content,
Status: 200, // Assume success if we got content
Headers: make(map[string]string),
Host: parsedURL.Host,
}, nil
response := &JSResponse{
URL: targetURL,
Content: content,
Status: 200, // Assume success if we got content
Headers: make(map[string]string),
Host: parsedURL.Host,
FromCache: false,
}

// Store in cache if enabled
if c.cache != nil {
c.cache.Set(targetURL, content, response.Headers, response.Status)
c.logger.Debug("Stored render result in cache", "url", targetURL)
}

return response, nil
}

// Close cleans up the JavaScript client resources
Expand All @@ -94,13 +136,22 @@ func (c *JSClient) GetPoolStats() map[string]interface{} {
return c.pool.GetPoolStats()
}

// GetCacheStats reports the statistics exposed by the client's render cache.
// It returns nil when the client has no cache attached.
func (c *JSClient) GetCacheStats() map[string]interface{} {
	if c.cache != nil {
		return c.cache.Stats()
	}
	return nil
}

// JSResponse represents a response from JavaScript rendering
type JSResponse struct {
URL string
Content string
Status int
Headers map[string]string
Host string
URL string
Content string
Status int
Headers map[string]string
Host string
FromCache bool // Indicates if this response was served from cache
}

// String returns the rendered HTML content
Expand Down
159 changes: 159 additions & 0 deletions internal/client/js_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,162 @@ func TestJSResponse_StatusCode(t *testing.T) {
t.Errorf("Expected status 200, got %d", status)
}
}

// TestJSClient_CacheHit issues two Get calls for the same URL with caching
// enabled and checks that only the second one reports FromCache, that both
// return the same content, and that the cache stats report a size of 1.
func TestJSClient_CacheHit(t *testing.T) {
	// Create test server
	testServer := shared.CreateBasicTestServer()
	defer testServer.Close()

	logger := slog.Default()
	config := &JSConfig{
		Enabled:      true,
		BrowserType:  "chromium",
		Headless:     true,
		Timeout:      30 * time.Second,
		WaitFor:      "networkidle",
		PoolSize:     1,
		CacheEnabled: true,
		CacheSize:    10,
		CacheTTL:     1 * time.Hour, // long enough that the entry cannot expire mid-test
	}

	client, err := NewJSClient(config, logger)
	if err != nil {
		t.Fatalf("Failed to create JS client: %v", err)
	}
	defer client.Close()

	ctx := context.Background()

	// First request - should not be cached
	response1, err := client.Get(ctx, testServer.URL)
	if err != nil {
		t.Fatalf("Failed to get page: %v", err)
	}

	if response1.FromCache {
		t.Error("First request should not be from cache")
	}

	// Second request - should be cached
	response2, err := client.Get(ctx, testServer.URL)
	if err != nil {
		t.Fatalf("Failed to get cached page: %v", err)
	}

	if !response2.FromCache {
		t.Error("Second request should be from cache")
	}

	// Content should be the same
	if response1.Content != response2.Content {
		t.Error("Cached content should match original content")
	}

	// Check cache stats
	cacheStats := client.GetCacheStats()
	if cacheStats == nil {
		t.Fatal("Cache stats should not be nil")
	}

	// Use the comma-ok assertion so a missing or non-int "size" entry fails
	// this test with a readable message instead of panicking.
	size, ok := cacheStats["size"].(int)
	if !ok {
		t.Fatalf("Expected cache stats \"size\" to be an int, got %T (%v)", cacheStats["size"], cacheStats["size"])
	}

	if size != 1 {
		t.Errorf("Expected cache size 1, got %v", size)
	}
}

// TestJSClient_CacheExpiration configures a 100ms cache TTL, fetches a URL,
// sleeps past the TTL, fetches again, and expects neither response to report
// FromCache.
func TestJSClient_CacheExpiration(t *testing.T) {
	srv := shared.CreateBasicTestServer()
	defer srv.Close()

	cfg := &JSConfig{
		Enabled:      true,
		BrowserType:  "chromium",
		Headless:     true,
		Timeout:      30 * time.Second,
		WaitFor:      "networkidle",
		PoolSize:     1,
		CacheEnabled: true,
		CacheSize:    10,
		CacheTTL:     100 * time.Millisecond, // deliberately tiny so the entry expires during the test
	}

	jsClient, err := NewJSClient(cfg, slog.Default())
	if err != nil {
		t.Fatalf("Failed to create JS client: %v", err)
	}
	defer jsClient.Close()

	ctx := context.Background()

	// Initial fetch: nothing has been requested yet, so it must not be a cache hit.
	first, err := jsClient.Get(ctx, srv.URL)
	if err != nil {
		t.Fatalf("Failed to get page: %v", err)
	}
	if first.FromCache {
		t.Error("First request should not be from cache")
	}

	// Sleep longer than the configured TTL before asking again.
	time.Sleep(150 * time.Millisecond)

	// Follow-up fetch after the TTL: the earlier entry must no longer be served.
	second, err := jsClient.Get(ctx, srv.URL)
	if err != nil {
		t.Fatalf("Failed to get page after expiration: %v", err)
	}
	if second.FromCache {
		t.Error("Request after expiration should not be from cache")
	}
}

// TestJSClient_CacheDisabled fetches the same URL twice with CacheEnabled set
// to false and expects that no response reports FromCache and that
// GetCacheStats returns nil.
func TestJSClient_CacheDisabled(t *testing.T) {
	srv := shared.CreateBasicTestServer()
	defer srv.Close()

	cfg := &JSConfig{
		Enabled:      true,
		BrowserType:  "chromium",
		Headless:     true,
		Timeout:      30 * time.Second,
		WaitFor:      "networkidle",
		PoolSize:     1,
		CacheEnabled: false, // the condition under test
	}

	jsClient, err := NewJSClient(cfg, slog.Default())
	if err != nil {
		t.Fatalf("Failed to create JS client: %v", err)
	}
	defer jsClient.Close()

	ctx := context.Background()

	// Request the same URL twice and remember whether either response
	// claimed to come from the cache.
	servedFromCache := false
	for attempt := 0; attempt < 2; attempt++ {
		resp, err := jsClient.Get(ctx, srv.URL)
		if err != nil {
			t.Fatalf("Failed to get page: %v", err)
		}
		servedFromCache = servedFromCache || resp.FromCache
	}

	if servedFromCache {
		t.Error("No requests should be from cache when caching is disabled")
	}

	// Without a cache there are no statistics to report.
	if stats := jsClient.GetCacheStats(); stats != nil {
		t.Error("Cache stats should be nil when caching is disabled")
	}
}
44 changes: 33 additions & 11 deletions internal/client/js_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,22 +40,34 @@ type JSConfig struct {

// PoolSize specifies the number of browser instances in the pool
PoolSize int

// CacheEnabled indicates whether to cache rendered pages
CacheEnabled bool

// CacheSize specifies the maximum number of cache entries
CacheSize int

// CacheTTL specifies how long cache entries remain valid
CacheTTL time.Duration
}

// DefaultJSConfig returns a default JavaScript configuration
func DefaultJSConfig() *JSConfig {
return &JSConfig{
Enabled: false,
BrowserType: "chromium",
Headless: true,
Timeout: 30 * time.Second,
WaitFor: "networkidle",
UserAgent: "urlmap/1.0",
AutoDetect: false,
StrictMode: false,
Threshold: 0.5,
Fallback: true,
PoolSize: 2,
Enabled: false,
BrowserType: "chromium",
Headless: true,
Timeout: 30 * time.Second,
WaitFor: "networkidle",
UserAgent: "urlmap/1.0",
AutoDetect: false,
StrictMode: false,
Threshold: 0.5,
Fallback: true,
PoolSize: 2,
CacheEnabled: false,
CacheSize: 100,
CacheTTL: 5 * time.Minute,
}
}

Expand Down Expand Up @@ -101,5 +113,15 @@ func (c *JSConfig) Validate() error {
return fmt.Errorf("pool size must be positive, got: %v", c.PoolSize)
}

// Validate cache configuration
if c.CacheEnabled {
if c.CacheSize <= 0 {
return fmt.Errorf("cache size must be positive when cache is enabled, got: %v", c.CacheSize)
}
if c.CacheTTL <= 0 {
return fmt.Errorf("cache TTL must be positive when cache is enabled, got: %v", c.CacheTTL)
}
}

return nil
}
Loading
Loading