Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,42 @@ git push --follow-tags
## Help

If you have any questions or run into any problems, please create a GitHub issue and we'll try our best to help.

## Configurable Batching

The scraper now supports configurable batching to control the rate and size of data sent to Typesense. This helps prevent overwhelming the Typesense server with too many small requests.

### Configuration Options

You can configure batching using either **JSON config files** or **environment variables**.

#### JSON Config File

Add these parameters to your JSON configuration file:

```json
{
"index_name": "my_docs",
"batch_size": 400,
"buffer_size_limit": 100,
"flush_interval_seconds": 60,
"start_urls": [...],
"selectors": {...}
}
```

#### Environment Variables

Set these environment variables to configure batching behavior:

- `TYPESENSE_BUFFER_SIZE_LIMIT`: Maximum number of records to buffer before flushing (default: 1000)
- `TYPESENSE_FLUSH_INTERVAL_SECONDS`: Time interval in seconds to flush buffered records (default: 60)

### How It Works

The scraper will flush records to Typesense when either:

1. The buffer reaches the size limit (`buffer_size_limit`)
2. The time interval has elapsed (`flush_interval_seconds`)

This ensures a controlled, predictable rate of data ingestion that won't overwhelm your Typesense server.
2 changes: 2 additions & 0 deletions configs/public/typesense_docs.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
"text": ".content__default p, .content__default ul li, .content__default table tbody tr"
}
},
"buffer_size_limit": 5000,
"flush_interval_seconds": 1,
"scrape_start_urls": false,
"strip_chars": " .,;:#",
"custom_settings": {
Expand Down
7 changes: 7 additions & 0 deletions scraper/src/config/config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class ConfigLoader:
allowed_domains = None
api_key = None
app_id = None
buffer_size_limit = 1000
flush_interval_seconds = 60
custom_settings = None
dns_resolver = None
extra_records = []
Expand Down Expand Up @@ -119,6 +121,11 @@ def _parse(self):
if self.index_name_tmp is None:
self.index_name_tmp = os.environ.get('INDEX_NAME_TMP',
f'{self.index_name}_{int(datetime.now().timestamp())}')
if self.buffer_size_limit is None:
self.buffer_size_limit = os.environ.get('TYPESENSE_BUFFER_SIZE_LIMIT', 1000)

if self.flush_interval_seconds is None:
self.flush_interval_seconds = os.environ.get('TYPESENSE_FLUSH_INTERVAL_SECONDS', 60)

# Parse config
self.selectors = SelectorsParser().parse(self.selectors)
Expand Down
12 changes: 12 additions & 0 deletions scraper/src/config/config_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,15 @@ def validate(self):
if self.config.nb_hits_max and not isinstance(self.config.nb_hits_max,
int):
raise Exception('nb_hits_max should be integer')

if self.config.buffer_size_limit is not None and not isinstance(self.config.buffer_size_limit, int):
raise Exception('buffer_size_limit should be an integer')

if self.config.buffer_size_limit is not None and not self.config.buffer_size_limit > 0:
raise Exception('buffer_size_limit should be a positive integer')

if self.config.flush_interval_seconds is not None and not isinstance(self.config.flush_interval_seconds, int):
raise Exception('flush_interval_seconds should be an integer')

if self.config.flush_interval_seconds is not None and not self.config.flush_interval_seconds > 0:
raise Exception('flush_interval_seconds should be a positive integer')
4 changes: 3 additions & 1 deletion scraper/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ def run_config(config):
typesense_helper = TypesenseHelper(
config.index_name,
config.index_name_tmp,
config.custom_settings
config.custom_settings,
config.buffer_size_limit,
config.flush_interval_seconds,
)
typesense_helper.create_tmp_collection()

Expand Down
69 changes: 69 additions & 0 deletions scraper/src/tests/config_loader/buffer_size_limit_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# coding: utf-8
from ...config.config_loader import ConfigLoader
from .abstract import config
import pytest


class TestBufferSizeLimit:
    """Checks how ConfigLoader handles the ``buffer_size_limit`` option.

    Covers the default value, accepted custom values, and the error
    messages raised for non-positive or non-integer values.
    """

    def test_default_buffer_size_limit(self):
        """Should use default buffer_size_limit of 1000 when not specified"""
        raw_config = config({})

        loader = ConfigLoader(raw_config)

        assert loader.buffer_size_limit == 1000

    def test_custom_buffer_size_limit(self):
        """Should use custom buffer_size_limit when specified in config"""
        raw_config = config({"buffer_size_limit": 500})

        loader = ConfigLoader(raw_config)

        assert loader.buffer_size_limit == 500

    def test_buffer_size_limit_zero(self):
        """Should raise exception when buffer_size_limit is 0"""
        raw_config = config({"buffer_size_limit": 0})

        with pytest.raises(Exception, match="buffer_size_limit should be a positive integer"):
            ConfigLoader(raw_config)

    def test_buffer_size_limit_negative(self):
        """Should raise exception when buffer_size_limit is negative"""
        raw_config = config({"buffer_size_limit": -100})

        with pytest.raises(Exception, match="buffer_size_limit should be a positive integer"):
            ConfigLoader(raw_config)

    def test_buffer_size_limit_string(self):
        """Should raise exception when buffer_size_limit is a string"""
        raw_config = config({"buffer_size_limit": "500"})

        with pytest.raises(Exception, match="buffer_size_limit should be an integer"):
            ConfigLoader(raw_config)

    def test_buffer_size_limit_float(self):
        """Should raise exception when buffer_size_limit is a float"""
        raw_config = config({"buffer_size_limit": 500.5})

        with pytest.raises(Exception, match="buffer_size_limit should be an integer"):
            ConfigLoader(raw_config)

    def test_buffer_size_limit_large_value(self):
        """Should accept large buffer_size_limit values"""
        raw_config = config({"buffer_size_limit": 10000})

        loader = ConfigLoader(raw_config)

        assert loader.buffer_size_limit == 10000

    def test_buffer_size_limit_one(self):
        """Should accept buffer_size_limit of 1"""
        # 1 is the smallest value that is still a positive integer.
        raw_config = config({"buffer_size_limit": 1})

        loader = ConfigLoader(raw_config)

        assert loader.buffer_size_limit == 1
69 changes: 69 additions & 0 deletions scraper/src/tests/config_loader/flush_interval_seconds_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# coding: utf-8
from ...config.config_loader import ConfigLoader
from .abstract import config
import pytest


class TestFlushIntervalSeconds:
    """Checks how ConfigLoader handles the ``flush_interval_seconds`` option.

    Covers the default value, accepted custom values, and the error
    messages raised for non-positive or non-integer values.
    """

    def test_default_flush_interval_seconds(self):
        """Should use default flush_interval_seconds of 60 when not specified"""
        raw_config = config({})

        loader = ConfigLoader(raw_config)

        assert loader.flush_interval_seconds == 60

    def test_custom_flush_interval_seconds(self):
        """Should use custom flush_interval_seconds when specified in config"""
        raw_config = config({"flush_interval_seconds": 30})

        loader = ConfigLoader(raw_config)

        assert loader.flush_interval_seconds == 30

    def test_flush_interval_seconds_zero(self):
        """Should raise exception when flush_interval_seconds is 0"""
        raw_config = config({"flush_interval_seconds": 0})

        with pytest.raises(Exception, match="flush_interval_seconds should be a positive integer"):
            ConfigLoader(raw_config)

    def test_flush_interval_seconds_negative(self):
        """Should raise exception when flush_interval_seconds is negative"""
        raw_config = config({"flush_interval_seconds": -30})

        with pytest.raises(Exception, match="flush_interval_seconds should be a positive integer"):
            ConfigLoader(raw_config)

    def test_flush_interval_seconds_string(self):
        """Should raise exception when flush_interval_seconds is a string"""
        raw_config = config({"flush_interval_seconds": "30"})

        with pytest.raises(Exception, match="flush_interval_seconds should be an integer"):
            ConfigLoader(raw_config)

    def test_flush_interval_seconds_float(self):
        """Should raise exception when flush_interval_seconds is a float"""
        raw_config = config({"flush_interval_seconds": 30.5})

        with pytest.raises(Exception, match="flush_interval_seconds should be an integer"):
            ConfigLoader(raw_config)

    def test_flush_interval_seconds_large_value(self):
        """Should accept large flush_interval_seconds values"""
        raw_config = config({"flush_interval_seconds": 3600})

        loader = ConfigLoader(raw_config)

        assert loader.flush_interval_seconds == 3600

    def test_flush_interval_seconds_one(self):
        """Should accept flush_interval_seconds of 1"""
        # 1 is the smallest value that is still a positive integer.
        raw_config = config({"flush_interval_seconds": 1})

        loader = ConfigLoader(raw_config)

        assert loader.flush_interval_seconds == 1
14 changes: 7 additions & 7 deletions scraper/src/tests/typesense_helper/commit_tmp_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def typesense_client():
def test_create_tmp_collection(typesense_client):
"""Test that a temporary collection is created with the expected schema"""
# Arrange
original_helper = TypesenseHelper('test_alias', 'collection', {})
original_helper = TypesenseHelper('test_alias', 'collection', {}, 1000, 60)
original_helper.create_tmp_collection()

# Act
Expand Down Expand Up @@ -305,7 +305,7 @@ def test_create_tmp_collection_already_exists(typesense_client):
'token_separators': ['_', '-'],
}
)
original_helper = TypesenseHelper('test_alias', 'collection', {})
original_helper = TypesenseHelper('test_alias', 'collection', {}, 1000, 60)
original_helper.create_tmp_collection()

# Act
Expand Down Expand Up @@ -458,7 +458,7 @@ def test_add_records(typesense_client):
url = "http://example.com"
from_sitemap = True

helper = TypesenseHelper('test_alias', 'collection', {})
helper = TypesenseHelper('test_alias', 'collection', {}, 1, 60)
helper.create_tmp_collection()

# Call the method under test
Expand Down Expand Up @@ -615,7 +615,7 @@ def test_commit_tmp_collection(typesense_client):
url = "http://example.com"
from_sitemap = True

helper = TypesenseHelper('test_alias', 'collection', {})
helper = TypesenseHelper('test_alias', 'collection', {}, 1000, 60)
helper.create_tmp_collection()

original_synonyms = typesense_client.collections['collection'].synonyms.retrieve()[
Expand All @@ -630,7 +630,7 @@ def test_commit_tmp_collection(typesense_client):
helper.commit_tmp_collection()

# Act
tmp_collection_helper = TypesenseHelper('test_alias', 'collection_tmp', {})
tmp_collection_helper = TypesenseHelper('test_alias', 'collection_tmp', {}, 1000, 60)
tmp_collection_helper.create_tmp_collection()
tmp_collection_helper.add_records(records, url, from_sitemap)
tmp_collection_helper.commit_tmp_collection()
Expand Down Expand Up @@ -728,7 +728,7 @@ def test_commit_tmp_collection_with_curation_rules(typesense_client):
url = "http://example.com"
from_sitemap = True

helper = TypesenseHelper('test_alias_curation', 'collection', {})
helper = TypesenseHelper('test_alias_curation', 'collection', {}, 1000, 60)
helper.create_tmp_collection()

override = {
Expand Down Expand Up @@ -757,7 +757,7 @@ def test_commit_tmp_collection_with_curation_rules(typesense_client):

# Act
tmp_collection_helper = TypesenseHelper(
'test_alias_curation', 'collection_tmp_curation', {}
'test_alias_curation', 'collection_tmp_curation', {}, 1000, 60
)
tmp_collection_helper.create_tmp_collection()
tmp_collection_helper.add_records(records, url, from_sitemap)
Expand Down
Loading