|
1 | | -from collections.abc import Iterator |
2 | | -from pathlib import Path |
3 | | -from typing import Protocol, runtime_checkable |
| 1 | +from typing import Protocol |
| 2 | + |
| 3 | +from scratchgpt.core.types import DictTensorLoader |
| 4 | +from scratchgpt.tokenizer.base_tokenizer import Tokenizer |
4 | 5 |
|
5 | 6 |
|
6 | | -@runtime_checkable |
class DataSource(Protocol):
    """
    Structural interface for objects that can build training/validation loaders.

    Any class exposing a compatible ``get_dataloaders`` method satisfies this
    protocol — no explicit inheritance is required (structural subtyping).
    """

    def get_dataloaders(
        self,
        tokenizer: Tokenizer,
        block_size: int,
        batch_size: int,
        splits: tuple[float, float],
        random_seed: int,
    ) -> tuple[DictTensorLoader, DictTensorLoader | None]:
        """
        Prepare the underlying data and build the loaders.

        Returns a ``(train_loader, val_loader)`` pair; the validation loader
        may be ``None`` when no validation split is produced.
        """
        ...
18 | | - |
19 | | - |
@runtime_checkable
class ByteSizableDataSource(DataSource, Protocol):
    """
    Optional extension of ``DataSource`` for sources with a known byte size.

    Implementors report their total payload size, which callers can use for
    things like progress reporting.
    """

    def total_bytes(self) -> int:
        """Return the total size of the underlying data, in bytes."""
        ...
27 | | - |
28 | | - |
class FileDataSource(ByteSizableDataSource):
    """A single-sample source: the whole file's text is one sample."""

    def __init__(self, file_path: Path):
        # Fail fast on a bad path rather than at first iteration.
        if not file_path.is_file():
            raise FileNotFoundError(f"Source file not found at: {file_path}")
        self._file_path = file_path

    def __len__(self) -> int:
        # Exactly one sample: the entire file content.
        return 1

    def __iter__(self) -> Iterator[str]:
        # Undecodable bytes are silently dropped rather than raising.
        yield self._file_path.read_text(encoding="utf-8", errors="ignore")

    def total_bytes(self) -> int:
        # On-disk size in bytes, independent of text decoding.
        return self._file_path.stat().st_size
46 | | - |
47 | | - |
class FolderDataSource(ByteSizableDataSource):
    """Recursively walks a folder and streams each file's text line by line."""

    def __init__(self, folder_path: Path):
        if not folder_path.is_dir():
            raise NotADirectoryError(f"Source path is not a directory: {folder_path}")

        # Hidden (dot-prefixed) files are skipped; all other files are kept.
        self._file_paths = [
            candidate
            for candidate in folder_path.rglob("*")
            if candidate.is_file() and not candidate.name.startswith(".")
        ]
        print(f"✅ Found {len(self._file_paths)} files to process in {folder_path}.")

    def __len__(self) -> int:
        # Number of files discovered, not the number of lines or samples.
        return len(self._file_paths)

    def __iter__(self) -> Iterator[str]:
        # Each yielded sample is one line (trailing newline kept) of some file.
        for path in self._file_paths:
            with open(path, encoding="utf-8", errors="ignore") as handle:
                yield from handle

    def total_bytes(self) -> int:
        # Sum of on-disk sizes across every discovered file.
        return sum(path.stat().st_size for path in self._file_paths)
68 | | - |
69 | | - |
class LineByLineFileDataSource(ByteSizableDataSource):
    """Treats each line of a single text file as an independent sample."""

    def __init__(self, file_path: Path):
        if not file_path.is_file():
            raise FileNotFoundError(f"Source file not found at: {file_path}")
        self._file_path = file_path

        # One upfront pass over the file so __len__ is O(1) afterwards.
        print("Pre-counting lines for progress bar...")
        with open(self._file_path, encoding="utf-8", errors="ignore") as handle:
            self._line_count = sum(1 for _ in handle)

    def __len__(self) -> int:
        # Line count captured at construction time.
        return self._line_count

    def __iter__(self) -> Iterator[str]:
        # Lines keep their trailing newline, matching plain file iteration.
        with open(self._file_path, encoding="utf-8", errors="ignore") as handle:
            yield from handle

    def total_bytes(self) -> int:
        # On-disk size in bytes, independent of text decoding.
        return self._file_path.stat().st_size