Skip to content

Commit dcf53c4

Browse files
Prepare project for PyPI packaging (#16)
* feat: Prepare project for PyPI packaging

  This commit prepares the `scratchgpt` project for packaging and distribution on PyPI.

  Key changes include:
  - Updated `pyproject.toml` with a project description, classifiers, and repository URLs to improve package metadata.
  - Populated `scratchgpt/__init__.py` to define a clean public API, exposing the core `TransformerLanguageModel` and configuration classes.
  - Added tokenizer dependencies to the `dev` group to ensure the test suite runs correctly in a development environment.

* feat: Expose interfaces and implementations in public API

  Based on user feedback, this commit expands the public API of the `scratchgpt` package to make it more extensible and reusable.

  Key changes include:
  - Updated `scratchgpt/__init__.py` to export interfaces, classes, and functions from the `tokenizer`, `data`, `training`, and `model_io` modules.
  - This allows users to import and build upon the library's core components, such as `DataSource`, `Trainer`, and `Tokenizer`.

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent 03fe4e1 commit dcf53c4

3 files changed

Lines changed: 195 additions & 107 deletions

File tree

pyproject.toml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
[project]
22
name = "scratchgpt"
33
version = "0.3.0"
4-
description = "Add your description here"
4+
description = "A small-scale transformer-based language model implemented from scratch in Python."
55
authors = [
66
{ name = "Aleksandr Yeganov", email = "ayeganov@gmail.com"},
77
{ name = "Dario Cazzani", email ="dariocazzani@gmail.com" }
88
]
99
readme = "README.md"
1010
requires-python = ">=3.12"
11+
classifiers = [
12+
"Programming Language :: Python :: 3",
13+
"License :: OSI Approved :: MIT License",
14+
"Operating System :: OS Independent",
15+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
16+
]
1117
dependencies = [
1218
"numpy>=2.3.2",
1319
"ptflops>=0.7.5",
@@ -19,6 +25,10 @@ dependencies = [
1925
"types-tqdm>=4.67.0.20250809",
2026
]
2127

28+
[project.urls]
29+
Homepage = "https://github.com/LabStrangeLoop/scratchgpt"
30+
Repository = "https://github.com/LabStrangeLoop/scratchgpt"
31+
2232
[project.optional-dependencies]
2333
hf-tokenizers = [
2434
"tokenizers>=0.19.0",
@@ -32,6 +42,8 @@ dev = [
3242
"mypy>=1.17.1",
3343
"pytest>=8.4.1",
3444
"ruff>=0.1.0",
45+
"tokenizers>=0.19.0",
46+
"huggingface-hub>=0.34.4",
3547
]
3648

3749
[tool.pytest.ini_options]

scratchgpt/__init__.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""
2+
ScratchGPT: A small-scale transformer-based language model implemented from scratch.
3+
"""
4+
5+
from scratchgpt.config import (
6+
ScratchGPTArchitecture,
7+
ScratchGPTConfig,
8+
ScratchGPTTraining,
9+
)
10+
from scratchgpt.data.datasource import (
11+
ByteSizableDataSource,
12+
DataSource,
13+
FileDataSource,
14+
FolderDataSource,
15+
LineByLineFileDataSource,
16+
)
17+
from scratchgpt.model.model import TransformerLanguageModel
18+
from scratchgpt.model_io import (
19+
ModelLoadFailedError,
20+
TokenizerLoadFailedError,
21+
get_best_model_weights_path,
22+
get_latest_model_weights_path,
23+
get_tokenizer,
24+
get_tokenizer_path,
25+
load_model,
26+
load_tokenizer,
27+
save_tokenizer,
28+
)
29+
from scratchgpt.tokenizer.base_tokenizer import (
30+
SerializableTokenizer,
31+
Tokenizer,
32+
register_tokenizer,
33+
)
34+
from scratchgpt.tokenizer.char_tokenizer import CharTokenizer, Utf8Tokenizer
35+
from scratchgpt.tokenizer.hf_tokenizer import HuggingFaceTokenizer
36+
from scratchgpt.tokenizer.tiktoken import TiktokenWrapper
37+
from scratchgpt.training.trainer import Trainer, get_dtype_for_vocab_size
38+
39+
__all__ = [
40+
# Core Model and Config
41+
"TransformerLanguageModel",
42+
"ScratchGPTConfig",
43+
"ScratchGPTArchitecture",
44+
"ScratchGPTTraining",
45+
# Data Sources
46+
"DataSource",
47+
"ByteSizableDataSource",
48+
"FileDataSource",
49+
"FolderDataSource",
50+
"LineByLineFileDataSource",
51+
# Model I/O
52+
"load_model",
53+
"load_tokenizer",
54+
"save_tokenizer",
55+
"get_tokenizer",
56+
"get_best_model_weights_path",
57+
"get_latest_model_weights_path",
58+
"get_tokenizer_path",
59+
"ModelLoadFailedError",
60+
"TokenizerLoadFailedError",
61+
# Tokenizers
62+
"Tokenizer",
63+
"SerializableTokenizer",
64+
"register_tokenizer",
65+
"CharTokenizer",
66+
"Utf8Tokenizer",
67+
"HuggingFaceTokenizer",
68+
"TiktokenWrapper",
69+
# Training
70+
"Trainer",
71+
"get_dtype_for_vocab_size",
72+
]

0 commit comments

Comments (0)