Skip to content

Commit dcf53c4

Browse files
Prepare project for PyPI packaging (#16)
* feat: Prepare project for PyPI packaging

  This commit prepares the `scratchgpt` project for packaging and distribution on PyPI.

  Key changes include:
  - Updated `pyproject.toml` with a project description, classifiers, and repository URLs to improve package metadata.
  - Populated `scratchgpt/__init__.py` to define a clean public API, exposing the core `TransformerLanguageModel` and configuration classes.
  - Added tokenizer dependencies to the `dev` group to ensure the test suite runs correctly in a development environment.

* feat: Expose interfaces and implementations in public API

  Based on user feedback, this commit expands the public API of the `scratchgpt` package to make it more extensible and reusable.

  Key changes include:
  - Updated `scratchgpt/__init__.py` to export interfaces, classes, and functions from the `tokenizer`, `data`, `training`, and `model_io` modules.
  - This allows users to import and build upon the library's core components, such as `DataSource`, `Trainer`, and `Tokenizer`.

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent 03fe4e1 commit dcf53c4

3 files changed

Lines changed: 195 additions & 107 deletions

File tree

pyproject.toml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
[project]
22
name = "scratchgpt"
33
version = "0.3.0"
4-
description = "Add your description here"
4+
description = "A small-scale transformer-based language model implemented from scratch in Python."
55
authors = [
66
{ name = "Aleksandr Yeganov", email = "ayeganov@gmail.com"},
77
{ name = "Dario Cazzani", email ="dariocazzani@gmail.com" }
88
]
99
readme = "README.md"
1010
requires-python = ">=3.12"
11+
classifiers = [
12+
"Programming Language :: Python :: 3",
13+
"License :: OSI Approved :: MIT License",
14+
"Operating System :: OS Independent",
15+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
16+
]
1117
dependencies = [
1218
"numpy>=2.3.2",
1319
"ptflops>=0.7.5",
@@ -19,6 +25,10 @@ dependencies = [
1925
"types-tqdm>=4.67.0.20250809",
2026
]
2127

28+
[project.urls]
29+
Homepage = "https://github.com/LabStrangeLoop/scratchgpt"
30+
Repository = "https://github.com/LabStrangeLoop/scratchgpt"
31+
2232
[project.optional-dependencies]
2333
hf-tokenizers = [
2434
"tokenizers>=0.19.0",
@@ -32,6 +42,8 @@ dev = [
3242
"mypy>=1.17.1",
3343
"pytest>=8.4.1",
3444
"ruff>=0.1.0",
45+
"tokenizers>=0.19.0",
46+
"huggingface-hub>=0.34.4",
3547
]
3648

3749
[tool.pytest.ini_options]

scratchgpt/__init__.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""
2+
ScratchGPT: A small-scale transformer-based language model implemented from scratch.
3+
"""
4+
5+
from scratchgpt.config import (
6+
ScratchGPTArchitecture,
7+
ScratchGPTConfig,
8+
ScratchGPTTraining,
9+
)
10+
from scratchgpt.data.datasource import (
11+
ByteSizableDataSource,
12+
DataSource,
13+
FileDataSource,
14+
FolderDataSource,
15+
LineByLineFileDataSource,
16+
)
17+
from scratchgpt.model.model import TransformerLanguageModel
18+
from scratchgpt.model_io import (
19+
ModelLoadFailedError,
20+
TokenizerLoadFailedError,
21+
get_best_model_weights_path,
22+
get_latest_model_weights_path,
23+
get_tokenizer,
24+
get_tokenizer_path,
25+
load_model,
26+
load_tokenizer,
27+
save_tokenizer,
28+
)
29+
from scratchgpt.tokenizer.base_tokenizer import (
30+
SerializableTokenizer,
31+
Tokenizer,
32+
register_tokenizer,
33+
)
34+
from scratchgpt.tokenizer.char_tokenizer import CharTokenizer, Utf8Tokenizer
35+
from scratchgpt.tokenizer.hf_tokenizer import HuggingFaceTokenizer
36+
from scratchgpt.tokenizer.tiktoken import TiktokenWrapper
37+
from scratchgpt.training.trainer import Trainer, get_dtype_for_vocab_size
38+
39+
__all__ = [
40+
# Core Model and Config
41+
"TransformerLanguageModel",
42+
"ScratchGPTConfig",
43+
"ScratchGPTArchitecture",
44+
"ScratchGPTTraining",
45+
# Data Sources
46+
"DataSource",
47+
"ByteSizableDataSource",
48+
"FileDataSource",
49+
"FolderDataSource",
50+
"LineByLineFileDataSource",
51+
# Model I/O
52+
"load_model",
53+
"load_tokenizer",
54+
"save_tokenizer",
55+
"get_tokenizer",
56+
"get_best_model_weights_path",
57+
"get_latest_model_weights_path",
58+
"get_tokenizer_path",
59+
"ModelLoadFailedError",
60+
"TokenizerLoadFailedError",
61+
# Tokenizers
62+
"Tokenizer",
63+
"SerializableTokenizer",
64+
"register_tokenizer",
65+
"CharTokenizer",
66+
"Utf8Tokenizer",
67+
"HuggingFaceTokenizer",
68+
"TiktokenWrapper",
69+
# Training
70+
"Trainer",
71+
"get_dtype_for_vocab_size",
72+
]

0 commit comments

Comments (0)