Skip to content

Commit 6a7151d

Browse files
authored
Merge pull request #216 from CausalInferenceLab/feature/A1-db-explorer
Feature/a1 db explorer
2 parents 11e1379 + 3e40b33 commit 6a7151d

File tree

12 files changed

+627
-26
lines changed

12 files changed

+627
-26
lines changed

docs/BaseComponent_ko.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,13 +190,15 @@ retriever = FunctionalComponent(my_retriever, name="MyRetriever", hook=hook)
190190

191191
```python
192192
from lang2sql.core.hooks import MemoryHook
193+
from lang2sql.flows.baseline import SequentialFlow
194+
193195
hook = MemoryHook()
194196

195-
flow = BaselineFlow(steps=[...], hook=hook) # 또는 컴포넌트마다 hook 주입
196-
out = flow.run_query("지난달 매출")
197+
flow = SequentialFlow(steps=[...], hook=hook) # 또는 컴포넌트마다 hook 주입
198+
out = flow.run("지난달 매출")
197199

198200
# 이벤트 확인
199-
for e in hook.events:
201+
for e in hook.snapshot():
200202
print(e.phase, e.component, e.duration_ms, e.error)
201203
```
202204

docs/Hook_and_exception_ko.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,16 +111,16 @@ class MemoryHook:
111111

112112
#### MemoryHook 사용 예시
113113

114-
```py
114+
```python
115115
from lang2sql.core.hooks import MemoryHook
116-
from lang2sql.flows.baseline import BaselineFlow
116+
from lang2sql.flows.baseline import SequentialFlow
117117

118118
hook = MemoryHook()
119-
flow = BaselineFlow(steps=[...], hook=hook)
119+
flow = SequentialFlow(steps=[...], hook=hook)
120120

121-
out = flow.run_query("지난달 매출")
121+
out = flow.run("지난달 매출")
122122

123-
for e in hook.events:
123+
for e in hook.snapshot():
124124
print(e.name, e.phase, e.component, e.duration_ms, e.error)
125125
```
126126

docs/tutorials/getting-started-without-datahub.md

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -122,19 +122,53 @@ print(f"FAISS index saved to: {OUTPUT_DIR}/catalog.faiss")
122122

123123
### 4) 실행
124124

125+
v2 CLI는 외부 벡터 인덱스 경로를 인수로 받지 않습니다.
126+
앞서 생성한 FAISS 인덱스를 활용하려면 Python API로 파이프라인을 직접 구성합니다.
127+
128+
```python
129+
# run_query.py
130+
import os
131+
from dotenv import load_dotenv
132+
from lang2sql import CatalogChunker, VectorRetriever
133+
from lang2sql.integrations.db import SQLAlchemyDB
134+
from lang2sql.integrations.embedding import OpenAIEmbedding
135+
from lang2sql.integrations.llm import OpenAILLM
136+
from lang2sql.integrations.vectorstore import FAISSVectorStore
137+
from lang2sql.flows.hybrid import HybridNL2SQL
138+
139+
load_dotenv()
140+
141+
INDEX_DIR = "./dev/table_info_db"
142+
embedding = OpenAIEmbedding(
143+
model=os.getenv("OPEN_AI_EMBEDDING_MODEL", "text-embedding-3-large"),
144+
api_key=os.getenv("OPEN_AI_KEY"),
145+
)
146+
147+
# FAISS 인덱스 로드 후 파이프라인 구성
148+
store = FAISSVectorStore.load(f"{INDEX_DIR}/catalog.faiss")
149+
150+
pipeline = HybridNL2SQL(
151+
catalog=[], # FAISS에 이미 인덱싱돼 있으므로 빈 리스트
152+
llm=OpenAILLM(model=os.getenv("OPEN_AI_LLM_MODEL", "gpt-4o"), api_key=os.getenv("OPEN_AI_KEY")),
153+
db=SQLAlchemyDB(os.getenv("DB_URL", "sqlite:///sample.db")),
154+
embedding=embedding,
155+
db_dialect=os.getenv("DB_TYPE", "sqlite"),
156+
)
157+
158+
rows = pipeline.run("주문 수를 집계하는 SQL을 만들어줘")
159+
print(rows)
160+
```
161+
162+
Streamlit UI:
163+
125164
```bash
126-
# Streamlit UI
127165
lang2sql run-streamlit
166+
```
128167

129-
# CLI 예시 (FAISS 인덱스 사용)
130-
lang2sql query "주문 수를 집계하는 SQL을 만들어줘" \
131-
--vectordb-type faiss \
132-
--vectordb-location ./dev/table_info_db
168+
CLI (카탈로그 없이 baseline만 가능):
133169

134-
# CLI 예시 (pgvector)
135-
lang2sql query "주문 수를 집계하는 SQL을 만들어줘" \
136-
--vectordb-type pgvector \
137-
--vectordb-location "postgresql://pgvector:pgvector@localhost:5432/postgres"
170+
```bash
171+
lang2sql query "주문 수를 집계해줘" --flow baseline --dialect sqlite
138172
```
139173

140174
### 5) (선택) pgvector로 적재하기
@@ -229,4 +263,3 @@ VectorRetriever.from_chunks(
229263
print(f"pgvector collection populated: {TABLE}")
230264
```
231265

232-
주의: FAISS 디렉토리 또는 pgvector 컬렉션이 없으면 현재 코드는 DataHub에서 메타데이터를 가져와 인덱스를 생성하려고 시도합니다. DataHub를 사용하지 않는 경우 위 절차로 사전에 VectorDB를 만들어 두세요.

docs/tutorials/v2-complete-tutorial.md

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
5-1. 샘플 문서 자동 생성
2121
6. 가장 쉬운 로컬 스모크 테스트 (API 키 없이)
2222
7. BaselineNL2SQL 기본 사용 (KeywordRetriever)
23+
7-1. DB 탐색: SQLAlchemyExplorer
2324
8. 실제 LLM 연결 (OpenAI / Anthropic)
2425
9. VectorRetriever 기초 (빠른 시작)
2526
10. 문서 파싱: MarkdownLoader / PlainTextLoader / DirectoryLoader / PDFLoader
@@ -232,6 +233,99 @@ print(rows)
232233

233234
---
234235

236+
## 7-1) DB 탐색: SQLAlchemyExplorer
237+
238+
LLM에게 넘길 스키마 정보가 필요하거나, 처음 보는 DB를 손으로 살펴볼 때 사용합니다.
239+
카탈로그를 미리 구축하지 않아도 DDL + 샘플 데이터를 바로 꺼내볼 수 있습니다.
240+
241+
### 기본 사용
242+
243+
```python
244+
from lang2sql import build_explorer_from_url
245+
246+
exp = build_explorer_from_url("sqlite:///sample.db")
247+
248+
# 1) 어떤 테이블이 있는지
249+
print(exp.list_tables())
250+
# ['customers', 'orders', ...]
251+
252+
# 2) 테이블 DDL — CREATE TABLE 원문
253+
print(exp.get_ddl("orders"))
254+
# CREATE TABLE orders (
255+
# id INTEGER PRIMARY KEY,
256+
# customer_id INTEGER NOT NULL REFERENCES customers(id),
257+
# amount REAL,
258+
# status TEXT DEFAULT 'pending'
259+
# )
260+
261+
# 3) 실제 샘플 데이터 (기본 5행)
262+
print(exp.sample_data("orders"))
263+
# [{'id': 1, 'customer_id': 1, 'amount': 99.9, 'status': 'shipped'}, ...]
264+
265+
# 4) 커스텀 읽기 전용 질의
266+
print(exp.execute_read_only("SELECT status, COUNT(*) AS cnt FROM orders GROUP BY status"))
267+
# [{'status': 'pending', 'cnt': 3}, {'status': 'shipped', 'cnt': 2}]
268+
```
269+
270+
### 전체 테이블 한 번에 둘러보기
271+
272+
```python
273+
from lang2sql import build_explorer_from_url
274+
275+
exp = build_explorer_from_url("sqlite:///sample.db")
276+
277+
for table in exp.list_tables():
278+
print(f"\n=== {table} ===")
279+
print(exp.get_ddl(table))
280+
rows = exp.sample_data(table, limit=2)
281+
print("샘플:", rows)
282+
```
283+
284+
### PostgreSQL / MySQL 연결
285+
286+
URL만 바꾸면 됩니다.
287+
288+
```python
289+
from lang2sql import build_explorer_from_url
290+
291+
# PostgreSQL
292+
exp = build_explorer_from_url("postgresql://user:password@localhost:5432/mydb")
293+
294+
# MySQL
295+
exp = build_explorer_from_url("mysql+pymysql://user:password@localhost:3306/mydb")
296+
297+
# schema 지정 (schema 파라미터)
298+
exp = build_explorer_from_url("postgresql://user:pass@host/db", schema="analytics")
299+
print(exp.list_tables()) # analytics 스키마 테이블만
300+
```
301+
302+
### 기존 SQLAlchemyDB engine 재사용
303+
304+
연결 풀을 따로 만들지 않고 공유할 수 있습니다.
305+
306+
```python
307+
from lang2sql.integrations.db import SQLAlchemyDB, SQLAlchemyExplorer
308+
309+
db = SQLAlchemyDB("sqlite:///sample.db")
310+
exp = SQLAlchemyExplorer.from_engine(db._engine)
311+
312+
# db는 SQL 실행, exp는 탐색 — 같은 연결 풀 공유
313+
rows = db.execute("SELECT COUNT(*) AS cnt FROM orders")
314+
ddl = exp.get_ddl("orders")
315+
```
316+
317+
### 쓰기 구문은 거부됩니다
318+
319+
```python
320+
exp.execute_read_only("DROP TABLE orders")
321+
# ValueError: Write operations not allowed: 'DROP TABLE orders'
322+
323+
exp.execute_read_only("INSERT INTO orders VALUES (99, 1, 0, 'test')")
324+
# ValueError: Write operations not allowed: 'INSERT INTO orders ...'
325+
```
326+
327+
---
328+
235329
## 8) 실제 LLM 연결 (OpenAI / Anthropic)
236330

237331
LLM 백엔드는 교체 가능합니다.

src/lang2sql/__init__.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
from .factory import build_db_from_env, build_embedding_from_env, build_llm_from_env
1+
from .factory import (
2+
build_db_from_env,
3+
build_embedding_from_env,
4+
build_explorer_from_url,
5+
build_llm_from_env,
6+
)
27
from .components.enrichment.context_enricher import ContextEnricher
38
from .components.enrichment.question_profiler import QuestionProfiler
49
from .components.execution.sql_executor import SQLExecutor
@@ -28,16 +33,18 @@
2833
from .core.exceptions import ComponentError, IntegrationMissingError, Lang2SQLError
2934
from .core.hooks import MemoryHook, NullHook, TraceHook
3035
from .core.ports import (
36+
CatalogLoaderPort,
37+
DBExplorerPort,
3138
DBPort,
3239
DocumentLoaderPort,
3340
EmbeddingPort,
3441
LLMPort,
3542
VectorStorePort,
3643
)
44+
from .integrations.db.sqlalchemy_ import SQLAlchemyExplorer
3745
from .flows.enriched_nl2sql import EnrichedNL2SQL
3846
from .flows.hybrid import HybridNL2SQL
3947
from .flows.nl2sql import BaselineNL2SQL
40-
from .integrations.catalog.datahub_ import DataHubCatalogLoader
4148
from .integrations.embedding.azure_ import AzureOpenAIEmbedding
4249
from .integrations.embedding.bedrock_ import BedrockEmbedding
4350
from .integrations.embedding.gemini_ import GeminiEmbedding
@@ -48,8 +55,6 @@
4855
from .integrations.llm.gemini_ import GeminiLLM
4956
from .integrations.llm.huggingface_ import HuggingFaceLLM
5057
from .integrations.llm.ollama_ import OllamaLLM
51-
from .integrations.vectorstore.faiss_ import FAISSVectorStore
52-
from .integrations.vectorstore.pgvector_ import PGVectorStore
5358

5459
__all__ = [
5560
# Data types
@@ -64,9 +69,11 @@
6469
# Ports (protocols)
6570
"LLMPort",
6671
"DBPort",
72+
"DBExplorerPort",
6773
"EmbeddingPort",
6874
"VectorStorePort",
6975
"DocumentLoaderPort",
76+
"CatalogLoaderPort",
7077
# Components — retrieval
7178
"KeywordRetriever",
7279
"VectorRetriever",
@@ -116,8 +123,33 @@
116123
"OllamaEmbedding",
117124
# Catalog integrations (Phase 3)
118125
"DataHubCatalogLoader",
126+
# DB Explorer (Phase A1)
127+
"SQLAlchemyExplorer",
119128
# Factory (Phase 6)
120129
"build_llm_from_env",
121130
"build_embedding_from_env",
122131
"build_db_from_env",
132+
"build_explorer_from_url",
123133
]
134+
135+
# ---------------------------------------------------------------------------
# Lazy imports (PEP 562) — optional dependencies that have import side-effects
# (e.g. faiss prints INFO logs on import) or are rarely needed at startup.
# ---------------------------------------------------------------------------
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
    "DataHubCatalogLoader": (".integrations.catalog.datahub_", "DataHubCatalogLoader"),
    "FAISSVectorStore": (".integrations.vectorstore.faiss_", "FAISSVectorStore"),
    "PGVectorStore": (".integrations.vectorstore.pgvector_", "PGVectorStore"),
}


def __getattr__(name: str):
    """Resolve lazily-exported optional attributes on first access (PEP 562).

    Looks *name* up in ``_LAZY_IMPORTS``; unknown names raise AttributeError
    exactly like a normal missing module attribute would.
    """
    if name not in _LAZY_IMPORTS:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    import importlib

    module_path, attr_name = _LAZY_IMPORTS[name]
    module = importlib.import_module(module_path, package=__name__)
    obj = getattr(module, attr_name)
    # Cache in module globals so subsequent accesses bypass __getattr__.
    globals()[name] = obj
    return obj

src/lang2sql/components/retrieval/vector.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,81 @@ def add(self, chunks: list[IndexedChunk]) -> None:
168168
self._vectorstore.upsert(ids, vectors)
169169
self._registry.update({c["chunk_id"]: c for c in chunks})
170170

171+
# ── Persistence ──────────────────────────────────────────────────

def save(self, path: str) -> None:
    """Persist the vector index and chunk registry under *path*.

    Works only with stores that expose ``save()`` (e.g. FAISSVectorStore);
    stores without it (e.g. InMemoryVectorStore) raise NotImplementedError.

    Files written:
        {path}          — vector index (written by the store itself)
        {path}.meta     — chunk_id ordering (written internally by the store)
        {path}.registry — the chunk registry, serialized as JSON
    """
    import json
    import pathlib

    store = self._vectorstore
    if not hasattr(store, "save"):
        raise NotImplementedError(
            f"{type(store).__name__} does not support save(). "
            "Use FAISSVectorStore for file-based persistence."
        )
    store.save(path)
    registry_file = pathlib.Path(f"{path}.registry")
    registry_file.write_text(json.dumps(self._registry), encoding="utf-8")
197+
198+
@classmethod
def load(
    cls,
    path: str,
    *,
    vectorstore: VectorStorePort,
    embedding: EmbeddingPort,
    top_n: int = 5,
    score_threshold: float = 0.0,
    name: Optional[str] = None,
    hook: Optional[TraceHook] = None,
) -> "VectorRetriever":
    """Restore a VectorRetriever from a registry previously written by save().

    Restoring the vector index itself is the caller's job: load the store
    first, then pass it in as *vectorstore*. This keeps VectorRetriever
    decoupled from any specific store implementation.

    Args:
        path: The path used at save() time (locates the registry file).
        vectorstore: An already-loaded VectorStorePort implementation.
        embedding: An EmbeddingPort implementation.
        top_n: Maximum number of schemas/contexts to return. Defaults to 5.
        score_threshold: Results scoring at or below this are dropped.
            Defaults to 0.0.

    Raises:
        FileNotFoundError: If no ``{path}.registry`` file exists.

    Example:
        store = FAISSVectorStore.load(path)
        retriever = VectorRetriever.load(path, vectorstore=store, embedding=emb)
    """
    import json
    import pathlib

    registry_file = pathlib.Path(f"{path}.registry")
    if not registry_file.exists():
        raise FileNotFoundError(f"Registry file not found: {registry_file}")

    restored = json.loads(registry_file.read_text(encoding="utf-8"))
    return cls(
        vectorstore=vectorstore,
        embedding=embedding,
        registry=restored,
        top_n=top_n,
        score_threshold=score_threshold,
        name=name,
        hook=hook,
    )
243+
244+
# ── Core retrieval ────────────────────────────────────────────────
245+
171246
def _run(self, query: str) -> RetrievalResult:
172247
"""
173248
Args:

0 commit comments

Comments
 (0)