From 187928c535fd9fc75e1ebe5ea4548e9aef242cbd Mon Sep 17 00:00:00 2001 From: chin-keong-lam Date: Wed, 8 Apr 2026 02:17:04 -0700 Subject: [PATCH 1/6] =?UTF-8?q?Build=20wisdomGraph=20Python=20package=20?= =?UTF-8?q?=E2=80=94=20full=20DIKW=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete implementation of the wisdomGraph pip package: wisdom/ — Neo4j-native DIKW wisdom accumulation library __init__.py — package version __main__.py — CLI: install, connect, docker, ask, reflect, path, explain, god-nodes, export, purge connect.py — Neo4j driver lifecycle, schema setup (indexes + full-text), graph status detect.py — file discovery + type classification (.wisdomignore support) cache.py — SHA256 extraction cache (incremental re-runs) security.py — URL/path/label validation, Cypher injection prevention validate.py — extraction schema validation before MERGE classify.py — DIKW tier assignment (heuristics + LLM-explicit tiers) merge.py — idempotent Neo4j MERGE for nodes, edges, sources reflect.py — DIKW promotion engine: K→E→I→W + REINFORCES feedback traverse.py — full-text search, DIKW path walk, god-nodes, answer_question report.py — WISDOM_REPORT.md generator (always-on context doc) export.py — Cypher dump, graph.json (graphify-compatible), Obsidian vault ingest.py — URL fetcher with HTML→text conversion docker.py — DozerDB container lifecycle (up/down/status) skill.md — Claude Code /wisdom skill (full pipeline orchestration) skill-claw.md — OpenClaw /wisdom skill (sequential extraction) pyproject.toml — wisdomgraph pip package, optional[ast|pdf|office|all] Key design decisions: - MERGE everywhere: idempotent, accumulative across sessions - DIKW classification in classify.py: tier assignment from heuristics + explicit LLM output - reflect() is additive: never deletes, only promotes upward - Wisdom generation deferred to LLM in skill.md (needs language understanding) - Password stored as env var name in config, 
never in plain config files - Full-text Neo4j index for semantic search at query time (no embeddings needed) Co-Authored-By: Claude Sonnet 4.6 --- .claude/settings.local.json | 3 +- pyproject.toml | 86 ++++++++ wisdom/__init__.py | 10 + wisdom/__main__.py | 429 ++++++++++++++++++++++++++++++++++++ wisdom/cache.py | 81 +++++++ wisdom/classify.py | 121 ++++++++++ wisdom/connect.py | 147 ++++++++++++ wisdom/detect.py | 145 ++++++++++++ wisdom/docker.py | 94 ++++++++ wisdom/export.py | 125 +++++++++++ wisdom/ingest.py | 88 ++++++++ wisdom/merge.py | 158 +++++++++++++ wisdom/reflect.py | 257 +++++++++++++++++++++ wisdom/report.py | 131 +++++++++++ wisdom/security.py | 54 +++++ wisdom/skill-claw.md | 43 ++++ wisdom/skill.md | 417 +++++++++++++++++++++++++++++++++++ wisdom/traverse.py | 224 +++++++++++++++++++ wisdom/validate.py | 46 ++++ 19 files changed, 2658 insertions(+), 1 deletion(-) create mode 100644 pyproject.toml create mode 100644 wisdom/__init__.py create mode 100644 wisdom/__main__.py create mode 100644 wisdom/cache.py create mode 100644 wisdom/classify.py create mode 100644 wisdom/connect.py create mode 100644 wisdom/detect.py create mode 100644 wisdom/docker.py create mode 100644 wisdom/export.py create mode 100644 wisdom/ingest.py create mode 100644 wisdom/merge.py create mode 100644 wisdom/reflect.py create mode 100644 wisdom/report.py create mode 100644 wisdom/security.py create mode 100644 wisdom/skill-claw.md create mode 100644 wisdom/skill.md create mode 100644 wisdom/traverse.py create mode 100644 wisdom/validate.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 76d3bfe..27cd798 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -2,7 +2,8 @@ "permissions": { "allow": [ "Bash(git push -u origin claude/determined-volhard)", - "Bash(git add .)" + "Bash(git add .)", + "Bash(git add -A)" ] } } diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..839f1ff --- /dev/null +++ 
b/pyproject.toml @@ -0,0 +1,86 @@ +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[project] +name = "wisdomgraph" +version = "0.1.0" +description = "Accumulative Neo4j-native DIKW wisdom memory for AI coding assistants (Claude Code, OpenClaw)" +readme = "README.md" +license = { file = "LICENSE" } +keywords = [ + "claude", "claude-code", "openclaw", "neo4j", "knowledge-graph", + "graphrag", "dikw", "wisdom", "memory", "accumulative", "llm", + "skill", "agent-memory", "dozerdb", +] +requires-python = ">=3.10" +dependencies = [ + "neo4j>=5.0", +] + +[project.urls] +Homepage = "https://github.com/cklam12345/wisdomGraph" +Repository = "https://github.com/cklam12345/wisdomGraph" +Issues = "https://github.com/cklam12345/wisdomGraph/issues" + +[project.optional-dependencies] +ast = [ + "tree-sitter", + "tree-sitter-python", + "tree-sitter-javascript", + "tree-sitter-typescript", + "tree-sitter-go", + "tree-sitter-rust", + "tree-sitter-java", + "tree-sitter-c", + "tree-sitter-cpp", + "tree-sitter-ruby", + "tree-sitter-c-sharp", + "tree-sitter-kotlin", + "tree-sitter-scala", + "tree-sitter-php", + "tree-sitter-swift", + "tree-sitter-lua", + "tree-sitter-zig", + "tree-sitter-powershell", + "tree-sitter-elixir", + "tree-sitter-objc", +] +pdf = ["pypdf", "html2text"] +office = ["python-docx", "openpyxl"] +all = [ + "tree-sitter", + "tree-sitter-python", + "tree-sitter-javascript", + "tree-sitter-typescript", + "tree-sitter-go", + "tree-sitter-rust", + "tree-sitter-java", + "tree-sitter-c", + "tree-sitter-cpp", + "tree-sitter-ruby", + "tree-sitter-c-sharp", + "tree-sitter-kotlin", + "tree-sitter-scala", + "tree-sitter-php", + "tree-sitter-swift", + "tree-sitter-lua", + "tree-sitter-zig", + "tree-sitter-powershell", + "tree-sitter-elixir", + "tree-sitter-objc", + "pypdf", + "html2text", + "python-docx", + "openpyxl", +] + +[project.scripts] +wisdom = "wisdom.__main__:main" + +[tool.setuptools.packages.find] +where = ["."] +include = 
["wisdom*"] + +[tool.setuptools.package-data] +wisdom = ["skill.md", "skill-claw.md", "skill-windows.md"] diff --git a/wisdom/__init__.py b/wisdom/__init__.py new file mode 100644 index 0000000..643671b --- /dev/null +++ b/wisdom/__init__.py @@ -0,0 +1,10 @@ +"""wisdomGraph — accumulative Neo4j-native wisdom memory for AI coding assistants.""" +from __future__ import annotations + +try: + from importlib.metadata import version as _pkg_version + __version__ = _pkg_version("wisdomgraph") +except Exception: + __version__ = "unknown" + +__all__ = ["__version__"] diff --git a/wisdom/__main__.py b/wisdom/__main__.py new file mode 100644 index 0000000..e29ed01 --- /dev/null +++ b/wisdom/__main__.py @@ -0,0 +1,429 @@ +"""wisdomGraph CLI — `wisdom install` sets up the Claude Code skill.""" +from __future__ import annotations + +import json +import platform as _platform_mod +import re +import shutil +import sys +from pathlib import Path + +try: + from importlib.metadata import version as _pkg_version + __version__ = _pkg_version("wisdomgraph") +except Exception: + __version__ = "unknown" + + +# ── Hook / registration strings ────────────────────────────────────────────── + +_SETTINGS_HOOK = { + "matcher": "Glob|Grep", + "hooks": [{ + "type": "command", + "command": ( + "[ -f wisdom-out/WISDOM_REPORT.md ] && " + "echo 'wisdomGraph: Wisdom graph exists. Read wisdom-out/WISDOM_REPORT.md " + "for principles, god nodes, and contradictions before searching raw files.' || true" + ), + }], +} + +_SKILL_REGISTRATION = ( + "\n# wisdomGraph\n" + "- **wisdom** (`~/.claude/skills/wisdom/SKILL.md`) " + "- accumulative Neo4j wisdom memory. Trigger: `/wisdom`\n" + "When the user types `/wisdom`, invoke the Skill tool " + "with `skill: \"wisdom\"` before doing anything else.\n" +) + +_CLAUDE_MD_SECTION = """\ +## wisdomGraph + +This project uses wisdomGraph — accumulative Neo4j-native wisdom memory. 
+ +Rules: +- Before answering architecture or codebase questions, read wisdom-out/WISDOM_REPORT.md + for top Wisdom principles, god nodes, and contradictions +- Use `/wisdom ask` to query the live graph when deeper traversal is needed +- After `/wisdom reflect`, new Wisdom nodes may be available — re-read the report +""" + +_AGENTS_MD_SECTION = """\ +## wisdomGraph + +This project uses wisdomGraph — accumulative Neo4j-native wisdom memory. + +Rules: +- Before answering architecture or codebase questions, read wisdom-out/WISDOM_REPORT.md +- Use the wisdom CLI to query the graph: wisdom ask "your question" +""" + +_PLATFORM_CONFIG: dict[str, dict] = { + "claude": { + "skill_file": "skill.md", + "skill_dst": Path(".claude") / "skills" / "wisdom" / "SKILL.md", + "claude_md": True, + }, + "windows": { + "skill_file": "skill-windows.md", + "skill_dst": Path(".claude") / "skills" / "wisdom" / "SKILL.md", + "claude_md": True, + }, + "claw": { + "skill_file": "skill-claw.md", + "skill_dst": Path(".claw") / "skills" / "wisdom" / "SKILL.md", + "claude_md": False, + }, +} + + +# ── Install helpers ─────────────────────────────────────────────────────────── + +def install(platform: str = "claude") -> None: + if platform not in _PLATFORM_CONFIG: + print(f"error: unknown platform '{platform}'. 
def _register_claude_md() -> None:
    """Register the /wisdom skill in the user's global ``~/.claude/CLAUDE.md``.

    If the file exists and already contains the ``wisdomGraph`` registration
    marker it is left untouched. Otherwise the registration snippet is
    appended (or the file is created with it).

    The "already registered" test looks only for ``wisdomGraph``, which is the
    heading written by ``_SKILL_REGISTRATION``. Matching the bare word
    "wisdom" would treat any CLAUDE.md that merely uses that English word as
    registered and silently skip the install.
    """
    claude_md = Path.home() / ".claude" / "CLAUDE.md"

    if not claude_md.exists():
        claude_md.parent.mkdir(parents=True, exist_ok=True)
        # Drop the snippet's leading newline: there is nothing to separate from.
        claude_md.write_text(_SKILL_REGISTRATION.lstrip(), encoding="utf-8")
        print(" CLAUDE.md -> skill registered")
        return

    content = claude_md.read_text(encoding="utf-8")
    if "wisdomGraph" in content:
        print(" CLAUDE.md -> already registered (no change)")
        return

    claude_md.write_text(content.rstrip() + _SKILL_REGISTRATION, encoding="utf-8")
    print(" CLAUDE.md -> skill registered")
def claude_uninstall(project_dir: Path | None = None) -> None:
    """Remove the ``## wisdomGraph`` section from a project's CLAUDE.md.

    Args:
        project_dir: Directory containing CLAUDE.md; defaults to the current
            working directory.

    The section runs from its ``## wisdomGraph`` heading up to the next
    ``## `` heading or the end of the file. The file is rewritten, and
    success reported, only when such a section was actually found. A file
    that merely mentions "wisdomGraph" elsewhere is left byte-for-byte
    untouched.

    NOTE: only CLAUDE.md is edited here; the PreToolUse hook that
    ``claude_install`` adds to ``.claude/settings.json`` is not removed.
    """
    target = (project_dir or Path(".")) / "CLAUDE.md"
    if not target.exists():
        print("No CLAUDE.md found - nothing to do")
        return

    content = target.read_text(encoding="utf-8")
    if "wisdomGraph" not in content:
        print("wisdomGraph not found in CLAUDE.md - nothing to do")
        return

    # subn reports how many sections were stripped, so a no-op can be told
    # apart from a real removal.
    cleaned, removed = re.subn(
        r"\n*## wisdomGraph\n.*?(?=\n## |\Z)", "", content, flags=re.DOTALL
    )
    if not removed:
        print("No wisdomGraph section found in CLAUDE.md - nothing to do")
        return

    cleaned = cleaned.rstrip()
    target.write_text((cleaned + "\n") if cleaned else "", encoding="utf-8")
    print(f"wisdomGraph section removed from {target.resolve()}")
len(sys.argv) < 3: + print("Usage: wisdom connect --user --password ", file=sys.stderr) + sys.exit(1) + uri = sys.argv[2] + user = _get_arg("--user", "neo4j") + password = _get_arg("--password", "") + if not password: + import getpass + password = getpass.getpass("Neo4j password: ") + from wisdom.connect import save_connection + save_connection(uri, user, password) + + elif cmd == "docker": + from wisdom.docker import up, down, status as docker_status + subcmd = sys.argv[2] if len(sys.argv) > 2 else "" + password = _get_arg("--password", "password") + if subcmd == "up": + up(password=password) + elif subcmd == "down": + down() + elif subcmd == "status": + docker_status() + else: + print("Usage: wisdom docker [up|down|status]", file=sys.stderr) + sys.exit(1) + + elif cmd == "status": + from wisdom.connect import get_driver, status as conn_status, ensure_schema + driver = get_driver() + ensure_schema(driver) + stats = conn_status(driver) + print(f"Knowledge: {stats.get('Knowledge', 0)}") + print(f"Experience: {stats.get('Experience', 0)}") + print(f"Insight: {stats.get('Insight', 0)}") + print(f"Wisdom: {stats.get('Wisdom', 0)}") + print(f"Sources: {stats.get('Source', 0)}") + print(f"Edges: {stats.get('edges', 0)}") + driver.close() + + elif cmd == "ask": + if len(sys.argv) < 3: + print("Usage: wisdom ask \"\" [--tier wisdom]", file=sys.stderr) + sys.exit(1) + question = sys.argv[2] + tier = _get_arg("--tier", None) + from wisdom.connect import get_driver, ensure_schema + from wisdom.traverse import answer_question + driver = get_driver() + ensure_schema(driver) + with driver.session() as session: + result = answer_question(session, question, tier_filter=tier) + driver.close() + _print_answer(result) + + elif cmd == "path": + if len(sys.argv) < 4: + print("Usage: wisdom path \"\" \"\"", file=sys.stderr) + sys.exit(1) + from wisdom.connect import get_driver, ensure_schema + from wisdom.traverse import shortest_path + driver = get_driver() + ensure_schema(driver) + 
with driver.session() as session: + path = shortest_path(session, sys.argv[2], sys.argv[3]) + driver.close() + if path: + print(" → ".join(f"{n['label']} [{n.get('tier','?')}]" for n in path)) + else: + print("No path found between these concepts.") + + elif cmd == "explain": + if len(sys.argv) < 3: + print("Usage: wisdom explain \"\"", file=sys.stderr) + sys.exit(1) + from wisdom.connect import get_driver, ensure_schema + from wisdom.traverse import explain_node + driver = get_driver() + ensure_schema(driver) + with driver.session() as session: + result = explain_node(session, sys.argv[2]) + driver.close() + _print_explain(result) + + elif cmd == "god-nodes": + from wisdom.connect import get_driver, ensure_schema + from wisdom.traverse import god_nodes + driver = get_driver() + ensure_schema(driver) + with driver.session() as session: + nodes = god_nodes(session, limit=15) + driver.close() + for i, n in enumerate(nodes, 1): + print(f"{i:2}. {n['label']} [{n.get('tier','?')}] — {n.get('degree',0)} edges") + + elif cmd == "export": + fmt = _get_arg("--cypher", None, flag=True) + json_fmt = _get_arg("--json", None, flag=True) + obsidian = _get_arg("--obsidian", None, flag=True) + out_dir = Path(_get_arg("--out", "wisdom-out")) + from wisdom.connect import get_driver, ensure_schema + from wisdom.export import export_cypher, export_json, export_obsidian + driver = get_driver() + ensure_schema(driver) + with driver.session() as session: + if fmt: + p = export_cypher(session, out_dir) + print(f"Cypher export -> {p}") + if json_fmt: + p = export_json(session, out_dir) + print(f"JSON export -> {p}") + if obsidian: + vault_dir = _get_arg("--obsidian-dir", None) + p = export_obsidian(session, out_dir, Path(vault_dir) if vault_dir else None) + print(f"Obsidian vault -> {p}") + if not any([fmt, json_fmt, obsidian]): + print("Specify --cypher, --json, or --obsidian", file=sys.stderr) + driver.close() + + elif cmd in ("claude",): + subcmd = sys.argv[2] if len(sys.argv) > 2 else 
def _get_arg(name: str, default, flag: bool = False) -> str | bool | None:
    """Look up a named command-line option in ``sys.argv``.

    Args:
        name: Option to search for, e.g. ``"--user"``. Every caller passes
            this positionally.
        default: Value returned when a value-taking option is absent, or is
            present as the last token with no value after it.
        flag: When true, treat ``name`` as a boolean switch and return
            whether it occurs anywhere in ``sys.argv``.

    Returns:
        ``True``/``False`` for switches; otherwise the token following
        ``name``, the text after ``name=``, or ``default``.

    The first parameter was previously also called ``flag``. Python rejects a
    repeated parameter name as a SyntaxError, so it is renamed here; callers
    that pass ``flag=True`` by keyword are unaffected.
    """
    args = sys.argv
    if flag:
        return name in args
    inline_prefix = f"{name}="
    for i, token in enumerate(args):
        if token == name and i + 1 < len(args):
            return args[i + 1]
        if token.startswith(inline_prefix):
            return token.split("=", 1)[1]
    return default
def _print_help() -> None:
    """Print the CLI usage summary to stdout.

    Each entry mirrors what ``main()`` actually parses: ``connect`` takes a
    positional URI and reads ``--password`` (not ``--pass``), and the query
    commands take positional text arguments, shown here as upper-case
    placeholders.
    """
    lines = (
        "Usage: wisdom COMMAND [options]",
        "",
        "Setup:",
        " install [--platform P] copy skill (claude|windows|claw)",
        " connect URI --user U --password P save Neo4j connection",
        " docker up|down|status manage DozerDB local container",
        " claude install|uninstall write CLAUDE.md + PreToolUse hook",
        "",
        "Absorb:",
        " (use /wisdom in Claude Code — the skill handles absorption)",
        "",
        "Query:",
        " ask \"QUESTION\" [--tier wisdom]",
        " path \"FROM\" \"TO\"",
        " explain \"CONCEPT\"",
        " god-nodes",
        "",
        "Maintain:",
        " status",
        " purge --project NAME",
        " export --cypher|--json|--obsidian",
        "",  # keeps the trailing blank line the help has always ended with
    )
    print("\n".join(lines))
def check_cache(files: list[str], root: Path = Path(".")) -> tuple[list[dict], list[str]]:
    """Partition *files* by whether a cached extraction is available.

    Args:
        files: Paths to look up, in the order they should be reported.
        root: Project root handed to ``load_cached``.

    Returns:
        ``(cached, uncached)``: the stored extraction dicts for cache hits,
        and the original path strings for misses, which still need
        extraction. Input order is preserved within each list.
    """
    hits: list[dict] = []
    misses: list[str] = []
    for name in files:
        extraction = load_cached(Path(name), root)
        if extraction is None:
            misses.append(name)
            continue
        hits.append(extraction)
    return hits, misses
def classify_nodes(
    nodes: list[dict],
    edges: list[dict],
    project: str,
) -> list[dict]:
    """Assign a DIKW tier to each node and fill in provenance defaults.

    Args:
        nodes: Raw extracted nodes. Input dicts are copied, never mutated.
        edges: Raw extracted edges, used only for the insight heuristic.
        project: Project slug / root path stored on nodes that have no
            ``project`` yet.

    Returns:
        New node dicts with ``tier`` always set, and ``project``,
        ``timestamp``, ``access_count`` and ``confidence`` filled in when
        absent.

    Tier rules, first match wins:
        1. An explicit ``tier`` of wisdom / insight / experience / knowledge
           (case-insensitive, surrounding whitespace ignored) is kept.
        2. A node touched by 3+ ``semantically_similar_to``,
           ``conceptually_related_to`` or ``contradicts`` edges becomes
           ``insight``.
        3. Everything else is ``knowledge``.
    """
    counted_relations = {"semantically_similar_to", "conceptually_related_to", "contradicts"}
    valid_tiers = {"wisdom", "insight", "experience", "knowledge"}

    # Count qualifying edges per endpoint. Edges missing an endpoint are
    # skipped rather than aborting the whole batch.
    similarity_count: Counter[str] = Counter()
    for edge in edges:
        if edge.get("relation", "") not in counted_relations:
            continue
        for endpoint in (edge.get("source"), edge.get("target")):
            if endpoint is not None:
                similarity_count[endpoint] += 1

    enriched = []
    for node in nodes:
        n = dict(node)
        n.setdefault("project", project)
        if "timestamp" not in n:
            n["timestamp"] = _utcnow()
        n.setdefault("access_count", 0)
        n.setdefault("confidence", 1.0 if n.get("confidence_tag") == "EXTRACTED" else 0.7)

        # The tier may arrive as None (JSON null) or another non-string;
        # treat anything that is not a recognised tier name as unset.
        raw_tier = n.get("tier")
        explicit_tier = raw_tier.strip().lower() if isinstance(raw_tier, str) else ""

        if explicit_tier in valid_tiers:
            n["tier"] = explicit_tier
        elif similarity_count.get(n.get("id"), 0) >= 3:
            n["tier"] = "insight"
        else:
            n["tier"] = "knowledge"

        enriched.append(n)

    return enriched
def build_dikw_edges(nodes: list[dict], edges: list[dict]) -> list[dict]:
    """Return *edges* followed by implied DIKW hierarchy edges.

    For every input edge whose source sits exactly one tier below its target,
    one hierarchy edge is added between the same two nodes:

    - knowledge -> experience: ``GROUNDS``
    - experience -> insight: ``REVEALS``
    - insight -> wisdom: ``CRYSTALLIZES_INTO``

    Nodes without a ``tier``, and endpoints not present in *nodes*, count as
    ``knowledge``. Each (source, target) pair yields at most one hierarchy
    edge, however many input edges connect that pair. The input list is not
    modified.
    """
    step_relation = {
        ("knowledge", "experience"): "GROUNDS",
        ("experience", "insight"): "REVEALS",
        ("insight", "wisdom"): "CRYSTALLIZES_INTO",
    }

    tier_of = {}
    for node in nodes:
        tier_of[node["id"]] = node.get("tier", "knowledge")

    emitted: set[tuple[str, str]] = set()
    hierarchy_edges: list[dict] = []
    for link in edges:
        endpoints = (link["source"], link["target"])
        tiers = (
            tier_of.get(endpoints[0], "knowledge"),
            tier_of.get(endpoints[1], "knowledge"),
        )
        relation = step_relation.get(tiers)
        if relation is None or endpoints in emitted:
            continue
        emitted.add(endpoints)
        hierarchy_edges.append({
            "source": endpoints[0],
            "target": endpoints[1],
            "relation": relation,
            "confidence_tag": "EXTRACTED",
            "confidence": 1.0,
        })

    return edges + hierarchy_edges
def _write_env_password(password: str) -> None:
    """Persist the Neo4j password as an ``export`` line in a shell profile.

    The first profile that exists among ``~/.zshrc``, ``~/.bashrc`` and
    ``~/.bash_profile`` is updated: an existing ``export <_ENV_VAR>=`` line is
    replaced, otherwise a new one is appended. If no profile exists, nothing
    is written to disk. In every case the variable is also set in the current
    process environment.

    Args:
        password: The secret to store. It is shell-quoted before being
            written, so characters such as quotes, ``$`` or backticks are
            kept literal when the profile is sourced instead of being
            expanded or executed.

    NOTE(review): the password ends up in plain text in the shell profile;
    this function does not change that file's permissions.
    """
    import shlex  # local import: used only by this function

    marker = f"export {_ENV_VAR}="
    export_line = f"{marker}{shlex.quote(password)}"
    home = Path.home()

    for profile in (home / ".zshrc", home / ".bashrc", home / ".bash_profile"):
        if not profile.exists():
            continue
        lines = profile.read_text(encoding="utf-8").splitlines()
        # Replace only when a line really starts with the marker. A marker
        # that appears just inside a comment or an indented line must not
        # suppress the append, or the password would never be written.
        if any(line.startswith(marker) for line in lines):
            updated = [export_line if line.startswith(marker) else line for line in lines]
            profile.write_text("\n".join(updated) + "\n", encoding="utf-8")
        else:
            with profile.open("a", encoding="utf-8") as fh:
                fh.write(f"\n# wisdomGraph Neo4j password\n{export_line}\n")
        break  # only the first existing profile is touched

    os.environ[_ENV_VAR] = password
status(driver) -> dict: + """Return graph stats by tier.""" + with driver.session() as session: + counts = {} + for label in ("Knowledge", "Experience", "Insight", "Wisdom", "Source"): + result = session.run(f"MATCH (n:{label}) RETURN count(n) AS c") + counts[label] = result.single()["c"] + edge_result = session.run("MATCH ()-[r]->() RETURN count(r) AS c") + counts["edges"] = edge_result.single()["c"] + return counts diff --git a/wisdom/detect.py b/wisdom/detect.py new file mode 100644 index 0000000..bbab095 --- /dev/null +++ b/wisdom/detect.py @@ -0,0 +1,145 @@ +"""File discovery and type classification.""" +from __future__ import annotations + +import fnmatch +import os +from enum import Enum +from pathlib import Path + +from .security import is_sensitive_path + + +class FileType(str, Enum): + CODE = "code" + DOCUMENT = "document" + PAPER = "paper" + IMAGE = "image" + + +CODE_EXTENSIONS = { + ".py", ".ts", ".js", ".tsx", ".go", ".rs", ".java", + ".cpp", ".cc", ".cxx", ".c", ".h", ".hpp", + ".rb", ".swift", ".kt", ".kts", ".cs", ".scala", + ".php", ".lua", ".zig", ".ps1", ".ex", ".exs", ".m", ".mm", +} +DOC_EXTENSIONS = {".md", ".txt", ".rst"} +PAPER_EXTENSIONS = {".pdf"} +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp"} +OFFICE_EXTENSIONS = {".docx", ".xlsx"} + +_SKIP_DIRS = { + "venv", ".venv", "env", "node_modules", "__pycache__", ".git", + "dist", "build", "target", "out", "site-packages", "lib64", + ".pytest_cache", ".mypy_cache", ".ruff_cache", "wisdom-out", +} + +import re +_PAPER_SIGNALS = [ + re.compile(r"\barxiv\b", re.IGNORECASE), + re.compile(r"\bdoi\s*:", re.IGNORECASE), + re.compile(r"\babstract\b", re.IGNORECASE), + re.compile(r"\bproceedings\b", re.IGNORECASE), + re.compile(r"\bpreprint\b", re.IGNORECASE), + re.compile(r"\[\d+\]"), + re.compile(r"\d{4}\.\d{4,5}"), + re.compile(r"\bwe propose\b", re.IGNORECASE), +] +_PAPER_THRESHOLD = 3 + + +def _looks_like_paper(path: Path) -> bool: + try: + text = 
def _load_ignore(root: Path) -> list[str]:
    """Return the ignore patterns configured for *root*.

    ``.wisdomignore`` is preferred; ``.graphifyignore`` is read as a
    compatibility fallback. Blank lines and comment lines are dropped, and
    surrounding whitespace is removed from every pattern.

    Args:
        root: Directory that may contain an ignore file.

    Returns:
        The patterns in file order, or an empty list when no ignore file exists.
    """
    ignore_file = root / ".wisdomignore"
    if not ignore_file.is_file():
        # Fall back to .graphifyignore for compatibility
        ignore_file = root / ".graphifyignore"
    # is_file() (rather than exists()) so a directory with that name is not read.
    if not ignore_file.is_file():
        return []

    patterns: list[str] = []
    for raw in ignore_file.read_text(encoding="utf-8", errors="ignore").splitlines():
        entry = raw.strip()
        # Test the stripped text so indented "#" comments are not treated as patterns.
        if entry and not entry.startswith("#"):
            patterns.append(entry)
    return patterns
def up(password: str = "password") -> None:
    """Start the DozerDB container, creating it on first use.

    Safe to call repeatedly: a running container is left alone and a stopped
    one is restarted. Otherwise the data directories are created, the image is
    pulled and a new container is started.

    Args:
        password: Password for the ``neo4j`` user of a newly created container.
            It has no effect on a container that already exists.

    Raises:
        SystemExit: If the docker executable is missing or a docker command
            exits with a non-zero status.
    """

    def _docker(args: list[str], action: str) -> None:
        # Run a docker sub-command and stop on failure, so "started" is never
        # reported after a failed start/pull/run.
        result = _run(["docker", *args])
        if result.returncode != 0:
            print(f"error: {action} failed (docker exited with status {result.returncode})", file=sys.stderr)
            sys.exit(1)

    # Check if already running
    try:
        result = _run(["docker", "ps", "-q", "-f", f"name={_CONTAINER_NAME}"], capture=True)
    except FileNotFoundError:
        # subprocess raises this when the executable itself cannot be found.
        print("error: docker executable not found. Install Docker and try again.", file=sys.stderr)
        sys.exit(1)
    if result.stdout.strip():
        print(f" DozerDB already running ({_CONTAINER_NAME})")
        return

    # Check if stopped container exists
    result = _run(["docker", "ps", "-aq", "-f", f"name={_CONTAINER_NAME}"], capture=True)
    if result.stdout.strip():
        print(" Starting existing DozerDB container...")
        _docker(["start", _CONTAINER_NAME], "starting the existing container")
        _print_ready()
        return

    # Create data dirs
    for subdir in ("data", "logs", "import", "plugins"):
        (_DATA_DIR / subdir).mkdir(parents=True, exist_ok=True)

    print(f" Pulling {_IMAGE}...")
    _docker(["pull", _IMAGE], f"pulling {_IMAGE}")

    print(f" Starting DozerDB ({_CONTAINER_NAME})...")
    _docker(
        [
            "run", "-d",
            "--name", _CONTAINER_NAME,
            "-p", "7474:7474",
            "-p", "7687:7687",
            "-v", f"{_DATA_DIR}/data:/data",
            "-v", f"{_DATA_DIR}/logs:/logs",
            "-v", f"{_DATA_DIR}/import:/var/lib/neo4j/import",
            "-v", f"{_DATA_DIR}/plugins:/plugins",
            "--env", f"NEO4J_AUTH=neo4j/{password}",
            "--env", "NEO4J_PLUGINS=[\"apoc\"]",
            "--env", "NEO4J_apoc_export_file_enabled=true",
            "--env", "NEO4J_apoc_import_file_enabled=true",
            "--env", "NEO4J_dbms_security_procedures_unrestricted=*",
            _IMAGE,
        ],
        "starting the DozerDB container",
    )
    _print_ready()
    if password != "password":
        # _print_ready() always shows the default password in its connect hint.
        print(" Note: connect with the password you passed to `wisdom docker up`, not the default shown above.")
        print()
def export_cypher(session, out_dir: Path) -> Path:
    """Export all DIKW and Source nodes and their relationships as Cypher.

    The output is a list of idempotent ``MERGE`` statements written to
    ``wisdom_export.cypher`` inside *out_dir*.

    Knowledge/Experience/Insight/Wisdom nodes are keyed by ``id``. Source
    nodes are keyed by ``uri``, the property they are merged on elsewhere in
    this package, so each Source stays distinct and edges pointing at a Source
    can be re-created on import.

    Args:
        session: Neo4j session (anything with ``run(query)`` yielding records).
        out_dir: Directory to write into; created if missing.

    Returns:
        Path of the written ``wisdom_export.cypher`` file.
    """
    tiers = ("Knowledge", "Experience", "Insight", "Wisdom", "Source")

    def _key_for(label: str) -> str:
        # Property that uniquely identifies a node of this label.
        return "uri" if label == "Source" else "id"

    def _endpoint(var: str, labels, node_id, uri):
        # Build a labelled, keyed node pattern for a relationship endpoint,
        # or None when the node cannot be addressed unambiguously.
        label = next((t for t in tiers if t in (labels or ())), None)
        if label is None:
            return None
        value = uri if label == "Source" else node_id
        if value is None:
            return None
        return f"({var}:{label} {{{_key_for(label)}: {json.dumps(value)}}})"

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "wisdom_export.cypher"

    lines = ["// wisdomGraph Cypher export\n// Import with: cypher-shell -f wisdom_export.cypher\n"]

    # Export nodes by tier
    for label in tiers:
        key = _key_for(label)
        for record in session.run(f"MATCH (n:{label}) RETURN n"):
            node = dict(record["n"])
            if node.get(key) is None:
                # A null key cannot be merged on and would collapse unrelated nodes.
                continue
            props = ", ".join(f"{k}: {json.dumps(v)}" for k, v in node.items() if v is not None)
            lines.append(f"MERGE (n:{label} {{{key}: {json.dumps(node[key])}}}) SET n += {{{props}}};")

    # Export relationships
    result = session.run(
        """
        MATCH (a)-[r]->(b)
        WHERE (a:Knowledge OR a:Experience OR a:Insight OR a:Wisdom OR a:Source)
          AND (b:Knowledge OR b:Experience OR b:Insight OR b:Wisdom OR b:Source)
        RETURN labels(a) AS src_labels, a.id AS src_id, a.uri AS src_uri,
               labels(b) AS tgt_labels, b.id AS tgt_id, b.uri AS tgt_uri,
               type(r) AS rel, properties(r) AS props
        """
    )
    for record in result:
        src = _endpoint("a", record["src_labels"], record["src_id"], record["src_uri"])
        tgt = _endpoint("b", record["tgt_labels"], record["tgt_id"], record["tgt_uri"])
        if src is None or tgt is None:
            continue
        rel_props = ", ".join(f"{k}: {json.dumps(v)}" for k, v in (record["props"] or {}).items())
        props_str = f" {{{rel_props}}}" if rel_props else ""
        lines.append(f"MATCH {src}, {tgt} MERGE (a)-[:{record['rel']}{props_str}]->(b);")

    out_path.write_text("\n".join(lines), encoding="utf-8")
    return out_path
("Knowledge", "Experience", "Insight", "Wisdom"): + result = session.run( + f""" + MATCH (n:{label}) + RETURN n.id AS id, n.label AS label, n.tier AS tier, + n.confidence AS confidence, n.source_file AS source_file, + n.project AS project + """ + ) + for r in result: + d = dict(r) + d["type"] = label.lower() + nodes.append(d) + + edges = [] + result = session.run( + """ + MATCH (a)-[r]->(b) + WHERE (a:Knowledge OR a:Experience OR a:Insight OR a:Wisdom) + AND (b:Knowledge OR b:Experience OR b:Insight OR b:Wisdom) + RETURN a.id AS source, b.id AS target, type(r) AS relation, + r.confidence AS confidence, r.confidence_tag AS confidence_tag + """ + ) + for r in result: + edges.append(dict(r)) + + graph = {"nodes": nodes, "edges": edges, "format": "wisdomgraph-v1"} + out_path.write_text(json.dumps(graph, indent=2), encoding="utf-8") + return out_path + + +def export_obsidian(session, out_dir: Path, vault_dir: Path | None = None) -> Path: + """Export as Obsidian vault — one note per Wisdom node, index.md entry point.""" + vault = Path(vault_dir) if vault_dir else Path(out_dir) / "obsidian" + vault.mkdir(parents=True, exist_ok=True) + + # Index + index_lines = ["# Wisdom Graph — Obsidian Vault\n"] + + for label in ("Wisdom", "Insight", "Experience", "Knowledge"): + result = session.run( + f""" + MATCH (n:{label}) + RETURN n.id AS id, n.label AS label, + n.principle AS principle, n.content AS content, + n.confidence AS confidence + ORDER BY n.confidence DESC + LIMIT 50 + """ + ) + rows = [dict(r) for r in result] + if rows: + index_lines.append(f"\n## {label} tier\n") + for n in rows: + fname = _safe_filename(n["label"]) + ".md" + index_lines.append(f"- [[{fname}|{n['label']}]]") + # Write note + note = [f"# {n['label']}\n", f"**Tier:** {label}\n"] + if n.get("principle"): + note.append(f"\n**Principle:** {n['principle']}\n") + if n.get("content"): + note.append(f"\n{n['content']}\n") + if n.get("confidence"): + note.append(f"\n**Confidence:** {n['confidence']:.2f}\n") + 
def _html_to_text(html: str) -> str:
    """Convert an HTML document to plain text using only the standard library.

    Comments and whole ``<script>``, ``<style>`` and ``<head>`` elements
    (including their contents) are removed. Remaining tags become spaces,
    character references are decoded, and whitespace runs collapse to one
    space.

    Args:
        html: HTML markup.

    Returns:
        The visible text with normalized whitespace (may be empty).
    """
    # Local import under another name: the parameter shadows the ``html`` module.
    from html import unescape as _unescape

    # Drop comments first so markup inside them cannot confuse the later passes.
    text = re.sub(r"<!--.*?-->", " ", html, flags=re.DOTALL)
    # Remove scripts, styles, head, from opening tag through the matching
    # closing tag. ``\b`` keeps <header> from being mistaken for <head>.
    text = re.sub(
        r"<(script|style|head)\b[^>]*>.*?</\1\s*>",
        " ",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Remove tags
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities after tag removal, so an escaped "&lt;b&gt;" stays text.
    text = _unescape(text)
    # Normalize whitespace (also folds the non-breaking spaces from &nbsp;)
    return re.sub(r"\s+", " ", text).strip()
def _utcnow() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    import datetime as _dt

    moment = _dt.datetime.now(_dt.timezone.utc)
    return moment.isoformat()
def merge_nodes(session, nodes: list[dict]) -> int:
    """MERGE extraction nodes into Neo4j, keyed by ``id`` within their tier label.

    New nodes receive all non-empty properties plus ``timestamp`` and
    ``access_count``. Existing nodes are only upgraded: when the incoming
    confidence is higher, ``confidence`` and ``source_file`` are replaced.

    Args:
        session: Neo4j session (anything exposing ``run(query, **params)``).
        nodes: Node dicts; each must contain ``id``. Missing or ``None``
            optional fields fall back to their defaults.

    Returns:
        Number of nodes written (one MERGE per input node).
    """

    def _number(value, default, cast):
        # An explicit null in the extraction means "use the default", not "crash".
        return cast(default if value is None else value)

    written = 0
    for node in nodes:
        tier = (node.get("tier") or "knowledge").lower()
        label = _TIER_TO_LABEL.get(tier, "Knowledge")
        confidence = _number(node.get("confidence"), 1.0, float)

        candidate_props = {
            "label": node.get("label", ""),
            "content": node.get("content", ""),
            "source_file": node.get("source_file", ""),
            "source_loc": node.get("source_loc", ""),
            "confidence": confidence,
            "confidence_tag": node.get("confidence_tag", "EXTRACTED"),
            "project": node.get("project", ""),
            "tier": tier,
            "context": node.get("context", ""),
            "outcome": node.get("outcome", ""),
            "pattern_strength": _number(node.get("pattern_strength"), 0.0, float),
            "source_count": _number(node.get("source_count"), 1, int),
            "principle": node.get("principle", ""),
            "reinforcement_count": _number(node.get("reinforcement_count"), 0, int),
        }
        # Build property dict — exclude None and empty-string values
        props = {k: v for k, v in candidate_props.items() if v is not None and v != ""}

        # source_file is assigned BEFORE confidence: both compare against the
        # stored confidence, and if SET items are applied in order, raising
        # confidence first would make the source_file comparison always false.
        # coalesce() lets nodes without a stored confidence be upgraded too.
        session.run(
            f"""
            MERGE (n:{label} {{id: $id}})
            ON CREATE SET n += $props, n.timestamp = $ts, n.access_count = 0
            ON MATCH SET
                n.source_file = CASE WHEN $conf > coalesce(n.confidence, 0.0)
                                     THEN $src ELSE n.source_file END,
                n.confidence = CASE WHEN $conf > coalesce(n.confidence, 0.0)
                                    THEN $conf ELSE n.confidence END,
                n.access_count = coalesce(n.access_count, 0)
            """,
            id=node["id"],
            props=props,
            ts=_utcnow(),
            conf=confidence,
            src=node.get("source_file", ""),
        )
        written += 1
    return written
def link_nodes_to_source(session, node_ids: list[str], source_uri: str) -> None:
    """Create ``SOURCED_FROM`` edges from the given nodes to one Source node.

    All ids are sent in a single query and expanded with ``UNWIND``, instead
    of one round trip per node. ``MERGE`` keeps the operation idempotent. Ids
    that match no node, or a missing Source, simply produce no edge.

    Args:
        session: Neo4j session (anything exposing ``run(query, **params)``).
        node_ids: Ids of the nodes to link; duplicates are ignored.
        source_uri: ``uri`` of the Source node to link to.
    """
    # Deduplicate while preserving order; also accepts any iterable of ids.
    unique_ids = list(dict.fromkeys(node_ids))
    if not unique_ids:
        return  # nothing to link, so no database round trip
    session.run(
        """
        MATCH (s:Source {uri: $uri})
        UNWIND $ids AS nid
        MATCH (n {id: nid})
        MERGE (n)-[:SOURCED_FROM]->(s)
        """,
        ids=unique_ids,
        uri=source_uri,
    )
Returns stats.""" + validate_extraction(extraction) + + nodes = extraction.get("nodes", []) + edges = extraction.get("edges", []) + + # Enrich with project + tier defaults + for n in nodes: + n.setdefault("project", project) + n.setdefault("tier", "knowledge") + + node_count = merge_nodes(session, nodes) + edge_count = merge_edges(session, edges) + + if source_uri: + merge_source(session, source_uri) + link_nodes_to_source(session, [n["id"] for n in nodes], source_uri) + + return {"nodes": node_count, "edges": edge_count} diff --git a/wisdom/reflect.py b/wisdom/reflect.py new file mode 100644 index 0000000..b5110bb --- /dev/null +++ b/wisdom/reflect.py @@ -0,0 +1,257 @@ +"""DIKW promotion engine — elevate Knowledge → Experience → Insight → Wisdom. + +This module is designed to be called by Claude as a subagent during /wisdom reflect. +Each function returns a list of promotion dicts that merge.py can write. + +Promotion pipeline: +1. find_experience_candidates() — Knowledge appearing in 2+ projects +2. find_insight_candidates() — Experiences clustering around the same pattern +3. find_wisdom_candidates() — Insights with pattern_strength > 0.7 +4. 
def find_experience_candidates(session, project: str | None = None) -> list[dict]:
    """Find Knowledge labels seen in 2+ projects; these are promoted to Experience.

    Knowledge nodes are grouped by ``label``. A group qualifies when its nodes
    span at least two distinct projects. Nodes without a label are skipped,
    because the promotion step derives the Experience id from the label.

    Args:
        session: Neo4j session exposing ``begin_transaction()``.
        project: When given (non-empty), only groups that include this project
            are returned. ``None`` or ``""`` returns every qualifying group.

    Returns:
        One dict per group with keys ``label``, ``projects`` and ``ids``.
    """
    query = """
        MATCH (k:Knowledge)
        WHERE k.label IS NOT NULL
        WITH k.label AS lbl, collect(DISTINCT k.project) AS projects, collect(k.id) AS ids
        WHERE size(projects) >= 2
          AND ($project IS NULL OR $project IN projects)
        RETURN lbl, projects, ids
    """
    with session.begin_transaction() as tx:
        # An empty project string means "no filter", same as None.
        return [
            {
                "label": record["lbl"],
                "projects": record["projects"],
                "ids": record["ids"],
            }
            for record in tx.run(query, project=project or None)
        ]
def promote_to_insight(session, candidates: list[dict]) -> int:
    """Upsert one Insight per candidate Experience and link them with REVEALS.

    Pattern strength is ``log10(source_count + 1)`` scaled by
    ``peer_count / 5`` (capped at 1), with the product capped at 1.0.

    Returns:
        Number of candidates processed.
    """
    upsert_insight = """
        MERGE (i:Insight {id: $id})
        ON CREATE SET
            i.label = $label,
            i.content = $content,
            i.pattern_strength = $strength,
            i.source_count = $count,
            i.timestamp = $ts,
            i.tier = 'insight'
        ON MATCH SET
            i.pattern_strength = CASE WHEN $strength > i.pattern_strength
                                      THEN $strength ELSE i.pattern_strength END,
            i.source_count = $count
        """
    link_experience = """
        MATCH (e:Experience {id: $eid}), (i:Insight {id: $iid})
        MERGE (e)-[:REVEALS]->(i)
        """

    total = 0
    for cand in candidates:
        insight_id = "ins:" + cand["label"].lower().replace(" ", "_")
        sources = cand["source_count"]
        peers = cand["peer_count"]

        breadth = math.log10(max(sources, 1) + 1)
        cohesion = min(peers / 5.0, 1.0)
        strength = min(breadth * cohesion, 1.0)

        session.run(
            upsert_insight,
            id=insight_id,
            label=cand["label"],
            content=f"Pattern detected across {sources} sources with {peers} related experiences.",
            strength=strength,
            count=sources,
            ts=datetime.now(timezone.utc).isoformat(),
        )
        session.run(link_experience, eid=cand["id"], iid=insight_id)
        total += 1
    return total
def write_wisdom(session, wisdom_nodes: list[dict]) -> int:
    """Upsert Wisdom nodes built from LLM-generated principles.

    Each entry needs ``id``, ``label`` and ``principle``. ``confidence``
    defaults to 0.7. When ``insight_id`` is present, the originating Insight
    is linked to the Wisdom node with CRYSTALLIZES_INTO.

    Returns:
        Number of Wisdom entries written.
    """
    upsert_wisdom = """
        MERGE (wis:Wisdom {id: $id})
        ON CREATE SET
            wis.label = $label,
            wis.principle = $principle,
            wis.confidence = $confidence,
            wis.reinforcement_count = 0,
            wis.timestamp = $ts,
            wis.tier = 'wisdom'
        ON MATCH SET
            wis.principle = $principle,
            wis.confidence = CASE WHEN $confidence > wis.confidence
                                  THEN $confidence ELSE wis.confidence END
        """
    link_insight = """
        MATCH (i:Insight {id: $iid}), (wis:Wisdom {id: $wid})
        MERGE (i)-[:CRYSTALLIZES_INTO]->(wis)
        """

    count = 0
    for entry in wisdom_nodes:
        session.run(
            upsert_wisdom,
            id=entry["id"],
            label=entry["label"],
            principle=entry["principle"],
            confidence=float(entry.get("confidence", 0.7)),
            ts=datetime.now(timezone.utc).isoformat(),
        )
        origin = entry.get("insight_id")
        if origin:
            session.run(link_insight, iid=origin, wid=entry["id"])
        count += 1
    return count
def render_report(session, project: str = "", out_dir: Path | None = None) -> str:
    """Query Neo4j and render WISDOM_REPORT.md.

    Tier and edge counts are read through *session* itself. Previously the
    function tried to recover a driver from the session and fell back to
    ``None``, which fails as soon as a session is opened on it.

    Args:
        session: Neo4j session exposing ``run(query)``.
        project: Name shown in the report title. It only labels the report;
            the queries below are not filtered by project.
        out_dir: When given, the report is also written to
            ``<out_dir>/WISDOM_REPORT.md`` (directories are created).

    Returns:
        The report as a markdown string.
    """
    from .traverse import god_nodes

    def _count(query: str) -> int:
        # Single-row aggregate; treat a missing record as zero.
        record = session.run(query).single()
        return record["c"] if record else 0

    stats = {
        label: _count(f"MATCH (n:{label}) RETURN count(n) AS c")
        for label in ("Knowledge", "Experience", "Insight", "Wisdom", "Source")
    }
    stats["edges"] = _count("MATCH ()-[r]->() RETURN count(r) AS c")

    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    lines = [
        f"# Wisdom Report — {project or 'all projects'} ({date_str})",
        "",
        "## Graph status",
        f"- {stats.get('Knowledge', 0)} Knowledge"
        f" · {stats.get('Experience', 0)} Experience"
        f" · {stats.get('Insight', 0)} Insight"
        f" · {stats.get('Wisdom', 0)} Wisdom nodes",
        f"- {stats.get('edges', 0)} total edges",
        f"- {stats.get('Source', 0)} source files/URLs indexed",
        "",
    ]

    # Top Wisdom nodes
    wisdom_result = session.run(
        """
        MATCH (w:Wisdom)
        RETURN w.label AS label, w.principle AS principle,
               w.confidence AS confidence,
               w.reinforcement_count AS reinforcement_count
        ORDER BY w.confidence * (1 + coalesce(w.reinforcement_count, 0)) DESC
        LIMIT 10
        """
    )
    wisdom_rows = [dict(r) for r in wisdom_result]

    if wisdom_rows:
        lines.append("## Top Wisdom (by confidence × reinforcement)")
        for i, w in enumerate(wisdom_rows, 1):
            conf = w.get("confidence") or 0.0
            reinf = w.get("reinforcement_count") or 0
            principle = w.get("principle") or w.get("label") or ""
            lines.append(f"{i}. \"{principle}\"")
            lines.append(f"   [confidence: {conf:.2f}, reinforced: {reinf}x]")
        lines.append("")

    # Fragile principles
    fragile_result = session.run(
        """
        MATCH (w:Wisdom)
        WHERE w.confidence < 0.4
        RETURN w.label AS label, w.confidence AS confidence
        ORDER BY w.confidence ASC
        LIMIT 5
        """
    )
    fragile_rows = [dict(r) for r in fragile_result]
    if fragile_rows:
        lines.append("## Fragile principles (need more experience to strengthen)")
        for w in fragile_rows:
            # `or 0.0` guards the format spec: a key present with a null value
            # would otherwise raise on ":.2f".
            lines.append(f"- \"{w['label']}\" [confidence: {(w.get('confidence') or 0.0):.2f}]")
        lines.append("")

    # Cross-project god nodes
    god_node_list = god_nodes(session, limit=10)
    if god_node_list:
        lines.append("## God nodes (most connected across all projects)")
        for i, n in enumerate(god_node_list, 1):
            lines.append(f"{i}. `{n['label']}` [{n.get('tier') or 'knowledge'}] — {n.get('degree') or 0} edges")
        lines.append("")

    # Contradictions
    contra_result = session.run(
        """
        MATCH (a)-[:CONTRADICTS]->(b)
        RETURN a.label AS a_label, b.label AS b_label, a.tier AS tier
        LIMIT 5
        """
    )
    contra_rows = [dict(r) for r in contra_result]
    if contra_rows:
        lines.append("## Contradictions to resolve")
        for c in contra_rows:
            lines.append(f"- `{c['a_label']}` ↔ `{c['b_label']}` [{c.get('tier') or 'knowledge'}]")
        lines.append("  Run `/wisdom reflect` to reconcile.")
        lines.append("")

    # Suggested questions
    lines.extend([
        "## Suggested questions",
        "_Questions this graph is uniquely positioned to answer:_",
        "",
    ])
    # Highest-degree nodes serve as a cheap stand-in for "bridge" concepts.
    bridge_result = session.run(
        """
        MATCH (n)
        WHERE (n:Knowledge OR n:Experience OR n:Insight OR n:Wisdom)
        WITH n, count{(n)--()} AS degree
        WHERE degree >= 3
        RETURN n.label AS label, n.tier AS tier, degree
        ORDER BY degree DESC
        LIMIT 4
        """
    )
    for r in bridge_result:
        lines.append(f"- What makes `{r['label']}` a central concept in your work?")
    lines.append("- What patterns connect your most-used abstractions across projects?")
    lines.append("")

    report = "\n".join(lines)

    if out_dir:
        out_path = Path(out_dir) / "WISDOM_REPORT.md"
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(report, encoding="utf-8")

    return report
def validate_graph_path(path: str | Path, out_dir: Path) -> Path:
    """Resolve *path* and ensure it lies inside *out_dir* (no path traversal).

    Containment is checked on path components, not on a string prefix, so a
    sibling such as ``<out_dir>_evil`` is rejected even though its text starts
    with the text of *out_dir*.

    Args:
        path: Candidate path, absolute or relative to the working directory.
        out_dir: Directory the result must stay within.

    Returns:
        The fully resolved path.

    Raises:
        ValueError: If the resolved path is outside *out_dir*.
    """
    resolved = Path(path).resolve()
    base = Path(out_dir).resolve()
    try:
        # relative_to() raises ValueError unless `base` is `resolved` itself
        # or one of its parents.
        resolved.relative_to(base)
    except ValueError:
        raise ValueError(f"Path {resolved} escapes output directory {out_dir}") from None
    return resolved
The only difference: +- Extract files sequentially, not in parallel batches +- Use the Bash tool to run `python3 -m wisdom ` for CLI operations +- Write AGENTS.md instead of CLAUDE.md for always-on integration + +See `skill.md` for the full pipeline documentation. + +## Install for OpenClaw + +```bash +pip install wisdomgraph && wisdom install --platform claw +``` + +Then in your project: +```bash +wisdom claw install # writes wisdomGraph section to AGENTS.md +``` + +## Always-on (AGENTS.md) + +After running `wisdom claw install`, your AGENTS.md will contain: + +```markdown +## wisdomGraph + +This project uses wisdomGraph — accumulative Neo4j-native wisdom memory. + +Rules: +- Before answering architecture or codebase questions, read wisdom-out/WISDOM_REPORT.md +- Use the wisdom CLI to query the graph: wisdom ask "your question" +``` + +OpenClaw reads AGENTS.md before every session — the wisdom graph is always-on. diff --git a/wisdom/skill.md b/wisdom/skill.md new file mode 100644 index 0000000..65437f8 --- /dev/null +++ b/wisdom/skill.md @@ -0,0 +1,417 @@ +--- +name: wisdom +description: accumulative Neo4j-native DIKW wisdom memory — absorb any input, merge into persistent graph, ask questions that traverse Knowledge→Experience→Insight→Wisdom +trigger: /wisdom +--- + +# /wisdom + +Turn any folder of files, URLs, or mixed input into a living Neo4j wisdom graph. Every run **merges** — the graph accumulates across sessions, projects, and months. Facts become patterns. Patterns become insights. Insights become wisdom. + +Unlike graphify (snapshot → file), wisdomGraph writes directly to Neo4j. Ask it a question next week and it remembers everything you've ever absorbed. 
+ +## Usage + +``` +/wisdom # absorb current directory +/wisdom # absorb a specific folder +/wisdom --mode deep # aggressive INFERRED edge extraction +/wisdom --update # re-absorb only changed files + +/wisdom add # absorb a URL (paper, tweet, page) +/wisdom add --author "Name" # tag the source author + +/wisdom ask "" # query the wisdom graph +/wisdom ask "" --tier wisdom # only traverse Wisdom-tier nodes +/wisdom reflect # run DIKW promotion pass +/wisdom reflect --project # reflect on one corpus only + +/wisdom path "" "" # shortest path between concepts +/wisdom explain "" # full DIKW chain for a concept +/wisdom god-nodes # most-connected concepts across all projects + +/wisdom export --cypher # dump as Cypher statements +/wisdom export --json # export graph.json (graphify-compatible) +/wisdom export --obsidian # export Obsidian vault + +/wisdom status # graph stats by tier +/wisdom purge --project # remove one corpus from graph +``` + +--- + +## Step 1 — Pre-flight: check Neo4j connection + +Before doing anything else, verify the Neo4j connection is live. + +Run in a subagent: +```python +import subprocess +result = subprocess.run(["python3", "-m", "wisdom", "status"], capture_output=True, text=True) +print(result.stdout or result.stderr) +``` + +**If connection fails:** +- Ask the user: "Do you want to use Neo4j Aura (cloud, free) or DozerDB local Docker?" +- **Aura**: "Create a free instance at https://neo4j.com/cloud/aura, then run: `wisdom connect --user neo4j --password `" +- **Docker**: Run `python3 -m wisdom docker up` then `python3 -m wisdom connect bolt://localhost:7687 --user neo4j --password password` + +Do NOT proceed with absorption until the connection is verified. 
+ +--- + +## Step 2 — Detect files + +For `/wisdom ` or `/wisdom .`: + +Run in a subagent: +```python +import sys +sys.path.insert(0, '.') +from wisdom.detect import detect +from pathlib import Path +result = detect(Path("")) +import json +print(json.dumps(result, indent=2)) +``` + +Print the detection summary: +- Total files found by type +- Any sensitive files skipped +- `.wisdomignore` patterns active + +If 0 files found, tell the user and stop. + +--- + +## Step 3 — Extract (parallel subagents) + +### Code files (AST — no LLM needed, fast) + +For each code file in `result["files"]["code"]`, run a subagent: +```python +# AST extraction — uses tree-sitter, no LLM tokens +import sys +sys.path.insert(0, '.') +# Re-use graphify's extract module for AST parsing +try: + from graphify.extract import extract + data = extract("") + # Add DIKW metadata + for node in data.get("nodes", []): + node["tier"] = "knowledge" + node["confidence_tag"] = "EXTRACTED" + node["confidence"] = 1.0 + import json + print(json.dumps(data)) +except ImportError: + # Fallback: minimal extraction without tree-sitter + from pathlib import Path + path = Path("") + import hashlib + node_id = hashlib.sha256(str(path.resolve()).encode()).hexdigest()[:16] + data = { + "nodes": [{"id": node_id, "label": path.name, "content": "", "source_file": str(path), "tier": "knowledge", "confidence": 1.0, "confidence_tag": "EXTRACTED"}], + "edges": [], + "source_file": str(path) + } + import json + print(json.dumps(data)) +``` + +### Document / paper / image files (LLM extraction) + +For each doc/paper/image, run a subagent with this prompt: + +``` +Extract a knowledge graph from the following file for the wisdomGraph system. 
+ +File: +Content: + +Return a JSON object with this exact schema: +{ + "nodes": [ + { + "id": "", + "label": "", + "content": "<1-2 sentence description>", + "source_file": "", + "tier": "", + "confidence": <0.0-1.0>, + "confidence_tag": "", + "principle": "" + } + ], + "edges": [ + { + "source": "", + "target": "", + "relation": "", + "confidence": <0.0-1.0>, + "confidence_tag": "" + } + ], + "source_file": "" +} + +Tier assignment rules: +- knowledge: a specific fact, function, class, concept, or documented behavior +- experience: a pattern observed in context (decision + outcome), or same concept in multiple implementations +- insight: a recurring pattern across 3+ examples, a principle emerging from experience +- wisdom: a falsifiable, actionable principle you would stake your reputation on + +Be honest: use EXTRACTED for things explicitly stated, INFERRED for reasonable deductions, AMBIGUOUS for uncertain connections. +Focus on: what connects to what, why things were designed a certain way, what problems they solve. +Extract rationale from: docstrings, # NOTE:, # WHY:, # IMPORTANT: comments, design decision prose. +``` + +**For --mode deep**: add to the prompt: "Extract more INFERRED edges. Identify semantic similarity between concepts across different parts of the file. Look for implicit dependencies, conceptual relationships, and design patterns." 
+ +--- + +## Step 4 — Classify and MERGE into Neo4j + +After all extractions complete, run in a subagent: + +```python +import sys, json +sys.path.insert(0, '.') +from wisdom.connect import get_driver, ensure_schema +from wisdom.classify import classify_nodes, build_dikw_edges +from wisdom.merge import merge_extraction +from wisdom.cache import save_extractions + +driver = get_driver() +ensure_schema(driver) + +all_extractions = +project = "" + +total_nodes = 0 +total_edges = 0 + +with driver.session() as session: + for extraction in all_extractions: + nodes = extraction.get("nodes", []) + edges = extraction.get("edges", []) + + # DIKW classification + classified_nodes = classify_nodes(nodes, edges, project) + enriched_edges = build_dikw_edges(classified_nodes, edges) + + extraction["nodes"] = classified_nodes + extraction["edges"] = enriched_edges + + stats = merge_extraction( + session, + extraction, + source_uri=extraction.get("source_file", ""), + project=project, + ) + total_nodes += stats["nodes"] + total_edges += stats["edges"] + +# Cache for incremental updates +save_extractions(all_extractions) + +print(f"Merged: {total_nodes} nodes, {total_edges} edges") +driver.close() +``` + +Print the merge summary to the user. + +--- + +## Step 5 — Generate WISDOM_REPORT.md + +Run in a subagent: +```python +import sys +sys.path.insert(0, '.') +from wisdom.connect import get_driver, ensure_schema +from wisdom.report import render_report +from pathlib import Path + +driver = get_driver() +ensure_schema(driver) +out_dir = Path("wisdom-out") + +with driver.session() as session: + report = render_report(session, project="", out_dir=out_dir) + +driver.close() +print(report) +``` + +Show the user the WISDOM_REPORT.md content and tell them it's been saved to `wisdom-out/WISDOM_REPORT.md`. + +--- + +## /wisdom add + +1. Run `python3 -m wisdom status` to verify connection +2. 
Fetch and save: run `from wisdom.ingest import ingest; path = ingest("", corpus_dir=Path("./raw"), author="")` in a subagent +3. Run the extraction pipeline (Step 3) on the saved file +4. MERGE (Step 4) +5. Regenerate WISDOM_REPORT.md (Step 5) + +--- + +## /wisdom ask "" + +Run in a subagent: +```python +import sys +sys.path.insert(0, '.') +from wisdom.connect import get_driver, ensure_schema +from wisdom.traverse import answer_question + +driver = get_driver() +ensure_schema(driver) +tier_filter = "" + +with driver.session() as session: + result = answer_question(session, "", tier_filter=tier_filter if tier_filter else None) + +driver.close() +import json +print(json.dumps(result, indent=2)) +``` + +Format the result for the user: +- Lead with Wisdom nodes (if any) — show the `principle` field prominently +- Follow with Insight nodes — show `content` and `pattern_strength` +- Show the DIKW provenance chain: Knowledge → Experience → Insight → Wisdom +- If no Wisdom found: show the best Knowledge/Experience matches and suggest `/wisdom reflect` to promote them + +--- + +## /wisdom reflect + +The promotion pass. Tell the user: "Running DIKW promotion — elevating Knowledge → Experience → Insight → Wisdom..." + +### 3a. Run automated promotions + +Run in a subagent: +```python +import sys +sys.path.insert(0, '.') +from wisdom.connect import get_driver, ensure_schema +from wisdom.reflect import run_reflect + +driver = get_driver() +ensure_schema(driver) +project = "" + +with driver.session() as session: + stats = run_reflect(session, project=project) + +driver.close() +import json +print(json.dumps(stats, indent=2)) +``` + +### 3b. LLM Wisdom generation + +From `stats["wisdom_candidates_data"]`, for each candidate Insight with `pattern_strength >= 0.5`: + +Generate a Wisdom principle with this prompt: +``` +You are distilling wisdom from accumulated experience. 
+ +Insight: "" +Pattern: "" +Source count: +Pattern strength: + +Generate a single, falsifiable, actionable Wisdom principle. +The principle should be: +- Specific enough to act on (not "write good code") +- Grounded in the pattern (no hallucination) +- Stated as something you would stake your professional reputation on + +Return JSON: +{ + "id": "wis:", + "label": "", + "principle": "<1-2 sentence actionable principle>", + "confidence": <0.5-0.95>, + "insight_id": "" +} +``` + +Then write the generated Wisdom nodes to Neo4j: +```python +from wisdom.reflect import write_wisdom +with driver.session() as session: + count = write_wisdom(session, ) +print(f"Generated {count} new Wisdom nodes") +``` + +### 3c. Regenerate report + +Run Step 5 again to update WISDOM_REPORT.md. + +Show the user: +- How many nodes were promoted at each tier +- How many new Wisdom principles were generated +- The updated top Wisdom nodes + +--- + +## /wisdom path "" "" + +```python +from wisdom.traverse import shortest_path +with driver.session() as session: + path = shortest_path(session, "", "") +``` + +Format as: `ConceptA [knowledge] → ConceptB [experience] → ConceptC [wisdom]` + +--- + +## /wisdom explain "" + +```python +from wisdom.traverse import explain_node +with driver.session() as session: + result = explain_node(session, "") +``` + +Show: +1. The node and its tier +2. Full DIKW chain (which Knowledge grounds it, which Experience revealed it, which Wisdom it supports) +3. 
"""Cypher traversal engine — answer natural language questions by walking the DIKW graph."""
import logging
from datetime import datetime, timezone

logger = logging.getLogger(__name__)


def _utcnow() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def full_text_search(session, query: str, limit: int = 10) -> list[dict]:
    """Full-text search across all DIKW tiers. Returns scored node list.

    Queries the ``wisdom_content`` full-text index; if that call raises for
    any reason, falls back to a plain keyword scan.

    Args:
        session: Object exposing ``run(cypher, parameters)``.
        query: Search text handed to the full-text index.
        limit: Maximum number of rows to return.

    Returns:
        One dict per matching node, including a ``score`` key.
    """
    try:
        # Parameters go in a dict, not as keyword arguments: a keyword named
        # ``query`` collides with the first positional parameter of
        # ``Session.run`` and raises TypeError before anything is sent.
        result = session.run(
            """
            CALL db.index.fulltext.queryNodes('wisdom_content', $query)
            YIELD node, score
            RETURN node.id AS id, node.label AS label, node.tier AS tier,
                   node.content AS content, node.principle AS principle,
                   node.confidence AS confidence, score
            ORDER BY score DESC
            LIMIT $limit
            """,
            {"query": query, "limit": limit},
        )
        return [dict(r) for r in result]
    except Exception:
        # Deliberate best-effort: the fallback keeps search usable when the
        # full-text call fails. Log it so the failure is not invisible.
        logger.debug("Full-text search failed; using keyword fallback", exc_info=True)
        return _keyword_fallback(session, query, limit)


def _keyword_fallback(session, query: str, limit: int) -> list[dict]:
    """Keyword scan fallback when full-text index unavailable.

    Uses at most the first five whitespace-separated terms longer than two
    characters, lower-cased. Terms and limit are sent as query parameters so
    user text never becomes part of the Cypher statement.
    """
    terms = [t.lower() for t in query.split() if len(t) > 2][:5]
    if not terms:
        return []
    result = session.run(
        """
        MATCH (n)
        WHERE (n:Knowledge OR n:Experience OR n:Insight OR n:Wisdom)
          AND any(term IN $terms WHERE
                  toLower(n.label) CONTAINS term
                  OR toLower(n.content) CONTAINS term)
        RETURN n.id AS id, n.label AS label, n.tier AS tier,
               n.content AS content, n.principle AS principle,
               n.confidence AS confidence,
               1.0 AS score
        LIMIT $limit
        """,
        {"terms": terms, "limit": limit},
    )
    return [dict(r) for r in result]


def walk_dikw_path(session, node_id: str, depth: int = 3) -> list[dict]:
    """Walk up the DIKW hierarchy from a node. Returns the full provenance chain.

    Args:
        session: Object exposing ``run(cypher, parameters)``.
        node_id: ``id`` property of the start node.
        depth: Maximum number of GROUNDS/REVEALS/CRYSTALLIZES_INTO hops.

    Returns:
        Distinct nodes on any matching path, highest tier first.

    Raises:
        ValueError: If *depth* is not an integer >= 1.
    """
    max_hops = int(depth)
    if max_hops < 1:
        raise ValueError(f"depth must be >= 1, got {depth!r}")
    # The hop bound is spliced into the query text as a validated int,
    # because it cannot be sent as a query parameter.
    cypher = """
        MATCH path = (start {id: $id})-[:GROUNDS|REVEALS|CRYSTALLIZES_INTO*1..%d]->(end)
        UNWIND nodes(path) AS n
        RETURN DISTINCT n.id AS id, n.label AS label, n.tier AS tier,
               n.content AS content, n.principle AS principle,
               n.confidence AS confidence,
               n.pattern_strength AS pattern_strength,
               n.reinforcement_count AS reinforcement_count
        ORDER BY
            CASE n.tier
                WHEN 'wisdom' THEN 4
                WHEN 'insight' THEN 3
                WHEN 'experience' THEN 2
                ELSE 1
            END DESC
        """ % max_hops
    result = session.run(cypher, {"id": node_id})
    return [dict(r) for r in result]


def get_provenance(session, node_id: str) -> list[dict]:
    """Get source files that contributed to a node."""
    result = session.run(
        """
        MATCH (n {id: $id})-[:SOURCED_FROM]->(s:Source)
        RETURN s.uri AS uri, s.author AS author, s.ingested_at AS ingested_at
        """,
        {"id": node_id},
    )
    return [dict(r) for r in result]


def increment_access(session, node_ids: list[str]) -> None:
    """Increment access_count on traversed nodes (feeds reinforcement)."""
    if not node_ids:
        return  # nothing to update; skip the round trip
    session.run(
        """
        MATCH (n) WHERE n.id IN $ids
        SET n.access_count = coalesce(n.access_count, 0) + 1,
            n.last_accessed = $ts
        """,
        {"ids": node_ids, "ts": _utcnow()},
    )


def increment_wisdom_reinforcement(session, wisdom_ids: list[str]) -> None:
    """Increment reinforcement_count on Wisdom nodes that were returned."""
    if not wisdom_ids:
        return  # nothing to update; skip the round trip
    session.run(
        """
        MATCH (w:Wisdom) WHERE w.id IN $ids
        SET w.reinforcement_count = coalesce(w.reinforcement_count, 0) + 1,
            w.last_reinforced = $ts
        """,
        {"ids": wisdom_ids, "ts": _utcnow()},
    )
""", + a=label_a, + b=label_b, + ) + return [dict(r) for r in result] + + +def explain_node(session, label: str) -> dict: + """Return full DIKW context for a node identified by label.""" + # Find node + result = session.run( + """ + MATCH (n) + WHERE (n:Knowledge OR n:Experience OR n:Insight OR n:Wisdom) + AND toLower(n.label) CONTAINS toLower($label) + RETURN n.id AS id, n.label AS label, n.tier AS tier, + n.content AS content, n.principle AS principle, + n.confidence AS confidence + LIMIT 1 + """, + label=label, + ) + record = result.single() + if not record: + return {"error": f"No node found matching '{label}'"} + + node = dict(record) + node_id = node["id"] + + # Walk up + node["dikw_chain"] = walk_dikw_path(session, node_id) + # Get sources + node["sources"] = get_provenance(session, node_id) + + return node + + +def god_nodes(session, limit: int = 10) -> list[dict]: + """Return highest-degree nodes across all tiers.""" + result = session.run( + """ + MATCH (n) + WHERE (n:Knowledge OR n:Experience OR n:Insight OR n:Wisdom) + RETURN n.id AS id, n.label AS label, n.tier AS tier, + n.confidence AS confidence, + count{(n)--()} AS degree + ORDER BY degree DESC + LIMIT $limit + """, + limit=limit, + ) + return [dict(r) for r in result] + + +def answer_question(session, question: str, tier_filter: str | None = None) -> dict: + """Top-level query: search, walk DIKW paths, score, return answer package.""" + candidates = full_text_search(session, question, limit=5) + if not candidates: + return {"answer": "No matching nodes found in the wisdom graph.", "nodes": []} + + # Filter by tier if requested + if tier_filter: + candidates = [c for c in candidates if c.get("tier") == tier_filter.lower()] + if not candidates: + return {"answer": f"No {tier_filter}-tier nodes matched this question.", "nodes": []} + + all_nodes: list[dict] = [] + wisdom_ids: list[str] = [] + all_ids: list[str] = [] + + for candidate in candidates: + chain = walk_dikw_path(session, candidate["id"]) + 
"""Schema validation for extraction dicts before Neo4j MERGE."""

VALID_TIERS = {"knowledge", "experience", "insight", "wisdom"}
VALID_CONFIDENCE_TAGS = {"EXTRACTED", "INFERRED", "AMBIGUOUS"}
VALID_RELATIONS = {
    "calls", "imports", "uses", "defines", "implements", "extends",
    "references", "depends_on", "semantically_similar_to",
    "conceptually_related_to", "contradicts", "grounds",
    "reveals", "crystallizes_into", "reinforces", "sourced_from",
    "rationale_for",
}


def validate_node(node: dict) -> None:
    """Raise ValueError if node dict is malformed.

    A node must be a dict with non-empty string ``id`` and ``label``. ``tier``
    is optional (missing or ``None`` means "knowledge") and is compared
    case-insensitively against ``VALID_TIERS``.

    Raises:
        ValueError: On any schema violation, including a non-string tier.
    """
    if not isinstance(node, dict):
        raise ValueError(f"Node must be a dict, got {type(node).__name__}: {node!r}")
    if not isinstance(node.get("id"), str) or not node["id"]:
        raise ValueError(f"Node missing 'id': {node}")
    if not isinstance(node.get("label"), str) or not node["label"]:
        raise ValueError(f"Node missing 'label': {node}")
    tier = node.get("tier")
    if tier is None:
        tier = "knowledge"
    # Check the type before lower(): a non-string tier must surface as the
    # documented ValueError, not as an AttributeError.
    if isinstance(tier, str):
        tier = tier.lower()
    if tier not in VALID_TIERS:
        raise ValueError(f"Invalid tier '{tier}' in node {node['id']!r}")


def validate_edge(edge: dict) -> None:
    """Raise ValueError if edge dict is malformed.

    An edge must be a dict with non-empty string ``source`` and ``target``.
    ``confidence_tag`` defaults to "EXTRACTED" and must be one of
    ``VALID_CONFIDENCE_TAGS``.
    """
    if not isinstance(edge, dict):
        raise ValueError(f"Edge must be a dict, got {type(edge).__name__}: {edge!r}")
    if not isinstance(edge.get("source"), str) or not edge["source"]:
        raise ValueError(f"Edge missing 'source': {edge}")
    if not isinstance(edge.get("target"), str) or not edge["target"]:
        raise ValueError(f"Edge missing 'target': {edge}")
    conf = edge.get("confidence_tag", "EXTRACTED")
    # isinstance guard: an unhashable tag (e.g. a list) would otherwise make
    # the set membership test raise TypeError.
    if not isinstance(conf, str) or conf not in VALID_CONFIDENCE_TAGS:
        raise ValueError(f"Invalid confidence_tag '{conf}' in edge {edge}")


def validate_extraction(data: dict) -> None:
    """Validate full extraction dict. Raises ValueError on schema error.

    Requires ``nodes`` and ``edges`` to be lists, then validates every entry
    with ``validate_node`` / ``validate_edge``.
    """
    if not isinstance(data, dict):
        raise ValueError(f"Extraction must be a dict, got {type(data).__name__}")
    if not isinstance(data.get("nodes"), list):
        raise ValueError("Extraction dict missing 'nodes' list")
    if not isinstance(data.get("edges"), list):
        raise ValueError("Extraction dict missing 'edges' list")
    for node in data["nodes"]:
        validate_node(node)
    for edge in data["edges"]:
        validate_edge(edge)
Chinese README: README.zh-CN.md — full translation with technical accuracy preserved CI: ci.sh — creates venv, installs pytest, runs tests (no Docker needed) .gitignore — excludes __pycache__, .venv, wisdom-out, .wisdom/ Co-Authored-By: Claude Sonnet 4.6 --- .claude/settings.local.json | 11 +- .gitignore | 11 ++ README.zh-CN.md | 249 ++++++++++++++++++++++++++++++++++++ ci.sh | 15 +++ tests/__init__.py | 0 tests/test_cache.py | 68 ++++++++++ tests/test_classify.py | 113 ++++++++++++++++ tests/test_detect.py | 98 ++++++++++++++ tests/test_ingest.py | 46 +++++++ tests/test_security.py | 74 +++++++++++ tests/test_validate.py | 84 ++++++++++++ wisdom/__main__.py | 16 +-- 12 files changed, 776 insertions(+), 9 deletions(-) create mode 100644 .gitignore create mode 100644 README.zh-CN.md create mode 100755 ci.sh create mode 100644 tests/__init__.py create mode 100644 tests/test_cache.py create mode 100644 tests/test_classify.py create mode 100644 tests/test_detect.py create mode 100644 tests/test_ingest.py create mode 100644 tests/test_security.py create mode 100644 tests/test_validate.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 27cd798..e67423d 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -3,7 +3,16 @@ "allow": [ "Bash(git push -u origin claude/determined-volhard)", "Bash(git add .)", - "Bash(git add -A)" + "Bash(git add -A)", + "Bash(git commit:*)", + "Bash(git push)", + "Bash(python3 -m pytest tests/ -q)", + "Bash(pip3 install pytest -q)", + "Bash(python3 -m venv /tmp/wisdom-test-venv)", + "Bash(/tmp/wisdom-test-venv/bin/pip install pytest -q)", + "Bash(/tmp/wisdom-test-venv/bin/python -m pytest tests/ -q)", + "Bash(chmod +x /Users/chinkeonglam/wisdomGraph/.claude/worktrees/determined-volhard/ci.sh)", + "Bash(git rm -r --cached tests/__pycache__ wisdom/__pycache__)" ] } } diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..66625e6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ 
+__pycache__/ +*.pyc +*.pyo +.venv/ +venv/ +*.egg-info/ +dist/ +build/ +wisdom-out/ +.wisdom/ +*.tmp diff --git a/README.zh-CN.md b/README.zh-CN.md new file mode 100644 index 0000000..7733a6e --- /dev/null +++ b/README.zh-CN.md @@ -0,0 +1,249 @@ +# wisdomGraph + +[English](README.md) | [简体中文](README.zh-CN.md) + +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) +[![Neo4j](https://img.shields.io/badge/Neo4j-native-008CC1?logo=neo4j)](https://neo4j.com) +[![Claude Code](https://img.shields.io/badge/Claude%20Code-skill-blueviolet)](https://claude.ai/code) +[![OpenClaw](https://img.shields.io/badge/OpenClaw-skill-orange)](https://openclaw.ai) + +> **graphify 给你快照。wisdomGraph 给你复利增长的记忆。** + +在 Claude Code 或 OpenClaw 中输入 `/wisdom`。把你的代码库、笔记、论文、对话喂给它 —— 每次运行都会**合并**进一个活跃的 Neo4j 图谱。图谱不会重置,只会积累。事实变成模式,模式变成洞察,洞察变成智慧。 + +``` +/wisdom . # 将当前项目吸收进智慧图谱 +/wisdom ask "我所有项目中有哪些反复出现的模式?" +/wisdom reflect # 启动 DIKW 晋升,形成智慧闭环 +``` + +--- + +## 相较于 graphify 的质变 + +graphify 在其定位上做得很好:把一个文件夹变成知识图谱快照。跑一次,生成 `graph.json` 和 `GRAPH_REPORT.md`,读完,下次会话从头开始。 + +wisdomGraph 做的是根本不同的事。 + +| | graphify | wisdomGraph | +|---|---|---| +| **存储** | `graph.json` 文件(每个项目独立) | Neo4j(持久化,跨所有项目) | +| **节点类型** | 扁平(代码实体、概念) | DIKW 分层:知识 / 经验 / 洞察 / 智慧 | +| **每次运行** | 快照,覆盖写入 | MERGE —— 每次运行都在扩张图谱 | +| **查询方式** | 读取 GRAPH_REPORT.md | 运行时实时 Cypher 遍历 | +| **记忆** | 每次会话重置 | 跨会话、跨项目、跨月份积累 | +| **推理** | Leiden 社区检测(拓扑) | 图路径遍历 + DIKW 层次 | +| **反馈闭环** | 无 | 智慧 → 知识(神经可塑性) | +| **数据库** | 不需要 | Neo4j Aura(免费)或 DozerDB Docker | + +这个差异不是量变,而是质变。graphify 把代码库压缩成可读报告;wisdomGraph 构建的是一套人工认识论 —— 能记忆、能关联、能成长。 + +--- + +## DIKW 金字塔,工程化落地 + +人类专家不是把事实平铺存储的,他们按层次组织经验: + +``` +智慧(Wisdom) ← 从模式中提炼出的可执行原则 + ↑ +洞察(Insight) ← 从多次经验中发现的规律 + ↑ +经验(Experience)← 有上下文的事件、决策与结果 + ↑ +知识(Knowledge) ← 已验证的事实、文档行为、提取的结构 +``` + +wisdomGraph 中每个节点都带有 `tier` 标签。图谱的拓扑结构**就是**认知架构本身。当你提问时,Cypher 沿层级向上遍历 —— 不是关键词匹配扁平文本,而是跨越亲历经验的推理。 + +反馈闭环至关重要:当某个智慧节点被查询并确认有效时,它会强化连接的知识节点。图谱在学习什么重要。 + +--- + +## 安装 + +**环境要求:** 
Python 3.10+ 以及以下之一:[Claude Code](https://claude.ai/code)、[OpenClaw](https://openclaw.ai) + +**加上以下之一:** [Neo4j Aura 免费版](https://neo4j.com/cloud/platform/aura-graph-database/)(云端,无需安装)或 [DozerDB](https://dozerdb.org)(本地 Docker,含 APOC) + +```bash +pip install wisdomgraph && wisdom install +``` + +### 方案 A — Neo4j Aura(零基础设施,推荐个人用户) + +1. 在 [neo4j.com/cloud/aura](https://neo4j.com/cloud/aura) 注册免费账号 +2. 创建一个免费的 AuraDB 实例,复制连接 URI 和密码 +3. 运行: + +```bash +wisdom connect bolt+s://xxxxxxxx.databases.neo4j.io --user neo4j --password <你的密码> +``` + +免费额度:20 万节点,够用好几年。 + +### 方案 B — DozerDB 本地 Docker(完全掌控,含 APOC) + +```bash +wisdom docker up # 拉取 graphstack/dozerdb:5.26.3.0 并启动 +wisdom connect bolt://localhost:7687 --user neo4j --password password +``` + +打开 [localhost:7474](http://localhost:7474) —— Neo4j Browser 是你俯瞰智慧图谱的可视化窗口。 + +--- + +## 平台支持 + +| 平台 | 安装命令 | +|------|---------| +| Claude Code (Linux/Mac) | `wisdom install` | +| Claude Code (Windows) | `wisdom install --platform windows` | +| OpenClaw | `wisdom install --platform claw` | + +然后打开你的 AI 编程助手,输入: + +``` +/wisdom . +``` + +--- + +## 使用方式 + +``` +/wisdom # 吸收当前目录 +/wisdom ./raw # 吸收指定文件夹 +/wisdom ./raw --mode deep # 激进模式,提取更多 INFERRED 边 +/wisdom ./raw --update # 只重新吸收变更文件,MERGE 进图谱 + +/wisdom add https://arxiv.org/abs/1706.03762 # 吸收一篇论文 +/wisdom add https://x.com/... # 吸收一条推文 +/wisdom add https://... --author "姓名" # 标注来源作者 + +/wisdom ask "我所有项目中有哪些反复出现的模式?" +/wisdom ask "我对认证流程了解多少?" +/wisdom ask "从 attention 到 optimizer 的路径是什么?" +/wisdom ask "..." 
--tier wisdom # 只遍历智慧层节点 + +/wisdom reflect # 运行 DIKW 晋升:知识→经验→洞察→智慧 +/wisdom reflect --project ./raw # 只对该语料库进行反思 + +/wisdom path "DigestAuth" "OAuth" # 两个概念之间的最短路径 +/wisdom explain "CausalSelfAttention" # 某节点的完整 DIKW 上下文 +/wisdom god-nodes # 所有项目中连接度最高的概念 + +/wisdom export --cypher # 导出为 Cypher 语句 +/wisdom export --json # 导出 graph.json(与 graphify 兼容) +/wisdom export --obsidian # 导出 Obsidian 知识库 + +/wisdom status # 各层节点统计 +/wisdom purge --project ./raw # 删除单个语料库的节点,不影响其他 +``` + +--- + +## 智慧如何复利积累 + +**第 1 次运行** —— 吸收你的 auth 库: +``` +知识:JWT、session token、cookie flags、PKCE flow +经验:(暂无 —— 只有一个来源) +``` + +**第 2 次运行** —— 吸收另一个项目的 auth: +``` +知识:JWT、PKCE —— MERGE 去重,增加来源链接 +经验:两个不同实现,检测到相同模式 +洞察:JWT + PKCE 是你工作中收敛的模式 +``` + +**第 3 次运行** —— `/wisdom reflect`: +``` +智慧:"API 用无状态 JWT,浏览器端用 PKCE flow。 + 这个模式在 3 个项目中落地,从未出过问题。" +``` + +**第 4 次运行** —— `/wisdom ask "新服务的认证方案怎么定?"`: +``` +遍历路径:知识 → 经验 → 洞察 → 智慧 +返回结果:你自己经过实战验证的原则,根植于你真实的代码历史 +``` + +这不是 RAG,不是摘要,而是图谱遍历你积累的经验,把**你自己的智慧还给你**。 + +--- + +## 图谱 Schema + +```cypher +// DIKW 节点标签 +(:Knowledge {id, label, content, source_file, confidence, timestamp, project}) +(:Experience {id, label, content, context, outcome, timestamp, project}) +(:Insight {id, label, content, pattern_strength, source_count, timestamp}) +(:Wisdom {id, label, principle, confidence, reinforcement_count, timestamp}) + +// 关系类型 +(Knowledge)-[:GROUNDS]->(Experience) +(Experience)-[:REVEALS]->(Insight) +(Insight)-[:CRYSTALLIZES_INTO]->(Wisdom) +(Wisdom)-[:REINFORCES]->(Knowledge) // 反馈闭环 —— 图谱在学习 + +(Knowledge)-[:SEMANTICALLY_SIMILAR_TO]->(Knowledge) +(Insight)-[:CONTRADICTS]->(Insight) // 张力浮现,需要反思 +(any)-[:SOURCED_FROM]->(Source {uri, author, ingested_at}) +``` + +置信度沿图谱向上流动。8 个经验支撑的洞察比 2 个支撑的模式强度更高。智慧节点追踪 `reinforcement_count` —— 遍历确认该原则有效的次数。 + +--- + +## 你能得到什么 + +**跨项目神节点** —— 跨越*所有*项目和语料库的核心概念,而不仅是单个仓库的。 + +**矛盾检测** —— 两个洞察方向相反时,以 `CONTRADICTS` 边的形式浮现。图谱展示冲突,由你解决,形成更好的智慧。 + +**时间衰减** —— 节点带时间戳。长时间未被强化的旧知识会被标记。图谱优雅地老化,如同专家的记忆。 + +**完整溯源链** —— 
每个节点关联到其 `Source`。`/wisdom explain "节点名"` 返回完整 DIKW 路径:事实 → 上下文 → 模式 → 原则。 + +--- + +## 部署方案对比 + +| | Aura 免费版 | DozerDB 本地 | +|---|---|---| +| **配置** | 3 步点击 + URI | 1 条 docker 命令 | +| **费用** | 免费(20 万节点) | 永久免费 | +| **APOC** | 可用 | 内置 | +| **数据位置** | Neo4j 云端 | 你自己的机器 | +| **可视化** | neo4j.com 控制台 | localhost:7474 | +| **适合** | 快速上手、个人用户 | 团队、离线、完全掌控 | + +--- + +## 隐私说明 + +wisdomGraph 将文件内容发送给你的 AI 编程助手的底层模型 API 进行语义提取 —— Anthropic(Claude Code)或你所在平台使用的任何模型。代码文件通过 tree-sitter AST 在本地处理,不会发送到外部。所有图谱数据存储在*你的* Neo4j 实例中(Aura 或本地)。无遥测、无使用追踪、无任何形式的数据分析。 + +--- + +## 技术栈 + +Neo4j(Aura 或 DozerDB)+ tree-sitter + APOC。语义提取通过 Claude(Claude Code)或你平台的模型完成。图数据库就是智能层 —— 遍历、路径查找和社区检测通过 Neo4j GDS(图数据科学库)原生 Cypher 运行。 + +--- + +
+贡献指南 + +**工作示例**是最有说服力的贡献。在真实的多项目语料库上跑 `/wisdom`,让它反思几轮,记录涌现出哪些智慧节点、是否与你的直觉吻合。提交到 `worked/{slug}/`。 + +**Schema 提案** —— 如果你有捕捉当前 Schema 遗漏语义的关系类型,欢迎提 issue,附上 Cypher 模式和工作示例。 + +**DIKW 晋升启发式** —— 更好的知识→经验→洞察→智慧晋升提示词或规则。晋升逻辑是系统的核心。 + +详见 [ARCHITECTURE.md](ARCHITECTURE.md) 了解完整流水线设计、Cypher Schema 和如何扩展 DIKW 层次。 + +
diff --git a/ci.sh b/ci.sh new file mode 100755 index 0000000..255005f --- /dev/null +++ b/ci.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# wisdomGraph CI — unit tests (no Neo4j required) +set -euo pipefail + +echo "==> Creating venv" +python3 -m venv .venv +source .venv/bin/activate + +echo "==> Installing dependencies" +pip install pytest -q + +echo "==> Running tests" +python -m pytest tests/ -q + +echo "==> All tests passed" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..957d955 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,68 @@ +"""Tests for wisdom/cache.py""" +import pytest +from pathlib import Path +from wisdom.cache import file_hash, load_cached, save_cached, check_cache, save_extractions + + +def test_file_hash_consistent(tmp_path): + f = tmp_path / "test.py" + f.write_text("hello world") + h1 = file_hash(f) + h2 = file_hash(f) + assert h1 == h2 + + +def test_file_hash_changes_on_content(tmp_path): + f = tmp_path / "test.py" + f.write_text("version 1") + h1 = file_hash(f) + f.write_text("version 2") + h2 = file_hash(f) + assert h1 != h2 + + +def test_load_cached_miss(tmp_path): + f = tmp_path / "test.py" + f.write_text("hello") + assert load_cached(f, root=tmp_path) is None + + +def test_save_and_load_cached(tmp_path): + f = tmp_path / "test.py" + f.write_text("def foo(): pass") + data = {"nodes": [{"id": "foo", "label": "foo"}], "edges": []} + save_cached(f, data, root=tmp_path) + result = load_cached(f, root=tmp_path) + assert result == data + + +def test_cache_invalidated_on_change(tmp_path): + f = tmp_path / "test.py" + f.write_text("original") + data = {"nodes": [], "edges": []} + save_cached(f, data, root=tmp_path) + f.write_text("changed") + assert load_cached(f, root=tmp_path) is None + + +def test_check_cache_splits(tmp_path): + f1 = tmp_path / "cached.py" + f1.write_text("x = 1") + f2 = tmp_path / 
"uncached.py" + f2.write_text("y = 2") + + data = {"nodes": [{"id": "x"}], "edges": [], "source_file": str(f1)} + save_cached(f1, data, root=tmp_path) + + cached, uncached = check_cache([str(f1), str(f2)], root=tmp_path) + assert len(cached) == 1 + assert str(f2) in uncached + + +def test_save_extractions(tmp_path): + f = tmp_path / "src.py" + f.write_text("code") + ext = {"nodes": [{"id": "n1"}], "edges": [], "source_file": str(f)} + count = save_extractions([ext], root=tmp_path) + assert count == 1 + assert load_cached(f, root=tmp_path) is not None diff --git a/tests/test_classify.py b/tests/test_classify.py new file mode 100644 index 0000000..d21aaa4 --- /dev/null +++ b/tests/test_classify.py @@ -0,0 +1,113 @@ +"""Tests for wisdom/classify.py""" +from wisdom.classify import classify_nodes, build_dikw_edges, promote_experiences + + +def _node(id, label, tier=None, **kwargs): + n = {"id": id, "label": label} + if tier: + n["tier"] = tier + n.update(kwargs) + return n + + +def _edge(src, tgt, relation="calls", conf_tag="EXTRACTED"): + return {"source": src, "target": tgt, "relation": relation, "confidence_tag": conf_tag} + + +# ── classify_nodes ──────────────────────────────────────────────────────────── + +def test_default_tier_is_knowledge(): + nodes = [_node("n1", "MyFunction")] + result = classify_nodes(nodes, [], project="proj-a") + assert result[0]["tier"] == "knowledge" + + +def test_explicit_tier_respected(): + nodes = [_node("w1", "Use JWT", tier="wisdom")] + result = classify_nodes(nodes, [], project="proj-a") + assert result[0]["tier"] == "wisdom" + + +def test_insight_heuristic_3_similarity_edges(): + nodes = [_node("n1", "Auth")] + edges = [ + _edge("n1", "n2", "semantically_similar_to"), + _edge("n1", "n3", "semantically_similar_to"), + _edge("n1", "n4", "conceptually_related_to"), + ] + result = classify_nodes(nodes, edges, project="proj-a") + assert result[0]["tier"] == "insight" + + +def test_project_injected(): + nodes = [_node("n1", "Foo")] + 
result = classify_nodes(nodes, [], project="my-project") + assert result[0]["project"] == "my-project" + + +def test_confidence_set_for_extracted(): + nodes = [_node("n1", "Foo", confidence_tag="EXTRACTED")] + result = classify_nodes(nodes, [], project="p") + assert result[0]["confidence"] == 1.0 + + +def test_confidence_lower_for_inferred(): + nodes = [_node("n1", "Foo", confidence_tag="INFERRED")] + result = classify_nodes(nodes, [], project="p") + assert result[0]["confidence"] < 1.0 + + +# ── build_dikw_edges ───────────────────────────────────────────────────────── + +def test_grounds_edge_added_for_k_to_e(): + nodes = [ + _node("k1", "JWT", tier="knowledge"), + _node("e1", "JWT Context", tier="experience"), + ] + edges = [_edge("k1", "e1", "uses")] + result = build_dikw_edges(nodes, edges) + relations = [e["relation"] for e in result] + assert "GROUNDS" in relations + + +def test_reveals_edge_for_e_to_i(): + nodes = [ + _node("e1", "Pattern", tier="experience"), + _node("i1", "Auth Insight", tier="insight"), + ] + edges = [_edge("e1", "i1", "semantically_similar_to")] + result = build_dikw_edges(nodes, edges) + relations = [e["relation"] for e in result] + assert "REVEALS" in relations + + +def test_no_duplicate_dikw_edges(): + nodes = [ + _node("k1", "A", tier="knowledge"), + _node("e1", "B", tier="experience"), + ] + edges = [_edge("k1", "e1", "uses"), _edge("k1", "e1", "calls")] + result = build_dikw_edges(nodes, edges) + grounds_count = sum(1 for e in result if e["relation"] == "GROUNDS") + assert grounds_count == 1 + + +def test_same_tier_no_dikw_edge(): + nodes = [_node("k1", "A", tier="knowledge"), _node("k2", "B", tier="knowledge")] + edges = [_edge("k1", "k2", "calls")] + result = build_dikw_edges(nodes, edges) + assert all(e["relation"] != "GROUNDS" for e in result if e["source"] == "k1" and e["target"] == "k2" and e["relation"] not in ("calls",)) + + +# ── promote_experiences ─────────────────────────────────────────────────────── + +def 
test_promotes_knowledge_when_in_existing_projects(): + nodes = [_node("k1", "JWT", tier="knowledge")] + result = promote_experiences(nodes, existing_projects=["k1"]) + assert result[0]["tier"] == "experience" + + +def test_no_promotion_when_not_in_existing(): + nodes = [_node("k1", "JWT", tier="knowledge")] + result = promote_experiences(nodes, existing_projects=["other_id"]) + assert result[0]["tier"] == "knowledge" diff --git a/tests/test_detect.py b/tests/test_detect.py new file mode 100644 index 0000000..65f49aa --- /dev/null +++ b/tests/test_detect.py @@ -0,0 +1,98 @@ +"""Tests for wisdom/detect.py""" +import pytest +from pathlib import Path +from wisdom.detect import classify_file, detect, FileType, _looks_like_paper + + +def test_classify_python(): + assert classify_file(Path("main.py")) == FileType.CODE + + +def test_classify_typescript(): + assert classify_file(Path("app.tsx")) == FileType.CODE + + +def test_classify_markdown(): + assert classify_file(Path("README.md")) == FileType.DOCUMENT + + +def test_classify_pdf(): + assert classify_file(Path("paper.pdf")) == FileType.PAPER + + +def test_classify_image(): + assert classify_file(Path("diagram.png")) == FileType.IMAGE + + +def test_classify_unknown(): + assert classify_file(Path("file.xyz")) is None + + +def test_classify_docx(): + assert classify_file(Path("report.docx")) == FileType.DOCUMENT + + +def test_detect_finds_files(tmp_path): + (tmp_path / "main.py").write_text("def foo(): pass") + (tmp_path / "README.md").write_text("# Hello") + (tmp_path / "diagram.png").write_bytes(b"\x89PNG\r\n") + + result = detect(tmp_path) + assert result["total_files"] == 3 + assert len(result["files"]["code"]) == 1 + assert len(result["files"]["document"]) == 1 + assert len(result["files"]["image"]) == 1 + + +def test_detect_skips_hidden_files(tmp_path): + (tmp_path / ".env").write_text("SECRET=abc") + (tmp_path / "main.py").write_text("x = 1") + result = detect(tmp_path) + # .env starts with '.' 
so it is skipped silently (not in skipped_sensitive) + assert result["total_files"] == 1 + + +def test_detect_skips_sensitive_non_hidden(tmp_path): + # A non-hidden file with a sensitive name should appear in skipped_sensitive + (tmp_path / "credentials.json").write_text('{"key": "secret"}') + (tmp_path / "main.py").write_text("x = 1") + result = detect(tmp_path) + assert result["total_files"] == 1 + assert any("credentials" in s for s in result["skipped_sensitive"]) + + +def test_detect_skips_node_modules(tmp_path): + nm = tmp_path / "node_modules" + nm.mkdir() + (nm / "lib.js").write_text("module.exports = {}") + (tmp_path / "app.js").write_text("const x = 1") + result = detect(tmp_path) + assert result["total_files"] == 1 + + +def test_detect_wisdomignore(tmp_path): + ignore = tmp_path / ".wisdomignore" + ignore.write_text("vendor/\n") + vendor = tmp_path / "vendor" + vendor.mkdir() + (vendor / "lib.py").write_text("pass") + (tmp_path / "main.py").write_text("pass") + result = detect(tmp_path) + assert result["total_files"] == 1 + + +def test_looks_like_paper_positive(tmp_path): + paper = tmp_path / "paper.md" + paper.write_text( + "Abstract: We propose a new method.\n" + "See [1] for details. arXiv:1706.03762\n" + "From the literature, we know that [2]\n" + "This is a preprint submitted to proceedings.\n" + ) + assert _looks_like_paper(paper) is True + + +def test_looks_like_paper_negative(tmp_path): + normal = tmp_path / "notes.md" + normal.write_text("# My notes\nTodo list for today.") + assert _looks_like_paper(normal) is False diff --git a/tests/test_ingest.py b/tests/test_ingest.py new file mode 100644 index 0000000..ccdd846 --- /dev/null +++ b/tests/test_ingest.py @@ -0,0 +1,46 @@ +"""Tests for wisdom/ingest.py — URL validation and HTML stripping (no network calls).""" +import pytest +from wisdom.ingest import _html_to_text, _utcnow + + +def test_html_to_text_strips_tags(): + html = "

<p>Hello <b>world</b></p>" + result = _html_to_text(html) + assert "<p>" not in result + assert "<b>" not in result + assert "Hello" in result + assert "world" in result + + +def test_html_to_text_strips_script(): + html = "<script>alert(1)</script>Content" + result = _html_to_text(html) + assert "alert" not in result + assert "Content" in result + + +def test_html_to_text_strips_style(): + html = "<style>body { color: red; }</style><p>Text</p>
" + result = _html_to_text(html) + assert "color" not in result + assert "Text" in result + + +def test_html_to_text_decodes_entities(): + html = "& <tag>   'quote'" + result = _html_to_text(html) + assert "&" in result + assert "" in result + assert "'" in result + + +def test_html_to_text_normalizes_whitespace(): + html = " lots of spaces " + result = _html_to_text(html) + assert " " not in result + + +def test_utcnow_returns_iso_string(): + ts = _utcnow() + assert "T" in ts + assert ts.endswith("+00:00") or ts.endswith("Z") or "+" in ts diff --git a/tests/test_security.py b/tests/test_security.py new file mode 100644 index 0000000..a77e6c7 --- /dev/null +++ b/tests/test_security.py @@ -0,0 +1,74 @@ +"""Tests for wisdom/security.py""" +import pytest +from pathlib import Path +from wisdom.security import validate_url, sanitize_label, is_sensitive_path, validate_graph_path + + +def test_validate_url_http(): + assert validate_url("http://example.com/page") == "http://example.com/page" + + +def test_validate_url_https(): + assert validate_url(" https://arxiv.org/abs/1706.03762 ") == "https://arxiv.org/abs/1706.03762" + + +def test_validate_url_rejects_file(): + with pytest.raises(ValueError, match="file"): + validate_url("file:///etc/passwd") + + +def test_validate_url_rejects_ftp(): + with pytest.raises(ValueError, match="ftp"): + validate_url("ftp://example.com") + + +def test_validate_url_rejects_no_host(): + with pytest.raises(ValueError): + validate_url("https://") + + +def test_sanitize_label_strips_control(): + assert "\x00" not in sanitize_label("hello\x00world") + assert "\x1f" not in sanitize_label("foo\x1fbar") + + +def test_sanitize_label_caps_length(): + long = "a" * 500 + assert len(sanitize_label(long)) <= 256 + + +def test_sanitize_label_html_escapes(): + result = sanitize_label('') + assert "