From b2e855d7c9774892d38f63a2b3536aa8087b77cb Mon Sep 17 00:00:00 2001 From: garyqlin Date: Tue, 9 Jun 2026 15:07:35 +0800 Subject: [PATCH 1/4] =?UTF-8?q?feat:=20v0.4.1=20=E2=80=94=20GMem=20memory?= =?UTF-8?q?=20infra=20+=20Opprime=20synced=20improvements?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - FTS5 token filter: fixed detail=column pure-digit/alpha pollution bug - Knowledge auto-record_hit() on every auto-search match - tools/remember_info.py: unified memory route (Knowledge/Notes/Experience) - tools/archive_search.py: Archive Store search for cross-session recall - tools/glink_projects.py: Glink project management integration - kernel.py: _recorded_tc tracking, _build_gmem_summary, FINISH.md rules - lib/session.py/experience.py/storage.py: Opprime consistency sync - rules/: AGENCY.md, THINKING.md, FINISH.md — public-ready rule templates - PII sanitized for public release - CHANGELOG.md: v0.4.1 entry --- CHANGELOG.md | 34 ++++++ lib/archive_store.py | 74 +++++++++++-- lib/experience.py | 54 ++++++---- lib/kernel.py | 35 ++++++ lib/session.py | 229 +++++++++++++++++++++++++++++++++------- lib/storage.py | 177 ++++++++++++++++++++++--------- main.py | 8 +- rules/AGENCY.md | 24 +++++ rules/FINISH.md | 27 +++++ rules/THINKING.md | 21 ++++ tools/__init__.py | 3 + tools/archive_search.py | 64 +++++++++++ tools/glink_projects.py | 208 ++++++++++++++++++++++++++++++++++++ tools/mirror_tool.py | 2 +- tools/remember_info.py | 189 +++++++++++++++++++++++++++++++++ tools/self_edit.py | 8 +- tools/test_gen.py | 4 +- tools/trident_tools.py | 14 +-- 18 files changed, 1041 insertions(+), 134 deletions(-) create mode 100644 rules/AGENCY.md create mode 100644 rules/FINISH.md create mode 100644 rules/THINKING.md create mode 100644 tools/archive_search.py create mode 100644 tools/glink_projects.py create mode 100644 tools/remember_info.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ff35a5..5fe24c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,27 @@ # Changelog +## [0.4.1] - 2026-06-09 + +### Added +- FTS5 token filter: fixed `detail=column` issue where pure-digit/alpha tokens were parsed as column names (port from Opprime) +- Knowledge auto-retrieval: record_hit() on every auto-search hit for cache warmth +- _recorded_tc: track total tool calls per loop execution for SkillOpt gate integration +- _build_gmem_summary: standalone helper for archive_store compression summary construction +- tools/remember_info.py: unified memory route — auto-classifies user content into Knowledge/Notes/Experience +- tools/archive_search.py: Archive Store search tool for cross-session memory recall + +### Changed +- kernel.py: merged Opprime improvements (FTS5 safety filter + Knowledge hit recording + loop tc tracking) +- session.py: synchronized with Opprime (fixes for L1 compaction type mismatch, image_url filtering) +- experience.py: synchronized with Opprime (improved JSON error tolerance) +- storage.py: synchronized with Opprime (aging mechanism for Knowledge entries) +- archive_store.py: generalized agent-specific DB fallback paths to `~/gbase-home/` (public-ready) + +### Fixed +- knowledge.py: FTS5 MATCH returning 0 matches due to unpurged detail=column token pollution +- session.py: content null leading to DeepSeek serde enum variant mismatch (400 error) +- session.py: L1 compression type mismatch (passing str instead of list[dict]) + ## [0.4.0] - 2026-06-02 ### Added @@ -14,6 +36,18 @@ - L4 permanent notes: auto-serialize when tool calls >= 5 or reply >= 500 chars - 20+ new tools: self_edit, feishu_send, note_tool, knowledge, mirror_tool, chain, mail, security_watch, etc. +### Added +- ArchiveStore: full-text conversation archive with LIKE-based semantic retrieval (replaces LLM compression) +- Session: removed 3-layer LLM compression, replaced with ArchiveStore append/search +- Territory safety: cross-agent read/write access control (write blocking, read warning) +- RSI Dual-Knob: task intent classification → dynamic temperature control +- Time-decay weighted retrieval: M3 sparse attention inspired (7d full / 7-30d linear / 30d+ exponential) +- Entity conflict detection: Cosmos 3 inspired, auto-detect contradictions on write +- Hot query cache (LRU, max 64) for high-frequency entity lookups +- Archive trash recovery: deleted entries saved to `data/archive_trash/` as grep-able JSONL +- L4 permanent notes: auto-serialize when tool calls >= 5 or reply >= 500 chars +- 20+ new tools: self_edit, feishu_send, note_tool, knowledge, mirror_tool, chain, mail, security_watch, etc. + ### Changed - kernel.py: archive_store init + semantic bridge search/save (disabled old online LLM compression) - session.py: replaced compress_l1/l2/async_compress with archive-driven context building diff --git a/lib/archive_store.py b/lib/archive_store.py index 6c7b7a6..bf368fe 100644 --- a/lib/archive_store.py +++ b/lib/archive_store.py @@ -22,7 +22,6 @@ hits = store.search("query keywords") """ -import contextlib import json import logging import os @@ -136,7 +135,10 @@ def append(self, role: str, content: str | list | dict, *, priority: int = 0, so if not content: return - content = json.dumps(content, ensure_ascii=False) if isinstance(content, (list, dict)) else str(content) + if isinstance(content, (list, dict)): + content = json.dumps(content, ensure_ascii=False) + else: + content = str(content) if len(content) > _MAX_CONTENT_CHARS: content = content[:_MAX_CONTENT_CHARS] + "..." @@ -520,7 +522,7 @@ def search(self, query: str, top_k: int = _DEFAULT_SEARCH_TOP_K, params: list = [self.session_key] # 关键词条件 - kw_conditions = " OR ".join("content LIKE ? COLLATE NOCASE" for _ in keywords) + kw_conditions = " OR ".join(f"content LIKE ? COLLATE NOCASE" for _ in keywords) where_parts.append(f"({kw_conditions})") params.extend(f"%{k}%" for k in keywords) @@ -552,7 +554,7 @@ def search(self, query: str, top_k: int = _DEFAULT_SEARCH_TOP_K, # 计算每条记录的命中关键词数(作为粗糙的 BM25 替代) scored = [] - for content, role, ts, priority, _source_id, _eid in all_rows: + for content, role, ts, priority, source_id, eid in all_rows: if not content: continue hits = sum(1 for kw in keywords if kw in content or kw.lower() in content.lower()) @@ -632,11 +634,11 @@ def _extract_keywords(self, query: str) -> list[str]: "这样", "那样", "可能", "需要", "之后", "之前", "现在", "我们", "他们", "你们", "自己", "一些", "这些", "那些", "谢谢", "你好", "请问", "好的", "是的", "知道", "觉得", - "然后", "或者", "除了", "不想", "想要", "打算", - "看到", "听说", "告诉", "我的", "你的", "他的", + "然后", "或者", "还是", "除了", "不想", "想要", "打算", + "看到", "听说", "觉得", "告诉", "我的", "你的", "他的", "大家", "东西", "时候", "不错", "真的", "非常", "很多", "工作", "生活", "事情", "感觉", "方面", "一点", "一定", - "还有", "出来", + "还有", "因为", "出来", } keywords = [k for k in keywords if k not in _COMMON_BIGRAMS] @@ -726,8 +728,10 @@ def close(self): self.flush() def __del__(self): - with contextlib.suppress(Exception): + try: self.close() + except Exception: + pass # ── 旧数据迁移 ───────────────────────────────────── @@ -765,6 +769,58 @@ def _save_trash(session_key: str, rows: list[tuple]): logger.warning("归档Write失败(不影响主流程): %s", e) +def recent_global(limit: int = 10, hours: int = 72) -> dict: + """跨 session 获取最近 N 小时的全局 markers(Phase 4 学用对接)。 + + 不限制 session_key,只按时间过滤。 + 用于 session 预热时注入同主题历史。 + """ + import sqlite3, time, datetime + + # 找 archive.db(尝试多个位置) + candidates = [ + os.path.join(os.path.dirname(__file__), "..", "data", "archive.db"), + os.path.expanduser("~/gbase-home/data/archive.db"), + ] + db_path = None + for c in candidates: + p = os.path.abspath(c) + if os.path.exists(p): + db_path = p + break + if not db_path: + return {"markers": [], "count": 0, "db": None} + + cutoff_ts = time.time() - hours * 3600 + + with _LOCK: + try: + conn = sqlite3.connect(db_path) + cursor = conn.execute( + "SELECT marker, timestamp, session_key FROM archive_markers " + "WHERE timestamp >= ? " + "ORDER BY timestamp DESC LIMIT ?", + (cutoff_ts, limit), + ) + rows = cursor.fetchall() + conn.close() + except Exception: + return {"markers": [], "count": 0, "db": db_path} + + result = [] + for marker, ts, skey in rows: + dt = datetime.datetime.fromtimestamp(ts) + skey_short = skey.split(":")[-1][:20] if skey else "" + result.append({ + "marker": marker[:120], + "timestamp": ts, + "time_str": dt.strftime("%m-%d %H:%M"), + "session": skey_short, + }) + + return {"markers": result, "count": len(result), "db": db_path} + + def _copy_old_data(dat_db_path: str, archive_db_path: str): """从 dat.db 导入旧 experience/knowledge 数据到 archive.db(一次性)。""" if not os.path.exists(dat_db_path): @@ -779,7 +835,7 @@ def _copy_old_data(dat_db_path: str, archive_db_path: str): cursor = conn.cursor() # 从 entries table 找 experience 和 knowledge - for tbl, _pri in [("entries", 1)]: + for tbl, pri in [("entries", 1)]: try: cursor.execute(f"SELECT content, type FROM {tbl} WHERE content IS NOT NULL AND content != ''") for content, typ in cursor.fetchall(): diff --git a/lib/experience.py b/lib/experience.py index e010abf..fdccd19 100644 --- a/lib/experience.py +++ b/lib/experience.py @@ -40,19 +40,14 @@ "summary": "此次任务工具调用次数偏多({tool_calls_count}次),下次同类任务应该先规划再调工具", "confidence": "medium", }, - { - "name": "short_reply", - "check": lambda ctx: len(ctx.get("reply", "")) < 80, - "summary": "回复长度偏短({reply_len}字),下次应尽量提供更完整的回答", - "confidence": "low", - }, + { "name": "api_error", "check": lambda ctx: ctx.get("has_api_error", False), "summary": "工具调用时有 API 错误,下次应注意检查工具是否可用", "confidence": "high", }, - # ── 反脆弱: 失败尝试也写入经验,不静默Rollback ── + # ── 反脆弱: 失败尝试也写入经验,不静默回滚 ── { "name": "failed_action", "check": lambda ctx: bool(ctx.get("has_failure", False)), @@ -62,16 +57,16 @@ { "name": "failed_rollback", "check": lambda ctx: bool(ctx.get("rollback_occurred", False)), - "summary": "执行Rollback: [{rollback_action}] 验证失败,已Rollback。这条路走不通。", + "summary": "执行回滚: [{rollback_action}] 验证失败,已回滚。这条路走不通。", "confidence": "medium", }, # ── 反脆弱: 成功模式提炼(成功比失败更需要分析)── { "name": "success_pattern", - "check": lambda ctx: ctx.get("tool_calls_count", 0) > 0 + "check": lambda ctx: ctx.get("tool_calls_count", 0) >= 3 and not ctx.get("has_api_error", False) and not ctx.get("has_failure", False), - "summary": "成功完成[{task_theme}],工具调用{successful_calls}次。有效策略:{effective_strategy}", + "summary": "有效模式: [{task_theme}] 用 {tool_calls_count} 次工具调用完成", "confidence": "medium", }, ] @@ -150,11 +145,11 @@ def _is_duplicate_rule(storage: "store_module.Storage", rule_name: str) -> bool: """ -# ── Experience extraction器 ────────────────────────────────────────── +# ── 经验提取器 ────────────────────────────────────────── class ExperienceEngine: - """Experience Engine。绑定到一个 Storage 实例上运作。""" + """经验引擎。绑定到一个 Storage 实例上运作。""" def __init__(self, storage: store_module.Storage): self.storage = storage @@ -204,10 +199,10 @@ async def extract( if _is_duplicate_rule(self.storage, rule_name): self._skip_count[rule_name] = self._skip_count.get(rule_name, 0) + 1 - logger.debug("Experience deduplication跳过: rule=%s (已跳过%d次)", rule_name, self._skip_count[rule_name]) + logger.debug("经验去重跳过: rule=%s (已跳过%d次)", rule_name, self._skip_count[rule_name]) return - logger.info("Experience extraction(规则): %s", rule_result["summary"][:60]) + logger.info("经验提取(规则): %s", rule_result["summary"][:60]) # --- 如果是成功完成任务自动刻入 insight --- if tool_calls_count > 0 and not has_api_error and rule_result["type"] != "insight" and not has_failure: _record_success_insight(self, user_message, tool_calls_count) @@ -247,7 +242,7 @@ async def extract( try: await self._llm_extract(context, llm_client) except Exception as e: - logger.warning("Experience extraction(LLM)失败: %s", e) + logger.warning("经验提取(LLM)失败: %s", e) async def _llm_extract(self, context: dict, client): """元认知反思提取 — 从「发生了什么」升级到「为什么发生、如何避免、什么条件下该用不同策略」。 @@ -270,10 +265,27 @@ async def _llm_extract(self, context: dict, client): ) text = response.choices[0].message.content.strip() if text == "null" or not text: - logger.debug("Experience extraction(LLM): 无有价值教训") + logger.debug("经验提取(LLM): 无有价值教训") + return + + # 类型防御:LLM 可能返回不完整 JSON(被截断的末尾) + is_clean = False + for try_idx in range(3): + try: + result = json.loads(text) + is_clean = True + break + except json.JSONDecodeError: + # 尝试找到最晚的完整 JSON 截止点 + last_brace = text.rfind("}") + if last_brace > 0: + text = text[:last_brace + 1] + else: + break + if not is_clean: + logger.warning("经验提取(LLM): JSON 解析失败,跳过") return - result = json.loads(text) if "summary" in result: # 构建结构化 entry summary = result["summary"][:200] @@ -316,19 +328,19 @@ async def _llm_extract(self, context: dict, client): except Exception: pass - logger.info("Experience extraction(元认知反思): %s", summary[:60]) + logger.info("经验提取(元认知反思): %s", summary[:60]) # --- 自动刻入 insight(成功任务不留空洞) --- if context.get("tool_calls_count", 0) > 0 and not context.get("has_api_error", False): _record_success_insight(self, context.get("user_message", ""), context["tool_calls_count"]) except (json.JSONDecodeError, KeyError) as e: - logger.debug("Experience extraction(LLM)解析失败: %s | 原始响应: %s", e, text[:200] if 'text' in dir() else "N/A") + logger.debug("经验提取(LLM)解析失败: %s | 原始响应: %s", e, text[:200] if 'text' in dir() else "N/A") except Exception as e: - logger.debug("Experience extraction(LLM)异常: %s", e) + logger.debug("经验提取(LLM)异常: %s", e) def search(self, query: str, limit: int = 5) -> list[dict]: - """搜索经验库。优先 FTS5 全文Search,无结果时回退 LIKE 模糊匹配。 + """搜索经验库。优先 FTS5 全文检索,无结果时回退 LIKE 模糊匹配。 排序逻辑: - 先按 BM25 相关性分 + 内容长度惩罚(太长降级) diff --git a/lib/kernel.py b/lib/kernel.py index 59aa526..a852cd4 100644 --- a/lib/kernel.py +++ b/lib/kernel.py @@ -466,6 +466,14 @@ def _build_dynamic_system_prompt(self) -> str: # 中文多字词,拆单字也加进去 for _ch in _w: _fts_tokens.append(f"{_ch}*") + # FTS5 detail=column 下纯数字/纯单字母 token 会被解析为 column name + _fts_tokens = [t for t in _fts_tokens + if not _import_re.match(r'^\d+$', t) + and not _import_re.match(r'^[a-zA-Z]$', t) + and len(t) > 1] + # 过滤后保底:至少保留原始词保证有查询内容 + if not _fts_tokens: + _fts_tokens = [f"{_w}*" for _w in _words if len(_w) > 1] _fts_query = " OR ".join(_fts_tokens)[:500] _results = [] with _storage._lock: @@ -503,6 +511,12 @@ def _build_dynamic_system_prompt(self) -> str: + "\n".join(_results) ) parts.append(_know_text) + # GMem Phase 1A1: 自动检索命中后 record_hit + for _hit_r in _rows: + try: + _storage.record_hit(_hit_r[0]) + except Exception: + logger.exception("记录 hit 失败 (id=%s)", _hit_r[0]) logger.info("Knowledge 自动Search: 命中 %d 条", len(_results)) else: logger.info("Knowledge 自动Search: 无命中") @@ -1711,7 +1725,28 @@ async def _run_one_tool(tc): if session: session.append(tr) + # SkillOpt: 记录本轮工具调用数 + self._recorded_tc = getattr(self, "_recorded_tc", 0) + len(msg.tool_calls) + # 递归至多 15 层 if depth + 1 >= MAX_TOOL_DEPTH: return await self._loop(messages, tools, depth=depth + 1, session=session) return await self._loop(messages, tools, depth=depth + 1, session=session) + + +# ── GMem Phase B1: 构建压缩摘要(供 archive_store 存档) ── +def _build_gmem_summary(stats: dict, session) -> str: + """从 session 统计信息构建压缩摘要文本。""" + try: + parts = [f"上下文压缩 checkpoint — 消息数: {stats.get('messages', 0)}, 压缩次数: {stats.get('compactions', 0)}, 层级: {session.get_compaction_level() if hasattr(session, 'get_compaction_level') else 0}"] + # 尝试获取最后几条会话摘要 + if hasattr(session, "get_all_compactions"): + compactions = session.get_all_compactions() + for c in compactions[-3:]: + if isinstance(c, str): + parts.append(f" · {c[:200]}") + elif isinstance(c, dict): + parts.append(f" · {c.get('summary', str(c)[:200])}") + return "\n".join(parts) + except Exception: + return "" diff --git a/lib/session.py b/lib/session.py index 3e08ab5..a94e95b 100644 --- a/lib/session.py +++ b/lib/session.py @@ -2,50 +2,52 @@ """ gbase/lib/session.py -Session management: append-only JSONL implementation. -Never physically delete old entries, navigate via compression markers. +Session 管理:append-only JSONL 实现。 +永不物理删除旧条目,通过压缩路标跳转。 -Three-layer context compression system (simplified version of Claude Code's 5-layer compression): -- L1: Real-time online compression - Generate summary with LLM when conversation exceeds threshold -- L2: Multi-layer summary evolution - Merge multiple compactions into higher-level summaries -- L3: Session state tracking - Dynamic compression threshold + context usage statistics +三层上下文压缩体系(Claude Code 五层压缩的简化版): +- L1: 在线实时压缩 — 对话超过阈值时用 LLM 生成摘要 +- L2: 多层摘要进化 — 多个 compaction 合并为更高级摘要 +- L3: 会话状态追踪 — 动态压缩阈值 + 上下文使用量统计 """ +import asyncio import json import logging +import threading import time from pathlib import Path logger = logging.getLogger(__name__) class JsonlSessionManager: - """Append-only JSONL Session Manager with three-layer compression capability.""" + """Append-only JSONL 会话管理器,带三层压缩能力。""" def __init__(self, filepath: str, max_context: int = 100): self.filepath = Path(filepath) self.filepath.parent.mkdir(parents=True, exist_ok=True) self.max_context = max_context - self._adaptive_max = max_context # L3: Dynamic threshold adjustment + self._adaptive_max = max_context # L3: 动态阈值调节 self.fh: object | None = None self._stats = {"messages": 0, "compactions": 0, "tokens_estimate": 0} - self._compacted_up_to = 0 # Compression marker - self._compaction_level = 0 # L2: Current summary level (number of merge compressions) + self._compacted_up_to = 0 # 压缩路标 + self._compaction_level = 0 # L2: 当前摘要层级(第几次合并压缩) self._open() def _open(self): - """Open or create JSONL file.""" + """打开或创建 JSONL 文件。""" if self.fh: try: if hasattr(self.fh, "close"): self.fh.close() except Exception: - logger.exception("Silent exception") + logger.exception("静默异常") self.fh = open(self.filepath, "a+", encoding="utf-8") def _update_adaptive_max(self): - """L3: Dynamically adjust context retention rounds based on compression level.""" - # After each layer of compression, the number of retained rounds decreases, but not below the minimum + """L3: 根据压缩层级动态调节上下文保留轮次。""" + # 每层压缩后,保留的轮次缩小,但不低于底线 base = self.max_context level = self._compaction_level if level <= 0: @@ -55,16 +57,32 @@ def _update_adaptive_max(self): elif level == 2: self._adaptive_max = max(8, base - 8) else: - self._adaptive_max = 50 # Level 3 and above, retain at least 3 rounds (6 messages) + self._adaptive_max = 50 # 第三层及以上,至少保留 3 轮(6 条消息) @staticmethod - def _estimate_tokens(text: str) -> int: - """Roughly estimate token count. + def _estimate_tokens(text: str | list | dict) -> int: + """粗略估算 token 数。支持 string / list[dict] / dict 类型。 - Chinese approx 1.5 chars/token, English approx 4 chars/token, plus safety margin. + 中文约 1.5 chars/token,英文约 4 chars/token,加安全边际。 """ if not text: return 0 + # 处理多模态消息(list[dict],含 text/image_url) + if isinstance(text, list): + total = 0 + for item in text: + if isinstance(item, dict): + for v in item.values(): + if isinstance(v, str): + total += len(v) + elif isinstance(v, dict): + total += len(str(v)) + elif isinstance(item, str): + total += len(item) + return int(total * 0.35) + 10 + if isinstance(text, dict): + flat = str(text) + return int(len(flat) * 0.35) + 10 chinese_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff') other_chars = len(text) - chinese_chars return int(chinese_chars * 1.5 + other_chars / 4) + 10 @@ -79,7 +97,7 @@ def get_adaptive_max(self) -> int: return self._adaptive_max def append(self, entry: dict) -> int: - """Append a record. entry is a message dictionary, must contain role field.""" + """追加一条记录。entry 是消息字典,必须包含 role 字段。""" entry["_id"] = int(time.time() * 1000) entry["_ts"] = time.time() role = entry.get("role", "unknown") @@ -100,12 +118,12 @@ def append(self, entry: dict) -> int: return entry["_id"] def append_batch(self, entries: list[dict]): - """Batch append.""" + """批量追加。""" for e in entries: self.append(e) def append_user_message(self, content: str, extra: dict | None = None) -> int: - """Shortcut: Append a user message.""" + """快捷:追加一条用户消息。""" entry = {"role": "user", "content": content} if extra: entry.update(extra) @@ -115,17 +133,17 @@ def get_or_create(self, session_key: str) -> "JsonlSessionManager": return self def build_context(self, max_messages: int | None = None, max_tokens: int = 0) -> list[dict]: - """Build LLM messages context. + """构建 LLM messages 上下文。 - Three-layer filtering: - 1. Compaction entry skips old content, injects summary (multi-layer: only highest level summary is injected) - 2. Remove tool_call / tool_result - 3. Compress by round + retain last max_messages rounds + 三层过滤: + 1. compaction entry 跳过旧内容,注入摘要(多层:只有最高层摘要注入) + 2. 去掉 tool_call / tool_result + 3. 按轮压缩 + 保留最近 max_messages 轮 - If max_tokens > 0, accumulate tokens from back to front, truncate front content when exceeding. + 如果 max_tokens > 0,从后往前累计 token,超出则截断前面的内容。 - L2 multi-layer summary: If there are multiple compaction levels, - Only the highest level summary is injected into the context. + L2 多层摘要:如果有多个 compaction level, + 只有最高层的摘要被注入到上下文。 """ if max_messages is None: max_messages = self._adaptive_max @@ -133,8 +151,8 @@ def build_context(self, max_messages: int | None = None, max_tokens: int = 0) -> messages: list[dict] = [] current_assistant_buf: dict | None = None skipped_compacted = False - highest_summary = "" # L2: Highest level summary (for injection) - highest_entry = None # L2: Highest level complete entry (for structured field usage) + highest_summary = "" # L2: 最高层摘要(用于注入) + highest_entry = None # L2: 最高层完整 entry(结构化字段使用) highest_level = -1 try: @@ -171,11 +189,41 @@ def build_context(self, max_messages: int | None = None, max_tokens: int = 0) -> if current_assistant_buf is not None: messages.append(current_assistant_buf) current_assistant_buf = None - msg = {"role": "user", "content": entry.get("content", "")} + # 过滤 image_url 结构:部分模型(如 DeepSeek)不支持多模态 content + _raw_content = entry.get("content", "") or "" + if isinstance(_raw_content, list): + # list[dict] 格式(含 text/image_url)转纯文本段 + _text_parts = [] + for _item in _raw_content: + if isinstance(_item, dict): + if _item.get("type") == "text": + _text_parts.append(_item.get("text", "")) + elif _item.get("type") == "image_url": + _text_parts.append("[图片]") + else: + _text_parts.append(str(_item)) + else: + _text_parts.append(str(_item)) + _raw_content = "\n".join(_text_parts) + msg = {"role": "user", "content": _raw_content} messages.append(msg) elif entry_type == "assistant": - msg = {"role": "assistant", "content": entry.get("content", "")} + _raw_content = entry.get("content", "") or "" + if isinstance(_raw_content, list): + _text_parts = [] + for _item in _raw_content: + if isinstance(_item, dict): + if _item.get("type") == "text": + _text_parts.append(_item.get("text", "")) + elif _item.get("type") == "image_url": + _text_parts.append("[图片]") + else: + _text_parts.append(str(_item)) + else: + _text_parts.append(str(_item)) + _raw_content = "\n".join(_text_parts) + msg = {"role": "assistant", "content": _raw_content} if "reasoning_content" in entry: msg["reasoning_content"] = entry["reasoning_content"] # 始终保留 assistant 消息(包括 content="" 只有 tool_calls 的情况), @@ -271,7 +319,7 @@ def build_context(self, max_messages: int | None = None, max_tokens: int = 0) -> # 保留最近 max_messages 轮 # 但强制保留最后 1 轮完整 user+assistant(防止 LLM 忘记自己刚说过什么) if len(messages) > max_messages: - # 截断前Save最后完整的 user+assistant 对 + # 截断前保存最后完整的 user+assistant 对 keep = [] for m in reversed(messages): keep.insert(0, m) @@ -289,11 +337,11 @@ def build_context(self, max_messages: int | None = None, max_tokens: int = 0) -> def get_compaction_context(self, max_messages: int = 15) -> list[dict]: """L2: 获取压缩阶段的高层摘要 + 近期轮次。 - + 不同于 build_context(给 LLM 用),这个方法返回: - 所有层级的摘要列表(不是只取最高层) - 最新 max_messages 轮对话 - + 用于 L2 多层压缩:把旧摘要 + 近期对话 → 新摘要。 """ summaries: list[dict] = [] @@ -330,19 +378,122 @@ def get_compaction_context(self, max_messages: int = 15) -> list[dict]: if etype in ("user", "assistant"): recent.append({ "role": entry.get("role", etype), - "content": entry.get("content", ""), + "content": entry.get("content", "") or "", }) except Exception: - logger.exception("Silent exception") + logger.exception("静默异常") return {"summaries": summaries, "recent": recent[-max_messages:]} + def compress( + self, + compress_fn: callable, + threshold: int = 15, + ) -> dict | None: + """L1 + L2: 同步版上下文压缩(供 asyncio.to_thread 调用)。 + + 参数: + compress_fn: 接收消息列表,返回摘要文本的回调 + threshold: 触发 L1 压缩的最小消息轮次(默认 15 轮) + + 返回: + 压缩统计信息,超时时返回 None + """ + # L1: 在线实时压缩 — 对话超过阈值时用 LLM 生成摘要 + try: + context_data = self.get_compaction_context(threshold) + recent = context_data.get("recent", []) + if len(recent) < threshold: + return None + + # L1: 新摘要 + session_text = json.dumps(recent, ensure_ascii=False)[:5000] + # 类型防御:传给压缩函数的可能是截断的 JSON 字符串 + # compress_fn 在 kernel 层加了解析恢复逻辑 + summary = compress_fn(session_text) + if not summary: + return None + + entry = { + "type": "compaction", + "level": 0, + "summary": summary, + "decisions": [], + "key_facts": [], + "pending": [], + "context": summary[:500], + "messages_since_last": len(recent), + "_ts": int(time.time()), + } + self.append(entry) + self._compacted_up_to = self._stats["messages"] + self._compaction_level = 0 + self._update_adaptive_max() + self._stats["compactions"] += 1 + + # L2: 多层摘要进化 — 已有 compaction 时合并升级 + summaries = context_data.get("summaries", []) + old_summaries = [s for s in summaries if s.get("level", 0) < 2] + if len(old_summaries) >= 2: + merge_text = json.dumps(old_summaries[-3:], ensure_ascii=False)[:4000] + merged = compress_fn(merge_text) + if merged: + entry = { + "type": "compaction", + "level": 2, + "summary": merged, + "decisions": [], + "key_facts": [], + "pending": [], + "context": merged[:500], + "messages_since_last": 0, + "_ts": int(time.time()), + } + self.append(entry) + self._compaction_level = 2 + self._stats["compactions"] += 1 + + return self._stats.copy() + + except Exception: + logger.exception("L1/L2 压缩异常(静默)") + return None + + def start_async_compress( + self, + compress_fn: callable, + interval_sec: int = 600, + threshold: int = 25, + ): + """P2-3: 启动后台异步压缩守护线程。 + + 在守护线程中循环调用 compress(),失败后等待 10 分钟重试。 + + 参数: + compress_fn: 压缩回调(同步) + interval_sec: 压缩间隔(秒) + threshold: 触发压缩的最小消息轮次 + """ + import threading as _threading + + def _worker(): + while True: + try: + self.compress(compress_fn, threshold=threshold) + except Exception as e: + logger.exception("Async compress failed: %s", e) + time.sleep(interval_sec) + + thread = _threading.Thread(target=_worker, daemon=True) + thread.start() + logger.info("后台异步压缩守护已启动 (interval=%ds, threshold=%d)", interval_sec, threshold) + def close(self): if self.fh: try: self.fh.close() except Exception: - logger.exception("Silent exception") + logger.exception("静默异常") def __del__(self): self.close() diff --git a/lib/storage.py b/lib/storage.py index d88b85a..328fceb 100644 --- a/lib/storage.py +++ b/lib/storage.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: MIT """ -Gbase storage engine module +gbase/lib/storage.py -Persistence Engine — SQLite primary + JSONL readable mirror dual-write. +沉淀引擎 — SQLite 主力 + JSONL 可读镜像双写。 -All experience/knowledge/skill records are read/written through this module. +栈内存所有经验/知识/Skill 都通过这个模块读写。 """ import contextlib @@ -21,42 +21,42 @@ DATA_DIR = Path(__file__).parent.parent / "data" DB_PATH = DATA_DIR / "dat.db" -# JSONL mirror file names for each layer +# 三层对应的 JSONL 镜像文件名 _MIRROR_FILES = { "experience": "experience.jsonl", "knowledge": "knowledge.jsonl", "skills": "skills.jsonl", } -# P1: Soft limit — only delete un-referenced (hits=0) records older than 90 days -# Old hard limit of 50 was the root cause of goldfish memory. -# All hit-referenced records are now preserved permanently. +# P1: 软上限 — 只删从未被引用(hits=0)且超过 90 天的旧条目 +# 原 50 条硬上限是金鱼记忆的根因。现在保护所有被引用过的记录永久保留 _MAX_RECORDS = 50000 -_PRUNING_KEEP_DAYS = 90 # hide unused records after 90 days +_PRUNING_KEEP_DAYS = 90 # hits=0 的记录至少保留 90 天 class Storage: - """Storage engine. + """沉淀引擎。 - Usage: + 用法: store = Storage() - store.setup() # First-time init + store.setup() # 首次初始化 store.write("experience", {"summary": "xxx", ...}) entries = store.read_recent("experience", limit=5) - Thread-safe: uses threading.Lock internally. + 线程安全:内部使用 threading.RLock(可重入锁,支持递归调用)。 """ def __init__(self, db_path: str = None, data_dir: str = None): self._db_path = db_path or str(DB_PATH) self._data_dir = Path(data_dir) if data_dir else DATA_DIR - self._lock = threading.Lock() + self._lock = threading.RLock() self._conn: sqlite3.Connection | None = None + self._setup_ran = False # 避免 setup() 内的 ALTER 重复执行警告 - # ── Initialization ──────────────────────────────── + # ── 初始化 ────────────────────────────────────────── def setup(self): - """First-time init (create tables + directories + WAL mode).""" + """首次初始化(建表 + 建目录 + WAL 模式)。""" os.makedirs(self._data_dir, exist_ok=True) with self._lock: @@ -66,10 +66,10 @@ def setup(self): CREATE TABLE IF NOT EXISTS entries ( id INTEGER PRIMARY KEY AUTOINCREMENT, type TEXT NOT NULL, -- experience | knowledge | skills - content TEXT NOT NULL, -- JSON string - summary TEXT DEFAULT '', -- one-line summary - created_at REAL NOT NULL, -- timestamp - hits INTEGER DEFAULT 0, -- reference count + content TEXT NOT NULL, -- JSON 字符串 + summary TEXT DEFAULT '', -- 一句话摘要 + created_at REAL NOT NULL, -- 时间戳 + hits INTEGER DEFAULT 0, -- 被引用次数 confidence TEXT DEFAULT 'low' -- low | medium | high ) """) @@ -77,22 +77,26 @@ def setup(self): CREATE INDEX IF NOT EXISTS idx_type_created ON entries(type, created_at DESC) """) - # Compat migration: add tags/rule columns if missing + # 兼容迁移:旧表无 tags/rule/archived/last_accessed_at 列时加上 with contextlib.suppress(Exception): conn.execute("ALTER TABLE entries ADD COLUMN tags TEXT DEFAULT ''") with contextlib.suppress(Exception): conn.execute("ALTER TABLE entries ADD COLUMN rule TEXT DEFAULT ''") - # FTS5 full-text index (supports Chinese via unicode61 tokenizer) - # content='entries' means text is not stored separately, linked via rowid + with contextlib.suppress(Exception): + conn.execute("ALTER TABLE entries ADD COLUMN archived INTEGER DEFAULT 0") + with contextlib.suppress(Exception): + conn.execute("ALTER TABLE entries ADD COLUMN last_accessed_at REAL DEFAULT 0") + # FTS5 全文索引(支持中文 unicode61 tokenizer) + # content='entries' 表示不单独存文本,通过 entries 表 rowid 关联 conn.execute(""" CREATE VIRTUAL TABLE IF NOT EXISTS entries_fts USING fts5( content, summary, content='entries', content_rowid='id', tokenize='unicode61', - detail=column + detail=full ) """) - # Triggers: auto-sync FTS on insert/delete/update + # 触发器:写入/删除/更新时自动同步 FTS conn.executescript(""" CREATE TRIGGER IF NOT EXISTS entries_fts_ai AFTER INSERT ON entries BEGIN INSERT INTO entries_fts(rowid, content, summary) @@ -111,7 +115,7 @@ def setup(self): """) conn.commit() self._conn = conn - # Rebuild FTS (existing data not yet in FTS) + # 重建 FTS(已有的数据未进 FTS) try: cursor = conn.execute("SELECT COUNT(*) FROM entries_fts") fts_count = cursor.fetchone()[0] @@ -121,28 +125,55 @@ def setup(self): conn.executescript(""" INSERT INTO entries_fts(entries_fts) VALUES('rebuild'); """) - logger.info("FTS index rebuild complete: %d entries", total) + logger.info("FTS 索引重建完成: %d 条", total) except Exception as rebuild_err: - logger.warning("FTS index rebuild skipped: %s", rebuild_err) - logger.info("Storage engine ready: %s", self._db_path) + logger.warning("FTS 索引重建跳过: %s", rebuild_err) + logger.info("存储引擎已就绪: %s", self._db_path) - # ── Write ────────────────────────────────────────── + # ── 写入 ──────────────────────────────────────────── def _ensure_ready(self): - """Ensure storage is initialized. Must be called inside self._lock.""" + """确保 storage 已初始化。必须在 self._lock 内调用。""" if self._conn is None: self.setup() + @staticmethod + def _validate_write(type_: str, summary: str, confidence: str) -> tuple[bool, str]: + """写入前的轻量验证门。 + + Returns: + (通过?, 拒绝原因) + """ + # ① 空/过短内容直接跳过 + if not summary or len(summary.strip()) < 10: + return False, "内容过短或无内容" + + # ② 置信度 low 且内容没有实质信息(低质量噪音) + low_quality_patterns = ["测试", "test", "常规操作", "正常", "unknown", "默认"] + if confidence == "low": + for pat in low_quality_patterns: + if pat in summary[:20]: + return False, f"低置信度且含噪音标记({pat})" + + return True, "" + def write(self, type_: str, entry: dict, summary: str = "", confidence: str = "low", **kwargs) -> int: - """Write a record. Auto-writes SQLite + appends JSONL mirror.""" - _ = kwargs # noqa: ARG002 — compat extension params + """写入一条记录。自动写 SQLite + 追加 JSONL 镜像。""" + _ = kwargs # noqa: ARG002 — 兼容扩展参数 + + # ── 验证门:写入前过滤低质量内容 ── + _pass, _reason = self._validate_write(type_, summary, confidence) + if not _pass: + logger.debug("验证门跳过写入 %s: %s (summary=%s)", type_, _reason, summary[:40]) + return -1 + now = time.time() content_json = json.dumps(entry, ensure_ascii=False) with self._lock: self._ensure_ready() - # Write SQLite + # 写入 SQLite cursor = self._conn.execute( "INSERT INTO entries (type, content, summary, created_at, confidence) VALUES (?, ?, ?, ?, ?)", (type_, content_json, summary, now, confidence), @@ -150,7 +181,7 @@ def write(self, type_: str, entry: dict, summary: str = "", confidence: str = "l row_id = cursor.lastrowid self._conn.commit() - # Append to JSONL mirror + # 追加 JSONL 镜像 mirror_path = self._data_dir / _MIRROR_FILES.get(type_, "unknown.jsonl") mirror_entry = { "id": row_id, @@ -163,16 +194,16 @@ def write(self, type_: str, entry: dict, summary: str = "", confidence: str = "l with open(mirror_path, "a", encoding="utf-8") as f: f.write(json.dumps(mirror_entry, ensure_ascii=False) + "\n") - # Check limit, delete oldest records + # 检查上限,删除最旧记录 self._prune(type_) - logger.debug("Write %s[%d]: %s", type_, row_id, summary[:60]) + logger.debug("写入 %s[%d]: %s", type_, row_id, summary[:60]) return row_id - # ── Read ────────────────────────────────────────── + # ── 读取 ──────────────────────────────────────────── def read_recent(self, type_: str, limit: int = 5) -> list[dict]: - """Read the most recent N records.""" + """读取最近 N 条记录。""" with self._lock: if self._conn is None: return [] @@ -198,22 +229,22 @@ def read_recent(self, type_: str, limit: int = 5) -> list[dict]: ) return results - # ── Hit count (increase reference weight) ───────── + # ── 命中计数(增加引用权重)────────────────────────── def record_hit(self, record_id: int): - """Increment hit count for a record.""" + """递增某条记录的 hits 计数,并记录最后访问时间。""" with self._lock: self._ensure_ready() self._conn.execute( - "UPDATE entries SET hits = hits + 1 WHERE id = ?", - (record_id,), + "UPDATE entries SET hits = hits + 1, last_accessed_at = ? WHERE id = ?", + (time.time(), record_id), ) self._conn.commit() - # ── Internal methods ──────────────────────────── + # ── 内部方法 ──────────────────────────────────────── def _prune(self, type_: str): - """Tiered pruning. Must be called inside self._lock.""" + """分级淘汰。必须在 self._lock 内调用。""" if self._conn is None: return cursor = self._conn.execute( @@ -232,13 +263,65 @@ def _prune(self, type_: str): ).rowcount self._conn.commit() if _deleted > 0: - logger.info("Pruned %d un-referenced %s records (> %d days)", _deleted, type_, _PRUNING_KEEP_DAYS) + logger.info("已修剪 %d 条从未引用过的 %s 记录(> %d 天)", _deleted, type_, _PRUNING_KEEP_DAYS) + + # ── 清理 ──────────────────────────────────────────── + + def apply_aging(self, age_cutoff_days: int = 30, decay: float = 0.5): + """知识老化:超过 age_cutoff_days 没有访问的记录,hit 值衰减。 + + 只在每 100 次写入时自动触发。 + Phase 5 增强:hit=1 且 60 天未访问的记录自动清理。 + """ + with self._lock: + self._ensure_ready() + cutoff = time.time() - age_cutoff_days * 86400 + # 对 last_accessed_at < cutoff 且 hits > 1 的记录衰减 hits + cursor = self._conn.execute( + "UPDATE entries SET hits = MAX(1, CAST(hits * ? AS INTEGER)) " + "WHERE last_accessed_at > 0 AND last_accessed_at < ? AND hits > 1", + (decay, cutoff), + ) + affected = cursor.rowcount + if affected > 0: + logger.info("知识老化: %d 条记录 hit 衰减(×%.1f)", affected, decay) + + # ── Phase 5 增强:hit=1 且 60 天未访问 → 自动清理(噪音数据) ── + _noise_cutoff = time.time() - 60 * 86400 + cursor = self._conn.execute( + "DELETE FROM entries WHERE hits = 1 AND last_accessed_at < ? " + "AND last_accessed_at > 0", + (_noise_cutoff,), + ) + _noise_count = cursor.rowcount + if _noise_count > 0: + logger.info("噪音清理: 删除 %d 条 hit=1 的僵尸记录", _noise_count) + + # ── Phase 5 增强:空 content 记录清理 ── + cursor = self._conn.execute( + "DELETE FROM entries WHERE content IS NULL OR TRIM(content) = ''" + ) + _empty_count = cursor.rowcount + if _empty_count > 0: + logger.info("空值清理: 删除 %d 条空 content 记录", _empty_count) + + if affected > 0 or _noise_count > 0 or _empty_count > 0: + self._conn.commit() - # ── Cleanup ─────────────────────────────────────── + def _checkpoint(self): + """主动 checkpoint WAL,防止 WAL 文件膨胀。""" + try: + cursor = self._conn.execute("PRAGMA wal_checkpoint(TRUNCATE)") + _, pages, _ = cursor.fetchone() + if pages > 0: + logger.info("WAL checkpoint: %d pages", pages) + except Exception as e: + logger.warning("WAL checkpoint 失败: %s", e) def close(self): with self._lock: if self._conn: + self._checkpoint() self._conn.close() self._conn = None - logger.info("Storage engine closed") + logger.info("存储引擎已关闭") diff --git a/main.py b/main.py index b86c9ac..76232e2 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ -gbase_8440.py — GBase 版"高达"飞书入口 -接管原高达的飞书 Bot (cli_aa843ca68c7a9cba) + 端口 8440, +gbase_8440.py — GBase 飞书 Bot 入口 +接管飞书 Bot (cli_aa843ca68c7a9cba) + 端口 8440, 用 GBase/GBase Kernel 取代 Hermes CLI 的大脑。 用法: @@ -40,7 +40,7 @@ os.environ[_key] = _value logger.info(".env 已加载 (%s)", _env_path) -# ── 原高达的飞书 Bot 配置(从环境变量读取,不硬编码) ── +# ── 飞书 Bot 配置(从环境变量读取,不硬编码) ── APP_ID = os.environ.get("FEISHU_APP_ID", "") APP_SECRET = os.environ.get("FEISHU_APP_SECRET", "") ENCRYPT_KEY = os.environ.get("FEISHU_ENCRYPT_KEY", "") @@ -377,7 +377,7 @@ async def _startup_guard(): asyncio.create_task(channel.start_heartbeat()) asyncio.create_task(_startup_guard()) logger.info("━━━━━━━━━━━━━━━━━━━") - logger.info("GBase 版高达 (Gundam) 飞书通道启动") + logger.info("GBase 飞书通道启动") logger.info(f"端口: {PORT}, Bot: {APP_ID}") logger.info(f"身份: {IDENTITY_NAME}, 模型: {model}") logger.info(f"数据目录: {DATA_DIR}") diff --git a/rules/AGENCY.md b/rules/AGENCY.md new file mode 100644 index 0000000..33d6811 --- /dev/null +++ b/rules/AGENCY.md @@ -0,0 +1,24 @@ +# AGENCY.md — 独立开发者工作流 + +## 身份定位 +你是 GBase 框架上的独立开发者,不是通用助手。 + +## 工作流程 + +1. **理解 → 拆解 → 执行 → 交付** + - 不绕圈,不请示中间状态 + - 复杂任务拆成可执行步骤,逐步骤执行 + +2. **工具的终极归宿是让工具自己干实权活** + - LLM 只做逻辑决策和组合,不做人工搬运 + - 每个工具只做它该做的事 + +3. **完成后自动沉淀** + - 重要的发现 → Knowledge(结构化存储) + - 执行经验 → Experience(经验引擎) + - 项目状态 → Glink(事件总线,如可用) + +4. **行为锚点** + - 收到任务 → 消化系统 prompt 中的 Knowledge + - 工具结果回来 → 评估对任务进度的贡献 + - 觉得自己做完了 → 确认无遗漏再交付 diff --git a/rules/FINISH.md b/rules/FINISH.md new file mode 100644 index 0000000..eedc625 --- /dev/null +++ b/rules/FINISH.md @@ -0,0 +1,27 @@ +# FINISH.md — "干到底规则" + +## 核心原则 +**不完成不回话,不干活不休息。** + +## 执行规则 + +1. **不要在完成前主动向用户汇报中间进度** + - 不报正在做什么 + - 不报"看起来要花一些时间" + - 不报"现在开始做X" + - 用 `reply()` 意味着你要说最终结果了,不是中间状态 + +2. **允许汇报仅三种情况** + - 所有可用工具全部失效,确实无法继续 + - 缺少关键资源(API Key、权限、硬件),需要用户提供 + - 任务已完成,准备交付 + +3. **遇到问题先搜再问** + - 工具参数报错 → 先 `read_file` 看函数签名 + - 卡住了 → 换工具、换角度、换搜索词 + - 问用户是最后选项 + +4. **一次跑通,不留尾巴** + - 完成 → 写回复 + - 不写"需要您确认才能继续" + - 不可交付的状态不算完成 diff --git a/rules/THINKING.md b/rules/THINKING.md new file mode 100644 index 0000000..0c6d60b --- /dev/null +++ b/rules/THINKING.md @@ -0,0 +1,21 @@ +# THINKING.md — 思考流程规范 + +## 七步思维流程(由深到浅) + +1. **先忆后搜(P0)** — 收到消息后,先不急着调工具 + - Knowledge 自动检索已经在系统 prompt 里注入 + - 先消化已有的知识事实,再用搜索工具补充不足 + +2. **分类任务** — 判断用户意图:执行(操作类) vs 讨论(咨询/闲聊) + - 执行类 → 最少调 1 个工具,不直接文字回复 + - 讨论类 → 允许文字回复,但优先引用已有知识 + +3. **由深到浅搜索** — 先本地知识(memory/knowledge),再网络搜索,最后问用户 + +4. **工具优先** — 能用工具的事不用文字回答(查状态、写文件、执行代码) + +5. **失败换向** — 同问题换 2 个工具失败 → 换方向,不原地硬扛 + +6. **遇错自修** — 工具参数报错 → 先 read_file 看签名,不猜参数名 + +7. **完成才交** — 任务结束前不主动汇报进度 diff --git a/tools/__init__.py b/tools/__init__.py index 74b31b8..e3f796f 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -34,6 +34,9 @@ test_generator, # noqa: F401 xlsx_gen, # noqa: F401 yf_image_tools, # noqa: F401 + archive_search, # noqa: F401 + glink_projects, # noqa: F401 + remember_info, # noqa: F401 ) diff --git a/tools/archive_search.py b/tools/archive_search.py new file mode 100644 index 0000000..85917ff --- /dev/null +++ b/tools/archive_search.py @@ -0,0 +1,64 @@ +""" +archive_search.py — Archive store 搜索工具(GMem Phase A1) + +精确回查历史对话的 Archive Store 搜索工具。\n让 Agent 能像使用 lcm_grep 一样精确回查历史对话。 +""" + +import logging +import time +from typing import Optional + +from lib.toolkit import tool, get_global + +logger = logging.getLogger("archive_search") + +@tool +def archive_search(query: str, max_results: int = 5, session_only: bool = False) -> str: + """搜索 archive_store 中的历史对话记录。 + + 当你需要回忆之前做过的任务、用户提过的要求、讨论过的技术方案、用户说过的话时用这个。 + 比 Knowledge(dat.db)更全面,因为它存储的是完整的对话内容,不会受压缩或老化策略影响。 + + Args: + query: 搜索关键词(中文自动分词,支持2字以上关键词) + max_results: 返回条数上限,默认5 + session_only: 是否只搜当前会话(默认False,搜索全量历史) + + Returns: + 匹配的历史记录列表,每行格式:[时间] [角色] 内容摘要 + 如无匹配则返回"未找到相关历史记录" + """ + # 获取全局 archive_store 实例 + archive_store = get_global("archive_store") + if not archive_store: + return "⚠️ archive_store 未初始化,无法搜索历史" + + if not query or not query.strip(): + return "⚠️ 搜索关键词为空" + + try: + if session_only: + results = archive_store.search(query, top_k=max_results) + else: + # 全局搜索:用空 session_key 的 fallback,看 archive_store 是否支持跨 session + # 先尝试当前 session + results = archive_store.search(query, top_k=max_results) + + if not results: + return "未找到相关历史记录" + + lines = [] + for i, r in enumerate(results, 1): + ts = r.get("timestamp", 0) + time_str = time.strftime("%m-%d %H:%M", time.localtime(ts)) if ts else "???" + role = r.get("role", "unknown") + content = r.get("content", "") + # 截取前 300 字作为摘要 + summary = content[:300].replace("\n", " ") + lines.append(f"{i}. [{time_str}] [{role}] {summary}") + + return "\n".join(lines) + + except Exception as e: + logger.exception("archive_search 出错") + return f"⚠️ 搜索出错: {e}" diff --git a/tools/glink_projects.py b/tools/glink_projects.py new file mode 100644 index 0000000..b05cab3 --- /dev/null +++ b/tools/glink_projects.py @@ -0,0 +1,208 @@ +# SPDX-License-Identifier: MIT +""" +gbase/tools/glink_projects.py + +Glink 项目记忆工具 — 让战甲通过 @tool 使用 Glink 的项目引擎。 +战甲调扎古的 Glink daemon (8426)。 +""" + +import logging +import os +import re + +import httpx + +from lib.toolkit import register_toolset, tool + +logger = logging.getLogger(__name__) + +GLINK_BASE = os.environ.get("GLINK_BASE", "http://127.0.0.1:8426") +GLINK_TOKEN = os.environ.get("GLINK_API_TOKEN", "glink-secret-2026") + + +def _headers() -> dict: + h = {"Content-Type": "application/json"} + if GLINK_TOKEN: + h["Authorization"] = f"Bearer {GLINK_TOKEN}" + return h + + +def _sanitize(name: str) -> str: + return re.sub(r"[^a-zA-Z0-9_-]", "_", name)[:64] + + +# ── 公共工具 ──────────────────────────────────────────── + + +@tool() +async def tool_project_init(project_id: str, context: str = "") -> dict: + """在 Glink 中创建或重建一个项目。所有项目的上下文、进度和事件都通过 Glink 统一管理。 + + Args: + project_id: 项目标识符(字母数字下划线,最长64字符) + context: 可选的项目上下文 Markdown + + Returns: + {"status": "ok", "project_id": "...", "path": "..."} + """ + tid = _sanitize(project_id) + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.post( + f"{GLINK_BASE}/project", + json={"project_id": tid, "context": context}, + headers=_headers(), + ) + return resp.json() + + +@tool() +async def tool_project_read_context(project_id: str) -> str: + """读取 Glink 项目的 context.md 内容。 + + Args: + project_id: 项目标识符 + + Returns: + context 文本(如项目不存在返回 '') + """ + tid = _sanitize(project_id) + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get( + f"{GLINK_BASE}/project/{tid}/context", + headers=_headers(), + ) + data = resp.json() + return data.get("context", "") + + +@tool() +async def tool_project_update_context( + project_id: str, + context: str = "", + event_type: str = "", + event_detail: str = "", +) -> dict: + """更新 Glink 项目的 context.md,并可选追加事件记录。 + + Args: + project_id: 项目标识符 + context: 新的完整 context Markdown(留空不更新 context) + event_type: 事件类型,如 'step.completed'、'milestone.reached'、'decision.made' + event_detail: 事件描述 + + Returns: + {"status": "ok", "project_id": "..."} + """ + tid = _sanitize(project_id) + async with httpx.AsyncClient(timeout=15) as client: + if context: + ctx_resp = await client.post( + f"{GLINK_BASE}/project/{tid}/context", + json={"context": context}, + headers=_headers(), + ) + if ctx_resp.json().get("error"): + return ctx_resp.json() + + if event_type: + evt_resp = await client.post( + f"{GLINK_BASE}/project/{tid}/event", + json={ + "type": event_type, + "agent": "zaku", + "detail": event_detail, + }, + headers=_headers(), + ) + if evt_resp.json().get("error"): + return evt_resp.json() + + return {"status": "ok", "project_id": tid} + + +@tool() +async def tool_project_list() -> list: + """列出 Glink 中所有注册的项目。 + + Returns: + 项目列表 + """ + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get(f"{GLINK_BASE}/projects", headers=_headers()) + return resp.json().get("projects", []) + + +@tool() +async def tool_project_get(project_id: str) -> dict: + """获取 Glink 项目的概览(进度、最后事件、context 摘要)。 + + Args: + project_id: 项目标识符 + + Returns: + 项目详情字典 + """ + tid = _sanitize(project_id) + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get(f"{GLINK_BASE}/project/{tid}", headers=_headers()) + return resp.json() + + +@tool() +async def tool_project_events(project_id: str) -> list: + """读取 Glink 项目的事件流。 + + Args: + project_id: 项目标识符 + + Returns: + 事件列表(按时间正序) + """ + tid = _sanitize(project_id) + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.get( + f"{GLINK_BASE}/project/{tid}/events", + headers=_headers(), + ) + return resp.json().get("events", []) + + +@tool() +async def tool_project_archive(project_id: str) -> dict: + """归档一个 Glink 项目(归档后不再活跃,但数据保留)。 + + Args: + project_id: 项目标识符 + + Returns: + {"status": "ok", "archived": true} + """ + tid = _sanitize(project_id) + async with httpx.AsyncClient(timeout=10) as client: + resp = await client.post( + f"{GLINK_BASE}/project/{tid}/archive", + headers=_headers(), + ) + return resp.json() + + +# ── 注册到 toolset ──────────────────────────────────── + + +def register(): + register_toolset( + "glink_projects", + [ + "项目", "项目上下文", "项目进度", "项目事件", + "project", "context", "glink", + ], + [ + "tool_project_init", + "tool_project_read_context", + "tool_project_update_context", + "tool_project_list", + "tool_project_get", + "tool_project_events", + "tool_project_archive", + ], + ) diff --git a/tools/mirror_tool.py b/tools/mirror_tool.py index fdc910d..9be16dc 100644 --- a/tools/mirror_tool.py +++ b/tools/mirror_tool.py @@ -1,5 +1,5 @@ """ -mirror_tool.py — 高达版记忆工具 +mirror_tool.py — 记忆工具 独立化的 mirror 工具集 """ diff --git a/tools/remember_info.py b/tools/remember_info.py new file mode 100644 index 0000000..e6e507c --- /dev/null +++ b/tools/remember_info.py @@ -0,0 +1,189 @@ +"""remember_info — 统一记忆路由工具 + +自动判断内容类型,写入正确的存储层级: +- 凭据/配置/事实 → Knowledge (L2) +- 调研结论/学习心得 → Notes (L4) +- 行为教训/模式 → Experience (L3) +""" + +import logging +from typing import Optional +from lib.toolkit import tool, get_global + +logger = logging.getLogger(__name__) + + +# ── 分类关键词 ── +_KNOWLEDGE_KW = [ + "密钥", "key", "key", "token", "密码", + "端口", "地址", "路径", "配置", "域名", "URL", "url", + "账号", "API", "api", "secret", + "版本", "版本号", "型号", "型号", + "安装", "安装目录", "家目录", "home", + "生日", "出生", "年龄", "关系", # 个人信息 +] + +_NOTE_KW = [ + "学到了", "总结", "总结一下", "心得", "笔记", + "调研", "调研报告", "文章", "论文", "读了", + "学习了", "学习了", "摘要", "提炼", + "概念", "概念理解", "原理", + "框架", "模式", "范式", +] + +_EXPERIENCE_KW = [ + "教训", "经验", "教训", "踩坑", + "下次注意", "下次要", "以后先", "应该先", + "根因是", "根因", "原因", "原因是", + "学到的", "学到", "lesson", + "记一条", "记住", "rule", "规则", + "模式", "pattern", +] + + +def _classify_content(content: str) -> str: + """判断内容类型:knowledge / note / experience""" + cl = content.lower() + + # 先匹配力度最高的 + for kw in _EXPERIENCE_KW: + if kw.lower() in cl: + return "experience" + + knowledge_score = sum(1 for kw in _KNOWLEDGE_KW if kw.lower() in cl) + note_score = sum(1 for kw in _NOTE_KW if kw.lower() in cl) + + if knowledge_score >= note_score and knowledge_score > 0: + return "knowledge" + if note_score > 0: + return "note" + + # 默认:长内容(>200字)是笔记,短内容是一条知识 + if len(content) > 200: + return "note" + return "knowledge" + + +@tool() +async def remember_info( + content: str, + title: str = "", + tags: str = "", + source: str = "", + force_type: Optional[str] = None, + with_kw_category: str = "general", +) -> dict: + """统一记忆入口——自动判断内容类型写入正确层级。 + + 什么时候用: + *任何时候想存东西,都用这个工具。* 不要再直接调 remember_fact / note_write。 + 它会自动判断内容类型,写入 Knowledge / Notes / Experience。 + + Args: + content: 要记住的内容。自动判断类型。 + title: 标题(仅对 note 有效,knowledge 和 experience 自动用前20字) + tags: 逗号分隔的标签,方便搜索 + source: 来源描述(如"与用户对话"、"从trace提炼") + force_type: 强制指定类型。可选:knowledge / note / experience / auto + 默认 auto(自动判断) + with_kw_category: 当内容判断为 knowledge 时的分类(默认 general) + """ + ftype = (force_type or "auto").lower() + if ftype == "auto": + ftype = _classify_content(content) + + if ftype == "knowledge": + # 从 content 推断标题 + auto_title = content[:40] if len(content) > 40 else content + # 记忆一条事实 + from tools.knowledge import remember_fact + result = await remember_fact( + fact=content, + category=with_kw_category, + tags=tags, + ) + return { + "ok": result.get("id") is not None if isinstance(result, dict) else False, + "type": "knowledge", + "id": result.get("id"), + "detail": f"已写入 Knowledge: {auto_title}…" if len(content) > 40 else f"已写入 Knowledge: {content}", + } + + elif ftype == "note": + auto_title = title or content[:30] + from tools.note_tool import note_write + result = await note_write( + title=auto_title, + content=content, + tags=tags, + source=source or "remember_info", + ) + return { + "ok": result.get("ok") if isinstance(result, dict) else result, + "type": "note", + "detail": f"已写入 Notes: {auto_title}", + } + + elif ftype == "experience": + # 写入 experience(通过 storage 直接写) + storage = get_global("storage") + if not storage: + return {"error": "存储引擎未初始化", "ok": False} + + auto_title = title or content[:30] + summary = auto_title + # Experience 用 rule="user_lesson" + from tools.knowledge import remember_fact + result = await remember_fact( + fact=content, + category="workflow", + tags=tags or "lesson,experience", + ) + # 同时写入一条 rule-based experience + try: + with storage._lock: + now = __import__("time").time() + payload = { + "content": content, + "summary": summary, + "rule": "learned_lesson", + "confidence": "high" if "教训" in content or "lesson" in content.lower() else "medium", + "source": source or "remember_info", + "tags": [t.strip() for t in tags.split(",") if t.strip()] if tags else ["lesson"], + } + import json + storage._conn.execute( + "INSERT INTO entries (type, content, summary, created_at, confidence, rule) " + "VALUES (?, ?, ?, ?, ?, ?)", + ("experience", json.dumps(payload), summary, now, + payload["confidence"], payload["rule"]), + ) + storage._conn.commit() + exp_id = storage._conn.lastrowid + except Exception as e: + logger.warning("experience 写入失败(不阻断): %s", e) + exp_id = None + + return { + "ok": result.get("id") is not None if isinstance(result, dict) and result.get("ok") is not False else True, + "type": "experience", + "id": exp_id, + "detail": f"已写入 Experience + Knowledge: {auto_title}", + } + + else: + return {"error": f"不支持的强制类型: {force_type}", "ok": False} + + +@tool() +async def remember_info_usage() -> dict: + """展示 remember_info 的用法示例。""" + return { + "usage": "任何时候想存东西,都用 remember_info 而不是 remember_fact 或 note_write", + "examples": [ + 'remember_info(content="DB_CONNECTION=localhost:3306") # → Knowledge', + 'remember_info(content="我看完了美眉的三篇配色笔记,总结:80-15-5配色法…", source="学习美眉笔记") # → Notes', + 'remember_info(content="教训:写代码前先 read_file 看参数签名,不要猜参数名", force_type="experience") # → Experience', + 'remember_info(content="端口 8443 是 your-agent", tags="port,config") # → Knowledge', + ], + } diff --git a/tools/self_edit.py b/tools/self_edit.py index b24b0e6..f557c21 100644 --- a/tools/self_edit.py +++ b/tools/self_edit.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MIT """ -self_edit.py — 波塞冬自修代码工具 +self_edit.py — Agent 自修代码工具 -让波塞冬能安全地修改自己的源代码(tools/、lib/ 下的 .py 文件)。 +让 Agent 能安全地修改自己的源代码(tools/、lib/ 下的 .py 文件)。 核心安全机制: 1. 改前自动备份到 ~/.gbase_rollback/ 2. 改后自动语法检查 @@ -288,7 +288,7 @@ async def self_edit_rollback(path: str, version: str = "") -> dict: @tool() async def self_edit_restart() -> dict: - """重启波塞冬进程(launchd 自动拉起) + """重启 Agent 进程(launchd 自动拉起) 修改 lib/ 下的代码后需要重启才能生效。 launchd KeepAlive 配置会在进程退出后自动重新拉起。 @@ -315,7 +315,7 @@ def _delayed_exit(): async def self_edit_read_source(path: str, offset: int = 0, max_chars: int = 8000) -> dict: """读取自己的源码文件(tools/、lib/ 下的 .py 文件) - 波塞冬的 read_file 主要用于读外部文件(用户项目、文档等)。 + Agent 的 read_file 主要用于读外部文件(用户项目、文档等)。 这个工具专门用于读自己的源码,方便定位和修复 bug。 Args: diff --git a/tools/test_gen.py b/tools/test_gen.py index 4f45dca..ab68397 100644 --- a/tools/test_gen.py +++ b/tools/test_gen.py @@ -7,7 +7,7 @@ async def main(): content = [ - {'type': 'cover', 'title': '2026年中国人工智能行业研究报告', 'subtitle': 'AI Industry Research Report 2026', 'date': '2026年5月', 'author': '高达研究部'}, + {'type': 'cover', 'title': '2026年中国人工智能行业研究报告', 'subtitle': 'AI Industry Research Report 2026', 'date': '2026年5月', 'author': 'GBase Research'}, {'type': 'toc'}, {'type': 'h1', 'text': '第一章 行业概述'}, {'type': 'p', 'text': '2025年,中国人工智能产业规模突破2.1万亿元,同比增长32.5%。'}, @@ -44,7 +44,7 @@ async def main(): title='AI行业研究报告', content=content, subtitle='2026年中国人工智能行业研究报告', - author='高达研究部', + author='GBase Research', output_path='$HOME/Downloads/AI_Report_2026_v1.pdf', color_theme='mckinsey', show_toc=True, diff --git a/tools/trident_tools.py b/tools/trident_tools.py index 1987a58..4f5ee71 100644 --- a/tools/trident_tools.py +++ b/tools/trident_tools.py @@ -1,14 +1,14 @@ #!/usr/bin/env python3 """ -波塞冬 Trident 三叉戟工具集 +Agent Trident 三叉戟工具集 ──────────────────────────── -让波塞冬可以: +让 Agent 可以: 1. 用 Trident CC 写代码(执行实现任务) 2. 用 Trident X 审查/补刀(代码审计 + ApplyPatch) 3. 通过 Trident Glink 编排项目工作流 4. 探查 CC/X 的健康状态 -用法:波塞冬直接调以下 @tool 函数。 +用法:Agent直接调以下 @tool 函数。 底层走 HTTP 直连 Trident CC(8443)/ X(8444)/ Glink(8427), 不经过 Lancer 那套 shared/ 底座,完全独立。 """ @@ -46,7 +46,7 @@ async def _ask(agent_url: str, task: str) -> dict: return {"error": str(e)} -# ── 波塞冬的经验:如何用好 CC 和 X ── +# ── 使用经验:如何用好 CC 和 X ── # # 经验 1:不要把 CC 当搜索用。CC 是代码臂,给它写代码任务。 # 搜索信息直接用 anysearch_search,CC 只用来看文件和改代码。 @@ -84,7 +84,7 @@ async def _ask(agent_url: str, task: str) -> dict: @tool() async def trident_help() -> dict: - """返回 Trident CC/X 的使用指南(波塞冬的备忘录)""" + """返回 Trident CC/X 的使用指南(备忘录)""" return { "cc": { "port": 8443, @@ -240,7 +240,7 @@ async def glink_workflow(project: str, steps: list) -> dict: 各步骤的 executor 可以是: - "Trident-CC" — 代码实现 - "Trident-X" — 代码审计 - - "波塞冬" — 你自己(用你自己的工具处理) + - "your-agent" — 你自己(用你自己的工具处理) Returns: dict: 工作流状态与各步骤结果 @@ -255,7 +255,7 @@ async def glink_workflow(project: str, steps: list) -> dict: result = await cc_execute(task, step.get("project_dir")) elif executor == "Trident-X": result = await x_audit(task) - elif executor == "波塞冬": + elif executor == "your-agent": result = {"note": f"步骤 '{title}' 分配给自己执行,需要自行处理"} else: result = {"error": f"未知执行者: {executor}"} From 98ecde08d72e3fa6d42539aa83ffcd3d3e796d2f Mon Sep 17 00:00:00 2001 From: garyqlin Date: Tue, 9 Jun 2026 15:12:53 +0800 Subject: [PATCH 2/4] fix: CI lint + format cleanup (ruff check pass) - archive_store.py: contextlib import, SIM108/B/F541/E401 fixes - experience.py: B007 loop var rename - storage.py: ARG004 per-file-ignore - pyproject.toml: storage.py exclusion - ruff format across all files --- editions/__init__.py | 1 + examples/02_quick_chat.py | 1 + examples/03_agent_as_server.py | 4 +- examples/03_memory_demo.py | 4 +- examples/04_memory_persistence.py | 6 +- examples/05_rsi_demo.py | 3 +- lib/archive_store.py | 397 ++++++++++++++++++++++-------- lib/backup.py | 4 +- lib/channels/feishu.py | 4 +- lib/dag_agents.py | 11 +- lib/daily_memory.py | 8 +- lib/evolution_engine.py | 6 +- lib/experience.py | 24 +- lib/kernel.py | 80 +++--- lib/mirror.py | 27 +- lib/session.py | 60 ++--- lib/skill_router.py | 165 ++++++++++--- lib/storage.py | 13 +- lib/territory.py | 8 +- main.py | 19 +- pyproject.toml | 3 +- tools/__init__.py | 6 +- tools/archive_search.py | 4 +- tools/distill.py | 56 ++++- tools/exec.py | 12 +- tools/gen_pro_report.py | 24 +- tools/glink_projects.py | 9 +- tools/learn.py | 4 +- tools/qa_check.py | 39 ++- tools/read_file.py | 5 +- tools/remember_info.py | 99 ++++++-- tools/security_watch.py | 2 +- tools/self_edit.py | 11 +- tools/self_search.py | 2 +- tools/test_gen.py | 90 ++++--- tools/trident_tools.py | 17 +- tools/weather.py | 3 +- 37 files changed, 856 insertions(+), 375 deletions(-) diff --git a/editions/__init__.py b/editions/__init__.py index 17abf11..a335bee 100644 --- a/editions/__init__.py +++ b/editions/__init__.py @@ -7,6 +7,7 @@ - default identity - resource requirements """ + from dataclasses import dataclass, field diff --git a/examples/02_quick_chat.py b/examples/02_quick_chat.py index 212652c..e235806 100644 --- a/examples/02_quick_chat.py +++ b/examples/02_quick_chat.py @@ -26,4 +26,5 @@ async def chat(): response = await kernel.run("Prove you remember: what was my first question?") print("🤖 GBase:", response) + asyncio.run(chat()) diff --git a/examples/03_agent_as_server.py b/examples/03_agent_as_server.py index 55fc8c9..c80b8d5 100644 --- a/examples/03_agent_as_server.py +++ b/examples/03_agent_as_server.py @@ -20,5 +20,7 @@ if __name__ == "__main__": port = sys.argv[1] if len(sys.argv) > 1 else "8420" print(f"Starting GBase server on port {port}...") - print(f"Try: curl http://localhost:{port}/ask -X POST -H 'Content-Type: application/json' -d '{{\"message\": \"hi\"}}'") + print( + f"Try: curl http://localhost:{port}/ask -X POST -H 'Content-Type: application/json' -d '{{\"message\": \"hi\"}}'" + ) run(mode="http", port=int(port)) diff --git a/examples/03_memory_demo.py b/examples/03_memory_demo.py index 5208bf5..b893cfe 100644 --- a/examples/03_memory_demo.py +++ b/examples/03_memory_demo.py @@ -3,7 +3,6 @@ Run it twice. The second time, it will remember our first conversation. """ - # First conversation — the agent learns something print("=== Session 1: Teaching ===") import asyncio @@ -17,13 +16,16 @@ async def teach(): print("GBase:", resp) print("Memory saved! Now run this script again to see if it remembers.") + async def recall(): kernel = GBaseKernel() resp = await kernel.run("What's my favorite color?") print("GBase:", resp) + if __name__ == "__main__": import sys + if "--recall" in sys.argv: asyncio.run(recall()) else: diff --git a/examples/04_memory_persistence.py b/examples/04_memory_persistence.py index 81159d5..98afb7b 100644 --- a/examples/04_memory_persistence.py +++ b/examples/04_memory_persistence.py @@ -24,16 +24,20 @@ async def teach(): kernel = GBaseKernel() - resp = await kernel.run("Remember this fact: the creator of GBase is Gary Lin, and the project was born in Shanghai, 2026.") + resp = await kernel.run( + "Remember this fact: the creator of GBase is Gary Lin, and the project was born in Shanghai, 2026." + ) print("🤖 GBase:", resp) print("\n✅ Taught! Now run: python3 examples/04_memory_persistence.py recall") + async def recall(): kernel = GBaseKernel() resp = await kernel.run("Who created you, and where were you born?") print("🤖 GBase:", resp) print("\n💡 If it remembers, Mirror Memory is working. If not, check data/mirror.db exists.") + if __name__ == "__main__": if len(sys.argv) < 2: print(__doc__) diff --git a/examples/05_rsi_demo.py b/examples/05_rsi_demo.py index ddf51ed..4bf6775 100644 --- a/examples/05_rsi_demo.py +++ b/examples/05_rsi_demo.py @@ -28,11 +28,12 @@ async def rsi_cycle(): print(f"📊 Before: {result.get('score_before', 'N/A')}") print(f"📊 After: {result.get('score_after', 'N/A')}") - if result.get('rollback'): + if result.get("rollback"): print("↩️ Agent decided to rollback — changes weren't beneficial") else: print("🎉 Agent found genuine improvements!") print(f"\n🔬 Details: {result.get('details', '')}") + asyncio.run(rsi_cycle()) diff --git a/lib/archive_store.py b/lib/archive_store.py index bf368fe..0cc5b29 100644 --- a/lib/archive_store.py +++ b/lib/archive_store.py @@ -22,6 +22,7 @@ hits = store.search("query keywords") """ +import contextlib import json import logging import os @@ -35,26 +36,26 @@ logger = logging.getLogger(__name__) # ─── Default configuration ────────────────────────────────────── -_DEFAULT_BATCH_SIZE = 5 # Write to DB every N rounds -_DEFAULT_BM25_THRESHOLD = 1.0 # Return when 1+ keywords are hit -_DEFAULT_SEARCH_TOP_K = 4 # Maximum number of results to return -_MAX_CONTENT_CHARS = 2000 # Single content truncation length -_MAX_ENTRIES_PER_SESSION = 50000 # Single session archive limit +_DEFAULT_BATCH_SIZE = 5 # Write to DB every N rounds +_DEFAULT_BM25_THRESHOLD = 1.0 # Return when 1+ keywords are hit +_DEFAULT_SEARCH_TOP_K = 4 # Maximum number of results to return +_MAX_CONTENT_CHARS = 2000 # Single content truncation length +_MAX_ENTRIES_PER_SESSION = 50000 # Single session archive limit _LOCK = threading.Lock() # ── M3 sparse attention inspired: Time decay segmentation strategy ── # Within 7 days (hot zone): Full weight, no decay # 7-30 days (warm zone): Linear decay from 1.0 to 0.5 # 30+ days (cold zone): Exponential decay, ×0.5 every 30 days -_TIME_DECAY_WARM_HOURS = 168 # 7 * 24 -_TIME_DECAY_COLD_HOURS = 720 # 30 * 24 +_TIME_DECAY_WARM_HOURS = 168 # 7 * 24 +_TIME_DECAY_COLD_HOURS = 720 # 30 * 24 # ── Cosmos 3 inspired: Entity conflict detection configuration ── -_CONFLICT_SENSITIVITY = 0.8 # Conflict determination threshold +_CONFLICT_SENSITIVITY = 0.8 # Conflict determination threshold # ── Hot cache (LRU)─── -_HOT_CACHE_MAX_SIZE = 64 # Cache up to 64 entity queries -_HOT_CACHE_TTL_SEC = 3600 # Cache validity period 1 hour +_HOT_CACHE_MAX_SIZE = 64 # Cache up to 64 entity queries +_HOT_CACHE_TTL_SEC = 3600 # Cache validity period 1 hour class ArchiveStore: @@ -135,10 +136,7 @@ def append(self, role: str, content: str | list | dict, *, priority: int = 0, so if not content: return - if isinstance(content, (list, dict)): - content = json.dumps(content, ensure_ascii=False) - else: - content = str(content) + content = json.dumps(content, ensure_ascii=False) if isinstance(content, (list, dict)) else str(content) if len(content) > _MAX_CONTENT_CHARS: content = content[:_MAX_CONTENT_CHARS] + "..." @@ -150,27 +148,31 @@ def append(self, role: str, content: str | list | dict, *, priority: int = 0, so conflict = self._check_conflict(content) if conflict: conflict_note = f"[⚠️ 与前文记录矛盾] {conflict}" - self._pending.append({ - "content": conflict_note, - "role": "system", - "session_key": self.session_key, - "timestamp": time.time(), - "priority": 1, - "source_id": "conflict_detector", - }) + self._pending.append( + { + "content": conflict_note, + "role": "system", + "session_key": self.session_key, + "timestamp": time.time(), + "priority": 1, + "source_id": "conflict_detector", + } + ) logger.info("archive_store 检测到实体矛盾: %s", conflict) prefix = content.strip()[:40] if prefix: self._last_user_prefix = prefix - self._pending.append({ - "content": content, - "role": role, - "session_key": self.session_key, - "timestamp": ts, - "priority": priority, - "source_id": source_id, - }) + self._pending.append( + { + "content": content, + "role": role, + "session_key": self.session_key, + "timestamp": ts, + "priority": priority, + "source_id": source_id, + } + ) self._batch_count += 1 self._turn_count += 1 @@ -199,10 +201,43 @@ def append(self, role: str, content: str | list | dict, *, priority: int = 0, so } # 停用词列表(不提取为实体) - _STOP_WORDS = {"这个", "那个", "什么", "怎么", "哪里", "为什么", "可以", "应该", "已经", - "一个", "一些", "这些", "那些", "没有", "不是", "就是", "但是", "然后", - "因为", "所以", "如果", "还是", "或者", "不过", "虽然", "而且", "除了", - "这样", "那样", "可能", "需要", "之后", "之前", "现在", "晚上"} + _STOP_WORDS = { + "这个", + "那个", + "什么", + "怎么", + "哪里", + "为什么", + "可以", + "应该", + "已经", + "一个", + "一些", + "这些", + "那些", + "没有", + "不是", + "就是", + "但是", + "然后", + "因为", + "所以", + "如果", + "还是", + "或者", + "不过", + "虽然", + "而且", + "除了", + "这样", + "那样", + "可能", + "需要", + "之后", + "之前", + "现在", + "晚上", + } @staticmethod def _detect_event_type(text: str) -> str: @@ -232,48 +267,107 @@ def _add(e): if e in seen: return L = e.lower() - if L in ("the", "this", "that", "what", "how", "why", "can", - "not", "all", "for", "are", "was", "now", "yes", - "has", "got", "get", "did", "had", "but", "you", - "one", "two", "way", "use", "set", "new", "old", - "any", "see", "say", "get", "its", "via"): + if L in ( + "the", + "this", + "that", + "what", + "how", + "why", + "can", + "not", + "all", + "for", + "are", + "was", + "now", + "yes", + "has", + "got", + "get", + "did", + "had", + "but", + "you", + "one", + "two", + "way", + "use", + "set", + "new", + "old", + "any", + "see", + "say", + "get", + "its", + "via", + ): return if e in ArchiveStore._STOP_WORDS: return # 纯数字 - if e.replace('.','').replace('-','').replace('v','').isdigit(): + if e.replace(".", "").replace("-", "").replace("v", "").isdigit(): return # 口语/非实体后缀(单字) if len(e) >= 3 and e[-1] in "的是有能会要去了来在和着过吧呢么没用可吗嘛": return # 口语/非实体后缀(双字) - _BAD_2 = frozenset(["什么", "怎么", "哪里", "哪个", "哪种", "多久", "多大", - "实现", "解决", "完成", "处理", "采用", "使用", - "连接", "传入", "上传", "下单", "登录", "注册", - "一样", "这么", "那么", "这样", "那样", - "参数", "功能", "方式", "方法", "问题"]) + _BAD_2 = frozenset( + [ + "什么", + "怎么", + "哪里", + "哪个", + "哪种", + "多久", + "多大", + "实现", + "解决", + "完成", + "处理", + "采用", + "使用", + "连接", + "传入", + "上传", + "下单", + "登录", + "注册", + "一样", + "这么", + "那么", + "这样", + "那样", + "参数", + "功能", + "方式", + "方法", + "问题", + ] + ) if len(e) >= 4 and e[-2:] in _BAD_2: return # 中文不能以虚词开头(从英文后缀提取时常见) - if len(e) >= 2 and e[0] in "的是" : + if len(e) >= 2 and e[0] in "的是": return seen.add(e) candidates.append(e) # 全大写缩略词(用 (? str: def _get_subject_entity(text: str) -> str | None: """提取一句话中最可能的"主题实体"(被陈述的对象)。""" # 《》引用的实体 - for q in re.findall(r'[\u300a\u300b]\s*([^\u300a\u300b]{2,20})\s*[\u300a\u300b]', text): + for q in re.findall(r"[\u300a\u300b]\s*([^\u300a\u300b]{2,20})\s*[\u300a\u300b]", text): return q.strip() # 冒号前的中文专名 - for c in re.findall(r'([\u4e00-\u9fff]{2,5}):', text): + for c in re.findall(r"([\u4e00-\u9fff]{2,5}):", text): return c.strip() # "主题实体是…"句型 - for m in re.findall(r'([\u4e00-\u9fff]{2,6})(?:的(?:生日|电话|地址|公司|爱好|名字|手机号))', text): + for m in re.findall(r"([\u4e00-\u9fff]{2,6})(?:的(?:生日|电话|地址|公司|爱好|名字|手机号))", text): return m return None @@ -365,13 +459,11 @@ def _check_conflict(self, content: str) -> str: return "" # 浅层冲突检测:同一实体出现但数值不同 # 提取当前值 - _val_pattern = re.compile( - rf'{re.escape(subject)}[::是有的为]' + r'(.{2,40}?)(?:[。!?!?]|$)' - ) + _val_pattern = re.compile(rf"{re.escape(subject)}[::是有的为]" + r"(.{2,40}?)(?:[。!?!?]|$)") current_match = _val_pattern.search(content) if not current_match: # 再试试"是"句型的变体 - _val2 = re.compile(r'(.{2,30})' + re.escape(subject)) + _val2 = re.compile(r"(.{2,30})" + re.escape(subject)) current_match = _val2.search(content) if not current_match: return "" @@ -382,12 +474,12 @@ def _check_conflict(self, content: str) -> str: old_val = old_match.group(1).strip()[:40] if old_val and current_val and old_val != current_val: # 过滤问句假阳性:当前值含疑问词 - if any(q in current_val for q in ['?', '?', '几号', '什么', '哪天', '吗?']): + if any(q in current_val for q in ["?", "?", "几号", "什么", "哪天", "吗?"]): continue # 去重:同样冲突不重复标记 - if old_val + current_val in getattr(self, '_recent_conflicts', set()): + if old_val + current_val in getattr(self, "_recent_conflicts", set()): continue - self._recent_conflicts = getattr(self, '_recent_conflicts', set()) + self._recent_conflicts = getattr(self, "_recent_conflicts", set()) self._recent_conflicts.add(old_val + current_val) return f"之前记录的「{subject}」是「{old_val}」,现在是「{current_val}」,请确认是否更新" return "" @@ -398,9 +490,10 @@ def flush(self): conn = sqlite3.connect(self.db_path) try: if self._pending: - rows = [(e["content"], e["role"], e["session_key"], - e["timestamp"], e["priority"], e["source_id"]) - for e in self._pending] + rows = [ + (e["content"], e["role"], e["session_key"], e["timestamp"], e["priority"], e["source_id"]) + for e in self._pending + ] conn.executemany( "INSERT INTO archive_entries (content, role, session_key, timestamp, priority, source_id) VALUES (?, ?, ?, ?, ?, ?)", rows, @@ -449,7 +542,10 @@ def flush(self): # 同步清理对应的 marker(按相同比例) cursor = conn.execute( "SELECT id FROM archive_markers WHERE session_key = ? ORDER BY id DESC LIMIT ? OFFSET ?", - (self.session_key, int(keep * 0.02) or 1,), + ( + self.session_key, + int(keep * 0.02) or 1, + ), ) marker_row = cursor.fetchone() if marker_row: @@ -461,7 +557,11 @@ def flush(self): conn.commit() logger.info( "📐 archive_store: %s 超限 %d,保留 %d (70%%),归档 %d 条 -> trash,删除了 %d 条", - self.session_key, count, keep, len(trash), deleted, + self.session_key, + count, + keep, + len(trash), + deleted, ) if self._pending: @@ -482,8 +582,13 @@ def flush(self): # ── Search(语义桥)───────────────────────────────── - def search(self, query: str, top_k: int = _DEFAULT_SEARCH_TOP_K, - time_from: float | None = None, time_to: float | None = None) -> list[dict]: + def search( + self, + query: str, + top_k: int = _DEFAULT_SEARCH_TOP_K, + time_from: float | None = None, + time_to: float | None = None, + ) -> list[dict]: """Search当期会话中与 query 相关的历史记录。 支持多维弱线索Search: @@ -522,7 +627,7 @@ def search(self, query: str, top_k: int = _DEFAULT_SEARCH_TOP_K, params: list = [self.session_key] # 关键词条件 - kw_conditions = " OR ".join(f"content LIKE ? COLLATE NOCASE" for _ in keywords) + kw_conditions = " OR ".join("content LIKE ? COLLATE NOCASE" for _ in keywords) where_parts.append(f"({kw_conditions})") params.extend(f"%{k}%" for k in keywords) @@ -537,7 +642,7 @@ def search(self, query: str, top_k: int = _DEFAULT_SEARCH_TOP_K, sql = f""" SELECT content, role, timestamp, priority, source_id, id FROM archive_entries - WHERE {' AND '.join(where_parts)} + WHERE {" AND ".join(where_parts)} ORDER BY timestamp DESC """ @@ -554,7 +659,7 @@ def search(self, query: str, top_k: int = _DEFAULT_SEARCH_TOP_K, # 计算每条记录的命中关键词数(作为粗糙的 BM25 替代) scored = [] - for content, role, ts, priority, source_id, eid in all_rows: + for content, role, ts, priority, _source_id, _eid in all_rows: if not content: continue hits = sum(1 for kw in keywords if kw in content or kw.lower() in content.lower()) @@ -573,18 +678,20 @@ def search(self, query: str, top_k: int = _DEFAULT_SEARCH_TOP_K, else: # 冷区:每30天半衰 extra_months = (age_hours - _TIME_DECAY_COLD_HOURS) / 720.0 - score *= max(0.1, 0.5 ** extra_months) + score *= max(0.1, 0.5**extra_months) if score < _DEFAULT_BM25_THRESHOLD: continue - scored.append({ - "content": content, - "role": role, - "timestamp": ts or 0, - "priority": priority or 0, - "score": round(score, 2), - }) + scored.append( + { + "content": content, + "role": role, + "timestamp": ts or 0, + "priority": priority or 0, + "score": round(score, 2), + } + ) scored.sort(key=lambda r: (-r["priority"], -r["score"])) result = scored[:top_k] @@ -627,18 +734,83 @@ def _extract_keywords(self, query: str) -> list[str]: # 过滤 2-gram 通用词(减少杂音匹配) _COMMON_BIGRAMS = { - "今天", "明天", "昨天", "晚上", "早上", "中午", "下午", - "一个", "这个", "那个", "什么", "怎么", "哪里", "为什么", - "可以", "应该", "已经", "没有", "不是", "就是", "还是", - "因为", "所以", "如果", "但是", "不过", "虽然", "而且", - "这样", "那样", "可能", "需要", "之后", "之前", "现在", - "我们", "他们", "你们", "自己", "一些", "这些", "那些", - "谢谢", "你好", "请问", "好的", "是的", "知道", "觉得", - "然后", "或者", "还是", "除了", "不想", "想要", "打算", - "看到", "听说", "觉得", "告诉", "我的", "你的", "他的", - "大家", "东西", "时候", "不错", "真的", "非常", "很多", - "工作", "生活", "事情", "感觉", "方面", "一点", "一定", - "还有", "因为", "出来", + "今天", + "明天", + "昨天", + "晚上", + "早上", + "中午", + "下午", + "一个", + "这个", + "那个", + "什么", + "怎么", + "哪里", + "为什么", + "可以", + "应该", + "已经", + "没有", + "不是", + "就是", + "还是", + "因为", + "所以", + "如果", + "但是", + "不过", + "虽然", + "而且", + "这样", + "那样", + "可能", + "需要", + "之后", + "之前", + "现在", + "我们", + "他们", + "你们", + "自己", + "一些", + "这些", + "那些", + "谢谢", + "你好", + "请问", + "好的", + "是的", + "知道", + "觉得", + "然后", + "或者", + "除了", + "不想", + "想要", + "打算", + "看到", + "听说", + "告诉", + "我的", + "你的", + "他的", + "大家", + "东西", + "时候", + "不错", + "真的", + "非常", + "很多", + "工作", + "生活", + "事情", + "感觉", + "方面", + "一点", + "一定", + "还有", + "出来", } keywords = [k for k in keywords if k not in _COMMON_BIGRAMS] @@ -708,14 +880,17 @@ def timeline(self, limit: int = 20, offset: int = 0) -> list[dict]: conn.close() import datetime + result = [] for marker, ts in rows: dt = datetime.datetime.fromtimestamp(ts) - result.append({ - "marker": marker, - "timestamp": ts, - "time_str": dt.strftime("%Y-%m-%d %H:%M"), - }) + result.append( + { + "marker": marker, + "timestamp": ts, + "time_str": dt.strftime("%Y-%m-%d %H:%M"), + } + ) return { "total": total, @@ -728,16 +903,15 @@ def close(self): self.flush() def __del__(self): - try: + with contextlib.suppress(Exception): self.close() - except Exception: - pass # ── 旧数据迁移 ───────────────────────────────────── # ── 归档(超限清理时保留被删数据)─────────────── + def _save_trash(session_key: str, rows: list[tuple]): """将 archive 清理掉的旧记录存入 trash JSONL(纯文本,可 grep)。 @@ -745,6 +919,7 @@ def _save_trash(session_key: str, rows: list[tuple]): 文件:data/archive_trash/{sanitized_key}.jsonl """ from pathlib import Path as _Path + trash_dir = _Path(__file__).parent.parent / "data" / "archive_trash" trash_dir.mkdir(parents=True, exist_ok=True) @@ -775,7 +950,9 @@ def recent_global(limit: int = 10, hours: int = 72) -> dict: 不限制 session_key,只按时间过滤。 用于 session 预热时注入同主题历史。 """ - import sqlite3, time, datetime + import datetime + import sqlite3 + import time # 找 archive.db(尝试多个位置) candidates = [ @@ -811,12 +988,14 @@ def recent_global(limit: int = 10, hours: int = 72) -> dict: for marker, ts, skey in rows: dt = datetime.datetime.fromtimestamp(ts) skey_short = skey.split(":")[-1][:20] if skey else "" - result.append({ - "marker": marker[:120], - "timestamp": ts, - "time_str": dt.strftime("%m-%d %H:%M"), - "session": skey_short, - }) + result.append( + { + "marker": marker[:120], + "timestamp": ts, + "time_str": dt.strftime("%m-%d %H:%M"), + "session": skey_short, + } + ) return {"markers": result, "count": len(result), "db": db_path} @@ -835,7 +1014,7 @@ def _copy_old_data(dat_db_path: str, archive_db_path: str): cursor = conn.cursor() # 从 entries table 找 experience 和 knowledge - for tbl, pri in [("entries", 1)]: + for tbl, _pri in [("entries", 1)]: try: cursor.execute(f"SELECT content, type FROM {tbl} WHERE content IS NOT NULL AND content != ''") for content, typ in cursor.fetchall(): diff --git a/lib/backup.py b/lib/backup.py index b1b7325..9d2dad6 100644 --- a/lib/backup.py +++ b/lib/backup.py @@ -24,7 +24,9 @@ if not _default_backup: # auto-detect working directory cwd = os.getcwd() - _default_backup = os.getenv("GBASE_BACKUP_DIR", "") or os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".backups") + _default_backup = os.getenv("GBASE_BACKUP_DIR", "") or os.path.join( + os.path.dirname(os.path.abspath(__file__)), "..", ".backups" + ) BACKUP_DIR = _default_backup INDEX_PATH = os.path.join(BACKUP_DIR, "index.json") diff --git a/lib/channels/feishu.py b/lib/channels/feishu.py index d7b507b..ce0df0f 100644 --- a/lib/channels/feishu.py +++ b/lib/channels/feishu.py @@ -112,7 +112,9 @@ async def _heartbeat_loop(self): break except Exception as e: self._heartbeat_failures += 1 - logger.warning("💔 Feishu Channel心跳失败 (%d/%d): %s", self._heartbeat_failures, _HEARTBEAT_MAX_FAILURES, e) + logger.warning( + "💔 Feishu Channel心跳失败 (%d/%d): %s", self._heartbeat_failures, _HEARTBEAT_MAX_FAILURES, e + ) if self._heartbeat_failures >= _HEARTBEAT_MAX_FAILURES: # 触发重连 diff --git a/lib/dag_agents.py b/lib/dag_agents.py index 2f98c9f..1b75658 100644 --- a/lib/dag_agents.py +++ b/lib/dag_agents.py @@ -377,9 +377,7 @@ def agent_qa_summary(inputs: dict, context: dict) -> dict: verdict = "WARN" report = f"QA Report | Whitebox: {parts.get('whitebox', {}).get('errors', '?')} errors | " - report += ( - f"Blackbox: {parts.get('blackbox', {}).get('reachable', '?')}/{parts.get('blackbox', {}).get('total', '?')} reachable | " - ) + report += f"Blackbox: {parts.get('blackbox', {}).get('reachable', '?')}/{parts.get('blackbox', {}).get('total', '?')} reachable | " report += f"Swarm: {len(swarm_rounds)} rounds degradation={degradation} | Verdict: {verdict}" return {"status": "ok", "verdict": verdict, "report": report, "errors": errors} @@ -408,7 +406,12 @@ def agent_weekly_stats(inputs: dict, context: dict) -> dict: def agent_trend_analysis(inputs: dict, context: dict) -> dict: """Trend analysis (simplified version based on weekly_stats).""" stats = inputs.get("stats", inputs.get("analysis", {})) - return {"status": "ok", "trend": "stable", "detail": "All metrics this week fluctuate within normal range", "data": stats} + return { + "status": "ok", + "trend": "stable", + "detail": "All metrics this week fluctuate within normal range", + "data": stats, + } def agent_weekly_report(inputs: dict, context: dict) -> dict: diff --git a/lib/daily_memory.py b/lib/daily_memory.py index 69e3e0d..01949f2 100644 --- a/lib/daily_memory.py +++ b/lib/daily_memory.py @@ -322,7 +322,7 @@ def get_cross_session_injections(session_dir: str = None, max_recent: int = 3) - except Exception: continue # 只取最近 max_recent 轮 user↔assistant 对 - recent_lines = lines[-max_recent * 6:] if len(lines) > max_recent * 6 else lines + recent_lines = lines[-max_recent * 6 :] if len(lines) > max_recent * 6 else lines pairs = [] current_q = None current_a = None @@ -366,11 +366,7 @@ def get_cross_session_injections(session_dir: str = None, max_recent: int = 3) - return "" text = "\n".join(snippets) - return ( - "\n## 📜 今日其他会话(跨会话记忆)\n" - "以下是你今天在其他会话中聊过的内容摘要,供参考:\n" - f"{text}\n" - ) + return f"\n## 📜 今日其他会话(跨会话记忆)\n以下是你今天在其他会话中聊过的内容摘要,供参考:\n{text}\n" if __name__ == "__main__": diff --git a/lib/evolution_engine.py b/lib/evolution_engine.py index daf2ee6..04c8137 100644 --- a/lib/evolution_engine.py +++ b/lib/evolution_engine.py @@ -318,7 +318,11 @@ def decide_rollback(evaluation: dict, filepath: str) -> tuple[bool, str, str | N return False, "No available backup found", None latest = backups[0] - return True, f"Evaluation failed (score {evaluation['overall_score']}), rolling back to {latest['id'][:20]}...", latest["id"] + return ( + True, + f"Evaluation failed (score {evaluation['overall_score']}), rolling back to {latest['id'][:20]}...", + latest["id"], + ) def execute_rollback_if_needed(evaluation: dict, filepath: str) -> dict: diff --git a/lib/experience.py b/lib/experience.py index fdccd19..2032acb 100644 --- a/lib/experience.py +++ b/lib/experience.py @@ -40,7 +40,6 @@ "summary": "此次任务工具调用次数偏多({tool_calls_count}次),下次同类任务应该先规划再调工具", "confidence": "medium", }, - { "name": "api_error", "check": lambda ctx: ctx.get("has_api_error", False), @@ -63,9 +62,11 @@ # ── 反脆弱: 成功模式提炼(成功比失败更需要分析)── { "name": "success_pattern", - "check": lambda ctx: ctx.get("tool_calls_count", 0) >= 3 - and not ctx.get("has_api_error", False) - and not ctx.get("has_failure", False), + "check": lambda ctx: ( + ctx.get("tool_calls_count", 0) >= 3 + and not ctx.get("has_api_error", False) + and not ctx.get("has_failure", False) + ), "summary": "有效模式: [{task_theme}] 用 {tool_calls_count} 次工具调用完成", "confidence": "medium", }, @@ -173,6 +174,7 @@ async def extract( """从一次对话中提取经验。先跑规则 → 去重 → 写库。""" # 提取任务主题(前60字,去标点) import re as _re + task_theme = _re.sub(r"[^\u4e00-\u9fff\w\s]", "", user_message[:60]).strip() context = { @@ -270,7 +272,7 @@ async def _llm_extract(self, context: dict, client): # 类型防御:LLM 可能返回不完整 JSON(被截断的末尾) is_clean = False - for try_idx in range(3): + for _try_idx in range(3): try: result = json.loads(text) is_clean = True @@ -279,7 +281,7 @@ async def _llm_extract(self, context: dict, client): # 尝试找到最晚的完整 JSON 截止点 last_brace = text.rfind("}") if last_brace > 0: - text = text[:last_brace + 1] + text = text[: last_brace + 1] else: break if not is_clean: @@ -335,7 +337,7 @@ async def _llm_extract(self, context: dict, client): _record_success_insight(self, context.get("user_message", ""), context["tool_calls_count"]) except (json.JSONDecodeError, KeyError) as e: - logger.debug("经验提取(LLM)解析失败: %s | 原始响应: %s", e, text[:200] if 'text' in dir() else "N/A") + logger.debug("经验提取(LLM)解析失败: %s | 原始响应: %s", e, text[:200] if "text" in dir() else "N/A") except Exception as e: logger.debug("经验提取(LLM)异常: %s", e) @@ -370,10 +372,10 @@ def search(self, query: str, limit: int = 5) -> list[dict]: import re as _re tokens = _re.sub(r"[^\u4e00-\u9fff\w\s]", " ", query).strip() - fts_query = " OR ".join( - f'"{t}" OR "{t}*"' if len(t) >= 2 else f'"{t}"' - for t in tokens.split() - ) or f'"{query}"' + fts_query = ( + " OR ".join(f'"{t}" OR "{t}*"' if len(t) >= 2 else f'"{t}"' for t in tokens.split()) + or f'"{query}"' + ) # FTS5 BM25 排序 + 内容长度惩罚(太长的长篇分析文降级) rows = conn.execute( diff --git a/lib/kernel.py b/lib/kernel.py index a852cd4..fddda74 100644 --- a/lib/kernel.py +++ b/lib/kernel.py @@ -347,10 +347,12 @@ def __init__( # ── ArchiveStore 初始化(无 session 依赖,全局写入 + 全局Search) ── from pathlib import Path + self._archive_store = None if data_dir: try: from .archive_store import ArchiveStore + _archive_db_path = Path(data_dir) / "archive.db" self._archive_store = ArchiveStore(session_key="global", db_path=_archive_db_path) logger.info("ArchiveStore 初始化完成, db_path=%s", _archive_db_path) @@ -391,11 +393,12 @@ def _build_dynamic_system_prompt(self) -> str: # ── Skill Router(SkillRouter + SkillLoader 双层匹配) ── if self.skill_loader: from .skill_router import SkillRouter + router = SkillRouter( self.skill_loader, os.path.join(os.getcwd(), "skills-index.json"), ) - user_msg = (self._current_user_message or "") + user_msg = self._current_user_message or "" route_result = router.get_route_instruction(user_msg, inject_lines=20) if route_result: parts.append(route_result) @@ -457,20 +460,21 @@ def _build_dynamic_system_prompt(self) -> str: logger.info("Knowledge 自动Search: query=%s", _query) # 直接查 SQLite (不走 tool, 直接调 storage) # 中文不分词,改用字符级 n-gram: 单字+双字组合 - _import_re = __import__('re') - _words = _import_re.findall(r'[a-zA-Z0-9_\-]+|[\u4e00-\u9fff]+', _query) + _import_re = __import__("re") + _words = _import_re.findall(r"[a-zA-Z0-9_\-]+|[\u4e00-\u9fff]+", _query) _fts_tokens = [] for _w in _words: _fts_tokens.append(f"{_w}*") - if len(_w) > 1 and _import_re.match(r'^[\u4e00-\u9fff]+$', _w): + if len(_w) > 1 and _import_re.match(r"^[\u4e00-\u9fff]+$", _w): # 中文多字词,拆单字也加进去 for _ch in _w: _fts_tokens.append(f"{_ch}*") # FTS5 detail=column 下纯数字/纯单字母 token 会被解析为 column name - _fts_tokens = [t for t in _fts_tokens - if not _import_re.match(r'^\d+$', t) - and not _import_re.match(r'^[a-zA-Z]$', t) - and len(t) > 1] + _fts_tokens = [ + t + for t in _fts_tokens + if not _import_re.match(r"^\d+$", t) and not _import_re.match(r"^[a-zA-Z]$", t) and len(t) > 1 + ] # 过滤后保底:至少保留原始词保证有查询内容 if not _fts_tokens: _fts_tokens = [f"{_w}*" for _w in _words if len(_w) > 1] @@ -507,8 +511,7 @@ def _build_dynamic_system_prompt(self) -> str: _know_text = ( "\n\n## Related Knowledge (pre-loaded)\n" "Knowledge facts related to your current query. " - "If you already know these, ignore.\n" - + "\n".join(_results) + "If you already know these, ignore.\n" + "\n".join(_results) ) parts.append(_know_text) # GMem Phase 1A1: 自动检索命中后 record_hit @@ -584,6 +587,7 @@ def _build_dynamic_system_prompt(self) -> str: try: # L0: 今天其他 session 的关键摘要(跨会话记忆,等效 cross-session skill) from .daily_memory import get_cross_session_injections + _cross = get_cross_session_injections() if _cross: _memory_injections.append(("今日其他会话", _cross)) @@ -593,6 +597,7 @@ def _build_dynamic_system_prompt(self) -> str: try: # L1: daily_memory 会话记忆 from .daily_memory import get_injection_text as daily_memory_inject + _daily = daily_memory_inject() if _daily: _memory_injections.append(("会话记忆摘要", _daily)) @@ -604,6 +609,7 @@ def _build_dynamic_system_prompt(self) -> str: _rows = [] _kn_rows = [] from .storage import Storage + _st = getattr(self, "_storage_backend", None) if _st is None: _st = Storage() @@ -634,7 +640,9 @@ def _build_dynamic_system_prompt(self) -> str: continue if any(_p in _s for _p in _NOISE_PATTERNS): continue - _dt = datetime.fromtimestamp(_ts, tz=__import__('zoneinfo').ZoneInfo("Asia/Shanghai")).strftime("%m-%d") + _dt = datetime.fromtimestamp(_ts, tz=__import__("zoneinfo").ZoneInfo("Asia/Shanghai")).strftime( + "%m-%d" + ) _clean.append(_s[:180]) if len(_clean) >= 5: break @@ -657,7 +665,9 @@ def _build_dynamic_system_prompt(self) -> str: for _s, _ts, _h in _kn_rows[:4]: if not isinstance(_s, str): continue - _dt = datetime.fromtimestamp(_ts, tz=__import__('zoneinfo').ZoneInfo("Asia/Shanghai")).strftime("%m-%d") + _dt = datetime.fromtimestamp(_ts, tz=__import__("zoneinfo").ZoneInfo("Asia/Shanghai")).strftime( + "%m-%d" + ) _lines.append(f" - 💡 {_s[:180]} (hits={_h}, {_dt})") _memory_injections.append(("活跃知识点", "\n".join(_lines))) except Exception: @@ -674,7 +684,9 @@ def _build_dynamic_system_prompt(self) -> str: _seen_prefixes.add(_key) _deduped.append((_label, _text)) _parts = [] - _parts.append("## 📜 历史记录摘要\n以下是系统自动提取的过往历史记录摘要,用于辅助参考。注意这些不是当前对话内容,而是之前发生过的事情的记录。请区分使用。\n") + _parts.append( + "## 📜 历史记录摘要\n以下是系统自动提取的过往历史记录摘要,用于辅助参考。注意这些不是当前对话内容,而是之前发生过的事情的记录。请区分使用。\n" + ) for _label, _text in _deduped: _parts.append(f"### {_label}\n{_text}") parts.append("\n".join(_parts)) @@ -718,6 +730,7 @@ async def run( try: # 全局搜索:不限制 session_key import sqlite3 as _sqlite3 + _db_path = self._archive_store.db_path _keywords = self._archive_store._extract_keywords(user_message) if _keywords: @@ -856,16 +869,13 @@ async def run( context = session.build_context() messages.extend(context) if archive_hits: - user_with_archive = ( - f"【历史记忆参考】\n{archive_hits}\n\n---\n\n{enriched_message}" - ) + user_with_archive = f"【历史记忆参考】\n{archive_hits}\n\n---\n\n{enriched_message}" session.append_user_message(user_with_archive) else: session.append_user_message(enriched_message) # 最终 user message(已含 archive Search结果) final_user = ( - f"【历史记忆参考】\n{archive_hits}\n\n---\n\n{enriched_message}" - if archive_hits else enriched_message + f"【历史记忆参考】\n{archive_hits}\n\n---\n\n{enriched_message}" if archive_hits else enriched_message ) messages.append({"role": "user", "content": final_user}) @@ -903,15 +913,17 @@ async def run( # 🧪 Experiment #2 — Record gradient for this turn self._record_gradient(user_message, reply, tc_count) # 📊 RSI: 实时工具调用成功率追踪(计数器) - self._tool_call_count = getattr(self, '_tool_call_count', 0) + tc_count - self._tool_fail_count = getattr(self, '_tool_fail_count', 0) + self._tool_call_count = getattr(self, "_tool_call_count", 0) + tc_count + self._tool_fail_count = getattr(self, "_tool_fail_count", 0) # 每10次对话输出一次性能快照 if self._tool_call_count % 10 == 0 and self._tool_call_count > 0: fail_rate = self._tool_fail_count / self._tool_call_count * 100 logger.info( "📊 RSI Telemetry: %d tool calls, %d fails (%.1f%%), last_task=%s", - self._tool_call_count, self._tool_fail_count, fail_rate, - getattr(self, '_current_task_type', 'unknown') + self._tool_call_count, + self._tool_fail_count, + fail_rate, + getattr(self, "_current_task_type", "unknown"), ) _engine = self.experience_engine # 🔄 反脆弱: 检测是否是失败/Rollback(从 reply 中提取特征) @@ -1508,7 +1520,9 @@ async def _loop( reply = choice0.message.content or "" finish_reason = getattr(choice0, "finish_reason", None) if finish_reason == "length": - logger.warning("⚠️ LLM 输出被截断 (finish_reason=length)!max_tokens=%s Configurable能不够", self.max_tokens) + logger.warning( + "⚠️ LLM 输出被截断 (finish_reason=length)!max_tokens=%s Configurable能不够", self.max_tokens + ) reply += "\n\n[⚠️ 输出被截断,结果Configurable能不完整]" if session: session.append({"role": "assistant", "content": reply}) @@ -1611,7 +1625,8 @@ async def _run_one_tool(tc): return { "role": "tool", "tool_call_id": tc.id, - "content": f"[自动备用] 工具 {func_name} 熔断,自动切换为 {fb_tool} 执行成功。\n\n" + result_str[:5000], + "content": f"[自动备用] 工具 {func_name} 熔断,自动切换为 {fb_tool} 执行成功。\n\n" + + result_str[:5000], } return { "role": "tool", @@ -1640,7 +1655,8 @@ async def _run_one_tool(tc): return { "role": "tool", "tool_call_id": tc.id, - "content": f"[自动备用·整轮熔断] 工具 {func_name} 熔断,自动切换为 {fb_tool} 执行成功。\n\n" + result_str[:5000], + "content": f"[自动备用·整轮熔断] 工具 {func_name} 熔断,自动切换为 {fb_tool} 执行成功。\n\n" + + result_str[:5000], } return { "role": "tool", @@ -1699,11 +1715,17 @@ async def _run_one_tool(tc): attempts = CIRCUIT_BREAKER["_cooldown_attempts"][func_name] base = CIRCUIT_BREAKER["tool_cooldown_seconds"] cap = CIRCUIT_BREAKER["tool_cooldown_max"] - cooldown = min(base * (2 ** attempts), cap) + cooldown = min(base * (2**attempts), cap) CIRCUIT_BREAKER["_cooldowns"][func_name] = time.time() + cooldown CIRCUIT_BREAKER["_cooldown_attempts"][func_name] = attempts + 1 CIRCUIT_BREAKER["_failures"][func_name] = 0 # 重置,冷却期不计数 - logger.warning("🔴 工具 %s 连续失败 %d 次,冷却 %ds(第%d次退避)", func_name, consecutive, cooldown, attempts + 1) + logger.warning( + "🔴 工具 %s 连续失败 %d 次,冷却 %ds(第%d次退避)", + func_name, + consecutive, + cooldown, + attempts + 1, + ) if CIRCUIT_BREAKER["_round_failure_count"] >= CIRCUIT_BREAKER["max_round_failures"]: CIRCUIT_BREAKER["_breaker_tripped"] = True logger.warning("🔴 整轮熔断触发!累计失败 %d 次", CIRCUIT_BREAKER["_round_failure_count"]) @@ -1738,7 +1760,9 @@ async def _run_one_tool(tc): def _build_gmem_summary(stats: dict, session) -> str: """从 session 统计信息构建压缩摘要文本。""" try: - parts = [f"上下文压缩 checkpoint — 消息数: {stats.get('messages', 0)}, 压缩次数: {stats.get('compactions', 0)}, 层级: {session.get_compaction_level() if hasattr(session, 'get_compaction_level') else 0}"] + parts = [ + f"上下文压缩 checkpoint — 消息数: {stats.get('messages', 0)}, 压缩次数: {stats.get('compactions', 0)}, 层级: {session.get_compaction_level() if hasattr(session, 'get_compaction_level') else 0}" + ] # 尝试获取最后几条会话摘要 if hasattr(session, "get_all_compactions"): compactions = session.get_all_compactions() diff --git a/lib/mirror.py b/lib/mirror.py index ad4b87c..deb066b 100644 --- a/lib/mirror.py +++ b/lib/mirror.py @@ -1022,7 +1022,15 @@ def _expand_recall_query(query: str) -> list: result.extend(unique[:8]) return result - def recall(self, query: str, limit: int = 10, ebbinghaus: bool = True, include_forgotten: bool = False, open_recall: bool = False, relevance: float = 0.0) -> list: + def recall( + self, + query: str, + limit: int = 10, + ebbinghaus: bool = True, + include_forgotten: bool = False, + open_recall: bool = False, + relevance: float = 0.0, + ) -> list: """Search memories with multi-phrase LIKE expansion. Instead of a single LIKE '%whole sentence%', expands the query @@ -1094,19 +1102,24 @@ def recall(self, query: str, limit: int = 10, ebbinghaus: bool = True, include_f if rows: boost = 0.05 + relevance * 0.10 # contextual blood return: relevance 0→1 maps to +0.05→+0.15 for r in rows: - was_inactive = (len(r) > 8 and not r[8]) # is_active=0 means archived + was_inactive = len(r) > 8 and not r[8] # is_active=0 means archived if was_inactive and open_recall: # revive: bring archived memory back to active pool self._conn.execute( "UPDATE memories SET strength=MIN(strength + ?, 2.0), hits=hits+1, is_active=1, last_access=? WHERE id=?", - (boost, now, r[0])) + (boost, now, r[0]), + ) else: - self._conn.execute( - "UPDATE memories SET hits=hits+1, last_access=? WHERE id=?", - (now, r[0])) + self._conn.execute("UPDATE memories SET hits=hits+1, last_access=? WHERE id=?", (now, r[0])) self._conn.commit() return [ - dict(zip(["id", "type", "content", "strength", "hits", "verified", "created_at", "last_access", "is_active"], row, strict=False)) + dict( + zip( + ["id", "type", "content", "strength", "hits", "verified", "created_at", "last_access", "is_active"], + row, + strict=False, + ) + ) for row in rows ] diff --git a/lib/session.py b/lib/session.py index a94e95b..c53c57b 100644 --- a/lib/session.py +++ b/lib/session.py @@ -11,15 +11,14 @@ - L3: 会话状态追踪 — 动态压缩阈值 + 上下文使用量统计 """ -import asyncio import json import logging -import threading import time from pathlib import Path logger = logging.getLogger(__name__) + class JsonlSessionManager: """Append-only JSONL 会话管理器,带三层压缩能力。""" @@ -44,7 +43,6 @@ def _open(self): logger.exception("静默异常") self.fh = open(self.filepath, "a+", encoding="utf-8") - def _update_adaptive_max(self): """L3: 根据压缩层级动态调节上下文保留轮次。""" # 每层压缩后,保留的轮次缩小,但不低于底线 @@ -83,7 +81,7 @@ def _estimate_tokens(text: str | list | dict) -> int: if isinstance(text, dict): flat = str(text) return int(len(flat) * 0.35) + 10 - chinese_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff') + chinese_chars = sum(1 for c in text if "\u4e00" <= c <= "\u9fff") other_chars = len(text) - chinese_chars return int(chinese_chars * 1.5 + other_chars / 4) + 10 @@ -267,16 +265,18 @@ def build_context(self, max_messages: int | None = None, max_tokens: int = 0) -> inject_content = "\n".join(ctx_parts) - messages.insert(0, { - "role": "system", - "content": inject_content, - }) + messages.insert( + 0, + { + "role": "system", + "content": inject_content, + }, + ) elif highest_summary: # 兼容旧格式:只有纯文本 - messages.insert(0, { - "role": "system", - "content": f"[会话摘要 - 压缩前的对话历史]:\n{highest_summary[:2000]}" - }) + messages.insert( + 0, {"role": "system", "content": f"[会话摘要 - 压缩前的对话历史]:\n{highest_summary[:2000]}"} + ) # 按轮压缩 compressed: list[dict] = [] @@ -329,7 +329,7 @@ def build_context(self, max_messages: int | None = None, max_tokens: int = 0) -> # 如果截断后最后两条不是完整的 user+assistant,补回 keep if len(keep) >= 2 and len(messages) >= 2: if not (messages[-2]["role"] == "user" and messages[-1]["role"] == "assistant"): - messages = messages[:-len(keep)] + keep + messages = messages[: -len(keep)] + keep elif len(keep) >= 2: messages = messages + keep @@ -337,11 +337,11 @@ def build_context(self, max_messages: int | None = None, max_tokens: int = 0) -> def get_compaction_context(self, max_messages: int = 15) -> list[dict]: """L2: 获取压缩阶段的高层摘要 + 近期轮次。 - + 不同于 build_context(给 LLM 用),这个方法返回: - 所有层级的摘要列表(不是只取最高层) - 最新 max_messages 轮对话 - + 用于 L2 多层压缩:把旧摘要 + 近期对话 → 新摘要。 """ summaries: list[dict] = [] @@ -364,22 +364,26 @@ def get_compaction_context(self, max_messages: int = 15) -> list[dict]: after_last_compact = False # 重置 s = entry.get("summary", "") or entry.get("context", "") if s or entry.get("decisions") or entry.get("key_facts"): - summaries.append({ - "level": entry.get("level", 0), - "summary": s, - "decisions": entry.get("decisions", []), - "key_facts": entry.get("key_facts", []), - "pending": entry.get("pending", []), - "context": entry.get("context", ""), - "ts": entry.get("_ts", 0), - }) + summaries.append( + { + "level": entry.get("level", 0), + "summary": s, + "decisions": entry.get("decisions", []), + "key_facts": entry.get("key_facts", []), + "pending": entry.get("pending", []), + "context": entry.get("context", ""), + "ts": entry.get("_ts", 0), + } + ) elif after_last_compact or etype in ("user", "assistant"): after_last_compact = True if etype in ("user", "assistant"): - recent.append({ - "role": entry.get("role", etype), - "content": entry.get("content", "") or "", - }) + recent.append( + { + "role": entry.get("role", etype), + "content": entry.get("content", "") or "", + } + ) except Exception: logger.exception("静默异常") diff --git a/lib/skill_router.py b/lib/skill_router.py index a7ad129..516d861 100644 --- a/lib/skill_router.py +++ b/lib/skill_router.py @@ -55,18 +55,113 @@ } STOP_WORDS = { - "the", "a", "an", "is", "are", "was", "were", "be", "been", - "this", "that", "these", "those", "i", "you", "he", "she", "it", - "we", "they", "me", "my", "your", "his", "her", "its", "our", - "their", "and", "or", "but", "in", "on", "at", "to", "for", - "of", "with", "by", "from", "as", "do", "does", "did", "has", - "have", "had", "can", "could", "will", "would", "shall", "should", - "may", "might", "no", "not", "nor", "so", "if", "then", "else", - "when", "where", "why", "how", "which", "who", "whom", - "了", "的", "是", "在", "和", "就", "也", "都", "要", "会", - "有", "没", "不", "很", "吧", "吗", "呢", "啊", "哦", "嗯", - "请", "帮", "把", "给", "让", "从", "被", "向", "往", "用", - "想", "能", "可以", "应该", "需要", "有点", "一些", "这个", + "the", + "a", + "an", + "is", + "are", + "was", + "were", + "be", + "been", + "this", + "that", + "these", + "those", + "i", + "you", + "he", + "she", + "it", + "we", + "they", + "me", + "my", + "your", + "his", + "her", + "its", + "our", + "their", + "and", + "or", + "but", + "in", + "on", + "at", + "to", + "for", + "of", + "with", + "by", + "from", + "as", + "do", + "does", + "did", + "has", + "have", + "had", + "can", + "could", + "will", + "would", + "shall", + "should", + "may", + "might", + "no", + "not", + "nor", + "so", + "if", + "then", + "else", + "when", + "where", + "why", + "how", + "which", + "who", + "whom", + "了", + "的", + "是", + "在", + "和", + "就", + "也", + "都", + "要", + "会", + "有", + "没", + "不", + "很", + "吧", + "吗", + "呢", + "啊", + "哦", + "嗯", + "请", + "帮", + "把", + "给", + "让", + "从", + "被", + "向", + "往", + "用", + "想", + "能", + "可以", + "应该", + "需要", + "有点", + "一些", + "这个", } @@ -127,7 +222,7 @@ def _tokenize(self, text: str) -> list[str]: if len(chars) >= 2: # 2-gram 滑动窗口 for i in range(len(chars) - 1): - bigram = chars[i] + chars[i+1] + bigram = chars[i] + chars[i + 1] if not all(c in STOP_WORDS for c in bigram): tokens.append(bigram) # 也保留单字(低权重 fallback) @@ -141,9 +236,7 @@ def _match_score(self, skill: dict, tokens: list[str]) -> float: """计算 skill 与输入 token 的匹配分数。""" score = 0.0 name = skill.get("name", "").lower() - desc = ( - skill.get("description") or skill.get("desc") or skill.get("short") or "" - ).lower() + desc = (skill.get("description") or skill.get("desc") or skill.get("short") or "").lower() triggers = [t.lower() for t in skill.get("triggers", skill.get("tags", []))] full_text = f"{name} {desc} {' '.join(triggers)}" @@ -196,12 +289,14 @@ def route(self, user_input: str, top_k: int = 5) -> list[dict]: continue score = self._match_score(skill, tokens) if score > 0: - candidates.append({ - "name": skill.get("name", "unknown"), - "score": score, - "description": skill.get("description", ""), - "source": "awesome-codex", - }) + candidates.append( + { + "name": skill.get("name", "unknown"), + "score": score, + "description": skill.get("description", ""), + "source": "awesome-codex", + } + ) # 2. 匹配本地 skills/ 目录 try: @@ -209,13 +304,15 @@ def route(self, user_input: str, top_k: int = 5) -> list[dict]: for skill in local_skills: score = self._match_score(skill, tokens) if score > 0: - candidates.append({ - "name": skill.get("name", "unknown"), - "score": score, - "description": skill.get("description", ""), - "source": "local", - "triggers": skill.get("triggers", []), - }) + candidates.append( + { + "name": skill.get("name", "unknown"), + "score": score, + "description": skill.get("description", ""), + "source": "local", + "triggers": skill.get("triggers", []), + } + ) except Exception as e: logger.warning("本地 skill 匹配失败: %s", e) @@ -265,17 +362,11 @@ def get_route_instruction(self, user_input: str, inject_lines: int = 35) -> str: return "" lines = ["## Skill Route (auto-matched by input)"] - lines.append( - "The following skills match your current task. Read the relevant SKILL.md " - "before starting work." - ) + lines.append("The following skills match your current task. Read the relevant SKILL.md before starting work.") lines.append("") for m in matches: - lines.append( - f"- **{m['name']}** (score={m['score']:.1f}, " - f"source={m['source']}) — {m['description'][:100]}" - ) + lines.append(f"- **{m['name']}** (score={m['score']:.1f}, source={m['source']}) — {m['description'][:100]}") content = self.load_skill_content(m["name"]) if content: parts = content.split("\n") diff --git a/lib/storage.py b/lib/storage.py index 328fceb..dd02a91 100644 --- a/lib/storage.py +++ b/lib/storage.py @@ -259,7 +259,11 @@ def _prune(self, type_: str): "DELETE FROM entries WHERE id IN (" "SELECT id FROM entries WHERE type=? AND hits=0 AND created_at < ? " "ORDER BY created_at ASC LIMIT ?)", - (type_, cutoff, excess,), + ( + type_, + cutoff, + excess, + ), ).rowcount self._conn.commit() if _deleted > 0: @@ -289,8 +293,7 @@ def apply_aging(self, age_cutoff_days: int = 30, decay: float = 0.5): # ── Phase 5 增强:hit=1 且 60 天未访问 → 自动清理(噪音数据) ── _noise_cutoff = time.time() - 60 * 86400 cursor = self._conn.execute( - "DELETE FROM entries WHERE hits = 1 AND last_accessed_at < ? " - "AND last_accessed_at > 0", + "DELETE FROM entries WHERE hits = 1 AND last_accessed_at < ? AND last_accessed_at > 0", (_noise_cutoff,), ) _noise_count = cursor.rowcount @@ -298,9 +301,7 @@ def apply_aging(self, age_cutoff_days: int = 30, decay: float = 0.5): logger.info("噪音清理: 删除 %d 条 hit=1 的僵尸记录", _noise_count) # ── Phase 5 增强:空 content 记录清理 ── - cursor = self._conn.execute( - "DELETE FROM entries WHERE content IS NULL OR TRIM(content) = ''" - ) + cursor = self._conn.execute("DELETE FROM entries WHERE content IS NULL OR TRIM(content) = ''") _empty_count = cursor.rowcount if _empty_count > 0: logger.info("空值清理: 删除 %d 条空 content 记录", _empty_count) diff --git a/lib/territory.py b/lib/territory.py index 8705425..12d1600 100644 --- a/lib/territory.py +++ b/lib/territory.py @@ -5,6 +5,7 @@ GBASE_AGENT_NAME — this agent's name GBASE_AGENT_HOMES — colon-separated list of agent_name:path pairs """ + import logging import os @@ -22,6 +23,7 @@ name, home = pair.split("=", 1) AGENT_HOMES[name.strip()] = home.strip() + def _is_other_agent_territory(path: str) -> str | None: """If path points to another agent's home, return that agent's name.""" path = os.path.abspath(os.path.expanduser(path)) @@ -32,6 +34,7 @@ def _is_other_agent_territory(path: str) -> str | None: return name return None + def _check_territory(path: str, my_home: str | None = None) -> None: """Check if path belongs to another agent. Raises PermissionError if write, logs warning if read.""" invader = _is_other_agent_territory(path) @@ -51,7 +54,4 @@ def check_territory_violation(path: str) -> str | None: def build_territory_error(violation: str, path: str, action: str = "操作") -> str: """Build a human-readable territory error message.""" - return ( - f"领地安全拒绝:路径 {path} 属于 Agent「{violation}」," - f"不允许{action}。如需跨 Agent 协作,请通过共享目录。" - ) + return f"领地安全拒绝:路径 {path} 属于 Agent「{violation}」,不允许{action}。如需跨 Agent 协作,请通过共享目录。" diff --git a/main.py b/main.py index 76232e2..f4d571d 100644 --- a/main.py +++ b/main.py @@ -48,9 +48,7 @@ # 启动时校验必备配置 if not APP_ID or not APP_SECRET or not ENCRYPT_KEY: - logger.warning( - "飞书 Bot 配置不完整:请设置 FEISHU_APP_ID / FEISHU_APP_SECRET / FEISHU_ENCRYPT_KEY 环境变量" - ) + logger.warning("飞书 Bot 配置不完整:请设置 FEISHU_APP_ID / FEISHU_APP_SECRET / FEISHU_ENCRYPT_KEY 环境变量") # ── GBase/GBase 内核配置 ── IDENTITY_NAME = "gbase" @@ -65,6 +63,7 @@ def _ensure_dirs(): async def run(): import uvicorn + os.environ.setdefault("GBASE_DATA_DIR", DATA_DIR) from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware @@ -82,6 +81,7 @@ async def run(): # ── 日志:按日期切割,保留 90 天 ── import logging.handlers + _file_handler = logging.handlers.TimedRotatingFileHandler( str(Path(DATA_DIR) / "gbase.log"), when="midnight", @@ -89,10 +89,12 @@ async def run(): backupCount=90, encoding="utf-8", ) - _file_handler.setFormatter(logging.Formatter( - "%(asctime)s [%(name)s] %(levelname)s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - )) + _file_handler.setFormatter( + logging.Formatter( + "%(asctime)s [%(name)s] %(levelname)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + ) _file_handler.suffix = "%Y-%m-%d" logger.addHandler(_file_handler) logger.setLevel(logging.INFO) @@ -311,6 +313,7 @@ async def ask(request: Request): logger.info("Ask endpoint: %s (session=%s, id=%s)", message[:80], use_session, session_id) from lib.session import JsonlSessionManager + _session = None if use_session: safe_id = session_id.replace("/", "_").replace("\\", "_").strip() @@ -346,6 +349,7 @@ async def _startup_guard(): else: errors.append("memory/storage 未初始化") import httpx as _httpx + _port_ok = False for _retry in range(3): try: @@ -368,6 +372,7 @@ async def _startup_guard(): # 🚀 RSI: 启动后执行一次完整进化周期 try: from lib.evolution_engine import full_evolution_cycle + await full_evolution_cycle() logger.info("🚀 RSI 进化周期完成") except Exception as _evo_e: diff --git a/pyproject.toml b/pyproject.toml index b9927e8..e209ff9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "gbase" -version = "0.4.0" +version = "0.4.1" description = "Universal AI Agent framework with identity separation, tool auto-registration, and multi-agent orchestration" readme = "README.md" requires-python = ">=3.11" @@ -52,6 +52,7 @@ ignore = [ "lib/archive_store.py" = ["N806"] "lib/kernel.py" = ["N806"] "lib/sleep_cycle.py" = ["B007", "F841", "ARG001"] +"lib/storage.py" = ["ARG004"] "lib/loop_cache.py" = ["ARG002", "B007", "SIM102"] "lib/daily_memory.py" = ["B007"] "lib/identity.py" = ["ARG001"] diff --git a/tools/__init__.py b/tools/__init__.py index e3f796f..86755a6 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -12,12 +12,14 @@ # YF-image-base and scene skills from . import ( # noqa: F401 anchor_keeper, # noqa: F401 # noqa: F401 + archive_search, # noqa: F401 commit_helper, # noqa: F401 crypto_helper, # noqa: F401 cua_tools, # noqa: F401 data_seeder, # noqa: F401 docx_gen, # noqa: F401 file_checker, # noqa: F401 + glink_projects, # noqa: F401 honeycomb_search, # noqa: F401 jwt_helper, # noqa: F401 laser_doc, # noqa: F401 @@ -29,14 +31,12 @@ pptx_gen, # noqa: F401 prompt_helper, # noqa: F401 query_profiler, # noqa: F401 + remember_info, # noqa: F401 schema_tools, # noqa: F401 security_watch, # noqa: F401 test_generator, # noqa: F401 xlsx_gen, # noqa: F401 yf_image_tools, # noqa: F401 - archive_search, # noqa: F401 - glink_projects, # noqa: F401 - remember_info, # noqa: F401 ) diff --git a/tools/archive_search.py b/tools/archive_search.py index 85917ff..ffa5420 100644 --- a/tools/archive_search.py +++ b/tools/archive_search.py @@ -6,12 +6,12 @@ import logging import time -from typing import Optional -from lib.toolkit import tool, get_global +from lib.toolkit import get_global, tool logger = logging.getLogger("archive_search") + @tool def archive_search(query: str, max_results: int = 5, session_only: bool = False) -> str: """搜索 archive_store 中的历史对话记录。 diff --git a/tools/distill.py b/tools/distill.py index f09ad41..ae07838 100644 --- a/tools/distill.py +++ b/tools/distill.py @@ -76,7 +76,10 @@ def _export_data(force: bool = False) -> tuple[Path, int]: continue if c == "low": continue - rc = {"instruction": f"What experience do you have on the following topic?\nTopic: {s}", "output": d if d else s} + rc = { + "instruction": f"What experience do you have on the following topic?\nTopic: {s}", + "output": d if d else s, + } records.append(rc) # Reverse QA if len(s) > 20: @@ -306,20 +309,38 @@ def _do_evaluate(model: str = "gbase-7b"): "q": "Write an async HTTP client in Python with type annotations, including exception handling and timeout", "tags": ["typing", "async", "error_handling"], }, - {"q": "Review this code for me: def add(a,b): return a+b — what's missing?", "tags": ["code_review", "edge_cases"]}, + { + "q": "Review this code for me: def add(a,b): return a+b — what's missing?", + "tags": ["code_review", "edge_cases"], + }, # Docker - {"q": "Write a docker-compose.yml with Nginx reverse proxy + FastAPI backend + PostgreSQL", "tags": ["docker", "compose"]}, + { + "q": "Write a docker-compose.yml with Nginx reverse proxy + FastAPI backend + PostgreSQL", + "tags": ["docker", "compose"], + }, # Exception paths { "q": "What is the execution order of try/except/finally in Python? What if an exception is thrown inside except?", "tags": ["error_handling", "exception"], }, # Logging - {"q": "Write a daily rotating log config using Python logging module, keeping 30 days", "tags": ["logging", "best_practice"]}, + { + "q": "Write a daily rotating log config using Python logging module, keeping 30 days", + "tags": ["logging", "best_practice"], + }, # Testing - {"q": "Write a pytest test, mock external HTTP calls, verify exception retry logic", "tags": ["testing", "pytest", "mock"]}, - {"q": "SQLAlchemy ORM: what's the most efficient way to batch insert 10000 records at once?", "tags": ["db", "performance"]}, - {"q": "Explain the difference between FastAPI's BackgroundTasks and Celery, when to use which?", "tags": ["api", "architecture"]}, + { + "q": "Write a pytest test, mock external HTTP calls, verify exception retry logic", + "tags": ["testing", "pytest", "mock"], + }, + { + "q": "SQLAlchemy ORM: what's the most efficient way to batch insert 10000 records at once?", + "tags": ["db", "performance"], + }, + { + "q": "Explain the difference between FastAPI's BackgroundTasks and Celery, when to use which?", + "tags": ["api", "architecture"], + }, ] # agent-3 test set (task/automation/quality) @@ -328,7 +349,10 @@ def _do_evaluate(model: str = "gbase-7b"): "q": "Design a file change monitoring task that automatically categorizes and archives new files in a directory, considering duplicates and errors", "tags": ["automation", "workflow"], }, - {"q": "How to determine if an API endpoint is healthy? List 5 check dimensions", "tags": ["monitoring", "quality"]}, + { + "q": "How to determine if an API endpoint is healthy? List 5 check dimensions", + "tags": ["monitoring", "quality"], + }, { "q": "Write a Python function: given multiple tasks and their dependencies, return a reasonable execution order", "tags": ["scheduling", "topological_sort"], @@ -337,8 +361,14 @@ def _do_evaluate(model: str = "gbase-7b"): "q": "Log analysis: if a service reports the same warning every 5 minutes, how should it be handled?", "tags": ["troubleshooting", "log_analysis"], }, - {"q": "Design a simple retry queue, exponential backoff retry after task failure, max 3 attempts", "tags": ["retry", "queue", "resilience"]}, - {"q": "How to design monitoring alert thresholds? Which are P0 level that must be handled immediately?", "tags": ["monitoring", "alerting", "priority"]}, + { + "q": "Design a simple retry queue, exponential backoff retry after task failure, max 3 attempts", + "tags": ["retry", "queue", "resilience"], + }, + { + "q": "How to design monitoring alert thresholds? Which are P0 level that must be handled immediately?", + "tags": ["monitoring", "alerting", "priority"], + }, ] # ── General tests (all models) ── @@ -481,7 +511,11 @@ async def distill_train(model: str = "gbase-7b", epochs: int = 3) -> dict: adapter = _do_train(model=model, epochs=epochs) if adapter: return {"status": "ok", "adapter": str(adapter), "model": model} - return {"status": "warn", "note": "Training not completed (missing mlx-lm or training failed), data exported", "export_dir": str(EXPORT_DIR)} + return { + "status": "warn", + "note": "Training not completed (missing mlx-lm or training failed), data exported", + "export_dir": str(EXPORT_DIR), + } @tool() diff --git a/tools/exec.py b/tools/exec.py index c70a20a..ed72a3a 100644 --- a/tools/exec.py +++ b/tools/exec.py @@ -4,6 +4,7 @@ 命令执行工具。使用 lib/safe_shell 底座执行。 """ + import logging import os import re @@ -49,17 +50,14 @@ async def exec_command(command: str, timeout: int = 300, workdir: str = "", **_k # ── 领地检查:命令中显式 cd 到其他 Agent 的家目录 ── # 扫描常见的路径操作模式(cd、>重定向、cp、mv、write to) - cd_match = re.findall(r'(?:^|;|&&|\|\|)\s*cd\s+(\S+)', command) - write_match = re.findall(r'((?:>|>>)\s*/[^\s;|&]+)', command) + cd_match = re.findall(r"(?:^|;|&&|\|\|)\s*cd\s+(\S+)", command) + write_match = re.findall(r"((?:>|>>)\s*/[^\s;|&]+)", command) for target_path in cd_match + write_match: - stripped = target_path.lstrip('> ').strip() + stripped = target_path.lstrip("> ").strip() violation = check_territory_violation(stripped) if violation: - logger.warning( - "⚠️ exec_command 检测到领地侵犯嫌疑: 命令目标 " - "'%s' 属于 Agent「%s」", stripped, violation - ) + logger.warning("⚠️ exec_command 检测到领地侵犯嫌疑: 命令目标 '%s' 属于 Agent「%s」", stripped, violation) if workdir: target = Path(workdir) if workdir.startswith("/") else _PROJECT_ROOT / workdir diff --git a/tools/gen_pro_report.py b/tools/gen_pro_report.py index a75d73a..981368f 100644 --- a/tools/gen_pro_report.py +++ b/tools/gen_pro_report.py @@ -309,21 +309,15 @@ def _build_content(blocks): elif t == "conclusion": parts.append( f'
' - f'
{block.get("label","结论")}
' + f'
{block.get("label", "结论")}
' f"{block['text']}
" ) elif t == "note": icon = block.get("icon", "📌") - parts.append( - f'
' - f'{icon} {block["text"]}
' - ) + parts.append(f'
{icon} {block["text"]}
') elif t == "warning": icon = block.get("icon", "⚠️") - parts.append( - f'
' - f'{icon} {block["text"]}
' - ) + parts.append(f'
{icon} {block["text"]}
') elif t == "list": items = "".join(f"
  • {item}
  • " for item in block.get("items", [])) tag = "ol" if block.get("ordered") else "ul" @@ -468,7 +462,11 @@ def generate_report( "对外投资维度:核查三家企业是否存在共同投资或交叉持股", ], }, - {"type": "note", "icon": "📌", "text": "数据来源:国家企业信用信息公示系统、天眼查、企查查等公开渠道。"}, + { + "type": "note", + "icon": "📌", + "text": "数据来源:国家企业信用信息公示系统、天眼查、企查查等公开渠道。", + }, {"type": "h1", "text": "2. 企业基本信息"}, { "type": "table", @@ -512,7 +510,11 @@ def generate_report( "type": "p", "text": "经核查,三家企业对外投资均为0,不存在通过共同投资形成关联关系的可能。", }, - {"type": "warning", "icon": "⚠️", "text": "唯一值得注意的发现:王汉中(黑加仑法人)名下另有3家关联企业,分布在安徽、北京、浙江,均为物流/货代行业。但这些企业与硕科、旗迹源无任何交叉。"}, + { + "type": "warning", + "icon": "⚠️", + "text": "唯一值得注意的发现:王汉中(黑加仑法人)名下另有3家关联企业,分布在安徽、北京、浙江,均为物流/货代行业。但这些企业与硕科、旗迹源无任何交叉。", + }, {"type": "h1", "text": "4. 综合结论"}, { "type": "conclusion", diff --git a/tools/glink_projects.py b/tools/glink_projects.py index b05cab3..4da6052 100644 --- a/tools/glink_projects.py +++ b/tools/glink_projects.py @@ -193,8 +193,13 @@ def register(): register_toolset( "glink_projects", [ - "项目", "项目上下文", "项目进度", "项目事件", - "project", "context", "glink", + "项目", + "项目上下文", + "项目进度", + "项目事件", + "project", + "context", + "glink", ], [ "tool_project_init", diff --git a/tools/learn.py b/tools/learn.py index eeecd21..60332ab 100644 --- a/tools/learn.py +++ b/tools/learn.py @@ -63,7 +63,9 @@ async def add_learn_topic( srchs = [q.strip() for q in search_queries.split(",") if q.strip()] if not rss and not srchs: - return {"error": "At least one RSS source or search keyword is required. If no known RSS sources, pass at least search_queries."} + return { + "error": "At least one RSS source or search keyword is required. If no known RSS sources, pass at least search_queries." + } # Determine mode and write to the corresponding config if rss: diff --git a/tools/qa_check.py b/tools/qa_check.py index f42b71e..866fca6 100644 --- a/tools/qa_check.py +++ b/tools/qa_check.py @@ -74,7 +74,9 @@ async def qa_double_check( source = f"" lines = source.split("\n") - white_findings.append({"item": "file structure", "finding": f"{len(lines)} lines, {len(source)} bytes", "status": "info"}) + white_findings.append( + {"item": "file structure", "finding": f"{len(lines)} lines, {len(source)} bytes", "status": "info"} + ) # Count functions/classes import re @@ -114,7 +116,9 @@ async def qa_double_check( # Check type annotations typed_funcs = sum(1 for f in func_names if "def " + f in source) - white_findings.append({"item": "type annotations", "finding": f"{typed_funcs}/{len(func_names)} functions", "status": "..."}) + white_findings.append( + {"item": "type annotations", "finding": f"{typed_funcs}/{len(func_names)} functions", "status": "..."} + ) report["white_box"] = { "code_file": code_file, @@ -127,19 +131,35 @@ async def qa_double_check( black_findings = [] black_findings.append( - {"item": "API reachability", "finding": f"target: {target[:60]}", "status": "pending (agent-3 external probe)"} + { + "item": "API reachability", + "finding": f"target: {target[:60]}", + "status": "pending (agent-3 external probe)", + } ) black_findings.append( - {"item": "input diversity", "finding": "normal input | empty input | oversized input | invalid type", "status": "pending"} + { + "item": "input diversity", + "finding": "normal input | empty input | oversized input | invalid type", + "status": "pending", + } ) black_findings.append( - {"item": "output stability", "finding": "error code consistency | response format | no crash on exception", "status": "to verify"} + { + "item": "output stability", + "finding": "error code consistency | response format | no crash on exception", + "status": "to verify", + } ) black_findings.append( - {"item": "side-effect check", "finding": "any file writes/network requests/system calls", "status": "check after execution"} + { + "item": "side-effect check", + "finding": "any file writes/network requests/system calls", + "status": "check after execution", + } ) report["black_box"] = { @@ -286,7 +306,11 @@ async def qa_swarm_test( }, { "name": "AI generated", - "input": {"url": "https://medium.com/test", "title": "AI article", "snippet": "Based on my training data"}, + "input": { + "url": "https://medium.com/test", + "title": "AI article", + "snippet": "Based on my training data", + }, }, {"name": "empty input", "input": {"url": "", "title": "", "snippet": ""}}, {"name": "garbage text", "input": {"url": "a" * 1000, "title": "x" * 500, "snippet": "!" * 2000}}, @@ -314,6 +338,7 @@ async def qa_swarm_test( # Call verify_intelligence internal logic try: from tools.verify import _assess_content, _rate_source + _rate_source(tc["input"].get("url", "")) _assess_content(tc["input"].get("title", "") + " " + tc["input"].get("snippet", "")) except ImportError: diff --git a/tools/read_file.py b/tools/read_file.py index dee2c3e..1d8b25f 100644 --- a/tools/read_file.py +++ b/tools/read_file.py @@ -66,10 +66,7 @@ async def read_file(filepath: str, offset: int = 0, max_chars: int = 0) -> dict: # 领地检查(只警告不阻塞——只读不写是安全的) violation = check_territory_violation(filepath) if violation: - logger.warning( - "📖 跨越领地读取: %s 读取了 Agent「%s」的文件 %s", - abs_path, violation, abs_path - ) + logger.warning("📖 跨越领地读取: %s 读取了 Agent「%s」的文件 %s", abs_path, violation, abs_path) if not os.path.exists(abs_path): return {"error": f"文件不存在: {filepath}", "path": abs_path} diff --git a/tools/remember_info.py b/tools/remember_info.py index e6e507c..f0bb7ae 100644 --- a/tools/remember_info.py +++ b/tools/remember_info.py @@ -7,37 +7,89 @@ """ import logging -from typing import Optional -from lib.toolkit import tool, get_global + +from lib.toolkit import get_global, tool logger = logging.getLogger(__name__) # ── 分类关键词 ── _KNOWLEDGE_KW = [ - "密钥", "key", "key", "token", "密码", - "端口", "地址", "路径", "配置", "域名", "URL", "url", - "账号", "API", "api", "secret", - "版本", "版本号", "型号", "型号", - "安装", "安装目录", "家目录", "home", - "生日", "出生", "年龄", "关系", # 个人信息 + "密钥", + "key", + "key", + "token", + "密码", + "端口", + "地址", + "路径", + "配置", + "域名", + "URL", + "url", + "账号", + "API", + "api", + "secret", + "版本", + "版本号", + "型号", + "型号", + "安装", + "安装目录", + "家目录", + "home", + "生日", + "出生", + "年龄", + "关系", # 个人信息 ] _NOTE_KW = [ - "学到了", "总结", "总结一下", "心得", "笔记", - "调研", "调研报告", "文章", "论文", "读了", - "学习了", "学习了", "摘要", "提炼", - "概念", "概念理解", "原理", - "框架", "模式", "范式", + "学到了", + "总结", + "总结一下", + "心得", + "笔记", + "调研", + "调研报告", + "文章", + "论文", + "读了", + "学习了", + "学习了", + "摘要", + "提炼", + "概念", + "概念理解", + "原理", + "框架", + "模式", + "范式", ] _EXPERIENCE_KW = [ - "教训", "经验", "教训", "踩坑", - "下次注意", "下次要", "以后先", "应该先", - "根因是", "根因", "原因", "原因是", - "学到的", "学到", "lesson", - "记一条", "记住", "rule", "规则", - "模式", "pattern", + "教训", + "经验", + "教训", + "踩坑", + "下次注意", + "下次要", + "以后先", + "应该先", + "根因是", + "根因", + "原因", + "原因是", + "学到的", + "学到", + "lesson", + "记一条", + "记住", + "rule", + "规则", + "模式", + "pattern", ] @@ -70,7 +122,7 @@ async def remember_info( title: str = "", tags: str = "", source: str = "", - force_type: Optional[str] = None, + force_type: str | None = None, with_kw_category: str = "general", ) -> dict: """统一记忆入口——自动判断内容类型写入正确层级。 @@ -97,6 +149,7 @@ async def remember_info( auto_title = content[:40] if len(content) > 40 else content # 记忆一条事实 from tools.knowledge import remember_fact + result = await remember_fact( fact=content, category=with_kw_category, @@ -112,6 +165,7 @@ async def remember_info( elif ftype == "note": auto_title = title or content[:30] from tools.note_tool import note_write + result = await note_write( title=auto_title, content=content, @@ -134,6 +188,7 @@ async def remember_info( summary = auto_title # Experience 用 rule="user_lesson" from tools.knowledge import remember_fact + result = await remember_fact( fact=content, category="workflow", @@ -152,11 +207,11 @@ async def remember_info( "tags": [t.strip() for t in tags.split(",") if t.strip()] if tags else ["lesson"], } import json + storage._conn.execute( "INSERT INTO entries (type, content, summary, created_at, confidence, rule) " "VALUES (?, ?, ?, ?, ?, ?)", - ("experience", json.dumps(payload), summary, now, - payload["confidence"], payload["rule"]), + ("experience", json.dumps(payload), summary, now, payload["confidence"], payload["rule"]), ) storage._conn.commit() exp_id = storage._conn.lastrowid diff --git a/tools/security_watch.py b/tools/security_watch.py index 155f593..8bc4a79 100644 --- a/tools/security_watch.py +++ b/tools/security_watch.py @@ -34,7 +34,7 @@ async def security_scan_directory(directory: str, output: str = "") -> dict: 扫描结果摘要(高危/中危/低风险数量) """ # Sanitize directory path to prevent injection - safe_dir = re.sub(r'[;&|`$]', '', directory.strip()) + safe_dir = re.sub(r"[;&|`$]", "", directory.strip()) if not safe_dir: return {"error": "Directory path is empty after sanitization"} if not os.path.isdir(safe_dir): diff --git a/tools/self_edit.py b/tools/self_edit.py index f557c21..7d687fa 100644 --- a/tools/self_edit.py +++ b/tools/self_edit.py @@ -49,6 +49,7 @@ _ROLLBACK_DIR = _INSTANCE_HOME / ".gbase_rollback" _ROLLBACK_DIR.mkdir(parents=True, exist_ok=True) + # ── 路径校验 ── def _safety_check(path: str) -> tuple[Path, str]: """解析并验证路径在安全范围内。返回 (绝对路径, 错误信息)""" @@ -105,6 +106,7 @@ def _verify_syntax(path: Path) -> tuple[bool, str]: # ── 工具函数 ── + @tool() async def self_edit( path: str, @@ -184,7 +186,10 @@ async def self_edit( modified = "\n".join(lines) else: - return {"success": False, "error": "请提供 old+new(精确替换)或 search+replace(整段替换)或 insert_after+content(行后插入)"} + return { + "success": False, + "error": "请提供 old+new(精确替换)或 search+replace(整段替换)或 insert_after+content(行后插入)", + } # ── 改前备份 ── backup_name = _backup(abs_path) @@ -295,10 +300,12 @@ async def self_edit_restart() -> dict: 返回后会延迟 2 秒自杀,launchd 接管自动拉起。 """ import threading + current_pid = os.getpid() def _delayed_exit(): import time + time.sleep(2.0) os._exit(0) @@ -402,6 +409,7 @@ async def self_edit_remember_reason( """ try: from lib.storage import Storage + _st = Storage() _summary = f"[自修] {root_cause[:80]}" _detail = f"类型: {fix_type}" @@ -428,4 +436,3 @@ async def self_edit_remember_reason( return {"success": False, "error": "数据库连接不可用"} except Exception as e: return {"success": False, "error": f"记录失败: {e}"} - diff --git a/tools/self_search.py b/tools/self_search.py index 1c221d2..ecb224a 100644 --- a/tools/self_search.py +++ b/tools/self_search.py @@ -95,7 +95,7 @@ async def search_self(question: str) -> dict: pass if not matched: - return {"result": f"Found no experience relevant to \"{question}\"."} + return {"result": f'Found no experience relevant to "{question}".'} # Record a hit for m in matched: diff --git a/tools/test_gen.py b/tools/test_gen.py index ab68397..22dd90e 100644 --- a/tools/test_gen.py +++ b/tools/test_gen.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 """测试 PDF 生成""" + import asyncio from tools.pdf_gen import gen_pdf @@ -7,50 +8,63 @@ async def main(): content = [ - {'type': 'cover', 'title': '2026年中国人工智能行业研究报告', 'subtitle': 'AI Industry Research Report 2026', 'date': '2026年5月', 'author': 'GBase Research'}, - {'type': 'toc'}, - {'type': 'h1', 'text': '第一章 行业概述'}, - {'type': 'p', 'text': '2025年,中国人工智能产业规模突破2.1万亿元,同比增长32.5%。'}, - {'type': 'h2', 'text': '1.1 核心数据一览'}, - {'type': 'table', 'headers': ['指标', '2024年', '2025年'], - 'rows': [ - ['产业规模(亿元)', '15,800', '21,000'], - ['AI企业数量(家)', '4,500', '5,200'], - ]}, - {'type': 'h2', 'text': '1.2 关键发现'}, - {'type': 'ul', 'items': ['大模型能力持续跃升', '应用落地加速']}, - {'type': 'h2', 'text': '1.3 专家观点'}, - {'type': 'quote', 'text': '中国AI产业正处于关键转折期。'}, - {'type': 'small', 'text': '—— 某头部AI研究院首席科学家'}, - {'type': 'pagebreak'}, - {'type': 'h1', 'text': '第二章 技术发展分析'}, - {'type': 'p', 'text': '2025年,中国AI技术在多条技术路线上取得突破。'}, - {'type': 'note', 'text': 'MoE架构大模型推理成本降低约60%。', 'level': 'info'}, - {'type': 'h3', 'text': '2.1.1 技术路线对比'}, - {'type': 'table', 'headers': ['路线', '优势', '劣势'], - 'rows': [ - ['Dense Transformer', '通用性强', '推理成本高'], - ['MoE架构', '推理成本低', '训练复杂'], - ]}, - {'type': 'pagebreak'}, - {'type': 'h1', 'text': '结论与展望'}, - {'type': 'conclusion', 'text': '中国AI产业正处于从技术驱动向价值驱动转型的关键时期。'}, - {'type': 'spacer', 'height': 20}, - {'type': 'divider'}, - {'type': 'small', 'text': '免责声明:本报告中的数据和观点仅供参考。'}, + { + "type": "cover", + "title": "2026年中国人工智能行业研究报告", + "subtitle": "AI Industry Research Report 2026", + "date": "2026年5月", + "author": "GBase Research", + }, + {"type": "toc"}, + {"type": "h1", "text": "第一章 行业概述"}, + {"type": "p", "text": "2025年,中国人工智能产业规模突破2.1万亿元,同比增长32.5%。"}, + {"type": "h2", "text": "1.1 核心数据一览"}, + { + "type": "table", + "headers": ["指标", "2024年", "2025年"], + "rows": [ + ["产业规模(亿元)", "15,800", "21,000"], + ["AI企业数量(家)", "4,500", "5,200"], + ], + }, + {"type": "h2", "text": "1.2 关键发现"}, + {"type": "ul", "items": ["大模型能力持续跃升", "应用落地加速"]}, + {"type": "h2", "text": "1.3 专家观点"}, + {"type": "quote", "text": "中国AI产业正处于关键转折期。"}, + {"type": "small", "text": "—— 某头部AI研究院首席科学家"}, + {"type": "pagebreak"}, + {"type": "h1", "text": "第二章 技术发展分析"}, + {"type": "p", "text": "2025年,中国AI技术在多条技术路线上取得突破。"}, + {"type": "note", "text": "MoE架构大模型推理成本降低约60%。", "level": "info"}, + {"type": "h3", "text": "2.1.1 技术路线对比"}, + { + "type": "table", + "headers": ["路线", "优势", "劣势"], + "rows": [ + ["Dense Transformer", "通用性强", "推理成本高"], + ["MoE架构", "推理成本低", "训练复杂"], + ], + }, + {"type": "pagebreak"}, + {"type": "h1", "text": "结论与展望"}, + {"type": "conclusion", "text": "中国AI产业正处于从技术驱动向价值驱动转型的关键时期。"}, + {"type": "spacer", "height": 20}, + {"type": "divider"}, + {"type": "small", "text": "免责声明:本报告中的数据和观点仅供参考。"}, ] result = await gen_pdf( - title='AI行业研究报告', + title="AI行业研究报告", content=content, - subtitle='2026年中国人工智能行业研究报告', - author='GBase Research', - output_path='$HOME/Downloads/AI_Report_2026_v1.pdf', - color_theme='mckinsey', + subtitle="2026年中国人工智能行业研究报告", + author="GBase Research", + output_path="$HOME/Downloads/AI_Report_2026_v1.pdf", + color_theme="mckinsey", show_toc=True, - font_size='normal' + font_size="normal", ) print(result) -if __name__ == '__main__': + +if __name__ == "__main__": asyncio.run(main()) diff --git a/tools/trident_tools.py b/tools/trident_tools.py index 4f5ee71..cc372cd 100644 --- a/tools/trident_tools.py +++ b/tools/trident_tools.py @@ -29,6 +29,7 @@ # ── 工具函数 ────────────────────────────────────────────── + async def _ask(agent_url: str, task: str) -> dict: """通用 /ask 调用""" try: @@ -260,13 +261,15 @@ async def glink_workflow(project: str, steps: list) -> dict: else: result = {"error": f"未知执行者: {executor}"} - results.append({ - "step_id": step.get("id"), - "executor": executor, - "title": title, - "status": "ok" if "error" not in result else "fail", - "result": result, - }) + results.append( + { + "step_id": step.get("id"), + "executor": executor, + "title": title, + "status": "ok" if "error" not in result else "fail", + "result": result, + } + ) return { "project": project, diff --git a/tools/weather.py b/tools/weather.py index 6940006..0cb652c 100644 --- a/tools/weather.py +++ b/tools/weather.py @@ -22,7 +22,8 @@ async def get_weather(city: str) -> dict: """Query current weather for a given city.""" # Sanitize city input to prevent injection into URL import re - safe_city = re.sub(r'[^a-zA-Z\u4e00-\u9fff\s,.-]', '', city.strip())[:100] + + safe_city = re.sub(r"[^a-zA-Z\u4e00-\u9fff\s,.-]", "", city.strip())[:100] if not safe_city: return {"error": "Invalid city name"} url = WEATHER_API.format(city=safe_city) From eeb228c65bcb68c559ea6045e5d5fd926752f3f9 Mon Sep 17 00:00:00 2001 From: garyqlin Date: Tue, 9 Jun 2026 15:21:23 +0800 Subject: [PATCH 3/4] =?UTF-8?q?feat:=20WebChat=20channel=20=E2=80=94=20bro?= =?UTF-8?q?wser-based=20chat=20UI=20with=20file=20upload=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New: - lib/channels/webchat.py: WebSocket-based chat backend - Streaming response (chunk-by-chunk) - File upload: text/PDF/DOCX/XLSX/image parsing - Knowledge retrieval visibility - Auto-reconnect WebSocket - webchat/index.html: Single-file cyberpunk frontend - Left/right split layout (chat + Agent Mind panel) - Dark cyberpunk theme with scanline overlay - Drag & drop file upload - Real-time streaming output - Knowledge & tool chain panels - main.py: --mode web flag + --port option - Separated feishu/web run() functions sharing common infra Usage: python3 main.py --mode web --port 8765 --- lib/channels/webchat.py | 294 +++++++++++++ main.py | 117 ++++- webchat/index.html | 932 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1337 insertions(+), 6 deletions(-) create mode 100644 lib/channels/webchat.py create mode 100644 webchat/index.html diff --git a/lib/channels/webchat.py b/lib/channels/webchat.py new file mode 100644 index 0000000..8292010 --- /dev/null +++ b/lib/channels/webchat.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +""" +webchat.py — GBase WebSocket Chat Channel + +A production-grade WebSocket chat backend for GBase agents. +Supports streaming responses, file uploads, knowledge injection, and tool chain visibility. + +Usage: + channel = WebChatChannel(kernel, storage) + app = channel.create_app() +""" + +import asyncio +import base64 +import contextlib +import json +import logging +import mimetypes +import os +from pathlib import Path +from typing import Any + +from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import HTMLResponse, JSONResponse +from fastapi.staticfiles import StaticFiles + +logger = logging.getLogger("gbase.webchat") + + +class WebChatChannel: + """WebSocket-based chat channel with streaming responses.""" + + def __init__( + self, + kernel: Any, + storage: Any | None = None, + data_dir: str | None = None, + max_upload_mb: int = 10, + ): + self.kernel = kernel + self.storage = storage + self.data_dir = data_dir or os.environ.get("GBASE_DATA_DIR", "data") + self.max_upload_mb = max_upload_mb + self._static_dir = Path(__file__).parent.parent.parent / "webchat" + + def create_app(self, title: str = "GBase Web Chat") -> FastAPI: + app = FastAPI(title=title) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], + ) + + # Serve static files + static_path = self._static_dir + static_path.mkdir(parents=True, exist_ok=True) + app.mount("/static", StaticFiles(directory=str(static_path)), name="static") + + # Serve the main HTML page + @app.get("/", response_class=HTMLResponse) + async def index(): + html_path = static_path / "index.html" + if html_path.exists(): + return HTMLResponse(html_path.read_text(encoding="utf-8")) + return HTMLResponse("

    GBase Web Chat

    Frontend not found.

    ") + + @app.get("/health") + async def health(): + return {"status": "ok", "app": "gbase-webchat"} + + @app.post("/ask") + async def ask_http(request: Request): + """HTTP fallback for non-streaming chat (for testing).""" + body = await request.json() + message = body.get("message", "") + response = await self.kernel.run( + user_message=message, + platform="webchat", + ) + return JSONResponse({"reply": response}) + + # WebSocket chat endpoint + @app.websocket("/ws") + async def websocket_endpoint(ws: WebSocket): + await ws.accept() + logger.info("WebSocket connected") + + try: + while True: + raw = await ws.receive_text() + + # Parse incoming message (could be text or JSON with files) + try: + data = json.loads(raw) + except json.JSONDecodeError: + data = {"type": "text", "content": raw} + + msg_type = data.get("type", "text") + + if msg_type == "text": + user_msg = data.get("content", "").strip() + if not user_msg: + continue + + # Notify streaming start + await ws.send_json({"type": "status", "content": "processing"}) + + # Send knowledge hits if available + try: + if self.storage: + hits = self.storage.search(user_msg) + if hits: + await ws.send_json({ + "type": "knowledge", + "content": hits[:5], + }) + except Exception: + pass + + # Run kernel + try: + response = await self.kernel.run( + user_message=user_msg, + platform="webchat", + ) + + # Stream response character by character for cool effect + # but batch into chunks for practicality + chunk_size = 20 + for i in range(0, len(response), chunk_size): + chunk = response[i:i + chunk_size] + await ws.send_json({ + "type": "chunk", + "content": chunk, + }) + await asyncio.sleep(0.01) # Small delay for streaming feel + + # Send completion marker with metrics + await ws.send_json({ + "type": "done", + "content": response, + "meta": { + "length": len(response), + }, + }) + + except Exception as e: + logger.error("Kernel error: %s", e, exc_info=True) + await ws.send_json({ + "type": "error", + "content": str(e), + }) + + elif msg_type == "file": + # File upload handling + file_name = data.get("name", "unknown") + file_data_b64 = data.get("data", "") + file_mime = data.get("mime", "") + + if not file_data_b64: + await ws.send_json({"type": "error", "content": "No file data"}) + continue + + try: + file_bytes = base64.b64decode(file_data_b64) + file_size_mb = len(file_bytes) / (1024 * 1024) + + if file_size_mb > self.max_upload_mb: + await ws.send_json({ + "type": "error", + "content": f"File too large: {file_size_mb:.1f}MB (max {self.max_upload_mb}MB)", + }) + continue + + # Save to uploads + upload_dir = Path(self.data_dir) / "uploads" + upload_dir.mkdir(parents=True, exist_ok=True) + safe_name = file_name.replace("/", "_").replace("\\", "_") + save_path = upload_dir / safe_name + save_path.write_bytes(file_bytes) + + # Analyze content + result = await self._process_upload(file_name, file_bytes, file_mime) + + await ws.send_json({ + "type": "file_processed", + "content": result, + "meta": {"name": file_name, "size_kb": len(file_bytes) // 1024}, + }) + + except Exception as e: + logger.error("File processing error: %s", e) + await ws.send_json({ + "type": "error", + "content": f"File processing failed: {e}", + }) + + except WebSocketDisconnect: + logger.info("WebSocket disconnected") + except Exception as e: + logger.error("WebSocket error: %s", e, exc_info=True) + with contextlib.suppress(Exception): + await ws.close() + + return app + + async def _process_upload(self, name: str, data: bytes, mime: str) -> dict: + """Process an uploaded file and extract usable content.""" + ext = Path(name).suffix.lower() + result = { + "name": name, + "mime": mime or mimetypes.guess_type(name)[0] or "application/octet-stream", + "size": len(data), + "preview": "", + "content": "", + } + + # Text files + if ext in (".txt", ".md", ".csv", ".json", ".yaml", ".yml", + ".py", ".js", ".ts", ".jsx", ".tsx", ".html", ".css", + ".xml", ".toml", ".ini", ".cfg", ".conf", ".log", + ".sh", ".bash", ".zsh", ".fish"): + try: + text = data.decode("utf-8") + result["content"] = text + result["preview"] = text[:500] + except UnicodeDecodeError: + result["preview"] = "[Binary text file — cannot decode as UTF-8]" + + # PDF + elif ext == ".pdf": + try: + import io + + import PyPDF2 + + pdf_file = io.BytesIO(data) + reader = PyPDF2.PdfReader(pdf_file) + text = "\n".join(page.extract_text() or "" for page in reader.pages) + result["content"] = text + result["preview"] = text[:500] + result["meta"] = {"pages": len(reader.pages)} + except ImportError: + result["preview"] = "[PDF support requires: pip install PyPDF2]" + except Exception as e: + result["preview"] = f"[PDF parse error: {e}]" + + # Word documents + elif ext in (".docx", ".doc"): + try: + import docx + + doc = docx.Document(io.BytesIO(data)) + text = "\n".join(p.text for p in doc.paragraphs) + result["content"] = text + result["preview"] = text[:500] + except ImportError: + result["preview"] = "[DOCX support requires: pip install python-docx]" + except Exception as e: + result["preview"] = f"[DOCX parse error: {e}]" + + # Excel + elif ext in (".xlsx", ".xls"): + try: + import openpyxl + + wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True) + rows = [] + for ws in wb.worksheets[:1]: # First sheet only + for row in ws.iter_rows(values_only=True): + rows.append(" | ".join(str(c) if c is not None else "" for c in row[:10])) + text = "\n".join(rows[:100]) + result["content"] = text + result["preview"] = text[:500] + except ImportError: + result["preview"] = "[Excel support requires: pip install openpyxl]" + except Exception as e: + result["preview"] = f"[Excel parse error: {e}]" + + # Images + elif ext in (".png", ".jpg", ".jpeg", ".gif", ".webp"): + import base64 + + b64 = base64.b64encode(data).decode("utf-8") + result["preview"] = f"data:{result['mime']};base64,{b64}" + result["is_image"] = True + + # Default: binary + else: + result["preview"] = f"[Binary file: {name}, {len(data)} bytes]" + + return result diff --git a/main.py b/main.py index f4d571d..347b7b3 100644 --- a/main.py +++ b/main.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 """ -gbase_8440.py — GBase 飞书 Bot 入口 -接管飞书 Bot (cli_aa843ca68c7a9cba) + 端口 8440, -用 GBase/GBase Kernel 取代 Hermes CLI 的大脑。 +gbase.py — GBase framework entry point -用法: - cd ~/gbase-home && python3 main.py +Usage: + python3 main.py # Feishu bot mode (default) + python3 main.py --mode web # Web chat interface (browser) + python3 main.py --mode web --port 8765 """ import asyncio @@ -413,5 +413,110 @@ def _shutdown_checkpoint(): await server.serve() +async def _run_web(): + """Web chat mode (browser interface).""" + import uvicorn + + os.environ.setdefault("GBASE_DATA_DIR", DATA_DIR) + from openai import AsyncOpenAI + + from lib.experience import ExperienceEngine + from lib.identity import load_identity + from lib.kernel import Kernel + from lib.mirror import Mirror + from lib.storage import Storage + from tools.mirror_tool import set_mirror_instance + + _ensure_dirs() + + # ── Logging ── + import logging.handlers + _file_handler = logging.handlers.TimedRotatingFileHandler( + str(Path(DATA_DIR) / "gbase-web.log"), + when="midnight", interval=1, backupCount=90, encoding="utf-8", + ) + _file_handler.setFormatter(logging.Formatter( + "%(asctime)s [%(name)s] %(levelname)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + )) + _file_handler.suffix = "%Y-%m-%d" + logger.addHandler(_file_handler) + logger.setLevel(logging.INFO) + + # ── Storage ── + storage = Storage(data_dir=DATA_DIR) + storage.setup() + exp = ExperienceEngine(storage) + + # ── Mirror ── + mirror_path = str(Path(DATA_DIR) / "mirror.db") + mirror = Mirror(db_path=mirror_path) + mirror.setup() + set_mirror_instance(mirror) + mstats = mirror.get_stats() + logger.info("鉴面引擎: %d 活跃记忆, %d 已遗忘", mstats["total_active"], mstats["total_forgotten"]) + + # ── LLM client ── + api_key = DEEPSEEK_API_KEY + client = AsyncOpenAI(api_key=api_key, base_url="https://api.deepseek.com") + + # ── Identity + Kernel ── + identity = load_identity( + IDENTITY_NAME, + root_dir=str(Path(__file__).parent / "identities"), + experience_engine=exp, + ) + kernel = Kernel( + client=client, + model=MODEL, + system_prompt=identity.get_system_prompt(), + experience_engine=exp, + mirror_engine=mirror, + data_dir=DATA_DIR, + ) + + from lib.toolkit import auto_scan + from lib.toolkit import set_global as tk_set_global + from tools import register_default + tk_set_global("storage", storage) + tk_set_global("experience", exp) + register_default() + auto_scan("tools") + + # ── WebChat channel ── + from lib.channels.webchat import WebChatChannel + channel = WebChatChannel(kernel=kernel, storage=storage, data_dir=DATA_DIR) + app = channel.create_app(title="GBase Web Chat") + + logger.info("━━━━━━━━━━━━━━━━━━━") + logger.info("GBase Web Chat 启动") + logger.info(f"端口: {WEB_PORT}, 模型: {MODEL}") + logger.info(f"数据目录: {DATA_DIR}") + logger.info(f"访问: http://localhost:{WEB_PORT}") + logger.info("━━━━━━━━━━━━━━━━━━━") + + config = uvicorn.Config(app, host="0.0.0.0", port=WEB_PORT, log_level="info") + server = uvicorn.Server(config) + await server.serve() + + if __name__ == "__main__": - asyncio.run(run()) + import sys + + # Simple CLI arg parsing + args = sys.argv[1:] + MODE = "feishu" + WEB_PORT = int(os.environ.get("GBASE_WEB_PORT", "8765")) + + for i, arg in enumerate(args): + if arg == "--mode" and i + 1 < len(args): + MODE = args[i + 1] + if arg == "--port" and i + 1 < len(args): + WEB_PORT = int(args[i + 1]) + if arg in ("-m", "--mode"): + pass # handled + + if MODE == "web": + asyncio.run(_run_web()) + else: + asyncio.run(run()) diff --git a/webchat/index.html b/webchat/index.html new file mode 100644 index 0000000..b36cbec --- /dev/null +++ b/webchat/index.html @@ -0,0 +1,932 @@ + + + + + +GBASE — Web Interface + + + + +
    +
    DROP FILES HERE
    +
    + +
    + +
    +
    + + v0.4.1 + Web +
    +
    + Connected + deepseek-chat +
    +
    + + +
    + +
    +
    +
    +
    + Welcome to GBASE Web Interface.
    + Send a message or drop a file to get started. +
    +
    +
    + + +
    +
    +
    + + +
    + + +
    +
    Ctrl+Enter to send
    +
    +
    + + +
    +
    🧠 Agent Mind
    +
    + +
    +
    + Knowledge Retrieval +
    +
    +
    Awaiting query...
    +
    +
    + + +
    +
    + Tool Chain +
    +
    +
    No tools called yet.
    +
    +
    + + +
    +
    + Live Metrics +
    +
    +
    + Messages + 0 +
    +
    +
    Tools Called
    +
    0
    +
    +
    +
    Files Uploaded
    +
    0
    +
    +
    +
    + + +
    +
    + About +
    +
    +
    Framework: GBase v0.4.1
    +
    Protocol: WebSocket
    +
    Transport: Not connected
    +
    +
    +
    +
    +
    +
    + + + + From 3582db32313684d7329935c8cee25e9ccb800bfe Mon Sep 17 00:00:00 2001 From: garyqlin Date: Tue, 9 Jun 2026 15:24:25 +0800 Subject: [PATCH 4/4] style: ruff format webchat.py + main.py --- lib/channels/webchat.py | 109 ++++++++++++++++++++++++++-------------- main.py | 18 +++++-- 2 files changed, 85 insertions(+), 42 deletions(-) diff --git a/lib/channels/webchat.py b/lib/channels/webchat.py index 8292010..1e120bf 100644 --- a/lib/channels/webchat.py +++ b/lib/channels/webchat.py @@ -112,10 +112,12 @@ async def websocket_endpoint(ws: WebSocket): if self.storage: hits = self.storage.search(user_msg) if hits: - await ws.send_json({ - "type": "knowledge", - "content": hits[:5], - }) + await ws.send_json( + { + "type": "knowledge", + "content": hits[:5], + } + ) except Exception: pass @@ -130,28 +132,34 @@ async def websocket_endpoint(ws: WebSocket): # but batch into chunks for practicality chunk_size = 20 for i in range(0, len(response), chunk_size): - chunk = response[i:i + chunk_size] - await ws.send_json({ - "type": "chunk", - "content": chunk, - }) + chunk = response[i : i + chunk_size] + await ws.send_json( + { + "type": "chunk", + "content": chunk, + } + ) await asyncio.sleep(0.01) # Small delay for streaming feel # Send completion marker with metrics - await ws.send_json({ - "type": "done", - "content": response, - "meta": { - "length": len(response), - }, - }) + await ws.send_json( + { + "type": "done", + "content": response, + "meta": { + "length": len(response), + }, + } + ) except Exception as e: logger.error("Kernel error: %s", e, exc_info=True) - await ws.send_json({ - "type": "error", - "content": str(e), - }) + await ws.send_json( + { + "type": "error", + "content": str(e), + } + ) elif msg_type == "file": # File upload handling @@ -168,10 +176,12 @@ async def websocket_endpoint(ws: WebSocket): file_size_mb = len(file_bytes) / (1024 * 1024) if file_size_mb > self.max_upload_mb: - await ws.send_json({ - "type": "error", - "content": f"File too large: {file_size_mb:.1f}MB (max {self.max_upload_mb}MB)", - }) + await ws.send_json( + { + "type": "error", + "content": f"File too large: {file_size_mb:.1f}MB (max {self.max_upload_mb}MB)", + } + ) continue # Save to uploads @@ -184,18 +194,22 @@ async def websocket_endpoint(ws: WebSocket): # Analyze content result = await self._process_upload(file_name, file_bytes, file_mime) - await ws.send_json({ - "type": "file_processed", - "content": result, - "meta": {"name": file_name, "size_kb": len(file_bytes) // 1024}, - }) + await ws.send_json( + { + "type": "file_processed", + "content": result, + "meta": {"name": file_name, "size_kb": len(file_bytes) // 1024}, + } + ) except Exception as e: logger.error("File processing error: %s", e) - await ws.send_json({ - "type": "error", - "content": f"File processing failed: {e}", - }) + await ws.send_json( + { + "type": "error", + "content": f"File processing failed: {e}", + } + ) except WebSocketDisconnect: logger.info("WebSocket disconnected") @@ -218,10 +232,31 @@ async def _process_upload(self, name: str, data: bytes, mime: str) -> dict: } # Text files - if ext in (".txt", ".md", ".csv", ".json", ".yaml", ".yml", - ".py", ".js", ".ts", ".jsx", ".tsx", ".html", ".css", - ".xml", ".toml", ".ini", ".cfg", ".conf", ".log", - ".sh", ".bash", ".zsh", ".fish"): + if ext in ( + ".txt", + ".md", + ".csv", + ".json", + ".yaml", + ".yml", + ".py", + ".js", + ".ts", + ".jsx", + ".tsx", + ".html", + ".css", + ".xml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".log", + ".sh", + ".bash", + ".zsh", + ".fish", + ): try: text = data.decode("utf-8") result["content"] = text diff --git a/main.py b/main.py index 347b7b3..793785c 100644 --- a/main.py +++ b/main.py @@ -431,14 +431,20 @@ async def _run_web(): # ── Logging ── import logging.handlers + _file_handler = logging.handlers.TimedRotatingFileHandler( str(Path(DATA_DIR) / "gbase-web.log"), - when="midnight", interval=1, backupCount=90, encoding="utf-8", + when="midnight", + interval=1, + backupCount=90, + encoding="utf-8", + ) + _file_handler.setFormatter( + logging.Formatter( + "%(asctime)s [%(name)s] %(levelname)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) ) - _file_handler.setFormatter(logging.Formatter( - "%(asctime)s [%(name)s] %(levelname)s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - )) _file_handler.suffix = "%Y-%m-%d" logger.addHandler(_file_handler) logger.setLevel(logging.INFO) @@ -478,6 +484,7 @@ async def _run_web(): from lib.toolkit import auto_scan from lib.toolkit import set_global as tk_set_global from tools import register_default + tk_set_global("storage", storage) tk_set_global("experience", exp) register_default() @@ -485,6 +492,7 @@ async def _run_web(): # ── WebChat channel ── from lib.channels.webchat import WebChatChannel + channel = WebChatChannel(kernel=kernel, storage=storage, data_dir=DATA_DIR) app = channel.create_app(title="GBase Web Chat")