# ============================================================
# Module: Dehydration & Auto-tagging (dehydrator.py)
#
# Capabilities:
#   1. Dehydrate: compress memory content into high-density summaries (save tokens)
#   2. Merge: blend old and new content, keeping bucket size constant
#   3. Analyze: auto-analyze content for domain/emotion/tags
#
# Operating modes:
#   - Primary: OpenAI-compatible API (DeepSeek/Ollama/LM Studio/vLLM/Gemini etc.)
#   - Fallback: local keyword extraction when the API is unavailable
#     (dehydration only; merge/analyze/digest require the API)
#
# Depended on by: server.py
# ============================================================

import re
import json
import logging
from collections import Counter

import jieba
from openai import AsyncOpenAI

from utils import count_tokens_approx

logger = logging.getLogger("ombre_brain.dehydrator")

# --- Dehydration prompt: instructs a cheap LLM to compress information ---
DEHYDRATE_PROMPT = """你是一个信息压缩专家。请将以下内容脱水为紧凑摘要。

压缩规则:
1. 提取所有核心事实,去除冗余修饰和重复
2. 保留最新的情绪状态和态度
3. 保留所有待办/未完成事项
4. 关键数字、日期、名称必须保留
5. 目标压缩率 > 70%

输出格式(纯 JSON,无其他内容):
{
  "core_facts": ["事实1", "事实2"],
  "emotion_state": "当前情绪关键词",
  "todos": ["待办1", "待办2"],
  "keywords": ["关键词1", "关键词2"],
  "summary": "50字以内的核心总结"
}"""

# --- Diary digest prompt: split daily notes into independent memory entries ---
DIGEST_PROMPT = """你是一个日记整理专家。用户会发送一段包含今天各种事情的文本(可能很杂乱),请你将其拆分成多个独立的记忆条目。

整理规则:
1. 每个条目应该是一个独立的主题/事件(不要混在一起)
2. 为每个条目自动分析元数据
3. 去除无意义的口水话和重复信息,保留核心内容
4. 同一主题的零散信息应合并为一个条目
5. 如果有待办事项,单独提取为一个条目
6. 单个条目内容不少于50字,过短的零碎信息合并到最相关的条目中
7. 总条目数控制在 2~6 个,避免过度碎片化
8. 在 content 中对人名、地名、专有名词用 [[双链]] 标记(如 [[婷易]]、[[Obsidian]]),普通词汇不要加

输出格式(纯 JSON 数组,无其他内容):
[
  {
    "name": "条目标题(10字以内)",
    "content": "整理后的内容",
    "domain": ["主题域1"],
    "valence": 0.7,
    "arousal": 0.4,
    "tags": ["核心词1", "核心词2", "扩展词1", "扩展词2"],
    "importance": 5
  }
]

tags 生成规则:先从原文精准提取 3~5 个核心词,再引申扩展 5~8 个语义相关词(近义词、上位词、关联场景词),合并为一个数组。

主题域可选(选最精确的 1~2 个,只选真正相关的):
日常: ["饮食", "穿搭", "出行", "居家", "购物"]
人际: ["家庭", "恋爱", "友谊", "社交"]
成长: ["工作", "学习", "考试", "求职"]
身心: ["健康", "心理", "睡眠", "运动"]
兴趣: ["游戏", "影视", "音乐", "阅读", "创作", "手工"]
数字: ["编程", "AI", "硬件", "网络"]
事务: ["财务", "计划", "待办"]
内心: ["情绪", "回忆", "梦境", "自省"]

importance: 1-10,根据内容重要程度判断
valence: 0~1(0=消极, 0.5=中性, 1=积极)
arousal: 0~1(0=平静, 0.5=普通, 1=激动)"""

# --- Merge prompt: instruct the LLM to blend old and new memories ---
MERGE_PROMPT = """你是一个信息合并专家。请将旧记忆与新内容合并为一份统一的简洁记录。

合并规则:
1. 新内容与旧记忆冲突时,以新内容为准
2. 去除重复信息
3. 保留所有重要事实
4. 总长度尽量不超过旧记忆的 120%
5. 对出现的人名、地名、专有名词用 [[双链]] 标记(如 [[婷易]]、[[Obsidian]]),普通词汇不要加

直接输出合并后的文本,不要加额外说明。"""

# --- Auto-tagging prompt: analyze content for domain and emotion coords ---
ANALYZE_PROMPT = """你是一个内容分析器。请分析以下文本,输出结构化的元数据。

分析规则:
1. domain(主题域):选最精确的 1~2 个,只选真正相关的
日常: ["饮食", "穿搭", "出行", "居家", "购物"]
人际: ["家庭", "恋爱", "友谊", "社交"]
成长: ["工作", "学习", "考试", "求职"]
身心: ["健康", "心理", "睡眠", "运动"]
兴趣: ["游戏", "影视", "音乐", "阅读", "创作", "手工"]
数字: ["编程", "AI", "硬件", "网络"]
事务: ["财务", "计划", "待办"]
内心: ["情绪", "回忆", "梦境", "自省"]
2. valence(情感效价):0.0~1.0,0=极度消极 → 0.5=中性 → 1.0=极度积极
3. arousal(情感唤醒度):0.0~1.0,0=非常平静 → 0.5=普通 → 1.0=非常激动
4. tags(关键词标签):分两步生成,合并为一个数组:
第一步—精准提取:从原文抽取 3~5 个真正的核心词,不泛化、不遗漏
第二步—引申扩展:自动补充 8~10 个与当前场景语义相关的词,包括近义词、上位词、关联场景词、用户可能用不同措辞搜索的词
两步合并为一个 tags 数组,总计 10~15 个
5. suggested_name(建议桶名):10字以内的简短标题
6. 在 tags 和 suggested_name 中不要使用 [[]] 双链标记

输出格式(纯 JSON,无其他内容):
{
  "domain": ["主题域1", "主题域2"],
  "valence": 0.7,
  "arousal": 0.4,
  "tags": ["核心词1", "核心词2", "扩展词1", "扩展词2", "..."],
  "suggested_name": "简短标题"
}"""

# --- Constants hoisted out of per-call hot paths ---
# Stopwords for local keyword extraction (Chinese + English).
_STOPWORDS = frozenset({
    "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都",
    "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着",
    "没有", "看", "好", "自己", "这", "他", "她",
    "the", "and", "for", "are", "but", "not", "you", "all", "can",
    "had", "her", "was", "one", "our", "out", "has", "have", "with",
    "this", "that", "from", "they", "been", "said", "will", "each",
})
# Sentence delimiters (Chinese + ASCII punctuation and newlines).
_SENTENCE_SPLIT_RE = re.compile(r"[。!?\n.!?]+")
# ASCII-only "pure number" tokens to drop from keywords.
_PURE_NUMBER_RE = re.compile(r"[0-9]+")
# Runs of latin letters treated as English words.
_ENGLISH_WORD_RE = re.compile(r"[a-zA-Z]{3,}")
# Obsidian-style [[wikilink]] markers, stripped before context injection.
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


class Dehydrator:
    """
    Data dehydrator + content analyzer.

    Three capabilities: dehydration / merge / auto-tagging (domain + emotion).
    Dehydration currently runs locally (keyword + position scoring); merge,
    analyze and digest require a configured OpenAI-compatible API and raise
    RuntimeError when it is missing or fails.
    """

    def __init__(self, config: dict):
        """Read the ``dehydration`` section of *config* and set up the client.

        Supports any OpenAI-format API: DeepSeek / Ollama / LM Studio /
        vLLM / Gemini etc. — the user only needs to set ``base_url`` in
        config.yaml.
        """
        dehy_cfg = config.get("dehydration", {})
        self.api_key: str = dehy_cfg.get("api_key", "")
        self.model: str = dehy_cfg.get("model", "deepseek-chat")
        self.base_url: str = dehy_cfg.get("base_url", "https://api.deepseek.com/v1")
        self.max_tokens: int = dehy_cfg.get("max_tokens", 1024)
        self.temperature: float = dehy_cfg.get("temperature", 0.1)

        # An API is considered available iff a key is configured.
        self.api_available: bool = bool(self.api_key)

        if self.api_available:
            self.client = AsyncOpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
                timeout=60.0,
            )
        else:
            self.client = None

    # ---------------------------------------------------------
    # Internal guards / shared helpers
    # ---------------------------------------------------------

    def _require_api(self) -> None:
        """Raise RuntimeError when no API is configured.

        merge/analyze/digest have no local fallback, so they all share this
        guard (error message kept identical for callers that match on it).
        """
        if not self.api_available:
            raise RuntimeError("脱水 API 不可用,请检查 config.yaml 中的 dehydration 配置")

    @staticmethod
    def _strip_code_fence(raw: str) -> str:
        """Strip a surrounding markdown ``` fence from an LLM reply, if any."""
        cleaned = raw.strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line, then everything after the last fence.
            cleaned = cleaned.split("\n", 1)[-1].rsplit("```", 1)[0]
        return cleaned

    @staticmethod
    def _as_str_list(value, fallback: list) -> list:
        """Coerce an API-provided field into a list.

        Bugfix: the model occasionally returns a bare string where an array
        was requested; slicing that string would yield single characters.
        """
        if isinstance(value, list):
            return value
        if isinstance(value, str) and value:
            return [value]
        return fallback

    # ---------------------------------------------------------
    # Dehydrate: compress raw content into a concise summary
    # ---------------------------------------------------------

    async def dehydrate(self, content: str, metadata: dict | None = None) -> str:
        """
        Dehydrate/compress memory content.

        Returns a formatted summary string ready for Claude context injection.
        """
        if not content or not content.strip():
            return "(空记忆 / empty memory)"

        # Content already short enough — no compression needed.
        if count_tokens_approx(content) < 100:
            return self._format_output(content, metadata)

        # Local compression is always used here (as originally requested);
        # _api_dehydrate is retained for a future API-backed path.
        result = self._local_dehydrate(content)
        return self._format_output(result, metadata)

    # ---------------------------------------------------------
    # Merge: blend new content into an existing bucket
    # ---------------------------------------------------------

    async def merge(self, old_content: str, new_content: str) -> str:
        """
        Merge new content with old memory, preventing infinite bucket growth.

        Raises:
            RuntimeError: when the API is unconfigured, fails, or returns empty.
        """
        if not old_content and not new_content:
            return ""
        if not old_content:
            return new_content or ""
        if not new_content:
            return old_content

        self._require_api()
        try:
            result = await self._api_merge(old_content, new_content)
            if result:
                return result
            raise RuntimeError("API 合并返回空结果")
        except RuntimeError:
            raise
        except Exception as e:
            raise RuntimeError(f"API 合并失败,请检查 API 连接: {e}") from e

    # ---------------------------------------------------------
    # API call: dehydration
    # ---------------------------------------------------------

    async def _api_dehydrate(self, content: str) -> str:
        """Call the LLM API for intelligent dehydration.

        NOTE(review): currently unreferenced — dehydrate() always goes local.
        Input is truncated to 3000 chars to bound cost.
        """
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": DEHYDRATE_PROMPT},
                {"role": "user", "content": content[:3000]},
            ],
            max_tokens=self.max_tokens,
            temperature=self.temperature,
        )
        if not response.choices:
            return ""
        return response.choices[0].message.content or ""

    # ---------------------------------------------------------
    # API call: merge
    # ---------------------------------------------------------

    async def _api_merge(self, old_content: str, new_content: str) -> str:
        """Call the LLM API for intelligent merge (inputs truncated to 2000 chars each)."""
        user_msg = f"旧记忆:\n{old_content[:2000]}\n\n新内容:\n{new_content[:2000]}"
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": MERGE_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            max_tokens=self.max_tokens,
            temperature=self.temperature,
        )
        if not response.choices:
            return ""
        return response.choices[0].message.content or ""

    # ---------------------------------------------------------
    # Local dehydration (fallback when the API is unavailable)
    # Keyword frequency + sentence position weighting.
    # ---------------------------------------------------------

    def _local_dehydrate(self, content: str) -> str:
        """Local keyword extraction + position-weighted simple compression."""
        sentences = [
            s.strip()
            for s in _SENTENCE_SPLIT_RE.split(content)
            if len(s.strip()) > 5
        ]
        if not sentences:
            return content[:200]

        keywords = self._extract_keywords(content)

        # Score sentences: the opening/closing sentences get a position bonus,
        # and each keyword hit boosts the score.
        scored: list[tuple[float, int, str]] = []
        last_zone = len(sentences) - 3
        for i, sent in enumerate(sentences):
            position_weight = 1.5 if i < 3 else (1.2 if i > last_zone else 1.0)
            keyword_hits = sum(1 for kw in keywords if kw in sent)
            scored.append((position_weight * (1 + keyword_hits), i, sent))

        # Take the top-8 by score, then restore DOCUMENT order so the summary
        # reads coherently (bugfix: previously joined in score order).
        top = sorted(scored, key=lambda t: t[0], reverse=True)[:8]
        top.sort(key=lambda t: t[1])
        summary = "。".join(sent for _, _, sent in top)

        if len(summary) > 1000:
            summary = summary[:1000] + "…"
        return summary

    # ---------------------------------------------------------
    # Keyword extraction
    # Chinese + English tokenization → stopword filter → frequency sort.
    # ---------------------------------------------------------

    def _extract_keywords(self, text: str) -> list[str]:
        """Extract up to 15 high-frequency keywords using jieba (CN + EN mixed)."""
        try:
            words = jieba.lcut(text)
        except Exception:
            # jieba failure (e.g. dict load problem) — fall back to English-only.
            words = []

        words += _ENGLISH_WORD_RE.findall(text.lower())

        filtered = [
            w for w in words
            if w not in _STOPWORDS
            and len(w.strip()) > 1
            and not _PURE_NUMBER_RE.fullmatch(w)
        ]
        return [word for word, _ in Counter(filtered).most_common(15)]

    # ---------------------------------------------------------
    # Output formatting
    # Wraps the dehydrated result with bucket name, tags, emotion coords.
    # ---------------------------------------------------------

    def _format_output(self, content: str, metadata: dict | None = None) -> str:
        """Format a dehydrated result into context-injectable text."""
        header = ""
        if metadata and isinstance(metadata, dict):
            name = metadata.get("name", "未命名")
            domains = ", ".join(metadata.get("domain", []))
            try:
                valence = float(metadata.get("valence", 0.5))
                arousal = float(metadata.get("arousal", 0.3))
            except (ValueError, TypeError):
                valence, arousal = 0.5, 0.3

            header = f"📌 记忆桶: {name}"
            if domains:
                header += f" [主题:{domains}]"
            header += f" [情感:V{valence:.1f}/A{arousal:.1f}]"

            # Show the model's own perspective if recorded (valence drift).
            model_v = metadata.get("model_valence")
            if model_v is not None:
                try:
                    header += f" [我的视角:V{float(model_v):.1f}]"
                except (ValueError, TypeError):
                    pass
            if metadata.get("digested"):
                header += " [已消化]"
            header += "\n"

        # Strip [[wikilink]] markers before injecting into context.
        content = _WIKILINK_RE.sub(r"\1", content)
        return f"{header}{content}"

    # ---------------------------------------------------------
    # Auto-tagging: analyze content for domain + emotion + tags
    # Called by server.py when storing new memories.
    # ---------------------------------------------------------

    async def analyze(self, content: str) -> dict:
        """
        Analyze content and return structured metadata.

        Returns:
            {"domain", "valence", "arousal", "tags", "suggested_name"}

        Raises:
            RuntimeError: when the API is unconfigured, fails, or returns empty.
        """
        if not content or not content.strip():
            return self._default_analysis()

        self._require_api()
        try:
            result = await self._api_analyze(content)
            if result:
                return result
            raise RuntimeError("API 打标返回空结果")
        except RuntimeError:
            raise
        except Exception as e:
            raise RuntimeError(f"API 打标失败,请检查 API 连接: {e}") from e

    # ---------------------------------------------------------
    # API call: auto-tagging
    # ---------------------------------------------------------

    async def _api_analyze(self, content: str) -> dict:
        """Call the LLM API for content analysis / tagging (input truncated to 2000 chars)."""
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": ANALYZE_PROMPT},
                {"role": "user", "content": content[:2000]},
            ],
            max_tokens=256,
            temperature=0.1,
        )
        if not response.choices:
            return self._default_analysis()
        raw = response.choices[0].message.content or ""
        if not raw.strip():
            return self._default_analysis()
        return self._parse_analysis(raw)

    # ---------------------------------------------------------
    # Parse the API JSON response with safety checks:
    # valence/arousal clamped to 0~1, domain/tags validated as lists.
    # ---------------------------------------------------------

    def _parse_analysis(self, raw: str) -> dict:
        """Parse and validate an API tagging result; fall back to defaults on garbage."""
        try:
            result = json.loads(self._strip_code_fence(raw))
        except (json.JSONDecodeError, IndexError, ValueError):
            logger.warning(f"API tagging JSON parse failed / JSON 解析失败: {raw[:200]}")
            return self._default_analysis()

        if not isinstance(result, dict):
            return self._default_analysis()

        try:
            valence = max(0.0, min(1.0, float(result.get("valence", 0.5))))
            arousal = max(0.0, min(1.0, float(result.get("arousal", 0.3))))
        except (ValueError, TypeError):
            valence, arousal = 0.5, 0.3

        return {
            "domain": self._as_str_list(result.get("domain"), ["未分类"])[:3],
            "valence": valence,
            "arousal": arousal,
            "tags": self._as_str_list(result.get("tags"), [])[:15],
            "suggested_name": str(result.get("suggested_name", ""))[:20],
        }

    # ---------------------------------------------------------
    # Default analysis result (empty content or total failure)
    # ---------------------------------------------------------

    def _default_analysis(self) -> dict:
        """Return the default, neutral analysis result."""
        return {
            "domain": ["未分类"],
            "valence": 0.5,
            "arousal": 0.3,
            "tags": [],
            "suggested_name": "",
        }

    # ---------------------------------------------------------
    # Diary digest: split daily notes into independent memory entries.
    # For the "grow" tool — "dump a day's content and it gets organized".
    # ---------------------------------------------------------

    async def digest(self, content: str) -> list[dict]:
        """
        Split a large chunk of daily content into independent memory entries.

        Returns:
            [{"name", "content", "domain", "valence", "arousal", "tags",
              "importance"}, ...]

        Raises:
            RuntimeError: when the API is unconfigured, fails, or returns empty.
        """
        if not content or not content.strip():
            return []

        self._require_api()
        try:
            result = await self._api_digest(content)
            if result:
                return result
            raise RuntimeError("API 日记整理返回空结果")
        except RuntimeError:
            raise
        except Exception as e:
            raise RuntimeError(f"API 日记整理失败,请检查 API 连接: {e}") from e

    # ---------------------------------------------------------
    # API call: diary digest
    # ---------------------------------------------------------

    async def _api_digest(self, content: str) -> list[dict]:
        """Call the LLM API for diary organization (input truncated to 5000 chars).

        temperature=0.0 for maximally deterministic entry splitting.
        """
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": DIGEST_PROMPT},
                {"role": "user", "content": content[:5000]},
            ],
            max_tokens=2048,
            temperature=0.0,
        )
        if not response.choices:
            return []
        raw = response.choices[0].message.content or ""
        if not raw.strip():
            return []
        return self._parse_digest(raw)

    # ---------------------------------------------------------
    # Parse the diary digest result with safety checks.
    # ---------------------------------------------------------

    def _parse_digest(self, raw: str) -> list[dict]:
        """Parse and validate an API diary digest result; drop malformed items."""
        try:
            items = json.loads(self._strip_code_fence(raw))
        except (json.JSONDecodeError, IndexError, ValueError):
            logger.warning(f"Diary digest JSON parse failed / JSON 解析失败: {raw[:200]}")
            return []

        if not isinstance(items, list):
            return []

        validated = []
        for item in items:
            # Entries without content are useless — skip them.
            if not isinstance(item, dict) or not item.get("content"):
                continue
            try:
                importance = max(1, min(10, int(item.get("importance", 5))))
            except (ValueError, TypeError):
                importance = 5
            try:
                valence = max(0.0, min(1.0, float(item.get("valence", 0.5))))
                arousal = max(0.0, min(1.0, float(item.get("arousal", 0.3))))
            except (ValueError, TypeError):
                valence, arousal = 0.5, 0.3

            validated.append({
                "name": str(item.get("name", ""))[:20],
                "content": str(item.get("content", "")),
                "domain": self._as_str_list(item.get("domain"), ["未分类"])[:3],
                "valence": valence,
                "arousal": arousal,
                "tags": self._as_str_list(item.get("tags"), [])[:15],
                "importance": importance,
            })
        return validated