Files
Ombre_Brain/bucket_manager.py

782 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ============================================================
# Module: Memory Bucket Manager (bucket_manager.py)
# 模块:记忆桶管理器
#
# CRUD operations, multi-dimensional index search, activation updates
# for memory buckets.
# 记忆桶的增删改查、多维索引搜索、激活更新。
#
# Core design:
# 核心逻辑:
# - Each bucket = one Markdown file (YAML frontmatter + body)
# 每个记忆桶 = 一个 Markdown 文件
# - Storage by type: permanent / dynamic / archive
# 存储按类型分目录
# - Multi-dimensional soft index: domain + valence/arousal + fuzzy text
# 多维软索引:主题域 + 情感坐标 + 文本模糊匹配
# - Search strategy: domain pre-filter → weighted multi-dim ranking
# 搜索策略:主题域预筛 → 多维加权精排
# - Emotion coordinates based on Russell circumplex model:
# 情感坐标基于环形情感模型Russell circumplex
# valence (0~1): 0=negative → 1=positive
# arousal (0~1): 0=calm → 1=excited
#
# Depended on by: server.py, decay_engine.py
# 被谁依赖server.py, decay_engine.py
# ============================================================
import os
import math
import logging
import re
import shutil
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Optional
import frontmatter
import jieba
from rapidfuzz import fuzz
from utils import generate_bucket_id, sanitize_name, safe_path, now_iso
logger = logging.getLogger("ombre_brain.bucket")
class BucketManager:
"""
Memory bucket manager — entry point for all bucket CRUD operations.
Buckets are stored as Markdown files with YAML frontmatter for metadata
and body for content. Natively compatible with Obsidian browsing/editing.
记忆桶管理器 —— 所有桶的 CRUD 操作入口。
桶以 Markdown 文件存储YAML frontmatter 存元数据,正文存内容。
天然兼容 Obsidian 直接浏览和编辑。
"""
def __init__(self, config: dict):
# --- Read storage paths from config / 从配置中读取存储路径 ---
self.base_dir = config["buckets_dir"]
self.permanent_dir = os.path.join(self.base_dir, "permanent")
self.dynamic_dir = os.path.join(self.base_dir, "dynamic")
self.archive_dir = os.path.join(self.base_dir, "archive")
self.fuzzy_threshold = config.get("matching", {}).get("fuzzy_threshold", 50)
self.max_results = config.get("matching", {}).get("max_results", 5)
# --- Wikilink config / 双链配置 ---
wikilink_cfg = config.get("wikilink", {})
self.wikilink_enabled = wikilink_cfg.get("enabled", True)
self.wikilink_use_tags = wikilink_cfg.get("use_tags", False)
self.wikilink_use_domain = wikilink_cfg.get("use_domain", True)
self.wikilink_use_auto_keywords = wikilink_cfg.get("use_auto_keywords", True)
self.wikilink_auto_top_k = wikilink_cfg.get("auto_top_k", 8)
self.wikilink_min_len = wikilink_cfg.get("min_keyword_len", 2)
self.wikilink_exclude_keywords = set(wikilink_cfg.get("exclude_keywords", []))
self.wikilink_stopwords = {
"", "", "", "", "", "", "", "", "", "",
"", "一个", "", "", "", "", "", "", "",
"", "", "", "没有", "", "", "自己", "", "", "",
"我们", "你们", "他们", "然后", "今天", "昨天", "明天", "一下",
"the", "and", "for", "are", "but", "not", "you", "all", "can",
"had", "her", "was", "one", "our", "out", "has", "have", "with",
"this", "that", "from", "they", "been", "said", "will", "each",
}
self.wikilink_stopwords |= {w.lower() for w in self.wikilink_exclude_keywords}
# --- Search scoring weights / 检索权重配置 ---
scoring = config.get("scoring_weights", {})
self.w_topic = scoring.get("topic_relevance", 4.0)
self.w_emotion = scoring.get("emotion_resonance", 2.0)
self.w_time = scoring.get("time_proximity", 1.5)
self.w_importance = scoring.get("importance", 1.0)
# ---------------------------------------------------------
# Create a new bucket
# 创建新桶
# Write content and metadata into a .md file
# 将内容和元数据写入一个 .md 文件
# ---------------------------------------------------------
async def create(
self,
content: str,
tags: list[str] = None,
importance: int = 5,
domain: list[str] = None,
valence: float = 0.5,
arousal: float = 0.3,
bucket_type: str = "dynamic",
name: str = None,
pinned: bool = False,
protected: bool = False,
) -> str:
"""
Create a new memory bucket, return bucket ID.
创建一个新的记忆桶,返回桶 ID。
pinned/protected=True: bucket won't be merged, decayed, or have importance changed.
Importance is locked to 10 for pinned/protected buckets.
pinned/protected 桶不参与合并与衰减importance 强制锁定为 10。
"""
bucket_id = generate_bucket_id()
bucket_name = sanitize_name(name) if name else bucket_id
domain = domain or ["未分类"]
tags = tags or []
linked_content = self._apply_wikilinks(content, tags, domain, bucket_name)
# --- Pinned/protected buckets: lock importance to 10 ---
# --- 钉选/保护桶importance 强制锁定为 10 ---
if pinned or protected:
importance = 10
# --- Build YAML frontmatter metadata / 构建元数据 ---
metadata = {
"id": bucket_id,
"name": bucket_name,
"tags": tags,
"domain": domain,
"valence": max(0.0, min(1.0, valence)),
"arousal": max(0.0, min(1.0, arousal)),
"importance": max(1, min(10, importance)),
"type": bucket_type,
"created": now_iso(),
"last_active": now_iso(),
"activation_count": 1,
}
if pinned:
metadata["pinned"] = True
if protected:
metadata["protected"] = True
# --- Assemble Markdown file (frontmatter + body) ---
# --- 组装 Markdown 文件 ---
post = frontmatter.Post(linked_content, **metadata)
# --- Choose directory by type + primary domain ---
# --- 按类型 + 主题域选择存储目录 ---
type_dir = self.permanent_dir if bucket_type == "permanent" else self.dynamic_dir
primary_domain = sanitize_name(domain[0]) if domain else "未分类"
target_dir = os.path.join(type_dir, primary_domain)
os.makedirs(target_dir, exist_ok=True)
# --- Filename: readable_name_bucketID.md (Obsidian friendly) ---
# --- 文件名可读名称_桶ID.md ---
if bucket_name and bucket_name != bucket_id:
filename = f"{bucket_name}_{bucket_id}.md"
else:
filename = f"{bucket_id}.md"
file_path = safe_path(target_dir, filename)
try:
with open(file_path, "w", encoding="utf-8") as f:
f.write(frontmatter.dumps(post))
except OSError as e:
logger.error(f"Failed to write bucket file / 写入桶文件失败: {file_path}: {e}")
raise
logger.info(
f"Created bucket / 创建记忆桶: {bucket_id} ({bucket_name}) → {primary_domain}/"
+ (" [PINNED]" if pinned else "") + (" [PROTECTED]" if protected else "")
)
return bucket_id
# ---------------------------------------------------------
# Read bucket content
# 读取桶内容
# Returns {"id", "metadata", "content", "path"} or None
# ---------------------------------------------------------
async def get(self, bucket_id: str) -> Optional[dict]:
"""
Read a single bucket by ID.
根据 ID 读取单个桶。
"""
if not bucket_id or not isinstance(bucket_id, str):
return None
file_path = self._find_bucket_file(bucket_id)
if not file_path:
return None
return self._load_bucket(file_path)
# ---------------------------------------------------------
# Update bucket
# 更新桶
# Supports: content, tags, importance, valence, arousal, name, resolved
# ---------------------------------------------------------
async def update(self, bucket_id: str, **kwargs) -> bool:
"""
Update bucket content or metadata fields.
更新桶的内容或元数据字段。
"""
file_path = self._find_bucket_file(bucket_id)
if not file_path:
return False
try:
post = frontmatter.load(file_path)
except Exception as e:
logger.warning(f"Failed to load bucket for update / 加载桶失败: {file_path}: {e}")
return False
# --- Pinned/protected buckets: lock importance to 10, ignore importance changes ---
# --- 钉选/保护桶importance 不可修改,强制保持 10 ---
is_pinned = post.get("pinned", False) or post.get("protected", False)
if is_pinned:
kwargs.pop("importance", None) # silently ignore importance update
# --- Update only fields that were passed in / 只改传入的字段 ---
if "content" in kwargs:
next_tags = kwargs.get("tags", post.get("tags", []))
next_domain = kwargs.get("domain", post.get("domain", []))
next_name = kwargs.get("name", post.get("name", ""))
post.content = self._apply_wikilinks(
kwargs["content"],
next_tags,
next_domain,
next_name,
)
if "tags" in kwargs:
post["tags"] = kwargs["tags"]
if "importance" in kwargs:
post["importance"] = max(1, min(10, int(kwargs["importance"])))
if "domain" in kwargs:
post["domain"] = kwargs["domain"]
if "valence" in kwargs:
post["valence"] = max(0.0, min(1.0, float(kwargs["valence"])))
if "arousal" in kwargs:
post["arousal"] = max(0.0, min(1.0, float(kwargs["arousal"])))
if "name" in kwargs:
post["name"] = sanitize_name(kwargs["name"])
if "resolved" in kwargs:
post["resolved"] = bool(kwargs["resolved"])
if "pinned" in kwargs:
post["pinned"] = bool(kwargs["pinned"])
if kwargs["pinned"]:
post["importance"] = 10 # pinned → lock importance to 10
# --- Auto-refresh activation time / 自动刷新激活时间 ---
post["last_active"] = now_iso()
try:
with open(file_path, "w", encoding="utf-8") as f:
f.write(frontmatter.dumps(post))
except OSError as e:
logger.error(f"Failed to write bucket update / 写入桶更新失败: {file_path}: {e}")
return False
logger.info(f"Updated bucket / 更新记忆桶: {bucket_id}")
return True
# ---------------------------------------------------------
# Wikilink injection
# 自动添加 Obsidian 双链
# ---------------------------------------------------------
def _apply_wikilinks(
self,
content: str,
tags: list[str],
domain: list[str],
name: str,
) -> str:
"""
Auto-inject Obsidian wikilinks, avoiding double-wrapping existing [[...]].
自动添加 Obsidian 双链,避免重复包裹已有 [[...]]。
"""
if not self.wikilink_enabled or not content:
return content
keywords = self._collect_wikilink_keywords(content, tags, domain, name)
if not keywords:
return content
# Split on existing wikilinks to avoid wrapping them again
# 按已有双链切分,避免重复包裹
segments = re.split(r"(\[\[[^\]]+\]\])", content)
pattern = re.compile("|".join(re.escape(kw) for kw in keywords))
for i, segment in enumerate(segments):
if segment.startswith("[[") and segment.endswith("]]"):
continue
updated = pattern.sub(lambda m: f"[[{m.group(0)}]]", segment)
segments[i] = updated
return "".join(segments)
def _collect_wikilink_keywords(
self,
content: str,
tags: list[str],
domain: list[str],
name: str,
) -> list[str]:
"""
Collect candidate keywords from tags/domain/auto-extraction.
汇总候选关键词:可选 tags/domain + 自动提词。
"""
candidates = []
if self.wikilink_use_tags:
candidates.extend(tags or [])
if self.wikilink_use_domain:
candidates.extend(domain or [])
if name:
candidates.append(name)
if self.wikilink_use_auto_keywords:
candidates.extend(self._extract_auto_keywords(content))
return self._normalize_keywords(candidates)
def _normalize_keywords(self, keywords: list[str]) -> list[str]:
"""
Deduplicate and sort by length (longer first to avoid short words
breaking long ones during replacement).
去重并按长度排序,优先替换长词。
"""
if not keywords:
return []
seen = set()
cleaned = []
for keyword in keywords:
if not isinstance(keyword, str):
continue
kw = keyword.strip()
if len(kw) < self.wikilink_min_len:
continue
if kw in self.wikilink_exclude_keywords:
continue
if kw.lower() in self.wikilink_stopwords:
continue
if kw in seen:
continue
seen.add(kw)
cleaned.append(kw)
return sorted(cleaned, key=len, reverse=True)
def _extract_auto_keywords(self, content: str) -> list[str]:
"""
Auto-extract keywords from body text, prioritizing high-frequency words.
从正文自动提词,优先高频词。
"""
if not content:
return []
try:
zh_words = [w.strip() for w in jieba.lcut(content) if w.strip()]
except Exception:
zh_words = []
en_words = re.findall(r"[A-Za-z][A-Za-z0-9_-]{2,20}", content)
# Chinese bigrams / 中文双词组合
zh_bigrams = []
for i in range(len(zh_words) - 1):
left = zh_words[i]
right = zh_words[i + 1]
if len(left) < self.wikilink_min_len or len(right) < self.wikilink_min_len:
continue
if not re.fullmatch(r"[\u4e00-\u9fff]+", left + right):
continue
if len(left + right) > 8:
continue
zh_bigrams.append(left + right)
merged = []
for word in zh_words + zh_bigrams + en_words:
if len(word) < self.wikilink_min_len:
continue
if re.fullmatch(r"\d+", word):
continue
if word.lower() in self.wikilink_stopwords:
continue
merged.append(word)
if not merged:
return []
counter = Counter(merged)
return [w for w, _ in counter.most_common(self.wikilink_auto_top_k)]
# ---------------------------------------------------------
# Delete bucket
# 删除桶
# ---------------------------------------------------------
async def delete(self, bucket_id: str) -> bool:
"""
Delete a memory bucket file.
删除指定的记忆桶文件。
"""
file_path = self._find_bucket_file(bucket_id)
if not file_path:
return False
try:
os.remove(file_path)
except OSError as e:
logger.error(f"Failed to delete bucket file / 删除桶文件失败: {file_path}: {e}")
return False
logger.info(f"Deleted bucket / 删除记忆桶: {bucket_id}")
return True
# ---------------------------------------------------------
# Touch bucket (refresh activation time + increment count)
# 触碰桶(刷新激活时间 + 累加激活次数)
# Called on every recall hit; affects decay score.
# 每次检索命中时调用,影响衰减得分。
# ---------------------------------------------------------
async def touch(self, bucket_id: str) -> None:
"""
Update a bucket's last activation time and count.
更新桶的最后激活时间和激活次数。
"""
file_path = self._find_bucket_file(bucket_id)
if not file_path:
return
try:
post = frontmatter.load(file_path)
post["last_active"] = now_iso()
post["activation_count"] = post.get("activation_count", 0) + 1
with open(file_path, "w", encoding="utf-8") as f:
f.write(frontmatter.dumps(post))
except Exception as e:
logger.warning(f"Failed to touch bucket / 触碰桶失败: {bucket_id}: {e}")
# ---------------------------------------------------------
# Multi-dimensional search (core feature)
# 多维搜索(核心功能)
#
# Strategy: domain pre-filter → weighted multi-dim ranking
# 策略:主题域预筛 → 多维加权精排
#
# Ranking formula:
# total = topic(×w_topic) + emotion(×w_emotion)
# + time(×w_time) + importance(×w_importance)
#
# Per-dimension scores (normalized to 0~1):
# topic = rapidfuzz weighted match (name/tags/domain/body)
# emotion = 1 - Euclidean distance (query v/a vs bucket v/a)
# time = e^(-0.02 × days) (recent memories first)
# importance = importance / 10
# ---------------------------------------------------------
async def search(
self,
query: str,
limit: int = None,
domain_filter: list[str] = None,
query_valence: float = None,
query_arousal: float = None,
) -> list[dict]:
"""
Multi-dimensional indexed search for memory buckets.
多维索引搜索记忆桶。
domain_filter: pre-filter by domain (None = search all)
query_valence/arousal: emotion coordinates for resonance scoring
"""
if not query or not query.strip():
return []
limit = limit or self.max_results
all_buckets = await self.list_all(include_archive=False)
if not all_buckets:
return []
# --- Layer 1: domain pre-filter (fast scope reduction) ---
# --- 第一层:主题域预筛(快速缩小范围)---
if domain_filter:
filter_set = {d.lower() for d in domain_filter}
candidates = [
b for b in all_buckets
if {d.lower() for d in b["metadata"].get("domain", [])} & filter_set
]
# Fall back to full search if pre-filter yields nothing
# 预筛为空则回退全量搜索
if not candidates:
candidates = all_buckets
else:
candidates = all_buckets
# --- Layer 2: weighted multi-dim ranking ---
# --- 第二层:多维加权精排 ---
scored = []
for bucket in candidates:
meta = bucket.get("metadata", {})
try:
# Dim 1: topic relevance (fuzzy text, 0~1)
topic_score = self._calc_topic_score(query, bucket)
# Dim 2: emotion resonance (coordinate distance, 0~1)
emotion_score = self._calc_emotion_score(
query_valence, query_arousal, meta
)
# Dim 3: time proximity (exponential decay, 0~1)
time_score = self._calc_time_score(meta)
# Dim 4: importance (direct normalization)
importance_score = max(1, min(10, int(meta.get("importance", 5)))) / 10.0
# --- Weighted sum / 加权求和 ---
total = (
topic_score * self.w_topic
+ emotion_score * self.w_emotion
+ time_score * self.w_time
+ importance_score * self.w_importance
)
# Normalize to 0~100 for readability
weight_sum = self.w_topic + self.w_emotion + self.w_time + self.w_importance
normalized = (total / weight_sum) * 100 if weight_sum > 0 else 0
# Resolved buckets get ranking penalty (but still reachable by keyword)
# 已解决的桶降权排序(但仍可被关键词激活)
if meta.get("resolved", False):
normalized *= 0.3
if normalized >= self.fuzzy_threshold:
bucket["score"] = round(normalized, 2)
scored.append(bucket)
except Exception as e:
logger.warning(
f"Scoring failed for bucket {bucket.get('id', '?')} / "
f"桶评分失败: {e}"
)
continue
scored.sort(key=lambda x: x["score"], reverse=True)
return scored[:limit]
# ---------------------------------------------------------
# Topic relevance sub-score:
# name(×3) + domain(×2.5) + tags(×2) + body(×1)
# 文本相关性子分:桶名(×3) + 主题域(×2.5) + 标签(×2) + 正文(×1)
# ---------------------------------------------------------
def _calc_topic_score(self, query: str, bucket: dict) -> float:
"""
Calculate text dimension relevance score (0~1).
计算文本维度的相关性得分。
"""
meta = bucket.get("metadata", {})
name_score = fuzz.partial_ratio(query, meta.get("name", "")) * 3
domain_score = (
max(
(fuzz.partial_ratio(query, d) for d in meta.get("domain", [])),
default=0,
)
* 2.5
)
tag_score = (
max(
(fuzz.partial_ratio(query, tag) for tag in meta.get("tags", [])),
default=0,
)
* 2
)
content_score = fuzz.partial_ratio(query, bucket.get("content", "")[:500]) * 1
return (name_score + domain_score + tag_score + content_score) / (100 * 8.5)
# ---------------------------------------------------------
# Emotion resonance sub-score:
# Based on Russell circumplex Euclidean distance
# 情感共鸣子分:基于环形情感模型的欧氏距离
# No emotion in query → neutral 0.5 (doesn't affect ranking)
# ---------------------------------------------------------
def _calc_emotion_score(
self, q_valence: float, q_arousal: float, meta: dict
) -> float:
"""
Calculate emotion resonance score (0~1, closer = higher).
计算情感共鸣度0~1越近越高
"""
if q_valence is None or q_arousal is None:
return 0.5 # No emotion coordinates → neutral / 无情感坐标时给中性分
try:
b_valence = float(meta.get("valence", 0.5))
b_arousal = float(meta.get("arousal", 0.3))
except (ValueError, TypeError):
return 0.5
# Euclidean distance, max sqrt(2) ≈ 1.414
dist = math.sqrt((q_valence - b_valence) ** 2 + (q_arousal - b_arousal) ** 2)
return max(0.0, 1.0 - dist / 1.414)
# ---------------------------------------------------------
# Time proximity sub-score:
# More recent activation → higher score
# 时间亲近子分:距上次激活越近分越高
# ---------------------------------------------------------
def _calc_time_score(self, meta: dict) -> float:
"""
Calculate time proximity score (0~1, more recent = higher).
计算时间亲近度。
"""
last_active_str = meta.get("last_active", meta.get("created", ""))
try:
last_active = datetime.fromisoformat(str(last_active_str))
days = max(0.0, (datetime.now() - last_active).total_seconds() / 86400)
except (ValueError, TypeError):
days = 30
return math.exp(-0.02 * days)
# ---------------------------------------------------------
# List all buckets
# 列出所有桶
# ---------------------------------------------------------
async def list_all(self, include_archive: bool = False) -> list[dict]:
"""
Recursively walk directories (including domain subdirs), list all buckets.
递归遍历目录(含域子目录),列出所有记忆桶。
"""
buckets = []
dirs = [self.permanent_dir, self.dynamic_dir]
if include_archive:
dirs.append(self.archive_dir)
for dir_path in dirs:
if not os.path.exists(dir_path):
continue
for root, _, files in os.walk(dir_path):
for filename in files:
if not filename.endswith(".md"):
continue
file_path = os.path.join(root, filename)
bucket = self._load_bucket(file_path)
if bucket:
buckets.append(bucket)
return buckets
# ---------------------------------------------------------
# Statistics (counts per category + total size)
# 统计信息(各分类桶数量 + 总体积)
# ---------------------------------------------------------
async def get_stats(self) -> dict:
"""
Return memory bucket statistics (including domain subdirs).
返回记忆桶的统计数据。
"""
stats = {
"permanent_count": 0,
"dynamic_count": 0,
"archive_count": 0,
"total_size_kb": 0.0,
"domains": {},
}
for subdir, key in [
(self.permanent_dir, "permanent_count"),
(self.dynamic_dir, "dynamic_count"),
(self.archive_dir, "archive_count"),
]:
if not os.path.exists(subdir):
continue
for root, _, files in os.walk(subdir):
for f in files:
if f.endswith(".md"):
stats[key] += 1
fpath = os.path.join(root, f)
try:
stats["total_size_kb"] += os.path.getsize(fpath) / 1024
except OSError:
pass
# Per-domain counts / 每个域的桶数量
domain_name = os.path.basename(root)
if domain_name != os.path.basename(subdir):
stats["domains"][domain_name] = stats["domains"].get(domain_name, 0) + 1
return stats
# ---------------------------------------------------------
# Archive bucket (move from permanent/dynamic into archive)
# 归档桶(从 permanent/dynamic 移入 archive
# Called by decay engine to simulate "forgetting"
# 由衰减引擎调用,模拟"遗忘"
# ---------------------------------------------------------
async def archive(self, bucket_id: str) -> bool:
"""
Move a bucket into the archive directory (preserving domain subdirs).
将指定桶移入归档目录(保留域子目录结构)。
"""
file_path = self._find_bucket_file(bucket_id)
if not file_path:
return False
try:
# Read once, get domain info and update type / 一次性读取
post = frontmatter.load(file_path)
domain = post.get("domain", ["未分类"])
primary_domain = sanitize_name(domain[0]) if domain else "未分类"
archive_subdir = os.path.join(self.archive_dir, primary_domain)
os.makedirs(archive_subdir, exist_ok=True)
dest = safe_path(archive_subdir, os.path.basename(file_path))
# Update type marker then move file / 更新类型标记后移动文件
post["type"] = "archived"
with open(file_path, "w", encoding="utf-8") as f:
f.write(frontmatter.dumps(post))
# Use shutil.move for cross-filesystem safety
# 使用 shutil.move 保证跨文件系统安全
shutil.move(file_path, str(dest))
except Exception as e:
logger.error(
f"Failed to archive bucket / 归档桶失败: {bucket_id}: {e}"
)
return False
logger.info(f"Archived bucket / 归档记忆桶: {bucket_id} → archive/{primary_domain}/")
return True
# ---------------------------------------------------------
# Internal: find bucket file across all three directories
# 内部:在三个目录中查找桶文件
# ---------------------------------------------------------
def _find_bucket_file(self, bucket_id: str) -> Optional[str]:
"""
Recursively search permanent/dynamic/archive for a bucket file
matching the given ID.
在 permanent/dynamic/archive 中递归查找指定 ID 的桶文件。
"""
if not bucket_id:
return None
for dir_path in [self.permanent_dir, self.dynamic_dir, self.archive_dir]:
if not os.path.exists(dir_path):
continue
for root, _, files in os.walk(dir_path):
for fname in files:
if not fname.endswith(".md"):
continue
# Match by exact ID segment in filename
# 通过文件名中的 ID 片段精确匹配
if bucket_id in fname:
return os.path.join(root, fname)
return None
# ---------------------------------------------------------
# Internal: load bucket data from .md file
# 内部:从 .md 文件加载桶数据
# ---------------------------------------------------------
def _load_bucket(self, file_path: str) -> Optional[dict]:
"""
Parse a Markdown file and return structured bucket data.
解析 Markdown 文件,返回桶的结构化数据。
"""
try:
post = frontmatter.load(file_path)
return {
"id": post.get("id", Path(file_path).stem),
"metadata": dict(post.metadata),
"content": post.content,
"path": file_path,
}
except Exception as e:
logger.warning(
f"Failed to load bucket file / 加载桶文件失败: {file_path}: {e}"
)
return None