docs: update README/INTERNALS for import feature, harden .gitignore

This commit is contained in:
P0luz
2026-04-19 12:09:53 +08:00
parent a09fbfe13a
commit 821546d5de
27 changed files with 5365 additions and 479 deletions

View File

@@ -60,6 +60,7 @@ class BucketManager:
self.permanent_dir = os.path.join(self.base_dir, "permanent")
self.dynamic_dir = os.path.join(self.base_dir, "dynamic")
self.archive_dir = os.path.join(self.base_dir, "archive")
self.feel_dir = os.path.join(self.base_dir, "feel")
self.fuzzy_threshold = config.get("matching", {}).get("fuzzy_threshold", 50)
self.max_results = config.get("matching", {}).get("max_results", 5)
@@ -122,7 +123,7 @@ class BucketManager:
bucket_name = sanitize_name(name) if name else bucket_id
domain = domain or ["未分类"]
tags = tags or []
linked_content = self._apply_wikilinks(content, tags, domain, bucket_name)
linked_content = content # wikilink injection disabled; LLM adds [[]] via prompt
# --- Pinned/protected buckets: lock importance to 10 ---
# --- 钉选/保护桶:importance 强制锁定为 10 ---
@@ -154,8 +155,18 @@ class BucketManager:
# --- Choose directory by type + primary domain ---
# --- 按类型 + 主题域选择存储目录 ---
type_dir = self.permanent_dir if bucket_type == "permanent" else self.dynamic_dir
primary_domain = sanitize_name(domain[0]) if domain else "未分类"
if bucket_type == "permanent" or pinned:
type_dir = self.permanent_dir
if pinned and bucket_type != "permanent":
metadata["type"] = "permanent"
elif bucket_type == "feel":
type_dir = self.feel_dir
else:
type_dir = self.dynamic_dir
if bucket_type == "feel":
primary_domain = "沉淀物" # feel subfolder name
else:
primary_domain = sanitize_name(domain[0]) if domain else "未分类"
target_dir = os.path.join(type_dir, primary_domain)
os.makedirs(target_dir, exist_ok=True)
@@ -197,6 +208,25 @@ class BucketManager:
return None
return self._load_bucket(file_path)
# ---------------------------------------------------------
# Move bucket between directories
# 在目录间移动桶文件
# ---------------------------------------------------------
def _move_bucket(self, file_path: str, target_type_dir: str, domain: list[str] | None = None) -> str:
    """
    Move a bucket file into a new type directory, preserving the domain
    subfolder (first domain entry, sanitized).

    Args:
        file_path: Current path of the bucket ``.md`` file.
        target_type_dir: Destination type directory (e.g. permanent/dynamic/archive/feel).
        domain: Optional domain list; the first entry names the subfolder.
            Falls back to "未分类" (uncategorized) when absent.

    Returns:
        The new file path (unchanged if the file was already in place).
    """
    primary_domain = sanitize_name(domain[0]) if domain else "未分类"
    target_dir = os.path.join(target_type_dir, primary_domain)
    os.makedirs(target_dir, exist_ok=True)
    filename = os.path.basename(file_path)
    # safe_path guards against path traversal from a crafted filename.
    new_path = safe_path(target_dir, filename)
    # Only rename when source and destination differ after normalization.
    if os.path.normpath(file_path) != os.path.normpath(new_path):
        os.rename(file_path, new_path)
        # NOTE(review): "(unknown)" looks like a lost placeholder (probably
        # the filename) from an earlier edit — confirm the intended message.
        logger.info(f"Moved bucket / 移动记忆桶: (unknown){target_dir}/")
    return new_path
# ---------------------------------------------------------
# Update bucket
# 更新桶
@@ -225,15 +255,7 @@ class BucketManager:
# --- Update only fields that were passed in / 只改传入的字段 ---
if "content" in kwargs:
next_tags = kwargs.get("tags", post.get("tags", []))
next_domain = kwargs.get("domain", post.get("domain", []))
next_name = kwargs.get("name", post.get("name", ""))
post.content = self._apply_wikilinks(
kwargs["content"],
next_tags,
next_domain,
next_name,
)
post.content = kwargs["content"] # wikilink injection disabled; LLM adds [[]] via prompt
if "tags" in kwargs:
post["tags"] = kwargs["tags"]
if "importance" in kwargs:
@@ -252,6 +274,10 @@ class BucketManager:
post["pinned"] = bool(kwargs["pinned"])
if kwargs["pinned"]:
post["importance"] = 10 # pinned → lock importance to 10
if "digested" in kwargs:
post["digested"] = bool(kwargs["digested"])
if "model_valence" in kwargs:
post["model_valence"] = max(0.0, min(1.0, float(kwargs["model_valence"])))
# --- Auto-refresh activation time / 自动刷新激活时间 ---
post["last_active"] = now_iso()
@@ -263,136 +289,33 @@ class BucketManager:
logger.error(f"Failed to write bucket update / 写入桶更新失败: {file_path}: {e}")
return False
# --- Auto-move: pinned → permanent/, resolved → archive/ ---
# --- 自动移动:钉选 → permanent/,已解决 → archive/ ---
domain = post.get("domain", ["未分类"])
if kwargs.get("pinned") and post.get("type") != "permanent":
post["type"] = "permanent"
with open(file_path, "w", encoding="utf-8") as f:
f.write(frontmatter.dumps(post))
self._move_bucket(file_path, self.permanent_dir, domain)
elif kwargs.get("resolved") and post.get("type") not in ("permanent", "feel"):
post["type"] = "archived"
with open(file_path, "w", encoding="utf-8") as f:
f.write(frontmatter.dumps(post))
self._move_bucket(file_path, self.archive_dir, domain)
logger.info(f"Updated bucket / 更新记忆桶: {bucket_id}")
return True
# ---------------------------------------------------------
# Wikilink injection
# 自动添加 Obsidian 双链
# Wikilink injection — DISABLED
# 自动添加 Obsidian 双链 — 已禁用
# Now handled by LLM prompts (Gemini adds [[]] for proper nouns)
# 现在由 LLM prompt 处理:Gemini 对人名/地名/专有名词加 [[]]
# ---------------------------------------------------------
def _apply_wikilinks(
self,
content: str,
tags: list[str],
domain: list[str],
name: str,
) -> str:
"""
Auto-inject Obsidian wikilinks, avoiding double-wrapping existing [[...]].
自动添加 Obsidian 双链,避免重复包裹已有 [[...]]。
"""
if not self.wikilink_enabled or not content:
return content
keywords = self._collect_wikilink_keywords(content, tags, domain, name)
if not keywords:
return content
# Split on existing wikilinks to avoid wrapping them again
# 按已有双链切分,避免重复包裹
segments = re.split(r"(\[\[[^\]]+\]\])", content)
pattern = re.compile("|".join(re.escape(kw) for kw in keywords))
for i, segment in enumerate(segments):
if segment.startswith("[[") and segment.endswith("]]"):
continue
updated = pattern.sub(lambda m: f"[[{m.group(0)}]]", segment)
segments[i] = updated
return "".join(segments)
def _collect_wikilink_keywords(
self,
content: str,
tags: list[str],
domain: list[str],
name: str,
) -> list[str]:
"""
Collect candidate keywords from tags/domain/auto-extraction.
汇总候选关键词:可选 tags/domain + 自动提词。
"""
candidates = []
if self.wikilink_use_tags:
candidates.extend(tags or [])
if self.wikilink_use_domain:
candidates.extend(domain or [])
if name:
candidates.append(name)
if self.wikilink_use_auto_keywords:
candidates.extend(self._extract_auto_keywords(content))
return self._normalize_keywords(candidates)
def _normalize_keywords(self, keywords: list[str]) -> list[str]:
"""
Deduplicate and sort by length (longer first to avoid short words
breaking long ones during replacement).
去重并按长度排序,优先替换长词。
"""
if not keywords:
return []
seen = set()
cleaned = []
for keyword in keywords:
if not isinstance(keyword, str):
continue
kw = keyword.strip()
if len(kw) < self.wikilink_min_len:
continue
if kw in self.wikilink_exclude_keywords:
continue
if kw.lower() in self.wikilink_stopwords:
continue
if kw in seen:
continue
seen.add(kw)
cleaned.append(kw)
return sorted(cleaned, key=len, reverse=True)
def _extract_auto_keywords(self, content: str) -> list[str]:
"""
Auto-extract keywords from body text, prioritizing high-frequency words.
从正文自动提词,优先高频词。
"""
if not content:
return []
try:
zh_words = [w.strip() for w in jieba.lcut(content) if w.strip()]
except Exception:
zh_words = []
en_words = re.findall(r"[A-Za-z][A-Za-z0-9_-]{2,20}", content)
# Chinese bigrams / 中文双词组合
zh_bigrams = []
for i in range(len(zh_words) - 1):
left = zh_words[i]
right = zh_words[i + 1]
if len(left) < self.wikilink_min_len or len(right) < self.wikilink_min_len:
continue
if not re.fullmatch(r"[\u4e00-\u9fff]+", left + right):
continue
if len(left + right) > 8:
continue
zh_bigrams.append(left + right)
merged = []
for word in zh_words + zh_bigrams + en_words:
if len(word) < self.wikilink_min_len:
continue
if re.fullmatch(r"\d+", word):
continue
if word.lower() in self.wikilink_stopwords:
continue
merged.append(word)
if not merged:
return []
counter = Counter(merged)
return [w for w, _ in counter.most_common(self.wikilink_auto_top_k)]
# def _apply_wikilinks(self, content, tags, domain, name): ...
# def _collect_wikilink_keywords(self, content, tags, domain, name): ...
# def _normalize_keywords(self, keywords): ...
# def _extract_auto_keywords(self, content): ...
# ---------------------------------------------------------
# Delete bucket
@@ -425,7 +348,9 @@ class BucketManager:
async def touch(self, bucket_id: str) -> None:
"""
Update a bucket's last activation time and count.
Also triggers time ripple: nearby memories get a slight activation boost.
更新桶的最后激活时间和激活次数。
同时触发时间涟漪:时间上相邻的记忆轻微唤醒。
"""
file_path = self._find_bucket_file(bucket_id)
if not file_path:
@@ -438,9 +363,60 @@ class BucketManager:
with open(file_path, "w", encoding="utf-8") as f:
f.write(frontmatter.dumps(post))
# --- Time ripple: boost nearby memories within ±48h ---
# --- 时间涟漪±48小时内的记忆轻微唤醒 ---
current_time = datetime.fromisoformat(str(post.get("created", post.get("last_active", ""))))
await self._time_ripple(bucket_id, current_time)
except Exception as e:
logger.warning(f"Failed to touch bucket / 触碰桶失败: {bucket_id}: {e}")
async def _time_ripple(self, source_id: str, reference_time: datetime, hours: float = 48.0) -> None:
    """
    Slightly boost activation_count (+0.3) of buckets created/activated
    near the reference time, without touching last_active — so rippled
    buckets cannot recursively wake others.

    At most 5 buckets are rippled per touch to bound file I/O.

    Args:
        source_id: Bucket that was touched; excluded from the ripple.
        reference_time: Center of the ±`hours` window.
        hours: Half-width of the ripple window in hours (default 48).
    """
    try:
        all_buckets = await self.list_all(include_archive=False)
    except Exception:
        # Best-effort feature: if listing fails, skip the ripple entirely.
        return
    rippled = 0
    max_ripple = 5
    for bucket in all_buckets:
        if rippled >= max_ripple:
            break
        if bucket["id"] == source_id:
            continue
        meta = bucket.get("metadata", {})
        # Pinned/protected/permanent/feel buckets are never rippled.
        if meta.get("pinned") or meta.get("protected") or meta.get("type") in ("permanent", "feel"):
            continue
        created_str = meta.get("created", meta.get("last_active", ""))
        try:
            created = datetime.fromisoformat(str(created_str))
            delta_hours = abs((reference_time - created).total_seconds()) / 3600
        except (ValueError, TypeError):
            # Unparseable or missing timestamp — skip this bucket.
            continue
        if delta_hours <= hours:
            # Boost activation_count by 0.3 (fractional); last_active untouched.
            file_path = self._find_bucket_file(bucket["id"])
            if not file_path:
                continue
            try:
                post = frontmatter.load(file_path)
                current_count = post.get("activation_count", 1)
                # Stored as float for fractional increments; calculate_score handles it.
                post["activation_count"] = round(current_count + 0.3, 1)
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(frontmatter.dumps(post))
                rippled += 1
            except Exception:
                # Best-effort: one unreadable/unwritable bucket must not
                # abort the remaining ripple candidates.
                continue
# ---------------------------------------------------------
# Multi-dimensional search (core feature)
# 多维搜索(核心功能)
@@ -576,7 +552,7 @@ class BucketManager:
)
content_score = fuzz.partial_ratio(query, bucket.get("content", "")[:1000]) * self.content_weight
return (name_score + domain_score + tag_score + content_score) / (100 * 10.5)
return (name_score + domain_score + tag_score + content_score) / (100 * (3 + 2.5 + 2 + self.content_weight))
# ---------------------------------------------------------
# Emotion resonance sub-score:
@@ -633,7 +609,7 @@ class BucketManager:
"""
buckets = []
dirs = [self.permanent_dir, self.dynamic_dir]
dirs = [self.permanent_dir, self.dynamic_dir, self.feel_dir]
if include_archive:
dirs.append(self.archive_dir)
@@ -664,6 +640,7 @@ class BucketManager:
"permanent_count": 0,
"dynamic_count": 0,
"archive_count": 0,
"feel_count": 0,
"total_size_kb": 0.0,
"domains": {},
}
@@ -672,6 +649,7 @@ class BucketManager:
(self.permanent_dir, "permanent_count"),
(self.dynamic_dir, "dynamic_count"),
(self.archive_dir, "archive_count"),
(self.feel_dir, "feel_count"),
]:
if not os.path.exists(subdir):
continue
@@ -745,7 +723,7 @@ class BucketManager:
"""
if not bucket_id:
return None
for dir_path in [self.permanent_dir, self.dynamic_dir, self.archive_dir]:
for dir_path in [self.permanent_dir, self.dynamic_dir, self.archive_dir, self.feel_dir]:
if not os.path.exists(dir_path):
continue
for root, _, files in os.walk(dir_path):
@@ -754,7 +732,8 @@ class BucketManager:
continue
# Match by exact ID segment in filename
# 通过文件名中的 ID 片段精确匹配
if bucket_id in fname:
name_part = fname[:-3] # remove .md
if name_part == bucket_id or name_part.endswith(f"_{bucket_id}"):
return os.path.join(root, fname)
return None