Files
Ombre_Brain/tests/test_llm_quality.py

112 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ============================================================
# Test 2: LLM Quality Baseline — needs OMBRE_API_KEY
# 测试 2: LLM 质量基准 —— 需要 OMBRE_API_KEY
#
# Verifies LLM auto-tagging returns reasonable results:
# - domain is a non-empty list of strings
# - valence ∈ [0, 1]
# - arousal ∈ [0, 1]
# - tags is a list
# - suggested_name is a string
# - domain matches content semantics (loose check)
# ============================================================
import os
import pytest
# Module-wide marker: every test in this file is skipped when the
# OMBRE_API_KEY environment variable is unset or empty.
pytestmark = pytest.mark.skipif(
    not os.getenv("OMBRE_API_KEY"),
    reason="OMBRE_API_KEY not set — skipping LLM quality tests",
)
@pytest.fixture
def dehydrator(test_config):
    """Provide a ``Dehydrator`` built from the ``test_config`` fixture."""
    # Function-scope import: the dehydrator module is loaded only when a
    # test actually requests this fixture.
    from dehydrator import Dehydrator as dehydrator_cls

    return dehydrator_cls(test_config)
# Parametrized inputs for the LLM quality tests. Each entry is a 3-tuple:
#   (content, expected_domains_superset, valence_range)
# where valence_range is a (low, high) pair of expected valence bounds.
LLM_CASES = [
    # "Learned Python's asyncio today, finally understood the event loop,
    # in a good mood" -> positive
    (
        "今天学了 Python 的 asyncio终于搞懂了 event loop心情不错",
        {"学习", "编程", "技术", "数字", "Python"},
        (0.5, 1.0),
    ),
    # "Got scolded by my advisor, the paper was written too poorly,
    # very dejected" -> negative
    (
        "被导师骂了一顿,论文写得太差了,很沮丧",
        {"学习", "学业", "心理", "工作"},
        (0.0, 0.4),
    ),
    # "Climbed a mountain with friends, the view from the top was gorgeous,
    # tiring but worth it" -> positive
    (
        "和朋友去爬了一座山,山顶的风景超美,累但值得",
        {"生活", "旅行", "社交", "运动", "健康"},
        (0.6, 1.0),
    ),
    # "Watching the sunset on the balcony, thinking of nothing,
    # very calm" -> calm positive
    (
        "在阳台上看日落,什么都没想,很平静",
        {"生活", "心理", "自省"},
        (0.4, 0.8),
    ),
    # English-language technical content -> positive
    (
        "I built a FastAPI app with Docker and deployed it on Render",
        {"编程", "技术", "学习", "数字", "工作"},
        (0.5, 1.0),
    ),
]
class TestLLMQuality:
    """Verify LLM auto-tagging produces reasonable outputs.

    Each test awaits ``dehydrator.analyze(...)`` on the ``dehydrator``
    fixture and inspects the mapping it returns.
    """

    @pytest.mark.asyncio
    @pytest.mark.parametrize("content,expected_domains,valence_range", LLM_CASES)
    async def test_analyze_structure(self, dehydrator, content, expected_domains, valence_range):
        """Check that analyze() returns valid structure and reasonable values.

        NOTE(review): ``expected_domains`` is supplied by the parametrization
        but is not asserted here; only structure and valence are checked.
        """
        result = await dehydrator.analyze(content)

        # Structure checks
        assert isinstance(result, dict)
        assert "domain" in result
        assert "valence" in result
        assert "arousal" in result
        assert "tags" in result

        # Domain is a non-empty list of strings
        assert isinstance(result["domain"], list)
        assert len(result["domain"]) >= 1
        assert all(isinstance(d, str) for d in result["domain"])

        # Valence and arousal in range
        assert 0.0 <= result["valence"] <= 1.0, f"valence {result['valence']} out of range"
        assert 0.0 <= result["arousal"] <= 1.0, f"arousal {result['arousal']} out of range"

        # Valence roughly matches the expected range; the bounds are widened
        # by 0.15 on each side as tolerance.
        lo, hi = valence_range
        assert lo - 0.15 <= result["valence"] <= hi + 0.15, \
            f"valence {result['valence']} not in expected range ({lo}, {hi}) for: {content[:30]}..."

        # Tags is a list
        assert isinstance(result["tags"], list)

    @pytest.mark.asyncio
    async def test_analyze_domain_semantic_match(self, dehydrator):
        """Check that domain has at least some semantic relevance."""
        # Input is a sentence about a pet cat stealing fish from the table.
        result = await dehydrator.analyze("我家的橘猫小橘今天又偷吃了桌上的鱼")
        domains = set(result["domain"])

        # Should contain something life/pet related
        life_related = {"生活", "宠物", "家庭", "日常", "动物"}
        assert domains & life_related, f"Expected life-related domain, got {domains}"

    @pytest.mark.asyncio
    async def test_analyze_empty_content(self, dehydrator):
        """Empty content should raise or return defaults gracefully.

        Either outcome is accepted: ``analyze("")`` may raise, or it may
        return a dict whose ``valence`` lies in [0, 1].
        """
        # Only the call itself is guarded. Keeping the assertions outside the
        # ``try`` matters: AssertionError and KeyError are subclasses of
        # Exception, so asserting inside the guarded region would silently
        # swallow every failure and the test could never fail.
        try:
            result = await dehydrator.analyze("")
        except Exception:
            # Raising is also acceptable
            return

        # If it doesn't raise, it should still return a valid structure
        assert isinstance(result, dict)
        assert 0.0 <= result["valence"] <= 1.0