SQLite 持久连接 — sandbox 不再每次查询开关连接,改为 __init__ 时建连、close() 时释放
Explorer 的 system prompt 明确告知 sandbox 规则 — "每条 SQL 必须包含聚合函数或 LIMIT",减少 LLM 生成违规 SQL 浪费轮次
LLM 客户端单例 — 所有组件共享一个 openai.OpenAI 实例,不再各建各的
sanitize 顺序修复 — 小样本抑制放在 float round 之前,避免被 round 干扰
quick_detect 从 O(n²) 改为 O(n) — 按列聚合一次,加去重,不再对每行重复算整列统计
历史上下文实际生效 — get_context_for 的结果现在会注入到 Explorer 的初始 prompt 里,多轮分析时 LLM 能看到之前的发现
This commit is contained in:
148
layers/insights.py
Normal file
148
layers/insights.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Layer 3: 洞察引擎
|
||||
"""
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_array
|
||||
from layers.explorer import ExplorationStep
|
||||
|
||||
|
||||
# System prompt for the insight-extraction LLM call (consumed by
# InsightEngine.analyze). It instructs the model to return a strict JSON
# array of insight objects; extract_json_array parses the reply against
# this schema, and the Insight class mirrors its fields.
INSIGHT_SYSTEM = """你是一个数据洞察专家。你会收到探索过程的所有结果,你需要:

1. 从结果中发现异常和有趣现象
2. 对比不同维度,找出差异
3. 输出用户可能没问但值得知道的洞察

## 输出格式(严格 JSON 数组)
```json
[
{
"type": "outlier" | "trend" | "distribution" | "correlation" | "recommendation",
"severity": "high" | "medium" | "low",
"title": "简短标题",
"detail": "详细描述,包含具体数字",
"evidence": "支撑这个洞察的数据来源"
}
]
```

## 分析原则
- 每个洞察必须有具体数字支撑
- 用对比来说话(A 比 B 高 X%)
- 关注异常,不描述平淡的事实
- 如果没有异常,返回空数组"""
|
||||
|
||||
class Insight:
    """One insight item parsed from the LLM's JSON array output.

    Missing fields fall back to neutral defaults so a partially-formed
    JSON object never raises.
    """

    # Icon lookup tables shared by all instances.
    _TYPE_ICONS = {"outlier": "⚠️", "trend": "📈", "distribution": "📊",
                   "correlation": "🔗", "recommendation": "💡"}
    _SEVERITY_ICONS = {"high": "🔴", "medium": "🟡", "low": "🟢"}

    def __init__(self, data: dict):
        self.type = data.get("type", "unknown")
        self.severity = data.get("severity", "low")
        self.title = data.get("title", "")
        self.detail = data.get("detail", "")
        self.evidence = data.get("evidence", "")

    @property
    def emoji(self) -> str:
        """Icon for the insight type; a pin for unknown types."""
        return self._TYPE_ICONS.get(self.type, "📌")

    @property
    def severity_emoji(self) -> str:
        """Icon for the severity level; empty string for unknown levels."""
        return self._SEVERITY_ICONS.get(self.severity, "")

    def __str__(self):
        return f"{self.emoji} {self.severity_emoji} {self.title}: {self.detail}"
|
||||
|
||||
class InsightEngine:
    """Runs one LLM pass over the exploration results to surface insights."""

    def __init__(self):
        # Client/model come from the shared LLM configuration.
        self.client, self.model = get_llm_client(LLM_CONFIG)

    def analyze(self, steps: list[ExplorationStep], question: str) -> list[Insight]:
        """Ask the LLM for insights over the exploration history.

        Returns an empty list when there are no steps to analyze.
        """
        if not steps:
            return []

        history = self._build_history(steps)
        user_msg = f"## 用户问题\n{question}\n\n## 探索历史\n{history}\n\n请分析以上数据,输出异常和洞察。"
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": INSIGHT_SYSTEM},
                {"role": "user", "content": user_msg},
            ],
            temperature=0.3, max_tokens=2048,
        )
        content = response.choices[0].message.content.strip()
        return [Insight(item) for item in extract_json_array(content)]

    def format_insights(self, insights: list[Insight]) -> str:
        """Render insights as a Markdown section, highest severity first."""
        if not insights:
            return ""
        rank = {"high": 0, "medium": 1, "low": 2}
        # Unknown severities sort last (rank 9); sorted() is stable, so
        # same-severity insights keep their original order.
        ordered = sorted(insights, key=lambda ins: rank.get(ins.severity, 9))
        lines = ["## 💡 主动洞察", "", "_以下是你没问但数据告诉我们的事:_\n"]
        for ins in ordered:
            lines.extend([
                f"**{ins.emoji} {ins.title}** {ins.severity_emoji}",
                f" {ins.detail}",
                f" _数据来源: {ins.evidence}_",
                "",
            ])
        return "\n".join(lines)

    def _build_history(self, steps: list[ExplorationStep]) -> str:
        """Serialize the exploration steps into a transcript for the LLM."""
        parts = []
        for step in steps:
            if step.action == "done":
                parts.append(f"### 结束\n{step.reasoning}")
                continue
            if step.success:
                parts.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"SQL: `{step.sql}`\n结果 ({step.row_count} 行):\n"
                    f"数据: {json.dumps(step.rows, ensure_ascii=False)}"
                )
            else:
                parts.append(f"### 第 {step.round_num} 轮:{step.purpose}\nSQL: `{step.sql}`\n失败: {step.error}")
        return "\n\n".join(parts)
|
||||
|
||||
|
||||
def quick_detect(steps: list[ExplorationStep]) -> list[str]:
    """Rule-based anomaly scan over exploration results — no LLM call.

    Two patterns are flagged per step: a percentage-like column where any
    group exceeds 50% (concentration), and a count-like column whose max
    is more than 3x the mean (extreme skew). Each pattern fires at most
    once per step purpose.
    """
    PCT_COLS = ("pct", "percent", "percentage", "占比")
    COUNT_COLS = ("count", "cnt", "n", "total", "order_count")

    alerts: list[str] = []
    emitted: set[str] = set()  # dedupe keys: (pattern, purpose)

    for step in steps:
        if not step.success or not step.rows:
            continue

        for col in step.columns:
            # Keep only numeric values; non-numeric cells are ignored.
            numeric = [row.get(col) for row in step.rows
                       if isinstance(row.get(col), (int, float))]
            if not numeric:
                continue

            name = col.lower()

            # Concentration: first group above 50% triggers the alert.
            if name in PCT_COLS:
                dominant = next((v for v in numeric if v > 50), None)
                if dominant is not None:
                    key = f"pct_{step.purpose}"
                    if key not in emitted:
                        emitted.add(key)
                        alerts.append(f"⚠️ {step.purpose} 中某个分组占比 {dominant}%,集中度过高")

            # Skew: max value dwarfs the mean (needs >= 3 samples).
            if name in COUNT_COLS and len(numeric) >= 3:
                mean = sum(numeric) / len(numeric)
                if mean > 0:
                    ratio = max(numeric) / mean
                    if ratio > 3:
                        key = f"count_{step.purpose}"
                        if key not in emitted:
                            emitted.add(key)
                            alerts.append(f"⚠️ {step.purpose} 中最大值是均值的 {ratio:.1f} 倍")

    return alerts
|
||||
Reference in New Issue
Block a user