""" Layer 3: 洞察引擎 """ import json from typing import Any from core.config import LLM_CONFIG from core.utils import get_llm_client, extract_json_array from layers.explorer import ExplorationStep INSIGHT_SYSTEM = """你是一个数据洞察专家。你会收到探索过程的所有结果,你需要: 1. 从结果中发现异常和有趣现象 2. 对比不同维度,找出差异 3. 输出用户可能没问但值得知道的洞察 ## 输出格式(严格 JSON 数组) ```json [ { "type": "outlier" | "trend" | "distribution" | "correlation" | "recommendation", "severity": "high" | "medium" | "low", "title": "简短标题", "detail": "详细描述,包含具体数字", "evidence": "支撑这个洞察的数据来源" } ] ``` ## 分析原则 - 每个洞察必须有具体数字支撑 - 用对比来说话(A 比 B 高 X%) - 关注异常,不描述平淡的事实 - 如果没有异常,返回空数组""" class Insight: """单条洞察""" def __init__(self, data: dict): self.type = data.get("type", "unknown") self.severity = data.get("severity", "low") self.title = data.get("title", "") self.detail = data.get("detail", "") self.evidence = data.get("evidence", "") @property def emoji(self) -> str: return {"outlier": "⚠️", "trend": "📈", "distribution": "📊", "correlation": "🔗", "recommendation": "💡"}.get(self.type, "📌") @property def severity_emoji(self) -> str: return {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(self.severity, "") def __str__(self): return f"{self.emoji} {self.severity_emoji} {self.title}: {self.detail}" class InsightEngine: """洞察引擎""" def __init__(self): self.client, self.model = get_llm_client(LLM_CONFIG) def analyze(self, steps: list[ExplorationStep], question: str) -> list[Insight]: if not steps: return [] history = self._build_history(steps) response = self.client.chat.completions.create( model=self.model, messages=[ {"role": "system", "content": INSIGHT_SYSTEM}, {"role": "user", "content": f"## 用户问题\n{question}\n\n## 探索历史\n{history}\n\n请分析以上数据,输出异常和洞察。"}, ], temperature=0.3, max_tokens=2048, ) content = response.choices[0].message.content.strip() return [Insight(d) for d in extract_json_array(content)] def format_insights(self, insights: list[Insight]) -> str: if not insights: return "" severity_order = {"high": 0, "medium": 1, "low": 2} sorted_insights = sorted(insights, key=lambda i: severity_order.get(i.severity, 9)) lines = ["## 💡 主动洞察", "", "_以下是你没问但数据告诉我们的事:_\n"] for insight in sorted_insights: lines.append(f"**{insight.emoji} {insight.title}** {insight.severity_emoji}") lines.append(f" {insight.detail}") lines.append(f" _数据来源: {insight.evidence}_") lines.append("") return "\n".join(lines) def _build_history(self, steps: list[ExplorationStep]) -> str: parts = [] for step in steps: if step.action == "done": parts.append(f"### 结束\n{step.reasoning}") elif step.success: parts.append( f"### 第 {step.round_num} 轮:{step.purpose}\n" f"SQL: `{step.sql}`\n结果 ({step.row_count} 行):\n" f"数据: {json.dumps(step.rows, ensure_ascii=False)}" ) else: parts.append(f"### 第 {step.round_num} 轮:{step.purpose}\nSQL: `{step.sql}`\n失败: {step.error}") return "\n\n".join(parts) def quick_detect(steps: list[ExplorationStep]) -> list[str]: """基于规则的快速异常检测,不调 LLM""" alerts = [] seen = set() # 去重 for step in steps: if not step.success or not step.rows: continue for col in step.columns: vals = [r.get(col) for r in step.rows if isinstance(r.get(col), (int, float))] if not vals: continue col_lower = col.lower() # 占比列:某个分组占比过高 if col_lower in ("pct", "percent", "percentage", "占比"): for v in vals: if v > 50: key = f"pct_{step.purpose}" if key not in seen: seen.add(key) alerts.append(f"⚠️ {step.purpose} 中某个分组占比 {v}%,集中度过高") break # 计数列:极值差异 if col_lower in ("count", "cnt", "n", "total", "order_count") and len(vals) >= 3: avg = sum(vals) / len(vals) if avg > 0: ratio = max(vals) / avg if ratio > 3: key = f"count_{step.purpose}" if key not in seen: seen.add(key) alerts.append(f"⚠️ {step.purpose} 中最大值是均值的 {ratio:.1f} 倍") return alerts