2026-01-06 19:44:17 +08:00
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
"""
|
|
|
|
|
|
简化的 Notebook 数据分析智能体
|
|
|
|
|
|
仅包含用户和助手两个角
|
|
|
|
|
|
2. 图片必须保存到指定的会话目录中,输出绝对路径,禁止使用plt.show()
|
|
|
|
|
|
3. 表格输出控制:超过15行只显示前5行和后5行
|
|
|
|
|
|
4. 强制使用SimHei字体:plt.rcParams['font.sans-serif'] = ['SimHei']
|
|
|
|
|
|
5. 输出格式严格使用YAML共享上下文的单轮对话模式
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
import json
|
2026-04-19 21:30:08 +08:00
|
|
|
|
import re
|
2026-01-06 19:44:17 +08:00
|
|
|
|
import yaml
|
|
|
|
|
|
from typing import Dict, Any, List, Optional
|
2026-04-19 21:30:08 +08:00
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
2026-01-06 19:44:17 +08:00
|
|
|
|
from utils.create_session_dir import create_session_output_dir
|
|
|
|
|
|
from utils.format_execution_result import format_execution_result
|
|
|
|
|
|
from utils.extract_code import extract_code_from_response
|
2026-04-19 21:30:08 +08:00
|
|
|
|
from utils.data_loader import load_and_profile_data, load_data_chunked, load_and_profile_data_smart
|
2026-01-06 19:44:17 +08:00
|
|
|
|
from utils.llm_helper import LLMHelper
|
|
|
|
|
|
from utils.code_executor import CodeExecutor
|
2026-01-31 18:00:05 +08:00
|
|
|
|
from utils.script_generator import generate_reusable_script
|
2026-04-19 21:30:08 +08:00
|
|
|
|
from utils.data_privacy import build_safe_profile, build_local_profile, sanitize_execution_feedback, generate_enriched_hint
|
2026-01-06 19:44:17 +08:00
|
|
|
|
from config.llm_config import LLMConfig
|
2026-04-19 21:30:08 +08:00
|
|
|
|
from config.app_config import app_config
|
2026-01-22 22:26:04 +08:00
|
|
|
|
from prompts import data_analysis_system_prompt, final_report_system_prompt, data_analysis_followup_prompt
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
|
2026-04-19 21:30:08 +08:00
|
|
|
|
# Regex patterns that indicate a data-context error (column/variable/DataFrame issues)
|
|
|
|
|
|
DATA_CONTEXT_PATTERNS = [
|
|
|
|
|
|
r"KeyError:\s*['\"](.+?)['\"]",
|
|
|
|
|
|
r"ValueError.*(?:column|col|field)",
|
|
|
|
|
|
r"NameError.*(?:df|data|frame)",
|
|
|
|
|
|
r"(?:empty|no\s+data|0\s+rows)",
|
|
|
|
|
|
r"IndexError.*(?:out of range|out of bounds)",
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
class DataAnalysisAgent:
|
|
|
|
|
|
"""
|
|
|
|
|
|
数据分析智能体
|
|
|
|
|
|
|
|
|
|
|
|
职责:
|
|
|
|
|
|
- 接收用户自然语言需求
|
|
|
|
|
|
- 生成Python分析代码
|
|
|
|
|
|
- 执行代码并收集结果
|
|
|
|
|
|
- 基于执行结果继续生成后续分析代码
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
|
self,
|
|
|
|
|
|
llm_config: LLMConfig = None,
|
|
|
|
|
|
output_dir: str = "outputs",
|
|
|
|
|
|
max_rounds: int = 20,
|
|
|
|
|
|
force_max_rounds: bool = False,
|
|
|
|
|
|
):
|
|
|
|
|
|
"""
|
|
|
|
|
|
初始化智能体
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
config: LLM配置
|
|
|
|
|
|
output_dir: 输出目录
|
|
|
|
|
|
max_rounds: 最大对话轮数
|
|
|
|
|
|
force_max_rounds: 是否强制运行到最大轮数(忽略AI的完成信号)
|
|
|
|
|
|
"""
|
|
|
|
|
|
self.config = llm_config or LLMConfig()
|
|
|
|
|
|
self.llm = LLMHelper(self.config)
|
|
|
|
|
|
self.base_output_dir = output_dir
|
|
|
|
|
|
self.max_rounds = max_rounds
|
|
|
|
|
|
self.force_max_rounds = force_max_rounds
|
|
|
|
|
|
# 对话历史和上下文
|
|
|
|
|
|
self.conversation_history = []
|
|
|
|
|
|
self.analysis_results = []
|
|
|
|
|
|
self.current_round = 0
|
|
|
|
|
|
self.session_output_dir = None
|
|
|
|
|
|
self.executor = None
|
2026-04-19 16:29:59 +08:00
|
|
|
|
self.data_profile = "" # 存储数据画像(完整版,本地使用)
|
|
|
|
|
|
self.data_profile_safe = "" # 存储安全画像(发给LLM)
|
2026-01-31 18:00:05 +08:00
|
|
|
|
self.data_files = [] # 存储数据文件列表
|
|
|
|
|
|
self.user_requirement = "" # 存储用户需求
|
2026-04-19 21:30:08 +08:00
|
|
|
|
self._progress_callback = None # 进度回调函数
|
|
|
|
|
|
self._session_ref = None # Reference to SessionData for round tracking
|
|
|
|
|
|
|
|
|
|
|
|
def set_session_ref(self, session):
|
|
|
|
|
|
"""Set a reference to the SessionData instance for appending round data.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
session: The SessionData instance for the current analysis session.
|
|
|
|
|
|
"""
|
|
|
|
|
|
self._session_ref = session
|
|
|
|
|
|
|
|
|
|
|
|
def set_progress_callback(self, callback):
|
|
|
|
|
|
"""Set a callback function(current_round, max_rounds, message) for progress updates."""
|
|
|
|
|
|
self._progress_callback = callback
|
|
|
|
|
|
|
|
|
|
|
|
def _summarize_result(self, result: Dict[str, Any]) -> str:
|
|
|
|
|
|
"""Produce a one-line summary from a code execution result.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
result: The execution result dict from CodeExecutor.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
A concise summary string, e.g. "执行成功,输出 DataFrame (150行×8列)"
|
|
|
|
|
|
or "执行失败: KeyError: 'col_x'".
|
|
|
|
|
|
"""
|
|
|
|
|
|
if result.get("success"):
|
|
|
|
|
|
evidence_rows = result.get("evidence_rows", [])
|
|
|
|
|
|
if evidence_rows:
|
|
|
|
|
|
num_rows = len(evidence_rows)
|
|
|
|
|
|
num_cols = len(evidence_rows[0]) if evidence_rows else 0
|
|
|
|
|
|
# Check auto_exported_files for more accurate row/col counts
|
|
|
|
|
|
auto_files = result.get("auto_exported_files", [])
|
|
|
|
|
|
if auto_files:
|
|
|
|
|
|
last_file = auto_files[-1]
|
|
|
|
|
|
num_rows = last_file.get("rows", num_rows)
|
|
|
|
|
|
num_cols = last_file.get("cols", num_cols)
|
|
|
|
|
|
return f"执行成功,输出 DataFrame ({num_rows}行×{num_cols}列)"
|
|
|
|
|
|
output = result.get("output", "")
|
|
|
|
|
|
if output:
|
|
|
|
|
|
first_line = output.strip().split("\n")[0][:80]
|
|
|
|
|
|
return f"执行成功: {first_line}"
|
|
|
|
|
|
return "执行成功"
|
|
|
|
|
|
else:
|
|
|
|
|
|
error = result.get("error", "未知错误")
|
|
|
|
|
|
if len(error) > 100:
|
|
|
|
|
|
error = error[:100] + "..."
|
|
|
|
|
|
return f"执行失败: {error}"
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
def _process_response(self, response: str) -> Dict[str, Any]:
|
|
|
|
|
|
"""
|
|
|
|
|
|
统一处理LLM响应,判断行动类型并执行相应操作
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
response: LLM的响应内容
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
处理结果字典
|
|
|
|
|
|
"""
|
|
|
|
|
|
try:
|
|
|
|
|
|
yaml_data = self.llm.parse_yaml_response(response)
|
2026-04-20 13:09:54 +08:00
|
|
|
|
action = yaml_data.get("action", "")
|
|
|
|
|
|
|
|
|
|
|
|
# If YAML parsing returned empty/no action, try to detect action from raw text
|
|
|
|
|
|
if not action:
|
|
|
|
|
|
if "analysis_complete" in response:
|
|
|
|
|
|
action = "analysis_complete"
|
|
|
|
|
|
# Try to extract final_report from raw text
|
|
|
|
|
|
if not yaml_data.get("final_report"):
|
|
|
|
|
|
yaml_data["action"] = "analysis_complete"
|
|
|
|
|
|
yaml_data["final_report"] = ""
|
|
|
|
|
|
elif "collect_figures" in response:
|
|
|
|
|
|
action = "collect_figures"
|
|
|
|
|
|
yaml_data["action"] = "collect_figures"
|
|
|
|
|
|
else:
|
|
|
|
|
|
action = "generate_code"
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[TARGET] 检测到动作: {action}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
if action == "analysis_complete":
|
|
|
|
|
|
return self._handle_analysis_complete(response, yaml_data)
|
|
|
|
|
|
elif action == "collect_figures":
|
|
|
|
|
|
return self._handle_collect_figures(response, yaml_data)
|
|
|
|
|
|
elif action == "generate_code":
|
|
|
|
|
|
return self._handle_generate_code(response, yaml_data)
|
|
|
|
|
|
else:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[WARN] 未知动作类型: {action},按generate_code处理")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
return self._handle_generate_code(response, yaml_data)
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[WARN] 解析响应失败: {str(e)},尝试提取代码并按generate_code处理")
|
2026-04-20 13:09:54 +08:00
|
|
|
|
# Check if this is actually an analysis_complete or collect_figures response
|
|
|
|
|
|
if "analysis_complete" in response:
|
|
|
|
|
|
return self._handle_analysis_complete(response, {"final_report": ""})
|
|
|
|
|
|
if "collect_figures" in response:
|
|
|
|
|
|
return self._handle_collect_figures(response, {"figures_to_collect": []})
|
2026-01-07 16:41:38 +08:00
|
|
|
|
# 即使YAML解析失败,也尝试提取代码
|
|
|
|
|
|
extracted_code = extract_code_from_response(response)
|
|
|
|
|
|
if extracted_code:
|
|
|
|
|
|
return self._handle_generate_code(response, {"code": extracted_code})
|
2026-01-06 19:44:17 +08:00
|
|
|
|
return self._handle_generate_code(response, {})
|
|
|
|
|
|
|
|
|
|
|
|
def _handle_analysis_complete(
|
|
|
|
|
|
self, response: str, yaml_data: Dict[str, Any]
|
|
|
|
|
|
) -> Dict[str, Any]:
|
|
|
|
|
|
"""处理分析完成动作"""
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print("[OK] 分析任务完成")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
final_report = yaml_data.get("final_report", "分析完成,无最终报告")
|
|
|
|
|
|
return {
|
|
|
|
|
|
"action": "analysis_complete",
|
|
|
|
|
|
"final_report": final_report,
|
|
|
|
|
|
"response": response,
|
|
|
|
|
|
"continue": False,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def _handle_collect_figures(
|
|
|
|
|
|
self, response: str, yaml_data: Dict[str, Any]
|
|
|
|
|
|
) -> Dict[str, Any]:
|
|
|
|
|
|
"""处理图片收集动作"""
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print("[CHART] 开始收集图片")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
figures_to_collect = yaml_data.get("figures_to_collect", [])
|
|
|
|
|
|
|
|
|
|
|
|
collected_figures = []
|
2026-04-19 16:29:59 +08:00
|
|
|
|
# 使用seen_paths集合来去重,防止重复收集
|
|
|
|
|
|
seen_paths = set()
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
for figure_info in figures_to_collect:
|
|
|
|
|
|
figure_number = figure_info.get("figure_number", "未知")
|
|
|
|
|
|
# 确保figure_number不为None时才用于文件名
|
|
|
|
|
|
if figure_number != "未知":
|
|
|
|
|
|
default_filename = f"figure_{figure_number}.png"
|
|
|
|
|
|
else:
|
|
|
|
|
|
default_filename = "figure_unknown.png"
|
|
|
|
|
|
filename = figure_info.get("filename", default_filename)
|
|
|
|
|
|
file_path = figure_info.get("file_path", "") # 获取具体的文件路径
|
|
|
|
|
|
description = figure_info.get("description", "")
|
|
|
|
|
|
analysis = figure_info.get("analysis", "")
|
|
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[GRAPH] 收集图片 {figure_number}: {filename}")
|
|
|
|
|
|
print(f" [DIR] 路径: {file_path}")
|
|
|
|
|
|
print(f" [NOTE] 描述: {description}")
|
|
|
|
|
|
print(f" [SEARCH] 分析: {analysis}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
2026-01-06 14:09:12 +08:00
|
|
|
|
# 验证文件是否存在
|
2026-01-06 18:08:44 +08:00
|
|
|
|
# 只有文件真正存在时才加入列表,防止报告出现裂图
|
2026-01-06 14:09:12 +08:00
|
|
|
|
if file_path and os.path.exists(file_path):
|
2026-01-09 16:52:45 +08:00
|
|
|
|
# 检查是否已经收集过该路径
|
|
|
|
|
|
abs_path = os.path.abspath(file_path)
|
|
|
|
|
|
if abs_path not in seen_paths:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f" [OK] 文件存在: {file_path}")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
# 记录图片信息
|
|
|
|
|
|
collected_figures.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"figure_number": figure_number,
|
|
|
|
|
|
"filename": filename,
|
|
|
|
|
|
"file_path": file_path,
|
|
|
|
|
|
"description": description,
|
|
|
|
|
|
"analysis": analysis,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
seen_paths.add(abs_path)
|
|
|
|
|
|
else:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f" [WARN] 跳过重复图片: {file_path}")
|
2026-01-06 14:09:12 +08:00
|
|
|
|
else:
|
2026-01-06 18:08:44 +08:00
|
|
|
|
if file_path:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f" [WARN] 文件不存在: {file_path}")
|
2026-01-06 18:08:44 +08:00
|
|
|
|
else:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f" [WARN] 未提供文件路径")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
|
"action": "collect_figures",
|
|
|
|
|
|
"collected_figures": collected_figures,
|
|
|
|
|
|
"response": response,
|
|
|
|
|
|
"continue": True,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def _handle_generate_code(
|
|
|
|
|
|
self, response: str, yaml_data: Dict[str, Any]
|
|
|
|
|
|
) -> Dict[str, Any]:
|
|
|
|
|
|
"""处理代码生成和执行动作"""
|
|
|
|
|
|
# 从YAML数据中获取代码(更准确)
|
|
|
|
|
|
code = yaml_data.get("code", "")
|
2026-04-19 21:30:08 +08:00
|
|
|
|
reasoning = yaml_data.get("reasoning", "")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
# 如果YAML中没有代码,尝试从响应中提取
|
|
|
|
|
|
if not code:
|
|
|
|
|
|
code = extract_code_from_response(response)
|
|
|
|
|
|
|
|
|
|
|
|
# 二次清洗:防止YAML中解析出的code包含markdown标记
|
|
|
|
|
|
if code:
|
|
|
|
|
|
code = code.strip()
|
|
|
|
|
|
if code.startswith("```"):
|
|
|
|
|
|
# 去除开头的 ```python 或 ```
|
|
|
|
|
|
code = re.sub(r"^```[a-zA-Z]*\n", "", code)
|
|
|
|
|
|
# 去除结尾的 ```
|
|
|
|
|
|
code = re.sub(r"\n```$", "", code)
|
|
|
|
|
|
code = code.strip()
|
|
|
|
|
|
|
|
|
|
|
|
if code:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[TOOL] 执行代码:\n{code}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
print("-" * 40)
|
|
|
|
|
|
|
|
|
|
|
|
# 执行代码
|
|
|
|
|
|
result = self.executor.execute_code(code)
|
|
|
|
|
|
|
|
|
|
|
|
# 格式化执行结果
|
|
|
|
|
|
feedback = format_execution_result(result)
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[LIST] 执行反馈:\n{feedback}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
|
"action": "generate_code",
|
|
|
|
|
|
"code": code,
|
2026-04-19 21:30:08 +08:00
|
|
|
|
"reasoning": reasoning,
|
2026-01-06 19:44:17 +08:00
|
|
|
|
"result": result,
|
|
|
|
|
|
"feedback": feedback,
|
|
|
|
|
|
"response": response,
|
|
|
|
|
|
"continue": True,
|
|
|
|
|
|
}
|
|
|
|
|
|
else:
|
|
|
|
|
|
# 如果没有代码,说明LLM响应格式有问题,需要重新生成
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print("[WARN] 未从响应中提取到可执行代码,要求LLM重新生成")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
return {
|
|
|
|
|
|
"action": "invalid_response",
|
2026-04-19 21:30:08 +08:00
|
|
|
|
"reasoning": reasoning,
|
2026-01-06 19:44:17 +08:00
|
|
|
|
"error": "响应中缺少可执行代码",
|
|
|
|
|
|
"response": response,
|
|
|
|
|
|
"continue": True,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-04-19 21:30:08 +08:00
|
|
|
|
def _classify_error(self, error_message: str) -> str:
|
|
|
|
|
|
"""Classify execution error as data-context or other.
|
|
|
|
|
|
|
|
|
|
|
|
Inspects the error message against DATA_CONTEXT_PATTERNS to determine
|
|
|
|
|
|
if the error is related to data context (missing columns, undefined
|
|
|
|
|
|
data variables, empty DataFrames, etc.).
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
error_message: The error message string from code execution.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
"data_context" if the error matches a data-context pattern,
|
|
|
|
|
|
"other" otherwise.
|
|
|
|
|
|
"""
|
|
|
|
|
|
for pattern in DATA_CONTEXT_PATTERNS:
|
|
|
|
|
|
if re.search(pattern, error_message, re.IGNORECASE):
|
|
|
|
|
|
return "data_context"
|
|
|
|
|
|
return "other"
|
|
|
|
|
|
|
|
|
|
|
|
def _trim_conversation_history(self):
|
|
|
|
|
|
"""Apply sliding window trimming to conversation history.
|
|
|
|
|
|
|
|
|
|
|
|
Retains the first user message (original requirement + Safe_Profile) at
|
|
|
|
|
|
index 0, generates a compressed summary of old messages, and keeps only
|
|
|
|
|
|
the most recent ``conversation_window_size`` message pairs in full.
|
|
|
|
|
|
"""
|
|
|
|
|
|
window_size = app_config.conversation_window_size
|
|
|
|
|
|
max_messages = window_size * 2 # pairs of user+assistant messages
|
|
|
|
|
|
|
|
|
|
|
|
if len(self.conversation_history) <= max_messages:
|
|
|
|
|
|
return # No trimming needed
|
|
|
|
|
|
|
|
|
|
|
|
first_message = self.conversation_history[0] # Always retain
|
|
|
|
|
|
|
|
|
|
|
|
# Determine trim boundary: skip first message + possible existing summary
|
|
|
|
|
|
start_idx = 1
|
|
|
|
|
|
has_existing_summary = (
|
|
|
|
|
|
len(self.conversation_history) > 1
|
|
|
|
|
|
and self.conversation_history[1]["role"] == "user"
|
|
|
|
|
|
and self.conversation_history[1]["content"].startswith("[分析摘要]")
|
|
|
|
|
|
)
|
|
|
|
|
|
if has_existing_summary:
|
|
|
|
|
|
start_idx = 2
|
|
|
|
|
|
|
|
|
|
|
|
# Messages to trim vs keep
|
|
|
|
|
|
messages_to_consider = self.conversation_history[start_idx:]
|
|
|
|
|
|
messages_to_trim = messages_to_consider[:-max_messages]
|
|
|
|
|
|
messages_to_keep = messages_to_consider[-max_messages:]
|
|
|
|
|
|
|
|
|
|
|
|
if not messages_to_trim:
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
# Generate summary of trimmed messages
|
|
|
|
|
|
summary = self._compress_trimmed_messages(messages_to_trim)
|
|
|
|
|
|
|
|
|
|
|
|
# Rebuild history: first_message + summary + recent messages
|
|
|
|
|
|
self.conversation_history = [first_message]
|
|
|
|
|
|
if summary:
|
|
|
|
|
|
self.conversation_history.append({"role": "user", "content": summary})
|
|
|
|
|
|
self.conversation_history.extend(messages_to_keep)
|
|
|
|
|
|
|
|
|
|
|
|
def _compress_trimmed_messages(self, messages: list) -> str:
|
|
|
|
|
|
"""Compress trimmed messages into a concise summary string.
|
|
|
|
|
|
|
2026-04-20 09:50:35 +08:00
|
|
|
|
Extracts the action type from each assistant message, the execution
|
|
|
|
|
|
outcome (success / failure), and completed SOP stages from the
|
|
|
|
|
|
subsequent user feedback message. Code blocks and raw execution
|
|
|
|
|
|
output are excluded.
|
|
|
|
|
|
|
|
|
|
|
|
The summary explicitly lists completed SOP stages so the LLM does
|
|
|
|
|
|
not restart from stage 1 after conversation trimming.
|
2026-04-19 21:30:08 +08:00
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
messages: List of conversation message dicts to compress.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
A summary string prefixed with ``[分析摘要]``.
|
|
|
|
|
|
"""
|
|
|
|
|
|
summary_parts = ["[分析摘要] 以下是之前分析轮次的概要:"]
|
|
|
|
|
|
round_num = 0
|
2026-04-20 09:50:35 +08:00
|
|
|
|
completed_stages = set()
|
|
|
|
|
|
|
|
|
|
|
|
# SOP stage keywords to detect from assistant messages
|
|
|
|
|
|
stage_keywords = {
|
|
|
|
|
|
"阶段1": "数据探索与加载",
|
|
|
|
|
|
"阶段2": "基础分布分析",
|
|
|
|
|
|
"阶段3": "时序与来源分析",
|
|
|
|
|
|
"阶段4": "深度交叉分析",
|
|
|
|
|
|
"阶段5": "效率分析",
|
|
|
|
|
|
"阶段6": "高级挖掘",
|
|
|
|
|
|
}
|
2026-04-19 21:30:08 +08:00
|
|
|
|
|
|
|
|
|
|
for msg in messages:
|
|
|
|
|
|
content = msg["content"]
|
|
|
|
|
|
if msg["role"] == "assistant":
|
|
|
|
|
|
round_num += 1
|
|
|
|
|
|
# Extract action type from YAML-like content
|
|
|
|
|
|
action = "generate_code"
|
|
|
|
|
|
if "action: \"collect_figures\"" in content or "action: collect_figures" in content:
|
|
|
|
|
|
action = "collect_figures"
|
|
|
|
|
|
elif "action: \"analysis_complete\"" in content or "action: analysis_complete" in content:
|
|
|
|
|
|
action = "analysis_complete"
|
2026-04-20 09:50:35 +08:00
|
|
|
|
|
|
|
|
|
|
# Detect completed SOP stages
|
|
|
|
|
|
for stage_key, stage_name in stage_keywords.items():
|
|
|
|
|
|
if stage_key in content or stage_name in content:
|
|
|
|
|
|
completed_stages.add(f"{stage_key}: {stage_name}")
|
|
|
|
|
|
|
2026-04-19 21:30:08 +08:00
|
|
|
|
summary_parts.append(f"- 轮次{round_num}: 动作={action}")
|
|
|
|
|
|
elif msg["role"] == "user" and "代码执行反馈" in content:
|
|
|
|
|
|
success = "失败" if "[ERROR]" in content or "执行错误" in content else "成功"
|
|
|
|
|
|
if summary_parts and summary_parts[-1].startswith("- 轮次"):
|
|
|
|
|
|
summary_parts[-1] += f", 执行结果={success}"
|
|
|
|
|
|
|
2026-04-20 09:50:35 +08:00
|
|
|
|
# Append completed stages so the LLM knows where to continue
|
|
|
|
|
|
if completed_stages:
|
|
|
|
|
|
summary_parts.append("")
|
|
|
|
|
|
summary_parts.append("**已完成的SOP阶段** (请勿重复执行):")
|
|
|
|
|
|
for stage in sorted(completed_stages):
|
|
|
|
|
|
summary_parts.append(f" ✓ {stage}")
|
|
|
|
|
|
summary_parts.append("")
|
|
|
|
|
|
summary_parts.append("请从下一个未完成的阶段继续,不要重新执行已完成的阶段。")
|
|
|
|
|
|
|
2026-04-19 21:30:08 +08:00
|
|
|
|
return "\n".join(summary_parts)
|
|
|
|
|
|
|
|
|
|
|
|
def _profile_files_parallel(self, file_paths: list) -> tuple:
|
|
|
|
|
|
"""Profile multiple files concurrently using ThreadPoolExecutor.
|
|
|
|
|
|
|
|
|
|
|
|
Each file is profiled independently via ``build_safe_profile`` and
|
|
|
|
|
|
``build_local_profile``. Results are collected and merged. If any
|
|
|
|
|
|
individual file fails, an error entry is included for that file and
|
|
|
|
|
|
profiling continues for the remaining files.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
file_paths: List of file paths to profile.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
A tuple ``(safe_profile, local_profile)`` of merged markdown strings.
|
|
|
|
|
|
"""
|
|
|
|
|
|
max_workers = app_config.max_parallel_profiles
|
|
|
|
|
|
safe_profiles = []
|
|
|
|
|
|
local_profiles = []
|
|
|
|
|
|
|
|
|
|
|
|
def profile_single(path):
|
|
|
|
|
|
safe = build_safe_profile([path])
|
|
|
|
|
|
local = build_local_profile([path])
|
|
|
|
|
|
return path, safe, local
|
|
|
|
|
|
|
|
|
|
|
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
|
|
|
futures = {executor.submit(profile_single, p): p for p in file_paths}
|
|
|
|
|
|
for future in as_completed(futures):
|
|
|
|
|
|
path = futures[future]
|
|
|
|
|
|
try:
|
|
|
|
|
|
_, safe, local = future.result()
|
|
|
|
|
|
safe_profiles.append(safe)
|
|
|
|
|
|
local_profiles.append(local)
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
error_entry = f"## 文件: {os.path.basename(path)}\n[ERROR] 分析失败: {e}\n\n"
|
|
|
|
|
|
safe_profiles.append(error_entry)
|
|
|
|
|
|
local_profiles.append(error_entry)
|
|
|
|
|
|
|
|
|
|
|
|
return "\n".join(safe_profiles), "\n".join(local_profiles)
|
|
|
|
|
|
|
|
|
|
|
|
def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None, reset_session: bool = True, max_rounds: int = None, template_name: str = None) -> Dict[str, Any]:
|
2026-01-06 19:44:17 +08:00
|
|
|
|
"""
|
|
|
|
|
|
开始分析流程
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
user_input: 用户的自然语言需求
|
|
|
|
|
|
files: 数据文件路径列表
|
|
|
|
|
|
session_output_dir: 指定的会话输出目录(可选)
|
2026-01-09 16:52:45 +08:00
|
|
|
|
reset_session: 是否重置会话 (True: 新开启分析; False: 在现有上下文中继续)
|
|
|
|
|
|
max_rounds: 本次分析的最大轮数 (可选,如果不填则使用默认值)
|
2026-04-19 21:30:08 +08:00
|
|
|
|
template_name: 分析模板名称 (可选,如果提供则使用模板引导分析)
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
分析结果字典
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
2026-01-09 16:52:45 +08:00
|
|
|
|
# 确定本次运行的轮数限制
|
|
|
|
|
|
current_max_rounds = max_rounds if max_rounds is not None else self.max_rounds
|
|
|
|
|
|
|
2026-04-19 21:30:08 +08:00
|
|
|
|
# Template integration: prepend template prompt to user input if provided
|
|
|
|
|
|
if template_name:
|
|
|
|
|
|
from utils.analysis_templates import get_template
|
|
|
|
|
|
template = get_template(template_name) # Raises ValueError if invalid
|
|
|
|
|
|
template_prompt = template.get_full_prompt()
|
|
|
|
|
|
user_input = f"{template_prompt}\n\n{user_input}"
|
|
|
|
|
|
|
2026-01-09 16:52:45 +08:00
|
|
|
|
if reset_session:
|
|
|
|
|
|
# --- 初始化新会话 ---
|
|
|
|
|
|
self.conversation_history = []
|
|
|
|
|
|
self.analysis_results = []
|
|
|
|
|
|
self.current_round = 0
|
2026-01-31 18:00:05 +08:00
|
|
|
|
self.data_files = files or [] # 保存数据文件列表
|
|
|
|
|
|
self.user_requirement = user_input # 保存用户需求
|
2026-01-09 16:52:45 +08:00
|
|
|
|
|
|
|
|
|
|
# 创建本次分析的专用输出目录
|
|
|
|
|
|
if session_output_dir:
|
|
|
|
|
|
self.session_output_dir = session_output_dir
|
|
|
|
|
|
else:
|
|
|
|
|
|
self.session_output_dir = create_session_output_dir(
|
|
|
|
|
|
self.base_output_dir, user_input
|
|
|
|
|
|
)
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
2026-01-09 16:52:45 +08:00
|
|
|
|
# 初始化代码执行器,使用会话目录
|
|
|
|
|
|
self.executor = CodeExecutor(self.session_output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
# 设置会话目录变量到执行环境中
|
|
|
|
|
|
self.executor.set_variable("session_output_dir", self.session_output_dir)
|
|
|
|
|
|
|
2026-04-19 16:29:59 +08:00
|
|
|
|
# 生成数据画像(分级:安全级发给LLM,完整级留本地)
|
|
|
|
|
|
data_profile_safe = ""
|
|
|
|
|
|
data_profile_local = ""
|
2026-01-09 16:52:45 +08:00
|
|
|
|
if files:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print("[SEARCH] 正在生成数据画像...")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
try:
|
2026-04-19 21:30:08 +08:00
|
|
|
|
if len(files) > 1:
|
|
|
|
|
|
# Parallel profiling for multiple files
|
|
|
|
|
|
data_profile_safe, data_profile_local = self._profile_files_parallel(files)
|
|
|
|
|
|
else:
|
|
|
|
|
|
data_profile_safe = build_safe_profile(files)
|
|
|
|
|
|
data_profile_local = build_local_profile(files)
|
2026-04-19 16:29:59 +08:00
|
|
|
|
print("[OK] 数据画像生成完毕(安全级 + 本地级)")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
except Exception as e:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[WARN] 数据画像生成失败: {e}")
|
2026-04-19 21:30:08 +08:00
|
|
|
|
|
|
|
|
|
|
# Expose chunked iterators for large files in the Code_Executor namespace
|
|
|
|
|
|
for fp in files:
|
|
|
|
|
|
try:
|
|
|
|
|
|
if os.path.exists(fp):
|
|
|
|
|
|
file_size_mb = os.path.getsize(fp) / (1024 * 1024)
|
|
|
|
|
|
if file_size_mb > app_config.max_file_size_mb:
|
|
|
|
|
|
var_name = "chunked_iter_" + os.path.splitext(os.path.basename(fp))[0]
|
|
|
|
|
|
# Store a factory so the iterator can be re-created
|
|
|
|
|
|
self.executor.set_variable(var_name, lambda p=fp: load_data_chunked(p))
|
|
|
|
|
|
print(f"[OK] 大文件 {os.path.basename(fp)} 的分块迭代器已注入为 {var_name}()")
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
print(f"[WARN] 注入分块迭代器失败 ({os.path.basename(fp)}): {e}")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
|
2026-04-19 16:29:59 +08:00
|
|
|
|
# 安全画像发给LLM,完整画像留给最终报告生成
|
|
|
|
|
|
self.data_profile = data_profile_local # 本地完整版用于最终报告
|
|
|
|
|
|
self.data_profile_safe = data_profile_safe # 安全版用于LLM对话
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
2026-04-19 16:29:59 +08:00
|
|
|
|
# 构建初始prompt(只发送安全级画像给LLM)
|
2026-01-09 16:52:45 +08:00
|
|
|
|
initial_prompt = f"""用户需求: {user_input}"""
|
|
|
|
|
|
if files:
|
|
|
|
|
|
initial_prompt += f"\n数据文件: {', '.join(files)}"
|
|
|
|
|
|
|
2026-04-19 16:29:59 +08:00
|
|
|
|
if data_profile_safe:
|
|
|
|
|
|
initial_prompt += f"\n\n{data_profile_safe}\n\n请根据上述【数据结构概览】中的列名、数据类型和特征描述来制定分析策略。先通过代码探索数据的实际分布,再进行深度分析。"
|
2026-01-09 16:52:45 +08:00
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[START] 开始数据分析任务")
|
|
|
|
|
|
print(f"[NOTE] 用户需求: {user_input}")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
if files:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[FOLDER] 数据文件: {', '.join(files)}")
|
|
|
|
|
|
print(f"[DIR] 输出目录: {self.session_output_dir}")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
|
|
|
|
|
|
# 添加到对话历史
|
|
|
|
|
|
self.conversation_history.append({"role": "user", "content": initial_prompt})
|
|
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
# --- 继续现有会话 ---
|
|
|
|
|
|
# 如果是追问,且没有指定轮数,默认减少轮数,避免过度分析
|
|
|
|
|
|
if max_rounds is None:
|
|
|
|
|
|
current_max_rounds = 10 # 追问通常不需要那么长的思考链,10轮足够
|
|
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"\n[START] 继续分析任务 (追问模式)")
|
|
|
|
|
|
print(f"[NOTE] 后续需求: {user_input}")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
|
|
|
|
|
|
# 重置当前轮数计数器,以便给新任务足够的轮次
|
|
|
|
|
|
self.current_round = 0
|
|
|
|
|
|
|
|
|
|
|
|
# 添加到对话历史
|
|
|
|
|
|
# 提示Agent这是后续追问,可以简化步骤
|
|
|
|
|
|
follow_up_prompt = f"后续需求: {user_input}\n(注意:这是后续追问,请直接针对该问题进行分析,无需从头开始执行完整SOP。)"
|
|
|
|
|
|
self.conversation_history.append({"role": "user", "content": follow_up_prompt})
|
|
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[NUM] 本次最大轮数: {current_max_rounds}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
if self.force_max_rounds:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[FAST] 强制模式: 将运行满 {current_max_rounds} 轮(忽略AI完成信号)")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
print("=" * 60)
|
2026-01-09 16:52:45 +08:00
|
|
|
|
|
|
|
|
|
|
# 保存原始 max_rounds 以便恢复(虽然 analyze 结束后不需要恢复,但为了逻辑严谨)
|
|
|
|
|
|
original_max_rounds = self.max_rounds
|
|
|
|
|
|
self.max_rounds = current_max_rounds
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
# 初始化连续失败计数器
|
|
|
|
|
|
consecutive_failures = 0
|
2026-04-19 21:30:08 +08:00
|
|
|
|
# Per-round data-context retry counter
|
|
|
|
|
|
data_context_retries = 0
|
|
|
|
|
|
last_retry_round = 0
|
2026-01-31 18:00:05 +08:00
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
while self.current_round < self.max_rounds:
|
|
|
|
|
|
self.current_round += 1
|
2026-04-19 21:30:08 +08:00
|
|
|
|
# Notify progress callback
|
|
|
|
|
|
if self._progress_callback:
|
|
|
|
|
|
self._progress_callback(self.current_round, self.max_rounds, f"第{self.current_round}/{self.max_rounds}轮分析中...")
|
|
|
|
|
|
# Reset data-context retry counter when entering a new round
|
|
|
|
|
|
if self.current_round != last_retry_round:
|
|
|
|
|
|
data_context_retries = 0
|
|
|
|
|
|
|
|
|
|
|
|
# Trim conversation history after the first round to bound token usage
|
|
|
|
|
|
if self.current_round > 1:
|
|
|
|
|
|
self._trim_conversation_history()
|
|
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"\n[LOOP] 第 {self.current_round} 轮分析")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
# 调用LLM生成响应
|
|
|
|
|
|
try: # 获取当前执行环境的变量信息
|
|
|
|
|
|
notebook_variables = self.executor.get_environment_info()
|
|
|
|
|
|
|
2026-01-22 22:26:04 +08:00
|
|
|
|
# Select prompt based on mode
|
|
|
|
|
|
if self.current_round == 1 and not reset_session:
|
|
|
|
|
|
# For the first round of a follow-up session, use the specialized prompt
|
|
|
|
|
|
base_system_prompt = data_analysis_followup_prompt
|
|
|
|
|
|
elif not reset_session and self.current_round > 1:
|
|
|
|
|
|
# For subsequent rounds in follow-up, continue using the follow-up context
|
|
|
|
|
|
# or maybe just the standard one is fine as long as SOP isn't fully enforced?
|
|
|
|
|
|
# Let's stick to the follow-up prompt to prevent SOP regression
|
|
|
|
|
|
base_system_prompt = data_analysis_followup_prompt
|
|
|
|
|
|
else:
|
|
|
|
|
|
base_system_prompt = data_analysis_system_prompt
|
|
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
# 格式化系统提示词,填入动态的notebook变量信息
|
2026-01-22 22:26:04 +08:00
|
|
|
|
formatted_system_prompt = base_system_prompt.format(
|
2026-01-06 19:44:17 +08:00
|
|
|
|
notebook_variables=notebook_variables
|
|
|
|
|
|
)
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[DEBUG] [DEBUG] System Prompt Head:\n{formatted_system_prompt[:500]}...\n[...]")
|
|
|
|
|
|
print(f"[DEBUG] [DEBUG] System Prompt Rules Check: 'stop_words' in prompt? {'stop_words' in formatted_system_prompt}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
response = self.llm.call(
|
|
|
|
|
|
prompt=self._build_conversation_prompt(),
|
|
|
|
|
|
system_prompt=formatted_system_prompt,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[AI] 助手响应:\n{response}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
# 使用统一的响应处理方法
|
|
|
|
|
|
process_result = self._process_response(response)
|
|
|
|
|
|
|
|
|
|
|
|
# 根据处理结果决定是否继续(仅在非强制模式下)
|
2026-01-09 16:52:45 +08:00
|
|
|
|
if process_result.get("action") == "invalid_response":
|
|
|
|
|
|
consecutive_failures += 1
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[WARN] 连续失败次数: {consecutive_failures}/3")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
if consecutive_failures >= 3:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[ERROR] 连续3次无法获取有效响应,分析终止。请检查网络或配置。")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
break
|
|
|
|
|
|
else:
|
|
|
|
|
|
consecutive_failures = 0 # 重置计数器
|
|
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
if not self.force_max_rounds and not process_result.get(
|
|
|
|
|
|
"continue", True
|
|
|
|
|
|
):
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"\n[OK] 分析完成!")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
# 添加到对话历史
|
|
|
|
|
|
self.conversation_history.append(
|
|
|
|
|
|
{"role": "assistant", "content": response}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 根据动作类型添加不同的反馈
|
|
|
|
|
|
if process_result["action"] == "generate_code":
|
|
|
|
|
|
feedback = process_result.get("feedback", "")
|
2026-04-19 21:30:08 +08:00
|
|
|
|
result = process_result.get("result", {})
|
|
|
|
|
|
execution_failed = not result.get("success", True)
|
|
|
|
|
|
|
|
|
|
|
|
# --- Data-context retry logic ---
|
|
|
|
|
|
if execution_failed:
|
|
|
|
|
|
error_output = result.get("error", "") or feedback
|
|
|
|
|
|
error_class = self._classify_error(error_output)
|
|
|
|
|
|
|
|
|
|
|
|
if error_class == "data_context" and data_context_retries < app_config.max_data_context_retries:
|
|
|
|
|
|
data_context_retries += 1
|
|
|
|
|
|
last_retry_round = self.current_round
|
|
|
|
|
|
print(f"[RETRY] 数据上下文错误,重试 {data_context_retries}/{app_config.max_data_context_retries}")
|
|
|
|
|
|
# Generate enriched hint from safe profile
|
|
|
|
|
|
enriched_hint = generate_enriched_hint(error_output, self.data_profile_safe)
|
|
|
|
|
|
# Add enriched hint to conversation history (assistant response already added above)
|
|
|
|
|
|
self.conversation_history.append(
|
|
|
|
|
|
{"role": "user", "content": enriched_hint}
|
|
|
|
|
|
)
|
|
|
|
|
|
# Record the failed attempt
|
|
|
|
|
|
self.analysis_results.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"round": self.current_round,
|
|
|
|
|
|
"code": process_result.get("code", ""),
|
|
|
|
|
|
"result": result,
|
|
|
|
|
|
"response": response,
|
|
|
|
|
|
"retry": True,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
# Retry within the same round: decrement round counter so the
|
|
|
|
|
|
# outer loop's increment brings us back to the same round number
|
|
|
|
|
|
self.current_round -= 1
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
# Normal feedback path (no retry or non-data-context error or at limit)
|
2026-04-19 16:29:59 +08:00
|
|
|
|
safe_feedback = sanitize_execution_feedback(feedback)
|
2026-01-06 19:44:17 +08:00
|
|
|
|
self.conversation_history.append(
|
2026-04-19 16:29:59 +08:00
|
|
|
|
{"role": "user", "content": f"代码执行反馈:\n{safe_feedback}"}
|
2026-01-06 19:44:17 +08:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 记录分析结果
|
|
|
|
|
|
self.analysis_results.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"round": self.current_round,
|
|
|
|
|
|
"code": process_result.get("code", ""),
|
|
|
|
|
|
"result": process_result.get("result", {}),
|
|
|
|
|
|
"response": response,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
2026-04-19 21:30:08 +08:00
|
|
|
|
|
|
|
|
|
|
# --- Construct Round_Data and append to session ---
|
|
|
|
|
|
result = process_result.get("result", {})
|
|
|
|
|
|
round_data = {
|
|
|
|
|
|
"round": self.current_round,
|
|
|
|
|
|
"reasoning": process_result.get("reasoning", ""),
|
|
|
|
|
|
"code": process_result.get("code", ""),
|
|
|
|
|
|
"result_summary": self._summarize_result(result),
|
|
|
|
|
|
"evidence_rows": result.get("evidence_rows", []),
|
|
|
|
|
|
"raw_log": feedback,
|
|
|
|
|
|
"auto_exported_files": result.get("auto_exported_files", []),
|
|
|
|
|
|
"prompt_saved_files": result.get("prompt_saved_files", []),
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if self._session_ref:
|
|
|
|
|
|
self._session_ref.rounds.append(round_data)
|
|
|
|
|
|
# Merge file metadata into SessionData.data_files
|
|
|
|
|
|
for f in round_data.get("auto_exported_files", []):
|
|
|
|
|
|
if f.get("skipped"):
|
|
|
|
|
|
continue # Large DataFrame — not written to disk
|
|
|
|
|
|
self._session_ref.data_files.append({
|
|
|
|
|
|
"filename": f.get("filename", ""),
|
|
|
|
|
|
"description": f"自动导出: {f.get('variable_name', '')}",
|
|
|
|
|
|
"rows": f.get("rows", 0),
|
|
|
|
|
|
"cols": f.get("cols", 0),
|
|
|
|
|
|
"columns": f.get("columns", []),
|
|
|
|
|
|
"size_bytes": 0,
|
|
|
|
|
|
"source": "auto",
|
|
|
|
|
|
})
|
|
|
|
|
|
for f in round_data.get("prompt_saved_files", []):
|
|
|
|
|
|
self._session_ref.data_files.append({
|
|
|
|
|
|
"filename": f.get("filename", ""),
|
|
|
|
|
|
"description": f.get("description", ""),
|
|
|
|
|
|
"rows": f.get("rows", 0),
|
|
|
|
|
|
"cols": 0,
|
|
|
|
|
|
"columns": [],
|
|
|
|
|
|
"size_bytes": 0,
|
|
|
|
|
|
"source": "prompt",
|
|
|
|
|
|
})
|
2026-01-06 19:44:17 +08:00
|
|
|
|
elif process_result["action"] == "collect_figures":
|
|
|
|
|
|
# 记录图片收集结果
|
|
|
|
|
|
collected_figures = process_result.get("collected_figures", [])
|
2026-01-06 21:19:40 +08:00
|
|
|
|
|
2026-01-06 18:08:44 +08:00
|
|
|
|
missing_figures = process_result.get("missing_figures", [])
|
|
|
|
|
|
|
|
|
|
|
|
feedback = f"已收集 {len(collected_figures)} 个有效图片及其分析。"
|
|
|
|
|
|
if missing_figures:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
feedback += f"\n[WARN] 以下图片未找到,请检查代码是否成功保存了这些图片: {missing_figures}"
|
2026-01-06 21:19:40 +08:00
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
self.conversation_history.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"role": "user",
|
|
|
|
|
|
"content": f"图片收集反馈:\n{feedback}\n请继续下一步分析。",
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 记录到分析结果中
|
|
|
|
|
|
self.analysis_results.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"round": self.current_round,
|
|
|
|
|
|
"action": "collect_figures",
|
|
|
|
|
|
"collected_figures": collected_figures,
|
2026-01-06 18:08:44 +08:00
|
|
|
|
"missing_figures": missing_figures,
|
2026-01-06 21:19:40 +08:00
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
"response": response,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
error_msg = f"LLM调用错误: {str(e)}"
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[ERROR] {error_msg}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
self.conversation_history.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"role": "user",
|
|
|
|
|
|
"content": f"发生错误: {error_msg},请重新生成代码。",
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
# 生成最终总结
|
|
|
|
|
|
if self.current_round >= self.max_rounds:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"\n[WARN] 已达到最大轮数 ({self.max_rounds}),分析结束")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
return self._generate_final_report()
|
|
|
|
|
|
|
|
|
|
|
|
def _build_conversation_prompt(self) -> str:
|
|
|
|
|
|
"""构建对话提示词"""
|
|
|
|
|
|
prompt_parts = []
|
|
|
|
|
|
|
|
|
|
|
|
for msg in self.conversation_history:
|
|
|
|
|
|
role = msg["role"]
|
|
|
|
|
|
content = msg["content"]
|
|
|
|
|
|
if role == "user":
|
|
|
|
|
|
prompt_parts.append(f"用户: {content}")
|
|
|
|
|
|
else:
|
|
|
|
|
|
prompt_parts.append(f"助手: {content}")
|
|
|
|
|
|
|
|
|
|
|
|
return "\n\n".join(prompt_parts)
|
|
|
|
|
|
|
|
|
|
|
|
def _generate_final_report(self) -> Dict[str, Any]:
|
|
|
|
|
|
"""生成最终分析报告"""
|
|
|
|
|
|
# 收集所有生成的图片信息
|
|
|
|
|
|
all_figures = []
|
|
|
|
|
|
for result in self.analysis_results:
|
|
|
|
|
|
if result.get("action") == "collect_figures":
|
|
|
|
|
|
all_figures.extend(result.get("collected_figures", []))
|
|
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"\n[CHART] 开始生成最终分析报告...")
|
|
|
|
|
|
print(f"[DIR] 输出目录: {self.session_output_dir}")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
|
|
|
|
|
|
# --- 自动补全/发现图片机制 ---
|
|
|
|
|
|
# 扫描目录下所有的png文件
|
|
|
|
|
|
try:
|
|
|
|
|
|
import glob
|
|
|
|
|
|
existing_pngs = glob.glob(os.path.join(self.session_output_dir, "*.png"))
|
|
|
|
|
|
|
|
|
|
|
|
# 获取已收集的图片路径集合
|
|
|
|
|
|
collected_paths = set()
|
|
|
|
|
|
for fig in all_figures:
|
|
|
|
|
|
if fig.get("file_path"):
|
|
|
|
|
|
collected_paths.add(os.path.abspath(fig.get("file_path")))
|
|
|
|
|
|
|
|
|
|
|
|
# 检查是否有漏网之鱼
|
|
|
|
|
|
for png_path in existing_pngs:
|
|
|
|
|
|
abs_png_path = os.path.abspath(png_path)
|
|
|
|
|
|
if abs_png_path not in collected_paths:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[SEARCH] [自动发现] 补充未显式收集的图片: {os.path.basename(png_path)}")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
all_figures.append({
|
|
|
|
|
|
"figure_number": "Auto",
|
|
|
|
|
|
"filename": os.path.basename(png_path),
|
|
|
|
|
|
"file_path": abs_png_path,
|
|
|
|
|
|
"description": f"自动发现的分析图表: {os.path.basename(png_path)}",
|
|
|
|
|
|
"analysis": "(该图表由系统自动捕获,Agent未提供具体分析文本,请结合图表标题理解)"
|
|
|
|
|
|
})
|
|
|
|
|
|
except Exception as e:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[WARN] 自动发现图片失败: {e}")
|
2026-01-09 16:52:45 +08:00
|
|
|
|
# ---------------------------
|
|
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[NUM] 总轮数: {self.current_round}")
|
|
|
|
|
|
print(f"[GRAPH] 收集图片: {len(all_figures)} 个")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
# 构建用于生成最终报告的提示词
|
|
|
|
|
|
final_report_prompt = self._build_final_report_prompt(all_figures)
|
|
|
|
|
|
|
|
|
|
|
|
try: # 调用LLM生成最终报告
|
|
|
|
|
|
response = self.llm.call(
|
|
|
|
|
|
prompt=final_report_prompt,
|
|
|
|
|
|
system_prompt="你将会接收到一个数据分析任务的最终报告请求,请根据提供的分析结果和图片信息生成完整的分析报告。",
|
|
|
|
|
|
max_tokens=16384, # 设置较大的token限制以容纳完整报告
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-01-09 16:52:45 +08:00
|
|
|
|
# 直接使用LLM响应作为最终报告(因为我们在prompt中要求直接输出Markdown)
|
|
|
|
|
|
final_report_content = response
|
|
|
|
|
|
|
|
|
|
|
|
# 兼容旧逻辑:如果意外返回了YAML,尝试解析
|
|
|
|
|
|
if response.strip().startswith("action:") or "final_report:" in response:
|
|
|
|
|
|
try:
|
|
|
|
|
|
yaml_data = self.llm.parse_yaml_response(response)
|
|
|
|
|
|
if yaml_data.get("action") == "analysis_complete":
|
|
|
|
|
|
final_report_content = yaml_data.get("final_report", response)
|
|
|
|
|
|
except:
|
|
|
|
|
|
pass # 解析失败则保持原样
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print("[OK] 最终报告生成完成")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[ERROR] 生成最终报告时出错: {str(e)}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
final_report_content = f"报告生成失败: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
# 保存最终报告到文件
|
|
|
|
|
|
report_file_path = os.path.join(self.session_output_dir, "最终分析报告.md")
|
|
|
|
|
|
try:
|
|
|
|
|
|
with open(report_file_path, "w", encoding="utf-8") as f:
|
|
|
|
|
|
f.write(final_report_content)
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[DOC] 最终报告已保存至: {report_file_path}")
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
print(f"[ERROR] 保存报告文件失败: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
# 生成可复用脚本
|
|
|
|
|
|
script_path = ""
|
|
|
|
|
|
try:
|
|
|
|
|
|
script_path = generate_reusable_script(
|
|
|
|
|
|
analysis_results=self.analysis_results,
|
|
|
|
|
|
data_files=self.data_files,
|
|
|
|
|
|
session_output_dir=self.session_output_dir,
|
|
|
|
|
|
user_requirement=self.user_requirement
|
|
|
|
|
|
)
|
2026-01-06 19:44:17 +08:00
|
|
|
|
except Exception as e:
|
2026-01-31 18:00:05 +08:00
|
|
|
|
print(f"[WARN] 脚本生成失败: {e}")
|
2026-01-06 19:44:17 +08:00
|
|
|
|
|
|
|
|
|
|
# 返回完整的分析结果
|
|
|
|
|
|
return {
|
|
|
|
|
|
"session_output_dir": self.session_output_dir,
|
|
|
|
|
|
"total_rounds": self.current_round,
|
|
|
|
|
|
"analysis_results": self.analysis_results,
|
|
|
|
|
|
"collected_figures": all_figures,
|
|
|
|
|
|
"conversation_history": self.conversation_history,
|
|
|
|
|
|
"final_report": final_report_content,
|
|
|
|
|
|
"report_file_path": report_file_path,
|
2026-01-31 18:00:05 +08:00
|
|
|
|
"reusable_script_path": script_path,
|
2026-01-06 19:44:17 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def _build_final_report_prompt(self, all_figures: List[Dict[str, Any]]) -> str:
|
|
|
|
|
|
"""构建用于生成最终报告的提示词"""
|
|
|
|
|
|
|
|
|
|
|
|
# 构建图片信息摘要,使用相对路径
|
|
|
|
|
|
figures_summary = ""
|
|
|
|
|
|
if all_figures:
|
|
|
|
|
|
figures_summary = "\n生成的图片及分析:\n"
|
|
|
|
|
|
for i, figure in enumerate(all_figures, 1):
|
|
|
|
|
|
filename = figure.get("filename", "未知文件名")
|
|
|
|
|
|
# 使用相对路径格式,适合在报告中引用
|
|
|
|
|
|
relative_path = f"./{filename}"
|
|
|
|
|
|
figures_summary += f"{i}. {filename}\n"
|
|
|
|
|
|
figures_summary += f" 相对路径: {relative_path}\n"
|
|
|
|
|
|
figures_summary += f" 描述: {figure.get('description', '无描述')}\n"
|
|
|
|
|
|
figures_summary += f" 分析: {figure.get('analysis', '无分析')}\n\n"
|
|
|
|
|
|
else:
|
|
|
|
|
|
figures_summary = "\n本次分析未生成图片。\n"
|
|
|
|
|
|
|
|
|
|
|
|
# 构建代码执行结果摘要(仅包含成功执行的代码块)
|
|
|
|
|
|
code_results_summary = ""
|
|
|
|
|
|
success_code_count = 0
|
|
|
|
|
|
for result in self.analysis_results:
|
|
|
|
|
|
if result.get("action") != "collect_figures" and result.get("code"):
|
|
|
|
|
|
exec_result = result.get("result", {})
|
|
|
|
|
|
if exec_result.get("success"):
|
|
|
|
|
|
success_code_count += 1
|
|
|
|
|
|
code_results_summary += f"代码块 {success_code_count}: 执行成功\n"
|
|
|
|
|
|
if exec_result.get("output"):
|
|
|
|
|
|
code_results_summary += (
|
|
|
|
|
|
f"输出: {exec_result.get('output')[:]}\n\n"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-04-19 21:30:08 +08:00
|
|
|
|
# 构建各轮次证据数据摘要
|
|
|
|
|
|
evidence_summary = ""
|
|
|
|
|
|
if self._session_ref and self._session_ref.rounds:
|
|
|
|
|
|
evidence_parts = []
|
|
|
|
|
|
for rd in self._session_ref.rounds:
|
|
|
|
|
|
round_num = rd.get("round", 0)
|
|
|
|
|
|
summary = rd.get("result_summary", "")
|
|
|
|
|
|
evidence = rd.get("evidence_rows", [])
|
|
|
|
|
|
reasoning = rd.get("reasoning", "")
|
|
|
|
|
|
part = f"第{round_num}轮: {summary}"
|
|
|
|
|
|
if reasoning:
|
|
|
|
|
|
part += f"\n 推理: {reasoning[:200]}"
|
|
|
|
|
|
if evidence:
|
|
|
|
|
|
part += f"\n 数据样本({len(evidence)}行): {json.dumps(evidence[:3], ensure_ascii=False, default=str)}"
|
|
|
|
|
|
evidence_parts.append(part)
|
|
|
|
|
|
evidence_summary = "\n".join(evidence_parts)
|
|
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
# 使用 prompts.py 中的统一提示词模板,并添加相对路径使用说明
|
|
|
|
|
|
prompt = final_report_system_prompt.format(
|
|
|
|
|
|
current_round=self.current_round,
|
|
|
|
|
|
session_output_dir=self.session_output_dir,
|
|
|
|
|
|
data_profile=self.data_profile, # 注入数据画像
|
|
|
|
|
|
figures_summary=figures_summary,
|
|
|
|
|
|
code_results_summary=code_results_summary,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-04-19 21:30:08 +08:00
|
|
|
|
# Append evidence data from all rounds for evidence annotation
|
|
|
|
|
|
if evidence_summary:
|
|
|
|
|
|
prompt += f"""
|
|
|
|
|
|
|
|
|
|
|
|
**各轮次分析证据数据 (Evidence by Round)**:
|
|
|
|
|
|
以下是每轮分析的结果摘要和数据样本,请在报告中使用 `<!-- evidence:round_N -->` 标注引用了哪一轮的数据:
|
|
|
|
|
|
|
|
|
|
|
|
{evidence_summary}
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
# 在提示词中明确要求使用相对路径
|
|
|
|
|
|
prompt += """
|
|
|
|
|
|
|
2026-01-31 18:00:05 +08:00
|
|
|
|
[FOLDER] **图片路径使用说明**:
|
2026-01-06 19:44:17 +08:00
|
|
|
|
报告和图片都在同一目录下,请在报告中使用相对路径引用图片:
|
2026-04-19 21:30:08 +08:00
|
|
|
|
- 格式:
|
2026-01-06 19:44:17 +08:00
|
|
|
|
- 示例:
|
2026-04-19 21:30:08 +08:00
|
|
|
|
- 注意:必须使用实际生成的图片文件名,严禁使用占位符
|
2026-01-06 19:44:17 +08:00
|
|
|
|
"""
|
|
|
|
|
|
|
2026-04-20 09:50:35 +08:00
|
|
|
|
# Append actual data files list so the LLM uses real filenames in the report
|
|
|
|
|
|
if self._session_ref and self._session_ref.data_files:
|
|
|
|
|
|
data_files_summary = "\n**已生成的数据文件列表** (请在报告中使用这些实际文件名,替换模板中的占位文件名如 [4-1TSP问题聚类.xlsx]):\n"
|
|
|
|
|
|
for df_meta in self._session_ref.data_files:
|
|
|
|
|
|
fname = df_meta.get("filename", "")
|
|
|
|
|
|
desc = df_meta.get("description", "")
|
|
|
|
|
|
rows = df_meta.get("rows", 0)
|
|
|
|
|
|
data_files_summary += f"- {fname} ({rows}行): {desc}\n"
|
|
|
|
|
|
data_files_summary += "\n注意:报告模板中的 `[4-1TSP问题聚类.xlsx]` 等占位文件名必须替换为上述实际文件名。如果某类聚类文件未生成,请说明原因(如数据量不足或该分类不适用),不要保留占位符。\n"
|
|
|
|
|
|
prompt += data_files_summary
|
|
|
|
|
|
|
2026-01-06 19:44:17 +08:00
|
|
|
|
return prompt
|
|
|
|
|
|
|
|
|
|
|
|
def reset(self):
|
|
|
|
|
|
"""重置智能体状态"""
|
|
|
|
|
|
self.conversation_history = []
|
|
|
|
|
|
self.analysis_results = []
|
|
|
|
|
|
self.current_round = 0
|
|
|
|
|
|
self.executor.reset_environment()
|