Files
iov_data_analysis_agent/data_analysis_agent.py

1000 lines
45 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
简化的 Notebook 数据分析智能体
仅包含用户和助手两个角
2. 图片必须保存到指定的会话目录中输出绝对路径禁止使用plt.show()
3. 表格输出控制超过15行只显示前5行和后5行
4. 强制使用SimHei字体plt.rcParams['font.sans-serif'] = ['SimHei']
5. 输出格式严格使用YAML共享上下文的单轮对话模式
"""
import os
import json
import re
import yaml
from typing import Dict, Any, List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.create_session_dir import create_session_output_dir
from utils.format_execution_result import format_execution_result
from utils.extract_code import extract_code_from_response
from utils.data_loader import load_and_profile_data, load_data_chunked, load_and_profile_data_smart
from utils.llm_helper import LLMHelper
from utils.code_executor import CodeExecutor
from utils.script_generator import generate_reusable_script
from utils.data_privacy import build_safe_profile, build_local_profile, sanitize_execution_feedback, generate_enriched_hint
from config.llm_config import LLMConfig
from config.app_config import app_config
from prompts import data_analysis_system_prompt, final_report_system_prompt, data_analysis_followup_prompt
# Regex patterns that indicate a data-context error (column/variable/DataFrame issues)
DATA_CONTEXT_PATTERNS = [
r"KeyError:\s*['\"](.+?)['\"]",
r"ValueError.*(?:column|col|field)",
r"NameError.*(?:df|data|frame)",
r"(?:empty|no\s+data|0\s+rows)",
r"IndexError.*(?:out of range|out of bounds)",
]
class DataAnalysisAgent:
"""
数据分析智能体
职责:
- 接收用户自然语言需求
- 生成Python分析代码
- 执行代码并收集结果
- 基于执行结果继续生成后续分析代码
"""
def __init__(
self,
llm_config: LLMConfig = None,
output_dir: str = "outputs",
max_rounds: int = 20,
force_max_rounds: bool = False,
):
"""
初始化智能体
Args:
config: LLM配置
output_dir: 输出目录
max_rounds: 最大对话轮数
force_max_rounds: 是否强制运行到最大轮数忽略AI的完成信号
"""
self.config = llm_config or LLMConfig()
self.llm = LLMHelper(self.config)
self.base_output_dir = output_dir
self.max_rounds = max_rounds
self.force_max_rounds = force_max_rounds
# 对话历史和上下文
self.conversation_history = []
self.analysis_results = []
self.current_round = 0
self.session_output_dir = None
self.executor = None
self.data_profile = "" # 存储数据画像(完整版,本地使用)
self.data_profile_safe = "" # 存储安全画像发给LLM
self.data_files = [] # 存储数据文件列表
self.user_requirement = "" # 存储用户需求
self._progress_callback = None # 进度回调函数
self._session_ref = None # Reference to SessionData for round tracking
def set_session_ref(self, session):
"""Set a reference to the SessionData instance for appending round data.
Args:
session: The SessionData instance for the current analysis session.
"""
self._session_ref = session
def set_progress_callback(self, callback):
"""Set a callback function(current_round, max_rounds, message) for progress updates."""
self._progress_callback = callback
def _summarize_result(self, result: Dict[str, Any]) -> str:
"""Produce a one-line summary from a code execution result.
Args:
result: The execution result dict from CodeExecutor.
Returns:
A concise summary string, e.g. "执行成功,输出 DataFrame (150行×8列)"
or "执行失败: KeyError: 'col_x'".
"""
if result.get("success"):
evidence_rows = result.get("evidence_rows", [])
if evidence_rows:
num_rows = len(evidence_rows)
num_cols = len(evidence_rows[0]) if evidence_rows else 0
# Check auto_exported_files for more accurate row/col counts
auto_files = result.get("auto_exported_files", [])
if auto_files:
last_file = auto_files[-1]
num_rows = last_file.get("rows", num_rows)
num_cols = last_file.get("cols", num_cols)
return f"执行成功,输出 DataFrame ({num_rows}行×{num_cols}列)"
output = result.get("output", "")
if output:
first_line = output.strip().split("\n")[0][:80]
return f"执行成功: {first_line}"
return "执行成功"
else:
error = result.get("error", "未知错误")
if len(error) > 100:
error = error[:100] + "..."
return f"执行失败: {error}"
def _process_response(self, response: str) -> Dict[str, Any]:
"""
统一处理LLM响应判断行动类型并执行相应操作
Args:
response: LLM的响应内容
Returns:
处理结果字典
"""
try:
yaml_data = self.llm.parse_yaml_response(response)
action = yaml_data.get("action", "generate_code")
print(f"[TARGET] 检测到动作: {action}")
if action == "analysis_complete":
return self._handle_analysis_complete(response, yaml_data)
elif action == "collect_figures":
return self._handle_collect_figures(response, yaml_data)
elif action == "generate_code":
return self._handle_generate_code(response, yaml_data)
else:
print(f"[WARN] 未知动作类型: {action}按generate_code处理")
return self._handle_generate_code(response, yaml_data)
except Exception as e:
print(f"[WARN] 解析响应失败: {str(e)}尝试提取代码并按generate_code处理")
# 即使YAML解析失败也尝试提取代码
extracted_code = extract_code_from_response(response)
if extracted_code:
return self._handle_generate_code(response, {"code": extracted_code})
return self._handle_generate_code(response, {})
def _handle_analysis_complete(
self, response: str, yaml_data: Dict[str, Any]
) -> Dict[str, Any]:
"""处理分析完成动作"""
print("[OK] 分析任务完成")
final_report = yaml_data.get("final_report", "分析完成,无最终报告")
return {
"action": "analysis_complete",
"final_report": final_report,
"response": response,
"continue": False,
}
def _handle_collect_figures(
self, response: str, yaml_data: Dict[str, Any]
) -> Dict[str, Any]:
"""处理图片收集动作"""
print("[CHART] 开始收集图片")
figures_to_collect = yaml_data.get("figures_to_collect", [])
collected_figures = []
# 使用seen_paths集合来去重防止重复收集
seen_paths = set()
for figure_info in figures_to_collect:
figure_number = figure_info.get("figure_number", "未知")
# 确保figure_number不为None时才用于文件名
if figure_number != "未知":
default_filename = f"figure_{figure_number}.png"
else:
default_filename = "figure_unknown.png"
filename = figure_info.get("filename", default_filename)
file_path = figure_info.get("file_path", "") # 获取具体的文件路径
description = figure_info.get("description", "")
analysis = figure_info.get("analysis", "")
print(f"[GRAPH] 收集图片 {figure_number}: {filename}")
print(f" [DIR] 路径: {file_path}")
print(f" [NOTE] 描述: {description}")
print(f" [SEARCH] 分析: {analysis}")
# 验证文件是否存在
# 只有文件真正存在时才加入列表,防止报告出现裂图
if file_path and os.path.exists(file_path):
# 检查是否已经收集过该路径
abs_path = os.path.abspath(file_path)
if abs_path not in seen_paths:
print(f" [OK] 文件存在: {file_path}")
# 记录图片信息
collected_figures.append(
{
"figure_number": figure_number,
"filename": filename,
"file_path": file_path,
"description": description,
"analysis": analysis,
}
)
seen_paths.add(abs_path)
else:
print(f" [WARN] 跳过重复图片: {file_path}")
else:
if file_path:
print(f" [WARN] 文件不存在: {file_path}")
else:
print(f" [WARN] 未提供文件路径")
return {
"action": "collect_figures",
"collected_figures": collected_figures,
"response": response,
"continue": True,
}
def _handle_generate_code(
self, response: str, yaml_data: Dict[str, Any]
) -> Dict[str, Any]:
"""处理代码生成和执行动作"""
# 从YAML数据中获取代码更准确
code = yaml_data.get("code", "")
reasoning = yaml_data.get("reasoning", "")
# 如果YAML中没有代码尝试从响应中提取
if not code:
code = extract_code_from_response(response)
# 二次清洗防止YAML中解析出的code包含markdown标记
if code:
code = code.strip()
if code.startswith("```"):
# 去除开头的 ```python 或 ```
code = re.sub(r"^```[a-zA-Z]*\n", "", code)
# 去除结尾的 ```
code = re.sub(r"\n```$", "", code)
code = code.strip()
if code:
print(f"[TOOL] 执行代码:\n{code}")
print("-" * 40)
# 执行代码
result = self.executor.execute_code(code)
# 格式化执行结果
feedback = format_execution_result(result)
print(f"[LIST] 执行反馈:\n{feedback}")
return {
"action": "generate_code",
"code": code,
"reasoning": reasoning,
"result": result,
"feedback": feedback,
"response": response,
"continue": True,
}
else:
# 如果没有代码说明LLM响应格式有问题需要重新生成
print("[WARN] 未从响应中提取到可执行代码要求LLM重新生成")
return {
"action": "invalid_response",
"reasoning": reasoning,
"error": "响应中缺少可执行代码",
"response": response,
"continue": True,
}
def _classify_error(self, error_message: str) -> str:
"""Classify execution error as data-context or other.
Inspects the error message against DATA_CONTEXT_PATTERNS to determine
if the error is related to data context (missing columns, undefined
data variables, empty DataFrames, etc.).
Args:
error_message: The error message string from code execution.
Returns:
"data_context" if the error matches a data-context pattern,
"other" otherwise.
"""
for pattern in DATA_CONTEXT_PATTERNS:
if re.search(pattern, error_message, re.IGNORECASE):
return "data_context"
return "other"
def _trim_conversation_history(self):
"""Apply sliding window trimming to conversation history.
Retains the first user message (original requirement + Safe_Profile) at
index 0, generates a compressed summary of old messages, and keeps only
the most recent ``conversation_window_size`` message pairs in full.
"""
window_size = app_config.conversation_window_size
max_messages = window_size * 2 # pairs of user+assistant messages
if len(self.conversation_history) <= max_messages:
return # No trimming needed
first_message = self.conversation_history[0] # Always retain
# Determine trim boundary: skip first message + possible existing summary
start_idx = 1
has_existing_summary = (
len(self.conversation_history) > 1
and self.conversation_history[1]["role"] == "user"
and self.conversation_history[1]["content"].startswith("[分析摘要]")
)
if has_existing_summary:
start_idx = 2
# Messages to trim vs keep
messages_to_consider = self.conversation_history[start_idx:]
messages_to_trim = messages_to_consider[:-max_messages]
messages_to_keep = messages_to_consider[-max_messages:]
if not messages_to_trim:
return
# Generate summary of trimmed messages
summary = self._compress_trimmed_messages(messages_to_trim)
# Rebuild history: first_message + summary + recent messages
self.conversation_history = [first_message]
if summary:
self.conversation_history.append({"role": "user", "content": summary})
self.conversation_history.extend(messages_to_keep)
def _compress_trimmed_messages(self, messages: list) -> str:
"""Compress trimmed messages into a concise summary string.
Extracts the action type from each assistant message, the execution
outcome (success / failure), and completed SOP stages from the
subsequent user feedback message. Code blocks and raw execution
output are excluded.
The summary explicitly lists completed SOP stages so the LLM does
not restart from stage 1 after conversation trimming.
Args:
messages: List of conversation message dicts to compress.
Returns:
A summary string prefixed with ``[分析摘要]``.
"""
summary_parts = ["[分析摘要] 以下是之前分析轮次的概要:"]
round_num = 0
completed_stages = set()
# SOP stage keywords to detect from assistant messages
stage_keywords = {
"阶段1": "数据探索与加载",
"阶段2": "基础分布分析",
"阶段3": "时序与来源分析",
"阶段4": "深度交叉分析",
"阶段5": "效率分析",
"阶段6": "高级挖掘",
}
for msg in messages:
content = msg["content"]
if msg["role"] == "assistant":
round_num += 1
# Extract action type from YAML-like content
action = "generate_code"
if "action: \"collect_figures\"" in content or "action: collect_figures" in content:
action = "collect_figures"
elif "action: \"analysis_complete\"" in content or "action: analysis_complete" in content:
action = "analysis_complete"
# Detect completed SOP stages
for stage_key, stage_name in stage_keywords.items():
if stage_key in content or stage_name in content:
completed_stages.add(f"{stage_key}: {stage_name}")
summary_parts.append(f"- 轮次{round_num}: 动作={action}")
elif msg["role"] == "user" and "代码执行反馈" in content:
success = "失败" if "[ERROR]" in content or "执行错误" in content else "成功"
if summary_parts and summary_parts[-1].startswith("- 轮次"):
summary_parts[-1] += f", 执行结果={success}"
# Append completed stages so the LLM knows where to continue
if completed_stages:
summary_parts.append("")
summary_parts.append("**已完成的SOP阶段** (请勿重复执行):")
for stage in sorted(completed_stages):
summary_parts.append(f"{stage}")
summary_parts.append("")
summary_parts.append("请从下一个未完成的阶段继续,不要重新执行已完成的阶段。")
return "\n".join(summary_parts)
def _profile_files_parallel(self, file_paths: list) -> tuple:
"""Profile multiple files concurrently using ThreadPoolExecutor.
Each file is profiled independently via ``build_safe_profile`` and
``build_local_profile``. Results are collected and merged. If any
individual file fails, an error entry is included for that file and
profiling continues for the remaining files.
Args:
file_paths: List of file paths to profile.
Returns:
A tuple ``(safe_profile, local_profile)`` of merged markdown strings.
"""
max_workers = app_config.max_parallel_profiles
safe_profiles = []
local_profiles = []
def profile_single(path):
safe = build_safe_profile([path])
local = build_local_profile([path])
return path, safe, local
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {executor.submit(profile_single, p): p for p in file_paths}
for future in as_completed(futures):
path = futures[future]
try:
_, safe, local = future.result()
safe_profiles.append(safe)
local_profiles.append(local)
except Exception as e:
error_entry = f"## 文件: {os.path.basename(path)}\n[ERROR] 分析失败: {e}\n\n"
safe_profiles.append(error_entry)
local_profiles.append(error_entry)
return "\n".join(safe_profiles), "\n".join(local_profiles)
def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None, reset_session: bool = True, max_rounds: int = None, template_name: str = None) -> Dict[str, Any]:
"""
开始分析流程
Args:
user_input: 用户的自然语言需求
files: 数据文件路径列表
session_output_dir: 指定的会话输出目录(可选)
reset_session: 是否重置会话 (True: 新开启分析; False: 在现有上下文中继续)
max_rounds: 本次分析的最大轮数 (可选,如果不填则使用默认值)
template_name: 分析模板名称 (可选,如果提供则使用模板引导分析)
Returns:
分析结果字典
"""
# 确定本次运行的轮数限制
current_max_rounds = max_rounds if max_rounds is not None else self.max_rounds
# Template integration: prepend template prompt to user input if provided
if template_name:
from utils.analysis_templates import get_template
template = get_template(template_name) # Raises ValueError if invalid
template_prompt = template.get_full_prompt()
user_input = f"{template_prompt}\n\n{user_input}"
if reset_session:
# --- 初始化新会话 ---
self.conversation_history = []
self.analysis_results = []
self.current_round = 0
self.data_files = files or [] # 保存数据文件列表
self.user_requirement = user_input # 保存用户需求
# 创建本次分析的专用输出目录
if session_output_dir:
self.session_output_dir = session_output_dir
else:
self.session_output_dir = create_session_output_dir(
self.base_output_dir, user_input
)
# 初始化代码执行器,使用会话目录
self.executor = CodeExecutor(self.session_output_dir)
# 设置会话目录变量到执行环境中
self.executor.set_variable("session_output_dir", self.session_output_dir)
# 生成数据画像分级安全级发给LLM完整级留本地
data_profile_safe = ""
data_profile_local = ""
if files:
print("[SEARCH] 正在生成数据画像...")
try:
if len(files) > 1:
# Parallel profiling for multiple files
data_profile_safe, data_profile_local = self._profile_files_parallel(files)
else:
data_profile_safe = build_safe_profile(files)
data_profile_local = build_local_profile(files)
print("[OK] 数据画像生成完毕(安全级 + 本地级)")
except Exception as e:
print(f"[WARN] 数据画像生成失败: {e}")
# Expose chunked iterators for large files in the Code_Executor namespace
for fp in files:
try:
if os.path.exists(fp):
file_size_mb = os.path.getsize(fp) / (1024 * 1024)
if file_size_mb > app_config.max_file_size_mb:
var_name = "chunked_iter_" + os.path.splitext(os.path.basename(fp))[0]
# Store a factory so the iterator can be re-created
self.executor.set_variable(var_name, lambda p=fp: load_data_chunked(p))
print(f"[OK] 大文件 {os.path.basename(fp)} 的分块迭代器已注入为 {var_name}()")
except Exception as e:
print(f"[WARN] 注入分块迭代器失败 ({os.path.basename(fp)}): {e}")
# 安全画像发给LLM完整画像留给最终报告生成
self.data_profile = data_profile_local # 本地完整版用于最终报告
self.data_profile_safe = data_profile_safe # 安全版用于LLM对话
# 构建初始prompt只发送安全级画像给LLM
initial_prompt = f"""用户需求: {user_input}"""
if files:
initial_prompt += f"\n数据文件: {', '.join(files)}"
if data_profile_safe:
initial_prompt += f"\n\n{data_profile_safe}\n\n请根据上述【数据结构概览】中的列名、数据类型和特征描述来制定分析策略。先通过代码探索数据的实际分布,再进行深度分析。"
print(f"[START] 开始数据分析任务")
print(f"[NOTE] 用户需求: {user_input}")
if files:
print(f"[FOLDER] 数据文件: {', '.join(files)}")
print(f"[DIR] 输出目录: {self.session_output_dir}")
# 添加到对话历史
self.conversation_history.append({"role": "user", "content": initial_prompt})
else:
# --- 继续现有会话 ---
# 如果是追问,且没有指定轮数,默认减少轮数,避免过度分析
if max_rounds is None:
current_max_rounds = 10 # 追问通常不需要那么长的思考链10轮足够
print(f"\n[START] 继续分析任务 (追问模式)")
print(f"[NOTE] 后续需求: {user_input}")
# 重置当前轮数计数器,以便给新任务足够的轮次
self.current_round = 0
# 添加到对话历史
# 提示Agent这是后续追问可以简化步骤
follow_up_prompt = f"后续需求: {user_input}\n(注意这是后续追问请直接针对该问题进行分析无需从头开始执行完整SOP。)"
self.conversation_history.append({"role": "user", "content": follow_up_prompt})
print(f"[NUM] 本次最大轮数: {current_max_rounds}")
if self.force_max_rounds:
print(f"[FAST] 强制模式: 将运行满 {current_max_rounds}忽略AI完成信号")
print("=" * 60)
# 保存原始 max_rounds 以便恢复(虽然 analyze 结束后不需要恢复,但为了逻辑严谨)
original_max_rounds = self.max_rounds
self.max_rounds = current_max_rounds
# 初始化连续失败计数器
consecutive_failures = 0
# Per-round data-context retry counter
data_context_retries = 0
last_retry_round = 0
while self.current_round < self.max_rounds:
self.current_round += 1
# Notify progress callback
if self._progress_callback:
self._progress_callback(self.current_round, self.max_rounds, f"{self.current_round}/{self.max_rounds}轮分析中...")
# Reset data-context retry counter when entering a new round
if self.current_round != last_retry_round:
data_context_retries = 0
# Trim conversation history after the first round to bound token usage
if self.current_round > 1:
self._trim_conversation_history()
print(f"\n[LOOP] 第 {self.current_round} 轮分析")
# 调用LLM生成响应
try: # 获取当前执行环境的变量信息
notebook_variables = self.executor.get_environment_info()
# Select prompt based on mode
if self.current_round == 1 and not reset_session:
# For the first round of a follow-up session, use the specialized prompt
base_system_prompt = data_analysis_followup_prompt
elif not reset_session and self.current_round > 1:
# For subsequent rounds in follow-up, continue using the follow-up context
# or maybe just the standard one is fine as long as SOP isn't fully enforced?
# Let's stick to the follow-up prompt to prevent SOP regression
base_system_prompt = data_analysis_followup_prompt
else:
base_system_prompt = data_analysis_system_prompt
# 格式化系统提示词填入动态的notebook变量信息
formatted_system_prompt = base_system_prompt.format(
notebook_variables=notebook_variables
)
print(f"[DEBUG] [DEBUG] System Prompt Head:\n{formatted_system_prompt[:500]}...\n[...]")
print(f"[DEBUG] [DEBUG] System Prompt Rules Check: 'stop_words' in prompt? {'stop_words' in formatted_system_prompt}")
response = self.llm.call(
prompt=self._build_conversation_prompt(),
system_prompt=formatted_system_prompt,
)
print(f"[AI] 助手响应:\n{response}")
# 使用统一的响应处理方法
process_result = self._process_response(response)
# 根据处理结果决定是否继续(仅在非强制模式下)
if process_result.get("action") == "invalid_response":
consecutive_failures += 1
print(f"[WARN] 连续失败次数: {consecutive_failures}/3")
if consecutive_failures >= 3:
print(f"[ERROR] 连续3次无法获取有效响应分析终止。请检查网络或配置。")
break
else:
consecutive_failures = 0 # 重置计数器
if not self.force_max_rounds and not process_result.get(
"continue", True
):
print(f"\n[OK] 分析完成!")
break
# 添加到对话历史
self.conversation_history.append(
{"role": "assistant", "content": response}
)
# 根据动作类型添加不同的反馈
if process_result["action"] == "generate_code":
feedback = process_result.get("feedback", "")
result = process_result.get("result", {})
execution_failed = not result.get("success", True)
# --- Data-context retry logic ---
if execution_failed:
error_output = result.get("error", "") or feedback
error_class = self._classify_error(error_output)
if error_class == "data_context" and data_context_retries < app_config.max_data_context_retries:
data_context_retries += 1
last_retry_round = self.current_round
print(f"[RETRY] 数据上下文错误,重试 {data_context_retries}/{app_config.max_data_context_retries}")
# Generate enriched hint from safe profile
enriched_hint = generate_enriched_hint(error_output, self.data_profile_safe)
# Add enriched hint to conversation history (assistant response already added above)
self.conversation_history.append(
{"role": "user", "content": enriched_hint}
)
# Record the failed attempt
self.analysis_results.append(
{
"round": self.current_round,
"code": process_result.get("code", ""),
"result": result,
"response": response,
"retry": True,
}
)
# Retry within the same round: decrement round counter so the
# outer loop's increment brings us back to the same round number
self.current_round -= 1
continue
# Normal feedback path (no retry or non-data-context error or at limit)
safe_feedback = sanitize_execution_feedback(feedback)
self.conversation_history.append(
{"role": "user", "content": f"代码执行反馈:\n{safe_feedback}"}
)
# 记录分析结果
self.analysis_results.append(
{
"round": self.current_round,
"code": process_result.get("code", ""),
"result": process_result.get("result", {}),
"response": response,
}
)
# --- Construct Round_Data and append to session ---
result = process_result.get("result", {})
round_data = {
"round": self.current_round,
"reasoning": process_result.get("reasoning", ""),
"code": process_result.get("code", ""),
"result_summary": self._summarize_result(result),
"evidence_rows": result.get("evidence_rows", []),
"raw_log": feedback,
"auto_exported_files": result.get("auto_exported_files", []),
"prompt_saved_files": result.get("prompt_saved_files", []),
}
if self._session_ref:
self._session_ref.rounds.append(round_data)
# Merge file metadata into SessionData.data_files
for f in round_data.get("auto_exported_files", []):
if f.get("skipped"):
continue # Large DataFrame — not written to disk
self._session_ref.data_files.append({
"filename": f.get("filename", ""),
"description": f"自动导出: {f.get('variable_name', '')}",
"rows": f.get("rows", 0),
"cols": f.get("cols", 0),
"columns": f.get("columns", []),
"size_bytes": 0,
"source": "auto",
})
for f in round_data.get("prompt_saved_files", []):
self._session_ref.data_files.append({
"filename": f.get("filename", ""),
"description": f.get("description", ""),
"rows": f.get("rows", 0),
"cols": 0,
"columns": [],
"size_bytes": 0,
"source": "prompt",
})
elif process_result["action"] == "collect_figures":
# 记录图片收集结果
collected_figures = process_result.get("collected_figures", [])
missing_figures = process_result.get("missing_figures", [])
feedback = f"已收集 {len(collected_figures)} 个有效图片及其分析。"
if missing_figures:
feedback += f"\n[WARN] 以下图片未找到,请检查代码是否成功保存了这些图片: {missing_figures}"
self.conversation_history.append(
{
"role": "user",
"content": f"图片收集反馈:\n{feedback}\n请继续下一步分析。",
}
)
# 记录到分析结果中
self.analysis_results.append(
{
"round": self.current_round,
"action": "collect_figures",
"collected_figures": collected_figures,
"missing_figures": missing_figures,
"response": response,
}
)
except Exception as e:
error_msg = f"LLM调用错误: {str(e)}"
print(f"[ERROR] {error_msg}")
self.conversation_history.append(
{
"role": "user",
"content": f"发生错误: {error_msg},请重新生成代码。",
}
)
# 生成最终总结
if self.current_round >= self.max_rounds:
print(f"\n[WARN] 已达到最大轮数 ({self.max_rounds}),分析结束")
return self._generate_final_report()
def _build_conversation_prompt(self) -> str:
"""构建对话提示词"""
prompt_parts = []
for msg in self.conversation_history:
role = msg["role"]
content = msg["content"]
if role == "user":
prompt_parts.append(f"用户: {content}")
else:
prompt_parts.append(f"助手: {content}")
return "\n\n".join(prompt_parts)
def _generate_final_report(self) -> Dict[str, Any]:
"""生成最终分析报告"""
# 收集所有生成的图片信息
all_figures = []
for result in self.analysis_results:
if result.get("action") == "collect_figures":
all_figures.extend(result.get("collected_figures", []))
print(f"\n[CHART] 开始生成最终分析报告...")
print(f"[DIR] 输出目录: {self.session_output_dir}")
# --- 自动补全/发现图片机制 ---
# 扫描目录下所有的png文件
try:
import glob
existing_pngs = glob.glob(os.path.join(self.session_output_dir, "*.png"))
# 获取已收集的图片路径集合
collected_paths = set()
for fig in all_figures:
if fig.get("file_path"):
collected_paths.add(os.path.abspath(fig.get("file_path")))
# 检查是否有漏网之鱼
for png_path in existing_pngs:
abs_png_path = os.path.abspath(png_path)
if abs_png_path not in collected_paths:
print(f"[SEARCH] [自动发现] 补充未显式收集的图片: {os.path.basename(png_path)}")
all_figures.append({
"figure_number": "Auto",
"filename": os.path.basename(png_path),
"file_path": abs_png_path,
"description": f"自动发现的分析图表: {os.path.basename(png_path)}",
"analysis": "该图表由系统自动捕获Agent未提供具体分析文本请结合图表标题理解"
})
except Exception as e:
print(f"[WARN] 自动发现图片失败: {e}")
# ---------------------------
print(f"[NUM] 总轮数: {self.current_round}")
print(f"[GRAPH] 收集图片: {len(all_figures)}")
# 构建用于生成最终报告的提示词
final_report_prompt = self._build_final_report_prompt(all_figures)
try: # 调用LLM生成最终报告
response = self.llm.call(
prompt=final_report_prompt,
system_prompt="你将会接收到一个数据分析任务的最终报告请求,请根据提供的分析结果和图片信息生成完整的分析报告。",
max_tokens=16384, # 设置较大的token限制以容纳完整报告
)
# 直接使用LLM响应作为最终报告因为我们在prompt中要求直接输出Markdown
final_report_content = response
# 兼容旧逻辑如果意外返回了YAML尝试解析
if response.strip().startswith("action:") or "final_report:" in response:
try:
yaml_data = self.llm.parse_yaml_response(response)
if yaml_data.get("action") == "analysis_complete":
final_report_content = yaml_data.get("final_report", response)
except:
pass # 解析失败则保持原样
print("[OK] 最终报告生成完成")
except Exception as e:
print(f"[ERROR] 生成最终报告时出错: {str(e)}")
final_report_content = f"报告生成失败: {str(e)}"
# 保存最终报告到文件
report_file_path = os.path.join(self.session_output_dir, "最终分析报告.md")
try:
with open(report_file_path, "w", encoding="utf-8") as f:
f.write(final_report_content)
print(f"[DOC] 最终报告已保存至: {report_file_path}")
except Exception as e:
print(f"[ERROR] 保存报告文件失败: {str(e)}")
# 生成可复用脚本
script_path = ""
try:
script_path = generate_reusable_script(
analysis_results=self.analysis_results,
data_files=self.data_files,
session_output_dir=self.session_output_dir,
user_requirement=self.user_requirement
)
except Exception as e:
print(f"[WARN] 脚本生成失败: {e}")
# 返回完整的分析结果
return {
"session_output_dir": self.session_output_dir,
"total_rounds": self.current_round,
"analysis_results": self.analysis_results,
"collected_figures": all_figures,
"conversation_history": self.conversation_history,
"final_report": final_report_content,
"report_file_path": report_file_path,
"reusable_script_path": script_path,
}
def _build_final_report_prompt(self, all_figures: List[Dict[str, Any]]) -> str:
"""构建用于生成最终报告的提示词"""
# 构建图片信息摘要,使用相对路径
figures_summary = ""
if all_figures:
figures_summary = "\n生成的图片及分析:\n"
for i, figure in enumerate(all_figures, 1):
filename = figure.get("filename", "未知文件名")
# 使用相对路径格式,适合在报告中引用
relative_path = f"./{filename}"
figures_summary += f"{i}. {filename}\n"
figures_summary += f" 相对路径: {relative_path}\n"
figures_summary += f" 描述: {figure.get('description', '无描述')}\n"
figures_summary += f" 分析: {figure.get('analysis', '无分析')}\n\n"
else:
figures_summary = "\n本次分析未生成图片。\n"
# 构建代码执行结果摘要(仅包含成功执行的代码块)
code_results_summary = ""
success_code_count = 0
for result in self.analysis_results:
if result.get("action") != "collect_figures" and result.get("code"):
exec_result = result.get("result", {})
if exec_result.get("success"):
success_code_count += 1
code_results_summary += f"代码块 {success_code_count}: 执行成功\n"
if exec_result.get("output"):
code_results_summary += (
f"输出: {exec_result.get('output')[:]}\n\n"
)
# 构建各轮次证据数据摘要
evidence_summary = ""
if self._session_ref and self._session_ref.rounds:
evidence_parts = []
for rd in self._session_ref.rounds:
round_num = rd.get("round", 0)
summary = rd.get("result_summary", "")
evidence = rd.get("evidence_rows", [])
reasoning = rd.get("reasoning", "")
part = f"{round_num}轮: {summary}"
if reasoning:
part += f"\n 推理: {reasoning[:200]}"
if evidence:
part += f"\n 数据样本({len(evidence)}行): {json.dumps(evidence[:3], ensure_ascii=False, default=str)}"
evidence_parts.append(part)
evidence_summary = "\n".join(evidence_parts)
# 使用 prompts.py 中的统一提示词模板,并添加相对路径使用说明
prompt = final_report_system_prompt.format(
current_round=self.current_round,
session_output_dir=self.session_output_dir,
data_profile=self.data_profile, # 注入数据画像
figures_summary=figures_summary,
code_results_summary=code_results_summary,
)
# Append evidence data from all rounds for evidence annotation
if evidence_summary:
prompt += f"""
**各轮次分析证据数据 (Evidence by Round)**
以下是每轮分析的结果摘要和数据样本,请在报告中使用 `<!-- evidence:round_N -->` 标注引用了哪一轮的数据:
{evidence_summary}
"""
# 在提示词中明确要求使用相对路径
prompt += """
[FOLDER] **图片路径使用说明**
报告和图片都在同一目录下,请在报告中使用相对路径引用图片:
- 格式:![图片描述](./具体图片名称.png)
- 示例:![营业总收入趋势](./营业总收入趋势.png)
- 注意:必须使用实际生成的图片文件名,严禁使用占位符
"""
# Append actual data files list so the LLM uses real filenames in the report
if self._session_ref and self._session_ref.data_files:
data_files_summary = "\n**已生成的数据文件列表** (请在报告中使用这些实际文件名,替换模板中的占位文件名如 [4-1TSP问题聚类.xlsx])\n"
for df_meta in self._session_ref.data_files:
fname = df_meta.get("filename", "")
desc = df_meta.get("description", "")
rows = df_meta.get("rows", 0)
data_files_summary += f"- {fname} ({rows}行): {desc}\n"
data_files_summary += "\n注意:报告模板中的 `[4-1TSP问题聚类.xlsx]` 等占位文件名必须替换为上述实际文件名。如果某类聚类文件未生成,请说明原因(如数据量不足或该分类不适用),不要保留占位符。\n"
prompt += data_files_summary
return prompt
def reset(self):
"""重置智能体状态"""
self.conversation_history = []
self.analysis_results = []
self.current_round = 0
self.executor.reset_environment()