# -*- coding: utf-8 -*-
"""Simplified notebook data-analysis agent.

Only two roles are involved (user and assistant). Key conventions enforced
through the prompts:
2. Figures must be saved into the designated session directory and reported
   with absolute paths; ``plt.show()`` is forbidden.
3. Table output control: tables longer than 15 rows show only the first 5
   and last 5 rows.
4. The SimHei font is forced: ``plt.rcParams['font.sans-serif'] = ['SimHei']``.
5. Output strictly follows the single-turn YAML shared-context format.
"""

import os
import json
import re
import yaml
from typing import Dict, Any, List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

from utils.create_session_dir import create_session_output_dir
from utils.format_execution_result import format_execution_result
from utils.extract_code import extract_code_from_response
from utils.data_loader import load_and_profile_data, load_data_chunked, load_and_profile_data_smart
from utils.llm_helper import LLMHelper
from utils.code_executor import CodeExecutor
from utils.script_generator import generate_reusable_script
from utils.data_privacy import build_safe_profile, build_local_profile, sanitize_execution_feedback, generate_enriched_hint
from config.llm_config import LLMConfig
from config.app_config import app_config
from prompts import data_analysis_system_prompt, final_report_system_prompt, data_analysis_followup_prompt

# Regex patterns that indicate a data-context error (column/variable/DataFrame issues)
DATA_CONTEXT_PATTERNS = [
    r"KeyError:\s*['\"](.+?)['\"]",
    r"ValueError.*(?:column|col|field)",
    r"NameError.*(?:df|data|frame)",
    r"(?:empty|no\s+data|0\s+rows)",
    r"IndexError.*(?:out of range|out of bounds)",
]


class DataAnalysisAgent:
    """Data-analysis agent.

    Responsibilities:
    - Accept a natural-language requirement from the user.
    - Generate Python analysis code via an LLM.
    - Execute the code and collect results.
    - Feed execution results back to the LLM to drive follow-up rounds.
    """

    def __init__(
        self,
        llm_config: LLMConfig = None,
        output_dir: str = "outputs",
        max_rounds: int = 20,
        force_max_rounds: bool = False,
    ):
        """Initialize the agent.

        Args:
            llm_config: LLM configuration; defaults to ``LLMConfig()``.
            output_dir: Base output directory for analysis sessions.
            max_rounds: Maximum number of conversation rounds.
            force_max_rounds: If True, run all rounds and ignore the AI's
                completion signal.
        """
        self.config = llm_config or LLMConfig()
        self.llm = LLMHelper(self.config)
        self.base_output_dir = output_dir
        self.max_rounds = max_rounds
        self.force_max_rounds = force_max_rounds

        # Conversation history and per-session context
        self.conversation_history = []
        self.analysis_results = []
        self.current_round = 0
        self.session_output_dir = None
        self.executor = None
        self.data_profile = ""        # full data profile (local use only)
        self.data_profile_safe = ""   # sanitized profile (sent to the LLM)
        self.data_files = []          # list of data file paths
        self.user_requirement = ""    # original user requirement text
        self._progress_callback = None  # progress callback function
        self._session_ref = None      # reference to SessionData for round tracking

    def set_session_ref(self, session):
        """Set a reference to the SessionData instance for appending round data.

        Args:
            session: The SessionData instance for the current analysis session.
        """
        self._session_ref = session

    def set_progress_callback(self, callback):
        """Set a callback function(current_round, max_rounds, message) for progress updates."""
        self._progress_callback = callback

    def _summarize_result(self, result: Dict[str, Any]) -> str:
        """Produce a one-line summary from a code execution result.

        Args:
            result: The execution result dict from CodeExecutor.

        Returns:
            A concise summary string, e.g. "执行成功,输出 DataFrame (150行×8列)"
            or "执行失败: KeyError: 'col_x'".
        """
        if result.get("success"):
            evidence_rows = result.get("evidence_rows", [])
            if evidence_rows:
                num_rows = len(evidence_rows)
                num_cols = len(evidence_rows[0]) if evidence_rows else 0
                # Prefer auto_exported_files for more accurate row/col counts
                auto_files = result.get("auto_exported_files", [])
                if auto_files:
                    last_file = auto_files[-1]
                    num_rows = last_file.get("rows", num_rows)
                    num_cols = last_file.get("cols", num_cols)
                return f"执行成功,输出 DataFrame ({num_rows}行×{num_cols}列)"
            output = result.get("output", "")
            if output:
                first_line = output.strip().split("\n")[0][:80]
                return f"执行成功: {first_line}"
            return "执行成功"
        else:
            error = result.get("error", "未知错误")
            if len(error) > 100:
                error = error[:100] + "..."
            return f"执行失败: {error}"

    def _process_response(self, response: str) -> Dict[str, Any]:
        """Dispatch an LLM response to the handler for its declared action.

        Args:
            response: Raw LLM response content (expected to be YAML).

        Returns:
            The handler's result dict.
        """
        try:
            yaml_data = self.llm.parse_yaml_response(response)
            action = yaml_data.get("action", "generate_code")
            print(f"[TARGET] 检测到动作: {action}")
            if action == "analysis_complete":
                return self._handle_analysis_complete(response, yaml_data)
            elif action == "collect_figures":
                return self._handle_collect_figures(response, yaml_data)
            elif action == "generate_code":
                return self._handle_generate_code(response, yaml_data)
            else:
                print(f"[WARN] 未知动作类型: {action},按generate_code处理")
                return self._handle_generate_code(response, yaml_data)
        except Exception as e:
            print(f"[WARN] 解析响应失败: {str(e)},尝试提取代码并按generate_code处理")
            # Even if YAML parsing failed, still try to extract runnable code
            extracted_code = extract_code_from_response(response)
            if extracted_code:
                return self._handle_generate_code(response, {"code": extracted_code})
            return self._handle_generate_code(response, {})

    def _handle_analysis_complete(
        self, response: str, yaml_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Handle the ``analysis_complete`` action (terminates the loop)."""
        print("[OK] 分析任务完成")
        final_report = yaml_data.get("final_report", "分析完成,无最终报告")
        return {
            "action": "analysis_complete",
            "final_report": final_report,
            "response": response,
            "continue": False,
        }

    def _handle_collect_figures(
        self, response: str, yaml_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Handle the ``collect_figures`` action.

        Validates that each declared figure file exists on disk, de-duplicates
        by absolute path, and reports figures that could not be found so the
        LLM can regenerate them.
        """
        print("[CHART] 开始收集图片")
        figures_to_collect = yaml_data.get("figures_to_collect", [])
        collected_figures = []
        missing_figures = []
        # Track absolute paths already collected to prevent duplicates
        seen_paths = set()
        for figure_info in figures_to_collect:
            figure_number = figure_info.get("figure_number", "未知")
            # Only use figure_number in the default filename when it is known
            if figure_number != "未知":
                default_filename = f"figure_{figure_number}.png"
            else:
                default_filename = "figure_unknown.png"
            filename = figure_info.get("filename", default_filename)
            file_path = figure_info.get("file_path", "")  # concrete file path
            description = figure_info.get("description", "")
            analysis = figure_info.get("analysis", "")
            print(f"[GRAPH] 收集图片 {figure_number}: {filename}")
            print(f" [DIR] 路径: {file_path}")
            print(f" [NOTE] 描述: {description}")
            print(f" [SEARCH] 分析: {analysis}")
            # Only include figures whose file actually exists, to avoid
            # broken images in the final report
            if file_path and os.path.exists(file_path):
                abs_path = os.path.abspath(file_path)
                if abs_path not in seen_paths:
                    print(f" [OK] 文件存在: {file_path}")
                    collected_figures.append(
                        {
                            "figure_number": figure_number,
                            "filename": filename,
                            "file_path": file_path,
                            "description": description,
                            "analysis": analysis,
                        }
                    )
                    seen_paths.add(abs_path)
                else:
                    print(f" [WARN] 跳过重复图片: {file_path}")
            else:
                if file_path:
                    print(f" [WARN] 文件不存在: {file_path}")
                    missing_figures.append(file_path)
                else:
                    print(f" [WARN] 未提供文件路径")
                    missing_figures.append(filename)
        return {
            "action": "collect_figures",
            "collected_figures": collected_figures,
            "missing_figures": missing_figures,
            "response": response,
            "continue": True,
        }

    def _handle_generate_code(
        self, response: str, yaml_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Handle the ``generate_code`` action: extract, clean and run code."""
        # Prefer the code field from parsed YAML (more accurate)
        code = yaml_data.get("code", "")
        reasoning = yaml_data.get("reasoning", "")
        # Fall back to extracting code from the raw response
        if not code:
            code = extract_code_from_response(response)
        # Second-pass cleanup: strip markdown fences that may survive YAML parsing
        if code:
            code = code.strip()
            if code.startswith("```"):
                # Remove leading ```python or ``` fence
                code = re.sub(r"^```[a-zA-Z]*\n", "", code)
                # Remove trailing ``` fence
                code = re.sub(r"\n```$", "", code)
                code = code.strip()
        if code:
            print(f"[TOOL] 执行代码:\n{code}")
            print("-" * 40)
            # Execute the code in the session's executor environment
            result = self.executor.execute_code(code)
            # Format the execution result for LLM feedback
            feedback = format_execution_result(result)
            print(f"[LIST] 执行反馈:\n{feedback}")
            return {
                "action": "generate_code",
                "code": code,
                "reasoning": reasoning,
                "result": result,
                "feedback": feedback,
                "response": response,
                "continue": True,
            }
        else:
            # No code means the LLM response was malformed; ask it to retry
            print("[WARN] 未从响应中提取到可执行代码,要求LLM重新生成")
            return {
                "action": "invalid_response",
                "reasoning": reasoning,
                "error": "响应中缺少可执行代码",
                "response": response,
                "continue": True,
            }

    def _classify_error(self, error_message: str) -> str:
        """Classify execution error as data-context or other.

        Inspects the error message against DATA_CONTEXT_PATTERNS to determine
        if the error is related to data context (missing columns, undefined
        data variables, empty DataFrames, etc.).

        Args:
            error_message: The error message string from code execution.

        Returns:
            "data_context" if the error matches a data-context pattern,
            "other" otherwise.
        """
        for pattern in DATA_CONTEXT_PATTERNS:
            if re.search(pattern, error_message, re.IGNORECASE):
                return "data_context"
        return "other"

    def _trim_conversation_history(self):
        """Apply sliding window trimming to conversation history.

        Retains the first user message (original requirement + Safe_Profile)
        at index 0, generates a compressed summary of old messages, and keeps
        only the most recent ``conversation_window_size`` message pairs in
        full.
        """
        window_size = app_config.conversation_window_size
        max_messages = window_size * 2  # pairs of user+assistant messages
        if len(self.conversation_history) <= max_messages:
            return  # No trimming needed
        first_message = self.conversation_history[0]  # Always retain
        # Determine trim boundary: skip first message + possible existing summary
        start_idx = 1
        has_existing_summary = (
            len(self.conversation_history) > 1
            and self.conversation_history[1]["role"] == "user"
            and self.conversation_history[1]["content"].startswith("[分析摘要]")
        )
        if has_existing_summary:
            start_idx = 2
        # Messages to trim vs keep
        messages_to_consider = self.conversation_history[start_idx:]
        messages_to_trim = messages_to_consider[:-max_messages]
        messages_to_keep = messages_to_consider[-max_messages:]
        if not messages_to_trim:
            return
        # Generate summary of trimmed messages
        summary = self._compress_trimmed_messages(messages_to_trim)
        # Rebuild history: first_message + summary + recent messages
        self.conversation_history = [first_message]
        if summary:
            self.conversation_history.append({"role": "user", "content": summary})
        self.conversation_history.extend(messages_to_keep)

    def _compress_trimmed_messages(self, messages: list) -> str:
        """Compress trimmed messages into a concise summary string.

        Extracts the action type from each assistant message and the execution
        outcome (success / failure) from the subsequent user feedback message.
        Code blocks and raw execution output are excluded.

        Args:
            messages: List of conversation message dicts to compress.

        Returns:
            A summary string prefixed with ``[分析摘要]``.
        """
        summary_parts = ["[分析摘要] 以下是之前分析轮次的概要:"]
        round_num = 0
        for msg in messages:
            content = msg["content"]
            if msg["role"] == "assistant":
                round_num += 1
                # Extract action type from YAML-like content
                action = "generate_code"
                if "action: \"collect_figures\"" in content or "action: collect_figures" in content:
                    action = "collect_figures"
                elif "action: \"analysis_complete\"" in content or "action: analysis_complete" in content:
                    action = "analysis_complete"
                summary_parts.append(f"- 轮次{round_num}: 动作={action}")
            elif msg["role"] == "user" and "代码执行反馈" in content:
                success = "失败" if "[ERROR]" in content or "执行错误" in content else "成功"
                if summary_parts and summary_parts[-1].startswith("- 轮次"):
                    summary_parts[-1] += f", 执行结果={success}"
        return "\n".join(summary_parts)

    def _profile_files_parallel(self, file_paths: list) -> tuple:
        """Profile multiple files concurrently using ThreadPoolExecutor.

        Each file is profiled independently via ``build_safe_profile`` and
        ``build_local_profile``. Results are collected and merged. If any
        individual file fails, an error entry is included for that file and
        profiling continues for the remaining files.

        Args:
            file_paths: List of file paths to profile.

        Returns:
            A tuple ``(safe_profile, local_profile)`` of merged markdown strings.
        """
        max_workers = app_config.max_parallel_profiles
        safe_profiles = []
        local_profiles = []

        def profile_single(path):
            safe = build_safe_profile([path])
            local = build_local_profile([path])
            return path, safe, local

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(profile_single, p): p for p in file_paths}
            for future in as_completed(futures):
                path = futures[future]
                try:
                    _, safe, local = future.result()
                    safe_profiles.append(safe)
                    local_profiles.append(local)
                except Exception as e:
                    error_entry = f"## 文件: {os.path.basename(path)}\n[ERROR] 分析失败: {e}\n\n"
                    safe_profiles.append(error_entry)
                    local_profiles.append(error_entry)
        return "\n".join(safe_profiles), "\n".join(local_profiles)

    def analyze(self, user_input: str, files: List[str] = None,
                session_output_dir: str = None,
                reset_session: bool = True,
                max_rounds: int = None,
                template_name: str = None) -> Dict[str, Any]:
        """Run the analysis loop.

        Args:
            user_input: Natural-language requirement from the user.
            files: List of data file paths.
            session_output_dir: Explicit session output directory (optional).
            reset_session: True to start a fresh session; False to continue
                in the existing context (follow-up mode).
            max_rounds: Round limit for this run (defaults to the agent's
                configured limit, or 10 in follow-up mode).
            template_name: Optional analysis template used to guide the run.

        Returns:
            The final result dict from ``_generate_final_report``.
        """
        # Resolve the round limit for this run
        current_max_rounds = max_rounds if max_rounds is not None else self.max_rounds

        # Template integration: prepend template prompt to user input if provided
        if template_name:
            from utils.analysis_templates import get_template
            template = get_template(template_name)  # Raises ValueError if invalid
            template_prompt = template.get_full_prompt()
            user_input = f"{template_prompt}\n\n{user_input}"

        if reset_session:
            # --- Initialize a new session ---
            self.conversation_history = []
            self.analysis_results = []
            self.current_round = 0
            self.data_files = files or []
            self.user_requirement = user_input

            # Create a dedicated output directory for this run
            if session_output_dir:
                self.session_output_dir = session_output_dir
            else:
                self.session_output_dir = create_session_output_dir(
                    self.base_output_dir, user_input
                )

            # Initialize the code executor bound to the session directory
            self.executor = CodeExecutor(self.session_output_dir)
            # Expose the session directory inside the execution environment
            self.executor.set_variable("session_output_dir", self.session_output_dir)

            # Generate tiered data profiles: safe level goes to the LLM,
            # full level stays local for the final report
            data_profile_safe = ""
            data_profile_local = ""
            if files:
                print("[SEARCH] 正在生成数据画像...")
                try:
                    if len(files) > 1:
                        # Parallel profiling for multiple files
                        data_profile_safe, data_profile_local = self._profile_files_parallel(files)
                    else:
                        data_profile_safe = build_safe_profile(files)
                        data_profile_local = build_local_profile(files)
                    print("[OK] 数据画像生成完毕(安全级 + 本地级)")
                except Exception as e:
                    print(f"[WARN] 数据画像生成失败: {e}")

                # Expose chunked iterators for large files in the Code_Executor namespace
                for fp in files:
                    try:
                        if os.path.exists(fp):
                            file_size_mb = os.path.getsize(fp) / (1024 * 1024)
                            if file_size_mb > app_config.max_file_size_mb:
                                var_name = "chunked_iter_" + os.path.splitext(os.path.basename(fp))[0]
                                # Store a factory so the iterator can be re-created;
                                # p=fp binds the current path (avoids late-binding closure)
                                self.executor.set_variable(var_name, lambda p=fp: load_data_chunked(p))
                                print(f"[OK] 大文件 {os.path.basename(fp)} 的分块迭代器已注入为 {var_name}()")
                    except Exception as e:
                        print(f"[WARN] 注入分块迭代器失败 ({os.path.basename(fp)}): {e}")

            # Safe profile goes to the LLM; full profile is kept for the report
            self.data_profile = data_profile_local
            self.data_profile_safe = data_profile_safe

            # Build the initial prompt (only the safe profile is sent)
            initial_prompt = f"""用户需求: {user_input}"""
            if files:
                initial_prompt += f"\n数据文件: {', '.join(files)}"
            if data_profile_safe:
                initial_prompt += f"\n\n{data_profile_safe}\n\n请根据上述【数据结构概览】中的列名、数据类型和特征描述来制定分析策略。先通过代码探索数据的实际分布,再进行深度分析。"

            print(f"[START] 开始数据分析任务")
            print(f"[NOTE] 用户需求: {user_input}")
            if files:
                print(f"[FOLDER] 数据文件: {', '.join(files)}")
            print(f"[DIR] 输出目录: {self.session_output_dir}")

            self.conversation_history.append({"role": "user", "content": initial_prompt})
        else:
            # --- Continue the existing session (follow-up mode) ---
            # Follow-ups default to fewer rounds to avoid over-analysis
            if max_rounds is None:
                current_max_rounds = 10

            print(f"\n[START] 继续分析任务 (追问模式)")
            print(f"[NOTE] 后续需求: {user_input}")

            # Reset the round counter so the new task has full headroom
            self.current_round = 0

            # Tell the agent this is a follow-up so it can skip the full SOP
            follow_up_prompt = f"后续需求: {user_input}\n(注意:这是后续追问,请直接针对该问题进行分析,无需从头开始执行完整SOP。)"
            self.conversation_history.append({"role": "user", "content": follow_up_prompt})

        print(f"[NUM] 本次最大轮数: {current_max_rounds}")
        if self.force_max_rounds:
            print(f"[FAST] 强制模式: 将运行满 {current_max_rounds} 轮(忽略AI完成信号)")
        print("=" * 60)

        # Save the original max_rounds so it can be restored after this run
        original_max_rounds = self.max_rounds
        self.max_rounds = current_max_rounds

        # Counter of consecutive invalid LLM responses
        consecutive_failures = 0
        # Per-round data-context retry counter
        data_context_retries = 0
        last_retry_round = 0

        while self.current_round < self.max_rounds:
            self.current_round += 1

            # Notify progress callback
            if self._progress_callback:
                self._progress_callback(self.current_round, self.max_rounds, f"第{self.current_round}/{self.max_rounds}轮分析中...")

            # Reset data-context retry counter when entering a new round
            if self.current_round != last_retry_round:
                data_context_retries = 0

            # Trim conversation history after the first round to bound token usage
            if self.current_round > 1:
                self._trim_conversation_history()

            print(f"\n[LOOP] 第 {self.current_round} 轮分析")

            try:
                # Expose the executor's current variables to the system prompt
                notebook_variables = self.executor.get_environment_info()

                # Select prompt based on mode
                if self.current_round == 1 and not reset_session:
                    # First round of a follow-up session uses the specialized prompt
                    base_system_prompt = data_analysis_followup_prompt
                elif not reset_session and self.current_round > 1:
                    # Keep using the follow-up prompt to prevent SOP regression
                    base_system_prompt = data_analysis_followup_prompt
                else:
                    base_system_prompt = data_analysis_system_prompt

                # Fill dynamic notebook variable info into the system prompt
                formatted_system_prompt = base_system_prompt.format(
                    notebook_variables=notebook_variables
                )
                print(f"[DEBUG] [DEBUG] System Prompt Head:\n{formatted_system_prompt[:500]}...\n[...]")
                print(f"[DEBUG] [DEBUG] System Prompt Rules Check: 'stop_words' in prompt? {'stop_words' in formatted_system_prompt}")
                response = self.llm.call(
                    prompt=self._build_conversation_prompt(),
                    system_prompt=formatted_system_prompt,
                )
                print(f"[AI] 助手响应:\n{response}")

                # Dispatch the response to the matching action handler
                process_result = self._process_response(response)

                # Track consecutive invalid responses and abort after three
                if process_result.get("action") == "invalid_response":
                    consecutive_failures += 1
                    print(f"[WARN] 连续失败次数: {consecutive_failures}/3")
                    if consecutive_failures >= 3:
                        print(f"[ERROR] 连续3次无法获取有效响应,分析终止。请检查网络或配置。")
                        break
                else:
                    consecutive_failures = 0  # reset the counter

                # Honor the completion signal (unless in force mode)
                if not self.force_max_rounds and not process_result.get(
                    "continue", True
                ):
                    print(f"\n[OK] 分析完成!")
                    break

                self.conversation_history.append(
                    {"role": "assistant", "content": response}
                )

                # Append feedback depending on the action type
                if process_result["action"] == "generate_code":
                    feedback = process_result.get("feedback", "")
                    result = process_result.get("result", {})
                    execution_failed = not result.get("success", True)

                    # --- Data-context retry logic ---
                    if execution_failed:
                        error_output = result.get("error", "") or feedback
                        error_class = self._classify_error(error_output)
                        if error_class == "data_context" and data_context_retries < app_config.max_data_context_retries:
                            data_context_retries += 1
                            last_retry_round = self.current_round
                            print(f"[RETRY] 数据上下文错误,重试 {data_context_retries}/{app_config.max_data_context_retries}")
                            # Generate enriched hint from safe profile
                            enriched_hint = generate_enriched_hint(error_output, self.data_profile_safe)
                            # Add enriched hint to conversation history
                            # (assistant response already added above)
                            self.conversation_history.append(
                                {"role": "user", "content": enriched_hint}
                            )
                            # Record the failed attempt
                            self.analysis_results.append(
                                {
                                    "round": self.current_round,
                                    "code": process_result.get("code", ""),
                                    "result": result,
                                    "response": response,
                                    "retry": True,
                                }
                            )
                            # Retry within the same round: decrement round counter so the
                            # outer loop's increment brings us back to the same round number
                            self.current_round -= 1
                            continue

                    # Normal feedback path (no retry or non-data-context error or at limit)
                    safe_feedback = sanitize_execution_feedback(feedback)
                    self.conversation_history.append(
                        {"role": "user", "content": f"代码执行反馈:\n{safe_feedback}"}
                    )
                    self.analysis_results.append(
                        {
                            "round": self.current_round,
                            "code": process_result.get("code", ""),
                            "result": process_result.get("result", {}),
                            "response": response,
                        }
                    )

                    # --- Construct Round_Data and append to session ---
                    result = process_result.get("result", {})
                    round_data = {
                        "round": self.current_round,
                        "reasoning": process_result.get("reasoning", ""),
                        "code": process_result.get("code", ""),
                        "result_summary": self._summarize_result(result),
                        "evidence_rows": result.get("evidence_rows", []),
                        "raw_log": feedback,
                        "auto_exported_files": result.get("auto_exported_files", []),
                        "prompt_saved_files": result.get("prompt_saved_files", []),
                    }
                    if self._session_ref:
                        self._session_ref.rounds.append(round_data)
                        # Merge file metadata into SessionData.data_files
                        for f in round_data.get("auto_exported_files", []):
                            if f.get("skipped"):
                                continue  # Large DataFrame — not written to disk
                            self._session_ref.data_files.append({
                                "filename": f.get("filename", ""),
                                "description": f"自动导出: {f.get('variable_name', '')}",
                                "rows": f.get("rows", 0),
                                "cols": f.get("cols", 0),
                                "columns": f.get("columns", []),
                                "size_bytes": 0,
                                "source": "auto",
                            })
                        for f in round_data.get("prompt_saved_files", []):
                            self._session_ref.data_files.append({
                                "filename": f.get("filename", ""),
                                "description": f.get("description", ""),
                                "rows": f.get("rows", 0),
                                "cols": 0,
                                "columns": [],
                                "size_bytes": 0,
                                "source": "prompt",
                            })
                elif process_result["action"] == "collect_figures":
                    # Record the figure-collection outcome
                    collected_figures = process_result.get("collected_figures", [])
                    missing_figures = process_result.get("missing_figures", [])
                    feedback = f"已收集 {len(collected_figures)} 个有效图片及其分析。"
                    if missing_figures:
                        feedback += f"\n[WARN] 以下图片未找到,请检查代码是否成功保存了这些图片: {missing_figures}"
                    self.conversation_history.append(
                        {
                            "role": "user",
                            "content": f"图片收集反馈:\n{feedback}\n请继续下一步分析。",
                        }
                    )
                    self.analysis_results.append(
                        {
                            "round": self.current_round,
                            "action": "collect_figures",
                            "collected_figures": collected_figures,
                            "missing_figures": missing_figures,
                            "response": response,
                        }
                    )
            except Exception as e:
                error_msg = f"LLM调用错误: {str(e)}"
                print(f"[ERROR] {error_msg}")
                self.conversation_history.append(
                    {
                        "role": "user",
                        "content": f"发生错误: {error_msg},请重新生成代码。",
                    }
                )

        if self.current_round >= self.max_rounds:
            print(f"\n[WARN] 已达到最大轮数 ({self.max_rounds}),分析结束")

        # Restore the agent-level round limit so later calls are unaffected
        self.max_rounds = original_max_rounds

        return self._generate_final_report()

    def _build_conversation_prompt(self) -> str:
        """Render the conversation history as a plain-text prompt."""
        prompt_parts = []
        for msg in self.conversation_history:
            role = msg["role"]
            content = msg["content"]
            if role == "user":
                prompt_parts.append(f"用户: {content}")
            else:
                prompt_parts.append(f"助手: {content}")
        return "\n\n".join(prompt_parts)

    def _generate_final_report(self) -> Dict[str, Any]:
        """Generate the final analysis report and return the full result dict."""
        # Gather every figure collected across all rounds
        all_figures = []
        for result in self.analysis_results:
            if result.get("action") == "collect_figures":
                all_figures.extend(result.get("collected_figures", []))

        print(f"\n[CHART] 开始生成最终分析报告...")
        print(f"[DIR] 输出目录: {self.session_output_dir}")

        # --- Auto-discovery of figures the agent forgot to collect ---
        # Scan the session directory for any png files
        try:
            import glob
            existing_pngs = glob.glob(os.path.join(self.session_output_dir, "*.png"))
            # Absolute paths of figures already collected
            collected_paths = set()
            for fig in all_figures:
                if fig.get("file_path"):
                    collected_paths.add(os.path.abspath(fig.get("file_path")))
            # Pick up any png that slipped through
            for png_path in existing_pngs:
                abs_png_path = os.path.abspath(png_path)
                if abs_png_path not in collected_paths:
                    print(f"[SEARCH] [自动发现] 补充未显式收集的图片: {os.path.basename(png_path)}")
                    all_figures.append({
                        "figure_number": "Auto",
                        "filename": os.path.basename(png_path),
                        "file_path": abs_png_path,
                        "description": f"自动发现的分析图表: {os.path.basename(png_path)}",
                        "analysis": "(该图表由系统自动捕获,Agent未提供具体分析文本,请结合图表标题理解)"
                    })
        except Exception as e:
            print(f"[WARN] 自动发现图片失败: {e}")
        # ---------------------------

        print(f"[NUM] 总轮数: {self.current_round}")
        print(f"[GRAPH] 收集图片: {len(all_figures)} 个")

        final_report_prompt = self._build_final_report_prompt(all_figures)

        try:
            # Ask the LLM for the final report (large token budget)
            response = self.llm.call(
                prompt=final_report_prompt,
                system_prompt="你将会接收到一个数据分析任务的最终报告请求,请根据提供的分析结果和图片信息生成完整的分析报告。",
                max_tokens=16384,
            )
            # The prompt asks for raw Markdown, so use the response directly
            final_report_content = response
            # Legacy compatibility: if YAML came back anyway, try to parse it
            if response.strip().startswith("action:") or "final_report:" in response:
                try:
                    yaml_data = self.llm.parse_yaml_response(response)
                    if yaml_data.get("action") == "analysis_complete":
                        final_report_content = yaml_data.get("final_report", response)
                except Exception:
                    pass  # keep the raw response if parsing fails
            print("[OK] 最终报告生成完成")
        except Exception as e:
            print(f"[ERROR] 生成最终报告时出错: {str(e)}")
            final_report_content = f"报告生成失败: {str(e)}"

        # Persist the final report to disk
        report_file_path = os.path.join(self.session_output_dir, "最终分析报告.md")
        try:
            with open(report_file_path, "w", encoding="utf-8") as f:
                f.write(final_report_content)
            print(f"[DOC] 最终报告已保存至: {report_file_path}")
        except Exception as e:
            print(f"[ERROR] 保存报告文件失败: {str(e)}")

        # Generate the reusable analysis script
        script_path = ""
        try:
            script_path = generate_reusable_script(
                analysis_results=self.analysis_results,
                data_files=self.data_files,
                session_output_dir=self.session_output_dir,
                user_requirement=self.user_requirement
            )
        except Exception as e:
            print(f"[WARN] 脚本生成失败: {e}")

        # Return the complete analysis result
        return {
            "session_output_dir": self.session_output_dir,
            "total_rounds": self.current_round,
            "analysis_results": self.analysis_results,
            "collected_figures": all_figures,
            "conversation_history": self.conversation_history,
            "final_report": final_report_content,
            "report_file_path": report_file_path,
            "reusable_script_path": script_path,
        }

    def _build_final_report_prompt(self, all_figures: List[Dict[str, Any]]) -> str:
        """Build the prompt used to generate the final report.

        Args:
            all_figures: Figure metadata dicts collected across all rounds.

        Returns:
            The fully assembled prompt string.
        """
        # Summarize figures using relative paths for report embedding
        figures_summary = ""
        if all_figures:
            figures_summary = "\n生成的图片及分析:\n"
            for i, figure in enumerate(all_figures, 1):
                filename = figure.get("filename", "未知文件名")
                # Relative path format suits in-report references
                relative_path = f"./{filename}"
                figures_summary += f"{i}. {filename}\n"
                figures_summary += f" 相对路径: {relative_path}\n"
                figures_summary += f" 描述: {figure.get('description', '无描述')}\n"
                figures_summary += f" 分析: {figure.get('analysis', '无分析')}\n\n"
        else:
            figures_summary = "\n本次分析未生成图片。\n"

        # Summarize successfully executed code blocks only
        code_results_summary = ""
        success_code_count = 0
        for result in self.analysis_results:
            if result.get("action") != "collect_figures" and result.get("code"):
                exec_result = result.get("result", {})
                if exec_result.get("success"):
                    success_code_count += 1
                    code_results_summary += f"代码块 {success_code_count}: 执行成功\n"
                    if exec_result.get("output"):
                        code_results_summary += (
                            f"输出: {exec_result.get('output')}\n\n"
                        )

        # Summarize per-round evidence data
        evidence_summary = ""
        if self._session_ref and self._session_ref.rounds:
            evidence_parts = []
            for rd in self._session_ref.rounds:
                round_num = rd.get("round", 0)
                summary = rd.get("result_summary", "")
                evidence = rd.get("evidence_rows", [])
                reasoning = rd.get("reasoning", "")
                part = f"第{round_num}轮: {summary}"
                if reasoning:
                    part += f"\n 推理: {reasoning[:200]}"
                if evidence:
                    part += f"\n 数据样本({len(evidence)}行): {json.dumps(evidence[:3], ensure_ascii=False, default=str)}"
                evidence_parts.append(part)
            evidence_summary = "\n".join(evidence_parts)

        # Use the unified template from prompts.py, then append path guidance
        prompt = final_report_system_prompt.format(
            current_round=self.current_round,
            session_output_dir=self.session_output_dir,
            data_profile=self.data_profile,  # inject the full data profile
            figures_summary=figures_summary,
            code_results_summary=code_results_summary,
        )

        # Append evidence data from all rounds for evidence annotation
        if evidence_summary:
            prompt += f"""
**各轮次分析证据数据 (Evidence by Round)**:
以下是每轮分析的结果摘要和数据样本,请在报告中使用 `` 标注引用了哪一轮的数据:
{evidence_summary}
"""

        # Explicitly require relative paths for figures in the report
        prompt += """
[FOLDER] **图片路径使用说明**:
报告和图片都在同一目录下,请在报告中使用相对路径引用图片:
- 格式:![图片描述](./具体图片名称.png)
- 示例:![营业总收入趋势](./营业总收入趋势.png)
- 注意:必须使用实际生成的图片文件名,严禁使用占位符
"""
        return prompt

    def reset(self):
        """Reset all agent state for a fresh session."""
        self.conversation_history = []
        self.analysis_results = []
        self.current_round = 0
        # Guard: executor is None until the first analyze() call
        if self.executor is not None:
            self.executor.reset_environment()