# data_analysis_agent.py

# -*- coding: utf-8 -*-
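"""
Simplified notebook-style data analysis agent.

Only two roles are involved: the user and the assistant.

Conventions:
- Figures must be saved into the designated session directory and reported by
  absolute path; plt.show() is forbidden.
- Table output control: for more than 15 rows, show only the first 5 and the
  last 5 rows.
- The SimHei font is mandatory: plt.rcParams['font.sans-serif'] = ['SimHei']
- Output must strictly use the YAML format, in a single-turn dialogue mode
  with shared context.
"""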

import os
import json
import re
import yaml
from typing import Dict, Any, List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.create_session_dir import create_session_output_dir
from utils.format_execution_result import format_execution_result
from utils.extract_code import extract_code_from_response
from utils.data_loader import load_and_profile_data, load_data_chunked, load_and_profile_data_smart
from utils.llm_helper import LLMHelper
from utils.code_executor import CodeExecutor
from utils.script_generator import generate_reusable_script
from utils.data_privacy import build_safe_profile, build_local_profile, sanitize_execution_feedback, generate_enriched_hint
from config.llm_config import LLMConfig
from config.app_config import app_config
from prompts import data_analysis_system_prompt, final_report_system_prompt, data_analysis_followup_prompt


# Regex patterns that indicate a data-context error (column/variable/DataFrame issues).
# DataAnalysisAgent._classify_error() tries each one with
# re.search(pattern, message, re.IGNORECASE), so a pattern may match anywhere in
# the error text and matching is case-insensitive. "." does not cross newlines,
# so the ".*" patterns only look at the rest of the same line.
DATA_CONTEXT_PATTERNS = [
    r"KeyError:\s*['\"](.+?)['\"]",  # "KeyError:" followed by a quoted key
    r"ValueError.*(?:column|col|field)",  # ValueError line mentioning column/col/field
    r"NameError.*(?:df|data|frame)",  # NameError line mentioning df/data/frame
    r"(?:empty|no\s+data|0\s+rows)",  # the words "empty", "no data" or "0 rows"
    r"IndexError.*(?:out of range|out of bounds)",  # IndexError: out of range/bounds
]


class DataAnalysisAgent:
    """
    数据分析智能体

    职责：
    - 接收用户自然语言需求
    - 生成Python分析代码
    - 执行代码并收集结果
    - 基于执行结果继续生成后续分析代码
    """

    def __init__(
        self,
        llm_config: LLMConfig = None,
        output_dir: str = "outputs",
        max_rounds: int = 20,
        force_max_rounds: bool = False,
    ):
        """
        Create an agent with an LLM client and empty per-session state.

        Args:
            llm_config: LLM configuration. A default ``LLMConfig()`` is built
                when a falsy value (such as ``None``) is passed.
            output_dir: Base directory under which session output directories
                are created.
            max_rounds: Default upper bound on the number of analysis rounds.
            force_max_rounds: When True, keep running until the round limit is
                reached and ignore the model's completion signal.
        """
        # Caller-supplied settings.
        self.base_output_dir = output_dir
        self.max_rounds = max_rounds
        self.force_max_rounds = force_max_rounds

        # LLM client built from the effective configuration.
        self.config = llm_config if llm_config else LLMConfig()
        self.llm = LLMHelper(self.config)

        # Conversation and result bookkeeping; (re)initialised by analyze().
        self.conversation_history = []
        self.analysis_results = []
        self.current_round = 0

        # Execution environment; created when a new session starts.
        self.session_output_dir = None
        self.executor = None

        # Data description and request details of the current session.
        self.data_profile = ""  # full profile, kept locally
        self.data_profile_safe = ""  # safe profile, the one sent to the LLM
        self.data_files = []  # list of input data files
        self.user_requirement = ""  # the user's requirement text

        # Optional hooks installed through the set_* methods.
        self._progress_callback = None  # progress callback function
        self._session_ref = None  # reference to SessionData for round tracking

    def set_session_ref(self, session) -> None:
        """Store a reference to the session object used for round tracking.

        When a reference is set, ``analyze`` appends per-round data to
        ``session.rounds`` and exported-file metadata to ``session.data_files``.

        Args:
            session: The SessionData instance for the current analysis session.
        """
        self._session_ref = session

    def set_progress_callback(self, callback) -> None:
        """Register a callback that receives progress updates.

        ``analyze`` invokes it at the start of every round as
        ``callback(current_round, max_rounds, message)``.

        Args:
            callback: Callable accepting the three arguments above. A falsy
                value disables progress notifications.
        """
        self._progress_callback = callback

    def _summarize_result(self, result: Dict[str, Any]) -> str:
        """Produce a one-line summary from a code execution result.

        Args:
            result: The execution result dict from CodeExecutor.

        Returns:
            A concise summary string, e.g. "执行成功，输出 DataFrame (150行×8列)"
            or "执行失败: KeyError: 'col_x'".
        """
        if result.get("success"):
            evidence_rows = result.get("evidence_rows", [])
            if evidence_rows:
                num_rows = len(evidence_rows)
                num_cols = len(evidence_rows[0]) if evidence_rows else 0
                # Check auto_exported_files for more accurate row/col counts
                auto_files = result.get("auto_exported_files", [])
                if auto_files:
                    last_file = auto_files[-1]
                    num_rows = last_file.get("rows", num_rows)
                    num_cols = last_file.get("cols", num_cols)
                return f"执行成功，输出 DataFrame ({num_rows}行×{num_cols}列)"
            output = result.get("output", "")
            if output:
                first_line = output.strip().split("\n")[0][:80]
                return f"执行成功: {first_line}"
            return "执行成功"
        else:
            error = result.get("error", "未知错误")
            if len(error) > 100:
                error = error[:100] + "..."
            return f"执行失败: {error}"

    def _process_response(self, response: str) -> Dict[str, Any]:
        """
        统一处理LLM响应，判断行动类型并执行相应操作

        Args:
            response: LLM的响应内容

        Returns:
            处理结果字典
        """
        try:
            yaml_data = self.llm.parse_yaml_response(response)
            action = yaml_data.get("action", "generate_code")

            print(f"[TARGET] 检测到动作: {action}")

            if action == "analysis_complete":
                return self._handle_analysis_complete(response, yaml_data)
            elif action == "collect_figures":
                return self._handle_collect_figures(response, yaml_data)
            elif action == "generate_code":
                return self._handle_generate_code(response, yaml_data)
            else:
                print(f"[WARN] 未知动作类型: {action}，按generate_code处理")
                return self._handle_generate_code(response, yaml_data)

        except Exception as e:
            print(f"[WARN] 解析响应失败: {str(e)}，尝试提取代码并按generate_code处理")
            # 即使YAML解析失败，也尝试提取代码
            extracted_code = extract_code_from_response(response)
            if extracted_code:
                 return self._handle_generate_code(response, {"code": extracted_code})
            return self._handle_generate_code(response, {})

    def _handle_analysis_complete(
        self, response: str, yaml_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """处理分析完成动作"""
        print("[OK] 分析任务完成")
        final_report = yaml_data.get("final_report", "分析完成，无最终报告")
        return {
            "action": "analysis_complete",
            "final_report": final_report,
            "response": response,
            "continue": False,
        }

    def _handle_collect_figures(
        self, response: str, yaml_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """处理图片收集动作"""
        print("[CHART] 开始收集图片")
        figures_to_collect = yaml_data.get("figures_to_collect", [])

        collected_figures = []
        # 使用seen_paths集合来去重，防止重复收集
        seen_paths = set()

        for figure_info in figures_to_collect:
            figure_number = figure_info.get("figure_number", "未知")
            # 确保figure_number不为None时才用于文件名
            if figure_number != "未知":
                default_filename = f"figure_{figure_number}.png"
            else:
                default_filename = "figure_unknown.png"
            filename = figure_info.get("filename", default_filename)
            file_path = figure_info.get("file_path", "")  # 获取具体的文件路径
            description = figure_info.get("description", "")
            analysis = figure_info.get("analysis", "")

            print(f"[GRAPH] 收集图片 {figure_number}: {filename}")
            print(f"   [DIR] 路径: {file_path}")
            print(f"   [NOTE] 描述: {description}")
            print(f"   [SEARCH] 分析: {analysis}")

            # 验证文件是否存在
            # 只有文件真正存在时才加入列表，防止报告出现裂图
            if file_path and os.path.exists(file_path):
                # 检查是否已经收集过该路径
                abs_path = os.path.abspath(file_path)
                if abs_path not in seen_paths:
                    print(f"   [OK] 文件存在: {file_path}")
                    # 记录图片信息
                    collected_figures.append(
                        {
                            "figure_number": figure_number,
                            "filename": filename,
                            "file_path": file_path,
                            "description": description,
                            "analysis": analysis,
                        }
                    )
                    seen_paths.add(abs_path)
                else:
                    print(f"   [WARN] 跳过重复图片: {file_path}")
            else:
                if file_path:
                    print(f"   [WARN] 文件不存在: {file_path}")
                else:
                    print(f"   [WARN] 未提供文件路径")

        return {
            "action": "collect_figures",
            "collected_figures": collected_figures,
            "response": response,
            "continue": True,
        }

    def _handle_generate_code(
        self, response: str, yaml_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Extract the code from a ``generate_code`` response and execute it.

        The ``code`` field of the parsed YAML is preferred; when it is empty the
        code is extracted from the raw response text. Surrounding markdown code
        fences are removed before execution.

        Args:
            response: Raw LLM response text.
            yaml_data: Parsed response; ``code`` and ``reasoning`` are read.

        Returns:
            A ``generate_code`` result dict carrying the executed code, the
            executor result and the formatted feedback, or an
            ``invalid_response`` result dict when no code could be found.
        """
        reasoning = yaml_data.get("reasoning", "")
        # Prefer the YAML field; fall back to scanning the raw response text.
        code = yaml_data.get("code", "") or extract_code_from_response(response)

        # Second cleaning pass: the YAML value may itself be wrapped in fences.
        if code:
            code = code.strip()
            if code.startswith("```"):
                without_opening = re.sub(r"^```[a-zA-Z]*\n", "", code)
                code = re.sub(r"\n```$", "", without_opening).strip()

        if not code:
            # No code means the response format is wrong; the model must retry.
            print("[WARN] 未从响应中提取到可执行代码，要求LLM重新生成")
            return {
                "action": "invalid_response",
                "reasoning": reasoning,
                "error": "响应中缺少可执行代码",
                "response": response,
                "continue": True,
            }

        print(f"[TOOL] 执行代码:\n{code}")
        print("-" * 40)

        result = self.executor.execute_code(code)
        feedback = format_execution_result(result)
        print(f"[LIST] 执行反馈:\n{feedback}")

        return {
            "action": "generate_code",
            "code": code,
            "reasoning": reasoning,
            "result": result,
            "feedback": feedback,
            "response": response,
            "continue": True,
        }

    def _classify_error(self, error_message: str) -> str:
        """Label an execution error as data-context related or not.

        The message is searched, case-insensitively, for each regex in
        DATA_CONTEXT_PATTERNS; the first hit decides the outcome.

        Args:
            error_message: The error message string from code execution.

        Returns:
            "data_context" when any pattern matches, otherwise "other".
        """
        is_data_context = any(
            re.search(candidate, error_message, re.IGNORECASE)
            for candidate in DATA_CONTEXT_PATTERNS
        )
        return "data_context" if is_data_context else "other"

    def _trim_conversation_history(self):
        """Apply sliding window trimming to conversation history.

        Retains the first user message (original requirement + Safe_Profile) at
        index 0, keeps the most recent ``conversation_window_size`` message
        pairs in full, and replaces everything in between with one compressed
        ``[分析摘要]`` summary message.

        When a summary from an earlier trim is already present at index 1, its
        entries are carried over into the new summary (and the new entries are
        renumbered to continue after them), so the record of older rounds is
        not lost each time the window slides.
        """
        window_size = app_config.conversation_window_size
        max_messages = window_size * 2  # pairs of user+assistant messages
        history = self.conversation_history

        # With a non-positive window the negative slices below degenerate
        # (``x[:-0]`` is empty), so nothing can be trimmed.
        if max_messages <= 0 or len(history) <= max_messages:
            return

        first_message = history[0]  # always retained

        # Skip the first message and, if present, the summary of a previous trim.
        start_idx = 1
        previous_summary = ""
        if (
            len(history) > 1
            and history[1]["role"] == "user"
            and history[1]["content"].startswith("[分析摘要]")
        ):
            previous_summary = history[1]["content"]
            start_idx = 2

        messages_to_consider = history[start_idx:]
        messages_to_trim = messages_to_consider[:-max_messages]
        messages_to_keep = messages_to_consider[-max_messages:]

        if not messages_to_trim:
            return

        summary = self._compress_trimmed_messages(messages_to_trim)

        if previous_summary:
            # Line 0 of a summary is its header; the remaining lines are entries.
            earlier_entries = [
                line for line in previous_summary.split("\n")[1:] if line.strip()
            ]
            if summary:
                header, *new_entries = summary.split("\n")
            else:
                header, new_entries = previous_summary.split("\n")[0], []
            # New entries are numbered from 1; shift them past the earlier ones.
            offset = sum(1 for line in earlier_entries if line.startswith("- 轮次"))
            renumbered = [
                re.sub(
                    r"^- 轮次(\d+)",
                    lambda m: f"- 轮次{int(m.group(1)) + offset}",
                    line,
                )
                for line in new_entries
            ]
            summary = "\n".join([header, *earlier_entries, *renumbered])

        # Rebuild history: first message + summary + recent messages.
        rebuilt = [first_message]
        if summary:
            rebuilt.append({"role": "user", "content": summary})
        rebuilt.extend(messages_to_keep)
        self.conversation_history = rebuilt

    def _compress_trimmed_messages(self, messages: list) -> str:
        """Compress trimmed messages into a concise summary string.

        Extracts the action type from each assistant message and the execution
        outcome (success / failure) from the subsequent user feedback message.
        Code blocks and raw execution output are excluded.

        Args:
            messages: List of conversation message dicts to compress.

        Returns:
            A summary string prefixed with ``[分析摘要]``.
        """
        summary_parts = ["[分析摘要] 以下是之前分析轮次的概要:"]
        round_num = 0

        for msg in messages:
            content = msg["content"]
            if msg["role"] == "assistant":
                round_num += 1
                # Extract action type from YAML-like content
                action = "generate_code"
                if "action: \"collect_figures\"" in content or "action: collect_figures" in content:
                    action = "collect_figures"
                elif "action: \"analysis_complete\"" in content or "action: analysis_complete" in content:
                    action = "analysis_complete"
                summary_parts.append(f"- 轮次{round_num}: 动作={action}")
            elif msg["role"] == "user" and "代码执行反馈" in content:
                success = "失败" if "[ERROR]" in content or "执行错误" in content else "成功"
                if summary_parts and summary_parts[-1].startswith("- 轮次"):
                    summary_parts[-1] += f", 执行结果={success}"

        return "\n".join(summary_parts)

    def _profile_files_parallel(self, file_paths: list) -> tuple:
        """Profile multiple files concurrently using ThreadPoolExecutor.

        Each file is profiled independently via ``build_safe_profile`` and
        ``build_local_profile``. If an individual file fails, an error entry is
        emitted for that file and profiling continues for the remaining files.

        Results are merged in the order of ``file_paths`` (not in completion
        order), so the generated profile text is deterministic for a given
        input list.

        Args:
            file_paths: List of file paths to profile.

        Returns:
            A tuple ``(safe_profile, local_profile)`` of merged markdown strings.
        """
        max_workers = app_config.max_parallel_profiles
        # ThreadPoolExecutor rejects max_workers <= 0 with ValueError; None is
        # allowed and lets the executor choose its own default.
        if max_workers is not None and max_workers < 1:
            max_workers = 1

        def profile_single(path):
            return build_safe_profile([path]), build_local_profile([path])

        safe_profiles = []
        local_profiles = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit everything first so the work runs in parallel, then read
            # the results back in submission order.
            submitted = [
                (path, executor.submit(profile_single, path)) for path in file_paths
            ]
            for path, future in submitted:
                try:
                    safe, local = future.result()
                except Exception as e:
                    error_entry = f"## 文件: {os.path.basename(path)}\n[ERROR] 分析失败: {e}\n\n"
                    safe, local = error_entry, error_entry
                safe_profiles.append(safe)
                local_profiles.append(local)

        return "\n".join(safe_profiles), "\n".join(local_profiles)

    def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None, reset_session: bool = True, max_rounds: int = None, template_name: str = None) -> Dict[str, Any]:
        """
        Run the multi-round analysis loop and return the final report.

        With ``reset_session=True`` a new session is set up: state is cleared,
        an output directory and code executor are created, the data files are
        profiled (safe profile for the LLM, full profile kept locally) and the
        initial prompt is added to the conversation. With
        ``reset_session=False`` the request is appended to the existing
        conversation as a follow-up and the round counter restarts at 0.

        ``self.max_rounds`` is set to this call's round budget only while the
        call is running and is restored afterwards, so a per-call or follow-up
        budget does not become the default for later calls.

        Args:
            user_input: The user's natural-language requirement.
            files: List of data file paths.
            session_output_dir: Session output directory to use (optional; one
                is created under the base output directory otherwise).
            reset_session: True starts a new analysis; False continues within
                the existing context.
            max_rounds: Maximum rounds for this call (optional). Defaults to the
                instance value for a new session and to 10 for a follow-up.
            template_name: Name of an analysis template (optional); when given,
                the template prompt is prepended to ``user_input``.

        Returns:
            The result dict produced by ``self._generate_final_report()``.
        """
        # Round budget for this call: an explicit argument wins over the default.
        current_max_rounds = max_rounds if max_rounds is not None else self.max_rounds

        # Template integration: prepend template prompt to user input if provided
        if template_name:
            from utils.analysis_templates import get_template
            template = get_template(template_name)  # Raises ValueError if invalid
            template_prompt = template.get_full_prompt()
            user_input = f"{template_prompt}\n\n{user_input}"

        if reset_session:
            # --- Start a brand-new session ---
            self.conversation_history = []
            self.analysis_results = []
            self.current_round = 0
            self.data_files = files or []  # remember the data file list
            self.user_requirement = user_input  # remember the requirement

            # Dedicated output directory for this analysis run.
            if session_output_dir:
                self.session_output_dir = session_output_dir
            else:
                self.session_output_dir = create_session_output_dir(
                    self.base_output_dir, user_input
                )

            # Code executor bound to the session directory.
            self.executor = CodeExecutor(self.session_output_dir)

            # Expose the session directory to the executed code.
            self.executor.set_variable("session_output_dir", self.session_output_dir)

            # Tiered data profiles: the safe one goes to the LLM, the full one stays local.
            data_profile_safe = ""
            data_profile_local = ""
            if files:
                print("[SEARCH] 正在生成数据画像...")
                try:
                    if len(files) > 1:
                        # Parallel profiling for multiple files
                        data_profile_safe, data_profile_local = self._profile_files_parallel(files)
                    else:
                        data_profile_safe = build_safe_profile(files)
                        data_profile_local = build_local_profile(files)
                    print("[OK] 数据画像生成完毕（安全级 + 本地级）")
                except Exception as e:
                    # Best effort: the analysis proceeds without a profile.
                    print(f"[WARN] 数据画像生成失败: {e}")

                # Expose chunked iterators for large files in the Code_Executor namespace
                for fp in files:
                    try:
                        if os.path.exists(fp):
                            file_size_mb = os.path.getsize(fp) / (1024 * 1024)
                            if file_size_mb > app_config.max_file_size_mb:
                                var_name = "chunked_iter_" + os.path.splitext(os.path.basename(fp))[0]
                                # Store a factory so the iterator can be re-created
                                self.executor.set_variable(var_name, lambda p=fp: load_data_chunked(p))
                                print(f"[OK] 大文件 {os.path.basename(fp)} 的分块迭代器已注入为 {var_name}()")
                    except Exception as e:
                        print(f"[WARN] 注入分块迭代器失败 ({os.path.basename(fp)}): {e}")

            self.data_profile = data_profile_local  # full local version, for the final report
            self.data_profile_safe = data_profile_safe  # safe version, for the LLM conversation

            # Initial prompt: only the safe profile is sent to the LLM.
            initial_prompt = f"""用户需求: {user_input}"""
            if files:
                initial_prompt += f"\n数据文件: {', '.join(files)}"

            if data_profile_safe:
                initial_prompt += f"\n\n{data_profile_safe}\n\n请根据上述【数据结构概览】中的列名、数据类型和特征描述来制定分析策略。先通过代码探索数据的实际分布，再进行深度分析。"

            print(f"[START] 开始数据分析任务")
            print(f"[NOTE] 用户需求: {user_input}")
            if files:
                print(f"[FOLDER] 数据文件: {', '.join(files)}")
            print(f"[DIR] 输出目录: {self.session_output_dir}")

            self.conversation_history.append({"role": "user", "content": initial_prompt})

        else:
            # --- Continue the existing session ---
            # A follow-up without an explicit budget gets a shorter default (10
            # rounds) to avoid over-analysis.
            if max_rounds is None:
                current_max_rounds = 10

            print(f"\n[START] 继续分析任务 (追问模式)")
            print(f"[NOTE] 后续需求: {user_input}")

            # Restart the round counter so the new task gets a full budget.
            self.current_round = 0

            # Tell the model this is a follow-up so it can skip the full SOP.
            follow_up_prompt = f"后续需求: {user_input}\n(注意：这是后续追问，请直接针对该问题进行分析，无需从头开始执行完整SOP。)"
            self.conversation_history.append({"role": "user", "content": follow_up_prompt})

        print(f"[NUM] 本次最大轮数: {current_max_rounds}")
        if self.force_max_rounds:
            print(f"[FAST] 强制模式: 将运行满 {current_max_rounds} 轮（忽略AI完成信号）")
        print("=" * 60)

        # Apply this call's budget for the duration of the run only; the
        # instance default is restored in ``finally`` even if the run raises.
        original_max_rounds = self.max_rounds
        self.max_rounds = current_max_rounds
        try:
            self._run_round_loop(reset_session)

            if self.current_round >= self.max_rounds:
                print(f"\n[WARN] 已达到最大轮数 ({self.max_rounds})，分析结束")

            return self._generate_final_report()
        finally:
            self.max_rounds = original_max_rounds

    def _run_round_loop(self, reset_session: bool) -> None:
        """Drive the LLM/execute/feedback rounds until completion or the round limit.

        Args:
            reset_session: True for a new session (standard system prompt),
                False for a follow-up (follow-up system prompt in every round,
                so the model does not fall back to the full SOP).
        """
        consecutive_failures = 0  # consecutive rounds without usable code
        data_context_retries = 0  # per-round data-context retry counter
        last_retry_round = 0

        if reset_session:
            base_system_prompt = data_analysis_system_prompt
        else:
            base_system_prompt = data_analysis_followup_prompt

        while self.current_round < self.max_rounds:
            self.current_round += 1
            # Notify progress callback
            if self._progress_callback:
                self._progress_callback(self.current_round, self.max_rounds, f"第{self.current_round}/{self.max_rounds}轮分析中...")
            # Reset data-context retry counter when entering a new round
            if self.current_round != last_retry_round:
                data_context_retries = 0

            # Trim conversation history after the first round to bound token usage
            if self.current_round > 1:
                self._trim_conversation_history()

            print(f"\n[LOOP] 第 {self.current_round} 轮分析")
            try:
                # Describe the current executor variables to the model.
                notebook_variables = self.executor.get_environment_info()
                formatted_system_prompt = base_system_prompt.format(
                    notebook_variables=notebook_variables
                )
                print(f"[DEBUG] [DEBUG] System Prompt Head:\n{formatted_system_prompt[:500]}...\n[...]")
                print(f"[DEBUG] [DEBUG] System Prompt Rules Check: 'stop_words' in prompt? {'stop_words' in formatted_system_prompt}")

                response = self.llm.call(
                    prompt=self._build_conversation_prompt(),
                    system_prompt=formatted_system_prompt,
                )

                print(f"[AI] 助手响应:\n{response}")

                process_result = self._process_response(response)
                action = process_result.get("action")

                # Give up after three consecutive responses without usable code.
                if action == "invalid_response":
                    consecutive_failures += 1
                    print(f"[WARN] 连续失败次数: {consecutive_failures}/3")
                    if consecutive_failures >= 3:
                        print(f"[ERROR] 连续3次无法获取有效响应，分析终止。请检查网络或配置。")
                        break
                else:
                    consecutive_failures = 0

                # Honour the completion signal unless forced to run all rounds.
                if not self.force_max_rounds and not process_result.get(
                    "continue", True
                ):
                    print(f"\n[OK] 分析完成！")
                    break

                self.conversation_history.append(
                    {"role": "assistant", "content": response}
                )

                if action == "generate_code":
                    feedback = process_result.get("feedback", "")
                    result = process_result.get("result", {})
                    execution_failed = not result.get("success", True)

                    # --- Data-context retry logic ---
                    if execution_failed:
                        error_output = result.get("error", "") or feedback
                        error_class = self._classify_error(error_output)

                        if error_class == "data_context" and data_context_retries < app_config.max_data_context_retries:
                            data_context_retries += 1
                            last_retry_round = self.current_round
                            print(f"[RETRY] 数据上下文错误，重试 {data_context_retries}/{app_config.max_data_context_retries}")
                            # Generate enriched hint from safe profile
                            enriched_hint = generate_enriched_hint(error_output, self.data_profile_safe)
                            self.conversation_history.append(
                                {"role": "user", "content": enriched_hint}
                            )
                            # Record the failed attempt
                            self.analysis_results.append(
                                {
                                    "round": self.current_round,
                                    "code": process_result.get("code", ""),
                                    "result": result,
                                    "response": response,
                                    "retry": True,
                                }
                            )
                            # Retry within the same round: decrement round counter so the
                            # loop's increment brings us back to the same round number
                            self.current_round -= 1
                            continue

                    # Normal feedback path (no retry or non-data-context error or at limit)
                    safe_feedback = sanitize_execution_feedback(feedback)
                    self.conversation_history.append(
                        {"role": "user", "content": f"代码执行反馈:\n{safe_feedback}"}
                    )

                    self.analysis_results.append(
                        {
                            "round": self.current_round,
                            "code": process_result.get("code", ""),
                            "result": result,
                            "response": response,
                        }
                    )

                    self._append_round_to_session(process_result, result, feedback)

                elif action == "collect_figures":
                    collected_figures = process_result.get("collected_figures", [])
                    missing_figures = process_result.get("missing_figures", [])

                    feedback = f"已收集 {len(collected_figures)} 个有效图片及其分析。"
                    if missing_figures:
                        feedback += f"\n[WARN] 以下图片未找到，请检查代码是否成功保存了这些图片: {missing_figures}"

                    self.conversation_history.append(
                        {
                            "role": "user",
                            "content": f"图片收集反馈:\n{feedback}\n请继续下一步分析。",
                        }
                    )

                    self.analysis_results.append(
                        {
                            "round": self.current_round,
                            "action": "collect_figures",
                            "collected_figures": collected_figures,
                            "missing_figures": missing_figures,

                            "response": response,
                        }
                    )

                elif action == "invalid_response":
                    # Tell the model what was wrong; otherwise the next call sees
                    # its own invalid answer with no instruction to correct it.
                    self.conversation_history.append(
                        {
                            "role": "user",
                            "content": f"响应格式错误: {process_result.get('error', '')}。请严格按照要求的YAML格式重新生成，并在code字段中提供可执行的Python代码。",
                        }
                    )

            except Exception as e:
                # Any failure in this round is reported back to the model and
                # the loop moves on to the next round.
                error_msg = f"LLM调用错误: {str(e)}"
                print(f"[ERROR] {error_msg}")
                self.conversation_history.append(
                    {
                        "role": "user",
                        "content": f"发生错误: {error_msg}，请重新生成代码。",
                    }
                )

    def _append_round_to_session(self, process_result: Dict[str, Any], result: Dict[str, Any], feedback: str) -> None:
        """Append one round's data and exported-file metadata to the session reference.

        Does nothing when no session reference has been set.

        Args:
            process_result: The ``generate_code`` result dict for the round.
            result: The executor result dict for the round.
            feedback: The formatted (unsanitized) execution feedback text.
        """
        if not self._session_ref:
            return

        round_data = {
            "round": self.current_round,
            "reasoning": process_result.get("reasoning", ""),
            "code": process_result.get("code", ""),
            "result_summary": self._summarize_result(result),
            "evidence_rows": result.get("evidence_rows", []),
            "raw_log": feedback,
            "auto_exported_files": result.get("auto_exported_files", []),
            "prompt_saved_files": result.get("prompt_saved_files", []),
        }
        self._session_ref.rounds.append(round_data)

        # Merge file metadata into SessionData.data_files
        for f in round_data["auto_exported_files"] or []:
            if f.get("skipped"):
                continue  # Large DataFrame — not written to disk
            self._session_ref.data_files.append({
                "filename": f.get("filename", ""),
                "description": f"自动导出: {f.get('variable_name', '')}",
                "rows": f.get("rows", 0),
                "cols": f.get("cols", 0),
                "columns": f.get("columns", []),
                "size_bytes": 0,
                "source": "auto",
            })
        for f in round_data["prompt_saved_files"] or []:
            self._session_ref.data_files.append({
                "filename": f.get("filename", ""),
                "description": f.get("description", ""),
                "rows": f.get("rows", 0),
                "cols": 0,
                "columns": [],
                "size_bytes": 0,
                "source": "prompt",
            })

    def _build_conversation_prompt(self) -> str:
        """构建对话提示词"""
        prompt_parts = []

        for msg in self.conversation_history:
            role = msg["role"]
            content = msg["content"]
            if role == "user":
                prompt_parts.append(f"用户: {content}")
            else:
                prompt_parts.append(f"助手: {content}")

        return "\n\n".join(prompt_parts)

    def _generate_final_report(self) -> Dict[str, Any]:
        """Generate the final analysis report and bundle the session results.

        The method performs these steps in order:

        1. Gather the figures recorded by ``collect_figures`` rounds in
           ``self.analysis_results``.
        2. Scan ``self.session_output_dir`` for ``*.png`` files that were never
           explicitly collected and append a placeholder entry for each one.
        3. Build the report prompt and ask the LLM for the report. If the
           reply looks like the legacy YAML envelope, try to unwrap its
           ``final_report`` field; otherwise the raw reply is the report.
        4. Write the report to ``最终分析报告.md`` inside the session directory.
        5. Generate the reusable script.

        Steps 2 to 5 are best effort: a failure is printed and the method
        carries on, so a result dict is still returned.

        Returns:
            A dict with the keys ``session_output_dir``, ``total_rounds``,
            ``analysis_results``, ``collected_figures``,
            ``conversation_history``, ``final_report``, ``report_file_path``
            and ``reusable_script_path`` (empty string when script generation
            failed).
        """
        import glob

        # Figures that the agent explicitly reported via collect_figures.
        all_figures = []
        for result in self.analysis_results:
            if result.get("action") == "collect_figures":
                # "or []" guards against an explicit None stored under the key.
                all_figures.extend(result.get("collected_figures") or [])

        print("\n[CHART] 开始生成最终分析报告...")
        print(f"[DIR] 输出目录: {self.session_output_dir}")

        # --- Auto-discovery of figures that were saved but never collected ---
        try:
            collected_paths = {
                os.path.abspath(fig.get("file_path"))
                for fig in all_figures
                if fig.get("file_path")
            }

            # glob returns entries in filesystem order; sort so the figure
            # list (and therefore the report prompt) is deterministic.
            png_pattern = os.path.join(self.session_output_dir, "*.png")
            for png_path in sorted(glob.glob(png_pattern)):
                abs_png_path = os.path.abspath(png_path)
                if abs_png_path in collected_paths:
                    continue

                png_name = os.path.basename(png_path)
                print(f"[SEARCH] [自动发现] 补充未显式收集的图片: {png_name}")
                all_figures.append({
                    "figure_number": "Auto",
                    "filename": png_name,
                    "file_path": abs_png_path,
                    "description": f"自动发现的分析图表: {png_name}",
                    "analysis": "（该图表由系统自动捕获，Agent未提供具体分析文本，请结合图表标题理解）"
                })
        except Exception as e:
            print(f"[WARN] 自动发现图片失败: {e}")
        # ----------------------------------------------------------------------

        print(f"[NUM] 总轮数: {self.current_round}")
        print(f"[GRAPH] 收集图片: {len(all_figures)} 个")

        # Build the prompt used to request the final report.
        final_report_prompt = self._build_final_report_prompt(all_figures)

        try:
            response = self.llm.call(
                prompt=final_report_prompt,
                system_prompt="你将会接收到一个数据分析任务的最终报告请求，请根据提供的分析结果和图片信息生成完整的分析报告。",
                max_tokens=16384,  # generous limit so the full report fits
            )

            # The prompt asks for Markdown directly, so the raw reply is the
            # report unless the legacy YAML envelope is detected below.
            final_report_content = response

            # Backward compatibility: the model may still answer with YAML.
            if response.strip().startswith("action:") or "final_report:" in response:
                try:
                    yaml_data = self.llm.parse_yaml_response(response)
                    if yaml_data.get("action") == "analysis_complete":
                        unwrapped = yaml_data.get("final_report")
                        # Only accept a usable, non-empty string; anything
                        # else would later fail to be written to disk.
                        if isinstance(unwrapped, str) and unwrapped.strip():
                            final_report_content = unwrapped
                except Exception:
                    # Deliberate fallback: if parsing fails, keep the raw
                    # reply. Narrowed from a bare except so that
                    # KeyboardInterrupt/SystemExit are not swallowed.
                    pass

            print("[OK] 最终报告生成完成")

        except Exception as e:
            print(f"[ERROR] 生成最终报告时出错: {str(e)}")
            final_report_content = f"报告生成失败: {str(e)}"

        # Persist the report next to the figures.
        report_file_path = os.path.join(self.session_output_dir, "最终分析报告.md")
        try:
            with open(report_file_path, "w", encoding="utf-8") as f:
                f.write(final_report_content)
            print(f"[DOC] 最终报告已保存至: {report_file_path}")
        except Exception as e:
            print(f"[ERROR] 保存报告文件失败: {str(e)}")

        # Generate the reusable script; failure leaves the path empty.
        script_path = ""
        try:
            script_path = generate_reusable_script(
                analysis_results=self.analysis_results,
                data_files=self.data_files,
                session_output_dir=self.session_output_dir,
                user_requirement=self.user_requirement
            )
        except Exception as e:
            print(f"[WARN] 脚本生成失败: {e}")

        # Full result bundle handed back to the caller.
        return {
            "session_output_dir": self.session_output_dir,
            "total_rounds": self.current_round,
            "analysis_results": self.analysis_results,
            "collected_figures": all_figures,
            "conversation_history": self.conversation_history,
            "final_report": final_report_content,
            "report_file_path": report_file_path,
            "reusable_script_path": script_path,
        }

    def _build_final_report_prompt(self, all_figures: List[Dict[str, Any]]) -> str:
        """构建用于生成最终报告的提示词"""

        # 构建图片信息摘要，使用相对路径
        figures_summary = ""
        if all_figures:
            figures_summary = "\n生成的图片及分析:\n"
            for i, figure in enumerate(all_figures, 1):
                filename = figure.get("filename", "未知文件名")
                # 使用相对路径格式，适合在报告中引用
                relative_path = f"./{filename}"
                figures_summary += f"{i}. {filename}\n"
                figures_summary += f"   相对路径: {relative_path}\n"
                figures_summary += f"   描述: {figure.get('description', '无描述')}\n"
                figures_summary += f"   分析: {figure.get('analysis', '无分析')}\n\n"
        else:
            figures_summary = "\n本次分析未生成图片。\n"

        # 构建代码执行结果摘要（仅包含成功执行的代码块）
        code_results_summary = ""
        success_code_count = 0
        for result in self.analysis_results:
            if result.get("action") != "collect_figures" and result.get("code"):
                exec_result = result.get("result", {})
                if exec_result.get("success"):
                    success_code_count += 1
                    code_results_summary += f"代码块 {success_code_count}: 执行成功\n"
                    if exec_result.get("output"):
                        code_results_summary += (
                            f"输出: {exec_result.get('output')[:]}\n\n"
                        )

        # 构建各轮次证据数据摘要
        evidence_summary = ""
        if self._session_ref and self._session_ref.rounds:
            evidence_parts = []
            for rd in self._session_ref.rounds:
                round_num = rd.get("round", 0)
                summary = rd.get("result_summary", "")
                evidence = rd.get("evidence_rows", [])
                reasoning = rd.get("reasoning", "")
                part = f"第{round_num}轮: {summary}"
                if reasoning:
                    part += f"\n  推理: {reasoning[:200]}"
                if evidence:
                    part += f"\n  数据样本({len(evidence)}行): {json.dumps(evidence[:3], ensure_ascii=False, default=str)}"
                evidence_parts.append(part)
            evidence_summary = "\n".join(evidence_parts)

        # 使用 prompts.py 中的统一提示词模板，并添加相对路径使用说明
        prompt = final_report_system_prompt.format(
            current_round=self.current_round,
            session_output_dir=self.session_output_dir,
            data_profile=self.data_profile,  # 注入数据画像
            figures_summary=figures_summary,
            code_results_summary=code_results_summary,
        )

        # Append evidence data from all rounds for evidence annotation
        if evidence_summary:
            prompt += f"""

**各轮次分析证据数据 (Evidence by Round)**：
以下是每轮分析的结果摘要和数据样本，请在报告中使用 `<!-- evidence:round_N -->` 标注引用了哪一轮的数据：

{evidence_summary}
"""

        # 在提示词中明确要求使用相对路径
        prompt += """

[FOLDER] **图片路径使用说明**：
报告和图片都在同一目录下，请在报告中使用相对路径引用图片：
- 格式：![图片描述](./具体图片名称.png)
- 示例：![营业总收入趋势](./营业总收入趋势.png)
- 注意：必须使用实际生成的图片文件名，严禁使用占位符
"""

        return prompt

    def reset(self):
        """重置智能体状态"""
        self.conversation_history = []
        self.analysis_results = []
        self.current_round = 0
        self.executor.reset_environment()