Merge branch 'main' of http://jeason.online:3000/zhaojie/iov_data_analysis_agent

2026-04-19 16:29:59 +08:00
parent c5083736e2 b033eb61cc
commit b256aa27d9
22 changed files with 2060 additions and 916 deletions
--- a/utils/init.py
+++ b/utils/init.py
@@ -6,5 +6,12 @@
 from utils.code_executor import CodeExecutor
 from utils.llm_helper import LLMHelper
 from utils.fallback_openai_client import AsyncFallbackOpenAIClient
+from utils.logger import PrintCapture, create_session_logger

-__all__ = ["CodeExecutor", "LLMHelper", "AsyncFallbackOpenAIClient"]
+# Names exported by a star-import of this module; every entry corresponds
+# to one of the names imported above.
+__all__ = [
+    "CodeExecutor",
+    "LLMHelper",
+    "AsyncFallbackOpenAIClient",
+    "PrintCapture",
+    "create_session_logger",
+]
--- a/utils/data_privacy.py
+++ b/utils/data_privacy.py
@@ -0,0 +1,225 @@
+# -*- coding: utf-8 -*-
+"""
+数据隐私保护层
+
+核心原则：发给外部 LLM 的信息只包含 schema 级别的元数据，
+绝不包含真实数据值。所有真实数据仅在本地代码执行环境中使用。
+
+分级策略：
+  - SAFE（安全级）: 可发送给 LLM — 列名、数据类型、行列数、空值率、唯一值数量
+  - LOCAL（本地级）: 仅本地使用 — 真实数据值、TOP N 高频值、统计数值、样本行
+"""
+
+import re
+import pandas as pd
+from typing import List
+
+
+def build_safe_profile(file_paths: list) -> str:
+    """
+    Build a data profile that is safe to send to an external LLM.
+
+    Only schema-level metadata is emitted for each file: its dimensions,
+    column names, dtypes, null rate, distinct-value count and a coarse
+    feature description. No cell values are written to the profile.
+
+    Args:
+        file_paths: Paths of the data files to profile.
+
+    Returns:
+        A Markdown document describing the structure of every file. Files
+        that are missing, unreadable or fail during profiling are reported
+        with a [WARN]/[ERROR] line instead of a table.
+    """
+    import os
+
+    header = "# 数据结构概览 (Schema Profile)\n\n"
+
+    if not file_paths:
+        return header + "未提供数据文件。"
+
+    # Collect fragments and join once instead of repeated string concatenation.
+    parts = [header]
+
+    for file_path in file_paths:
+        file_name = os.path.basename(file_path)
+        parts.append(f"## 文件: {file_name}\n\n")
+
+        if not os.path.exists(file_path):
+            parts.append(f"[WARN] 文件不存在: {file_path}\n\n")
+            continue
+
+        try:
+            df = _load_dataframe(file_path)
+            if df is None:
+                # Previously the heading was left dangling with no explanation.
+                parts.append("[WARN] 无法读取文件或文件格式不受支持\n\n")
+                continue
+
+            rows, cols = df.shape
+            # Column labels are not guaranteed to be strings (e.g. numeric
+            # headers), so convert before joining.
+            col_names = [str(label) for label in df.columns]
+            parts.append(f"- **维度**: {rows} 行 x {cols} 列\n")
+            parts.append(f"- **列名**: `{', '.join(col_names)}`\n\n")
+            parts.append("### 列结构:\n\n")
+            parts.append("| 列名 | 数据类型 | 空值率 | 唯一值数 | 特征描述 |\n")
+            parts.append("|------|---------|--------|---------|----------|\n")
+
+            for position, name in enumerate(col_names):
+                # Positional access always yields a Series, even when two
+                # columns share the same label.
+                series = df.iloc[:, position]
+                dtype = str(series.dtype)
+                null_count = series.isnull().sum()
+                null_pct = f"{(null_count / rows) * 100:.1f}%" if rows > 0 else "0%"
+                unique_count = series.nunique()
+
+                # Describe the column's character without exposing values.
+                feature_desc = _describe_column_safe(series, unique_count, rows)
+
+                # A raw "|" in a label would split the Markdown table cell.
+                cell_name = name.replace("|", "\\|")
+                parts.append(
+                    f"| {cell_name} | {dtype} | {null_pct} | {unique_count} | {feature_desc} |\n"
+                )
+
+            parts.append("\n")
+
+        except Exception as e:
+            # NOTE(review): exception text may echo data values in some
+            # cases; consider reporting only the exception type here.
+            parts.append(f"[ERROR] 读取文件失败: {str(e)}\n\n")
+
+    return "".join(parts)
+
+
+def build_local_profile(file_paths: list) -> str:
+    """
+    Build the full local data profile, including real data values.
+
+    Intended for the local code-execution environment only; the result is
+    not meant to be sent to the LLM. This delegates to
+    ``utils.data_loader.load_and_profile_data``.
+
+    Args:
+        file_paths: Paths of the data files to profile.
+
+    Returns:
+        Whatever ``load_and_profile_data`` returns for ``file_paths``.
+    """
+    from utils.data_loader import load_and_profile_data as _profile_files
+
+    full_profile = _profile_files(file_paths)
+    return full_profile
+
+
+def sanitize_execution_feedback(feedback: str, max_lines: int = 30) -> str:
+    """
+    Redact code-execution feedback so it can be passed on without raw data.
+
+    Each line is classified in this order:
+      - lines containing a whitelisted keyword (status tags, save paths,
+        shape/columns/dtype mentions, ".png") are always kept;
+      - lines that look like DataFrame rows, or that consist mostly of
+        numeric characters, are dropped, with one placeholder emitted at the
+        start of each consecutive run of dropped lines;
+      - every other line is kept unchanged.
+
+    Args:
+        feedback: Raw execution feedback text.
+        max_lines: Maximum number of lines kept before the output is
+            truncated; negative values are treated as 0.
+
+    Returns:
+        The redacted feedback. A falsy ``feedback`` is returned unchanged.
+    """
+    if not feedback:
+        return feedback
+
+    # Built once rather than on every loop iteration.
+    keep_keywords = (
+        "图片已保存", "保存至", "[OK]", "[WARN]", "[ERROR]",
+        "[Auto-Save]", "数据表形状", "列名:", ".png",
+        "shape", "columns", "dtype", "info()", "describe()",
+    )
+
+    lines = feedback.split("\n")
+    safe_lines = []
+    # True while inside a run of dropped lines, so that only one placeholder
+    # is emitted per run.
+    in_dataframe_output = False
+
+    for line in lines:
+        stripped = line.strip()
+
+        # Key information that is always retained.
+        if any(kw in stripped for kw in keep_keywords):
+            safe_lines.append(line)
+            in_dataframe_output = False
+            continue
+
+        # Rows that look like DataFrame output (leading integer index).
+        if _looks_like_dataframe_row(stripped):
+            if not in_dataframe_output:
+                in_dataframe_output = True
+                safe_lines.append("[数据输出已省略 - 数据仅在本地执行环境中可见]")
+            continue
+
+        # Lines made up mostly of numeric characters.
+        if _is_numeric_heavy_line(stripped):
+            if not in_dataframe_output:
+                in_dataframe_output = True
+                safe_lines.append("[数值输出已省略]")
+            continue
+
+        # Ordinary text line.
+        in_dataframe_output = False
+        safe_lines.append(line)
+
+    # Cap the total number of lines; a negative limit would otherwise slice
+    # from the end of the list instead of truncating.
+    limit = max(max_lines, 0)
+    if len(safe_lines) > limit:
+        safe_lines = safe_lines[:limit]
+        safe_lines.append(f"[... 输出已截断，共 {len(lines)} 行]")
+
+    return "\n".join(safe_lines)
+
+
+def _load_dataframe(file_path: str):
+    """
+    Load a data file into a DataFrame, chosen by file extension.
+
+    ``.csv`` files are read by trying several encodings in turn and keeping
+    the first that succeeds; ``.xlsx``/``.xls`` files are read with
+    ``pd.read_excel``.
+
+    Args:
+        file_path: Path of the file to load.
+
+    Returns:
+        The loaded DataFrame, or None when the extension is not handled or
+        every read attempt raised.
+    """
+    import os
+
+    suffix = os.path.splitext(file_path)[1].lower()
+
+    if suffix in (".xlsx", ".xls"):
+        try:
+            return pd.read_excel(file_path)
+        except Exception:
+            return None
+
+    if suffix != ".csv":
+        return None
+
+    # Any failure (decoding or otherwise) moves on to the next encoding.
+    for codec in ("utf-8", "gbk", "gb18030", "latin1"):
+        try:
+            frame = pd.read_csv(file_path, encoding=codec)
+        except Exception:
+            continue
+        return frame
+
+    return None
+
+
+def _describe_column_safe(series: pd.Series, unique_count: int, total_rows: int) -> str:
+    """
+    Describe a column's character without exposing any of its values.
+
+    Args:
+        series: The column; only its dtype is inspected.
+        unique_count: Number of distinct values in the column, as computed
+            by the caller.
+        total_rows: Number of rows in the table, used for the cardinality
+            ratios.
+
+    Returns:
+        A short label such as continuous numeric, time series, or a
+        cardinality class for text columns.
+    """
+    # With no distinct values there is nothing to classify; without this
+    # guard such a column was reported as low-cardinality data.
+    if unique_count == 0:
+        return "空列（无有效值）"
+
+    dtype = series.dtype
+
+    if pd.api.types.is_numeric_dtype(dtype):
+        if unique_count <= 5:
+            return "低基数数值（可能是分类编码）"
+        if unique_count < total_rows * 0.05:
+            return "离散数值"
+        return "连续数值"
+
+    if pd.api.types.is_datetime64_any_dtype(dtype):
+        return "时间序列"
+
+    # Text / categorical columns. The fixed thresholds are checked before
+    # the ratio, so small tables never reach the high-cardinality branch.
+    if unique_count == 1:
+        return "单一值（常量列）"
+    if unique_count <= 10:
+        return f"低基数分类（{unique_count}类）"
+    if unique_count <= 50:
+        return f"中基数分类（{unique_count}类）"
+    if unique_count > total_rows * 0.8:
+        return "高基数文本（可能是ID或描述）"
+    return f"文本分类（{unique_count}类）"
+
+
+def _looks_like_dataframe_row(line: str) -> bool:
+    """
+    Report whether a line resembles a printed DataFrame row.
+
+    A line qualifies when it splits on whitespace into at least three
+    tokens and the first token parses as an integer (the row index).
+    """
+    tokens = line.split() if line else []
+    if len(tokens) < 3:
+        return False
+
+    # The leading token must be an integer row index.
+    try:
+        int(tokens[0])
+    except ValueError:
+        return False
+    return True
+
+
+def _is_numeric_heavy_line(line: str) -> bool:
+    """
+    Report whether a line consists mostly of numeric characters.
+
+    Digits and the characters ``. , - + e E`` and space count as numeric.
+    Lines shorter than five characters are never considered numeric-heavy;
+    otherwise more than 70% of the characters must be numeric.
+    """
+    if not line:
+        return False
+
+    total = len(line)
+    if total < 5:
+        return False
+
+    numeric_chars = 0
+    for ch in line:
+        if ch.isdigit() or ch in ".,-+eE ":
+            numeric_chars += 1
+
+    return numeric_chars / total > 0.7
--- a/utils/logger.py
+++ b/utils/logger.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+"""
+统一日志模块 - 替代全局 sys.stdout 劫持
+
+提供线程安全的日志记录，支持同时输出到终端和文件。
+每个会话拥有独立的日志文件，不会互相干扰。
+"""
+
+import logging
+import os
+import sys
+from datetime import datetime
+from typing import Optional
+
+
+def create_session_logger(
+    session_id: str,
+    log_dir: str,
+    log_filename: str = "process.log",
+    level: int = logging.INFO,
+) -> logging.Logger:
+    """
+    Return the logger named ``session.<session_id>``, configuring it on first use.
+
+    The level is applied on every call. Handlers are attached only when the
+    logger has none yet: a UTF-8 file handler appending to
+    ``log_dir/log_filename`` and a stream handler on ``sys.stderr``.
+    Propagation to parent loggers is switched off.
+
+    Args:
+        session_id: Unique identifier of the session.
+        log_dir: Directory for the log file; created if missing.
+        log_filename: Name of the log file inside ``log_dir``.
+        level: Logging level to set on the logger.
+
+    Returns:
+        The configured Logger instance.
+    """
+    session_logger = logging.getLogger(f"session.{session_id}")
+    session_logger.setLevel(level)
+
+    # Already configured by an earlier call: do not attach handlers twice.
+    if session_logger.handlers:
+        return session_logger
+
+    line_format = logging.Formatter(
+        fmt="%(asctime)s %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+    os.makedirs(log_dir, exist_ok=True)
+    targets = (
+        # Session-specific log file, opened in append mode.
+        logging.FileHandler(
+            os.path.join(log_dir, log_filename), encoding="utf-8", mode="a"
+        ),
+        # Console output goes to stderr, leaving stdout untouched.
+        logging.StreamHandler(sys.stderr),
+    )
+    for target in targets:
+        target.setFormatter(line_format)
+        session_logger.addHandler(target)
+
+    # Keep records out of the parent loggers.
+    session_logger.propagate = False
+
+    return session_logger
+
+
+class PrintCapture:
+    """
+    Context manager that tees ``print`` output into a log file.
+
+    On entry ``sys.stdout`` is replaced by a writer that forwards every
+    message to the previous stdout and also appends it to ``log_path``,
+    except for messages containing one of ``filter_patterns``. On exit the
+    previous ``sys.stdout`` is restored and the log file is closed.
+
+    ``sys.stdout`` is process-wide, so the redirection is visible to all
+    code that prints while the context is active.
+
+    Usage:
+        with PrintCapture(log_path) as cap:
+            print("hello")  # goes to both the terminal and the file
+        # sys.stdout is restored here
+    """
+
+    def __init__(self, log_path: str, filter_patterns: Optional[list] = None):
+        """
+        Args:
+            log_path: File that captured output is appended to.
+            filter_patterns: Substrings marking messages that must not be
+                written to the log. None selects the default pattern; an
+                empty list disables filtering.
+        """
+        self.log_path = log_path
+        # Compare against None so that an explicit empty list is honoured
+        # instead of silently falling back to the default pattern.
+        if filter_patterns is None:
+            filter_patterns = ["[TOOL] 执行代码:"]
+        self.filter_patterns = filter_patterns
+        self._original_stdout = None
+        self._log_file = None
+
+    def __enter__(self):
+        # os.makedirs("") raises, so only create a directory when the path
+        # actually has a directory component.
+        log_dir = os.path.dirname(self.log_path)
+        if log_dir:
+            os.makedirs(log_dir, exist_ok=True)
+        self._original_stdout = sys.stdout
+        # Line-buffered so each printed line reaches the file promptly.
+        self._log_file = open(self.log_path, "a", encoding="utf-8", buffering=1)
+        sys.stdout = self._DualWriter(
+            self._original_stdout, self._log_file, self.filter_patterns
+        )
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        sys.stdout = self._original_stdout
+        if self._log_file:
+            self._log_file.close()
+            # Drop the closed handle so it cannot be reused by mistake.
+            self._log_file = None
+        # Never suppress exceptions raised inside the with-block.
+        return False
+
+    class _DualWriter:
+        """Write to two streams at once, filtering what reaches the log."""
+
+        def __init__(self, terminal, log_file, filter_patterns):
+            self.terminal = terminal
+            self.log_file = log_file
+            self.filter_patterns = filter_patterns
+
+        def write(self, message):
+            """Forward to the terminal; log unless a filter pattern matches."""
+            written = self.terminal.write(message)
+            # Skip content that should not be written to the log.
+            if not any(p in message for p in self.filter_patterns):
+                self.log_file.write(message)
+            # Pass the terminal's result back, as text streams report the
+            # number of characters written.
+            return written
+
+        def flush(self):
+            self.terminal.flush()
+            self.log_file.flush()
+
+        def __getattr__(self, name):
+            # Invoked only for attributes not defined here: delegate to the
+            # wrapped terminal so code that inspects sys.stdout (encoding,
+            # isatty, ...) keeps working while capture is active.
+            terminal = self.__dict__.get("terminal")
+            if terminal is None:
+                raise AttributeError(name)
+            return getattr(terminal, name)