# utils/data_privacy.py

# -*- coding: utf-8 -*-
"""
数据隐私保护层

核心原则：发给外部 LLM 的信息只包含 schema 级别的元数据，
绝不包含真实数据值。所有真实数据仅在本地代码执行环境中使用。

分级策略：
  - SAFE（安全级）: 可发送给 LLM — 列名、数据类型、行列数、空值率、唯一值数量
  - LOCAL（本地级）: 仅本地使用 — 真实数据值、TOP N 高频值、统计数值、样本行
"""

import re
import pandas as pd
from typing import List


def build_safe_profile(file_paths: list) -> str:
    """
    Build a schema-only Markdown profile of the given data files.

    For every file the report lists the dimensions, the column names and,
    per column, the dtype, the null rate, the number of distinct values and
    the description returned by ``_describe_column_safe``. Cell values are
    never formatted into the table.

    Args:
        file_paths: Paths of the data files to profile.

    Returns:
        The Markdown profile. A file that does not exist or that the loader
        could not read gets a ``[WARN]`` line; a file that raises while being
        profiled gets a single ``[ERROR]`` line and no partial table.
    """
    import os

    header = "# 数据结构概览 (Schema Profile)\n\n"

    if not file_paths:
        return header + "未提供数据文件。"

    def _cell(value) -> str:
        # Column labels may be non-strings and may contain characters that
        # would otherwise split or break a Markdown table row.
        return str(value).replace("|", "\\|").replace("\n", " ")

    chunks = [header]

    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        chunks.append(f"## 文件: {file_name}\n\n")

        if not os.path.exists(file_path):
            chunks.append(f"[WARN] 文件不存在: {file_path}\n\n")
            continue

        try:
            df = _load_dataframe(file_path)
            if df is None:
                # The loader signals "unsupported or unreadable" with None;
                # say so instead of leaving a bare heading in the report.
                chunks.append("[WARN] 无法加载文件（格式不支持或解析失败）\n\n")
                continue

            rows, cols = df.shape
            # Labels can be ints, tuples, etc.; str.join only accepts strings.
            column_names = [str(label) for label in df.columns]

            # Collect the section separately so that a failure halfway
            # through does not leave a truncated table in the output.
            section = [
                f"- **维度**: {rows} 行 x {cols} 列\n",
                f"- **列名**: `{', '.join(column_names)}`\n\n",
                "### 列结构:\n\n",
                "| 列名 | 数据类型 | 空值率 | 唯一值数 | 特征描述 |\n",
                "|------|---------|--------|---------|----------|\n",
            ]

            for position, label in enumerate(df.columns):
                # Positional access always yields a Series, even when two
                # columns share the same label.
                series = df.iloc[:, position]
                dtype = str(series.dtype)
                null_count = series.isnull().sum()
                null_pct = f"{(null_count / rows) * 100:.1f}%" if rows > 0 else "0%"
                unique_count = series.nunique()

                # Describes the kind of column only, not its contents.
                feature_desc = _describe_column_safe(series, unique_count, rows)

                section.append(
                    f"| {_cell(label)} | {dtype} | {null_pct} | {unique_count} | {feature_desc} |\n"
                )

            section.append("\n")
            chunks.extend(section)

        except Exception as e:
            # NOTE(review): the exception text is forwarded verbatim; confirm
            # that no library error message can embed cell values.
            chunks.append(f"[ERROR] 读取文件失败: {str(e)}\n\n")

    return "".join(chunks)


def build_local_profile(file_paths: list) -> str:
    """
    Build the full local data profile, including real data values.

    Intended for the local code-execution environment only; the result is
    not meant to be sent to an LLM. This simply delegates to
    ``load_and_profile_data`` from ``utils.data_loader``, keeping its
    complete output.
    """
    # Imported at call time rather than at module import time.
    from utils.data_loader import load_and_profile_data as _profile_with_values

    full_profile = _profile_with_values(file_paths)
    return full_profile


def sanitize_execution_feedback(feedback: str, max_lines: int = 30) -> str:
    """
    Redact code-execution feedback so that data rows are not passed on.

    Kept:
      - lines containing one of the status / path / schema markers below
        (execution status, error markers, shape, column names, image paths)
      - ordinary text lines

    Replaced by a single placeholder per consecutive run:
      - lines that look like DataFrame rows
      - lines made up mostly of numbers

    Args:
        feedback: Raw execution feedback. Falsy input is returned unchanged.
        max_lines: Maximum number of sanitized lines to keep; values below
            zero are treated as zero.

    Returns:
        The sanitized feedback. When it was cut to ``max_lines``, a trailing
        note reports the number of lines in the original feedback.
    """
    if not feedback:
        return feedback

    # NOTE(review): a line containing any of these substrings is kept
    # verbatim, even if the rest of the line happens to carry data.
    keep_markers = (
        "图片已保存", "保存至", "[OK]", "[WARN]", "[ERROR]",
        "[Auto-Save]", "数据表形状", "列名:", ".png",
        "shape", "columns", "dtype", "info()", "describe()",
    )

    lines = feedback.split("\n")
    safe_lines = []
    # True while we are inside a run of redacted lines, so that each run
    # produces exactly one placeholder.
    in_redacted_run = False

    for line in lines:
        stripped = line.strip()

        # Key information that is always kept.
        if any(marker in stripped for marker in keep_markers):
            safe_lines.append(line)
            in_redacted_run = False
            continue

        # Rows of printed DataFrame output.
        if _looks_like_dataframe_row(stripped):
            if not in_redacted_run:
                in_redacted_run = True
                safe_lines.append("[数据输出已省略 - 数据仅在本地执行环境中可见]")
            continue

        # Lines that are mostly numbers.
        if _is_numeric_heavy_line(stripped):
            if not in_redacted_run:
                in_redacted_run = True
                safe_lines.append("[数值输出已省略]")
            continue

        # Ordinary text line.
        in_redacted_run = False
        safe_lines.append(line)

    # Cap the total number of lines. A negative limit would otherwise slice
    # from the end of the list and keep almost everything.
    limit = max(max_lines, 0)
    if len(safe_lines) > limit:
        safe_lines = safe_lines[:limit]
        safe_lines.append(f"[... 输出已截断，共 {len(lines)} 行]")

    return "\n".join(safe_lines)


def _load_dataframe(file_path: str):
    """加载 DataFrame，支持多种格式和编码"""
    import os

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        for encoding in ["utf-8", "gbk", "gb18030", "latin1"]:
            try:
                return pd.read_csv(file_path, encoding=encoding)
            except (UnicodeDecodeError, Exception):
                continue
    elif ext in [".xlsx", ".xls"]:
        try:
            return pd.read_excel(file_path)
        except Exception:
            pass
    return None


def _describe_column_safe(series: pd.Series, unique_count: int, total_rows: int) -> str:
    """安全地描述列特征，不暴露具体值"""
    dtype = series.dtype

    if pd.api.types.is_numeric_dtype(dtype):
        if unique_count <= 5:
            return "低基数数值（可能是分类编码）"
        elif unique_count < total_rows * 0.05:
            return "离散数值"
        else:
            return "连续数值"

    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "时间序列"

    # 文本/分类列
    if unique_count == 1:
        return "单一值（常量列）"
    elif unique_count <= 10:
        return f"低基数分类（{unique_count}类）"
    elif unique_count <= 50:
        return f"中基数分类（{unique_count}类）"
    elif unique_count > total_rows * 0.8:
        return "高基数文本（可能是ID或描述）"
    else:
        return f"文本分类（{unique_count}类）"


def _looks_like_dataframe_row(line: str) -> bool:
    """判断一行是否看起来像 DataFrame 输出"""
    if not line:
        return False
    # DataFrame 输出通常有多个空格分隔的列
    parts = line.split()
    if len(parts) >= 3:
        # 第一个元素是索引（数字）
        try:
            int(parts[0])
            return True
        except ValueError:
            pass
    return False


def _is_numeric_heavy_line(line: str) -> bool:
    """判断一行是否主要由数值组成"""
    if not line or len(line) < 5:
        return False
    digits_and_dots = sum(1 for c in line if c.isdigit() or c in ".,-+eE ")
    return digits_and_dots / len(line) > 0.7