大更新：架构调整，数据分析能力提升。

2026-04-19 21:30:08 +08:00
parent 9d01f004d4
commit 00bd48e7e7
26 changed files with 4375 additions and 252 deletions
--- a/utils/data_privacy.py
+++ b/utils/data_privacy.py
@@ -154,6 +154,82 @@ def sanitize_execution_feedback(feedback: str, max_lines: int = 30) -> str:
    return "\n".join(safe_lines)


+def _extract_column_from_error(error_message: str) -> Optional[str]:
+    """Extract the column name referenced by an error message.
+
+    Recognizes ``KeyError: 'name'`` first, then a case-insensitive
+    ``column 'name'``. Either quote style works, but the closing quote must
+    match the opening one, so ``KeyError: "it's"`` yields ``it's``.
+
+    Returns the extracted column name, or None if no reference is found.
+    """
+    for pattern, flags in (
+        (r"KeyError:\s*(['\"])(.+?)\1", 0),
+        (r"column\s+(['\"])(.+?)\1", re.IGNORECASE),
+    ):
+        match = re.search(pattern, error_message, flags)
+        if match:
+            return match.group(2)
+    return None
+
+
+def _lookup_column_in_profile(column_name: Optional[str], safe_profile: str) -> Optional[dict]:
+    """Look up column metadata in the safe profile markdown table.
+
+    Matches the first table row whose first cell equals ``column_name``. Empty
+    cells keep their position (as ""), so a blank field cannot shift the rest.
+
+    Args:
+        column_name: The column name to look up (may be None).
+        safe_profile: The safe profile markdown string.
+
+    Returns:
+        Dict with dtype, null_rate, unique_count, description; None if absent.
+    """
+    keys = ("dtype", "null_rate", "unique_count", "description")
+    if not column_name:
+        return None
+    for line in safe_profile.split("\n"):
+        row = line.rstrip()
+        if not row.startswith("|") or column_name not in row:
+            continue
+        # Drop only the outer pipes so empty cells keep their position.
+        inner = row[1:-1] if len(row) > 1 and row.endswith("|") else row[1:]
+        cells = [cell.strip() for cell in inner.split("|")]
+        if len(cells) >= 5 and cells[0] == column_name:
+            return dict(zip(keys, cells[1:5]))
+    return None
+
+
+def generate_enriched_hint(error_message: str, safe_profile: str) -> str:
+    """Build a retry hint for a data-context error from schema-level metadata.
+
+    Extracts the referenced column from the error text and, when the safe
+    profile table has a row for it, appends that column's dtype, unique
+    count, null rate and description taken from the profile.
+
+    Args:
+        error_message: The error message from code execution.
+        safe_profile: The safe profile markdown string.
+
+    Returns: the hint text (banner, error message, optional column metadata).
+    """
+    referenced = _extract_column_from_error(error_message)
+    meta = _lookup_column_in_profile(referenced, safe_profile)
+    pieces = ["[RETRY CONTEXT] 上一次代码执行因数据上下文错误失败。\n"]
+    pieces.append(f"错误信息: {error_message}\n")
+    if meta:
+        pieces += [
+            f"相关列 '{referenced}' 的结构信息:\n",
+            f"  - 数据类型: {meta['dtype']}\n",
+            f"  - 唯一值数量: {meta['unique_count']}\n",
+            f"  - 空值率: {meta['null_rate']}\n",
+            f"  - 特征描述: {meta['description']}\n",
+        ]
+    pieces.append("请根据以上结构信息修正代码，不要假设具体的数据值。")
+    return "".join(pieces)
+
+
 def _load_dataframe(file_path: str):
    """加载 DataFrame，支持多种格式和编码"""
    import os