大更新:架构调整,数据分析能力提升

This commit is contained in:
2026-04-19 21:30:08 +08:00
parent 9d01f004d4
commit 00bd48e7e7
26 changed files with 4375 additions and 252 deletions

View File

@@ -4,6 +4,7 @@
"""
import os
import re
import sys
import ast
import traceback
@@ -15,6 +16,7 @@ from IPython.utils.capture import capture_output
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd
class CodeExecutor:
@@ -82,8 +84,27 @@ class CodeExecutor:
"PIL",
"random",
"networkx",
"platform",
}
# Maximum rows for auto-export; DataFrames larger than this are skipped
# to avoid heavy disk I/O on large datasets.
AUTO_EXPORT_MAX_ROWS = 50000
# Variable names to skip during DataFrame auto-export
# (common import aliases and built-in namespace names)
_SKIP_EXPORT_NAMES = {
"pd", "np", "plt", "sns", "os", "json", "sys", "re", "io",
"csv", "glob", "duckdb", "display", "math", "datetime", "time",
"warnings", "logging", "copy", "pickle", "pathlib", "collections",
"itertools", "functools", "operator", "random", "networkx",
}
# Regex for parsing DATA_FILE_SAVED markers
_DATA_FILE_SAVED_RE = re.compile(
r"\[DATA_FILE_SAVED\]\s*filename:\s*(.+?),\s*rows:\s*(\d+),\s*description:\s*(.+)"
)
def __init__(self, output_dir: str = "outputs"):
"""
初始化代码执行器
@@ -318,6 +339,142 @@ from IPython.display import display
return str(obj)
@staticmethod
def _sanitize_for_json(rows: List[Dict]) -> List[Dict]:
"""Replace NaN/inf/-inf with None so the data is JSON-serializable."""
import math
sanitized = []
for row in rows:
clean = {}
for k, v in row.items():
if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
clean[k] = None
else:
clean[k] = v
sanitized.append(clean)
return sanitized
def _capture_evidence_rows(self, result, shell) -> List[Dict]:
"""
Capture up to 10 evidence rows from the execution result.
First checks result.result, then falls back to the last DataFrame in namespace.
"""
try:
# Primary: check if result.result is a DataFrame
if result.result is not None and isinstance(result.result, pd.DataFrame):
return self._sanitize_for_json(
result.result.head(10).to_dict(orient="records")
)
except Exception:
pass
# Fallback: find the last-assigned DataFrame variable in namespace
try:
last_df = None
for name, obj in shell.user_ns.items():
if (
not name.startswith("_")
and name not in self._SKIP_EXPORT_NAMES
and isinstance(obj, pd.DataFrame)
):
last_df = obj
if last_df is not None:
return self._sanitize_for_json(
last_df.head(10).to_dict(orient="records")
)
except Exception:
pass
return []
def _snapshot_dataframes(self, shell) -> Dict[str, int]:
"""Snapshot current DataFrame variables as {name: id(obj)}."""
snapshot = {}
try:
for name, obj in shell.user_ns.items():
if (
not name.startswith("_")
and name not in self._SKIP_EXPORT_NAMES
and isinstance(obj, pd.DataFrame)
):
snapshot[name] = id(obj)
except Exception:
pass
return snapshot
def _detect_new_dataframes(
self, before: Dict[str, int], after: Dict[str, int]
) -> List[str]:
"""Return variable names of new or changed DataFrames."""
new_or_changed = []
for name, obj_id in after.items():
if name not in before or before[name] != obj_id:
new_or_changed.append(name)
return new_or_changed
def _export_dataframe(self, var_name: str, df) -> Optional[Dict[str, Any]]:
"""
Export a DataFrame to CSV with dedup suffix. Returns metadata dict or None.
Skips export for DataFrames exceeding AUTO_EXPORT_MAX_ROWS to avoid
heavy disk I/O on large datasets; only metadata is recorded.
"""
try:
rows_count = len(df)
cols_count = len(df.columns)
col_names = list(df.columns)
# Skip writing large DataFrames to disk — record metadata only
if rows_count > self.AUTO_EXPORT_MAX_ROWS:
return {
"variable_name": var_name,
"filename": f"(skipped: {var_name} has {rows_count} rows)",
"rows": rows_count,
"cols": cols_count,
"columns": col_names,
"skipped": True,
}
base_filename = f"{var_name}.csv"
filepath = os.path.join(self.output_dir, base_filename)
# Dedup: if file exists, try _1, _2, ...
if os.path.exists(filepath):
suffix = 1
while True:
dedup_filename = f"{var_name}_{suffix}.csv"
filepath = os.path.join(self.output_dir, dedup_filename)
if not os.path.exists(filepath):
base_filename = dedup_filename
break
suffix += 1
df.to_csv(filepath, index=False)
return {
"variable_name": var_name,
"filename": base_filename,
"rows": rows_count,
"cols": cols_count,
"columns": col_names,
}
except Exception:
return None
def _parse_data_file_saved_markers(self, stdout_text: str) -> List[Dict[str, Any]]:
"""Parse [DATA_FILE_SAVED] marker lines from captured stdout."""
results = []
try:
for line in stdout_text.splitlines():
m = self._DATA_FILE_SAVED_RE.search(line)
if m:
results.append({
"filename": m.group(1).strip(),
"rows": int(m.group(2)),
"description": m.group(3).strip(),
})
except Exception:
pass
return results
def execute_code(self, code: str) -> Dict[str, Any]:
"""
执行代码并返回结果
@@ -330,7 +487,10 @@ from IPython.display import display
'success': bool,
'output': str,
'error': str,
'variables': Dict[str, Any] # 新生成的重要变量
'variables': Dict[str, Any], # 新生成的重要变量
'evidence_rows': List[Dict], # up to 10 evidence rows
'auto_exported_files': List[Dict], # auto-detected DataFrame exports
'prompt_saved_files': List[Dict], # parsed DATA_FILE_SAVED markers
}
"""
# 检查代码安全性
@@ -341,12 +501,18 @@ from IPython.display import display
"output": "",
"error": f"代码安全检查失败: {safety_error}",
"variables": {},
"evidence_rows": [],
"auto_exported_files": [],
"prompt_saved_files": [],
}
# 记录执行前的变量
vars_before = set(self.shell.user_ns.keys())
try:
# --- Task 6.1: Snapshot DataFrame variables before execution ---
df_snapshot_before = self._snapshot_dataframes(self.shell)
# 使用IPython的capture_output来捕获所有输出
with capture_output() as captured:
result = self.shell.run_cell(code)
@@ -359,6 +525,9 @@ from IPython.display import display
"output": captured.stdout,
"error": f"执行前错误: {error_msg}",
"variables": {},
"evidence_rows": [],
"auto_exported_files": [],
"prompt_saved_files": self._parse_data_file_saved_markers(captured.stdout),
}
if result.error_in_exec:
@@ -368,6 +537,9 @@ from IPython.display import display
"output": captured.stdout,
"error": f"执行错误: {error_msg}",
"variables": {},
"evidence_rows": [],
"auto_exported_files": [],
"prompt_saved_files": self._parse_data_file_saved_markers(captured.stdout),
}
# 获取输出
@@ -423,11 +595,36 @@ from IPython.display import display
print(f"[WARN] [Auto-Save Global] 异常: {e}")
# --- 自动保存机制 end ---
# --- Task 5: Evidence capture ---
evidence_rows = self._capture_evidence_rows(result, self.shell)
# --- Task 6.2-6.4: DataFrame auto-detection and export ---
auto_exported_files = []
try:
df_snapshot_after = self._snapshot_dataframes(self.shell)
new_df_names = self._detect_new_dataframes(df_snapshot_before, df_snapshot_after)
for var_name in new_df_names:
try:
df_obj = self.shell.user_ns[var_name]
meta = self._export_dataframe(var_name, df_obj)
if meta is not None:
auto_exported_files.append(meta)
except Exception:
pass
except Exception:
pass
# --- Task 7: DATA_FILE_SAVED marker parsing ---
prompt_saved_files = self._parse_data_file_saved_markers(captured.stdout)
return {
"success": True,
"output": output,
"error": "",
"variables": important_new_vars,
"evidence_rows": evidence_rows,
"auto_exported_files": auto_exported_files,
"prompt_saved_files": prompt_saved_files,
}
except Exception as e:
return {
@@ -435,6 +632,9 @@ from IPython.display import display
"output": captured.stdout if "captured" in locals() else "",
"error": f"执行异常: {str(e)}\n{traceback.format_exc()}",
"variables": {},
"evidence_rows": [],
"auto_exported_files": [],
"prompt_saved_files": [],
}
def reset_environment(self):