大更新:架构调整,数据分析能力提升

This commit is contained in:
2026-04-19 21:30:08 +08:00
parent 9d01f004d4
commit 00bd48e7e7
26 changed files with 4375 additions and 252 deletions

View File

@@ -4,6 +4,7 @@
"""
import os
import re
import sys
import ast
import traceback
@@ -15,6 +16,7 @@ from IPython.utils.capture import capture_output
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd
class CodeExecutor:
@@ -82,8 +84,27 @@ class CodeExecutor:
"PIL",
"random",
"networkx",
"platform",
}
# Maximum rows for auto-export; DataFrames larger than this are skipped
# to avoid heavy disk I/O on large datasets.
AUTO_EXPORT_MAX_ROWS = 50000
# Variable names to skip during DataFrame auto-export
# (common import aliases and built-in namespace names)
_SKIP_EXPORT_NAMES = {
"pd", "np", "plt", "sns", "os", "json", "sys", "re", "io",
"csv", "glob", "duckdb", "display", "math", "datetime", "time",
"warnings", "logging", "copy", "pickle", "pathlib", "collections",
"itertools", "functools", "operator", "random", "networkx",
}
# Regex for parsing DATA_FILE_SAVED markers
_DATA_FILE_SAVED_RE = re.compile(
r"\[DATA_FILE_SAVED\]\s*filename:\s*(.+?),\s*rows:\s*(\d+),\s*description:\s*(.+)"
)
def __init__(self, output_dir: str = "outputs"):
"""
初始化代码执行器
@@ -318,6 +339,142 @@ from IPython.display import display
return str(obj)
@staticmethod
def _sanitize_for_json(rows: List[Dict]) -> List[Dict]:
"""Replace NaN/inf/-inf with None so the data is JSON-serializable."""
import math
sanitized = []
for row in rows:
clean = {}
for k, v in row.items():
if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
clean[k] = None
else:
clean[k] = v
sanitized.append(clean)
return sanitized
def _capture_evidence_rows(self, result, shell) -> List[Dict]:
"""
Capture up to 10 evidence rows from the execution result.
First checks result.result, then falls back to the last DataFrame in namespace.
"""
try:
# Primary: check if result.result is a DataFrame
if result.result is not None and isinstance(result.result, pd.DataFrame):
return self._sanitize_for_json(
result.result.head(10).to_dict(orient="records")
)
except Exception:
pass
# Fallback: find the last-assigned DataFrame variable in namespace
try:
last_df = None
for name, obj in shell.user_ns.items():
if (
not name.startswith("_")
and name not in self._SKIP_EXPORT_NAMES
and isinstance(obj, pd.DataFrame)
):
last_df = obj
if last_df is not None:
return self._sanitize_for_json(
last_df.head(10).to_dict(orient="records")
)
except Exception:
pass
return []
def _snapshot_dataframes(self, shell) -> Dict[str, int]:
"""Snapshot current DataFrame variables as {name: id(obj)}."""
snapshot = {}
try:
for name, obj in shell.user_ns.items():
if (
not name.startswith("_")
and name not in self._SKIP_EXPORT_NAMES
and isinstance(obj, pd.DataFrame)
):
snapshot[name] = id(obj)
except Exception:
pass
return snapshot
def _detect_new_dataframes(
self, before: Dict[str, int], after: Dict[str, int]
) -> List[str]:
"""Return variable names of new or changed DataFrames."""
new_or_changed = []
for name, obj_id in after.items():
if name not in before or before[name] != obj_id:
new_or_changed.append(name)
return new_or_changed
def _export_dataframe(self, var_name: str, df) -> Optional[Dict[str, Any]]:
"""
Export a DataFrame to CSV with dedup suffix. Returns metadata dict or None.
Skips export for DataFrames exceeding AUTO_EXPORT_MAX_ROWS to avoid
heavy disk I/O on large datasets; only metadata is recorded.
"""
try:
rows_count = len(df)
cols_count = len(df.columns)
col_names = list(df.columns)
# Skip writing large DataFrames to disk — record metadata only
if rows_count > self.AUTO_EXPORT_MAX_ROWS:
return {
"variable_name": var_name,
"filename": f"(skipped: {var_name} has {rows_count} rows)",
"rows": rows_count,
"cols": cols_count,
"columns": col_names,
"skipped": True,
}
base_filename = f"{var_name}.csv"
filepath = os.path.join(self.output_dir, base_filename)
# Dedup: if file exists, try _1, _2, ...
if os.path.exists(filepath):
suffix = 1
while True:
dedup_filename = f"{var_name}_{suffix}.csv"
filepath = os.path.join(self.output_dir, dedup_filename)
if not os.path.exists(filepath):
base_filename = dedup_filename
break
suffix += 1
df.to_csv(filepath, index=False)
return {
"variable_name": var_name,
"filename": base_filename,
"rows": rows_count,
"cols": cols_count,
"columns": col_names,
}
except Exception:
return None
def _parse_data_file_saved_markers(self, stdout_text: str) -> List[Dict[str, Any]]:
"""Parse [DATA_FILE_SAVED] marker lines from captured stdout."""
results = []
try:
for line in stdout_text.splitlines():
m = self._DATA_FILE_SAVED_RE.search(line)
if m:
results.append({
"filename": m.group(1).strip(),
"rows": int(m.group(2)),
"description": m.group(3).strip(),
})
except Exception:
pass
return results
def execute_code(self, code: str) -> Dict[str, Any]:
"""
执行代码并返回结果
@@ -330,7 +487,10 @@ from IPython.display import display
'success': bool,
'output': str,
'error': str,
'variables': Dict[str, Any] # 新生成的重要变量
'variables': Dict[str, Any], # 新生成的重要变量
'evidence_rows': List[Dict], # up to 10 evidence rows
'auto_exported_files': List[Dict], # auto-detected DataFrame exports
'prompt_saved_files': List[Dict], # parsed DATA_FILE_SAVED markers
}
"""
# 检查代码安全性
@@ -341,12 +501,18 @@ from IPython.display import display
"output": "",
"error": f"代码安全检查失败: {safety_error}",
"variables": {},
"evidence_rows": [],
"auto_exported_files": [],
"prompt_saved_files": [],
}
# 记录执行前的变量
vars_before = set(self.shell.user_ns.keys())
try:
# --- Task 6.1: Snapshot DataFrame variables before execution ---
df_snapshot_before = self._snapshot_dataframes(self.shell)
# 使用IPython的capture_output来捕获所有输出
with capture_output() as captured:
result = self.shell.run_cell(code)
@@ -359,6 +525,9 @@ from IPython.display import display
"output": captured.stdout,
"error": f"执行前错误: {error_msg}",
"variables": {},
"evidence_rows": [],
"auto_exported_files": [],
"prompt_saved_files": self._parse_data_file_saved_markers(captured.stdout),
}
if result.error_in_exec:
@@ -368,6 +537,9 @@ from IPython.display import display
"output": captured.stdout,
"error": f"执行错误: {error_msg}",
"variables": {},
"evidence_rows": [],
"auto_exported_files": [],
"prompt_saved_files": self._parse_data_file_saved_markers(captured.stdout),
}
# 获取输出
@@ -423,11 +595,36 @@ from IPython.display import display
print(f"[WARN] [Auto-Save Global] 异常: {e}")
# --- 自动保存机制 end ---
# --- Task 5: Evidence capture ---
evidence_rows = self._capture_evidence_rows(result, self.shell)
# --- Task 6.2-6.4: DataFrame auto-detection and export ---
auto_exported_files = []
try:
df_snapshot_after = self._snapshot_dataframes(self.shell)
new_df_names = self._detect_new_dataframes(df_snapshot_before, df_snapshot_after)
for var_name in new_df_names:
try:
df_obj = self.shell.user_ns[var_name]
meta = self._export_dataframe(var_name, df_obj)
if meta is not None:
auto_exported_files.append(meta)
except Exception:
pass
except Exception:
pass
# --- Task 7: DATA_FILE_SAVED marker parsing ---
prompt_saved_files = self._parse_data_file_saved_markers(captured.stdout)
return {
"success": True,
"output": output,
"error": "",
"variables": important_new_vars,
"evidence_rows": evidence_rows,
"auto_exported_files": auto_exported_files,
"prompt_saved_files": prompt_saved_files,
}
except Exception as e:
return {
@@ -435,6 +632,9 @@ from IPython.display import display
"output": captured.stdout if "captured" in locals() else "",
"error": f"执行异常: {str(e)}\n{traceback.format_exc()}",
"variables": {},
"evidence_rows": [],
"auto_exported_files": [],
"prompt_saved_files": [],
}
def reset_environment(self):