前后端页面同步策略，支持分析模板热编辑以及yaml配置，修复提示词编码、占位符等问题，优化文件扫描

2026-04-20 09:50:35 +08:00
parent 00bd48e7e7
commit 3e1ecf2549
14 changed files with 539 additions and 287 deletions
--- a/utils/script_generator.py
+++ b/utils/script_generator.py
@@ -71,6 +71,59 @@ def clean_code_block(code: str) -> str:
    return '\n'.join(result_lines)


+def _is_verification_code(code: str) -> bool:
+    """Detect code blocks that only check/list files without doing real analysis.
+
+    These are typically generated when the LLM runs os.listdir / os.path.exists
+    loops to verify outputs, and should not appear in the reusable script.
+
+    The check is a substring heuristic over the stripped, non-blank lines of
+    ``code`` that do not start with ``#``. Because matching is plain substring
+    containment, a keyword that appears inside a string literal or a trailing
+    comment counts as well.
+
+    Args:
+        code: Source text of one code block.
+
+    Returns:
+        True if no such lines remain (empty or comment-only input), or if at
+        least one line contains a verification keyword and no line contains
+        an analysis keyword. False otherwise.
+    """
+    # Keep only stripped lines that carry code: blank lines and whole-line comments are dropped.
+    lines = [l.strip() for l in code.strip().splitlines() if l.strip() and not l.strip().startswith('#')]
+    if not lines:
+        # Nothing but blanks/comments is left, so the block is reported as skippable.
+        return True
+
+    # Per-family counts of matching lines; a single line can count toward both.
+    verification_indicators = 0
+    analysis_indicators = 0
+
+    for line in lines:
+        # Verification patterns
+        if any(kw in line for kw in [
+            'os.listdir(', 'os.path.exists(', 'os.path.getsize(',
+            'os.path.isfile(', '✓', '✗', 'all_exist',
+        ]):
+            verification_indicators += 1
+        # Analysis patterns (actual computation / plotting / saving)
+        if any(kw in line for kw in [
+            '.plot(', 'plt.', '.to_csv(', '.value_counts()',
+            '.groupby(', '.corr(', '.fit_transform(', '.fit_predict(',
+            'pd.read_csv(', 'pd.crosstab(', '.describe()',
+        ]):
+            analysis_indicators += 1
+
+    # Flag the block only when some verification line exists and no analysis
+    # line does; the two counts are not weighed against each other.
+    return verification_indicators > 0 and analysis_indicators == 0
+
+
+def _is_duplicate_data_load(code: str, seen_load_blocks: set) -> bool:
+    """Detect duplicate data loading blocks (LLM 'amnesia' repeats).
+
+    Computes a fingerprint from the code's structural lines (stripped of
+    leading/trailing whitespace, skipping blank lines and lines that start
+    with ``#``) and returns True if we've seen it before.
+
+    The function does not inspect whether the block actually loads data:
+    any block whose fingerprint matches an earlier one is reported as a
+    duplicate. Only the first 30 structural lines take part, so two blocks
+    that share those lines but differ afterwards compare equal.
+
+    Args:
+        code: Source text of one code block.
+        seen_load_blocks: Set of fingerprints recorded so far. It is mutated
+            in place: an unseen fingerprint is added before returning.
+
+    Returns:
+        True if the fingerprint was already in ``seen_load_blocks``; False if
+        it was new (and has now been recorded).
+    """
+    # Extract structural fingerprint: non-empty, non-comment lines
+    structural_lines = []
+    for line in code.splitlines():
+        stripped = line.strip()
+        if stripped and not stripped.startswith('#'):
+            structural_lines.append(stripped)
+
+    # An empty or comment-only block yields the empty string as its fingerprint.
+    fingerprint = '\n'.join(structural_lines[:30])  # First 30 lines are enough
+
+    if fingerprint in seen_load_blocks:
+        return True
+    # First occurrence: remember it so later repeats are detected.
+    seen_load_blocks.add(fingerprint)
+    return False
+
+
 def generate_reusable_script(
    analysis_results: List[Dict[str, Any]],
    data_files: List[str],
@@ -92,17 +145,29 @@ def generate_reusable_script(
    # 收集所有成功执行的代码
    all_imports = set()
    code_blocks = []
+    seen_load_blocks: Set[str] = set()
    
    for result in analysis_results:
        # 只处理 generate_code 类型的结果
        if result.get("action") == "collect_figures":
            continue
+        # Skip retry attempts
+        if result.get("retry"):
+            continue
            
        code = result.get("code", "")
        exec_result = result.get("result", {})
        
        # 只收集成功执行的代码
        if code and exec_result.get("success", False):
+            # Skip pure verification/file-check code (e.g. os.listdir loops)
+            if _is_verification_code(code):
+                continue
+
+            # Skip duplicate data-loading blocks (LLM amnesia repeats)
+            if _is_duplicate_data_load(code, seen_load_blocks):
+                continue
+
            # 提取 imports
            imports = extract_imports(code)
            all_imports.update(imports)