YAML 反斜杠修复扩大范围 — 之前只匹配 "D:\..." 格式,现在匹配所有双引号内含反斜杠的字符串。"outputs\session_20260420..." 会被正确转成 "outputs/session_20260420...",不再导致 YAML 解析失败。这直接解决了第 10-19 轮的死循环。

_process_response 的 analysis_complete 检测已经在上一轮修好了,配合反斜杠修复,YAML 能正确解析出 action: "analysis_complete",不会再 fallback 到代码执行。

文件选择改为只用最近一次上传的文件 — app.state.last_uploaded_files 记录上传的文件列表,/api/start 优先使用它,不再 glob("uploads/*.csv") 把所有历史文件都拿来分析。
This commit is contained in:
2026-04-20 13:09:54 +08:00
parent 7303008f48
commit c7224153b1
5 changed files with 88 additions and 42 deletions

View File

@@ -139,7 +139,21 @@ class DataAnalysisAgent:
""" """
try: try:
yaml_data = self.llm.parse_yaml_response(response) yaml_data = self.llm.parse_yaml_response(response)
action = yaml_data.get("action", "generate_code") action = yaml_data.get("action", "")
# If YAML parsing returned empty/no action, try to detect action from raw text
if not action:
if "analysis_complete" in response:
action = "analysis_complete"
# Try to extract final_report from raw text
if not yaml_data.get("final_report"):
yaml_data["action"] = "analysis_complete"
yaml_data["final_report"] = ""
elif "collect_figures" in response:
action = "collect_figures"
yaml_data["action"] = "collect_figures"
else:
action = "generate_code"
print(f"[TARGET] 检测到动作: {action}") print(f"[TARGET] 检测到动作: {action}")
@@ -155,6 +169,11 @@ class DataAnalysisAgent:
except Exception as e: except Exception as e:
print(f"[WARN] 解析响应失败: {str(e)}尝试提取代码并按generate_code处理") print(f"[WARN] 解析响应失败: {str(e)}尝试提取代码并按generate_code处理")
# Check if this is actually an analysis_complete or collect_figures response
if "analysis_complete" in response:
return self._handle_analysis_complete(response, {"final_report": ""})
if "collect_figures" in response:
return self._handle_collect_figures(response, {"figures_to_collect": []})
# 即使YAML解析失败也尝试提取代码 # 即使YAML解析失败也尝试提取代码
extracted_code = extract_code_from_response(response) extracted_code = extract_code_from_response(response)
if extracted_code: if extracted_code:

View File

@@ -1,5 +1,20 @@
@echo off @echo off
echo Starting IOV Data Analysis Agent Web Interface... chcp 65001 >nul
echo Please open http://localhost:8000 in your browser. set PYTHONIOENCODING=utf-8
python -m uvicorn web.main:app --reload --reload-exclude "outputs/*" --host 0.0.0.0 --port 8000
:: Get local IP address
for /f "tokens=2 delims=:" %%a in ('ipconfig ^| findstr /c:"IPv4"') do (
for /f "tokens=1" %%b in ("%%a") do set LOCAL_IP=%%b
)
echo.
echo IOV Data Analysis Agent
echo ========================
echo.
echo Local: http://localhost:8000
if defined LOCAL_IP (
echo Network: http://%LOCAL_IP%:8000
)
echo.
python -m uvicorn web.main:app --reload --reload-exclude "outputs" --reload-exclude "uploads" --reload-exclude ".hypothesis" --reload-exclude ".cache" --host 0.0.0.0 --port 8000
pause pause

test.py (22 lines deleted)
View File

@@ -1,22 +0,0 @@
# -*- coding: utf-8 -*-
"""
快速测试 LLM 连接是否正常
"""
import os
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()
client = OpenAI(
base_url=os.getenv("OPENAI_BASE_URL", "http://127.0.0.1:9999/v1"),
api_key=os.getenv("OPENAI_API_KEY", ""),
)
response = client.chat.completions.create(
model=os.getenv("OPENAI_MODEL", "gpt-3.5-turbo"),
messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)

View File

@@ -91,9 +91,10 @@ class LLMHelper:
yaml_content = yaml_content.split('\n', 1)[1] yaml_content = yaml_content.split('\n', 1)[1]
# Fix Windows backslash paths that break YAML double-quoted strings. # Fix Windows backslash paths that break YAML double-quoted strings.
# e.g. "D:\code\iov..." → "D:/code/iov..." inside quoted values # Replace ALL backslashes inside double-quoted strings with forward slashes.
# This handles both "D:\code\..." and "outputs\session_..." patterns.
yaml_content = re.sub( yaml_content = re.sub(
r'"([A-Za-z]:\\[^"]*)"', r'"([^"]*\\[^"]*)"',
lambda m: '"' + m.group(1).replace('\\', '/') + '"', lambda m: '"' + m.group(1).replace('\\', '/') + '"',
yaml_content, yaml_content,
) )

View File

@@ -375,18 +375,21 @@ async def upload_files(files: list[UploadFile] = File(...)):
with open(file_location, "wb+") as file_object: with open(file_location, "wb+") as file_object:
file_object.write(file.file.read()) file_object.write(file.file.read())
saved_files.append(file_location) saved_files.append(file_location)
# Track the most recently uploaded files for the next analysis
app.state.last_uploaded_files = saved_files
return {"info": f"Saved {len(saved_files)} files", "paths": saved_files} return {"info": f"Saved {len(saved_files)} files", "paths": saved_files}
@app.post("/api/start") @app.post("/api/start")
async def start_analysis(request: StartRequest, background_tasks: BackgroundTasks): async def start_analysis(request: StartRequest, background_tasks: BackgroundTasks):
session_id = session_manager.create_session() session_id = session_manager.create_session()
files = glob.glob("uploads/*.csv") # Use only the most recently uploaded files, not everything in uploads/
files = getattr(app.state, 'last_uploaded_files', None)
if not files: if not files:
if os.path.exists("cleaned_data.csv"): # Fallback: scan uploads directory
files = ["cleaned_data.csv"] files = glob.glob("uploads/*.csv") + glob.glob("uploads/*.xlsx")
else: if not files:
raise HTTPException(status_code=400, detail="No CSV files found") raise HTTPException(status_code=400, detail="No data files found. Please upload files first.")
files = [os.path.abspath(f) for f in files] # Only use absolute paths files = [os.path.abspath(f) for f in files] # Only use absolute paths
@@ -948,10 +951,36 @@ async def polish_paragraph(request: PolishRequest):
if not target: if not target:
raise HTTPException(status_code=404, detail=f"Paragraph {request.paragraph_id} not found") raise HTTPException(status_code=404, detail=f"Paragraph {request.paragraph_id} not found")
# 构建上下文窗口前后各2个段落 # Build the actual content to polish: include adjacent table paragraphs
# so that when user clicks on text below a table, the table gets polished too
polish_para_ids = [target["id"]]
polish_content_parts = [target["content"]]
# Check if previous paragraph is a table — include it
if target_idx > 0 and paragraphs[target_idx - 1]["type"] == "table":
polish_para_ids.insert(0, paragraphs[target_idx - 1]["id"])
polish_content_parts.insert(0, paragraphs[target_idx - 1]["content"])
# Check if next paragraph is a table — include it
if target_idx + 1 < len(paragraphs) and paragraphs[target_idx + 1]["type"] == "table":
polish_para_ids.append(paragraphs[target_idx + 1]["id"])
polish_content_parts.append(paragraphs[target_idx + 1]["content"])
# If the target itself is a table, include adjacent text too
if target["type"] == "table":
if target_idx + 1 < len(paragraphs) and paragraphs[target_idx + 1]["type"] == "text":
polish_para_ids.append(paragraphs[target_idx + 1]["id"])
polish_content_parts.append(paragraphs[target_idx + 1]["content"])
if target_idx > 0 and paragraphs[target_idx - 1]["type"] == "text":
polish_para_ids.insert(0, paragraphs[target_idx - 1]["id"])
polish_content_parts.insert(0, paragraphs[target_idx - 1]["content"])
combined_content = "\n\n".join(polish_content_parts)
# 构建上下文窗口前后各2个段落排除已包含的
context_window = [] context_window = []
for j in range(max(0, target_idx - 2), min(len(paragraphs), target_idx + 3)): for j in range(max(0, target_idx - 2), min(len(paragraphs), target_idx + 3)):
if j != target_idx: if paragraphs[j]["id"] not in polish_para_ids:
context_window.append(paragraphs[j]["content"]) context_window.append(paragraphs[j]["content"])
context_text = "\n\n".join(context_window) context_text = "\n\n".join(context_window)
@@ -985,11 +1014,12 @@ async def polish_paragraph(request: PolishRequest):
## 图表信息 ## 图表信息
{figures_info} {figures_info}
## 需要润色的段落 ## 需要润色的段落(可能包含表格和文字)
{target['content']} {combined_content}
## 要求 ## 要求
- 保持原有的 Markdown 格式(标题级别、表格结构等) - 保持原有的 Markdown 格式(标题级别、表格结构等)
- 如果包含表格,必须同时润色表格内容(补充数据、修正数值)
- 用具体数据替换模糊描述 - 用具体数据替换模糊描述
- 增加业务洞察和趋势判断 - 增加业务洞察和趋势判断
- 禁止使用第一人称 - 禁止使用第一人称
@@ -1007,11 +1037,12 @@ async def polish_paragraph(request: PolishRequest):
## 图表信息 ## 图表信息
{figures_info} {figures_info}
## 需要润色的段落 ## 需要润色的段落(可能包含表格和文字)
{target['content']} {combined_content}
## 要求 ## 要求
- 保持原有的 Markdown 格式 - 保持原有的 Markdown 格式
- 如果包含表格,必须同时润色表格内容
- 严格遵循用户指令 - 严格遵循用户指令
- 禁止使用第一人称 - 禁止使用第一人称
- 直接输出润色后的 Markdown 内容,不要包裹在代码块中""" - 直接输出润色后的 Markdown 内容,不要包裹在代码块中"""
@@ -1025,11 +1056,12 @@ async def polish_paragraph(request: PolishRequest):
## 图表信息 ## 图表信息
{figures_info} {figures_info}
## 需要润色的段落 ## 需要润色的段落(可能包含表格和文字)
{target['content']} {combined_content}
## 要求 ## 要求
- 保持原有的 Markdown 格式(标题级别、表格结构等) - 保持原有的 Markdown 格式(标题级别、表格结构等)
- 如果包含表格,必须同时润色表格内容(补充数据、修正数值)
- 提升专业性:使用同比、环比、占比等术语 - 提升专业性:使用同比、环比、占比等术语
- 增加洞察:不仅描述现象,还要分析原因和影响 - 增加洞察:不仅描述现象,还要分析原因和影响
- 禁止使用第一人称 - 禁止使用第一人称
@@ -1056,9 +1088,10 @@ async def polish_paragraph(request: PolishRequest):
return { return {
"paragraph_id": request.paragraph_id, "paragraph_id": request.paragraph_id,
"original": target["content"], "original": combined_content,
"polished": polished_content, "polished": polished_content,
"mode": request.mode, "mode": request.mode,
"affected_paragraph_ids": polish_para_ids,
} }
except Exception as e: except Exception as e: