пре 1 месец · 75760e2e48
--- a/api-docs.md
+++ b/api-docs.md
@@ -695,7 +695,43 @@ smartctl -a -d jmb39x,1 /dev/sdb
 
				 }
			
 
				 ```
			
 
				 
			
 
				-后端会截取当前 Windows 屏幕，调用支持视觉输入的 AI 模型识别界面名称、描述、是否为 Windows 桌面、是否为浏览器网页，以及可操作元素列表。AI 返回的百分比坐标会按原始截图分辨率换算为像素坐标；截图和识别结果会保存到数据库。
			
 
				+后端会截取当前 Windows 屏幕，调用支持视觉输入的 AI 模型识别界面名称、描述、是否为 Windows 桌面、是否为浏览器网页，以及可操作元素列表。
			
 
				+
			
 
				+可操作元素在该步骤只要求返回：
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "name": "保存按钮",
			
 
				+  "approximate_location": "窗口右下角"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+该步骤不要求 AI 返回元素坐标。坐标需要通过单个元素定位接口按需获取。
			
 
				+
			
 
				+### 定位单个可操作元素
			
 
				+
			
 
				+`POST /api/automation/screens/{screen_id}/elements/{element_id}/locate`
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "provider_id": 1,
			
 
				+  "model_id": 1,
			
 
				+  "temperature": 0.1
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+后端会把保存的界面截图和该元素的名称、大致位置描述发送给 AI，只定位这一个元素。AI 应返回：
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "has_element": true,
			
 
				+  "x_percent": 42.5,
			
 
				+  "y_percent": 68.2,
			
 
				+  "reason": "目标按钮位于窗口右下区域"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+当 `has_element = true` 且返回了 `x_percent`、`y_percent` 时，后端会按原始截图分辨率换算像素坐标并更新该元素记录；前端随后才会在截图上绘制坐标点。当 `has_element = false` 或缺少坐标时，接口返回 `located: false`，元素记录保持不变。
			
 
				 
			
 
				 ### 截取当前屏幕
			
 
				 
			
--- a/backend/app/automation_service.py
+++ b/backend/app/automation_service.py
@@ -16,6 +16,7 @@ from .scanner import now_iso
 
				 from .schemas import (
			
 
				     AutomationKeyboardActionRequest,
			
 
				     AutomationMouseActionRequest,
			
 
				+    AutomationElementLocateRequest,
			
 
				     AutomationScreenshotCaptureRequest,
			
 
				     AutomationStartProgramRequest,
			
 
				     AutomationTextInputRequest,
			
@@ -42,8 +43,7 @@ SCREEN_ANALYZE_PROMPT = """请作为 AI 视觉自动化助手分析这张 Window
 
				 
			
 
				 元素字段：
			
 
				 - name：元素名称。
			
 
				-- x_percent：元素中心点 X 相对整张截图宽度的百分比，范围 0-100，可以保留 2 位小数。
			
 
				-- y_percent：元素中心点 Y 相对整张截图高度的百分比，范围 0-100，可以保留 2 位小数。
			
 
				+- approximate_location：元素在界面中的大致位置文字描述，例如“窗口右上角”“左侧导航栏中部”“底部任务栏靠左”。不要输出具体坐标或百分比。
			
 
				 
			
 
				 判断规则：
			
 
				 1. 如果截图位于 Windows 桌面，请识别桌面图标、开始菜单入口、任务栏应用、托盘区域等可操作元素。
			
@@ -51,6 +51,26 @@ SCREEN_ANALYZE_PROMPT = """请作为 AI 视觉自动化助手分析这张 Window
 
				 3. 不要输出 Markdown，不要解释，只输出 JSON。
			
 
				 """
			
 
				 
			
 
				+# Prompt template for locating ONE element in a saved screenshot.
				+# locate_element() fills the {name}, {approximate_location} and
				+# {screen_description} placeholders before sending it to the vision model.
				+# The model is asked for a bare JSON object with has_element, x_percent,
				+# y_percent (0-100, null when not found) and reason.
				+ELEMENT_LOCATE_PROMPT = """请作为 AI 视觉定位助手，在这张 Windows 屏幕截图中查找一个具体的可操作元素。
			
 
				+
			
 
				+目标元素名称：
			
 
				+{name}
			
 
				+
			
 
				+目标元素大致位置描述：
			
 
				+{approximate_location}
			
 
				+
			
 
				+所在界面描述：
			
 
				+{screen_description}
			
 
				+
			
 
				+请严格只输出 JSON 对象，字段为：
			
 
				+- has_element：boolean，图片中是否能找到该目标元素。
			
 
				+- x_percent：元素中心点 X 相对整张截图宽度的百分比，范围 0-100，可以保留 2 位小数。找不到时为 null。
			
 
				+- y_percent：元素中心点 Y 相对整张截图高度的百分比，范围 0-100，可以保留 2 位小数。找不到时为 null。
			
 
				+- reason：简短中文原因。
			
 
				+
			
 
				+只定位这个目标元素，不要列出其他元素。不要输出 Markdown，不要解释，只输出 JSON。
			
 
				+"""
			
 
				+
			
 
				 SCREEN_COMPARE_PROMPT = """请作为 AI 视觉自动化校验器判断两张截图是否处于同一个目标界面。
			
 
				 
			
 
				 图片1是当前实际屏幕截图。图片2是数据库中保存的目标界面截图。
			
@@ -213,9 +233,10 @@ def analyze_screen(payload: AutomationVisionAnalyzeRequest) -> dict[str, Any]:
 
				             conn.execute(
			
 
				                 """
			
 
				                 INSERT INTO automation_screen_elements (
			
 
				-                    screen_id, element_index, name, x_percent, y_percent, x, y, raw_json, created_at
			
 
				+                    screen_id, element_index, name, x_percent, y_percent, x, y,
			
 
				+                    approximate_location, is_located, raw_json, created_at
			
 
				                 )
			
 
				-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
			
 
				+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
			
 
				                 """,
			
 
				                 (
			
 
				                     screen_id,
			
@@ -225,6 +246,8 @@ def analyze_screen(payload: AutomationVisionAnalyzeRequest) -> dict[str, Any]:
 
				                     element["y_percent"],
			
 
				                     element["x"],
			
 
				                     element["y"],
			
 
				+                    element["approximate_location"],
			
 
				+                    1 if element["is_located"] else 0,
			
 
				                     json.dumps(element.get("raw") or element, ensure_ascii=False),
			
 
				                     now,
			
 
				                 ),
			
@@ -237,7 +260,7 @@ def analyze_screen(payload: AutomationVisionAnalyzeRequest) -> dict[str, Any]:
 
				 
			
 
				 
			
 
				 def normalize_elements(raw_elements: Any, width: int, height: int) -> list[dict[str, Any]]:
			
 
				-    """把 AI 返回的百分比坐标转换为截图像素坐标。"""
			
 
				+    """规范化 AI 返回的可操作元素清单；初始分析阶段不要求坐标。"""
			
 
				     if not isinstance(raw_elements, list):
			
 
				         return []
			
 
				     result = []
			
@@ -245,8 +268,10 @@ def normalize_elements(raw_elements: Any, width: int, height: int) -> list[dict[
 
				         if not isinstance(item, dict):
			
 
				             continue
			
 
				         name = str(item.get("name") or f"元素 {len(result) + 1}")[:160]
			
 
				-        x_percent = normalize_percent(item.get("x_percent"))
			
 
				-        y_percent = normalize_percent(item.get("y_percent"))
			
 
				+        approximate_location = str(item.get("approximate_location") or item.get("location") or "未定位")[:300]
			
 
				+        x_percent = normalize_percent(item.get("x_percent")) if item.get("x_percent") is not None else 0.0
			
 
				+        y_percent = normalize_percent(item.get("y_percent")) if item.get("y_percent") is not None else 0.0
			
 
				+        is_located = item.get("x_percent") is not None and item.get("y_percent") is not None
			
 
				         x = round(width * x_percent / 100)
			
 
				         y = round(height * y_percent / 100)
			
 
				         result.append(
			
@@ -256,12 +281,68 @@ def normalize_elements(raw_elements: Any, width: int, height: int) -> list[dict[
 
				                 "y_percent": y_percent,
			
 
				                 "x": max(0, min(width - 1, x)),
			
 
				                 "y": max(0, min(height - 1, y)),
			
 
				+                "approximate_location": approximate_location,
			
 
				+                "is_located": is_located,
			
 
				                 "raw": item,
			
 
				             }
			
 
				         )
			
 
				     return result
			
 
				 
			
 
				 
			
 
				+def locate_element(screen_id: int, element_id: int, payload: AutomationElementLocateRequest) -> dict[str, Any]:
			
 
				+    """针对单个可操作元素调用 AI 精确定位，并更新该元素的像素坐标。"""
			
 
				+    provider_id, model_id, temperature = resolve_ai_params(payload.provider_id, payload.model_id, payload.temperature)
			
 
				+    screen = get_screen(screen_id)
			
 
				+    element = next((item for item in screen.get("elements", []) if item["id"] == element_id), None)
			
 
				+    if not element:
			
 
				+        raise HTTPException(status_code=404, detail="Automation screen element not found")
			
 
				+
			
 
				+    prompt = (
			
 
				+        ELEMENT_LOCATE_PROMPT
			
 
				+        .replace("{name}", element.get("name") or "")
			
 
				+        .replace("{approximate_location}", element.get("approximate_location") or "")
			
 
				+        .replace("{screen_description}", screen.get("description") or screen.get("interface_name") or "")
			
 
				+    )
			
 
				+    ai_result = ai_service.chat_with_images(
			
 
				+        provider_id,
			
 
				+        model_id,
			
 
				+        prompt,
			
 
				+        [image_to_base64(screen["image_path"])],
			
 
				+        temperature,
			
 
				+    )
			
 
				+    try:
			
 
				+        parsed = json_from_ai(ai_result["content"])
			
 
				+    except (json.JSONDecodeError, ValueError) as exc:
			
 
				+        raise HTTPException(status_code=502, detail=f"AI locate output is not valid JSON: {exc}") from exc
			
 
				+
			
 
				+    if not bool(parsed.get("has_element")) or parsed.get("x_percent") is None or parsed.get("y_percent") is None:
			
 
				+        return {"located": False, "element": element, "ai_result": parsed, "ai_raw_content": ai_result["content"]}
			
 
				+
			
 
				+    x_percent = normalize_percent(parsed.get("x_percent"))
			
 
				+    y_percent = normalize_percent(parsed.get("y_percent"))
			
 
				+    x = max(0, min(int(screen["width"]) - 1, round(int(screen["width"]) * x_percent / 100)))
			
 
				+    y = max(0, min(int(screen["height"]) - 1, round(int(screen["height"]) * y_percent / 100)))
			
 
				+    raw = {**parsed, "previous": element.get("raw_json")}
			
 
				+    with get_db() as conn:
			
 
				+        conn.execute(
			
 
				+            """
			
 
				+            UPDATE automation_screen_elements
			
 
				+            SET x_percent = ?, y_percent = ?, x = ?, y = ?, is_located = 1, raw_json = ?
			
 
				+            WHERE id = ? AND screen_id = ?
			
 
				+            """,
			
 
				+            (x_percent, y_percent, x, y, json.dumps(raw, ensure_ascii=False), element_id, screen_id),
			
 
				+        )
			
 
				+    updated = get_screen(screen_id, include_image=True)
			
 
				+    updated_element = next(item for item in updated["elements"] if item["id"] == element_id)
			
 
				+    return {
			
 
				+        "located": True,
			
 
				+        "element": updated_element,
			
 
				+        "screen": updated,
			
 
				+        "ai_result": parsed,
			
 
				+        "ai_raw_content": ai_result["content"],
			
 
				+    }
			
 
				+
			
 
				+
			
 
				 def normalize_percent(value: Any) -> float:
			
 
				     """规范化百分比数值，兼容模型偶尔输出 0-1 小数的情况。"""
			
 
				     try:
			
@@ -331,6 +412,7 @@ def public_screen(row: dict[str, Any]) -> dict[str, Any]:
 
				 def public_element(row: dict[str, Any]) -> dict[str, Any]:
			
 
				     """把数据库中的元素行转换为接口返回格式。"""
			
 
				     item = dict(row)
			
 
				+    item["is_located"] = bool(item.get("is_located"))
			
 
				     return item
			
 
				 
			
 
				 
			
--- a/backend/app/database.py
+++ b/backend/app/database.py
@@ -180,6 +180,8 @@ def init_db() -> None:
 
				                 y_percent REAL NOT NULL,
			
 
				                 x INTEGER NOT NULL,
			
 
				                 y INTEGER NOT NULL,
			
 
				+                approximate_location TEXT,
			
 
				+                is_located INTEGER NOT NULL DEFAULT 0,
			
 
				                 raw_json TEXT,
			
 
				                 created_at TEXT NOT NULL,
			
 
				                 FOREIGN KEY(screen_id) REFERENCES automation_screens(id) ON DELETE CASCADE
			
@@ -243,6 +245,8 @@ def init_db() -> None:
 
				         ensure_column(conn, "automation_workflow_nodes", "position_x", "REAL NOT NULL DEFAULT 80")
			
 
				         ensure_column(conn, "automation_workflow_nodes", "position_y", "REAL NOT NULL DEFAULT 80")
			
 
				         ensure_column(conn, "automation_workflow_nodes", "next_node_keys", "TEXT")
			
 
				+        ensure_column(conn, "automation_screen_elements", "approximate_location", "TEXT")
			
 
				+        ensure_column(conn, "automation_screen_elements", "is_located", "INTEGER NOT NULL DEFAULT 0")
			
 
				         seed_default_tags(conn)
			
 
				         seed_default_settings(conn)
			
 
				 
			
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -29,6 +29,7 @@ from .schemas import (
 
				     AiProviderUpdate,
			
 
				     AutomationKeyboardRequest,
			
 
				     AutomationKeyboardActionRequest,
			
 
				+    AutomationElementLocateRequest,
			
 
				     AutomationMouseRequest,
			
 
				     AutomationMouseActionRequest,
			
 
				     AutomationPowerRequest,
			
@@ -757,6 +758,11 @@ def automation_vision_screenshot(payload: AutomationScreenshotCaptureRequest) ->
 
				     return automation_service.capture_screenshot(payload)
			
 
				 
			
 
				 
			
 
				+@app.post("/api/automation/screens/{screen_id}/elements/{element_id}/locate")
			
 
				+def automation_element_locate(screen_id: int, element_id: int, payload: AutomationElementLocateRequest) -> dict[str, Any]:
			
 
				+    return automation_service.locate_element(screen_id, element_id, payload)
			
 
				+
			
 
				+
			
 
				 @app.post("/api/automation/actions/mouse")
			
 
				 def automation_action_mouse(payload: AutomationMouseActionRequest) -> dict[str, Any]:
			
 
				     return automation_service.execute_mouse_action(payload)
			
--- a/backend/app/schemas.py
+++ b/backend/app/schemas.py
@@ -134,6 +134,12 @@ class AutomationScreenshotCaptureRequest(BaseModel):
 
				     save: bool = True
			
 
				 
			
 
				 
			
 
				+# Request body of POST /api/automation/screens/{screen_id}/elements/{element_id}/locate.
				+class AutomationElementLocateRequest(BaseModel):
			
 
				+    # AI provider to use; handed to resolve_ai_params by locate_element.
				+    # NOTE(review): None presumably selects a configured default - confirm in resolve_ai_params.
				+    provider_id: int | None = None
			
 
				+    # AI model to use; resolved together with provider_id.
				+    model_id: int | None = None
			
 
				+    # Sampling temperature, validated to the range 0-2; defaults to 0.1.
				+    temperature: float = Field(default=0.1, ge=0, le=2)
			
 
				+
			
 
				+
			
 
				 class AutomationActionBase(BaseModel):
			
 
				     screen_id: int | None = None
			
 
				     provider_id: int | None = None
			
--- a/backend/app/windows_automation.py
+++ b/backend/app/windows_automation.py
@@ -14,6 +14,27 @@ from fastapi import HTTPException
 
				 MouseAction = Literal["move_to", "move_rel", "click", "double_click", "right_click", "drag_to", "scroll"]
			
 
				 KeyboardAction = Literal["press", "hotkey", "write", "key_down", "key_up"]
			
 
				 
			
 
				+# Lower-cased key names (browser-style names, arrow glyphs, common
				+# abbreviations) mapped to the names normalize_key_name() returns for use
				+# with pyautogui. Names without an entry are passed through unchanged.
				+KEY_ALIASES = {
			
 
				+    "arrowup": "up",
			
 
				+    "↑": "up",
			
 
				+    "arrowdown": "down",
			
 
				+    "↓": "down",
			
 
				+    "arrowleft": "left",
			
 
				+    "←": "left",
			
 
				+    "arrowright": "right",
			
 
				+    "→": "right",
			
 
				+    "control": "ctrl",
			
 
				+    "cmd": "win",
			
 
				+    "command": "win",
			
 
				+    "meta": "win",
			
 
				+    "windows": "win",
			
 
				+    "esc": "escape",
			
 
				+    "del": "delete",
			
 
				+    "pgup": "pageup",
			
 
				+    "pgdn": "pagedown",
			
 
				+    " ": "space",
			
 
				+}
			
 
				+
			
 
				 
			
 
				 def hidden_creationflags() -> int:
			
 
				     """返回 Windows 下隐藏控制台窗口所需的启动标志。"""
			
@@ -46,6 +67,17 @@ def load_pyautogui():
 
				     return pyautogui
			
 
				 
			
 
				 
			
 
				+def normalize_key_name(key: str) -> str:
			
 
				+    """把浏览器或用户输入的按键名转换为 pyautogui 兼容名称。"""
			
 
				+    normalized = str(key).strip().lower()
			
 
				+    return KEY_ALIASES.get(normalized, normalized)
			
 
				+
			
 
				+
			
 
				+def normalize_key_list(keys: list[str] | None) -> list[str]:
			
 
				+    """规范化组合键列表，并去掉空值。"""
			
 
				+    return [normalize_key_name(key) for key in keys or [] if str(key).strip()]
			
 
				+
			
 
				+
			
 
				 def run_shutdown_command(args: list[str], timeout: int = 10) -> dict[str, Any]:
			
 
				     """执行 shutdown.exe 命令，并统一返回命令输出。"""
			
 
				     ensure_windows()
			
@@ -233,26 +265,28 @@ def keyboard_action(
 
				 ) -> dict[str, Any]:
			
 
				     """执行键盘动作，包括单键、组合键、输入文本、按下和释放。"""
			
 
				     pyautogui = load_pyautogui()
			
 
				+    normalized_key = normalize_key_name(key) if key else None
			
 
				+    normalized_keys = normalize_key_list(keys)
			
 
				     if action == "press":
			
 
				-        if not key:
			
 
				+        if not normalized_key:
			
 
				             raise HTTPException(status_code=400, detail="key is required")
			
 
				-        pyautogui.press(key, interval=interval)
			
 
				+        pyautogui.press(normalized_key, interval=interval)
			
 
				     elif action == "hotkey":
			
 
				-        if not keys:
			
 
				+        if not normalized_keys:
			
 
				             raise HTTPException(status_code=400, detail="keys are required")
			
 
				-        pyautogui.hotkey(*keys, interval=interval)
			
 
				+        pyautogui.hotkey(*normalized_keys, interval=interval)
			
 
				     elif action == "write":
			
 
				         if text is None:
			
 
				             raise HTTPException(status_code=400, detail="text is required")
			
 
				         pyautogui.write(text, interval=interval)
			
 
				     elif action == "key_down":
			
 
				-        if not key:
			
 
				+        if not normalized_key:
			
 
				             raise HTTPException(status_code=400, detail="key is required")
			
 
				-        pyautogui.keyDown(key)
			
 
				+        pyautogui.keyDown(normalized_key)
			
 
				     elif action == "key_up":
			
 
				-        if not key:
			
 
				+        if not normalized_key:
			
 
				             raise HTTPException(status_code=400, detail="key is required")
			
 
				-        pyautogui.keyUp(key)
			
 
				+        pyautogui.keyUp(normalized_key)
			
 
				     else:
			
 
				         raise HTTPException(status_code=400, detail="Unsupported keyboard action")
			
 
				-    return {"action": f"keyboard_{action}", "key": key, "keys": keys}
			
 
				+    return {"action": f"keyboard_{action}", "key": normalized_key, "keys": normalized_keys}
			
--- a/frontend/src/components/AutomationActionView.vue
+++ b/frontend/src/components/AutomationActionView.vue
@@ -21,9 +21,9 @@
 
				       <div class="screenshot-stage">
			
 
				         <div v-if="imageSrc" class="screenshot-canvas" :style="canvasStyle">
			
 
				           <img class="screenshot-image" :src="imageSrc" alt="当前 Windows 截图" />
			
 
				-          <template v-if="currentScreen?.elements">
			
 
				+          <template v-if="locatedElements.length">
			
 
				             <button
			
 
				-              v-for="element in currentScreen.elements"
			
 
				+              v-for="element in locatedElements"
			
 
				               :key="element.id || element.element_index"
			
 
				               class="element-marker"
			
 
				               :style="markerStyle(element)"
			
@@ -62,11 +62,16 @@
 
				         <el-table :data="currentScreen?.elements || []" height="420" border stripe>
			
 
				           <el-table-column prop="element_index" label="#" width="54" />
			
 
				           <el-table-column prop="name" label="名称" min-width="130" show-overflow-tooltip />
			
 
				+          <el-table-column prop="approximate_location" label="大致位置" min-width="130" show-overflow-tooltip />
			
 
				           <el-table-column label="坐标" width="110">
			
 
				-            <template #default="{ row }">{{ row.x }}, {{ row.y }}</template>
			
 
				+            <template #default="{ row }">
			
 
				+              <span v-if="row.is_located">{{ row.x }}, {{ row.y }}</span>
			
 
				+              <el-tag v-else type="info">未定位</el-tag>
			
 
				+            </template>
			
 
				           </el-table-column>
			
 
				-          <el-table-column label="操作" width="100" fixed="right">
			
 
				+          <el-table-column label="操作" width="160" fixed="right">
			
 
				             <template #default="{ row }">
			
 
				+              <el-button size="small" :loading="locatingElementId === row.id" @click="locateElement(row)">找位置</el-button>
			
 
				               <el-dropdown @command="(command) => runElementMouse(row, command)">
			
 
				                 <el-button size="small">点击</el-button>
			
 
				                 <template #dropdown>
			
@@ -84,10 +89,23 @@
 
				     </aside>
			
 
				 
			
 
				     <el-dialog v-model="keyboardDialog" title="执行键盘操作" width="420px" @opened="focusKeyCapture">
			
 
				+      <div class="keyboard-builder">
			
 
				+        <div class="muted">组合键</div>
			
 
				+        <el-checkbox-group v-model="modifierKeys">
			
 
				+          <el-checkbox-button label="win">Win</el-checkbox-button>
			
 
				+          <el-checkbox-button label="ctrl">Ctrl</el-checkbox-button>
			
 
				+          <el-checkbox-button label="alt">Alt</el-checkbox-button>
			
 
				+          <el-checkbox-button label="shift">Shift</el-checkbox-button>
			
 
				+        </el-checkbox-group>
			
 
				+        <div class="muted">主键</div>
			
 
				+        <el-select v-model="mainKey" filterable allow-create default-first-option placeholder="选择或输入主键，如 up、enter、a">
			
 
				+          <el-option v-for="key in commonKeys" :key="key.value" :label="key.label" :value="key.value" />
			
 
				+        </el-select>
			
 
				+      </div>
			
 
				       <div ref="keyCaptureRef" class="key-capture" tabindex="0" @keydown.prevent="captureKey">
			
 
				-        <div class="muted">点击此区域后按下单键或组合键</div>
			
 
				+        <div class="muted">也可以点击此区域捕获普通按键；Win 键建议用上方按钮选择</div>
			
 
				         <div class="key-list">
			
 
				-          <el-tag v-for="key in capturedKeys" :key="key">{{ key }}</el-tag>
			
 
				+          <el-tag v-for="key in finalKeyboardKeys" :key="key">{{ key }}</el-tag>
			
 
				         </div>
			
 
				       </div>
			
 
				       <template #footer>
			
@@ -128,6 +146,7 @@ const providers = ref([])
 
				 const models = ref([])
			
 
				 const analyzing = ref(false)
			
 
				 const screenshotLoading = ref(false)
			
 
				+const locatingElementId = ref(null)
			
 
				 const savingWorkflow = ref(false)
			
 
				 const currentScreen = ref(null)
			
 
				 const recording = ref(false)
			
@@ -137,6 +156,8 @@ const keyboardDialog = ref(false)
 
				 const textDialog = ref(false)
			
 
				 const programDialog = ref(false)
			
 
				 const capturedKeys = ref([])
			
 
				+const modifierKeys = ref([])
			
 
				+const mainKey = ref('')
			
 
				 const keyCaptureRef = ref(null)
			
 
				 const textInput = ref('')
			
 
				 const quickProgram = ref('')
			
@@ -159,6 +180,31 @@ const imageSrc = computed(() => {
 
				   if (!currentScreen.value?.image_base64) return ''
			
 
				   return `data:${currentScreen.value.mime_type || 'image/png'};base64,${currentScreen.value.image_base64}`
			
 
				 })
			
 
				+const locatedElements = computed(() => (currentScreen.value?.elements || []).filter((item) => item.is_located))
			
 
				+const finalKeyboardKeys = computed(() => {
			
 
				+  const keys = [...modifierKeys.value]
			
 
				+  if (mainKey.value) keys.push(normalizeKey(mainKey.value))
			
 
				+  for (const key of capturedKeys.value) {
			
 
				+    if (!keys.includes(key)) keys.push(key)
			
 
				+  }
			
 
				+  return keys
			
 
				+})
			
 
				+const commonKeys = [
			
 
				+  { label: '↑ 最大化 / 上', value: 'up' },
			
 
				+  { label: '↓ 下', value: 'down' },
			
 
				+  { label: '← 左', value: 'left' },
			
 
				+  { label: '→ 右', value: 'right' },
			
 
				+  { label: 'Enter', value: 'enter' },
			
 
				+  { label: 'Esc', value: 'escape' },
			
 
				+  { label: 'Tab', value: 'tab' },
			
 
				+  { label: 'Space', value: 'space' },
			
 
				+  { label: 'Delete', value: 'delete' },
			
 
				+  { label: 'Backspace', value: 'backspace' },
			
 
				+  { label: 'F4', value: 'f4' },
			
 
				+  { label: 'D', value: 'd' },
			
 
				+  { label: 'E', value: 'e' },
			
 
				+  { label: 'R', value: 'r' },
			
 
				+]
			
 
				 const canvasStyle = computed(() => {
			
 
				   if (!currentScreen.value?.width || !currentScreen.value?.height) return {}
			
 
				   return { aspectRatio: `${currentScreen.value.width} / ${currentScreen.value.height}` }
			
@@ -213,6 +259,32 @@ async function analyzeScreen() {
 
				   }
			
 
				 }
			
 
				 
			
 
				+async function locateElement(element) {
			
 
				+  if (!ensureAiSelected()) return
			
 
				+  if (!currentScreen.value?.id) {
			
 
				+    ElMessage.warning('请先分析界面后再定位元素')
			
 
				+    return
			
 
				+  }
			
 
				+  locatingElementId.value = element.id
			
 
				+  try {
			
 
				+    const { data } = await api.post(`/api/automation/screens/${currentScreen.value.id}/elements/${element.id}/locate`, {
			
 
				+      provider_id: ai.provider_id,
			
 
				+      model_id: ai.model_id,
			
 
				+      temperature: ai.temperature,
			
 
				+    })
			
 
				+    if (!data.located) {
			
 
				+      ElMessage.warning(data.ai_result?.reason || 'AI 未找到该元素')
			
 
				+      return
			
 
				+    }
			
 
				+    currentScreen.value = data.screen
			
 
				+    ElMessage.success(`已定位：${data.element.x}, ${data.element.y}`)
			
 
				+  } catch (error) {
			
 
				+    ElMessage.error(error.response?.data?.detail || '定位元素失败')
			
 
				+  } finally {
			
 
				+    locatingElementId.value = null
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				 async function captureScreenshot(silent = false) {
			
 
				   screenshotLoading.value = true
			
 
				   try {
			
@@ -254,6 +326,10 @@ function addNode(node) {
 
				 
			
 
				 async function runElementMouse(element, mouseAction) {
			
 
				   if (!ensureAiSelected()) return
			
 
				+  if (!element.is_located) {
			
 
				+    ElMessage.warning('请先点击“找位置”定位该元素')
			
 
				+    return
			
 
				+  }
			
 
				   try {
			
 
				     const { data } = await api.post('/api/automation/actions/mouse', {
			
 
				       ...actionBase(),
			
@@ -276,6 +352,8 @@ async function runElementMouse(element, mouseAction) {
 
				 
			
 
				 function openKeyboardDialog() {
			
 
				   capturedKeys.value = []
			
 
				+  modifierKeys.value = []
			
 
				+  mainKey.value = ''
			
 
				   keyboardDialog.value = true
			
 
				 }
			
 
				 
			
@@ -286,18 +364,38 @@ async function focusKeyCapture() {
 
				 
			
 
				 function captureKey(event) {
			
 
				   const key = normalizeKey(event.key)
			
 
				-  if (!capturedKeys.value.includes(key)) capturedKeys.value.push(key)
			
 
				+  if (['ctrl', 'alt', 'shift', 'win'].includes(key)) {
			
 
				+    if (!modifierKeys.value.includes(key)) modifierKeys.value.push(key)
			
 
				+    return
			
 
				+  }
			
 
				+  mainKey.value = key
			
 
				+  capturedKeys.value = []
			
 
				 }
			
 
				 
			
 
				 function normalizeKey(key) {
			
 
				-  const map = { Control: 'ctrl', Shift: 'shift', Alt: 'alt', Meta: 'win', Escape: 'esc', ' ': 'space' }
			
 
				+  const map = {
			
 
				+    Control: 'ctrl',
			
 
				+    Shift: 'shift',
			
 
				+    Alt: 'alt',
			
 
				+    Meta: 'win',
			
 
				+    OS: 'win',
			
 
				+    Win: 'win',
			
 
				+    Escape: 'escape',
			
 
				+    ' ': 'space',
			
 
				+    ArrowUp: 'up',
			
 
				+    ArrowDown: 'down',
			
 
				+    ArrowLeft: 'left',
			
 
				+    ArrowRight: 'right',
			
 
				+    PageUp: 'pageup',
			
 
				+    PageDown: 'pagedown',
			
 
				+  }
			
 
				   return map[key] || key.toLowerCase()
			
 
				 }
			
 
				 
			
 
				 async function runKeyboard() {
			
 
				-  if (!capturedKeys.value.length || !ensureAiSelected()) return
			
 
				+  if (!finalKeyboardKeys.value.length || !ensureAiSelected()) return
			
 
				   try {
			
 
				-    const keys = [...capturedKeys.value]
			
 
				+    const keys = [...finalKeyboardKeys.value]
			
 
				     const { data } = await api.post('/api/automation/actions/keyboard', { ...actionBase(), keys })
			
 
				     rememberProcesses(data.new_processes)
			
 
				     addNode({ node_type: 'keyboard', screen_id: currentScreen.value?.id || null, title: keys.join('+'), config: { keys } })
			
--- a/frontend/src/components/AutomationScreensView.vue
+++ b/frontend/src/components/AutomationScreensView.vue
@@ -34,7 +34,7 @@
 
				             <div v-if="detailImageSrc" class="screenshot-canvas" :style="canvasStyle">
			
 
				               <img class="screenshot-image" :src="detailImageSrc" alt="已识别界面截图" />
			
 
				               <button
			
 
				-                v-for="element in detail.elements || []"
			
 
				+                v-for="element in locatedElements"
			
 
				                 :key="element.id"
			
 
				                 class="element-marker"
			
 
				                 :style="markerStyle(element)"
			
@@ -53,8 +53,12 @@
 
				           <el-table :data="detail.elements || []" height="360" border stripe style="margin-top: 12px">
			
 
				             <el-table-column prop="element_index" label="#" width="54" />
			
 
				             <el-table-column prop="name" label="名称" min-width="140" show-overflow-tooltip />
			
 
				+            <el-table-column prop="approximate_location" label="大致位置" min-width="160" show-overflow-tooltip />
			
 
				             <el-table-column label="坐标" width="120">
			
 
				-              <template #default="{ row }">{{ row.x }}, {{ row.y }}</template>
			
 
				+              <template #default="{ row }">
			
 
				+                <span v-if="row.is_located">{{ row.x }}, {{ row.y }}</span>
			
 
				+                <el-tag v-else type="info">未定位</el-tag>
			
 
				+              </template>
			
 
				             </el-table-column>
			
 
				           </el-table>
			
 
				         </div>
			
@@ -76,6 +80,7 @@ const detailImageSrc = computed(() => {
 
				   if (!detail.value?.image_base64) return ''
			
 
				   return `data:${detail.value.mime_type || 'image/png'};base64,${detail.value.image_base64}`
			
 
				 })
			
 
				+const locatedElements = computed(() => (detail.value?.elements || []).filter((item) => item.is_located))
			
 
				 const canvasStyle = computed(() => {
			
 
				   if (!detail.value?.width || !detail.value?.height) return {}
			
 
				   return { aspectRatio: `${detail.value.width} / ${detail.value.height}` }
			
--- a/frontend/src/styles.css
+++ b/frontend/src/styles.css
@@ -251,6 +251,12 @@ body {
 
				   border-color: #409eff;
			
 
				 }
			
 
				 
			
 
				+.keyboard-builder {
			
 
				+  display: grid;
			
 
				+  gap: 10px;
			
 
				+  margin-bottom: 12px;
			
 
				+}
			
 
				+
			
 
				 .key-list {
			
 
				   display: flex;
			
 
				   gap: 8px;
			
--- a/task.md
+++ b/task.md
@@ -56,6 +56,8 @@
 
				 - [x] 自动化工作流页面改为可拖动、可连线的大画布编辑，并支持点击执行工作流
			
 
				 - [x] 后端增加按工作流 ID 执行完整工作流接口
			
 
				 - [x] 前端开发服务和默认 API 地址支持局域网访问，并更新部署文档
			
 
				+- [x] 调整界面分析流程：可操作元素先只记录大致位置，按单个元素再调用 AI 精确定位坐标
			
 
				+- [x] 优化键盘操作弹窗，支持手动选择 Win/Ctrl/Alt/Shift 组合键并规范化方向键名称
			
 
				 
			
 
				 ## 进度日志
			
 
				 
			
@@ -73,3 +75,5 @@
 
				 - 2026-05-10：开始并完成 Windows 自动化操作模块，新增关机/重启、程序启动/关闭、屏幕截图、pyautogui 鼠标和键盘操作接口；更新后端依赖和接口文档。
			
 
				 - 2026-05-10：完成 AI 视觉自动化基础功能，支持截图识别并保存界面元素、动作前界面对比校验、错误记录、自动化工作流保存与节点管理；前端增加自动化四个菜单页面，并验证后端编译、前端构建和页面渲染。
			
 
				 - 2026-05-11：提交 AI 自动化基础代码基线；新增系统设置、自动化截图刷新、工作流画布编辑和后端整条工作流执行接口；前端支持局域网访问默认 API 地址。
			
 
				+- 2026-05-11：根据本地多模态模型定位效果，调整自动化界面分析为“元素清单 + 大致位置”，新增单元素“找位置”接口和前端按钮，定位成功后再更新元素坐标并绘制标记。
			
 
				+- 2026-05-11：修复浏览器无法可靠捕获 Win 组合键的问题；键盘操作改为可手动选择修饰键和主键，并在后端将 ArrowUp/Meta 等按键名转换为 pyautogui 兼容名称。