Преглед изворног кода

Improve automation element locating and hotkeys

codex пре 1 месец
родитељ
комит
75760e2e48

+ 37 - 1
api-docs.md

@@ -695,7 +695,43 @@ smartctl -a -d jmb39x,1 /dev/sdb
 }
 ```
 
-后端会截取当前 Windows 屏幕,调用支持视觉输入的 AI 模型识别界面名称、描述、是否为 Windows 桌面、是否为浏览器网页,以及可操作元素列表。AI 返回的百分比坐标会按原始截图分辨率换算为像素坐标;截图和识别结果会保存到数据库。
+后端会截取当前 Windows 屏幕,调用支持视觉输入的 AI 模型识别界面名称、描述、是否为 Windows 桌面、是否为浏览器网页,以及可操作元素列表。
+
+可操作元素在该步骤只要求返回:
+
+```json
+{
+  "name": "保存按钮",
+  "approximate_location": "窗口右下角"
+}
+```
+
+该步骤不要求 AI 返回元素坐标。坐标需要通过单个元素定位接口按需获取。
+
+### 定位单个可操作元素
+
+`POST /api/automation/screens/{screen_id}/elements/{element_id}/locate`
+
+```json
+{
+  "provider_id": 1,
+  "model_id": 1,
+  "temperature": 0.1
+}
+```
+
+后端会把保存的界面截图和该元素的名称、大致位置描述发送给 AI,只定位这一个元素。AI 应返回:
+
+```json
+{
+  "has_element": true,
+  "x_percent": 42.5,
+  "y_percent": 68.2,
+  "reason": "目标按钮位于窗口右下区域"
+}
+```
+
+当 `has_element = true` 且 `x_percent`、`y_percent` 均不为 null 时,后端会按原始截图分辨率换算像素坐标并更新该元素记录,前端随后才会在截图上绘制坐标点;否则接口返回 `located: false`,元素记录保持不变。
 
 ### 截取当前屏幕
 

+ 89 - 7
backend/app/automation_service.py

@@ -16,6 +16,7 @@ from .scanner import now_iso
 from .schemas import (
     AutomationKeyboardActionRequest,
     AutomationMouseActionRequest,
+    AutomationElementLocateRequest,
     AutomationScreenshotCaptureRequest,
     AutomationStartProgramRequest,
     AutomationTextInputRequest,
@@ -42,8 +43,7 @@ SCREEN_ANALYZE_PROMPT = """请作为 AI 视觉自动化助手分析这张 Window
 
 元素字段:
 - name:元素名称。
-- x_percent:元素中心点 X 相对整张截图宽度的百分比,范围 0-100,可以保留 2 位小数。
-- y_percent:元素中心点 Y 相对整张截图高度的百分比,范围 0-100,可以保留 2 位小数。
+- approximate_location:元素在界面中的大致位置文字描述,例如“窗口右上角”“左侧导航栏中部”“底部任务栏靠左”。不要输出具体坐标或百分比。
 
 判断规则:
 1. 如果截图位于 Windows 桌面,请识别桌面图标、开始菜单入口、任务栏应用、托盘区域等可操作元素。
@@ -51,6 +51,26 @@ SCREEN_ANALYZE_PROMPT = """请作为 AI 视觉自动化助手分析这张 Window
 3. 不要输出 Markdown,不要解释,只输出 JSON。
 """
 
+ELEMENT_LOCATE_PROMPT = """请作为 AI 视觉定位助手,在这张 Windows 屏幕截图中查找一个具体的可操作元素。
+
+目标元素名称:
+{name}
+
+目标元素大致位置描述:
+{approximate_location}
+
+所在界面描述:
+{screen_description}
+
+请严格只输出 JSON 对象,字段为:
+- has_element:boolean,图片中是否能找到该目标元素。
+- x_percent:元素中心点 X 相对整张截图宽度的百分比,范围 0-100,可以保留 2 位小数。找不到时为 null。
+- y_percent:元素中心点 Y 相对整张截图高度的百分比,范围 0-100,可以保留 2 位小数。找不到时为 null。
+- reason:简短中文原因。
+
+只定位这个目标元素,不要列出其他元素。不要输出 Markdown,不要解释,只输出 JSON。
+"""
+
 SCREEN_COMPARE_PROMPT = """请作为 AI 视觉自动化校验器判断两张截图是否处于同一个目标界面。
 
 图片1是当前实际屏幕截图。图片2是数据库中保存的目标界面截图。
@@ -213,9 +233,10 @@ def analyze_screen(payload: AutomationVisionAnalyzeRequest) -> dict[str, Any]:
             conn.execute(
                 """
                 INSERT INTO automation_screen_elements (
-                    screen_id, element_index, name, x_percent, y_percent, x, y, raw_json, created_at
+                    screen_id, element_index, name, x_percent, y_percent, x, y,
+                    approximate_location, is_located, raw_json, created_at
                 )
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                 """,
                 (
                     screen_id,
@@ -225,6 +246,8 @@ def analyze_screen(payload: AutomationVisionAnalyzeRequest) -> dict[str, Any]:
                     element["y_percent"],
                     element["x"],
                     element["y"],
+                    element["approximate_location"],
+                    1 if element["is_located"] else 0,
                     json.dumps(element.get("raw") or element, ensure_ascii=False),
                     now,
                 ),
@@ -237,7 +260,7 @@ def analyze_screen(payload: AutomationVisionAnalyzeRequest) -> dict[str, Any]:
 
 
 def normalize_elements(raw_elements: Any, width: int, height: int) -> list[dict[str, Any]]:
-    """把 AI 返回的百分比坐标转换为截图像素坐标。"""
+    """规范化 AI 返回的可操作元素清单;初始分析阶段不要求坐标。"""
     if not isinstance(raw_elements, list):
         return []
     result = []
@@ -245,8 +268,10 @@ def normalize_elements(raw_elements: Any, width: int, height: int) -> list[dict[
         if not isinstance(item, dict):
             continue
         name = str(item.get("name") or f"元素 {len(result) + 1}")[:160]
-        x_percent = normalize_percent(item.get("x_percent"))
-        y_percent = normalize_percent(item.get("y_percent"))
+        approximate_location = str(item.get("approximate_location") or item.get("location") or "未定位")[:300]
+        x_percent = normalize_percent(item.get("x_percent")) if item.get("x_percent") is not None else 0.0
+        y_percent = normalize_percent(item.get("y_percent")) if item.get("y_percent") is not None else 0.0
+        is_located = item.get("x_percent") is not None and item.get("y_percent") is not None
         x = round(width * x_percent / 100)
         y = round(height * y_percent / 100)
         result.append(
@@ -256,12 +281,68 @@ def normalize_elements(raw_elements: Any, width: int, height: int) -> list[dict[
                 "y_percent": y_percent,
                 "x": max(0, min(width - 1, x)),
                 "y": max(0, min(height - 1, y)),
+                "approximate_location": approximate_location,
+                "is_located": is_located,
                 "raw": item,
             }
         )
     return result
 
 
+def locate_element(screen_id: int, element_id: int, payload: AutomationElementLocateRequest) -> dict[str, Any]:
+    """针对单个可操作元素调用 AI 精确定位,并更新该元素的像素坐标。"""
+    provider_id, model_id, temperature = resolve_ai_params(payload.provider_id, payload.model_id, payload.temperature)
+    screen = get_screen(screen_id)
+    element = next((item for item in screen.get("elements", []) if item["id"] == element_id), None)
+    if not element:
+        raise HTTPException(status_code=404, detail="Automation screen element not found")
+
+    prompt = (
+        ELEMENT_LOCATE_PROMPT
+        .replace("{name}", element.get("name") or "")
+        .replace("{approximate_location}", element.get("approximate_location") or "")
+        .replace("{screen_description}", screen.get("description") or screen.get("interface_name") or "")
+    )
+    ai_result = ai_service.chat_with_images(
+        provider_id,
+        model_id,
+        prompt,
+        [image_to_base64(screen["image_path"])],
+        temperature,
+    )
+    try:
+        parsed = json_from_ai(ai_result["content"])
+    except (json.JSONDecodeError, ValueError) as exc:
+        raise HTTPException(status_code=502, detail=f"AI locate output is not valid JSON: {exc}") from exc
+
+    if not bool(parsed.get("has_element")) or parsed.get("x_percent") is None or parsed.get("y_percent") is None:
+        return {"located": False, "element": element, "ai_result": parsed, "ai_raw_content": ai_result["content"]}
+
+    x_percent = normalize_percent(parsed.get("x_percent"))
+    y_percent = normalize_percent(parsed.get("y_percent"))
+    x = max(0, min(int(screen["width"]) - 1, round(int(screen["width"]) * x_percent / 100)))
+    y = max(0, min(int(screen["height"]) - 1, round(int(screen["height"]) * y_percent / 100)))
+    raw = {**parsed, "previous": element.get("raw_json")}
+    with get_db() as conn:
+        conn.execute(
+            """
+            UPDATE automation_screen_elements
+            SET x_percent = ?, y_percent = ?, x = ?, y = ?, is_located = 1, raw_json = ?
+            WHERE id = ? AND screen_id = ?
+            """,
+            (x_percent, y_percent, x, y, json.dumps(raw, ensure_ascii=False), element_id, screen_id),
+        )
+    updated = get_screen(screen_id, include_image=True)
+    updated_element = next(item for item in updated["elements"] if item["id"] == element_id)
+    return {
+        "located": True,
+        "element": updated_element,
+        "screen": updated,
+        "ai_result": parsed,
+        "ai_raw_content": ai_result["content"],
+    }
+
+
 def normalize_percent(value: Any) -> float:
     """规范化百分比数值,兼容模型偶尔输出 0-1 小数的情况。"""
     try:
@@ -331,6 +412,7 @@ def public_screen(row: dict[str, Any]) -> dict[str, Any]:
 def public_element(row: dict[str, Any]) -> dict[str, Any]:
     """把数据库中的元素行转换为接口返回格式。"""
     item = dict(row)
+    item["is_located"] = bool(item.get("is_located"))
     return item
 
 

+ 4 - 0
backend/app/database.py

@@ -180,6 +180,8 @@ def init_db() -> None:
                 y_percent REAL NOT NULL,
                 x INTEGER NOT NULL,
                 y INTEGER NOT NULL,
+                approximate_location TEXT,
+                is_located INTEGER NOT NULL DEFAULT 0,
                 raw_json TEXT,
                 created_at TEXT NOT NULL,
                 FOREIGN KEY(screen_id) REFERENCES automation_screens(id) ON DELETE CASCADE
@@ -243,6 +245,8 @@ def init_db() -> None:
         ensure_column(conn, "automation_workflow_nodes", "position_x", "REAL NOT NULL DEFAULT 80")
         ensure_column(conn, "automation_workflow_nodes", "position_y", "REAL NOT NULL DEFAULT 80")
         ensure_column(conn, "automation_workflow_nodes", "next_node_keys", "TEXT")
+        ensure_column(conn, "automation_screen_elements", "approximate_location", "TEXT")
+        ensure_column(conn, "automation_screen_elements", "is_located", "INTEGER NOT NULL DEFAULT 0")
         seed_default_tags(conn)
         seed_default_settings(conn)
 

+ 6 - 0
backend/app/main.py

@@ -29,6 +29,7 @@ from .schemas import (
     AiProviderUpdate,
     AutomationKeyboardRequest,
     AutomationKeyboardActionRequest,
+    AutomationElementLocateRequest,
     AutomationMouseRequest,
     AutomationMouseActionRequest,
     AutomationPowerRequest,
@@ -757,6 +758,11 @@ def automation_vision_screenshot(payload: AutomationScreenshotCaptureRequest) ->
     return automation_service.capture_screenshot(payload)
 
 
+@app.post("/api/automation/screens/{screen_id}/elements/{element_id}/locate")
+def automation_element_locate(screen_id: int, element_id: int, payload: AutomationElementLocateRequest) -> dict[str, Any]:
+    return automation_service.locate_element(screen_id, element_id, payload)
+
+
 @app.post("/api/automation/actions/mouse")
 def automation_action_mouse(payload: AutomationMouseActionRequest) -> dict[str, Any]:
     return automation_service.execute_mouse_action(payload)

+ 6 - 0
backend/app/schemas.py

@@ -134,6 +134,12 @@ class AutomationScreenshotCaptureRequest(BaseModel):
     save: bool = True
 
 
+class AutomationElementLocateRequest(BaseModel):
+    provider_id: int | None = None
+    model_id: int | None = None
+    temperature: float = Field(default=0.1, ge=0, le=2)
+
+
 class AutomationActionBase(BaseModel):
     screen_id: int | None = None
     provider_id: int | None = None

+ 43 - 9
backend/app/windows_automation.py

@@ -14,6 +14,27 @@ from fastapi import HTTPException
 MouseAction = Literal["move_to", "move_rel", "click", "double_click", "right_click", "drag_to", "scroll"]
 KeyboardAction = Literal["press", "hotkey", "write", "key_down", "key_up"]
 
+KEY_ALIASES = {
+    "arrowup": "up",
+    "↑": "up",
+    "arrowdown": "down",
+    "↓": "down",
+    "arrowleft": "left",
+    "←": "left",
+    "arrowright": "right",
+    "→": "right",
+    "control": "ctrl",
+    "cmd": "win",
+    "command": "win",
+    "meta": "win",
+    "windows": "win",
+    "esc": "escape",
+    "del": "delete",
+    "pgup": "pageup",
+    "pgdn": "pagedown",
+    " ": "space",
+}
+
 
 def hidden_creationflags() -> int:
     """返回 Windows 下隐藏控制台窗口所需的启动标志。"""
@@ -46,6 +67,17 @@ def load_pyautogui():
     return pyautogui
 
 
+def normalize_key_name(key: str) -> str:
+    """把浏览器或用户输入的按键名转换为 pyautogui 兼容名称。"""
+    normalized = str(key).strip().lower()
+    return KEY_ALIASES.get(normalized, normalized)
+
+
+def normalize_key_list(keys: list[str] | None) -> list[str]:
+    """规范化组合键列表,并去掉空值。"""
+    return [normalize_key_name(key) for key in keys or [] if str(key).strip()]
+
+
 def run_shutdown_command(args: list[str], timeout: int = 10) -> dict[str, Any]:
     """执行 shutdown.exe 命令,并统一返回命令输出。"""
     ensure_windows()
@@ -233,26 +265,28 @@ def keyboard_action(
 ) -> dict[str, Any]:
     """执行键盘动作,包括单键、组合键、输入文本、按下和释放。"""
     pyautogui = load_pyautogui()
+    normalized_key = normalize_key_name(key) if key else None
+    normalized_keys = normalize_key_list(keys)
     if action == "press":
-        if not key:
+        if not normalized_key:
             raise HTTPException(status_code=400, detail="key is required")
-        pyautogui.press(key, interval=interval)
+        pyautogui.press(normalized_key, interval=interval)
     elif action == "hotkey":
-        if not keys:
+        if not normalized_keys:
             raise HTTPException(status_code=400, detail="keys are required")
-        pyautogui.hotkey(*keys, interval=interval)
+        pyautogui.hotkey(*normalized_keys, interval=interval)
     elif action == "write":
         if text is None:
             raise HTTPException(status_code=400, detail="text is required")
         pyautogui.write(text, interval=interval)
     elif action == "key_down":
-        if not key:
+        if not normalized_key:
             raise HTTPException(status_code=400, detail="key is required")
-        pyautogui.keyDown(key)
+        pyautogui.keyDown(normalized_key)
     elif action == "key_up":
-        if not key:
+        if not normalized_key:
             raise HTTPException(status_code=400, detail="key is required")
-        pyautogui.keyUp(key)
+        pyautogui.keyUp(normalized_key)
     else:
         raise HTTPException(status_code=400, detail="Unsupported keyboard action")
-    return {"action": f"keyboard_{action}", "key": key, "keys": keys}
+    return {"action": f"keyboard_{action}", "key": normalized_key, "keys": normalized_keys}

+ 108 - 10
frontend/src/components/AutomationActionView.vue

@@ -21,9 +21,9 @@
       <div class="screenshot-stage">
         <div v-if="imageSrc" class="screenshot-canvas" :style="canvasStyle">
           <img class="screenshot-image" :src="imageSrc" alt="当前 Windows 截图" />
-          <template v-if="currentScreen?.elements">
+          <template v-if="locatedElements.length">
             <button
-              v-for="element in currentScreen.elements"
+              v-for="element in locatedElements"
               :key="element.id || element.element_index"
               class="element-marker"
               :style="markerStyle(element)"
@@ -62,11 +62,16 @@
         <el-table :data="currentScreen?.elements || []" height="420" border stripe>
           <el-table-column prop="element_index" label="#" width="54" />
           <el-table-column prop="name" label="名称" min-width="130" show-overflow-tooltip />
+          <el-table-column prop="approximate_location" label="大致位置" min-width="130" show-overflow-tooltip />
           <el-table-column label="坐标" width="110">
-            <template #default="{ row }">{{ row.x }}, {{ row.y }}</template>
+            <template #default="{ row }">
+              <span v-if="row.is_located">{{ row.x }}, {{ row.y }}</span>
+              <el-tag v-else type="info">未定位</el-tag>
+            </template>
           </el-table-column>
-          <el-table-column label="操作" width="100" fixed="right">
+          <el-table-column label="操作" width="160" fixed="right">
             <template #default="{ row }">
+              <el-button size="small" :loading="locatingElementId === row.id" @click="locateElement(row)">找位置</el-button>
               <el-dropdown @command="(command) => runElementMouse(row, command)">
                 <el-button size="small">点击</el-button>
                 <template #dropdown>
@@ -84,10 +89,23 @@
     </aside>
 
     <el-dialog v-model="keyboardDialog" title="执行键盘操作" width="420px" @opened="focusKeyCapture">
+      <div class="keyboard-builder">
+        <div class="muted">组合键</div>
+        <el-checkbox-group v-model="modifierKeys">
+          <el-checkbox-button label="win">Win</el-checkbox-button>
+          <el-checkbox-button label="ctrl">Ctrl</el-checkbox-button>
+          <el-checkbox-button label="alt">Alt</el-checkbox-button>
+          <el-checkbox-button label="shift">Shift</el-checkbox-button>
+        </el-checkbox-group>
+        <div class="muted">主键</div>
+        <el-select v-model="mainKey" filterable allow-create default-first-option placeholder="选择或输入主键,如 up、enter、a">
+          <el-option v-for="key in commonKeys" :key="key.value" :label="key.label" :value="key.value" />
+        </el-select>
+      </div>
       <div ref="keyCaptureRef" class="key-capture" tabindex="0" @keydown.prevent="captureKey">
-        <div class="muted">点击此区域后按下单键或组合键</div>
+        <div class="muted">也可以点击此区域捕获普通按键;Win 键建议用上方按钮选择</div>
         <div class="key-list">
-          <el-tag v-for="key in capturedKeys" :key="key">{{ key }}</el-tag>
+          <el-tag v-for="key in finalKeyboardKeys" :key="key">{{ key }}</el-tag>
         </div>
       </div>
       <template #footer>
@@ -128,6 +146,7 @@ const providers = ref([])
 const models = ref([])
 const analyzing = ref(false)
 const screenshotLoading = ref(false)
+const locatingElementId = ref(null)
 const savingWorkflow = ref(false)
 const currentScreen = ref(null)
 const recording = ref(false)
@@ -137,6 +156,8 @@ const keyboardDialog = ref(false)
 const textDialog = ref(false)
 const programDialog = ref(false)
 const capturedKeys = ref([])
+const modifierKeys = ref([])
+const mainKey = ref('')
 const keyCaptureRef = ref(null)
 const textInput = ref('')
 const quickProgram = ref('')
@@ -159,6 +180,31 @@ const imageSrc = computed(() => {
   if (!currentScreen.value?.image_base64) return ''
   return `data:${currentScreen.value.mime_type || 'image/png'};base64,${currentScreen.value.image_base64}`
 })
+const locatedElements = computed(() => (currentScreen.value?.elements || []).filter((item) => item.is_located))
+const finalKeyboardKeys = computed(() => {
+  const keys = [...modifierKeys.value]
+  if (mainKey.value) keys.push(normalizeKey(mainKey.value))
+  for (const key of capturedKeys.value) {
+    if (!keys.includes(key)) keys.push(key)
+  }
+  return keys
+})
+const commonKeys = [
+  { label: '↑ 最大化 / 上', value: 'up' },
+  { label: '↓ 下', value: 'down' },
+  { label: '← 左', value: 'left' },
+  { label: '→ 右', value: 'right' },
+  { label: 'Enter', value: 'enter' },
+  { label: 'Esc', value: 'escape' },
+  { label: 'Tab', value: 'tab' },
+  { label: 'Space', value: 'space' },
+  { label: 'Delete', value: 'delete' },
+  { label: 'Backspace', value: 'backspace' },
+  { label: 'F4', value: 'f4' },
+  { label: 'D', value: 'd' },
+  { label: 'E', value: 'e' },
+  { label: 'R', value: 'r' },
+]
 const canvasStyle = computed(() => {
   if (!currentScreen.value?.width || !currentScreen.value?.height) return {}
   return { aspectRatio: `${currentScreen.value.width} / ${currentScreen.value.height}` }
@@ -213,6 +259,32 @@ async function analyzeScreen() {
   }
 }
 
+async function locateElement(element) {
+  if (!ensureAiSelected()) return
+  if (!currentScreen.value?.id) {
+    ElMessage.warning('请先分析界面后再定位元素')
+    return
+  }
+  locatingElementId.value = element.id
+  try {
+    const { data } = await api.post(`/api/automation/screens/${currentScreen.value.id}/elements/${element.id}/locate`, {
+      provider_id: ai.provider_id,
+      model_id: ai.model_id,
+      temperature: ai.temperature,
+    })
+    if (!data.located) {
+      ElMessage.warning(data.ai_result?.reason || 'AI 未找到该元素')
+      return
+    }
+    currentScreen.value = data.screen
+    ElMessage.success(`已定位:${data.element.x}, ${data.element.y}`)
+  } catch (error) {
+    ElMessage.error(error.response?.data?.detail || '定位元素失败')
+  } finally {
+    locatingElementId.value = null
+  }
+}
+
 async function captureScreenshot(silent = false) {
   screenshotLoading.value = true
   try {
@@ -254,6 +326,10 @@ function addNode(node) {
 
 async function runElementMouse(element, mouseAction) {
   if (!ensureAiSelected()) return
+  if (!element.is_located) {
+    ElMessage.warning('请先点击“找位置”定位该元素')
+    return
+  }
   try {
     const { data } = await api.post('/api/automation/actions/mouse', {
       ...actionBase(),
@@ -276,6 +352,8 @@ async function runElementMouse(element, mouseAction) {
 
 function openKeyboardDialog() {
   capturedKeys.value = []
+  modifierKeys.value = []
+  mainKey.value = ''
   keyboardDialog.value = true
 }
 
@@ -286,18 +364,38 @@ async function focusKeyCapture() {
 
 function captureKey(event) {
   const key = normalizeKey(event.key)
-  if (!capturedKeys.value.includes(key)) capturedKeys.value.push(key)
+  if (['ctrl', 'alt', 'shift', 'win'].includes(key)) {
+    if (!modifierKeys.value.includes(key)) modifierKeys.value.push(key)
+    return
+  }
+  mainKey.value = key
+  capturedKeys.value = []
 }
 
 function normalizeKey(key) {
-  const map = { Control: 'ctrl', Shift: 'shift', Alt: 'alt', Meta: 'win', Escape: 'esc', ' ': 'space' }
+  const map = {
+    Control: 'ctrl',
+    Shift: 'shift',
+    Alt: 'alt',
+    Meta: 'win',
+    OS: 'win',
+    Win: 'win',
+    Escape: 'escape',
+    ' ': 'space',
+    ArrowUp: 'up',
+    ArrowDown: 'down',
+    ArrowLeft: 'left',
+    ArrowRight: 'right',
+    PageUp: 'pageup',
+    PageDown: 'pagedown',
+  }
   return map[key] || key.toLowerCase()
 }
 
 async function runKeyboard() {
-  if (!capturedKeys.value.length || !ensureAiSelected()) return
+  if (!finalKeyboardKeys.value.length || !ensureAiSelected()) return
   try {
-    const keys = [...capturedKeys.value]
+    const keys = [...finalKeyboardKeys.value]
     const { data } = await api.post('/api/automation/actions/keyboard', { ...actionBase(), keys })
     rememberProcesses(data.new_processes)
     addNode({ node_type: 'keyboard', screen_id: currentScreen.value?.id || null, title: keys.join('+'), config: { keys } })

+ 7 - 2
frontend/src/components/AutomationScreensView.vue

@@ -34,7 +34,7 @@
             <div v-if="detailImageSrc" class="screenshot-canvas" :style="canvasStyle">
               <img class="screenshot-image" :src="detailImageSrc" alt="已识别界面截图" />
               <button
-                v-for="element in detail.elements || []"
+                v-for="element in locatedElements"
                 :key="element.id"
                 class="element-marker"
                 :style="markerStyle(element)"
@@ -53,8 +53,12 @@
           <el-table :data="detail.elements || []" height="360" border stripe style="margin-top: 12px">
             <el-table-column prop="element_index" label="#" width="54" />
             <el-table-column prop="name" label="名称" min-width="140" show-overflow-tooltip />
+            <el-table-column prop="approximate_location" label="大致位置" min-width="160" show-overflow-tooltip />
             <el-table-column label="坐标" width="120">
-              <template #default="{ row }">{{ row.x }}, {{ row.y }}</template>
+              <template #default="{ row }">
+                <span v-if="row.is_located">{{ row.x }}, {{ row.y }}</span>
+                <el-tag v-else type="info">未定位</el-tag>
+              </template>
             </el-table-column>
           </el-table>
         </div>
@@ -76,6 +80,7 @@ const detailImageSrc = computed(() => {
   if (!detail.value?.image_base64) return ''
   return `data:${detail.value.mime_type || 'image/png'};base64,${detail.value.image_base64}`
 })
+const locatedElements = computed(() => (detail.value?.elements || []).filter((item) => item.is_located))
 const canvasStyle = computed(() => {
   if (!detail.value?.width || !detail.value?.height) return {}
   return { aspectRatio: `${detail.value.width} / ${detail.value.height}` }

+ 6 - 0
frontend/src/styles.css

@@ -251,6 +251,12 @@ body {
   border-color: #409eff;
 }
 
+.keyboard-builder {
+  display: grid;
+  gap: 10px;
+  margin-bottom: 12px;
+}
+
 .key-list {
   display: flex;
   gap: 8px;

+ 4 - 0
task.md

@@ -56,6 +56,8 @@
 - [x] 自动化工作流页面改为可拖动、可连线的大画布编辑,并支持点击执行工作流
 - [x] 后端增加按工作流 ID 执行完整工作流接口
 - [x] 前端开发服务和默认 API 地址支持局域网访问,并更新部署文档
+- [x] 调整界面分析流程:可操作元素先只记录大致位置,按单个元素再调用 AI 精确定位坐标
+- [x] 优化键盘操作弹窗,支持手动选择 Win/Ctrl/Alt/Shift 组合键并规范化方向键名称
 
 ## 进度日志
 
@@ -73,3 +75,5 @@
 - 2026-05-10:开始并完成 Windows 自动化操作模块,新增关机/重启、程序启动/关闭、屏幕截图、pyautogui 鼠标和键盘操作接口;更新后端依赖和接口文档。
 - 2026-05-10:完成 AI 视觉自动化基础功能,支持截图识别并保存界面元素、动作前界面对比校验、错误记录、自动化工作流保存与节点管理;前端增加自动化四个菜单页面,并验证后端编译、前端构建和页面渲染。
 - 2026-05-11:提交 AI 自动化基础代码基线;新增系统设置、自动化截图刷新、工作流画布编辑和后端整条工作流执行接口;前端支持局域网访问默认 API 地址。
+- 2026-05-11:根据本地多模态模型定位效果,调整自动化界面分析为“元素清单 + 大致位置”,新增单元素“找位置”接口和前端按钮,定位成功后再更新元素坐标并绘制标记。
+- 2026-05-11:修复浏览器无法可靠捕获 Win 组合键的问题;键盘操作改为可手动选择修饰键和主键,并在后端将 ArrowUp/Meta 等按键名转换为 pyautogui 兼容名称。