from __future__ import annotations import json import random import time import uuid from pathlib import Path from typing import Any from fastapi import HTTPException from ... import ai_service, settings_service, windows_automation from ..context import WorkflowContext from ..registry import control_ports, field_def, register_node LOCATE_TARGET_PROMPT = """请作为 AI 视觉自动化定位助手,在这张真实屏幕截图中寻找用户指定的可点击目标。 目标描述: {target_description} 当前页面/操作上下文: {screen_context} 选择要求: 1. 如果有多个候选目标,{selection_rule} 2. 返回目标可点击区域的中心点,不要返回窗口、浏览器地址栏或整块页面的中心。 3. 坐标必须是相对整张截图宽高的百分比,范围 0-100。 4. 如果目标不可见、被遮挡、需要滚动、页面未加载完成或你不确定,请返回 found=false。 严格只输出 JSON 对象,不要输出 Markdown: {{ "found": boolean, "x_percent": number|null, "y_percent": number|null, "confidence": number, "target_label": string, "reason": string }}""" VERIFY_PAGE_PROMPT = """请作为 AI 视觉自动化校验器,判断当前屏幕是否符合预期状态。 预期状态: {expected_state} 当前页面/操作上下文: {screen_context} 严格只输出 JSON 对象,不要输出 Markdown: {{ "matched": boolean, "page_state": string, "confidence": number, "reason": string }}""" def _number(value: Any, default: float = 0) -> float: try: return float(value) except (TypeError, ValueError): return default def _boolean(value: Any, default: bool = False) -> bool: if value in (None, ""): return default if isinstance(value, str): return value.strip().lower() in {"1", "true", "yes", "y", "on"} return bool(value) def _percent(value: Any) -> float | None: try: number = float(value) except (TypeError, ValueError): return None if 0 <= number <= 1: number *= 100 return max(0.0, min(100.0, number)) def _runtime_screenshot_path() -> Path: """生成 workflow 运行期截图路径,便于失败排查和任务结果追踪。""" folder = settings_service.resolve_data_path("automation_runtime_path", "automation/runtime") folder.mkdir(parents=True, exist_ok=True) return folder / f"vision_locate_{int(time.time() * 1000)}_{uuid.uuid4().hex[:8]}.png" def _capture_screen(save_screenshot: bool) -> dict[str, Any]: save_path = _runtime_screenshot_path() if save_screenshot else None screenshot = windows_automation.take_screenshot(str(save_path) if save_path else None, include_base64=True) screenshot["mime_type"] = "image/png" return screenshot def _parse_ai_json(content: str) -> dict[str, Any]: parsed = json.loads(ai_service.extract_json_text(content)) if not isinstance(parsed, dict): raise ValueError("AI locate output must be a JSON object") return parsed def _vision_json(context: WorkflowContext, prompt: str, screenshot: dict[str, Any], temperature: float) -> tuple[dict[str, Any], dict[str, Any]]: ai_result = ai_service.chat_with_images( int(context.provider_id), int(context.model_id), prompt, [{"base64": screenshot["image_base64"], "mime_type": screenshot["mime_type"]}], temperature, ) return _parse_ai_json(ai_result["content"]), ai_result def _locate_target( context: WorkflowContext, target_description: str, screen_context: str, randomize: bool, save_screenshot: bool, temperature: float, ) -> dict[str, Any]: screenshot = _capture_screen(save_screenshot) if screenshot.get("path"): context.runtime["current_screenshot_path"] = screenshot["path"] if randomize: selection_rule = f"请结合随机种子 {random.randint(1, 1_000_000)},从可见候选中随机挑选一个" else: selection_rule = "请选择最符合目标描述、最容易点击的一个" prompt = LOCATE_TARGET_PROMPT.format( target_description=target_description, screen_context=screen_context, selection_rule=selection_rule, ) try: parsed, ai_result = _vision_json(context, prompt, screenshot, temperature) except (json.JSONDecodeError, ValueError) as exc: raise HTTPException(status_code=502, detail=f"AI locate output is not valid JSON: {exc}") from exc found = bool(parsed.get("found")) x_percent = _percent(parsed.get("x_percent")) y_percent = _percent(parsed.get("y_percent")) base = { "screenshot_path": screenshot.get("path"), "width": screenshot.get("width"), "height": screenshot.get("height"), "ai_result": parsed, "ai_raw_content": ai_result["content"], } if not found or x_percent is None or y_percent is None: return {"located": False, "found": False, "next_port": "not_found", **base} width = int(screenshot["width"]) height = int(screenshot["height"]) x = max(0, min(width - 1, round(width * x_percent / 100))) y = max(0, min(height - 1, round(height * y_percent / 100))) return { "located": True, "found": True, "x_percent": x_percent, "y_percent": y_percent, "x": x, "y": y, "confidence": parsed.get("confidence"), "target_label": parsed.get("target_label"), "reason": parsed.get("reason"), **base, } def locate_element_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]: params = node.get("params", {}) if not context.provider_id or not context.model_id: raise HTTPException(status_code=400, detail="AI 视觉定位节点需要配置默认 AI 服务商和模型") target_description = str(inputs.get("target_description", params.get("target_description")) or "").strip() if not target_description: raise HTTPException(status_code=400, detail="target_description is required") screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前屏幕").strip() randomize = _boolean(inputs.get("randomize", params.get("randomize")), False) save_screenshot = _boolean(inputs.get("save_screenshot", params.get("save_screenshot")), True) fail_if_not_found = _boolean(inputs.get("fail_if_not_found", params.get("fail_if_not_found")), True) temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature) result = _locate_target( context, target_description=target_description, screen_context=screen_context, randomize=randomize, save_screenshot=save_screenshot, temperature=temperature, ) if not result.get("located"): if fail_if_not_found: ai_result = result.get("ai_result") if isinstance(result.get("ai_result"), dict) else {} raise HTTPException(status_code=404, detail=ai_result.get("reason") or "AI 未定位到目标元素") return result return result def verify_page_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]: params = node.get("params", {}) if not context.provider_id or not context.model_id: raise HTTPException(status_code=400, detail="AI 页面校验节点需要配置默认 AI 服务商和模型") expected_state = str(inputs.get("expected_state", params.get("expected_state")) or "").strip() if not expected_state: raise HTTPException(status_code=400, detail="expected_state is required") screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前屏幕").strip() save_screenshot = _boolean(inputs.get("save_screenshot", params.get("save_screenshot")), True) temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature) screenshot = _capture_screen(save_screenshot) if screenshot.get("path"): context.runtime["current_screenshot_path"] = screenshot["path"] prompt = VERIFY_PAGE_PROMPT.format(expected_state=expected_state, screen_context=screen_context) try: parsed, ai_result = _vision_json(context, prompt, screenshot, temperature) except (json.JSONDecodeError, ValueError) as exc: raise HTTPException(status_code=502, detail=f"AI verify output is not valid JSON: {exc}") from exc matched = bool(parsed.get("matched")) return { "matched": matched, "next_port": "matched" if matched else "not_matched", "page_state": parsed.get("page_state"), "confidence": parsed.get("confidence"), "reason": parsed.get("reason"), "screenshot_path": screenshot.get("path"), "width": screenshot.get("width"), "height": screenshot.get("height"), "ai_result": parsed, "ai_raw_content": ai_result["content"], } def click_target_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]: params = node.get("params", {}) target_description = str(inputs.get("target_description", params.get("target_description")) or "").strip() if not target_description: raise HTTPException(status_code=400, detail="target_description is required") screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前屏幕").strip() randomize = _boolean(inputs.get("randomize", params.get("randomize")), False) save_screenshot = _boolean(inputs.get("save_screenshot", params.get("save_screenshot")), True) fail_if_not_found = _boolean(inputs.get("fail_if_not_found", params.get("fail_if_not_found")), True) temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature) button = str(inputs.get("button", params.get("button")) or "left") clicks = int(max(1, min(_number(inputs.get("clicks", params.get("clicks")), 1), 20))) result = _locate_target(context, target_description, screen_context, randomize, save_screenshot, temperature) if not result.get("located"): if fail_if_not_found: ai_result = result.get("ai_result") if isinstance(result.get("ai_result"), dict) else {} raise HTTPException(status_code=404, detail=ai_result.get("reason") or "AI 未定位到可点击目标") return result clicked = windows_automation.mouse_action("click", x=int(result["x"]), y=int(result["y"]), button=button, clicks=clicks) return {**result, "clicked": True, "click": clicked, "button": button, "clicks": clicks} def close_popups_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]: params = node.get("params", {}) target_description = str( inputs.get("target_description", params.get("target_description")) or "当前页面可见的弹窗关闭按钮、跳过按钮、稍后再说按钮、我知道了按钮或拒绝按钮" ) screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前浏览器页面").strip() attempts = int(max(1, min(_number(inputs.get("attempts", params.get("attempts")), 2), 5))) temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature) closed: list[dict[str, Any]] = [] for _ in range(attempts): result = _locate_target(context, target_description, screen_context, False, True, temperature) if not result.get("located"): return {"closed_count": len(closed), "items": closed, "next_port": "success"} clicked = windows_automation.mouse_action("click", x=int(result["x"]), y=int(result["y"])) closed.append({**result, "click": clicked}) time.sleep(0.8) return {"closed_count": len(closed), "items": closed, "next_port": "success"} register_node( { "type": "vision.locate_element", "category": "vision", "label": "AI 视觉定位元素", "params": { "target_description": field_def("text", "目标描述", required=True), "screen_context": field_def("text", "页面上下文"), "randomize": field_def("boolean", "多候选随机选择", False), "save_screenshot": field_def("boolean", "保存截图", True), "fail_if_not_found": field_def("boolean", "找不到时报错", True), "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2), }, "inputs": { "target_description": field_def("string", "目标描述"), "screen_context": field_def("string", "页面上下文"), "randomize": field_def("boolean", "多候选随机选择"), "save_screenshot": field_def("boolean", "保存截图"), "fail_if_not_found": field_def("boolean", "找不到时报错"), "temperature": field_def("number", "定位温度"), }, "outputs": { "located": {"type": "boolean", "label": "是否定位成功"}, "x_percent": {"type": "number", "label": "X 百分比"}, "y_percent": {"type": "number", "label": "Y 百分比"}, "x": {"type": "number", "label": "X 坐标"}, "y": {"type": "number", "label": "Y 坐标"}, "confidence": {"type": "number", "label": "置信度"}, "target_label": {"type": "string", "label": "目标标签"}, "screenshot_path": {"type": "string", "label": "截图路径"}, "ai_result": {"type": "object", "label": "AI 结果"}, }, "control_ports": control_ports(["success", "not_found", "failure"]), }, locate_element_node, ) register_node( { "type": "vision.verify_page", "category": "vision", "label": "AI 校验页面状态", "description": "截取当前屏幕,让多模态 AI 判断页面是否符合预期,并按 matched/not_matched 分支继续。", "params": { "expected_state": field_def("text", "预期状态", required=True), "screen_context": field_def("text", "页面上下文"), "save_screenshot": field_def("boolean", "保存截图", True), "temperature": field_def("number", "校验温度", 0.1, minimum=0, maximum=2), }, "inputs": { "expected_state": field_def("string", "预期状态"), "screen_context": field_def("string", "页面上下文"), "save_screenshot": field_def("boolean", "保存截图"), "temperature": field_def("number", "校验温度"), }, "outputs": { "matched": {"type": "boolean", "label": "是否匹配"}, "page_state": {"type": "string", "label": "页面状态"}, "confidence": {"type": "number", "label": "置信度"}, "reason": {"type": "string", "label": "原因"}, "screenshot_path": {"type": "string", "label": "截图路径"}, "ai_result": {"type": "object", "label": "AI 结果"}, }, "control_ports": control_ports(["matched", "not_matched", "failure"]), }, verify_page_node, ) register_node( { "type": "vision.click_target", "category": "vision", "label": "AI 定位并点击", "description": "截屏定位目标元素,换算坐标后立即点击,适合封装常见视觉点击步骤。", "params": { "target_description": field_def("text", "目标描述", required=True), "screen_context": field_def("text", "页面上下文"), "randomize": field_def("boolean", "多候选随机选择", False), "button": field_def("select", "按键", "left", options=["left", "middle", "right"]), "clicks": field_def("number", "点击次数", 1, minimum=1, maximum=20), "save_screenshot": field_def("boolean", "保存截图", True), "fail_if_not_found": field_def("boolean", "找不到时报错", True), "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2), }, "inputs": { "target_description": field_def("string", "目标描述"), "screen_context": field_def("string", "页面上下文"), "randomize": field_def("boolean", "多候选随机选择"), "button": field_def("string", "按键"), "clicks": field_def("number", "点击次数"), "save_screenshot": field_def("boolean", "保存截图"), "fail_if_not_found": field_def("boolean", "找不到时报错"), "temperature": field_def("number", "定位温度"), }, "outputs": { "located": {"type": "boolean", "label": "是否定位成功"}, "clicked": {"type": "boolean", "label": "是否已点击"}, "x": {"type": "number", "label": "X 坐标"}, "y": {"type": "number", "label": "Y 坐标"}, "confidence": {"type": "number", "label": "置信度"}, "target_label": {"type": "string", "label": "目标标签"}, "click": {"type": "object", "label": "点击结果"}, "ai_result": {"type": "object", "label": "AI 结果"}, }, "control_ports": control_ports(["success", "not_found", "failure"]), }, click_target_node, ) register_node( { "type": "vision.close_popups", "category": "vision", "label": "AI 关闭弹窗", "description": "尝试识别并点击当前页面上的关闭、跳过、稍后再说等弹窗按钮。", "params": { "target_description": field_def("text", "关闭目标", "当前页面可见的弹窗关闭按钮、跳过按钮、稍后再说按钮、我知道了按钮或拒绝按钮"), "screen_context": field_def("text", "页面上下文", "当前浏览器页面"), "attempts": field_def("number", "最多尝试", 2, minimum=1, maximum=5), "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2), }, "inputs": { "target_description": field_def("string", "关闭目标"), "screen_context": field_def("string", "页面上下文"), "attempts": field_def("number", "最多尝试"), "temperature": field_def("number", "定位温度"), }, "outputs": { "closed_count": {"type": "number", "label": "关闭数量"}, "items": {"type": "array", "label": "关闭记录"}, }, "control_ports": control_ports(), }, close_popups_node, )