| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412 |
- from __future__ import annotations
- import json
- import random
- import time
- import uuid
- from pathlib import Path
- from typing import Any
- from fastapi import HTTPException
- from ... import ai_service, settings_service, windows_automation
- from ..context import WorkflowContext
- from ..registry import control_ports, field_def, register_node
# Prompt template sent to the vision model when locating a clickable target.
# It is filled in with ``str.format`` using the ``target_description``,
# ``screen_context`` and ``selection_rule`` placeholders; the doubled braces
# are escaped literals that render as the JSON skeleton the model must return.
LOCATE_TARGET_PROMPT = """请作为 AI 视觉自动化定位助手,在这张真实屏幕截图中寻找用户指定的可点击目标。
目标描述:
{target_description}
当前页面/操作上下文:
{screen_context}
选择要求:
1. 如果有多个候选目标,{selection_rule}
2. 返回目标可点击区域的中心点,不要返回窗口、浏览器地址栏或整块页面的中心。
3. 坐标必须是相对整张截图宽高的百分比,范围 0-100。
4. 如果目标不可见、被遮挡、需要滚动、页面未加载完成或你不确定,请返回 found=false。
严格只输出 JSON 对象,不要输出 Markdown:
{{
"found": boolean,
"x_percent": number|null,
"y_percent": number|null,
"confidence": number,
"target_label": string,
"reason": string
}}"""
# Prompt template sent to the vision model when checking whether the current
# screen matches an expected state. It is filled in with ``str.format`` using
# the ``expected_state`` and ``screen_context`` placeholders; the doubled
# braces are escaped literals that render as the JSON skeleton to return.
VERIFY_PAGE_PROMPT = """请作为 AI 视觉自动化校验器,判断当前屏幕是否符合预期状态。
预期状态:
{expected_state}
当前页面/操作上下文:
{screen_context}
严格只输出 JSON 对象,不要输出 Markdown:
{{
"matched": boolean,
"page_state": string,
"confidence": number,
"reason": string
}}"""
- def _number(value: Any, default: float = 0) -> float:
- try:
- return float(value)
- except (TypeError, ValueError):
- return default
- def _boolean(value: Any, default: bool = False) -> bool:
- if value in (None, ""):
- return default
- if isinstance(value, str):
- return value.strip().lower() in {"1", "true", "yes", "y", "on"}
- return bool(value)
- def _percent(value: Any) -> float | None:
- try:
- number = float(value)
- except (TypeError, ValueError):
- return None
- if 0 <= number <= 1:
- number *= 100
- return max(0.0, min(100.0, number))
def _runtime_screenshot_path() -> Path:
    """Build a fresh screenshot path for the current workflow run.

    Saved screenshots make failures easier to investigate and let task
    results be traced back to what was on screen. The runtime folder is
    created if it does not exist yet.
    """
    runtime_dir = settings_service.resolve_data_path("automation_runtime_path", "automation/runtime")
    runtime_dir.mkdir(parents=True, exist_ok=True)
    # Millisecond timestamp plus a short random suffix keeps names unique.
    timestamp_ms = int(time.time() * 1000)
    suffix = uuid.uuid4().hex[:8]
    return runtime_dir / f"vision_locate_{timestamp_ms}_{suffix}.png"
def _capture_screen(save_screenshot: bool) -> dict[str, Any]:
    """Take a screenshot and return its metadata with base64 image data.

    Args:
        save_screenshot: When true, a runtime path is generated and passed to
            ``windows_automation.take_screenshot``; otherwise ``None`` is
            passed as the path.

    Returns:
        The dict returned by ``take_screenshot`` with ``"mime_type"`` set to
        ``"image/png"``.
    """
    destination = None
    if save_screenshot:
        candidate = _runtime_screenshot_path()
        if candidate:
            destination = str(candidate)
    shot = windows_automation.take_screenshot(destination, include_base64=True)
    shot["mime_type"] = "image/png"
    return shot
def _parse_ai_json(content: str) -> dict[str, Any]:
    """Decode the model's reply into a JSON object.

    The raw text is first passed through ``ai_service.extract_json_text`` and
    then parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
        ValueError: If the decoded value is not a JSON object (``dict``).
    """
    json_text = ai_service.extract_json_text(content)
    payload = json.loads(json_text)
    if isinstance(payload, dict):
        return payload
    raise ValueError("AI locate output must be a JSON object")
def _vision_json(
    context: WorkflowContext,
    prompt: str,
    screenshot: dict[str, Any],
    temperature: float,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Send ``prompt`` plus the screenshot to the vision model and parse the reply.

    Args:
        context: Supplies ``provider_id`` and ``model_id`` for the AI call.
        prompt: Fully formatted instruction text.
        screenshot: Must contain ``"image_base64"`` and ``"mime_type"``.
        temperature: Sampling temperature forwarded to the AI service.

    Returns:
        A ``(parsed, ai_result)`` pair: the decoded JSON object and the raw
        result returned by ``ai_service.chat_with_images``.
    """
    provider = int(context.provider_id)
    model = int(context.model_id)
    images = [{"base64": screenshot["image_base64"], "mime_type": screenshot["mime_type"]}]
    response = ai_service.chat_with_images(provider, model, prompt, images, temperature)
    parsed = _parse_ai_json(response["content"])
    return parsed, response
def _locate_target(
    context: WorkflowContext,
    target_description: str,
    screen_context: str,
    randomize: bool,
    save_screenshot: bool,
    temperature: float,
) -> dict[str, Any]:
    """Capture the screen and ask the vision model where the target is.

    Args:
        context: Workflow context; supplies the AI provider/model and receives
            ``runtime["current_screenshot_path"]`` when a screenshot is saved.
        target_description: What the model should look for.
        screen_context: Free-text description of the current page/operation.
        randomize: When true, the model is asked to pick one of several
            visible candidates at random (a random seed is put in the prompt).
        save_screenshot: Whether the screenshot is persisted to disk.
        temperature: Sampling temperature for the AI call.

    Returns:
        A dict that always carries ``screenshot_path``, ``width``, ``height``,
        ``ai_result`` and ``ai_raw_content``. When the target was not found
        (or no usable coordinates came back) it has ``located=False``,
        ``found=False`` and ``next_port="not_found"``. Otherwise it has
        ``located=True`` plus percent and pixel coordinates, ``confidence``,
        ``target_label`` and ``reason``.

    Raises:
        HTTPException: 502 when the model output is not a valid JSON object.
    """
    screenshot = _capture_screen(save_screenshot)
    if screenshot.get("path"):
        context.runtime["current_screenshot_path"] = screenshot["path"]

    if randomize:
        selection_rule = f"请结合随机种子 {random.randint(1, 1_000_000)},从可见候选中随机挑选一个"
    else:
        selection_rule = "请选择最符合目标描述、最容易点击的一个"
    prompt = LOCATE_TARGET_PROMPT.format(
        target_description=target_description,
        screen_context=screen_context,
        selection_rule=selection_rule,
    )

    try:
        parsed, ai_result = _vision_json(context, prompt, screenshot, temperature)
    except (json.JSONDecodeError, ValueError) as exc:
        raise HTTPException(status_code=502, detail=f"AI locate output is not valid JSON: {exc}") from exc

    # Use the tolerant boolean parser: a plain ``bool()`` would treat a
    # stringified answer such as "false" as truthy and lead to a stray click.
    found = _boolean(parsed.get("found"))
    x_percent = _percent(parsed.get("x_percent"))
    y_percent = _percent(parsed.get("y_percent"))
    base = {
        "screenshot_path": screenshot.get("path"),
        "width": screenshot.get("width"),
        "height": screenshot.get("height"),
        "ai_result": parsed,
        "ai_raw_content": ai_result["content"],
    }
    if not found or x_percent is None or y_percent is None:
        return {"located": False, "found": False, "next_port": "not_found", **base}

    width = int(screenshot["width"])
    height = int(screenshot["height"])
    # Convert percentages to pixels and keep the point inside the image.
    x = max(0, min(width - 1, round(width * x_percent / 100)))
    y = max(0, min(height - 1, round(height * y_percent / 100)))
    return {
        "located": True,
        "found": True,
        "x_percent": x_percent,
        "y_percent": y_percent,
        "x": x,
        "y": y,
        "confidence": parsed.get("confidence"),
        "target_label": parsed.get("target_label"),
        "reason": parsed.get("reason"),
        **base,
    }
def locate_element_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]:
    """Workflow node handler: locate an on-screen element with the vision model.

    Each option is read from ``inputs`` first and falls back to the node's
    ``params`` when the key is absent from ``inputs``.

    Returns:
        The dict produced by ``_locate_target``.

    Raises:
        HTTPException: 400 when no AI provider/model is configured or
            ``target_description`` is empty; 404 when the target is not
            located and ``fail_if_not_found`` is enabled.
    """
    settings = node.get("params", {})

    def option(key: str) -> Any:
        # A wired input takes precedence over the statically configured param.
        return inputs.get(key, settings.get(key))

    if not (context.provider_id and context.model_id):
        raise HTTPException(status_code=400, detail="AI 视觉定位节点需要配置默认 AI 服务商和模型")

    description = str(option("target_description") or "").strip()
    if description == "":
        raise HTTPException(status_code=400, detail="target_description is required")

    page_context = str(option("screen_context") or "当前屏幕").strip()
    pick_random = _boolean(option("randomize"), False)
    keep_screenshot = _boolean(option("save_screenshot"), True)
    must_find = _boolean(option("fail_if_not_found"), True)
    sampling_temperature = _number(option("temperature"), context.temperature)

    outcome = _locate_target(
        context,
        target_description=description,
        screen_context=page_context,
        randomize=pick_random,
        save_screenshot=keep_screenshot,
        temperature=sampling_temperature,
    )

    if must_find and not outcome.get("located"):
        model_answer = outcome.get("ai_result")
        if not isinstance(model_answer, dict):
            model_answer = {}
        raise HTTPException(status_code=404, detail=model_answer.get("reason") or "AI 未定位到目标元素")
    return outcome
def verify_page_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]:
    """Workflow node handler: ask the vision model whether the screen matches an expected state.

    Each option is read from ``inputs`` first and falls back to the node's
    ``params`` when the key is absent from ``inputs``.

    Returns:
        A dict with ``matched``, ``next_port`` (``"matched"`` or
        ``"not_matched"``), the model's ``page_state`` / ``confidence`` /
        ``reason``, screenshot metadata, the parsed ``ai_result`` and the raw
        ``ai_raw_content``.

    Raises:
        HTTPException: 400 when no AI provider/model is configured or
            ``expected_state`` is empty; 502 when the model output is not a
            valid JSON object.
    """
    params = node.get("params", {})
    if not context.provider_id or not context.model_id:
        raise HTTPException(status_code=400, detail="AI 页面校验节点需要配置默认 AI 服务商和模型")
    expected_state = str(inputs.get("expected_state", params.get("expected_state")) or "").strip()
    if not expected_state:
        raise HTTPException(status_code=400, detail="expected_state is required")
    screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前屏幕").strip()
    save_screenshot = _boolean(inputs.get("save_screenshot", params.get("save_screenshot")), True)
    temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature)

    screenshot = _capture_screen(save_screenshot)
    if screenshot.get("path"):
        context.runtime["current_screenshot_path"] = screenshot["path"]

    prompt = VERIFY_PAGE_PROMPT.format(expected_state=expected_state, screen_context=screen_context)
    try:
        parsed, ai_result = _vision_json(context, prompt, screenshot, temperature)
    except (json.JSONDecodeError, ValueError) as exc:
        raise HTTPException(status_code=502, detail=f"AI verify output is not valid JSON: {exc}") from exc

    # Use the tolerant boolean parser: a plain ``bool()`` would treat a
    # stringified answer such as "false" as a match and take the wrong branch.
    matched = _boolean(parsed.get("matched"))
    return {
        "matched": matched,
        "next_port": "matched" if matched else "not_matched",
        "page_state": parsed.get("page_state"),
        "confidence": parsed.get("confidence"),
        "reason": parsed.get("reason"),
        "screenshot_path": screenshot.get("path"),
        "width": screenshot.get("width"),
        "height": screenshot.get("height"),
        "ai_result": parsed,
        "ai_raw_content": ai_result["content"],
    }
def click_target_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]:
    """Workflow node handler: locate a target with the vision model and click it.

    Each option is read from ``inputs`` first and falls back to the node's
    ``params`` when the key is absent from ``inputs``. ``clicks`` is limited
    to the range 1-20 and ``button`` defaults to ``"left"``.

    Returns:
        When the target is located: the ``_locate_target`` result extended
        with ``clicked=True``, ``click`` (the result of
        ``windows_automation.mouse_action``), ``button`` and ``clicks``.
        When it is not located and ``fail_if_not_found`` is disabled: the
        unmodified ``_locate_target`` result.

    Raises:
        HTTPException: 400 when no AI provider/model is configured or
            ``target_description`` is empty; 404 when the target is not
            located and ``fail_if_not_found`` is enabled.
    """
    params = node.get("params", {})
    # Same guard as the other vision nodes: without it a missing provider or
    # model only surfaces later as a TypeError from ``int(None)``, after the
    # screenshot has already been taken.
    if not context.provider_id or not context.model_id:
        raise HTTPException(status_code=400, detail="AI 定位并点击节点需要配置默认 AI 服务商和模型")
    target_description = str(inputs.get("target_description", params.get("target_description")) or "").strip()
    if not target_description:
        raise HTTPException(status_code=400, detail="target_description is required")
    screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前屏幕").strip()
    randomize = _boolean(inputs.get("randomize", params.get("randomize")), False)
    save_screenshot = _boolean(inputs.get("save_screenshot", params.get("save_screenshot")), True)
    fail_if_not_found = _boolean(inputs.get("fail_if_not_found", params.get("fail_if_not_found")), True)
    temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature)
    button = str(inputs.get("button", params.get("button")) or "left")
    clicks = int(max(1, min(_number(inputs.get("clicks", params.get("clicks")), 1), 20)))

    result = _locate_target(context, target_description, screen_context, randomize, save_screenshot, temperature)
    if not result.get("located"):
        if fail_if_not_found:
            ai_result = result.get("ai_result") if isinstance(result.get("ai_result"), dict) else {}
            raise HTTPException(status_code=404, detail=ai_result.get("reason") or "AI 未定位到可点击目标")
        return result

    clicked = windows_automation.mouse_action("click", x=int(result["x"]), y=int(result["y"]), button=button, clicks=clicks)
    return {**result, "clicked": True, "click": clicked, "button": button, "clicks": clicks}
def close_popups_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]:
    """Workflow node handler: repeatedly locate and click popup-dismiss buttons.

    Each option is read from ``inputs`` first and falls back to the node's
    ``params`` when the key is absent from ``inputs``. Up to ``attempts``
    rounds (limited to 1-5, default 2) are made; the loop stops early as soon
    as no dismiss button is located. A screenshot is saved for every round.

    Returns:
        ``{"closed_count": int, "items": list, "next_port": "success"}`` where
        each item is a ``_locate_target`` result plus the ``click`` outcome.

    Raises:
        HTTPException: 400 when no AI provider/model is configured; errors
            raised by ``_locate_target`` propagate unchanged.
    """
    params = node.get("params", {})
    # Same guard as the other vision nodes: without it a missing provider or
    # model only surfaces later as a TypeError from ``int(None)``.
    if not context.provider_id or not context.model_id:
        raise HTTPException(status_code=400, detail="AI 关闭弹窗节点需要配置默认 AI 服务商和模型")
    # Strip before applying the default so a whitespace-only description does
    # not reach the model as an empty target.
    target_description = (
        str(inputs.get("target_description", params.get("target_description")) or "").strip()
        or "当前页面可见的弹窗关闭按钮、跳过按钮、稍后再说按钮、我知道了按钮或拒绝按钮"
    )
    screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前浏览器页面").strip()
    attempts = int(max(1, min(_number(inputs.get("attempts", params.get("attempts")), 2), 5)))
    temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature)

    closed: list[dict[str, Any]] = []
    for _ in range(attempts):
        result = _locate_target(context, target_description, screen_context, False, True, temperature)
        if not result.get("located"):
            # Nothing (more) to dismiss.
            break
        clicked = windows_automation.mouse_action("click", x=int(result["x"]), y=int(result["y"]))
        closed.append({**result, "click": clicked})
        # Pause after the click before the next screenshot is taken.
        time.sleep(0.8)
    return {"closed_count": len(closed), "items": closed, "next_port": "success"}
# Register the "vision.locate_element" node type with ``locate_element_node``
# as its handler. The handler reads each option from ``inputs`` first and
# falls back to ``params``, which is why both sections list the same keys.
# NOTE(review): unlike the other vision nodes, this spec has no "description".
register_node(
    {
        "type": "vision.locate_element",
        "category": "vision",
        "label": "AI 视觉定位元素",
        # Statically configured options (with defaults).
        "params": {
            "target_description": field_def("text", "目标描述", required=True),
            "screen_context": field_def("text", "页面上下文"),
            "randomize": field_def("boolean", "多候选随机选择", False),
            "save_screenshot": field_def("boolean", "保存截图", True),
            "fail_if_not_found": field_def("boolean", "找不到时报错", True),
            "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2),
        },
        # Per-run overrides of the params above.
        "inputs": {
            "target_description": field_def("string", "目标描述"),
            "screen_context": field_def("string", "页面上下文"),
            "randomize": field_def("boolean", "多候选随机选择"),
            "save_screenshot": field_def("boolean", "保存截图"),
            "fail_if_not_found": field_def("boolean", "找不到时报错"),
            "temperature": field_def("number", "定位温度"),
        },
        "outputs": {
            "located": {"type": "boolean", "label": "是否定位成功"},
            "x_percent": {"type": "number", "label": "X 百分比"},
            "y_percent": {"type": "number", "label": "Y 百分比"},
            "x": {"type": "number", "label": "X 坐标"},
            "y": {"type": "number", "label": "Y 坐标"},
            "confidence": {"type": "number", "label": "置信度"},
            "target_label": {"type": "string", "label": "目标标签"},
            "screenshot_path": {"type": "string", "label": "截图路径"},
            "ai_result": {"type": "object", "label": "AI 结果"},
        },
        # The handler returns next_port="not_found" when the target is missing
        # and "fail_if_not_found" is disabled.
        "control_ports": control_ports(["success", "not_found", "failure"]),
    },
    locate_element_node,
)
# Register the "vision.verify_page" node type with ``verify_page_node`` as its
# handler. The handler reads each option from ``inputs`` first and falls back
# to ``params``, which is why both sections list the same keys.
register_node(
    {
        "type": "vision.verify_page",
        "category": "vision",
        "label": "AI 校验页面状态",
        "description": "截取当前屏幕,让多模态 AI 判断页面是否符合预期,并按 matched/not_matched 分支继续。",
        # Statically configured options (with defaults).
        "params": {
            "expected_state": field_def("text", "预期状态", required=True),
            "screen_context": field_def("text", "页面上下文"),
            "save_screenshot": field_def("boolean", "保存截图", True),
            "temperature": field_def("number", "校验温度", 0.1, minimum=0, maximum=2),
        },
        # Per-run overrides of the params above.
        "inputs": {
            "expected_state": field_def("string", "预期状态"),
            "screen_context": field_def("string", "页面上下文"),
            "save_screenshot": field_def("boolean", "保存截图"),
            "temperature": field_def("number", "校验温度"),
        },
        "outputs": {
            "matched": {"type": "boolean", "label": "是否匹配"},
            "page_state": {"type": "string", "label": "页面状态"},
            "confidence": {"type": "number", "label": "置信度"},
            "reason": {"type": "string", "label": "原因"},
            "screenshot_path": {"type": "string", "label": "截图路径"},
            "ai_result": {"type": "object", "label": "AI 结果"},
        },
        # The handler sets next_port to "matched" or "not_matched".
        "control_ports": control_ports(["matched", "not_matched", "failure"]),
    },
    verify_page_node,
)
# Register the "vision.click_target" node type with ``click_target_node`` as
# its handler. The handler reads each option from ``inputs`` first and falls
# back to ``params``, which is why both sections list the same keys.
register_node(
    {
        "type": "vision.click_target",
        "category": "vision",
        "label": "AI 定位并点击",
        "description": "截屏定位目标元素,换算坐标后立即点击,适合封装常见视觉点击步骤。",
        # Statically configured options (with defaults).
        "params": {
            "target_description": field_def("text", "目标描述", required=True),
            "screen_context": field_def("text", "页面上下文"),
            "randomize": field_def("boolean", "多候选随机选择", False),
            "button": field_def("select", "按键", "left", options=["left", "middle", "right"]),
            # The handler also limits clicks to 1-20.
            "clicks": field_def("number", "点击次数", 1, minimum=1, maximum=20),
            "save_screenshot": field_def("boolean", "保存截图", True),
            "fail_if_not_found": field_def("boolean", "找不到时报错", True),
            "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2),
        },
        # Per-run overrides of the params above.
        "inputs": {
            "target_description": field_def("string", "目标描述"),
            "screen_context": field_def("string", "页面上下文"),
            "randomize": field_def("boolean", "多候选随机选择"),
            "button": field_def("string", "按键"),
            "clicks": field_def("number", "点击次数"),
            "save_screenshot": field_def("boolean", "保存截图"),
            "fail_if_not_found": field_def("boolean", "找不到时报错"),
            "temperature": field_def("number", "定位温度"),
        },
        "outputs": {
            "located": {"type": "boolean", "label": "是否定位成功"},
            "clicked": {"type": "boolean", "label": "是否已点击"},
            "x": {"type": "number", "label": "X 坐标"},
            "y": {"type": "number", "label": "Y 坐标"},
            "confidence": {"type": "number", "label": "置信度"},
            "target_label": {"type": "string", "label": "目标标签"},
            "click": {"type": "object", "label": "点击结果"},
            "ai_result": {"type": "object", "label": "AI 结果"},
        },
        # The handler returns next_port="not_found" when the target is missing
        # and "fail_if_not_found" is disabled.
        "control_ports": control_ports(["success", "not_found", "failure"]),
    },
    click_target_node,
)
# Register the "vision.close_popups" node type with ``close_popups_node`` as
# its handler. The handler reads each option from ``inputs`` first and falls
# back to ``params``, which is why both sections list the same keys.
register_node(
    {
        "type": "vision.close_popups",
        "category": "vision",
        "label": "AI 关闭弹窗",
        "description": "尝试识别并点击当前页面上的关闭、跳过、稍后再说等弹窗按钮。",
        # Statically configured options (with defaults).
        "params": {
            "target_description": field_def("text", "关闭目标", "当前页面可见的弹窗关闭按钮、跳过按钮、稍后再说按钮、我知道了按钮或拒绝按钮"),
            "screen_context": field_def("text", "页面上下文", "当前浏览器页面"),
            # The handler also limits attempts to 1-5.
            "attempts": field_def("number", "最多尝试", 2, minimum=1, maximum=5),
            "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2),
        },
        # Per-run overrides of the params above.
        "inputs": {
            "target_description": field_def("string", "关闭目标"),
            "screen_context": field_def("string", "页面上下文"),
            "attempts": field_def("number", "最多尝试"),
            "temperature": field_def("number", "定位温度"),
        },
        "outputs": {
            "closed_count": {"type": "number", "label": "关闭数量"},
            "items": {"type": "array", "label": "关闭记录"},
        },
        # control_ports() is called without arguments; the resulting port set
        # is defined by the registry and is not visible here. The handler
        # always returns next_port="success".
        "control_ports": control_ports(),
    },
    close_popups_node,
)
|