Luxnk
/
win_monitor


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
							from __future__ import annotations

import json
import math
import random
import time
import uuid
from pathlib import Path
from typing import Any

from fastapi import HTTPException

from ... import ai_service, settings_service, windows_automation
from ..context import WorkflowContext
from ..registry import control_ports, field_def, register_node


# Prompt template for the "locate a clickable target" vision request.
# It is rendered with str.format() using three fields: target_description,
# screen_context and selection_rule. The doubled braces ({{ and }}) are format
# escapes that produce the literal JSON skeleton the model is asked to return.
LOCATE_TARGET_PROMPT = """请作为 AI 视觉自动化定位助手，在这张真实屏幕截图中寻找用户指定的可点击目标。

目标描述：
{target_description}

当前页面/操作上下文：
{screen_context}

选择要求：
1. 如果有多个候选目标，{selection_rule}
2. 返回目标可点击区域的中心点，不要返回窗口、浏览器地址栏或整块页面的中心。
3. 坐标必须是相对整张截图宽高的百分比，范围 0-100。
4. 如果目标不可见、被遮挡、需要滚动、页面未加载完成或你不确定，请返回 found=false。

严格只输出 JSON 对象，不要输出 Markdown：
{{
  "found": boolean,
  "x_percent": number|null,
  "y_percent": number|null,
  "confidence": number,
  "target_label": string,
  "reason": string
}}"""

# Prompt template for the "verify page state" vision request.
# It is rendered with str.format() using two fields: expected_state and
# screen_context. The doubled braces ({{ and }}) are format escapes that
# produce the literal JSON skeleton the model is asked to return.
VERIFY_PAGE_PROMPT = """请作为 AI 视觉自动化校验器，判断当前屏幕是否符合预期状态。

预期状态：
{expected_state}

当前页面/操作上下文：
{screen_context}

严格只输出 JSON 对象，不要输出 Markdown：
{{
  "matched": boolean,
  "page_state": string,
  "confidence": number,
  "reason": string
}}"""


def _number(value: Any, default: float = 0) -> float:
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _boolean(value: Any, default: bool = False) -> bool:
    if value in (None, ""):
        return default
    if isinstance(value, str):
        return value.strip().lower() in {"1", "true", "yes", "y", "on"}
    return bool(value)


def _percent(value: Any) -> float | None:
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if 0 <= number <= 1:
        number *= 100
    return max(0.0, min(100.0, number))


def _runtime_screenshot_path() -> Path:
    """Build the path for a workflow runtime screenshot.

    Keeping these screenshots on disk helps with failure investigation and
    with tracing task results. The folder is resolved through
    ``settings_service`` and created when missing; the file name combines a
    millisecond timestamp with the first 8 hex characters of a random UUID.
    """
    runtime_dir = settings_service.resolve_data_path("automation_runtime_path", "automation/runtime")
    runtime_dir.mkdir(parents=True, exist_ok=True)
    stamp_ms = int(time.time() * 1000)
    suffix = uuid.uuid4().hex[:8]
    return runtime_dir / f"vision_locate_{stamp_ms}_{suffix}.png"


def _capture_screen(save_screenshot: bool) -> dict[str, Any]:
    """Take a screenshot with base64 data and tag it as PNG.

    When *save_screenshot* is true a fresh runtime path is generated and
    handed to ``windows_automation.take_screenshot``; otherwise ``None`` is
    passed as the save path. The returned dict gets a ``mime_type`` entry of
    ``image/png`` added before it is handed back.
    """
    target = str(_runtime_screenshot_path()) if save_screenshot else None
    shot = windows_automation.take_screenshot(target, include_base64=True)
    shot["mime_type"] = "image/png"
    return shot


def _parse_ai_json(content: str) -> dict[str, Any]:
    """Decode the model's reply into a JSON object.

    The reply is first passed through ``ai_service.extract_json_text`` and
    then parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: The extracted text is not valid JSON.
        ValueError: The parsed value is valid JSON but not an object.
    """
    json_text = ai_service.extract_json_text(content)
    payload = json.loads(json_text)
    if isinstance(payload, dict):
        return payload
    raise ValueError("AI locate output must be a JSON object")


def _vision_json(context: WorkflowContext, prompt: str, screenshot: dict[str, Any], temperature: float) -> tuple[dict[str, Any], dict[str, Any]]:
    """Send *prompt* plus the screenshot to the vision model and parse the reply.

    Returns:
        A ``(parsed, raw)`` pair: the JSON object decoded from the reply's
        ``content`` field, and the untouched result of
        ``ai_service.chat_with_images``.
    """
    provider_id = int(context.provider_id)
    model_id = int(context.model_id)
    image = {"base64": screenshot["image_base64"], "mime_type": screenshot["mime_type"]}
    response = ai_service.chat_with_images(provider_id, model_id, prompt, [image], temperature)
    decoded = _parse_ai_json(response["content"])
    return decoded, response


def _locate_target(
    context: WorkflowContext,
    target_description: str,
    screen_context: str,
    randomize: bool,
    save_screenshot: bool,
    temperature: float,
) -> dict[str, Any]:
    """Screenshot the screen and ask the vision model where the target is.

    Args:
        context: Workflow context; supplies the provider/model ids and the
            ``runtime`` dict that receives ``current_screenshot_path``.
        target_description: What the model should look for.
        screen_context: Extra description of the page or operation.
        randomize: When true, the prompt asks the model to pick one of the
            visible candidates at random (a random seed is embedded in the
            prompt); otherwise it asks for the best match.
        save_screenshot: Whether the screenshot is written to disk.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        A dict that always contains ``located``, ``found``,
        ``screenshot_path``, ``width``, ``height``, ``ai_result`` and
        ``ai_raw_content``. On a miss it also carries
        ``next_port="not_found"``; on a hit it carries the percentage and
        pixel coordinates plus ``confidence``, ``target_label`` and
        ``reason`` from the model.

    Raises:
        HTTPException: 502 when the model reply is not a valid JSON object.
    """
    screenshot = _capture_screen(save_screenshot)
    if screenshot.get("path"):
        context.runtime["current_screenshot_path"] = screenshot["path"]

    if randomize:
        selection_rule = f"请结合随机种子 {random.randint(1, 1_000_000)}，从可见候选中随机挑选一个"
    else:
        selection_rule = "请选择最符合目标描述、最容易点击的一个"
    prompt = LOCATE_TARGET_PROMPT.format(
        target_description=target_description,
        screen_context=screen_context,
        selection_rule=selection_rule,
    )
    try:
        parsed, ai_result = _vision_json(context, prompt, screenshot, temperature)
    except (json.JSONDecodeError, ValueError) as exc:
        raise HTTPException(status_code=502, detail=f"AI locate output is not valid JSON: {exc}") from exc

    # Use the shared flag parser rather than bool(): models sometimes return
    # the string "false", which bool() would treat as a positive hit.
    found = _boolean(parsed.get("found"))
    x_percent = _percent(parsed.get("x_percent"))
    y_percent = _percent(parsed.get("y_percent"))
    base = {
        "screenshot_path": screenshot.get("path"),
        "width": screenshot.get("width"),
        "height": screenshot.get("height"),
        "ai_result": parsed,
        "ai_raw_content": ai_result["content"],
    }
    if not found or x_percent is None or y_percent is None:
        return {"located": False, "found": False, "next_port": "not_found", **base}

    # Convert percentages to pixel coordinates, clamped inside the image.
    width = int(screenshot["width"])
    height = int(screenshot["height"])
    x = max(0, min(width - 1, round(width * x_percent / 100)))
    y = max(0, min(height - 1, round(height * y_percent / 100)))
    return {
        "located": True,
        "found": True,
        "x_percent": x_percent,
        "y_percent": y_percent,
        "x": x,
        "y": y,
        "confidence": parsed.get("confidence"),
        "target_label": parsed.get("target_label"),
        "reason": parsed.get("reason"),
        **base,
    }


def locate_element_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]:
    """Workflow handler for ``vision.locate_element``.

    Each option is read from *inputs* first and falls back to the node's
    ``params``. The located coordinates are returned without clicking.

    Raises:
        HTTPException: 400 when no AI provider/model is configured or the
            target description is empty; 404 when the target is not located
            and ``fail_if_not_found`` is enabled.
    """
    params = node.get("params", {})

    def option(key: str) -> Any:
        # Runtime input wins; the static node parameter is the fallback.
        return inputs.get(key, params.get(key))

    if not (context.provider_id and context.model_id):
        raise HTTPException(status_code=400, detail="AI 视觉定位节点需要配置默认 AI 服务商和模型")

    target_description = str(option("target_description") or "").strip()
    if target_description == "":
        raise HTTPException(status_code=400, detail="target_description is required")

    screen_context = str(option("screen_context") or "当前屏幕").strip()
    randomize = _boolean(option("randomize"), False)
    save_screenshot = _boolean(option("save_screenshot"), True)
    fail_if_not_found = _boolean(option("fail_if_not_found"), True)
    temperature = _number(option("temperature"), context.temperature)

    outcome = _locate_target(
        context,
        target_description=target_description,
        screen_context=screen_context,
        randomize=randomize,
        save_screenshot=save_screenshot,
        temperature=temperature,
    )
    if fail_if_not_found and not outcome.get("located"):
        details = outcome.get("ai_result")
        if not isinstance(details, dict):
            details = {}
        raise HTTPException(status_code=404, detail=details.get("reason") or "AI 未定位到目标元素")
    return outcome


def verify_page_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]:
    """Workflow handler for ``vision.verify_page``.

    Takes a screenshot, asks the vision model whether the screen matches
    ``expected_state`` and selects the ``matched`` / ``not_matched`` port.
    Each option is read from *inputs* first and falls back to the node's
    ``params``.

    Returns:
        A dict with ``matched``, ``next_port``, the model's ``page_state``,
        ``confidence`` and ``reason``, the screenshot path and size, plus the
        parsed and raw model output.

    Raises:
        HTTPException: 400 when no AI provider/model is configured or
            ``expected_state`` is empty; 502 when the model reply is not a
            valid JSON object.
    """
    params = node.get("params", {})
    if not context.provider_id or not context.model_id:
        raise HTTPException(status_code=400, detail="AI 页面校验节点需要配置默认 AI 服务商和模型")
    expected_state = str(inputs.get("expected_state", params.get("expected_state")) or "").strip()
    if not expected_state:
        raise HTTPException(status_code=400, detail="expected_state is required")
    screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前屏幕").strip()
    save_screenshot = _boolean(inputs.get("save_screenshot", params.get("save_screenshot")), True)
    temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature)
    screenshot = _capture_screen(save_screenshot)
    if screenshot.get("path"):
        context.runtime["current_screenshot_path"] = screenshot["path"]
    prompt = VERIFY_PAGE_PROMPT.format(expected_state=expected_state, screen_context=screen_context)
    try:
        parsed, ai_result = _vision_json(context, prompt, screenshot, temperature)
    except (json.JSONDecodeError, ValueError) as exc:
        raise HTTPException(status_code=502, detail=f"AI verify output is not valid JSON: {exc}") from exc
    # Use the shared flag parser rather than bool(): a reply of "false" as a
    # string would otherwise be routed to the "matched" port.
    matched = _boolean(parsed.get("matched"))
    return {
        "matched": matched,
        "next_port": "matched" if matched else "not_matched",
        "page_state": parsed.get("page_state"),
        "confidence": parsed.get("confidence"),
        "reason": parsed.get("reason"),
        "screenshot_path": screenshot.get("path"),
        "width": screenshot.get("width"),
        "height": screenshot.get("height"),
        "ai_result": parsed,
        "ai_raw_content": ai_result["content"],
    }


def click_target_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]:
    """Workflow handler for ``vision.click_target``.

    Locates the described target on a fresh screenshot and, when found,
    clicks it through ``windows_automation.mouse_action``. Each option is
    read from *inputs* first and falls back to the node's ``params``.
    ``clicks`` is clamped to the range 1-20.

    Returns:
        On a hit, the locate result extended with ``clicked``, ``click``
        (the mouse action result), ``button`` and ``clicks``. On a miss with
        ``fail_if_not_found`` disabled, the locate result unchanged.

    Raises:
        HTTPException: 400 when no AI provider/model is configured or the
            target description is empty; 404 when the target is not located
            and ``fail_if_not_found`` is enabled; 502 when the model reply is
            not a valid JSON object.
    """
    params = node.get("params", {})
    # Same precondition as the other vision nodes: without it the locate call
    # would fail on int(None) instead of reporting a clear 400.
    if not context.provider_id or not context.model_id:
        raise HTTPException(status_code=400, detail="AI 定位并点击节点需要配置默认 AI 服务商和模型")
    target_description = str(inputs.get("target_description", params.get("target_description")) or "").strip()
    if not target_description:
        raise HTTPException(status_code=400, detail="target_description is required")
    screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前屏幕").strip()
    randomize = _boolean(inputs.get("randomize", params.get("randomize")), False)
    save_screenshot = _boolean(inputs.get("save_screenshot", params.get("save_screenshot")), True)
    fail_if_not_found = _boolean(inputs.get("fail_if_not_found", params.get("fail_if_not_found")), True)
    temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature)
    button = str(inputs.get("button", params.get("button")) or "left")
    clicks = int(max(1, min(_number(inputs.get("clicks", params.get("clicks")), 1), 20)))
    result = _locate_target(context, target_description, screen_context, randomize, save_screenshot, temperature)
    if not result.get("located"):
        if fail_if_not_found:
            ai_result = result.get("ai_result") if isinstance(result.get("ai_result"), dict) else {}
            raise HTTPException(status_code=404, detail=ai_result.get("reason") or "AI 未定位到可点击目标")
        return result
    clicked = windows_automation.mouse_action("click", x=int(result["x"]), y=int(result["y"]), button=button, clicks=clicks)
    return {**result, "clicked": True, "click": clicked, "button": button, "clicks": clicks}


def close_popups_node(node: dict[str, Any], inputs: dict[str, Any], context: WorkflowContext) -> dict[str, Any]:
    """Workflow handler for ``vision.close_popups``.

    Repeatedly asks the vision model for a popup dismiss button and clicks
    it, up to ``attempts`` times (clamped to 1-5). The loop stops early as
    soon as no target is located. A screenshot is saved on every attempt and
    the code pauses 0.8 s after each click before looking again.

    Returns:
        A dict with ``closed_count``, ``items`` (one locate result plus its
        ``click`` result per dismissed popup) and ``next_port="success"``.

    Raises:
        HTTPException: 400 when no AI provider/model is configured; 502 when
            the model reply is not a valid JSON object.
    """
    params = node.get("params", {})
    # Same precondition as the other vision nodes: without it the locate call
    # would fail on int(None) instead of reporting a clear 400.
    if not context.provider_id or not context.model_id:
        raise HTTPException(status_code=400, detail="AI 关闭弹窗节点需要配置默认 AI 服务商和模型")
    default_description = "当前页面可见的弹窗关闭按钮、跳过按钮、稍后再说按钮、我知道了按钮或拒绝按钮"
    # Strip like the sibling nodes do, and fall back to the default when the
    # configured description is missing or whitespace-only.
    target_description = (
        str(inputs.get("target_description", params.get("target_description")) or "").strip()
        or default_description
    )
    screen_context = str(inputs.get("screen_context", params.get("screen_context")) or "当前浏览器页面").strip()
    attempts = int(max(1, min(_number(inputs.get("attempts", params.get("attempts")), 2), 5)))
    temperature = _number(inputs.get("temperature", params.get("temperature")), context.temperature)
    closed: list[dict[str, Any]] = []
    for _ in range(attempts):
        result = _locate_target(context, target_description, screen_context, False, True, temperature)
        if not result.get("located"):
            break
        clicked = windows_automation.mouse_action("click", x=int(result["x"]), y=int(result["y"]))
        closed.append({**result, "click": clicked})
        time.sleep(0.8)
    return {"closed_count": len(closed), "items": closed, "next_port": "success"}


# Register the "vision.locate_element" node and bind it to locate_element_node.
# "params" and "inputs" list the same option names; the handler reads each
# option from the runtime inputs first and falls back to the node params.
# NOTE(review): the handler also returns keys not declared under "outputs"
# (e.g. found, reason, width, height, ai_raw_content) - confirm whether the
# registry needs them declared.
register_node(
    {
        "type": "vision.locate_element",
        "category": "vision",
        "label": "AI 视觉定位元素",
        "params": {
            "target_description": field_def("text", "目标描述", required=True),
            "screen_context": field_def("text", "页面上下文"),
            "randomize": field_def("boolean", "多候选随机选择", False),
            "save_screenshot": field_def("boolean", "保存截图", True),
            "fail_if_not_found": field_def("boolean", "找不到时报错", True),
            "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2),
        },
        "inputs": {
            "target_description": field_def("string", "目标描述"),
            "screen_context": field_def("string", "页面上下文"),
            "randomize": field_def("boolean", "多候选随机选择"),
            "save_screenshot": field_def("boolean", "保存截图"),
            "fail_if_not_found": field_def("boolean", "找不到时报错"),
            "temperature": field_def("number", "定位温度"),
        },
        "outputs": {
            "located": {"type": "boolean", "label": "是否定位成功"},
            "x_percent": {"type": "number", "label": "X 百分比"},
            "y_percent": {"type": "number", "label": "Y 百分比"},
            "x": {"type": "number", "label": "X 坐标"},
            "y": {"type": "number", "label": "Y 坐标"},
            "confidence": {"type": "number", "label": "置信度"},
            "target_label": {"type": "string", "label": "目标标签"},
            "screenshot_path": {"type": "string", "label": "截图路径"},
            "ai_result": {"type": "object", "label": "AI 结果"},
        },
        "control_ports": control_ports(["success", "not_found", "failure"]),
    },
    locate_element_node,
)

# Register the "vision.verify_page" node and bind it to verify_page_node.
# The handler sets next_port to "matched" or "not_matched", which correspond
# to the first two control ports declared below.
register_node(
    {
        "type": "vision.verify_page",
        "category": "vision",
        "label": "AI 校验页面状态",
        "description": "截取当前屏幕，让多模态 AI 判断页面是否符合预期，并按 matched/not_matched 分支继续。",
        "params": {
            "expected_state": field_def("text", "预期状态", required=True),
            "screen_context": field_def("text", "页面上下文"),
            "save_screenshot": field_def("boolean", "保存截图", True),
            "temperature": field_def("number", "校验温度", 0.1, minimum=0, maximum=2),
        },
        "inputs": {
            "expected_state": field_def("string", "预期状态"),
            "screen_context": field_def("string", "页面上下文"),
            "save_screenshot": field_def("boolean", "保存截图"),
            "temperature": field_def("number", "校验温度"),
        },
        "outputs": {
            "matched": {"type": "boolean", "label": "是否匹配"},
            "page_state": {"type": "string", "label": "页面状态"},
            "confidence": {"type": "number", "label": "置信度"},
            "reason": {"type": "string", "label": "原因"},
            "screenshot_path": {"type": "string", "label": "截图路径"},
            "ai_result": {"type": "object", "label": "AI 结果"},
        },
        "control_ports": control_ports(["matched", "not_matched", "failure"]),
    },
    verify_page_node,
)

# Register the "vision.click_target" node and bind it to click_target_node.
# The "clicks" bounds declared here (1-20) mirror the clamp applied inside
# the handler.
register_node(
    {
        "type": "vision.click_target",
        "category": "vision",
        "label": "AI 定位并点击",
        "description": "截屏定位目标元素，换算坐标后立即点击，适合封装常见视觉点击步骤。",
        "params": {
            "target_description": field_def("text", "目标描述", required=True),
            "screen_context": field_def("text", "页面上下文"),
            "randomize": field_def("boolean", "多候选随机选择", False),
            "button": field_def("select", "按键", "left", options=["left", "middle", "right"]),
            "clicks": field_def("number", "点击次数", 1, minimum=1, maximum=20),
            "save_screenshot": field_def("boolean", "保存截图", True),
            "fail_if_not_found": field_def("boolean", "找不到时报错", True),
            "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2),
        },
        "inputs": {
            "target_description": field_def("string", "目标描述"),
            "screen_context": field_def("string", "页面上下文"),
            "randomize": field_def("boolean", "多候选随机选择"),
            "button": field_def("string", "按键"),
            "clicks": field_def("number", "点击次数"),
            "save_screenshot": field_def("boolean", "保存截图"),
            "fail_if_not_found": field_def("boolean", "找不到时报错"),
            "temperature": field_def("number", "定位温度"),
        },
        "outputs": {
            "located": {"type": "boolean", "label": "是否定位成功"},
            "clicked": {"type": "boolean", "label": "是否已点击"},
            "x": {"type": "number", "label": "X 坐标"},
            "y": {"type": "number", "label": "Y 坐标"},
            "confidence": {"type": "number", "label": "置信度"},
            "target_label": {"type": "string", "label": "目标标签"},
            "click": {"type": "object", "label": "点击结果"},
            "ai_result": {"type": "object", "label": "AI 结果"},
        },
        "control_ports": control_ports(["success", "not_found", "failure"]),
    },
    click_target_node,
)

# Register the "vision.close_popups" node and bind it to close_popups_node.
# The "attempts" bounds declared here (1-5) mirror the clamp applied inside
# the handler, and the default target description matches the handler's own
# fallback text.
# NOTE(review): control_ports() is called without arguments here; which ports
# that yields depends on the registry helper - confirm it includes "success",
# the only next_port the handler ever returns.
register_node(
    {
        "type": "vision.close_popups",
        "category": "vision",
        "label": "AI 关闭弹窗",
        "description": "尝试识别并点击当前页面上的关闭、跳过、稍后再说等弹窗按钮。",
        "params": {
            "target_description": field_def("text", "关闭目标", "当前页面可见的弹窗关闭按钮、跳过按钮、稍后再说按钮、我知道了按钮或拒绝按钮"),
            "screen_context": field_def("text", "页面上下文", "当前浏览器页面"),
            "attempts": field_def("number", "最多尝试", 2, minimum=1, maximum=5),
            "temperature": field_def("number", "定位温度", 0.1, minimum=0, maximum=2),
        },
        "inputs": {
            "target_description": field_def("string", "关闭目标"),
            "screen_context": field_def("string", "页面上下文"),
            "attempts": field_def("number", "最多尝试"),
            "temperature": field_def("number", "定位温度"),
        },
        "outputs": {
            "closed_count": {"type": "number", "label": "关闭数量"},
            "items": {"type": "array", "label": "关闭记录"},
        },
        "control_ports": control_ports(),
    },
    close_popups_node,
)