Skip to content

Commit

Permalink
Merge pull request microsoft#92 from microsoft/vyokky/dev
Browse files Browse the repository at this point in the history
Vyokky/dev Improve HostAgent and pre-release
  • Loading branch information
vyokky authored Jun 22, 2024
2 parents bf742af + 9fab5aa commit c69ffbd
Show file tree
Hide file tree
Showing 31 changed files with 1,453 additions and 908 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ UFO sightings have garnered attention from various media outlets, including:
- [Microsoft's UFO abducts traditional user interfaces for a smarter Windows experience](https://the-decoder.com/microsofts-ufo-abducts-traditional-user-interfaces-for-a-smarter-windows-experience/)
- [🚀 UFO & GPT-4-V: Sit back and relax, mientras GPT lo hace todo🌌](https://www.linkedin.com/posts/gutierrezfrancois_ai-ufo-microsoft-activity-7176819900399652865-pLoo?utm_source=share&utm_medium=member_desktop)
- [The AI PC - The Future of Computers? - Microsoft UFO](https://www.youtube.com/watch?v=1k4LcffCq3E)
- [下一代Windows系统曝光:基于GPT-4V,Agent跨应用调度,代号UFO](https://www.qbitai.com/2024/02/121048.html)
- [下一代Windows系统曝光:基于GPT-4V,Agent跨应用调度,代号UFO](https://baijiahao.baidu.com/s?id=1790938358152188625&wfr=spider&for=pc)
- [下一代智能版 Windows 要来了?微软推出首个 Windows Agent,命名为 UFO!](https://blog.csdn.net/csdnnews/article/details/136161570)
- [Microsoft発のオープンソース版「UFO」登場! Windowsを自動操縦するAIエージェントを試す](https://internet.watch.impress.co.jp/docs/column/shimizu/1570581.html)
- ...
Expand Down
23 changes: 16 additions & 7 deletions ufo/agents/agent/app_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,11 @@ def message_constructor(
dynamic_knowledge: str,
image_list: List,
control_info: str,
prev_subtask: List[Dict[str, str]],
plan: List[str],
request: str,
subtask: str,
host_message: List[str],
include_last_screenshot: bool,
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
"""
Expand All @@ -101,7 +104,9 @@ def message_constructor(
:param image_list: The list of screenshot images.
:param control_info: The control information.
:param plan: The plan list.
:param request: The request.
:param request: The overall user request.
:param subtask: The subtask for the current AppAgent to process.
:param host_message: The message from the HostAgent.
:param include_last_screenshot: The flag indicating whether to include the last screenshot.
:return: The prompt message.
"""
Expand All @@ -110,12 +115,16 @@ def message_constructor(
)

appagent_prompt_user_message = self.prompter.user_content_construction(
image_list,
control_info,
plan,
request,
dynamic_knowledge,
include_last_screenshot,
image_list=image_list,
control_item=control_info,
prev_subtask=prev_subtask,
prev_plan=plan,
user_request=request,
subtask=subtask,
current_application=self._process_name,
host_message=host_message,
retrieved_docs=dynamic_knowledge,
include_last_screenshot=include_last_screenshot,
)

if not self.blackboard.is_empty():
Expand Down
26 changes: 18 additions & 8 deletions ufo/agents/agent/follower_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,11 @@ def message_constructor(
dynamic_knowledge: str,
image_list: List[str],
control_info: str,
prev_subtask: List[str],
plan: List[str],
request: str,
subtask: str,
host_message: List[str],
current_state: Dict[str, str],
state_diff: Dict[str, str],
include_last_screenshot: bool,
Expand All @@ -108,8 +111,11 @@ def message_constructor(
:param dynamic_knowledge: The dynamic knowledge retrieved from the self-demonstration and human demonstration.
:param image_list: The list of screenshot images.
:param control_info: The control information.
:param prev_subtask: The previous subtask.
:param plan: The plan.
:param request: The request.
:param subtask: The subtask.
:param host_message: The host message.
:param current_state: The current state of the app.
:param state_diff: The state difference between the current state and the previous state.
:param include_last_screenshot: The flag indicating whether the last screenshot should be included.
Expand All @@ -119,14 +125,18 @@ def message_constructor(
dynamic_examples, dynamic_tips
)
followagent_prompt_user_message = self.prompter.user_content_construction(
image_list,
control_info,
plan,
request,
dynamic_knowledge,
current_state,
state_diff,
include_last_screenshot,
image_list=image_list,
control_item=control_info,
prev_subtask=prev_subtask,
prev_plan=plan,
user_request=request,
subtask=subtask,
current_application=self._process_name,
host_message=host_message,
retrieved_docs=dynamic_knowledge,
current_state=current_state,
state_diff=state_diff,
include_last_screenshot=include_last_screenshot,
)

followagent_prompt_message = self.prompter.prompt_construction(
Expand Down
44 changes: 36 additions & 8 deletions ufo/agents/agent/host_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,14 @@ def create_subagent(

return app_agent

@property
def sub_agent_amount(self) -> int:
    """
    Count the sub agents held in the app agent dictionary.
    :return: The number of entries in `appagent_dict`.
    """
    registered_agents = self.appagent_dict
    return len(registered_agents)

def get_active_appagent(self) -> AppAgent:
"""
Get the active app agent.
Expand All @@ -165,19 +173,25 @@ def message_constructor(
image_list: List[str],
os_info: str,
plan: List[str],
prev_subtask: List[Dict[str, str]],
request: str,
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
"""
Construct the message.
:param image_list: The list of screenshot images.
:param os_info: The OS information.
:param prev_subtask: The previous subtask.
:param plan: The plan.
:param request: The request.
:return: The message.
"""
hostagent_prompt_system_message = self.prompter.system_prompt_construction()
hostagent_prompt_user_message = self.prompter.user_content_construction(
image_list, os_info, plan, request
image_list=image_list,
control_item=os_info,
prev_subtask=prev_subtask,
prev_plan=plan,
user_request=request,
)

if not self.blackboard.is_empty():
Expand Down Expand Up @@ -238,7 +252,17 @@ def print_response(self, response_dict: Dict) -> None:
application = "[The required application needs to be opened.]"
observation = response_dict.get("Observation")
thought = response_dict.get("Thought")
plan = response_dict.get("Plan")
subtask = response_dict.get("CurrentSubtask")

# Convert the message from a list to a string.
message = list(response_dict.get("Message"))
message = "\n".join(message)

# Concatenate the subtask with the plan and convert the plan from a list to a string.
plan = list(response_dict.get("Plan"))
plan = [subtask] + plan
plan = "\n".join([f"({i+1}) " + str(item) for i, item in enumerate(plan)])

status = response_dict.get("Status")
comment = response_dict.get("Comment")

Expand All @@ -247,16 +271,20 @@ def print_response(self, response_dict: Dict) -> None:
)
utils.print_with_color("Thoughts💡: {thought}".format(thought=thought), "green")
utils.print_with_color(
"Selected application📲: {application}".format(application=application),
"yellow",
"Plans📚: {plan}".format(plan=plan),
"cyan",
)
utils.print_with_color("Status📊: {status}".format(status=status), "blue")
utils.print_with_color(
"Next Plan📚: {plan}".format(
plan= "\n".join(plan) if isinstance(plan, list) else str(plan)
"Next Selected application📲: {application}".format(
application=application
),
"cyan",
"yellow",
)
utils.print_with_color(
"Messages to AppAgent📩: {message}".format(message=message), "cyan"
)
utils.print_with_color("Status📊: {status}".format(status=status), "blue")

utils.print_with_color("Comment💬: {comment}".format(comment=comment), "green")

@property
Expand Down
2 changes: 0 additions & 2 deletions ufo/agents/memory/blackboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,6 @@ def read_json_file(file_path: str, last_k=-1) -> Dict[str, str]:
data_list.append(data)
except json.JSONDecodeError:
print(f"Warning: Unable to parse line as JSON: {line}")
else:
print(f"File does not exist: {file_path}")

return data_list

Expand Down
72 changes: 37 additions & 35 deletions ufo/agents/processors/app_agent_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from pywinauto.controls.uiawrapper import UIAWrapper

from ufo import utils
from ufo.agents.memory.memory import MemoryItem
from ufo.agents.processors.basic import BaseProcessor
from ufo.automator.ui_control.control_filter import ControlFilterFactory
from ufo.config.config import Config
Expand Down Expand Up @@ -47,7 +46,6 @@ def __init__(self, agent: "AppAgent", context: Context) -> None:
self._image_url = []
self._plan = []
self.action = ""
self.prev_plan = []
self.control_filter_factory = ControlFilterFactory()
self.filtered_annotation_dict = None

Expand Down Expand Up @@ -88,9 +86,10 @@ def print_step_info(self) -> None:
Print the step information.
"""
utils.print_with_color(
"Round {round_num}, Step {step}: Taking an action on application {application}.".format(
"Round {round_num}, Step {step}, AppAgent: Completing the subtask [{subtask}] on application [{application}].".format(
round_num=self.round_num + 1,
step=self.round_step + 1,
subtask=self.subtask,
application=self.application_process_name,
),
"magenta",
Expand All @@ -110,6 +109,14 @@ def capture_screenshot(self) -> None:
self.log_path + f"action_step{self.session_step}_concat.png"
)

self._memory_data.set_values_from_dict(
{
"CleanScreenshot": screenshot_save_path,
"AnnotatedScreenshot": annotated_screenshot_save_path,
"ConcatScreenshot": concat_screenshot_save_path,
}
)

# Get the control elements in the application window if the control items are not provided for reannotation.
if type(self.control_reannotate) == list and len(self.control_reannotate) > 0:
control_list = self.control_reannotate
Expand All @@ -125,8 +132,6 @@ def capture_screenshot(self) -> None:
self.application_window, control_list, annotation_type="number"
)

self.prev_plan = self.get_prev_plan()

# Attempt to filter out irrelevant control items based on the previous plan.
self.filtered_annotation_dict = self.get_filtered_annotation_dict(
self._annotation_dict
Expand Down Expand Up @@ -220,14 +225,17 @@ def get_prompt_message(self) -> None:

# Construct the prompt message for the AppAgent.
self._prompt_message = self.app_agent.message_constructor(
examples,
tips,
external_knowledge_prompt,
self._image_url,
self.filtered_control_info,
self.prev_plan,
self.request,
configs["INCLUDE_LAST_SCREENSHOT"],
dynamic_examples=examples,
dynamic_tips=tips,
dynamic_knowledge=external_knowledge_prompt,
image_list=self._image_url,
control_info=self.filtered_control_info,
prev_subtask=self.previous_subtasks,
plan=self.prev_plan,
request=self.request,
subtask=self.subtask,
host_message=self.host_message,
include_last_screenshot=configs["INCLUDE_LAST_SCREENSHOT"],
)

# Log the prompt message. Only save them in debug mode.
Expand Down Expand Up @@ -331,6 +339,11 @@ def capture_control_screenshot(self, control_selected: UIAWrapper) -> None:
control_screenshot_save_path = (
self.log_path + f"action_step{self.session_step}_selected_controls.png"
)

self._memory_data.set_values_from_dict(
{"SelectedControlScreenshot": control_screenshot_save_path}
)

self.photographer.capture_app_window_screenshot_with_rectangle(
self.application_window,
sub_control_list=[control_selected],
Expand All @@ -354,8 +367,6 @@ def update_memory(self) -> None:
"""
Update the memory of the Agent.
"""
# Create a memory item for the app agent
app_agent_step_memory = MemoryItem()

app_root = self.control_inspector.get_application_root_name(
self.application_window
Expand All @@ -367,6 +378,8 @@ def update_memory(self) -> None:
"RoundStep": self.round_step,
"AgentStep": self.app_agent.step,
"Round": self.round_num,
"Subtask": self.subtask,
"SubtaskIndex": self.round_subtask_amount,
"Action": self.action,
"ActionType": self.app_agent.Puppeteer.get_command_types(self._operation),
"Request": self.request,
Expand All @@ -376,18 +389,21 @@ def update_memory(self) -> None:
"Cost": self._cost,
"Results": self._results,
}
app_agent_step_memory.set_values_from_dict(self._response_json)
app_agent_step_memory.set_values_from_dict(additional_memory)
self._memory_data.set_values_from_dict(self._response_json)
self._memory_data.set_values_from_dict(additional_memory)

if self.status.upper() == self._agent_status_manager.CONFIRM.value:
self._memory_data.set_values_from_dict({"UserConfirm": "Yes"})

self.app_agent.add_memory(app_agent_step_memory)
self.app_agent.add_memory(self._memory_data)

# Log the memory item.
self.log(app_agent_step_memory.to_dict())
self.context.add_to_structural_logs(self._memory_data.to_dict())
self.log(self._memory_data.to_dict())

# Only memorize the keys in the HISTORY_KEYS list to feed into the prompt message in the future steps.
memorized_action = {
key: app_agent_step_memory.to_dict().get(key)
for key in configs["HISTORY_KEYS"]
key: self._memory_data.to_dict().get(key) for key in configs["HISTORY_KEYS"]
}

# Save the screenshot to the blackboard if the SaveScreenshot flag is set to True by the AppAgent.
Expand Down Expand Up @@ -421,20 +437,6 @@ def _save_to_xml(self) -> None:
)
self.app_agent.Puppeteer.save_to_xml(xml_save_path)

def get_prev_plan(self) -> List[str]:
    """
    Retrieve the plan recorded in the latest item of the agent's memory.
    :return: The value stored under "Plan" in the latest memory item, or an
        empty list if the memory is empty or the item has no "Plan" entry.
    """
    agent_memory = self.app_agent.memory

    # Nothing has been memorized yet, so there is no previous plan.
    if agent_memory.length <= 0:
        return []

    # A memory item without a "Plan" entry is treated like "no previous plan"
    # instead of raising KeyError.
    return agent_memory.get_latest_item().to_dict().get("Plan", [])

def demonstration_prompt_helper(self) -> Tuple[List[str], List[str]]:
"""
Get the examples and tips for the AppAgent using the demonstration retriever.
Expand Down
Loading

0 comments on commit c69ffbd

Please sign in to comment.