Merge pull request microsoft#92 from microsoft/vyokky/dev

Vyokky/dev Improve HostAgent and pre-release
Optimose · Jun 22, 2024 · c69ffbd · c69ffbd
2 parents bf742af + 9fab5aa
commit c69ffbd
Show file tree

Hide file tree

Showing 31 changed files with 1,453 additions and 908 deletions.
diff --git a/README.md b/README.md
@@ -54,7 +54,7 @@ UFO sightings have garnered attention from various media outlets, including:
 - [Microsoft's UFO abducts traditional user interfaces for a smarter Windows experience](https://the-decoder.com/microsofts-ufo-abducts-traditional-user-interfaces-for-a-smarter-windows-experience/)
 - [🚀 UFO & GPT-4-V: Sit back and relax, mientras GPT lo hace todo🌌](https://www.linkedin.com/posts/gutierrezfrancois_ai-ufo-microsoft-activity-7176819900399652865-pLoo?utm_source=share&utm_medium=member_desktop)
 - [The AI PC - The Future of Computers? - Microsoft UFO](https://www.youtube.com/watch?v=1k4LcffCq3E)
-- [下一代Windows系统曝光：基于GPT-4V，Agent跨应用调度，代号UFO](https://www.qbitai.com/2024/02/121048.html)
+- [下一代Windows系统曝光：基于GPT-4V，Agent跨应用调度，代号UFO](https://baijiahao.baidu.com/s?id=1790938358152188625&wfr=spider&for=pc)
 - [下一代智能版 Windows 要来了？微软推出首个 Windows Agent，命名为 UFO！](https://blog.csdn.net/csdnnews/article/details/136161570)
 - [Microsoft発のオープンソース版「UFO」登場！　Windowsを自動操縦するAIエージェントを試す](https://internet.watch.impress.co.jp/docs/column/shimizu/1570581.html)
 - ...

diff --git a/ufo/agents/agent/app_agent.py b/ufo/agents/agent/app_agent.py
@@ -89,8 +89,11 @@ def message_constructor(
         dynamic_knowledge: str,
         image_list: List,
         control_info: str,
+        prev_subtask: List[Dict[str, str]],
         plan: List[str],
         request: str,
+        subtask: str,
+        host_message: List[str],
         include_last_screenshot: bool,
     ) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
         """
@@ -101,7 +104,9 @@ def message_constructor(
         :param image_list: The list of screenshot images.
         :param control_info: The control information.
         :param plan: The plan list.
-        :param request: The request.
+        :param request: The overall user request.
+        :param subtask: The subtask for the current AppAgent to process.
+        :param host_message: The message from the HostAgent.
         :param include_last_screenshot: The flag indicating whether to include the last screenshot.
         :return: The prompt message.
         """
@@ -110,12 +115,16 @@ def message_constructor(
         )
 
         appagent_prompt_user_message = self.prompter.user_content_construction(
-            image_list,
-            control_info,
-            plan,
-            request,
-            dynamic_knowledge,
-            include_last_screenshot,
+            image_list=image_list,
+            control_item=control_info,
+            prev_subtask=prev_subtask,
+            prev_plan=plan,
+            user_request=request,
+            subtask=subtask,
+            current_application=self._process_name,
+            host_message=host_message,
+            retrieved_docs=dynamic_knowledge,
+            include_last_screenshot=include_last_screenshot,
         )
 
         if not self.blackboard.is_empty():

diff --git a/ufo/agents/agent/follower_agent.py b/ufo/agents/agent/follower_agent.py
@@ -95,8 +95,11 @@ def message_constructor(
         dynamic_knowledge: str,
         image_list: List[str],
         control_info: str,
+        prev_subtask: List[str],
         plan: List[str],
         request: str,
+        subtask: str,
+        host_message: List[str],
         current_state: Dict[str, str],
         state_diff: Dict[str, str],
         include_last_screenshot: bool,
@@ -108,8 +111,11 @@ def message_constructor(
         :param dynamic_knowledge: The dynamic knowledge retrieved from the self-demonstration and human demonstration.
         :param image_list: The list of screenshot images.
         :param control_info: The control information.
+        :param prev_subtask: The previous subtask.
         :param plan: The plan.
         :param request: The request.
+        :param subtask: The subtask.
+        :param host_message: The host message.
         :param current_state: The current state of the app.
         :param state_diff: The state difference between the current state and the previous state.
         :param include_last_screenshot: The flag indicating whether the last screenshot should be included.
@@ -119,14 +125,18 @@ def message_constructor(
             dynamic_examples, dynamic_tips
         )
         followagent_prompt_user_message = self.prompter.user_content_construction(
-            image_list,
-            control_info,
-            plan,
-            request,
-            dynamic_knowledge,
-            current_state,
-            state_diff,
-            include_last_screenshot,
+            image_list=image_list,
+            control_item=control_info,
+            prev_subtask=prev_subtask,
+            prev_plan=plan,
+            user_request=request,
+            subtask=subtask,
+            current_application=self._process_name,
+            host_message=host_message,
+            retrieved_docs=dynamic_knowledge,
+            current_state=current_state,
+            state_diff=state_diff,
+            include_last_screenshot=include_last_screenshot,
         )
 
         followagent_prompt_message = self.prompter.prompt_construction(

diff --git a/ufo/agents/agent/host_agent.py b/ufo/agents/agent/host_agent.py
@@ -146,6 +146,14 @@ def create_subagent(
 
         return app_agent
 
+    @property
+    def sub_agent_amount(self) -> int:
+        """
+        Get the number of sub agents.
+        :return: The number of sub agents.
+        """
+        return len(self.appagent_dict)
+
     def get_active_appagent(self) -> AppAgent:
         """
         Get the active app agent.
@@ -165,19 +173,25 @@ def message_constructor(
         image_list: List[str],
         os_info: str,
         plan: List[str],
+        prev_subtask: List[Dict[str, str]],
         request: str,
     ) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
         """
         Construct the message.
         :param image_list: The list of screenshot images.
         :param os_info: The OS information.
+        :param prev_subtask: The previous subtask.
         :param plan: The plan.
         :param request: The request.
         :return: The message.
         """
         hostagent_prompt_system_message = self.prompter.system_prompt_construction()
         hostagent_prompt_user_message = self.prompter.user_content_construction(
-            image_list, os_info, plan, request
+            image_list=image_list,
+            control_item=os_info,
+            prev_subtask=prev_subtask,
+            prev_plan=plan,
+            user_request=request,
         )
 
         if not self.blackboard.is_empty():
@@ -238,7 +252,17 @@ def print_response(self, response_dict: Dict) -> None:
             application = "[The required application needs to be opened.]"
         observation = response_dict.get("Observation")
         thought = response_dict.get("Thought")
-        plan = response_dict.get("Plan")
+        subtask = response_dict.get("CurrentSubtask")
+
+        # Convert the message from a list to a string.
+        message = list(response_dict.get("Message"))
+        message = "\n".join(message)
+
+        # Concatenate the subtask with the plan and convert the plan from a list to a string.
+        plan = list(response_dict.get("Plan"))
+        plan = [subtask] + plan
+        plan = "\n".join([f"({i+1}) " + str(item) for i, item in enumerate(plan)])
+
         status = response_dict.get("Status")
         comment = response_dict.get("Comment")
 
@@ -247,16 +271,20 @@ def print_response(self, response_dict: Dict) -> None:
         )
         utils.print_with_color("Thoughts💡: {thought}".format(thought=thought), "green")
         utils.print_with_color(
-            "Selected application📲: {application}".format(application=application),
-            "yellow",
+            "Plans📚: {plan}".format(plan=plan),
+            "cyan",
         )
-        utils.print_with_color("Status📊: {status}".format(status=status), "blue")
         utils.print_with_color(
-            "Next Plan📚: {plan}".format(
-                plan= "\n".join(plan) if isinstance(plan, list) else str(plan)
+            "Next Selected application📲: {application}".format(
+                application=application
             ),
-            "cyan",
+            "yellow",
         )
+        utils.print_with_color(
+            "Messages to AppAgent📩: {message}".format(message=message), "cyan"
+        )
+        utils.print_with_color("Status📊: {status}".format(status=status), "blue")
+
         utils.print_with_color("Comment💬: {comment}".format(comment=comment), "green")
 
     @property

diff --git a/ufo/agents/memory/blackboard.py b/ufo/agents/memory/blackboard.py
@@ -310,8 +310,6 @@ def read_json_file(file_path: str, last_k=-1) -> Dict[str, str]:
                     data_list.append(data)
                 except json.JSONDecodeError:
                     print(f"Warning: Unable to parse line as JSON: {line}")
-        else:
-            print(f"File does not exist: {file_path}")
 
         return data_list
 

diff --git a/ufo/agents/processors/app_agent_processor.py b/ufo/agents/processors/app_agent_processor.py
@@ -10,7 +10,6 @@
 from pywinauto.controls.uiawrapper import UIAWrapper
 
 from ufo import utils
-from ufo.agents.memory.memory import MemoryItem
 from ufo.agents.processors.basic import BaseProcessor
 from ufo.automator.ui_control.control_filter import ControlFilterFactory
 from ufo.config.config import Config
@@ -47,7 +46,6 @@ def __init__(self, agent: "AppAgent", context: Context) -> None:
         self._image_url = []
         self._plan = []
         self.action = ""
-        self.prev_plan = []
         self.control_filter_factory = ControlFilterFactory()
         self.filtered_annotation_dict = None
 
@@ -88,9 +86,10 @@ def print_step_info(self) -> None:
         Print the step information.
         """
         utils.print_with_color(
-            "Round {round_num}, Step {step}: Taking an action on application {application}.".format(
+            "Round {round_num}, Step {step}, AppAgent: Completing the subtask [{subtask}] on application [{application}].".format(
                 round_num=self.round_num + 1,
                 step=self.round_step + 1,
+                subtask=self.subtask,
                 application=self.application_process_name,
             ),
             "magenta",
@@ -110,6 +109,14 @@ def capture_screenshot(self) -> None:
             self.log_path + f"action_step{self.session_step}_concat.png"
         )
 
+        self._memory_data.set_values_from_dict(
+            {
+                "CleanScreenshot": screenshot_save_path,
+                "AnnotatedScreenshot": annotated_screenshot_save_path,
+                "ConcatScreenshot": concat_screenshot_save_path,
+            }
+        )
+
         # Get the control elements in the application window if the control items are not provided for reannotation.
         if type(self.control_reannotate) == list and len(self.control_reannotate) > 0:
             control_list = self.control_reannotate
@@ -125,8 +132,6 @@ def capture_screenshot(self) -> None:
             self.application_window, control_list, annotation_type="number"
         )
 
-        self.prev_plan = self.get_prev_plan()
-
         # Attempt to filter out irrelevant control items based on the previous plan.
         self.filtered_annotation_dict = self.get_filtered_annotation_dict(
             self._annotation_dict
@@ -220,14 +225,17 @@ def get_prompt_message(self) -> None:
 
         # Construct the prompt message for the AppAgent.
         self._prompt_message = self.app_agent.message_constructor(
-            examples,
-            tips,
-            external_knowledge_prompt,
-            self._image_url,
-            self.filtered_control_info,
-            self.prev_plan,
-            self.request,
-            configs["INCLUDE_LAST_SCREENSHOT"],
+            dynamic_examples=examples,
+            dynamic_tips=tips,
+            dynamic_knowledge=external_knowledge_prompt,
+            image_list=self._image_url,
+            control_info=self.filtered_control_info,
+            prev_subtask=self.previous_subtasks,
+            plan=self.prev_plan,
+            request=self.request,
+            subtask=self.subtask,
+            host_message=self.host_message,
+            include_last_screenshot=configs["INCLUDE_LAST_SCREENSHOT"],
         )
 
         # Log the prompt message. Only save them in debug mode.
@@ -331,6 +339,11 @@ def capture_control_screenshot(self, control_selected: UIAWrapper) -> None:
         control_screenshot_save_path = (
             self.log_path + f"action_step{self.session_step}_selected_controls.png"
         )
+
+        self._memory_data.set_values_from_dict(
+            {"SelectedControlScreenshot": control_screenshot_save_path}
+        )
+
         self.photographer.capture_app_window_screenshot_with_rectangle(
             self.application_window,
             sub_control_list=[control_selected],
@@ -354,8 +367,6 @@ def update_memory(self) -> None:
         """
         Update the memory of the Agent.
         """
-        # Create a memory item for the app agent
-        app_agent_step_memory = MemoryItem()
 
         app_root = self.control_inspector.get_application_root_name(
             self.application_window
@@ -367,6 +378,8 @@ def update_memory(self) -> None:
             "RoundStep": self.round_step,
             "AgentStep": self.app_agent.step,
             "Round": self.round_num,
+            "Subtask": self.subtask,
+            "SubtaskIndex": self.round_subtask_amount,
             "Action": self.action,
             "ActionType": self.app_agent.Puppeteer.get_command_types(self._operation),
             "Request": self.request,
@@ -376,18 +389,21 @@ def update_memory(self) -> None:
             "Cost": self._cost,
             "Results": self._results,
         }
-        app_agent_step_memory.set_values_from_dict(self._response_json)
-        app_agent_step_memory.set_values_from_dict(additional_memory)
+        self._memory_data.set_values_from_dict(self._response_json)
+        self._memory_data.set_values_from_dict(additional_memory)
+
+        if self.status.upper() == self._agent_status_manager.CONFIRM.value:
+            self._memory_data.set_values_from_dict({"UserConfirm": "Yes"})
 
-        self.app_agent.add_memory(app_agent_step_memory)
+        self.app_agent.add_memory(self._memory_data)
 
         # Log the memory item.
-        self.log(app_agent_step_memory.to_dict())
+        self.context.add_to_structural_logs(self._memory_data.to_dict())
+        self.log(self._memory_data.to_dict())
 
         # Only memorize the keys in the HISTORY_KEYS list to feed into the prompt message in the future steps.
         memorized_action = {
-            key: app_agent_step_memory.to_dict().get(key)
-            for key in configs["HISTORY_KEYS"]
+            key: self._memory_data.to_dict().get(key) for key in configs["HISTORY_KEYS"]
         }
 
         # Save the screenshot to the blackboard if the SaveScreenshot flag is set to True by the AppAgent.
@@ -421,20 +437,6 @@ def _save_to_xml(self) -> None:
         )
         self.app_agent.Puppeteer.save_to_xml(xml_save_path)
 
-    def get_prev_plan(self) -> str:
-        """
-        Retrieves the previous plan from the agent's memory.
-        :return: The previous plan, or an empty string if the agent's memory is empty.
-        """
-        agent_memory = self.app_agent.memory
-
-        if agent_memory.length > 0:
-            prev_plan = agent_memory.get_latest_item().to_dict()["Plan"]
-        else:
-            prev_plan = []
-
-        return prev_plan
-
     def demonstration_prompt_helper(self) -> Tuple[List[str], List[str]]:
         """
         Get the examples and tips for the AppAgent using the demonstration retriever.