first commit

2026-03-13 12:56:43 -07:00
commit 159cf9fcfe
309 changed files with 64584 additions and 0 deletions
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/init.py
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/init.py
@@ -0,0 +1,4 @@
+from ._multimodal_web_surfer import MultimodalWebSurfer
+from .playwright_controller import PlaywrightController
+
+__all__ = ["MultimodalWebSurfer", "PlaywrightController"]
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/_events.py
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/_events.py
@@ -0,0 +1,11 @@
+from dataclasses import dataclass
+from typing import Any, Dict
+
+
+@dataclass
+class WebSurferEvent:
+    """Structured record that the web surfer agent passes to its event logger.
+
+    The agent emits one of these for saved screenshots, browser resets, new
+    tabs/windows, and each tool invocation.
+    """
+
+    # Name of the agent that emitted the event.
+    source: str
+    # Human-readable description of what happened.
+    message: str
+    # URL of the page the agent was on when the event was emitted.
+    url: str
+    # Name of the tool being executed; only set for tool-invocation events.
+    action: str | None = None
+    # Decoded arguments of that tool call; only set for tool-invocation events.
+    arguments: Dict[str, Any] | None = None
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/_multimodal_web_surfer.py
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/_multimodal_web_surfer.py
@@ -0,0 +1,988 @@
+import asyncio
+import base64
+import hashlib
+import io
+import json
+import logging
+import os
+import re
+import sys
+import time
+import traceback
+import warnings
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+)
+from urllib.parse import quote_plus
+
+import aiofiles
+import PIL.Image
+from agentdhal_agentchat.agents import BaseChatAgent
+from agentdhal_agentchat.base import Response
+from agentdhal_agentchat.messages import BaseAgentEvent, BaseChatMessage, MultiModalMessage, TextMessage
+from agentdhal_agentchat.utils import content_to_str, remove_images
+from agentdhal_core import EVENT_LOGGER_NAME, CancellationToken, Component, ComponentModel, FunctionCall
+from agentdhal_core import Image as AGImage
+from agentdhal_core.models import (
+    AssistantMessage,
+    ChatCompletionClient,
+    LLMMessage,
+    ModelFamily,
+    RequestUsage,
+    SystemMessage,
+    UserMessage,
+)
+from PIL import Image
+from playwright.async_api import BrowserContext, Download, Page, Playwright, async_playwright
+from pydantic import BaseModel
+from typing_extensions import Self
+
+from ._events import WebSurferEvent
+from ._prompts import (
+    WEB_SURFER_QA_PROMPT,
+    WEB_SURFER_QA_SYSTEM_MESSAGE,
+    WEB_SURFER_TOOL_PROMPT_MM,
+    WEB_SURFER_TOOL_PROMPT_TEXT,
+)
+from ._set_of_mark import add_set_of_mark
+from ._tool_definitions import (
+    TOOL_CLICK,
+    TOOL_HISTORY_BACK,
+    TOOL_HOVER,
+    TOOL_READ_PAGE_AND_ANSWER,
+    TOOL_SCROLL_DOWN,
+    TOOL_SCROLL_UP,
+    TOOL_SLEEP,
+    TOOL_SUMMARIZE_PAGE,
+    TOOL_TYPE,
+    TOOL_VISIT_URL,
+    TOOL_WEB_SEARCH,
+)
+from ._types import InteractiveRegion, UserContent
+from .playwright_controller import PlaywrightController
+
+# NOTE(review): not referenced in the visible part of this file; presumably a
+# fallback model context size in tokens -- confirm against its users.
+DEFAULT_CONTEXT_SIZE = 128000
+
+
+class MultimodalWebSurferConfig(BaseModel):
+    """Declarative configuration schema for ``MultimodalWebSurfer``.
+
+    The fields mirror the keyword arguments of ``MultimodalWebSurfer.__init__``.
+    The model client is stored as a ``ComponentModel`` rather than a live client,
+    and the ``playwright`` / ``context`` constructor arguments have no counterpart
+    here.
+    """
+
+    name: str
+    model_client: ComponentModel
+    downloads_folder: str | None = None
+    description: str | None = None
+    debug_dir: str | None = None
+    headless: bool = True
+    start_page: str | None = "https://www.bing.com/"
+    animate_actions: bool = False
+    to_save_screenshots: bool = False
+    use_ocr: bool = False
+    browser_channel: str | None = None
+    browser_data_dir: str | None = None
+    to_resize_viewport: bool = True
+
+
+class MultimodalWebSurfer(BaseChatAgent, Component[MultimodalWebSurferConfig]):
+    """
+    MultimodalWebSurfer is a multimodal agent that acts as a web surfer that can search the web and visit web pages.
+
+    Installation:
+
+    .. code-block:: bash
+
+        pip install "agentdhal-ext[web-surfer]"
+
+    It launches a Chromium browser and uses Playwright to interact with it, which lets the agent perform a variety of actions. The browser is launched on the first call to the agent and is reused for subsequent calls.
+
+    It must be used with a multimodal model client that supports function/tool calling, ideally GPT-4o currently.
+
+
+    When :meth:`on_messages` or :meth:`on_messages_stream` is called, the following occurs:
+        1) If this is the first call, the browser is initialized and the page is loaded. This is done in :meth:`_lazy_init`. The browser is only closed when :meth:`close` is called.
+        2) The method :meth:`_generate_reply` is called, which then creates the final response as below.
+        3) The agent takes a screenshot of the page, extracts the interactive elements, and prepares a set-of-mark screenshot with bounding boxes around the interactive elements.
+        4) The agent makes a call to the :attr:`model_client` with the SOM screenshot, history of messages, and the list of available tools.
+            - If the model returns a string, the agent returns the string as the final response.
+            - If the model returns a list of tool calls, the agent executes the tool calls with :meth:`_execute_tool` using :attr:`_playwright_controller`.
+            - The agent returns a final response which includes a screenshot of the page, page metadata, description of the action taken and the inner text of the webpage.
+        5) If at any point the agent encounters an error, it returns the error message as the final response.
+
+
+    .. note::
+        Please note that using the MultimodalWebSurfer involves interacting with a digital world designed for humans, which carries inherent risks.
+        Be aware that agents may occasionally attempt risky actions, such as recruiting humans for help or accepting cookie agreements without human involvement. Always ensure agents are monitored and operate within a controlled environment to prevent unintended consequences.
+        Moreover, be cautious that MultimodalWebSurfer may be susceptible to prompt injection attacks from webpages.
+
+    .. note::
+
+        On Windows, the event loop policy must be set to `WindowsProactorEventLoopPolicy` to avoid issues with subprocesses.
+
+        .. code-block:: python
+
+            import sys
+            import asyncio
+
+            if sys.platform == "win32":
+                asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+
+    Args:
+        name (str): The name of the agent.
+        model_client (ChatCompletionClient): The model client used by the agent. Must be multimodal and support function calling.
+        downloads_folder (str, optional): The folder where downloads are saved. Defaults to None, no downloads are saved.
+        description (str, optional): The description of the agent. Defaults to MultimodalWebSurfer.DEFAULT_DESCRIPTION.
+        debug_dir (str, optional): The directory where debug information is saved. Defaults to None.
+        headless (bool, optional): Whether the browser should be headless. Defaults to True.
+        start_page (str, optional): The start page for the browser. Defaults to MultimodalWebSurfer.DEFAULT_START_PAGE.
+        animate_actions (bool, optional): Whether to animate actions. Defaults to False.
+        to_save_screenshots (bool, optional): Whether to save screenshots. Defaults to False.
+        use_ocr (bool, optional): Whether to use OCR. Defaults to False.
+        browser_channel (str, optional): The browser channel. Defaults to None.
+        browser_data_dir (str, optional): The browser data directory. Defaults to None.
+        to_resize_viewport (bool, optional): Whether to resize the viewport. Defaults to True.
+        playwright (Playwright, optional): The playwright instance. Defaults to None.
+        context (BrowserContext, optional): The browser context. Defaults to None.
+
+
+
+
+    Example usage:
+
+    The following example demonstrates how to create a web surfing agent with
+    a model client and run it for multiple turns.
+
+        .. code-block:: python
+
+
+            import asyncio
+            from agentdhal_agentchat.ui import Console
+            from agentdhal_agentchat.teams import RoundRobinGroupChat
+            from agentdhal_extensions.models.openai import OpenAIChatCompletionClient
+            from agentdhal_extensions.agents.web_surfer import MultimodalWebSurfer
+
+
+            async def main() -> None:
+                # Define an agent
+                web_surfer_agent = MultimodalWebSurfer(
+                    name="MultimodalWebSurfer",
+                    model_client=OpenAIChatCompletionClient(model="gpt-4o-2024-08-06"),
+                )
+
+                # Define a team
+                agent_team = RoundRobinGroupChat([web_surfer_agent], max_turns=3)
+
+                # Run the team and stream messages to the console
+                stream = agent_team.run_stream(task="Navigate to the AutoGen readme on GitHub.")
+                await Console(stream)
+                # Close the browser controlled by the agent
+                await web_surfer_agent.close()
+
+
+            asyncio.run(main())
+    """
+
+    component_type = "agent"
+    component_config_schema = MultimodalWebSurferConfig
+    component_provider_override = "agentdhal_extensions.agents.web_surfer.MultimodalWebSurfer"
+
+    DEFAULT_DESCRIPTION = """
+    A helpful assistant with access to a web browser.
+    Ask them to perform web searches, open pages, and interact with content (e.g., clicking links, scrolling the viewport, filling in form fields, etc.).
+    It can also summarize the entire page, or answer questions based on the content of the page.
+    It can also be asked to sleep and wait for pages to load, in cases where the page seems not yet fully loaded.
+    """
+    DEFAULT_START_PAGE = "https://www.bing.com/"
+
+    # Viewport dimensions
+    VIEWPORT_HEIGHT = 900
+    VIEWPORT_WIDTH = 1440
+
+    # Size of the image we send to the MLM
+    # Current values represent a 0.85 scaling to fit within the GPT-4v short-edge constraints (768px)
+    MLM_HEIGHT = 765
+    MLM_WIDTH = 1224
+
+    SCREENSHOT_TOKENS = 1105
+
+    def __init__(
+        self,
+        name: str,
+        model_client: ChatCompletionClient,
+        downloads_folder: str | None = None,
+        description: str = DEFAULT_DESCRIPTION,
+        debug_dir: str | None = None,
+        headless: bool = True,
+        start_page: str | None = DEFAULT_START_PAGE,
+        animate_actions: bool = False,
+        to_save_screenshots: bool = False,
+        use_ocr: bool = False,
+        browser_channel: str | None = None,
+        browser_data_dir: str | None = None,
+        to_resize_viewport: bool = True,
+        playwright: Playwright | None = None,
+        context: BrowserContext | None = None,
+    ):
+        """
+        Initialize the MultimodalWebSurfer.
+        """
+        super().__init__(name, description)
+        if debug_dir is None and to_save_screenshots:
+            raise ValueError(
+                "Cannot save screenshots without a debug directory. Set it using the 'debug_dir' parameter. The debug directory is created if it does not exist."
+            )
+        if model_client.model_info["function_calling"] is False:
+            raise ValueError(
+                "The model does not support function calling. MultimodalWebSurfer requires a model that supports function calling."
+            )
+
+        self._model_client = model_client
+        self.headless = headless
+        self.browser_channel = browser_channel
+        self.browser_data_dir = browser_data_dir
+        self.start_page = start_page or self.DEFAULT_START_PAGE
+        self.downloads_folder = downloads_folder
+        self.debug_dir = debug_dir
+        self.to_save_screenshots = to_save_screenshots
+        self.use_ocr = use_ocr
+        self.to_resize_viewport = to_resize_viewport
+        self.animate_actions = animate_actions
+
+        # Call init to set these in case not set
+        self._playwright: Playwright | None = playwright
+        self._context: BrowserContext | None = context
+        self._page: Page | None = None
+        self._last_download: Download | None = None
+        self._prior_metadata_hash: str | None = None
+        self.logger = logging.getLogger(EVENT_LOGGER_NAME + f".{self.name}.MultimodalWebSurfer")
+        self._chat_history: List[LLMMessage] = []
+
+        # Define the download handler
+        def _download_handler(download: Download) -> None:
+            self._last_download = download
+
+        self._download_handler = _download_handler
+
+        # Define the Playwright controller that handles the browser interactions
+        self._playwright_controller = PlaywrightController(
+            animate_actions=self.animate_actions,
+            downloads_folder=self.downloads_folder,
+            viewport_width=self.VIEWPORT_WIDTH,
+            viewport_height=self.VIEWPORT_HEIGHT,
+            _download_handler=self._download_handler,
+            to_resize_viewport=self.to_resize_viewport,
+        )
+        self.default_tools = [
+            TOOL_VISIT_URL,
+            TOOL_WEB_SEARCH,
+            TOOL_HISTORY_BACK,
+            TOOL_CLICK,
+            TOOL_TYPE,
+            TOOL_READ_PAGE_AND_ANSWER,
+            TOOL_SUMMARIZE_PAGE,
+            TOOL_SLEEP,
+            TOOL_HOVER,
+        ]
+        self.did_lazy_init = False  # flag to check if we have initialized the browser
+
+    async def _lazy_init(
+        self,
+    ) -> None:
+        """
+        On the first call, we initialize the browser and the page.
+        """
+
+        # Check the current event loop policy if on windows.
+        if sys.platform == "win32":
+            current_policy = asyncio.get_event_loop_policy()
+            if hasattr(asyncio, "WindowsProactorEventLoopPolicy") and not isinstance(
+                current_policy, asyncio.WindowsProactorEventLoopPolicy
+            ):
+                warnings.warn(
+                    "The current event loop policy is not WindowsProactorEventLoopPolicy. "
+                    "This may cause issues with subprocesses. "
+                    "Try setting the event loop policy to WindowsProactorEventLoopPolicy. "
+                    "For example: `asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())`. "
+                    "See https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.ProactorEventLoop.",
+                    stacklevel=2,
+                )
+
+        self._last_download = None
+        self._prior_metadata_hash = None
+
+        # Create the playwright self
+        launch_args: Dict[str, Any] = {"headless": self.headless}
+        if self.browser_channel is not None:
+            launch_args["channel"] = self.browser_channel
+        if self._playwright is None:
+            self._playwright = await async_playwright().start()
+
+        # Create the context -- are we launching persistent?
+        if self._context is None:
+            if self.browser_data_dir is None:
+                browser = await self._playwright.chromium.launch(**launch_args)
+                self._context = await browser.new_context(
+                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
+                )
+            else:
+                self._context = await self._playwright.chromium.launch_persistent_context(
+                    self.browser_data_dir, **launch_args
+                )
+
+        # Create the page
+        self._context.set_default_timeout(60000)  # One minute
+        self._page = await self._context.new_page()
+        assert self._page is not None
+        # self._page.route(lambda x: True, self._route_handler)
+        self._page.on("download", self._download_handler)
+        if self.to_resize_viewport:
+            await self._page.set_viewport_size({"width": self.VIEWPORT_WIDTH, "height": self.VIEWPORT_HEIGHT})
+        await self._page.add_init_script(
+            path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js")
+        )
+        await self._page.goto(self.start_page)
+        await self._page.wait_for_load_state()
+
+        # Prepare the debug directory -- which stores the screenshots generated throughout the process
+        await self._set_debug_dir(self.debug_dir)
+        self.did_lazy_init = True
+
+    async def close(self) -> None:
+        """
+        Close the browser and the page.
+        Should be called when the agent is no longer needed.
+        """
+        if self._page is not None:
+            await self._page.close()
+            self._page = None
+        if self._context is not None:
+            await self._context.close()
+            self._context = None
+        if self._playwright is not None:
+            await self._playwright.stop()
+            self._playwright = None
+
+    async def _set_debug_dir(self, debug_dir: str | None) -> None:
+        assert self._page is not None
+        if self.debug_dir is None:
+            return
+
+        if not os.path.isdir(self.debug_dir):
+            os.mkdir(self.debug_dir)
+
+        if self.to_save_screenshots:
+            current_timestamp = "_" + int(time.time()).__str__()
+            screenshot_png_name = "screenshot" + current_timestamp + ".png"
+
+            await self._page.screenshot(path=os.path.join(self.debug_dir, screenshot_png_name))  # type: ignore
+            self.logger.info(
+                WebSurferEvent(
+                    source=self.name,
+                    url=self._page.url,
+                    message="Screenshot: " + screenshot_png_name,
+                )
+            )
+
+    @property
+    def produced_message_types(self) -> Sequence[type[BaseChatMessage]]:
+        return (MultiModalMessage,)
+
+    async def on_reset(self, cancellation_token: CancellationToken) -> None:
+        if not self.did_lazy_init:
+            return
+        assert self._page is not None
+
+        self._chat_history.clear()
+        reset_prior_metadata, reset_last_download = await self._playwright_controller.visit_page(
+            self._page, self.start_page
+        )
+        if reset_last_download and self._last_download is not None:
+            self._last_download = None
+        if reset_prior_metadata and self._prior_metadata_hash is not None:
+            self._prior_metadata_hash = None
+        if self.to_save_screenshots:
+            current_timestamp = "_" + int(time.time()).__str__()
+            screenshot_png_name = "screenshot" + current_timestamp + ".png"
+
+            await self._page.screenshot(path=os.path.join(self.debug_dir, screenshot_png_name))  # type: ignore
+            self.logger.info(
+                WebSurferEvent(
+                    source=self.name,
+                    url=self._page.url,
+                    message="Screenshot: " + screenshot_png_name,
+                )
+            )
+
+        self.logger.info(
+            WebSurferEvent(
+                source=self.name,
+                url=self._page.url,
+                message="Resetting browser.",
+            )
+        )
+
+    async def on_messages(self, messages: Sequence[BaseChatMessage], cancellation_token: CancellationToken) -> Response:
+        async for message in self.on_messages_stream(messages, cancellation_token):
+            if isinstance(message, Response):
+                return message
+        raise AssertionError("The stream should have returned the final result.")
+
+    async def on_messages_stream(
+        self, messages: Sequence[BaseChatMessage], cancellation_token: CancellationToken
+    ) -> AsyncGenerator[BaseAgentEvent | BaseChatMessage | Response, None]:
+        for chat_message in messages:
+            self._chat_history.append(chat_message.to_model_message())
+
+        self.inner_messages: List[BaseAgentEvent | BaseChatMessage] = []
+        self.model_usage: List[RequestUsage] = []
+        try:
+            content = await self._generate_reply(cancellation_token=cancellation_token)
+            self._chat_history.append(AssistantMessage(content=content_to_str(content), source=self.name))
+            final_usage = RequestUsage(
+                prompt_tokens=sum([u.prompt_tokens for u in self.model_usage]),
+                completion_tokens=sum([u.completion_tokens for u in self.model_usage]),
+            )
+            if isinstance(content, str):
+                yield Response(
+                    chat_message=TextMessage(content=content, source=self.name, models_usage=final_usage),
+                    inner_messages=self.inner_messages,
+                )
+            else:
+                yield Response(
+                    chat_message=MultiModalMessage(content=content, source=self.name, models_usage=final_usage),
+                    inner_messages=self.inner_messages,
+                )
+
+        except BaseException:
+            content = f"Web surfing error:\n\n{traceback.format_exc()}"
+            self._chat_history.append(AssistantMessage(content=content, source=self.name))
+            yield Response(chat_message=TextMessage(content=content, source=self.name))
+
+    async def _generate_reply(self, cancellation_token: CancellationToken) -> UserContent:
+        """Generates the actual reply. First calls the LLM to figure out which tool to use, then executes the tool.
+
+        Steps: launch the browser if needed, take a set-of-mark screenshot of the current page,
+        describe the page state and the available tools in a prompt, and send that prompt together
+        with the (image-free) chat history to the model.
+
+        Returns:
+            The model's text when it answers directly, otherwise whatever :meth:`_execute_tool`
+            returns for the requested tool call.
+
+        Raises:
+            AssertionError: If the model response is neither a string nor a list.
+        """
+
+        # Lazy init, initialize the browser and the page on the first generate reply only
+        if not self.did_lazy_init:
+            await self._lazy_init()
+
+        assert self._page is not None
+
+        # Clone the messages, removing old screenshots
+        history: List[LLMMessage] = remove_images(self._chat_history)
+
+        # Split the history, removing the last message
+        if len(history):
+            user_request = history.pop()
+        else:
+            user_request = UserMessage(content="Empty request.", source="user")
+
+        # Models outside the listed families get no prior history at all: only the page prompt
+        # and the latest request are sent to them.
+        if self._model_client.model_info["family"] not in [
+            ModelFamily.GPT_4O,
+            ModelFamily.O1,
+            ModelFamily.O3,
+            ModelFamily.GPT_4,
+            ModelFamily.GPT_35,
+        ]:
+            history = []
+
+        # Ask the page for interactive elements, then prepare the set-of-mark screenshot
+        rects = await self._playwright_controller.get_interactive_rects(self._page)
+        viewport = await self._playwright_controller.get_visual_viewport(self._page)
+        screenshot = await self._page.screenshot()
+        som_screenshot, visible_rects, rects_above, rects_below = add_set_of_mark(screenshot, rects)
+
+        if self.to_save_screenshots:
+            current_timestamp = "_" + int(time.time()).__str__()
+            screenshot_png_name = "screenshot_som" + current_timestamp + ".png"
+            som_screenshot.save(os.path.join(self.debug_dir, screenshot_png_name))  # type: ignore
+            self.logger.info(
+                WebSurferEvent(
+                    source=self.name,
+                    url=self._page.url,
+                    message="Screenshot: " + screenshot_png_name,
+                )
+            )
+        # What tools are available? Start from the defaults and add scrolling where it makes sense.
+        tools = self.default_tools.copy()
+
+        # We can scroll up (the viewport is more than 5 units below the top of the page)
+        if viewport["pageTop"] > 5:
+            tools.append(TOOL_SCROLL_UP)
+
+        # Can scroll down (the bottom of the viewport is more than 5 units above the end of the page)
+        if (viewport["pageTop"] + viewport["height"] + 5) < viewport["scrollHeight"]:
+            tools.append(TOOL_SCROLL_DOWN)
+
+        # Focus hint: tell the model which element, if any, currently has the input focus
+        focused = await self._playwright_controller.get_focused_rect_id(self._page)
+        focused_hint = ""
+        if focused:
+            name = self._target_name(focused, rects)
+            if name:
+                name = f"(and name '{name}') "
+            else:
+                name = ""
+
+            # Fall back to the generic word "control" when the element has no recorded role
+            role = "control"
+            try:
+                role = rects[focused]["role"]
+            except KeyError:
+                pass
+
+            focused_hint = f"\nThe {role} with ID {focused} {name}currently has the input focus.\n\n"
+
+        # Everything visible
+        visible_targets = "\n".join(self._format_target_list(visible_rects, rects)) + "\n\n"
+
+        # Everything else (targets above and below the viewport)
+        other_targets: List[str] = []
+        other_targets.extend(self._format_target_list(rects_above, rects))
+        other_targets.extend(self._format_target_list(rects_below, rects))
+
+        if len(other_targets) > 0:
+            # Cap the off-screen list at 30 entries to keep the prompt short
+            if len(other_targets) > 30:
+                other_targets = other_targets[0:30]
+                other_targets.append("...")
+            other_targets_str = (
+                "Additional valid interaction targets include (but are not limited to):\n"
+                + "\n".join(other_targets)
+                + "\n\n"
+            )
+        else:
+            other_targets_str = ""
+
+        state_description = "Your " + await self._get_state_description()
+        tool_names = "\n".join([t["name"] for t in tools])
+        page_title = await self._page.title()
+
+        # Vision-capable models get the text prompt plus the scaled screenshot; others get text only
+        prompt_message = None
+        if self._model_client.model_info["vision"]:
+            text_prompt = WEB_SURFER_TOOL_PROMPT_MM.format(
+                state_description=state_description,
+                visible_targets=visible_targets,
+                other_targets_str=other_targets_str,
+                focused_hint=focused_hint,
+                tool_names=tool_names,
+                title=page_title,
+                url=self._page.url,
+            ).strip()
+
+            # Scale the screenshot for the MLM, and close the original
+            scaled_screenshot = som_screenshot.resize((self.MLM_WIDTH, self.MLM_HEIGHT))
+            som_screenshot.close()
+            if self.to_save_screenshots:
+                scaled_screenshot.save(os.path.join(self.debug_dir, "screenshot_scaled.png"))  # type: ignore
+
+            # Create the message (runs of three or more blank lines are collapsed to one blank line)
+            prompt_message = UserMessage(
+                content=[re.sub(r"(\n\s*){3,}", "\n\n", text_prompt), AGImage.from_pil(scaled_screenshot)],
+                source=self.name,
+            )
+        else:
+            # NOTE(review): som_screenshot is not closed on this text-only path -- confirm whether it should be.
+            text_prompt = WEB_SURFER_TOOL_PROMPT_TEXT.format(
+                state_description=state_description,
+                visible_targets=visible_targets,
+                other_targets_str=other_targets_str,
+                focused_hint=focused_hint,
+                tool_names=tool_names,
+                title=page_title,
+                url=self._page.url,
+            ).strip()
+
+            # Create the message (runs of three or more blank lines are collapsed to one blank line)
+            prompt_message = UserMessage(content=re.sub(r"(\n\s*){3,}", "\n\n", text_prompt), source=self.name)
+
+        # The page prompt goes second to last so that the user's request is the final message
+        history.append(prompt_message)
+        history.append(user_request)
+
+        # {history[-2].content if isinstance(history[-2].content, str) else history[-2].content[0]}
+        # print(f"""
+        # ================={len(history)}=================
+        # {history[-2].content}
+        # =====
+        # {history[-1].content}
+        # ===================================================
+        # """)
+
+        # Make the request
+        response = await self._model_client.create(
+            history, tools=tools, extra_create_args={"tool_choice": "auto"}, cancellation_token=cancellation_token
+        )  # , "parallel_tool_calls": False})
+
+        self.model_usage.append(response.usage)
+        message = response.content
+        # Forget any earlier download before acting on the model's decision
+        self._last_download = None
+        if isinstance(message, str):
+            # Answer directly
+            self.inner_messages.append(TextMessage(content=message, source=self.name))
+            return message
+        elif isinstance(message, list):
+            # Take an action
+            return await self._execute_tool(message, rects, tool_names, cancellation_token=cancellation_token)
+        else:
+            # Not sure what happened here
+            raise AssertionError(f"Unknown response format '{message}'")
+
+    async def _execute_tool(
+        self,
+        message: List[FunctionCall],
+        rects: Dict[str, InteractiveRegion],
+        tool_names: str,
+        cancellation_token: Optional[CancellationToken] = None,
+    ) -> UserContent:
+        # Execute the tool
+        name = message[0].name
+        args = json.loads(message[0].arguments)
+        action_description = ""
+        assert self._page is not None
+        self.logger.info(
+            WebSurferEvent(
+                source=self.name,
+                url=self._page.url,
+                action=name,
+                arguments=args,
+                message=f"{name}( {json.dumps(args)} )",
+            )
+        )
+        self.inner_messages.append(TextMessage(content=f"{name}( {json.dumps(args)} )", source=self.name))
+
+        if name == "visit_url":
+            url = args.get("url")
+            action_description = f"I typed '{url}' into the browser address bar."
+            # Check if the argument starts with a known protocol
+            if url.startswith(("https://", "http://", "file://", "about:")):
+                reset_prior_metadata, reset_last_download = await self._playwright_controller.visit_page(
+                    self._page, url
+                )
+            # If the argument contains a space, treat it as a search query
+            elif " " in url:
+                reset_prior_metadata, reset_last_download = await self._playwright_controller.visit_page(
+                    self._page, f"https://www.bing.com/search?q={quote_plus(url)}&FORM=QBLH"
+                )
+            # Otherwise, prefix with https://
+            else:
+                reset_prior_metadata, reset_last_download = await self._playwright_controller.visit_page(
+                    self._page, "https://" + url
+                )
+            if reset_last_download and self._last_download is not None:
+                self._last_download = None
+            if reset_prior_metadata and self._prior_metadata_hash is not None:
+                self._prior_metadata_hash = None
+        elif name == "history_back":
+            action_description = "I clicked the browser back button."
+            await self._playwright_controller.back(self._page)
+
+        elif name == "web_search":
+            query = args.get("query")
+            action_description = f"I typed '{query}' into the browser search bar."
+            reset_prior_metadata, reset_last_download = await self._playwright_controller.visit_page(
+                self._page, f"https://www.bing.com/search?q={quote_plus(query)}&FORM=QBLH"
+            )
+            if reset_last_download and self._last_download is not None:
+                self._last_download = None
+            if reset_prior_metadata and self._prior_metadata_hash is not None:
+                self._prior_metadata_hash = None
+        elif name == "scroll_up":
+            action_description = "I scrolled up one page in the browser."
+            await self._playwright_controller.page_up(self._page)
+        elif name == "scroll_down":
+            action_description = "I scrolled down one page in the browser."
+            await self._playwright_controller.page_down(self._page)
+
+        elif name == "click":
+            target_id = str(args.get("target_id"))
+            target_name = self._target_name(target_id, rects)
+            if target_name:
+                action_description = f"I clicked '{target_name}'."
+            else:
+                action_description = "I clicked the control."
+            new_page_tentative = await self._playwright_controller.click_id(self._page, target_id)
+            if new_page_tentative is not None:
+                self._page = new_page_tentative
+                self._prior_metadata_hash = None
+                self.logger.info(
+                    WebSurferEvent(
+                        source=self.name,
+                        url=self._page.url,
+                        message="New tab or window.",
+                    )
+                )
+        elif name == "input_text":
+            input_field_id = str(args.get("input_field_id"))
+            text_value = str(args.get("text_value"))
+            input_field_name = self._target_name(input_field_id, rects)
+            if input_field_name:
+                action_description = f"I typed '{text_value}' into '{input_field_name}'."
+            else:
+                action_description = f"I input '{text_value}'."
+            await self._playwright_controller.fill_id(self._page, input_field_id, text_value)
+
+        elif name == "scroll_element_up":
+            target_id = str(args.get("target_id"))
+            target_name = self._target_name(target_id, rects)
+
+            if target_name:
+                action_description = f"I scrolled '{target_name}' up."
+            else:
+                action_description = "I scrolled the control up."
+
+            await self._playwright_controller.scroll_id(self._page, target_id, "up")
+
+        elif name == "scroll_element_down":
+            target_id = str(args.get("target_id"))
+            target_name = self._target_name(target_id, rects)
+
+            if target_name:
+                action_description = f"I scrolled '{target_name}' down."
+            else:
+                action_description = "I scrolled the control down."
+
+            await self._playwright_controller.scroll_id(self._page, target_id, "down")
+
+        elif name == "answer_question":
+            question = str(args.get("question"))
+            action_description = f"I answered the following question '{question}' based on the web page."
+            # Do Q&A on the DOM. No need to take further action. Browser state does not change.
+            return await self._summarize_page(question=question, cancellation_token=cancellation_token)
+        elif name == "summarize_page":
+            # Summarize the DOM. No need to take further action. Browser state does not change.
+            action_description = "I summarized the current web page"
+            return await self._summarize_page(cancellation_token=cancellation_token)
+
+        elif name == "hover":
+            target_id = str(args.get("target_id"))
+            target_name = self._target_name(target_id, rects)
+            if target_name:
+                action_description = f"I hovered over '{target_name}'."
+            else:
+                action_description = "I hovered over the control."
+            await self._playwright_controller.hover_id(self._page, target_id)
+
+        elif name == "sleep":
+            action_description = "I am waiting a short period of time before taking further action."
+            await self._playwright_controller.sleep(self._page, 3)
+
+        else:
+            raise ValueError(f"Unknown tool '{name}'. Please choose from:\n\n{tool_names}")
+
+        await self._page.wait_for_load_state()
+        await self._playwright_controller.sleep(self._page, 3)
+
+        # Handle downloads
+        if self._last_download is not None and self.downloads_folder is not None:
+            fname = os.path.join(self.downloads_folder, self._last_download.suggested_filename)
+            await self._last_download.save_as(fname)  # type: ignore
+            page_body = f"<html><head><title>Download Successful</title></head><body style=\"margin: 20px;\"><h1>Successfully downloaded '{self._last_download.suggested_filename}' to local path:<br><br>{fname}</h1></body></html>"
+            await self._page.goto(
+                "data:text/html;base64," + base64.b64encode(page_body.encode("utf-8")).decode("utf-8")
+            )
+            await self._page.wait_for_load_state()
+
+        # Handle metadata
+        page_metadata = json.dumps(await self._playwright_controller.get_page_metadata(self._page), indent=4)
+        metadata_hash = hashlib.md5(page_metadata.encode("utf-8")).hexdigest()
+        if metadata_hash != self._prior_metadata_hash:
+            page_metadata = (
+                "\n\nThe following metadata was extracted from the webpage:\n\n" + page_metadata.strip() + "\n"
+            )
+        else:
+            page_metadata = ""
+        self._prior_metadata_hash = metadata_hash
+
+        new_screenshot = await self._page.screenshot()
+        if self.to_save_screenshots:
+            current_timestamp = "_" + int(time.time()).__str__()
+            screenshot_png_name = "screenshot" + current_timestamp + ".png"
+
+            async with aiofiles.open(os.path.join(self.debug_dir, screenshot_png_name), "wb") as file:  # type: ignore
+                await file.write(new_screenshot)  # type: ignore
+            self.logger.info(
+                WebSurferEvent(
+                    source=self.name,
+                    url=self._page.url,
+                    message="Screenshot: " + screenshot_png_name,
+                )
+            )
+
+        # Return the complete observation
+        state_description = "The " + await self._get_state_description()
+        message_content = (
+            f"{action_description}\n\n" + state_description + page_metadata + "\nHere is a screenshot of the page."
+        )
+
+        return [
+            re.sub(r"(\n\s*){3,}", "\n\n", message_content),  # Removing blank lines
+            AGImage.from_pil(PIL.Image.open(io.BytesIO(new_screenshot))),
+        ]
+
+    async def _get_state_description(self) -> str:
+        assert self._playwright_controller is not None
+        assert self._page is not None
+
+        # Describe the viewport of the new page in words
+        viewport = await self._playwright_controller.get_visual_viewport(self._page)
+        percent_visible = int(viewport["height"] * 100 / viewport["scrollHeight"])
+        percent_scrolled = int(viewport["pageTop"] * 100 / viewport["scrollHeight"])
+        if percent_scrolled < 1:  # Allow some rounding error
+            position_text = "at the top of the page"
+        elif percent_scrolled + percent_visible >= 99:  # Allow some rounding error
+            position_text = "at the bottom of the page"
+        else:
+            position_text = str(percent_scrolled) + "% down from the top of the page"
+
+        visible_text = await self._playwright_controller.get_visible_text(self._page)
+
+        # Return the complete observation
+        page_title = await self._page.title()
+        message_content = f"web browser is open to the page [{page_title}]({self._page.url}).\nThe viewport shows {percent_visible}% of the webpage, and is positioned {position_text}\n"
+        message_content += f"The following text is visible in the viewport:\n\n{visible_text}"
+        return message_content
+
+    def _target_name(self, target: str, rects: Dict[str, InteractiveRegion]) -> str | None:
+        try:
+            return rects[target]["aria_name"].strip()
+        except KeyError:
+            return None
+
+    def _format_target_list(self, ids: List[str], rects: Dict[str, InteractiveRegion]) -> List[str]:
+        """
+        Format the list of targets in the webpage as a string to be used in the agent's prompt.
+        """
+        targets: List[str] = []
+        for r in list(set(ids)):
+            if r in rects:
+                # Get the role
+                aria_role = rects[r].get("role", "").strip()
+                if len(aria_role) == 0:
+                    aria_role = rects[r].get("tag_name", "").strip()
+
+                # Get the name
+                aria_name = re.sub(r"[\n\r]+", " ", rects[r].get("aria_name", "")).strip()
+
+                # What are the actions?
+                actions = ['"click", "hover"']
+                if rects[r]["role"] in ["textbox", "searchbox", "search"]:
+                    actions = ['"input_text"']
+                actions_str = "[" + ",".join(actions) + "]"
+
+                targets.append(f'{{"id": {r}, "name": "{aria_name}", "role": "{aria_role}", "tools": {actions_str} }}')
+
+        return targets
+
+    async def _summarize_page(
+        self,
+        question: str | None = None,
+        cancellation_token: Optional[CancellationToken] = None,
+    ) -> str:
+        assert self._page is not None
+
+        page_markdown: str = await self._playwright_controller.get_page_markdown(self._page)
+
+        title: str = self._page.url
+        try:
+            title = await self._page.title()
+        except Exception:
+            pass
+
+        # Take a screenshot and scale it
+        screenshot = Image.open(io.BytesIO(await self._page.screenshot()))
+        scaled_screenshot = screenshot.resize((self.MLM_WIDTH, self.MLM_HEIGHT))
+        screenshot.close()
+        ag_image = AGImage.from_pil(scaled_screenshot)
+
+        # Prepare the system prompt
+        messages: List[LLMMessage] = []
+        messages.append(SystemMessage(content=WEB_SURFER_QA_SYSTEM_MESSAGE))
+        prompt = WEB_SURFER_QA_PROMPT(title, question)
+        # Grow the buffer (which is added to the prompt) until we overflow the context window or run out of lines
+        buffer = ""
+        # for line in re.split(r"([\r\n]+)", page_markdown):
+        for line in page_markdown.splitlines():
+            trial_message = UserMessage(
+                content=prompt + buffer + line,
+                source=self.name,
+            )
+
+            try:
+                remaining = self._model_client.remaining_tokens(messages + [trial_message])
+            except KeyError:
+                # Use the default if the model isn't found
+                remaining = DEFAULT_CONTEXT_SIZE - self._model_client.count_tokens(messages + [trial_message])
+
+            if self._model_client.model_info["vision"] and remaining <= 0:
+                break
+
+            if self._model_client.model_info["vision"] and remaining <= self.SCREENSHOT_TOKENS:
+                break
+
+            buffer += line
+
+        # Nothing to do
+        buffer = buffer.strip()
+        if len(buffer) == 0:
+            return "Nothing to summarize."
+
+        # Append the message
+        if self._model_client.model_info["vision"]:
+            # Multimodal
+            messages.append(
+                UserMessage(
+                    content=[
+                        prompt + buffer,
+                        ag_image,
+                    ],
+                    source=self.name,
+                )
+            )
+        else:
+            # Text only
+            messages.append(
+                UserMessage(
+                    content=prompt + buffer,
+                    source=self.name,
+                )
+            )
+
+        # Generate the response
+        response = await self._model_client.create(messages, cancellation_token=cancellation_token)
+        self.model_usage.append(response.usage)
+        scaled_screenshot.close()
+        assert isinstance(response.content, str)
+        return response.content
+
+    def _to_config(self) -> MultimodalWebSurferConfig:
+        return MultimodalWebSurferConfig(
+            name=self.name,
+            model_client=self._model_client.dump_component(),
+            downloads_folder=self.downloads_folder,
+            description=self.description,
+            debug_dir=self.debug_dir,
+            headless=self.headless,
+            start_page=self.start_page,
+            animate_actions=self.animate_actions,
+            to_save_screenshots=self.to_save_screenshots,
+            use_ocr=self.use_ocr,
+            browser_channel=self.browser_channel,
+            browser_data_dir=self.browser_data_dir,
+            to_resize_viewport=self.to_resize_viewport,
+        )
+
+    @classmethod
+    def _from_config(cls, config: MultimodalWebSurferConfig) -> Self:
+        return cls(
+            name=config.name,
+            model_client=ChatCompletionClient.load_component(config.model_client),
+            downloads_folder=config.downloads_folder,
+            description=config.description or cls.DEFAULT_DESCRIPTION,
+            debug_dir=config.debug_dir,
+            headless=config.headless,
+            start_page=config.start_page or cls.DEFAULT_START_PAGE,
+            animate_actions=config.animate_actions,
+            to_save_screenshots=config.to_save_screenshots,
+            use_ocr=config.use_ocr,
+            browser_channel=config.browser_channel,
+            browser_data_dir=config.browser_data_dir,
+            to_resize_viewport=config.to_resize_viewport,
+        )
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/_prompts.py
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/_prompts.py
@@ -0,0 +1,52 @@
+# Tool-selection prompt template for the multimodal case: the text refers to a screenshot in
+# which interactive elements are outlined and labelled with numeric ids.
+# Brace-delimited fields ({state_description}, {visible_targets}, {other_targets_str},
+# {focused_hint}, {tool_names}, {title}, {url}) are placeholders -- presumably filled with
+# str.format by the caller; TODO confirm.
+WEB_SURFER_TOOL_PROMPT_MM = """
+{state_description}
+
+Consider the following screenshot of the page. In this screenshot, interactive elements are outlined in bounding boxes of different colors. Each bounding box has a numeric ID label in the same color. Additional information about each visible label is listed below:
+
+{visible_targets}{other_targets_str}{focused_hint}
+
+You are to respond to my next request by selecting an appropriate tool from the following set, or by answering the question directly if possible:
+
+{tool_names}
+
+When deciding between tools, consider if the request can be best addressed by:
+    - the contents of the CURRENT VIEWPORT (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element, might be more appropriate)
+    - contents found elsewhere on the CURRENT WEBPAGE [{title}]({url}), in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate
+    - on ANOTHER WEBSITE entirely (in which case actions like performing a new web search might be the best option)
+
+My request follows:
+"""
+
+# Tool-selection prompt template for the text-only case: same structure as the multimodal
+# template but it lists the interactive components without mentioning a screenshot.
+# Brace-delimited fields ({state_description}, {visible_targets}, {other_targets_str},
+# {focused_hint}, {tool_names}, {title}, {url}) are placeholders -- presumably filled with
+# str.format by the caller; TODO confirm.
+WEB_SURFER_TOOL_PROMPT_TEXT = """
+{state_description}
+
+You have also identified the following interactive components:
+
+{visible_targets}{other_targets_str}{focused_hint}
+
+You are to respond to my next request by selecting an appropriate tool from the following set, or by answering the question directly if possible:
+
+{tool_names}
+
+When deciding between tools, consider if the request can be best addressed by:
+    - the contents of the CURRENT VIEWPORT (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element, might be more appropriate)
+    - contents found elsewhere on the CURRENT WEBPAGE [{title}]({url}), in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate
+    - on ANOTHER WEBSITE entirely (in which case actions like performing a new web search might be the best option)
+
+My request follows:
+"""
+
+
+# System message text for page summarization / question answering.
+# The string (including its leading and trailing newline) is sent verbatim, so it is left untouched.
+WEB_SURFER_QA_SYSTEM_MESSAGE = """
+You are a helpful assistant that can summarize long documents to answer question.
+"""
+
+
+def WEB_SURFER_QA_PROMPT(title: str, question: str | None = None) -> str:
+    base_prompt = f"We are visiting the webpage '{title}'. Its full-text content are pasted below, along with a screenshot of the page's current viewport."
+    if question is not None:
+        return (
+            f"{base_prompt} Please summarize the webpage into one or two paragraphs with respect to '{question}':\n\n"
+        )
+    else:
+        return f"{base_prompt} Please summarize the webpage into one or two paragraphs:\n\n"
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/_set_of_mark.py
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/_set_of_mark.py
@@ -0,0 +1,96 @@
+import io
+import random
+from typing import BinaryIO, Dict, List, Tuple, cast
+
+from PIL import Image, ImageDraw, ImageFont
+
+from ._types import DOMRectangle, InteractiveRegion
+
+TOP_NO_LABEL_ZONE = 20  # Don't print any labels close to the top of the page
+
+
+def add_set_of_mark(
+    screenshot: bytes | Image.Image | io.BufferedIOBase, ROIs: Dict[str, InteractiveRegion]
+) -> Tuple[Image.Image, List[str], List[str], List[str]]:
+    if isinstance(screenshot, Image.Image):
+        return _add_set_of_mark(screenshot, ROIs)
+
+    if isinstance(screenshot, bytes):
+        screenshot = io.BytesIO(screenshot)
+
+    # TODO: Not sure why this cast was needed, but by this point screenshot is a binary file-like object
+    image = Image.open(cast(BinaryIO, screenshot))
+    comp, visible_rects, rects_above, rects_below = _add_set_of_mark(image, ROIs)
+    image.close()
+    return comp, visible_rects, rects_above, rects_below
+
+
+def _add_set_of_mark(
+    screenshot: Image.Image, ROIs: Dict[str, InteractiveRegion]
+) -> Tuple[Image.Image, List[str], List[str], List[str]]:
+    visible_rects: List[str] = list()
+    rects_above: List[str] = list()  # Scroll up to see
+    rects_below: List[str] = list()  # Scroll down to see
+
+    fnt = ImageFont.load_default(14)
+    base = screenshot.convert("L").convert("RGBA")
+    overlay = Image.new("RGBA", base.size)
+
+    draw = ImageDraw.Draw(overlay)
+    for r in ROIs:
+        for rect in ROIs[r]["rects"]:
+            # Empty rectangles
+            if not rect:
+                continue
+            if rect["width"] * rect["height"] == 0:
+                continue
+
+            mid = ((rect["right"] + rect["left"]) / 2.0, (rect["top"] + rect["bottom"]) / 2.0)
+
+            if 0 <= mid[0] and mid[0] < base.size[0]:
+                if mid[1] < 0:
+                    rects_above.append(r)
+                elif mid[1] >= base.size[1]:
+                    rects_below.append(r)
+                else:
+                    visible_rects.append(r)
+                    _draw_roi(draw, int(r), fnt, rect)
+
+    comp = Image.alpha_composite(base, overlay)
+    overlay.close()
+    return comp, visible_rects, rects_above, rects_below
+
+
+def _draw_roi(
+    draw: ImageDraw.ImageDraw, idx: int, font: ImageFont.FreeTypeFont | ImageFont.ImageFont, rect: DOMRectangle
+) -> None:
+    color = _color(idx)
+    luminance = color[0] * 0.3 + color[1] * 0.59 + color[2] * 0.11
+    text_color = (0, 0, 0, 255) if luminance > 90 else (255, 255, 255, 255)
+
+    roi = ((rect["left"], rect["top"]), (rect["right"], rect["bottom"]))
+
+    label_location = (rect["right"], rect["top"])
+    label_anchor = "rb"
+
+    if label_location[1] <= TOP_NO_LABEL_ZONE:
+        label_location = (rect["right"], rect["bottom"])
+        label_anchor = "rt"
+
+    draw.rectangle(roi, outline=color, fill=(color[0], color[1], color[2], 48), width=2)
+
+    # TODO: Having trouble with these types being partially Unknown.
+    bbox = draw.textbbox(label_location, str(idx), font=font, anchor=label_anchor, align="center")  # type: ignore
+    bbox = (bbox[0] - 3, bbox[1] - 3, bbox[2] + 3, bbox[3] + 3)
+    draw.rectangle(bbox, fill=color)
+
+    # TODO: Having trouble with these types being partially Unknown.
+    draw.text(label_location, str(idx), fill=text_color, font=font, anchor=label_anchor, align="center")  # type: ignore
+
+
+def _color(identifier: int) -> Tuple[int, int, int, int]:
+    rnd = random.Random(int(identifier))
+    color = [rnd.randint(0, 255), rnd.randint(125, 255), rnd.randint(0, 50)]
+    rnd.shuffle(color)
+    color.append(255)
+    return cast(Tuple[int, int, int, int], tuple(color))
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/_tool_definitions.py
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/_tool_definitions.py
@@ -0,0 +1,317 @@
+from typing import Any, Dict
+
+from agentdhal_core.tools._base import ParametersSchema, ToolSchema
+
+
+def _load_tool(tooldef: Dict[str, Any]) -> ToolSchema:
+    return ToolSchema(
+        name=tooldef["function"]["name"],
+        description=tooldef["function"]["description"],
+        parameters=ParametersSchema(
+            type="object",
+            properties=tooldef["function"]["parameters"]["properties"],
+            required=tooldef["function"]["parameters"]["required"],
+        ),
+    )
+
+
+# Shared description text for the "reasoning" parameter that every tool schema below declares.
+REASONING_TOOL_PROMPT = (
+    "A short description of the action to be performed and reason for doing so, do not mention the user."
+)
+
+# Schema for the "visit_url" tool; required arguments: "reasoning" and "url".
+TOOL_VISIT_URL: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "visit_url",
+            "description": "Navigate directly to a provided URL using the browser's address bar. Prefer this tool over other navigation techniques in cases where the user provides a fully-qualified URL (e.g., choose it over clicking links, or inputing queries into search boxes).",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                    "url": {
+                        "type": "string",
+                        "description": "The URL to visit in the browser.",
+                    },
+                },
+                "required": ["reasoning", "url"],
+            },
+        },
+    }
+)
+
+# Schema for the "web_search" tool; required arguments: "reasoning" and "query".
+TOOL_WEB_SEARCH: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "web_search",
+            "description": "Performs a web search on Bing.com with the given query.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                    "query": {
+                        "type": "string",
+                        "description": "The web search query to use.",
+                    },
+                },
+                "required": ["reasoning", "query"],
+            },
+        },
+    }
+)
+
+# Schema for the "history_back" tool; the only argument is "reasoning".
+TOOL_HISTORY_BACK: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "history_back",
+            "description": "Navigates back one page in the browser's history. This is equivalent to clicking the browser back button.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                },
+                "required": ["reasoning"],
+            },
+        },
+    }
+)
+
+# Schema for the "scroll_up" tool (whole viewport); the only argument is "reasoning".
+TOOL_SCROLL_UP: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "scroll_up",
+            "description": "Scrolls the entire browser viewport one page UP towards the beginning.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                },
+                "required": ["reasoning"],
+            },
+        },
+    }
+)
+
+# Schema for the "scroll_down" tool (whole viewport); the only argument is "reasoning".
+TOOL_SCROLL_DOWN: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "scroll_down",
+            "description": "Scrolls the entire browser viewport one page DOWN towards the end.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                },
+                "required": ["reasoning"],
+            },
+        },
+    }
+)
+
+# Schema for the "click" tool; required arguments: "reasoning" and the integer "target_id".
+TOOL_CLICK: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "click",
+            "description": "Clicks the mouse on the target with the given id.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                    "target_id": {
+                        "type": "integer",
+                        "description": "The numeric id of the target to click.",
+                    },
+                },
+                "required": ["reasoning", "target_id"],
+            },
+        },
+    }
+)
+
+# Schema for the "input_text" tool; required arguments: "reasoning", the integer
+# "input_field_id" and the string "text_value". Note the constant is named TOOL_TYPE
+# while the tool name exposed in the schema is "input_text".
+TOOL_TYPE: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "input_text",
+            "description": "Types the given text value into the specified field.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                    "input_field_id": {
+                        "type": "integer",
+                        "description": "The numeric id of the input field to receive the text.",
+                    },
+                    "text_value": {
+                        "type": "string",
+                        "description": "The text to type into the input field.",
+                    },
+                },
+                "required": ["reasoning", "input_field_id", "text_value"],
+            },
+        },
+    }
+)
+
+# Schema for the "scroll_element_down" tool (a single element rather than the viewport);
+# required arguments: "reasoning" and the integer "target_id".
+TOOL_SCROLL_ELEMENT_DOWN: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "scroll_element_down",
+            "description": "Scrolls a given html element (e.g., a div or a menu) DOWN.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                    "target_id": {
+                        "type": "integer",
+                        "description": "The numeric id of the target to scroll down.",
+                    },
+                },
+                "required": ["reasoning", "target_id"],
+            },
+        },
+    }
+)
+
+# Schema for the "scroll_element_up" tool (a single element rather than the viewport);
+# required arguments: "reasoning" and the integer "target_id".
+TOOL_SCROLL_ELEMENT_UP: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "scroll_element_up",
+            "description": "Scrolls a given html element (e.g., a div or a menu) UP.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                    "target_id": {
+                        "type": "integer",
+                        "description": "The numeric id of the target to scroll UP.",
+                    },
+                },
+                "required": ["reasoning", "target_id"],
+            },
+        },
+    }
+)
+
+# Schema for the "hover" tool; required arguments: "reasoning" and the integer "target_id".
+TOOL_HOVER: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "hover",
+            "description": "Hovers the mouse over the target with the given id.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                    "target_id": {
+                        "type": "integer",
+                        "description": "The numeric id of the target to hover over.",
+                    },
+                },
+                "required": ["reasoning", "target_id"],
+            },
+        },
+    }
+)
+
+
+# Schema for the "answer_question" tool; required arguments: "reasoning" and "question".
+# Note the constant is named TOOL_READ_PAGE_AND_ANSWER while the exposed tool name is "answer_question".
+TOOL_READ_PAGE_AND_ANSWER: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "answer_question",
+            "description": "Uses AI to answer a question about the current webpage's content.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                    "question": {
+                        "type": "string",
+                        "description": "The question to answer.",
+                    },
+                },
+                "required": ["reasoning", "question"],
+            },
+        },
+    }
+)
+
+# Schema for the "summarize_page" tool; the only argument is "reasoning".
+TOOL_SUMMARIZE_PAGE: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "summarize_page",
+            "description": "Uses AI to summarize the entire page.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                },
+                "required": ["reasoning"],
+            },
+        },
+    }
+)
+
+# Schema for the "sleep" tool; the only argument is "reasoning" (no duration is exposed).
+TOOL_SLEEP: ToolSchema = _load_tool(
+    {
+        "type": "function",
+        "function": {
+            "name": "sleep",
+            "description": "Wait a short period of time. Call this function if the page has not yet fully loaded, or if it is determined that a small delay would increase the task's chances of success.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "reasoning": {
+                        "type": "string",
+                        "description": REASONING_TOOL_PROMPT,
+                    },
+                },
+                "required": ["reasoning"],
+            },
+        },
+    }
+)
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/_types.py
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/_types.py
@@ -0,0 +1,106 @@
+from typing import Any, Dict, List, TypedDict, Union
+
+from agentdhal_core import FunctionCall, Image
+from agentdhal_core.models import FunctionExecutionResult
+
+# Type aliases named after message roles (presumably the payload type of each kind of
+# message -- TODO confirm against their users).
+# Either plain text, or a list mixing text and images.
+UserContent = Union[str, List[Union[str, Image]]]
+# Either plain text, or a list of function calls.
+AssistantContent = Union[str, List[FunctionCall]]
+# A list of function execution results.
+FunctionExecutionContent = List[FunctionExecutionResult]
+# Plain text.
+SystemContent = str
+
+
+class DOMRectangle(TypedDict):
+    """Rectangle described by its origin, size and four edges, all numeric.
+
+    The field names match those of the browser ``DOMRect`` interface.
+    """
+
+    x: Union[int, float]
+    y: Union[int, float]
+    width: Union[int, float]
+    height: Union[int, float]
+    top: Union[int, float]
+    right: Union[int, float]
+    bottom: Union[int, float]
+    left: Union[int, float]
+
+
+class VisualViewport(TypedDict):
+    """Numeric viewport measurements, as built by ``visualviewport_from_dict``.
+
+    Keys keep their original camelCase spelling.
+    """
+
+    height: Union[int, float]
+    width: Union[int, float]
+    offsetLeft: Union[int, float]
+    offsetTop: Union[int, float]
+    pageLeft: Union[int, float]
+    pageTop: Union[int, float]
+    scale: Union[int, float]
+    clientWidth: Union[int, float]
+    clientHeight: Union[int, float]
+    scrollWidth: Union[int, float]
+    scrollHeight: Union[int, float]
+
+
+class InteractiveRegion(TypedDict):
+    """Record for one interactive page element, as built by ``interactiveregion_from_dict``.
+
+    Holds the element's tag name, role, aria name, a vertical-scrollability
+    flag and the list of rectangles it occupies.
+    """
+
+    tag_name: str
+    role: str
+    aria_name: str
+    v_scrollable: bool
+    rects: List[DOMRectangle]
+
+
+# Helper functions for dealing with JSON. Not sure there's a better way?
+
+
+def _get_str(d: Any, k: str) -> str:
+    val = d[k]
+    assert isinstance(val, str)
+    return val
+
+
+def _get_number(d: Any, k: str) -> Union[int, float]:
+    val = d[k]
+    assert isinstance(val, int) or isinstance(val, float)
+    return val
+
+
+def _get_bool(d: Any, k: str) -> bool:
+    val = d[k]
+    assert isinstance(val, bool)
+    return val
+
+
+def domrectangle_from_dict(rect: Dict[str, Any]) -> DOMRectangle:
+    return DOMRectangle(
+        x=_get_number(rect, "x"),
+        y=_get_number(rect, "y"),
+        width=_get_number(rect, "width"),
+        height=_get_number(rect, "height"),
+        top=_get_number(rect, "top"),
+        right=_get_number(rect, "right"),
+        bottom=_get_number(rect, "bottom"),
+        left=_get_number(rect, "left"),
+    )
+
+
+def interactiveregion_from_dict(region: Dict[str, Any]) -> InteractiveRegion:
+    typed_rects: List[DOMRectangle] = []
+    for rect in region["rects"]:
+        typed_rects.append(domrectangle_from_dict(rect))
+
+    return InteractiveRegion(
+        tag_name=_get_str(region, "tag_name"),
+        role=_get_str(region, "role"),
+        aria_name=_get_str(region, "aria-name"),
+        v_scrollable=_get_bool(region, "v-scrollable"),
+        rects=typed_rects,
+    )
+
+
+def visualviewport_from_dict(viewport: Dict[str, Any]) -> VisualViewport:
+    return VisualViewport(
+        height=_get_number(viewport, "height"),
+        width=_get_number(viewport, "width"),
+        offsetLeft=_get_number(viewport, "offsetLeft"),
+        offsetTop=_get_number(viewport, "offsetTop"),
+        pageLeft=_get_number(viewport, "pageLeft"),
+        pageTop=_get_number(viewport, "pageTop"),
+        scale=_get_number(viewport, "scale"),
+        clientWidth=_get_number(viewport, "clientWidth"),
+        clientHeight=_get_number(viewport, "clientHeight"),
+        scrollWidth=_get_number(viewport, "scrollWidth"),
+        scrollHeight=_get_number(viewport, "scrollHeight"),
+    )
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/page_script.js
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/page_script.js
@@ -0,0 +1,429 @@
+var MultimodalWebSurfer = MultimodalWebSurfer || (function() {
+  // Counter used by labelElements to hand out "__elementId" values; the first
+  // label issued is 10.
+  let nextLabel = 10;
+
+  // Approximate ARIA role for an element that has no explicit "role" attribute.
+  // Keys are lower-cased tag names; for <input> elements that carry a "type"
+  // attribute the key has the form "input, type=<type>", which is the format
+  // getApproximateAriaRole builds before looking the key up here.
+  let roleMapping = {
+      "a": "link",
+      "area": "link",
+      "button": "button",
+      "input, type=button": "button",
+      "input, type=checkbox": "checkbox",
+      "input, type=email": "textbox",
+      "input, type=number": "spinbutton",
+      "input, type=radio": "radio",
+      "input, type=range": "slider",
+      "input, type=reset": "button",
+      "input, type=search": "searchbox",
+      "input, type=submit": "button",
+      "input, type=tel": "textbox",
+      "input, type=text": "textbox",
+      "input, type=url": "textbox",
+      "search": "search",
+      "select": "combobox",
+      "option": "option",
+      "textarea": "textbox"
+  };
+
+  let getCursor = function(elm) {
+      return window.getComputedStyle(elm)["cursor"];
+  };
+
+  let getInteractiveElements = function() {
+
+      let results = []
+      let roles = ["scrollbar", "searchbox", "slider", "spinbutton", "switch", "tab", "treeitem", "button", "checkbox", "gridcell", "link", "menuitem", "menuitemcheckbox", "menuitemradio", "option", "progressbar", "radio", "textbox", "combobox", "menu", "tree", "treegrid", "grid", "listbox", "radiogroup", "widget"];
+      let inertCursors = ["auto", "default", "none", "text", "vertical-text", "not-allowed", "no-drop"];
+
+      // Get the main interactive elements
+      let nodeList = document.querySelectorAll("input, select, textarea, button, [href], [onclick], [contenteditable], [tabindex]:not([tabindex='-1'])");
+      for (let i=0; i<nodeList.length; i++) { // Copy to something mutable
+          results.push(nodeList[i]);
+      }
+
+      // Anything not already included that has a suitable role
+      nodeList = document.querySelectorAll("[role]");
+      for (let i=0; i<nodeList.length; i++) { // Copy to something mutable
+          if (results.indexOf(nodeList[i]) == -1) {
+              let role = nodeList[i].getAttribute("role");
+	      if (roles.indexOf(role) > -1) {
+                  results.push(nodeList[i]);
+	      }
+	  }
+      }
+
+      // Any element that changes the cursor to something implying interactivity
+      nodeList = document.querySelectorAll("*");
+      for (let i=0; i<nodeList.length; i++) {
+         let node = nodeList[i];
+
+         // Cursor is default, or does not suggest interactivity
+         let cursor = getCursor(node);
+         if (inertCursors.indexOf(cursor) >= 0) {
+             continue;
+         }
+
+         // Move up to the first instance of this cursor change
+         parent = node.parentNode;
+         while (parent && getCursor(parent) == cursor) {
+             node = parent;
+	     parent = node.parentNode;
+         }
+
+         // Add the node if it is new
+         if (results.indexOf(node) == -1) {
+             results.push(node);
+         }
+      }
+
+      return results;
+  };
+
+  let labelElements = function(elements) {
+      for (let i=0; i<elements.length; i++) {
+          if (!elements[i].hasAttribute("__elementId")) {
+              elements[i].setAttribute("__elementId", "" + (nextLabel++));
+          }
+      }
+  };
+
+  let isTopmost = function(element, x, y) {
+     let hit = document.elementFromPoint(x, y);
+
+     // Hack to handle elements outside the viewport
+     if (hit === null) {
+         return true;
+     }
+
+     while (hit) {
+         if (hit == element) return true;
+         hit = hit.parentNode;
+     }
+     return false;
+  };
+
+  let getFocusedElementId = function() {
+     let elm = document.activeElement;
+     while (elm) {
+         if (elm.hasAttribute && elm.hasAttribute("__elementId")) {
+	     return elm.getAttribute("__elementId");
+	 }
+         elm = elm.parentNode;
+     }
+     return null;
+  };
+
+  let trimmedInnerText = function(element) {
+      if (!element) {
+          return "";
+      }
+      let text = element.innerText;
+      if (!text) {
+          return "";
+      }
+      return text.trim();
+  };
+
+  let getApproximateAriaName = function(element) {
+      // Check for aria labels
+      if (element.hasAttribute("aria-labelledby")) {
+          let buffer = "";
+	  let ids = element.getAttribute("aria-labelledby").split(" ");
+	  for (let i=0; i<ids.length; i++) {
+              let label = document.getElementById(ids[i]);
+	      if (label) {
+	          buffer = buffer + " " + trimmedInnerText(label);
+              }
+          }
+	  return buffer.trim();
+      }
+
+      if (element.hasAttribute("aria-label")) {
+	  return element.getAttribute("aria-label");
+      }
+
+      // Check for labels
+      if (element.hasAttribute("id")) {
+          let label_id = element.getAttribute("id");
+          let label = "";
+          let labels = document.querySelectorAll("label[for='" + label_id + "']");
+          for (let j=0; j<labels.length; j++) {
+              label += labels[j].innerText + " ";
+          }
+          label = label.trim();
+          if (label != "") {
+              return label;
+          }
+      }
+
+      if (element.parentElement && element.parentElement.tagName == "LABEL") {
+          return element.parentElement.innerText;
+      }
+
+      // Check for alt text or titles
+      if (element.hasAttribute("alt")) {
+	  return element.getAttribute("alt")
+      }
+
+      if (element.hasAttribute("title")) {
+	  return element.getAttribute("title")
+      }
+
+      return trimmedInnerText(element);
+  };
+
+  let getApproximateAriaRole = function(element) {
+      let tag = element.tagName.toLowerCase();
+      if (tag == "input" && element.hasAttribute("type")) {
+          tag = tag + ", type=" + element.getAttribute("type");
+      }
+
+      if (element.hasAttribute("role")) {
+          return [element.getAttribute("role"), tag];
+      }
+      else if (tag in roleMapping) {
+          return [roleMapping[tag], tag];
+      }
+      else {
+	  return ["", tag];
+      }
+  };
+
+  let getInteractiveRects = function() {
+      labelElements(getInteractiveElements());
+      let elements = document.querySelectorAll("[__elementId]");
+      let results = {};
+      for (let i=0; i<elements.length; i++) {
+         let key = elements[i].getAttribute("__elementId");
+         let rects = elements[i].getClientRects();
+	 let ariaRole = getApproximateAriaRole(elements[i]);
+	 let ariaName = getApproximateAriaName(elements[i]);
+	 let vScrollable = elements[i].scrollHeight - elements[i].clientHeight >= 1;
+
+	 let record = {
+             "tag_name": ariaRole[1],
+	     "role": ariaRole[0],
+	     "aria-name": ariaName,
+	     "v-scrollable": vScrollable,
+	     "rects": []
+	 };
+
+         for (const rect of rects) {
+	     let x = rect.left + rect.width/2;
+             let y = rect.top + rect.height/2;
+             if (isTopmost(elements[i], x, y)) {
+		 record["rects"].push(JSON.parse(JSON.stringify(rect)));
+             }
+         }
+
+	 if (record["rects"].length > 0) {
+             results[key] = record;
+         }
+      }
+      return results;
+  };
+
+  let getVisualViewport = function() {
+      let vv = window.visualViewport;
+      let de = document.documentElement;
+      return {
+          "height":     vv ? vv.height : 0,
+	  "width":      vv ? vv.width : 0,
+	  "offsetLeft": vv ? vv.offsetLeft : 0,
+	  "offsetTop":  vv ? vv.offsetTop : 0,
+	  "pageLeft":   vv ? vv.pageLeft  : 0,
+	  "pageTop":    vv ? vv.pageTop : 0,
+	  "scale":      vv ? vv.scale : 0,
+	  "clientWidth":  de ? de.clientWidth : 0,
+	  "clientHeight": de ? de.clientHeight : 0,
+	  "scrollWidth":  de ? de.scrollWidth : 0,
+	  "scrollHeight": de ? de.scrollHeight : 0
+      };
+  };
+
+  let _getMetaTags = function() {
+      let meta = document.querySelectorAll("meta");
+      let results = {};
+      for (let i = 0; i<meta.length; i++) {
+          let key = null;
+          if (meta[i].hasAttribute("name")) {
+              key = meta[i].getAttribute("name");
+          }
+          else if (meta[i].hasAttribute("property")) {
+              key = meta[i].getAttribute("property");
+          }
+          else {
+              continue;
+          }
+          if (meta[i].hasAttribute("content")) {
+              results[key] = meta[i].getAttribute("content");
+          }
+      }
+      return results;
+  };
+
+  let _getJsonLd = function() {
+      let jsonld = [];
+      let scripts = document.querySelectorAll('script[type="application/ld+json"]');
+      for (let i=0; i<scripts.length; i++) {
+          jsonld.push(scripts[i].innerHTML.trim());
+      }
+      return jsonld;
+   };
+
+   // From: https://www.stevefenton.co.uk/blog/2022/12/parse-microdata-with-javascript/
+   let _getMicrodata = function() {
+      function sanitize(input) {
+          return input.replace(/\s/gi, ' ').trim();
+      }
+
+      function addValue(information, name, value) {
+          if (information[name]) {
+              if (typeof information[name] === 'array') {
+                  information[name].push(value);
+              } else {
+                  const arr = [];
+                  arr.push(information[name]);
+                  arr.push(value);
+                  information[name] = arr;
+              }
+          } else {
+              information[name] = value;
+          }
+      }
+
+      function traverseItem(item, information) {
+         const children = item.children;
+
+         for (let i = 0; i < children.length; i++) {
+             const child = children[i];
+
+             if (child.hasAttribute('itemscope')) {
+                 if (child.hasAttribute('itemprop')) {
+                     const itemProp = child.getAttribute('itemprop');
+                     const itemType = child.getAttribute('itemtype');
+
+                     const childInfo = {
+                         itemType: itemType
+                     };
+
+                     traverseItem(child, childInfo);
+
+                     itemProp.split(' ').forEach(propName => {
+                         addValue(information, propName, childInfo);
+                     });
+                 }
+
+             } else if (child.hasAttribute('itemprop')) {
+                 const itemProp = child.getAttribute('itemprop');
+                 itemProp.split(' ').forEach(propName => {
+                     if (propName === 'url') {
+                         addValue(information, propName, child.href);
+                     } else {
+                         addValue(information, propName, sanitize(child.getAttribute("content") || child.content || child.textContent || child.src || ""));
+                     }
+                 });
+                 traverseItem(child, information);
+             } else {
+                 traverseItem(child, information);
+             }
+         }
+      }
+
+      const microdata = [];
+
+      document.querySelectorAll("[itemscope]").forEach(function(elem, i) {
+         const itemType = elem.getAttribute('itemtype');
+         const information = {
+             itemType: itemType
+         };
+         traverseItem(elem, information);
+         microdata.push(information);
+      });
+
+      return microdata;
+   };
+
+   let getPageMetadata = function() {
+       let jsonld = _getJsonLd();
+       let metaTags = _getMetaTags();
+       let microdata = _getMicrodata();
+       let results = {}
+       if (jsonld.length > 0) {
+           try {
+               results["jsonld"] = JSON.parse(jsonld);
+           }
+	   catch (e) {
+               results["jsonld"] = jsonld;
+	   }
+       }
+       if (microdata.length > 0) {
+	   results["microdata"] = microdata;
+       }
+       for (let key in metaTags) {
+	   if (metaTags.hasOwnProperty(key)) {
+	       results["meta_tags"] = metaTags;
+	       break;
+           }
+       }
+       return results;
+   };
+
+
+   let getVisibleText = function() {
+     // Get the window’s current viewport boundaries
+     const viewportHeight = window.innerHeight || document.documentElement.clientHeight;
+     const viewportWidth = window.innerWidth || document.documentElement.clientWidth;
+
+     let textInView = "";
+     const walker = document.createTreeWalker(
+       document.body,
+       NodeFilter.SHOW_TEXT,
+       null,
+       false
+     );
+
+     while (walker.nextNode()) {
+       const textNode = walker.currentNode;
+       // Create a range to retrieve bounding rectangles of the current text node
+       const range = document.createRange();
+       range.selectNodeContents(textNode);
+
+       const rects = range.getClientRects();
+
+       // Check if any rect is inside (or partially inside) the viewport
+       for (const rect of rects) {
+         const isVisible =
+           rect.width > 0 &&
+           rect.height > 0 &&
+           rect.bottom >= 0 &&
+           rect.right >= 0 &&
+           rect.top <= viewportHeight &&
+           rect.left <= viewportWidth;
+
+         if (isVisible) {
+           textInView += textNode.nodeValue.replace(/\s+/g, " ");
+           // Is the parent a block element?
+           if (textNode.parentNode) {
+             const parent = textNode.parentNode;
+             const style = window.getComputedStyle(parent);
+             if (["inline", "hidden", "none"].indexOf(style.display) === -1) {
+               textInView += "\n";
+             }
+           }
+           break; // No need to check other rects once found visible
+         }
+       }
+     }
+
+     // Remove blank lines from textInView
+     textInView = textInView.replace(/^\s*\n/gm, "").trim().replace(/\n+/g, "\n");
+     return textInView;
+   };	
+
+   // Public surface of the MultimodalWebSurfer object; every other function
+   // above stays private to this closure.
+   return {
+       getInteractiveRects: getInteractiveRects,
+       getVisualViewport: getVisualViewport,
+       getFocusedElementId: getFocusedElementId,
+       getPageMetadata: getPageMetadata,
+       getVisibleText: getVisibleText,
+   };
+})();
--- a/agent_dhal/agentdhal_extensions/agents/web_surfer/playwright_controller.py
+++ b/agent_dhal/agentdhal_extensions/agents/web_surfer/playwright_controller.py
@@ -0,0 +1,578 @@
+import asyncio
+import base64
+import io
+import os
+import random
+import warnings
+from types import ModuleType
+from typing import Any, Callable, Dict, Optional, Tuple, Union, cast
+
+from playwright._impl._errors import Error as PlaywrightError
+from playwright._impl._errors import TimeoutError
+from playwright.async_api import Download, Page
+
+from ._types import (
+    InteractiveRegion,
+    VisualViewport,
+    interactiveregion_from_dict,
+    visualviewport_from_dict,
+)
+
+markitdown: ModuleType | None = None
+try:
+    # Suppress warnings from markitdown -- which is pretty chatty
+    warnings.filterwarnings(action="ignore", module="markitdown")
+    import markitdown
+except ImportError:
+    pass
+
+
+class PlaywrightController:
+    """
+    A helper class to allow Playwright to interact with web pages to perform actions such as clicking, filling, and scrolling.
+
+    Args:
+        downloads_folder (str | None): The folder to save downloads to. If None, downloads are not saved.
+        animate_actions (bool): Whether to animate the actions (create fake cursor to click).
+        viewport_width (int): The width of the viewport.
+        viewport_height (int): The height of the viewport.
+        _download_handler (Optional[Callable[[Download], None]]): A function to handle downloads.
+        to_resize_viewport (bool): Whether to resize the viewport
+    """
+
+    def __init__(
+        self,
+        downloads_folder: str | None = None,
+        animate_actions: bool = False,
+        viewport_width: int = 1440,
+        viewport_height: int = 900,
+        _download_handler: Optional[Callable[[Download], None]] = None,
+        to_resize_viewport: bool = True,
+    ) -> None:
+        """
+        Initialize the PlaywrightController.
+        """
+        assert isinstance(animate_actions, bool)
+        assert isinstance(viewport_width, int)
+        assert isinstance(viewport_height, int)
+        assert viewport_height > 0
+        assert viewport_width > 0
+
+        self.animate_actions = animate_actions
+        self.downloads_folder = downloads_folder
+        self.viewport_width = viewport_width
+        self.viewport_height = viewport_height
+        self._download_handler = _download_handler
+        self.to_resize_viewport = to_resize_viewport
+        self._page_script: str = ""
+        self.last_cursor_position: Tuple[float, float] = (0.0, 0.0)
+        self._markdown_converter: Optional[Any] | None = None
+
+        # Read page_script
+        with open(
+            os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt", encoding="utf-8"
+        ) as fh:
+            self._page_script = fh.read()
+
+    async def sleep(self, page: Page, duration: Union[int, float]) -> None:
+        """
+        Pause the execution for a specified duration.
+
+        Args:
+            page (Page): The Playwright page object.
+            duration (Union[int, float]): The duration to sleep in seconds.
+        """
+        assert page is not None
+        # The value is scaled by 1000 before being handed to wait_for_timeout,
+        # i.e. callers pass seconds and Playwright receives milliseconds.
+        await page.wait_for_timeout(duration * 1000)
+
+    async def get_interactive_rects(self, page: Page) -> Dict[str, InteractiveRegion]:
+        """
+        Retrieve interactive regions from the web page.
+
+        Args:
+            page (Page): The Playwright page object.
+
+        Returns:
+            Dict[str, InteractiveRegion]: A dictionary of interactive regions.
+        """
+        assert page is not None
+        # Read the regions from the DOM
+        try:
+            await page.evaluate(self._page_script)
+        except Exception:
+            pass
+        result = cast(Dict[str, Dict[str, Any]], await page.evaluate("MultimodalWebSurfer.getInteractiveRects();"))
+
+        # Convert the results into appropriate types
+        assert isinstance(result, dict)
+        typed_results: Dict[str, InteractiveRegion] = {}
+        for k in result:
+            assert isinstance(k, str)
+            typed_results[k] = interactiveregion_from_dict(result[k])
+
+        return typed_results
+
+    async def get_visual_viewport(self, page: Page) -> VisualViewport:
+        """
+        Retrieve the visual viewport of the web page.
+
+        Args:
+            page (Page): The Playwright page object.
+
+        Returns:
+            VisualViewport: The visual viewport of the page.
+        """
+        assert page is not None
+        try:
+            await page.evaluate(self._page_script)
+        except Exception:
+            pass
+        return visualviewport_from_dict(await page.evaluate("MultimodalWebSurfer.getVisualViewport();"))
+
+    async def get_focused_rect_id(self, page: Page) -> str | None:
+        """
+        Retrieve the ID of the currently focused element.
+
+        Args:
+            page (Page): The Playwright page object.
+
+        Returns:
+            str: The ID of the focused element or None if no control has focus.
+        """
+        assert page is not None
+        try:
+            await page.evaluate(self._page_script)
+        except Exception:
+            pass
+        result = await page.evaluate("MultimodalWebSurfer.getFocusedElementId();")
+        return None if result is None else str(result)
+
+    async def get_page_metadata(self, page: Page) -> Dict[str, Any]:
+        """
+        Retrieve metadata from the web page.
+
+        Args:
+            page (Page): The Playwright page object.
+
+        Returns:
+            Dict[str, Any]: A dictionary of page metadata.
+        """
+        assert page is not None
+        try:
+            await page.evaluate(self._page_script)
+        except Exception:
+            pass
+        result = await page.evaluate("MultimodalWebSurfer.getPageMetadata();")
+        assert isinstance(result, dict)
+        return cast(Dict[str, Any], result)
+
+    async def on_new_page(self, page: Page) -> None:
+        """
+        Handle actions to perform on a new page.
+
+        Args:
+            page (Page): The Playwright page object.
+        """
+        assert page is not None
+        page.on("download", self._download_handler)  # type: ignore
+        if self.to_resize_viewport and self.viewport_width and self.viewport_height:
+            await page.set_viewport_size({"width": self.viewport_width, "height": self.viewport_height})
+        await self.sleep(page, 0.2)
+        await page.add_init_script(path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"))
+        await page.wait_for_load_state()
+
+    async def back(self, page: Page) -> None:
+        """
+        Navigate back to the previous page.
+
+        Args:
+            page (Page): The Playwright page object.
+        """
+        assert page is not None
+        # The value returned by go_back() is intentionally discarded.
+        await page.go_back()
+
+    async def visit_page(self, page: Page, url: str) -> Tuple[bool, bool]:
+        """
+        Visit a specified URL.
+
+        A normal navigation waits for the load state. If the navigation fails with
+        ``net::ERR_ABORTED`` and a downloads folder is configured, the URL is treated
+        as a file download: the file is saved into the downloads folder and the page
+        is pointed at a small generated HTML notice naming the saved path.
+
+        Args:
+            page (Page): The Playwright page object.
+            url (str): The URL to visit.
+
+        Returns:
+            Tuple[bool, bool]: A tuple indicating whether to reset prior metadata hash and last download.
+                ``(True, False)`` after a normal navigation, ``(False, True)`` after a download.
+
+        Raises:
+            Exception: Any navigation error that is not handled by the download fallback is re-raised.
+        """
+        assert page is not None
+        reset_prior_metadata_hash = False
+        reset_last_download = False
+        try:
+            # Regular webpage
+            await page.goto(url)
+            await page.wait_for_load_state()
+            reset_prior_metadata_hash = True
+        except Exception as e_outer:
+            # Downloaded file
+            if self.downloads_folder and "net::ERR_ABORTED" in str(e_outer):
+                # Navigate again, this time while listening for the download event.
+                async with page.expect_download() as download_info:
+                    try:
+                        await page.goto(url)
+                    except Exception as e_inner:
+                        # The aborted navigation is expected here; anything else is a real error.
+                        if "net::ERR_ABORTED" in str(e_inner):
+                            pass
+                        else:
+                            raise e_inner
+                    download = await download_info.value
+                    fname = os.path.join(self.downloads_folder, download.suggested_filename)
+                    await download.save_as(fname)
+                    # Replace the page content with a notice, delivered as a base64 data: URL.
+                    message = f"<body style=\"margin: 20px;\"><h1>Successfully downloaded '{download.suggested_filename}' to local path:<br><br>{fname}</h1></body>"
+                    await page.goto(
+                        "data:text/html;base64," + base64.b64encode(message.encode("utf-8")).decode("utf-8")
+                    )
+                    reset_last_download = True
+            else:
+                raise e_outer
+        return reset_prior_metadata_hash, reset_last_download
+
+    async def page_down(self, page: Page) -> None:
+        """
+        Scroll the page down by one viewport height minus 50 pixels.
+
+        Args:
+            page (Page): The Playwright page object.
+        """
+        assert page is not None
+        await page.evaluate(f"window.scrollBy(0, {self.viewport_height-50});")
+
+    async def page_up(self, page: Page) -> None:
+        """
+        Scroll the page up by one viewport height minus 50 pixels.
+
+        Args:
+            page (Page): The Playwright page object.
+        """
+        assert page is not None
+        await page.evaluate(f"window.scrollBy(0, -{self.viewport_height-50});")
+
+    async def gradual_cursor_animation(
+        self, page: Page, start_x: float, start_y: float, end_x: float, end_y: float
+    ) -> None:
+        """
+        Animate the cursor movement gradually from start to end coordinates.
+
+        Args:
+            page (Page): The Playwright page object.
+            start_x (float): The starting x-coordinate.
+            start_y (float): The starting y-coordinate.
+            end_x (float): The ending x-coordinate.
+            end_y (float): The ending y-coordinate.
+        """
+        # animation helper
+        steps = 20
+        for step in range(steps):
+            x = start_x + (end_x - start_x) * (step / steps)
+            y = start_y + (end_y - start_y) * (step / steps)
+            # await page.mouse.move(x, y, steps=1)
+            await page.evaluate(f"""
+                (function() {{
+                    let cursor = document.getElementById('red-cursor');
+                    cursor.style.left = '{x}px';
+                    cursor.style.top = '{y}px';
+                }})();
+            """)
+            await asyncio.sleep(0.05)
+
+        self.last_cursor_position = (end_x, end_y)
+
+    async def add_cursor_box(self, page: Page, identifier: str) -> None:
+        """
+        Add a red cursor box around the element with the given identifier.
+
+        Args:
+            page (Page): The Playwright page object.
+            identifier (str): The element identifier.
+        """
+        # animation helper
+        await page.evaluate(f"""
+            (function() {{
+                let elm = document.querySelector("[__elementId='{identifier}']");
+                if (elm) {{
+                    elm.style.transition = 'border 0.3s ease-in-out';
+                    elm.style.border = '2px solid red';
+                }}
+            }})();
+        """)
+        await asyncio.sleep(0.3)
+
+        # Create a red cursor
+        await page.evaluate("""
+            (function() {
+                let cursor = document.createElement('div');
+                cursor.id = 'red-cursor';
+                cursor.style.width = '10px';
+                cursor.style.height = '10px';
+                cursor.style.backgroundColor = 'red';
+                cursor.style.position = 'absolute';
+                cursor.style.borderRadius = '50%';
+                cursor.style.zIndex = '10000';
+                document.body.appendChild(cursor);
+            })();
+        """)
+
+    async def remove_cursor_box(self, page: Page, identifier: str) -> None:
+        """
+        Remove the red cursor box around the element with the given identifier.
+
+        Args:
+            page (Page): The Playwright page object.
+            identifier (str): The element identifier.
+        """
+        # Remove the highlight and cursor
+        await page.evaluate(f"""
+            (function() {{
+                let elm = document.querySelector("[__elementId='{identifier}']");
+                if (elm) {{
+                    elm.style.border = '';
+                }}
+                let cursor = document.getElementById('red-cursor');
+                if (cursor) {{
+                    cursor.remove();
+                }}
+            }})();
+        """)
+
+    async def click_id(self, page: Page, identifier: str) -> Page | None:
+        """
+        Click the element with the given identifier.
+
+        Args:
+            page (Page): The Playwright page object.
+            identifier (str): The element identifier.
+
+        Returns:
+            Page | None: The new page if a new page is opened, otherwise None.
+        """
+        new_page: Page | None = None
+        assert page is not None
+        target = page.locator(f"[__elementId='{identifier}']")
+
+        # See if it exists
+        try:
+            await target.wait_for(timeout=5000)
+        except TimeoutError:
+            raise ValueError("No such element.") from None
+
+        # Click it
+        await target.scroll_into_view_if_needed()
+        await asyncio.sleep(0.3)
+
+        box = cast(Dict[str, Union[int, float]], await target.bounding_box())
+
+        if self.animate_actions:
+            await self.add_cursor_box(page, identifier)
+            # Move cursor to the box slowly
+            start_x, start_y = self.last_cursor_position
+            end_x, end_y = box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
+            await self.gradual_cursor_animation(page, start_x, start_y, end_x, end_y)
+            await asyncio.sleep(0.1)
+
+            try:
+                # Give it a chance to open a new page
+                async with page.expect_event("popup", timeout=1000) as page_info:  # type: ignore
+                    await page.mouse.click(end_x, end_y, delay=10)
+                    new_page = await page_info.value  # type: ignore
+                    assert isinstance(new_page, Page)
+                    await self.on_new_page(new_page)
+            except TimeoutError:
+                pass
+            await self.remove_cursor_box(page, identifier)
+
+        else:
+            try:
+                # Give it a chance to open a new page
+                async with page.expect_event("popup", timeout=1000) as page_info:  # type: ignore
+                    await page.mouse.click(box["x"] + box["width"] / 2, box["y"] + box["height"] / 2, delay=10)
+                    new_page = await page_info.value  # type: ignore
+                    assert isinstance(new_page, Page)
+                    await self.on_new_page(new_page)
+            except TimeoutError:
+                pass
+        return new_page  # type: ignore
+
+    async def hover_id(self, page: Page, identifier: str) -> None:
+        """
+        Hover the mouse over the element with the given identifier.
+
+        Args:
+            page (Page): The Playwright page object.
+            identifier (str): The element identifier.
+        """
+        assert page is not None
+        target = page.locator(f"[__elementId='{identifier}']")
+
+        # See if it exists
+        try:
+            await target.wait_for(timeout=5000)
+        except TimeoutError:
+            raise ValueError("No such element.") from None
+
+        # Hover over it
+        await target.scroll_into_view_if_needed()
+        await asyncio.sleep(0.3)
+
+        box = cast(Dict[str, Union[int, float]], await target.bounding_box())
+
+        if self.animate_actions:
+            await self.add_cursor_box(page, identifier)
+            # Move cursor to the box slowly
+            start_x, start_y = self.last_cursor_position
+            end_x, end_y = box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
+            await self.gradual_cursor_animation(page, start_x, start_y, end_x, end_y)
+            await asyncio.sleep(0.1)
+            await page.mouse.move(box["x"] + box["width"] / 2, box["y"] + box["height"] / 2)
+
+            await self.remove_cursor_box(page, identifier)
+        else:
+            await page.mouse.move(box["x"] + box["width"] / 2, box["y"] + box["height"] / 2)
+
+    async def fill_id(self, page: Page, identifier: str, value: str, press_enter: bool = True) -> None:
+        """
+        Fill the element with the given identifier with the specified value.
+
+        Args:
+            page (Page): The Playwright page object.
+            identifier (str): The element identifier.
+            value (str): The value to fill.
+        """
+        assert page is not None
+        target = page.locator(f"[__elementId='{identifier}']")
+
+        # See if it exists
+        try:
+            await target.wait_for(timeout=5000)
+        except TimeoutError:
+            raise ValueError("No such element.") from None
+
+        # Fill it
+        await target.scroll_into_view_if_needed()
+        box = cast(Dict[str, Union[int, float]], await target.bounding_box())
+
+        if self.animate_actions:
+            await self.add_cursor_box(page, identifier)
+            # Move cursor to the box slowly
+            start_x, start_y = self.last_cursor_position
+            end_x, end_y = box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
+            await self.gradual_cursor_animation(page, start_x, start_y, end_x, end_y)
+            await asyncio.sleep(0.1)
+
+        # Focus on the element
+        await target.focus()
+        if self.animate_actions:
+            # fill char by char to mimic human speed for short text and type fast for long text
+            if len(value) < 100:
+                delay_typing_speed = 50 + 100 * random.random()
+            else:
+                delay_typing_speed = 10
+            await target.press_sequentially(value, delay=delay_typing_speed)
+        else:
+            try:
+                await target.fill(value)
+            except PlaywrightError:
+                await target.press_sequentially(value)
+        if press_enter:
+            await target.press("Enter")
+
+        if self.animate_actions:
+            await self.remove_cursor_box(page, identifier)
+
+    async def scroll_id(self, page: Page, identifier: str, direction: str) -> None:
+        """
+        Scroll the element with the given identifier in the specified direction.
+
+        Args:
+            page (Page): The Playwright page object.
+            identifier (str): The element identifier.
+            direction (str): The direction to scroll ("up" or "down").
+        """
+        assert page is not None
+        await page.evaluate(
+            f"""
+        (function() {{
+            let elm = document.querySelector("[__elementId='{identifier}']");
+            if (elm) {{
+                if ("{direction}" == "up") {{
+                    elm.scrollTop = Math.max(0, elm.scrollTop - elm.clientHeight);
+                }}
+                else {{
+                    elm.scrollTop = Math.min(elm.scrollHeight - elm.clientHeight, elm.scrollTop + elm.clientHeight);
+                }}
+            }}
+        }})();
+    """
+        )
+
+    async def get_webpage_text(self, page: Page, n_lines: int = 50) -> str:
+        """
+        Retrieve the text content of the web page.
+
+        Args:
+            page (Page): The Playwright page object.
+            n_lines (int): The number of lines to return from the page inner text.
+
+        Returns:
+            str: The text content of the page.
+        """
+        assert page is not None
+        try:
+            text_in_viewport = await page.evaluate("""() => {
+                return document.body.innerText;
+            }""")
+            text_in_viewport = "\n".join(text_in_viewport.split("\n")[:n_lines])
+            # remove empty lines
+            text_in_viewport = "\n".join([line for line in text_in_viewport.split("\n") if line.strip()])
+            assert isinstance(text_in_viewport, str)
+            return text_in_viewport
+        except Exception:
+            return ""
+
+    async def get_visible_text(self, page: Page) -> str:
+        """
+        Retrieve the text content of the browser viewport (approximately).
+
+        Args:
+            page (Page): The Playwright page object.
+
+        Returns:
+            str: The text content of the page.
+        """
+        assert page is not None
+        try:
+            await page.evaluate(self._page_script)
+        except Exception:
+            pass
+        result = await page.evaluate("MultimodalWebSurfer.getVisibleText();")
+        assert isinstance(result, str)
+        return result
+
+    async def get_page_markdown(self, page: Page) -> str:
+        """
+        Retrieve the markdown content of the web page.
+        Currently not implemented.
+
+        Args:
+            page (Page): The Playwright page object.
+
+        Returns:
+            str: The markdown content of the page.
+        """
+        assert page is not None
+        if self._markdown_converter is None and markitdown is not None:
+            self._markdown_converter = markitdown.MarkItDown()
+            assert self._markdown_converter is not None
+            html = await page.evaluate("document.documentElement.outerHTML;")
+            res = self._markdown_converter.convert_stream(
+                io.BytesIO(html.encode("utf-8")), file_extension=".html", url=page.url
+            )
+            assert hasattr(res, "text_content") and isinstance(res.text_content, str)
+            return res.text_content
+        else:
+            return await self.get_webpage_text(page, n_lines=200)