first commit

2026-03-13 12:56:43 -07:00
commit 159cf9fcfe
309 changed files with 64584 additions and 0 deletions
--- a/agent_dhal/agentdhal_extensions/agents/video_surfer/tools.py
+++ b/agent_dhal/agentdhal_extensions/agents/video_surfer/tools.py
@@ -0,0 +1,156 @@
+import base64
+from typing import Any, Dict, List, Tuple
+
+import cv2
+import ffmpeg
+import numpy as np
+import whisper
+from agentdhal_core import Image as AGImage
+from agentdhal_core.models import (
+    ChatCompletionClient,
+    UserMessage,
+)
+
+
+def extract_audio(video_path: str, audio_output_path: str) -> str:
+    """
+    Extracts audio from a video file and saves it as an MP3 file.
+
+    :param video_path: Path to the video file.
+    :param audio_output_path: Path to save the extracted audio file.
+    :return: Confirmation message with the path to the saved audio file.
+    """
+    (ffmpeg.input(video_path).output(audio_output_path, format="mp3").run(quiet=True, overwrite_output=True))  # type: ignore
+    return f"Audio extracted and saved to {audio_output_path}."
+
+
+def transcribe_audio_with_timestamps(audio_path: str) -> str:
+    """
+    Transcribes the audio file with timestamps using the Whisper model.
+
+    :param audio_path: Path to the audio file.
+    :return: Transcription with timestamps.
+    """
+    model = whisper.load_model("base")  # type: ignore
+    result: Dict[str, Any] = model.transcribe(audio_path, task="transcribe", language="en", verbose=False)  # type: ignore
+
+    segments: List[Dict[str, Any]] = result["segments"]
+    transcription_with_timestamps = ""
+
+    for segment in segments:
+        start: float = segment["start"]
+        end: float = segment["end"]
+        text: str = segment["text"]
+        transcription_with_timestamps += f"[{start:.2f} - {end:.2f}] {text}\n"
+
+    return transcription_with_timestamps
+
+
+def get_video_length(video_path: str) -> str:
+    """
+    Returns the length of the video in seconds.
+
+    :param video_path: Path to the video file.
+    :return: Duration of the video in seconds.
+    """
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise IOError(f"Cannot open video file {video_path}")
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
+    duration = frame_count / fps
+    cap.release()
+
+    return f"The video is {duration:.2f} seconds long."
+
+
+def save_screenshot(video_path: str, timestamp: float, output_path: str) -> None:
+    """
+    Captures a screenshot at the specified timestamp and saves it to the output path.
+
+    :param video_path: Path to the video file.
+    :param timestamp: Timestamp in seconds.
+    :param output_path: Path to save the screenshot. The file format is determined by the extension in the path.
+    """
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise IOError(f"Cannot open video file {video_path}")
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    frame_number = int(timestamp * fps)
+    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
+    ret, frame = cap.read()
+    if ret:
+        cv2.imwrite(output_path, frame)
+    else:
+        raise IOError(f"Failed to capture frame at {timestamp:.2f}s")
+    cap.release()
+
+
+async def transcribe_video_screenshot(video_path: str, timestamp: float, model_client: ChatCompletionClient) -> str:
+    """
+    Transcribes the content of a video screenshot captured at the specified timestamp using OpenAI API.
+
+    :param video_path: Path to the video file.
+    :param timestamp: Timestamp in seconds.
+    :param model_client: ChatCompletionClient instance.
+    :return: Description of the screenshot content.
+    """
+    screenshots = get_screenshot_at(video_path, [timestamp])
+    if not screenshots:
+        return "Failed to capture screenshot."
+
+    _, frame = screenshots[0]
+    # Convert the frame to bytes and then to base64 encoding
+    _, buffer = cv2.imencode(".jpg", frame)
+    frame_bytes = buffer.tobytes()
+    frame_base64 = base64.b64encode(frame_bytes).decode("utf-8")
+    screenshot_uri = f"data:image/jpeg;base64,{frame_base64}"
+
+    messages = [
+        UserMessage(
+            content=[
+                "Following is a screenshot from the video at {} seconds. Describe what you see here.",
+                AGImage.from_uri(screenshot_uri),
+            ],
+            source="tool",
+        )
+    ]
+
+    result = await model_client.create(messages=messages)
+    return str(result.content)
+
+
+def get_screenshot_at(video_path: str, timestamps: List[float]) -> List[Tuple[float, np.ndarray[Any, Any]]]:
+    """
+    Captures screenshots at the specified timestamps and returns them as Python objects.
+
+    :param video_path: Path to the video file.
+    :param timestamps: List of timestamps in seconds.
+    :return: List of tuples containing timestamp and the corresponding frame (image).
+             Each frame is a NumPy array (height x width x channels).
+    """
+    screenshots: List[Tuple[float, np.ndarray[Any, Any]]] = []
+
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise IOError(f"Cannot open video file {video_path}")
+
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
+    duration = total_frames / fps
+
+    for timestamp in timestamps:
+        if 0 <= timestamp <= duration:
+            frame_number = int(timestamp * fps)
+            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
+            ret, frame = cap.read()
+            if ret:
+                # Append the timestamp and frame to the list
+                screenshots.append((timestamp, frame))
+            else:
+                raise IOError(f"Failed to capture frame at {timestamp:.2f}s")
+        else:
+            raise ValueError(f"Timestamp {timestamp:.2f}s is out of range [0s, {duration:.2f}s]")
+
+    cap.release()
+    return screenshots