diff --git a/atomic-agents/atomic_agents/context/chat_history.py b/atomic-agents/atomic_agents/context/chat_history.py index 13a18a84..c511ef2a 100644 --- a/atomic-agents/atomic_agents/context/chat_history.py +++ b/atomic-agents/atomic_agents/context/chat_history.py @@ -1,16 +1,109 @@ +from __future__ import annotations + import json import uuid +from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Dict, List, Optional, Type +from typing import TYPE_CHECKING, Any -from instructor.multimodal import PDF, Image, Audio +from instructor.processing.multimodal import PDF, Image, Audio from pydantic import BaseModel, Field from atomic_agents.base.base_io_schema import BaseIOSchema +if TYPE_CHECKING: + from typing import Type + + +MULTIMODAL_TYPES = (Image, Audio, PDF) + + +@dataclass +class MultimodalContent: + """Result of extracting multimodal content from nested structures.""" + + objects: list = field(default_factory=list) + json_data: Any = None + + @property + def has_multimodal(self) -> bool: + return len(self.objects) > 0 + -INSTRUCTOR_MULTIMODAL_TYPES = (Image, Audio, PDF) +def _extract_multimodal_content(obj: Any, _seen: set[int] | None = None) -> MultimodalContent: + """ + Single-pass extraction of multimodal content from nested structures. + + Returns both the multimodal objects and a JSON-serializable representation + with multimodal content removed. + """ + if _seen is None: + _seen = set() + + match obj: + case Image() | Audio() | PDF(): + return MultimodalContent(objects=[obj], json_data=None) + + case list(): + if id(obj) in _seen: + return MultimodalContent() + _seen.add(id(obj)) + + all_objects = [] + json_items = [] + for item in obj: + result = _extract_multimodal_content(item, _seen) + all_objects.extend(result.objects) + if result.json_data is not None: + json_items.append(result.json_data) + + return MultimodalContent( + objects=all_objects, + json_data=json_items or None, + ) + + case dict(): + if id(obj) in _seen: + return MultimodalContent() + _seen.add(id(obj)) + + all_objects = [] + json_dict = {} + for key, value in obj.items(): + result = _extract_multimodal_content(value, _seen) + all_objects.extend(result.objects) + if result.json_data is not None: + json_dict[key] = result.json_data + + return MultimodalContent( + objects=all_objects, + json_data=json_dict or None, + ) + + case BaseModel(): + if id(obj) in _seen: + return MultimodalContent() + _seen.add(id(obj)) + + all_objects = [] + json_dict = {} + for field_name in type(obj).model_fields: + result = _extract_multimodal_content(getattr(obj, field_name), _seen) + all_objects.extend(result.objects) + if result.json_data is not None: + json_dict[field_name] = result.json_data + + return MultimodalContent( + objects=all_objects, + json_data=json_dict or None, + ) + + case _ if hasattr(obj, "model_dump"): + return MultimodalContent(json_data=obj.model_dump()) + + case _: + return MultimodalContent(json_data=obj) class Message(BaseModel): @@ -25,7 +118,7 @@ class Message(BaseModel): role: str content: BaseIOSchema - turn_id: Optional[str] = None + turn_id: str | None = None class ChatHistory: @@ -33,22 +126,21 @@ class ChatHistory: Manages the chat history for an AI agent. Attributes: - history (List[Message]): A list of messages representing the chat history. - max_messages (Optional[int]): Maximum number of messages to keep in history. - current_turn_id (Optional[str]): The ID of the current turn. + history: A list of messages representing the chat history. 
+ max_messages: Maximum number of messages to keep in history. + current_turn_id: The ID of the current turn. """ - def __init__(self, max_messages: Optional[int] = None): + def __init__(self, max_messages: int | None = None): """ Initializes the ChatHistory with an empty history and optional constraints. Args: - max_messages (Optional[int]): Maximum number of messages to keep in history. - When exceeded, oldest messages are removed first. + max_messages: Maximum number of messages to keep. Oldest removed first. """ - self.history: List[Message] = [] + self.history: list[Message] = [] self.max_messages = max_messages - self.current_turn_id: Optional[str] = None + self.current_turn_id: str | None = None def initialize_turn(self) -> None: """ @@ -87,71 +179,35 @@ def _manage_overflow(self) -> None: while len(self.history) > self.max_messages: self.history.pop(0) - def get_history(self) -> List[Dict]: + def get_history(self) -> list[dict]: """ Retrieves the chat history, handling both regular and multimodal content. + This method supports multimodal content (Image, Audio, PDF) including when + nested within other schemas. Multimodal objects are kept separate from + the JSON serialization to allow Instructor to handle them appropriately. + Returns: List[Dict]: The list of messages in the chat history as dictionaries. Each dictionary has 'role' and 'content' keys, where 'content' contains either a single JSON string or a mixed array of JSON and multimodal objects. Note: - This method supports multimodal content by keeping multimodal objects - separate while generating cohesive JSON for text-based fields. + This method supports nested multimodal content by recursively detecting + and extracting multimodal objects from any level of the schema hierarchy. """ history = [] for message in self.history: - input_content = message.content - - # Check if content has any multimodal fields - multimodal_objects = [] - has_multimodal = False - - # Extract multimodal content first - for field_name, field in input_content.__class__.model_fields.items(): - field_value = getattr(input_content, field_name) - - if isinstance(field_value, list): - for item in field_value: - if isinstance(item, INSTRUCTOR_MULTIMODAL_TYPES): - multimodal_objects.append(item) - has_multimodal = True - elif isinstance(field_value, INSTRUCTOR_MULTIMODAL_TYPES): - multimodal_objects.append(field_value) - has_multimodal = True - - if has_multimodal: - # For multimodal content: create mixed array with JSON + multimodal objects - processed_content = [] - - # Add single cohesive JSON for all non-multimodal fields - non_multimodal_data = {} - for field_name, field in input_content.__class__.model_fields.items(): - field_value = getattr(input_content, field_name) - - if isinstance(field_value, list): - # Only include non-multimodal items from lists - non_multimodal_items = [ - item for item in field_value if not isinstance(item, INSTRUCTOR_MULTIMODAL_TYPES) - ] - if non_multimodal_items: - non_multimodal_data[field_name] = non_multimodal_items - elif not isinstance(field_value, INSTRUCTOR_MULTIMODAL_TYPES): - non_multimodal_data[field_name] = field_value - - # Add single JSON string if there are non-multimodal fields - if non_multimodal_data: - processed_content.append(json.dumps(non_multimodal_data, ensure_ascii=False)) - - # Add all multimodal objects - processed_content.extend(multimodal_objects) - - history.append({"role": message.role, "content": processed_content}) + extracted = _extract_multimodal_content(message.content) + + if 
extracted.has_multimodal: + content = [] + if extracted.json_data: + content.append(json.dumps(extracted.json_data, ensure_ascii=False)) + content.extend(extracted.objects) + history.append({"role": message.role, "content": content}) else: - # No multimodal content: generate single cohesive JSON string - content_json = input_content.model_dump_json() - history.append({"role": message.role, "content": content_json}) + history.append({"role": message.role, "content": message.content.model_dump_json()}) return history @@ -167,7 +223,7 @@ def copy(self) -> "ChatHistory": new_history.current_turn_id = self.current_turn_id return new_history - def get_current_turn_id(self) -> Optional[str]: + def get_current_turn_id(self) -> str | None: """ Returns the current turn ID. @@ -352,7 +408,7 @@ class MultimodalSchema(BaseIOSchema): """Schema for testing multimodal content""" instruction_text: str = Field(..., description="The instruction text") - images: List[instructor.Image] = Field(..., description="The images to analyze") + images: list[instructor.Image] = Field(..., description="The images to analyze") # Create and populate the original history with complex data original_history = ChatHistory(max_messages=10) @@ -409,8 +465,8 @@ class MultimodalSchema(BaseIOSchema): print(f"Turn ID: {message.turn_id}") print(f"Content type: {type(message.content).__name__}") print("Content:") - for field, value in message.content.model_dump().items(): - print(f" {field}: {value}") + for field_name, value in message.content.model_dump().items(): + print(f" {field_name}: {value}") # Final verification print("\nFinal verification:") diff --git a/atomic-agents/tests/context/test_chat_history.py b/atomic-agents/tests/context/test_chat_history.py index 91d3f77d..f037cb0e 100644 --- a/atomic-agents/tests/context/test_chat_history.py +++ b/atomic-agents/tests/context/test_chat_history.py @@ -8,6 +8,7 @@ from atomic_agents.context import ChatHistory, Message from atomic_agents import BaseIOSchema import instructor +from instructor.processing.multimodal import Image, PDF, Audio class InputSchema(BaseIOSchema): @@ -50,9 +51,9 @@ class MockMultimodalSchema(BaseIOSchema): """Test schema for multimodal content""" instruction_text: str = Field(..., description="The instruction text") - images: List[instructor.Image] = Field(..., description="The images to analyze") - pdfs: List[instructor.multimodal.PDF] = Field(..., description="The PDFs to analyze") - audio: instructor.multimodal.Audio = Field(..., description="The audio to analyze") + images: List[Image] = Field(..., description="The images to analyze") + pdfs: List[PDF] = Field(..., description="The PDFs to analyze") + audio: Audio = Field(..., description="The audio to analyze") class ColorEnum(str, Enum): @@ -218,8 +219,8 @@ def test_dump_and_load_multimodal_data(history): base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) test_image = instructor.Image.from_path(path=os.path.join(base_path, "files/image_sample.jpg")) - test_pdf = instructor.multimodal.PDF.from_path(path=os.path.join(base_path, "files/pdf_sample.pdf")) - test_audio = instructor.multimodal.Audio.from_path(path=os.path.join(base_path, "files/audio_sample.mp3")) + test_pdf = PDF.from_path(path=os.path.join(base_path, "files/pdf_sample.pdf")) + test_audio = Audio.from_path(path=os.path.join(base_path, "files/audio_sample.mp3")) # multimodal message history.add_message( @@ -361,8 +362,8 @@ def test_get_history_with_multimodal_content(history): """Test that get_history correctly handles 
multimodal content""" # Create mock multimodal objects mock_image = instructor.Image(source="test_url", media_type="image/jpeg", detail="low") - mock_pdf = instructor.multimodal.PDF(source="test_pdf_url", media_type="application/pdf", detail="low") - mock_audio = instructor.multimodal.Audio(source="test_audio_url", media_type="audio/mp3", detail="low") + mock_pdf = PDF(source="test_pdf_url", media_type="application/pdf", detail="low") + mock_audio = Audio(source="test_audio_url", media_type="audio/mp3", detail="low") # Add a multimodal message history.add_message( @@ -454,7 +455,7 @@ def test_process_multimodal_paths_comprehensive(): image_file = instructor.Image(source="test/image.jpg", media_type="image/jpeg") image_url = instructor.Image(source="https://example.com/image.jpg", media_type="image/jpeg") image_data = instructor.Image(source="data:image/jpeg;base64,xyz", media_type="image/jpeg") - pdf_file = instructor.multimodal.PDF(source="test/doc.pdf", media_type="application/pdf") + pdf_file = PDF(source="test/doc.pdf", media_type="application/pdf") history._process_multimodal_paths(image_file) history._process_multimodal_paths(image_url) @@ -521,8 +522,8 @@ class TestEnum(Enum): MockMultimodalSchema( instruction_text="Process this file", images=[instructor.Image(source="test/sample.jpg", media_type="image/jpeg")], - pdfs=[instructor.multimodal.PDF(source="test/doc.pdf", media_type="application/pdf")], - audio=instructor.multimodal.Audio(source="test/audio.mp3", media_type="audio/mp3"), + pdfs=[PDF(source="test/doc.pdf", media_type="application/pdf")], + audio=Audio(source="test/audio.mp3", media_type="audio/mp3"), ), ) @@ -538,3 +539,218 @@ class TestEnum(Enum): assert loaded_content.images[0].source == Path("test/sample.jpg") assert isinstance(loaded_content.pdfs[0].source, Path) assert loaded_content.pdfs[0].source == Path("test/doc.pdf") + + +# ======================================== +# Tests for nested multimodal content (GitHub Issue #141) +# ======================================== + + +class DocumentWithPDF(BaseIOSchema): + """PDF document with owner - for testing nested multimodal content.""" + + pdf: PDF = Field(..., description="The PDF data") + owner: str = Field(..., description="The PDF owner") + + +class DocumentWithImage(BaseIOSchema): + """Image document with metadata - for testing nested multimodal content.""" + + image: instructor.Image = Field(..., description="The image data") + description: str = Field(..., description="The image description") + + +class NestedDocumentsInputSchema(BaseIOSchema): + """A list of documents to analyze - for testing nested multimodal content.""" + + documents: List[DocumentWithPDF] = Field(..., description="List of documents") + query: str = Field(..., description="The query about the documents") + + +class DeeplyNestedSchema(BaseIOSchema): + """Schema with multiple levels of nesting.""" + + title: str = Field(..., description="The title") + nested_doc: DocumentWithImage = Field(..., description="A nested document with image") + more_docs: List[DocumentWithPDF] = Field(..., description="More nested documents") + + +class DictWithNestedMultimodal(BaseIOSchema): + """Schema with dict containing nested multimodal content.""" + + metadata: str = Field(..., description="Some metadata") + documents_by_name: Dict[str, DocumentWithPDF] = Field(..., description="Documents keyed by name") + + +def test_get_history_with_nested_multimodal_content(history): + """Test that get_history correctly handles nested multimodal content (Issue #141).""" + 
mock_pdf_1 = PDF(source="test_pdf_1.pdf", media_type="application/pdf", detail="low") + mock_pdf_2 = PDF(source="test_pdf_2.pdf", media_type="application/pdf", detail="low") + + # Create nested documents with PDFs + doc1 = DocumentWithPDF(pdf=mock_pdf_1, owner="Alice") + doc2 = DocumentWithPDF(pdf=mock_pdf_2, owner="Bob") + + # Add a message with nested multimodal content + history.add_message( + "user", + NestedDocumentsInputSchema( + documents=[doc1, doc2], + query="Summarize these documents", + ), + ) + + # Get history and verify format + result = history.get_history() + assert len(result) == 1 + assert result[0]["role"] == "user" + assert isinstance(result[0]["content"], list) + + # Should have JSON with the non-multimodal content + json_content = json.loads(result[0]["content"][0]) + assert json_content["query"] == "Summarize these documents" + # The nested documents should be present but without the PDF content + assert "documents" in json_content + assert len(json_content["documents"]) == 2 + assert json_content["documents"][0]["owner"] == "Alice" + assert json_content["documents"][1]["owner"] == "Bob" + # PDF fields should not be in the JSON + assert "pdf" not in json_content["documents"][0] + assert "pdf" not in json_content["documents"][1] + + # The multimodal objects should be in the content list + assert mock_pdf_1 in result[0]["content"] + assert mock_pdf_2 in result[0]["content"] + + +def test_get_history_with_deeply_nested_multimodal_content(history): + """Test that get_history handles multiple levels of nested multimodal content.""" + mock_image = instructor.Image(source="nested_image.jpg", media_type="image/jpeg", detail="low") + mock_pdf_1 = PDF(source="nested_pdf_1.pdf", media_type="application/pdf", detail="low") + mock_pdf_2 = PDF(source="nested_pdf_2.pdf", media_type="application/pdf", detail="low") + + # Create deeply nested content + history.add_message( + "user", + DeeplyNestedSchema( + title="Test Report", + nested_doc=DocumentWithImage(image=mock_image, description="A test image"), + more_docs=[ + DocumentWithPDF(pdf=mock_pdf_1, owner="Charlie"), + DocumentWithPDF(pdf=mock_pdf_2, owner="Diana"), + ], + ), + ) + + # Get history and verify format + result = history.get_history() + assert len(result) == 1 + assert isinstance(result[0]["content"], list) + + # Verify JSON content structure + json_content = json.loads(result[0]["content"][0]) + assert json_content["title"] == "Test Report" + assert json_content["nested_doc"]["description"] == "A test image" + assert "image" not in json_content["nested_doc"] # Image should be extracted + assert len(json_content["more_docs"]) == 2 + assert json_content["more_docs"][0]["owner"] == "Charlie" + assert json_content["more_docs"][1]["owner"] == "Diana" + + # All multimodal objects should be extracted + assert mock_image in result[0]["content"] + assert mock_pdf_1 in result[0]["content"] + assert mock_pdf_2 in result[0]["content"] + + +def test_get_history_with_dict_nested_multimodal_content(history): + """Test that get_history handles nested multimodal content in dicts.""" + mock_pdf_1 = PDF(source="dict_pdf_1.pdf", media_type="application/pdf", detail="low") + mock_pdf_2 = PDF(source="dict_pdf_2.pdf", media_type="application/pdf", detail="low") + + history.add_message( + "user", + DictWithNestedMultimodal( + metadata="Important documents", + documents_by_name={ + "contract": DocumentWithPDF(pdf=mock_pdf_1, owner="Legal"), + "invoice": DocumentWithPDF(pdf=mock_pdf_2, owner="Finance"), + }, + ), + ) + + result = 
history.get_history() + assert len(result) == 1 + assert isinstance(result[0]["content"], list) + + # Verify JSON content + json_content = json.loads(result[0]["content"][0]) + assert json_content["metadata"] == "Important documents" + assert "documents_by_name" in json_content + assert json_content["documents_by_name"]["contract"]["owner"] == "Legal" + assert json_content["documents_by_name"]["invoice"]["owner"] == "Finance" + + # Multimodal objects should be extracted + assert mock_pdf_1 in result[0]["content"] + assert mock_pdf_2 in result[0]["content"] + + +def test_extract_multimodal_content(): + """Test the unified multimodal content extraction function.""" + from atomic_agents.context.chat_history import _extract_multimodal_content + + mock_pdf = PDF(source="test.pdf", media_type="application/pdf", detail="low") + mock_image = instructor.Image(source="test.jpg", media_type="image/jpeg", detail="low") + + # Direct multimodal - has_multimodal=True, json_data=None + result = _extract_multimodal_content(mock_pdf) + assert result.has_multimodal is True + assert result.objects == [mock_pdf] + assert result.json_data is None + + # Non-multimodal - has_multimodal=False, json_data preserved + result = _extract_multimodal_content("string") + assert result.has_multimodal is False + assert result.json_data == "string" + + result = _extract_multimodal_content({"key": "value"}) + assert result.has_multimodal is False + assert result.json_data == {"key": "value"} + + # Nested in list + result = _extract_multimodal_content([mock_pdf, "text"]) + assert result.has_multimodal is True + assert mock_pdf in result.objects + assert result.json_data == ["text"] + + # Nested in dict + result = _extract_multimodal_content({"pdf": mock_pdf, "text": "value"}) + assert result.has_multimodal is True + assert mock_pdf in result.objects + assert result.json_data == {"text": "value"} + + # Nested Pydantic model + doc = DocumentWithPDF(pdf=mock_pdf, owner="Alice") + result = _extract_multimodal_content(doc) + assert result.has_multimodal is True + assert result.objects == [mock_pdf] + assert result.json_data == {"owner": "Alice"} + + # Deeply nested structure + mock_pdf_2 = PDF(source="test2.pdf", media_type="application/pdf", detail="low") + nested = DeeplyNestedSchema( + title="Test", + nested_doc=DocumentWithImage(image=mock_image, description="desc"), + more_docs=[ + DocumentWithPDF(pdf=mock_pdf, owner="A"), + DocumentWithPDF(pdf=mock_pdf_2, owner="B"), + ], + ) + result = _extract_multimodal_content(nested) + assert result.has_multimodal is True + assert len(result.objects) == 3 + assert mock_image in result.objects + assert mock_pdf in result.objects + assert mock_pdf_2 in result.objects + assert result.json_data["title"] == "Test" + assert result.json_data["nested_doc"]["description"] == "desc" + assert "image" not in result.json_data["nested_doc"] diff --git a/atomic-examples/nested-multimodal/README.md b/atomic-examples/nested-multimodal/README.md new file mode 100644 index 00000000..97a11515 --- /dev/null +++ b/atomic-examples/nested-multimodal/README.md @@ -0,0 +1,76 @@ +# Nested Multimodal Content Example + +Analyze multiple images (or PDFs/audio) in a single request using nested schemas. + +## What This Does + +Pass a **list of documents** - each containing an image plus metadata - to an agent that analyzes them all and provides a comparative summary. + +```python +class ImageDocument(BaseIOSchema): + image: Image = Field(...) + owner: str = Field(...) + category: str = Field(...) 
+ +class Input(BaseIOSchema): + documents: list[ImageDocument] = Field(...) # Multiple images with metadata + query: str = Field(...) + +# Analyze multiple images at once +result = agent.run(Input( + documents=[doc1, doc2, doc3], + query="Compare these images" +)) +``` + +## Setup + +```bash +cd atomic-examples/nested-multimodal +uv sync +``` + +Set your API key in `.env`: +``` +OPENAI_API_KEY=your_key_here +# or +GEMINI_API_KEY=your_key_here +``` + +## Run + +```bash +uv run python nested_multimodal/main.py +``` + +## Example Output + +``` +Using OpenAI GPT-5.1 + +Creating nested document structure... + - Document 1: Image owned by 'Marketing Team', category 'random photo' + - Document 2: Image owned by 'Content Team', category 'random photo' + +============================================================ +ANALYSIS RESULTS +============================================================ + +Image 1: + Owner: Marketing Team + Description: A black-and-white mountain valley with dramatic lighting... + Dominant Colors: black, white, gray + Key Elements: mountain slopes, valley, diagonal light beam + +Image 2: + Owner: Content Team + Description: Layered blue mountain ridges receding into distance... + Dominant Colors: various blues, soft white haze + Key Elements: overlapping ridges, atmospheric haze + +Comparative Summary: + Both images depict mountainous landscapes with atmospheric depth... + The first is high-contrast black-and-white, the second uses blue tones... + +SUCCESS: Nested multimodal content handled correctly! +``` diff --git a/atomic-examples/nested-multimodal/nested_multimodal/__init__.py b/atomic-examples/nested-multimodal/nested_multimodal/__init__.py new file mode 100644 index 00000000..3c0a0bd2 --- /dev/null +++ b/atomic-examples/nested-multimodal/nested_multimodal/__init__.py @@ -0,0 +1,2 @@ +# Nested Multimodal Example +# Demonstrates support for nested multimodal content in Atomic Agents (Issue #141) diff --git a/atomic-examples/nested-multimodal/nested_multimodal/main.py b/atomic-examples/nested-multimodal/nested_multimodal/main.py new file mode 100644 index 00000000..636fd98e --- /dev/null +++ b/atomic-examples/nested-multimodal/nested_multimodal/main.py @@ -0,0 +1,247 @@ +""" +Nested Multimodal Content Example + +This example demonstrates support for nested multimodal content in Atomic Agents, +as implemented for GitHub Issue #141: "AgentMemory: support nested multimodal data" + +The key scenario demonstrated here is: +- A Document schema containing an Image field AND metadata (owner) +- An InputSchema containing a LIST of Documents +- The agent correctly processes all nested Images and extracts information from each + +Previously, nested multimodal content like this was flattened into the message JSON by json.dumps, +so the images never reached the model as native multimodal content. Now the ChatHistory recursively detects and extracts multimodal content at any depth.
+ +This example supports both: +- OpenAI (GPT-5.1) with OPENAI_API_KEY +- Google Gemini with GEMINI_API_KEY +""" + +import os +from pathlib import Path +from typing import List + +import instructor +from openai import OpenAI +from atomic_agents import AtomicAgent, AgentConfig, BaseIOSchema +from atomic_agents.context import SystemPromptGenerator +from instructor.processing.multimodal import Image +from pydantic import Field + + +def _load_env(): + """Load .env file from current or parent directories.""" + for directory in [Path.cwd(), *Path.cwd().parents]: + env_file = directory / ".env" + if env_file.exists(): + for line in env_file.read_text().splitlines(): + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + os.environ.setdefault(key.strip(), value.strip().strip("\"'")) + break + + +_load_env() + + +# ============================================================================= +# Schema Definitions - Demonstrating Nested Multimodal Content (Issue #141) +# ============================================================================= + + +class ImageDocument(BaseIOSchema): + """ + A document with Image content and metadata. + + This is the KEY nested structure - the Image is inside this schema, + not at the top level of the InputSchema. + """ + + image: Image = Field(..., description="The image content") + owner: str = Field(..., description="The owner/author of this image") + category: str = Field(..., description="Category of the image (e.g., 'logo', 'photo', 'diagram')") + + +class NestedMultimodalInput(BaseIOSchema): + """ + Input schema with nested multimodal content. + + This demonstrates Issue #141 - multimodal content (Images) nested within + a list of ImageDocument objects, not at the top level. 
+ """ + + documents: List[ImageDocument] = Field(..., description="List of image documents to analyze") + analysis_query: str = Field(..., description="What to analyze or compare across the images") + + +class ImageAnalysis(BaseIOSchema): + """Analysis result for a single image document.""" + + owner: str = Field(..., description="The owner of this image") + category: str = Field(..., description="The category of the image") + description: str = Field(..., description="Description of what's in the image") + dominant_colors: List[str] = Field(..., description="Main colors visible in the image") + key_elements: List[str] = Field(..., description="Key visual elements identified") + + +class AnalysisResult(BaseIOSchema): + """Combined analysis of all image documents.""" + + image_analyses: List[ImageAnalysis] = Field(..., description="Analysis of each individual image") + comparative_summary: str = Field(..., description="Comparative analysis addressing the user's query") + + +# ============================================================================= +# Agent Setup with Provider Auto-Detection +# ============================================================================= + + +def create_agent(): + """Create the image analysis agent with auto-detected provider.""" + + # Try OpenAI first, then Gemini + openai_key = os.getenv("OPENAI_API_KEY") + gemini_key = os.getenv("GEMINI_API_KEY") + + if openai_key: + print("Using OpenAI GPT-5.1") + client = instructor.from_openai(OpenAI(api_key=openai_key)) + model = "gpt-5.1" + elif gemini_key: + print("Using Google Gemini") + from google import genai + + client = instructor.from_genai(client=genai.Client(api_key=gemini_key), mode=instructor.Mode.GENAI_TOOLS) + model = "gemini-2.0-flash" + else: + raise ValueError("No API key found. Please set OPENAI_API_KEY or GEMINI_API_KEY in your .env file.") + + system_prompt_generator = SystemPromptGenerator( + background=[ + "You are an image analysis expert.", + "You can analyze multiple images and provide comparative insights.", + ], + steps=[ + "For each image document in the input, analyze and describe what you see.", + "Consider the owner and category metadata provided for each image.", + "Identify key visual elements, colors, and notable features.", + "After analyzing all images, provide a comparative summary based on the user's query.", + ], + output_instructions=[ + "Return detailed analysis for each image.", + "Include a comparative summary that addresses the user's specific query.", + ], + ) + + agent = AtomicAgent[NestedMultimodalInput, AnalysisResult]( + config=AgentConfig( + client=client, + model=model, + system_prompt_generator=system_prompt_generator, + input_schema=NestedMultimodalInput, + output_schema=AnalysisResult, + ) + ) + + return agent + + +def main(): + """ + Main function demonstrating nested multimodal content handling. + + This creates multiple ImageDocument objects, each containing an Image, + and passes them as a list to the agent - the exact scenario from Issue #141. 
+ """ + print("=" * 60) + print("Nested Multimodal Content Example (Issue #141)") + print("=" * 60) + print() + + # Get the test image paths + script_directory = os.path.dirname(os.path.abspath(__file__)) + test_media_directory = os.path.join(os.path.dirname(script_directory), "test_media") + image_path1 = os.path.join(test_media_directory, "image_sample.jpg") + image_path2 = os.path.join(test_media_directory, "image_sample2.jpg") + + # Check for test images + if not os.path.exists(image_path1) or not os.path.exists(image_path2): + print(f"Error: Test images not found in {test_media_directory}") + print("Please ensure image_sample.jpg and image_sample2.jpg exist.") + return + + print(f"Using test images from: {test_media_directory}") + print() + + # Create the agent + try: + agent = create_agent() + except ValueError as e: + print(f"Setup error: {e}") + return + + print() + + # ========================================================================== + # KEY DEMONSTRATION: Nested multimodal content + # ========================================================================== + # We create MULTIPLE ImageDocument objects, each containing an Image. + # This is the exact scenario from Issue #141 that wasn't working before. + # The Images are nested inside ImageDocument schemas, inside a list. + + print("Creating nested document structure...") + print(" - Document 1: Image owned by 'Marketing Team', category 'random photo'") + print(" - Document 2: Image owned by 'Content Team', category 'random photo'") + print() + + document1 = ImageDocument(image=Image.from_path(image_path1), owner="Marketing Team", category="random photo") + document2 = ImageDocument(image=Image.from_path(image_path2), owner="Content Team", category="random photo") + + # Create the nested input - this is what Issue #141 fixed + nested_input = NestedMultimodalInput( + documents=[document1, document2], + analysis_query="Compare these images and describe what you see in each one. 
Note any similarities or differences.", + ) + + print("Sending nested multimodal content to agent...") + print("(Previously this would fail due to incorrect JSON serialization)") + print() + + try: + # Run the agent with nested multimodal content + result = agent.run(nested_input) + + # Display results + print("=" * 60) + print("ANALYSIS RESULTS") + print("=" * 60) + print() + + for i, img_analysis in enumerate(result.image_analyses, 1): + print(f"Image {i}:") + print(f" Owner: {img_analysis.owner}") + print(f" Category: {img_analysis.category}") + print(f" Description: {img_analysis.description}") + print(f" Dominant Colors: {', '.join(img_analysis.dominant_colors)}") + print(f" Key Elements: {', '.join(img_analysis.key_elements)}") + print() + + print("Comparative Summary:") + print(f" {result.comparative_summary}") + print() + + print("=" * 60) + print("SUCCESS: Nested multimodal content handled correctly!") + print("=" * 60) + + except Exception as e: + print(f"Error during analysis: {e}") + import traceback + + traceback.print_exc() + raise + + +if __name__ == "__main__": + main() diff --git a/atomic-examples/nested-multimodal/pyproject.toml b/atomic-examples/nested-multimodal/pyproject.toml new file mode 100644 index 00000000..67575f54 --- /dev/null +++ b/atomic-examples/nested-multimodal/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["nested_multimodal"] + +[project] +name = "nested-multimodal" +version = "1.0.0" +description = "Example demonstrating nested multimodal content support in Atomic Agents (Issue #141)" +readme = "README.md" +authors = [ + { name = "Atomic Agents Team" } +] +requires-python = ">=3.12,<3.14" +dependencies = [ + "atomic-agents", + "instructor>=1.7.0", + "google-genai>=1.18.0,<2.0.0", + "jsonref>=1.1.0,<2.0.0", +] + +[tool.uv.sources] +atomic-agents = { workspace = true } diff --git a/atomic-examples/nested-multimodal/test_media/image_sample.jpg b/atomic-examples/nested-multimodal/test_media/image_sample.jpg new file mode 100644 index 00000000..c9f24223 Binary files /dev/null and b/atomic-examples/nested-multimodal/test_media/image_sample.jpg differ diff --git a/atomic-examples/nested-multimodal/test_media/image_sample2.jpg b/atomic-examples/nested-multimodal/test_media/image_sample2.jpg new file mode 100644 index 00000000..8823a6d7 Binary files /dev/null and b/atomic-examples/nested-multimodal/test_media/image_sample2.jpg differ diff --git a/uv.lock b/uv.lock index 1f044391..e729d009 100644 --- a/uv.lock +++ b/uv.lock @@ -17,6 +17,7 @@ members = [ "example-mcp-server", "fastapi-memory", "hooks-example", + "nested-multimodal", "orchestration-agent", "quickstart", "rag-chatbot", @@ -2006,6 +2007,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/f6/6d61a023d758f488e36638076e8a4ec4447a2cdf86938cf6c60cf1c860e6/myst_parser-2.0.0-py3-none-any.whl", hash = "sha256:7c36344ae39c8e740dad7fdabf5aa6fc4897a813083c6cc9990044eb93656b14", size = 77158, upload-time = "2023-06-13T16:30:27.697Z" }, ] +[[package]] +name = "nested-multimodal" +version = "1.0.0" +source = { editable = "atomic-examples/nested-multimodal" } +dependencies = [ + { name = "atomic-agents" }, + { name = "google-genai" }, + { name = "instructor" }, + { name = "jsonref" }, +] + +[package.metadata] +requires-dist = [ + { name = "atomic-agents", editable = "." 
}, + { name = "google-genai", specifier = ">=1.18.0,<2.0.0" }, + { name = "instructor", specifier = ">=1.7.0" }, + { name = "jsonref", specifier = ">=1.1.0,<2.0.0" }, +] + [[package]] name = "nodeenv" version = "1.9.1"