diff --git a/atomic-agents/atomic_agents/context/chat_history.py b/atomic-agents/atomic_agents/context/chat_history.py index 13a18a84..c511ef2a 100644 --- a/atomic-agents/atomic_agents/context/chat_history.py +++ b/atomic-agents/atomic_agents/context/chat_history.py @@ -1,16 +1,109 @@ +from __future__ import annotations + import json import uuid +from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Dict, List, Optional, Type +from typing import TYPE_CHECKING, Any -from instructor.multimodal import PDF, Image, Audio +from instructor.processing.multimodal import PDF, Image, Audio from pydantic import BaseModel, Field from atomic_agents.base.base_io_schema import BaseIOSchema +if TYPE_CHECKING: + from typing import Type + + +MULTIMODAL_TYPES = (Image, Audio, PDF) + + +@dataclass +class MultimodalContent: + """Result of extracting multimodal content from nested structures.""" + + objects: list = field(default_factory=list) + json_data: Any = None + + @property + def has_multimodal(self) -> bool: + return len(self.objects) > 0 + -INSTRUCTOR_MULTIMODAL_TYPES = (Image, Audio, PDF) +def _extract_multimodal_content(obj: Any, _seen: set[int] | None = None) -> MultimodalContent: + """ + Single-pass extraction of multimodal content from nested structures. + + Returns both the multimodal objects and a JSON-serializable representation + with multimodal content removed. + """ + if _seen is None: + _seen = set() + + match obj: + case Image() | Audio() | PDF(): + return MultimodalContent(objects=[obj], json_data=None) + + case list(): + if id(obj) in _seen: + return MultimodalContent() + _seen.add(id(obj)) + + all_objects = [] + json_items = [] + for item in obj: + result = _extract_multimodal_content(item, _seen) + all_objects.extend(result.objects) + if result.json_data is not None: + json_items.append(result.json_data) + + return MultimodalContent( + objects=all_objects, + json_data=json_items or None, + ) + + case dict(): + if id(obj) in _seen: + return MultimodalContent() + _seen.add(id(obj)) + + all_objects = [] + json_dict = {} + for key, value in obj.items(): + result = _extract_multimodal_content(value, _seen) + all_objects.extend(result.objects) + if result.json_data is not None: + json_dict[key] = result.json_data + + return MultimodalContent( + objects=all_objects, + json_data=json_dict or None, + ) + + case BaseModel(): + if id(obj) in _seen: + return MultimodalContent() + _seen.add(id(obj)) + + all_objects = [] + json_dict = {} + for field_name in type(obj).model_fields: + result = _extract_multimodal_content(getattr(obj, field_name), _seen) + all_objects.extend(result.objects) + if result.json_data is not None: + json_dict[field_name] = result.json_data + + return MultimodalContent( + objects=all_objects, + json_data=json_dict or None, + ) + + case _ if hasattr(obj, "model_dump"): + return MultimodalContent(json_data=obj.model_dump()) + + case _: + return MultimodalContent(json_data=obj) class Message(BaseModel): @@ -25,7 +118,7 @@ class Message(BaseModel): role: str content: BaseIOSchema - turn_id: Optional[str] = None + turn_id: str | None = None class ChatHistory: @@ -33,22 +126,21 @@ class ChatHistory: Manages the chat history for an AI agent. Attributes: - history (List[Message]): A list of messages representing the chat history. - max_messages (Optional[int]): Maximum number of messages to keep in history. - current_turn_id (Optional[str]): The ID of the current turn. + history: A list of messages representing the chat history. 
+ max_messages: Maximum number of messages to keep in history. + current_turn_id: The ID of the current turn. """ - def __init__(self, max_messages: Optional[int] = None): + def __init__(self, max_messages: int | None = None): """ Initializes the ChatHistory with an empty history and optional constraints. Args: - max_messages (Optional[int]): Maximum number of messages to keep in history. - When exceeded, oldest messages are removed first. + max_messages: Maximum number of messages to keep. Oldest removed first. """ - self.history: List[Message] = [] + self.history: list[Message] = [] self.max_messages = max_messages - self.current_turn_id: Optional[str] = None + self.current_turn_id: str | None = None def initialize_turn(self) -> None: """ @@ -87,71 +179,35 @@ def _manage_overflow(self) -> None: while len(self.history) > self.max_messages: self.history.pop(0) - def get_history(self) -> List[Dict]: + def get_history(self) -> list[dict]: """ Retrieves the chat history, handling both regular and multimodal content. + This method supports multimodal content (Image, Audio, PDF) including when + nested within other schemas. Multimodal objects are kept separate from + the JSON serialization to allow Instructor to handle them appropriately. + Returns: List[Dict]: The list of messages in the chat history as dictionaries. Each dictionary has 'role' and 'content' keys, where 'content' contains either a single JSON string or a mixed array of JSON and multimodal objects. Note: - This method supports multimodal content by keeping multimodal objects - separate while generating cohesive JSON for text-based fields. + This method supports nested multimodal content by recursively detecting + and extracting multimodal objects from any level of the schema hierarchy. """ history = [] for message in self.history: - input_content = message.content - - # Check if content has any multimodal fields - multimodal_objects = [] - has_multimodal = False - - # Extract multimodal content first - for field_name, field in input_content.__class__.model_fields.items(): - field_value = getattr(input_content, field_name) - - if isinstance(field_value, list): - for item in field_value: - if isinstance(item, INSTRUCTOR_MULTIMODAL_TYPES): - multimodal_objects.append(item) - has_multimodal = True - elif isinstance(field_value, INSTRUCTOR_MULTIMODAL_TYPES): - multimodal_objects.append(field_value) - has_multimodal = True - - if has_multimodal: - # For multimodal content: create mixed array with JSON + multimodal objects - processed_content = [] - - # Add single cohesive JSON for all non-multimodal fields - non_multimodal_data = {} - for field_name, field in input_content.__class__.model_fields.items(): - field_value = getattr(input_content, field_name) - - if isinstance(field_value, list): - # Only include non-multimodal items from lists - non_multimodal_items = [ - item for item in field_value if not isinstance(item, INSTRUCTOR_MULTIMODAL_TYPES) - ] - if non_multimodal_items: - non_multimodal_data[field_name] = non_multimodal_items - elif not isinstance(field_value, INSTRUCTOR_MULTIMODAL_TYPES): - non_multimodal_data[field_name] = field_value - - # Add single JSON string if there are non-multimodal fields - if non_multimodal_data: - processed_content.append(json.dumps(non_multimodal_data, ensure_ascii=False)) - - # Add all multimodal objects - processed_content.extend(multimodal_objects) - - history.append({"role": message.role, "content": processed_content}) + extracted = _extract_multimodal_content(message.content) + + if 
extracted.has_multimodal: + content = [] + if extracted.json_data: + content.append(json.dumps(extracted.json_data, ensure_ascii=False)) + content.extend(extracted.objects) + history.append({"role": message.role, "content": content}) else: - # No multimodal content: generate single cohesive JSON string - content_json = input_content.model_dump_json() - history.append({"role": message.role, "content": content_json}) + history.append({"role": message.role, "content": message.content.model_dump_json()}) return history @@ -167,7 +223,7 @@ def copy(self) -> "ChatHistory": new_history.current_turn_id = self.current_turn_id return new_history - def get_current_turn_id(self) -> Optional[str]: + def get_current_turn_id(self) -> str | None: """ Returns the current turn ID. @@ -352,7 +408,7 @@ class MultimodalSchema(BaseIOSchema): """Schema for testing multimodal content""" instruction_text: str = Field(..., description="The instruction text") - images: List[instructor.Image] = Field(..., description="The images to analyze") + images: list[instructor.Image] = Field(..., description="The images to analyze") # Create and populate the original history with complex data original_history = ChatHistory(max_messages=10) @@ -409,8 +465,8 @@ class MultimodalSchema(BaseIOSchema): print(f"Turn ID: {message.turn_id}") print(f"Content type: {type(message.content).__name__}") print("Content:") - for field, value in message.content.model_dump().items(): - print(f" {field}: {value}") + for field_name, value in message.content.model_dump().items(): + print(f" {field_name}: {value}") # Final verification print("\nFinal verification:") diff --git a/atomic-agents/tests/context/test_chat_history.py b/atomic-agents/tests/context/test_chat_history.py index 91d3f77d..f037cb0e 100644 --- a/atomic-agents/tests/context/test_chat_history.py +++ b/atomic-agents/tests/context/test_chat_history.py @@ -8,6 +8,7 @@ from atomic_agents.context import ChatHistory, Message from atomic_agents import BaseIOSchema import instructor +from instructor.processing.multimodal import Image, PDF, Audio class InputSchema(BaseIOSchema): @@ -50,9 +51,9 @@ class MockMultimodalSchema(BaseIOSchema): """Test schema for multimodal content""" instruction_text: str = Field(..., description="The instruction text") - images: List[instructor.Image] = Field(..., description="The images to analyze") - pdfs: List[instructor.multimodal.PDF] = Field(..., description="The PDFs to analyze") - audio: instructor.multimodal.Audio = Field(..., description="The audio to analyze") + images: List[Image] = Field(..., description="The images to analyze") + pdfs: List[PDF] = Field(..., description="The PDFs to analyze") + audio: Audio = Field(..., description="The audio to analyze") class ColorEnum(str, Enum): @@ -218,8 +219,8 @@ def test_dump_and_load_multimodal_data(history): base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) test_image = instructor.Image.from_path(path=os.path.join(base_path, "files/image_sample.jpg")) - test_pdf = instructor.multimodal.PDF.from_path(path=os.path.join(base_path, "files/pdf_sample.pdf")) - test_audio = instructor.multimodal.Audio.from_path(path=os.path.join(base_path, "files/audio_sample.mp3")) + test_pdf = PDF.from_path(path=os.path.join(base_path, "files/pdf_sample.pdf")) + test_audio = Audio.from_path(path=os.path.join(base_path, "files/audio_sample.mp3")) # multimodal message history.add_message( @@ -361,8 +362,8 @@ def test_get_history_with_multimodal_content(history): """Test that get_history correctly handles 
multimodal content""" # Create mock multimodal objects mock_image = instructor.Image(source="test_url", media_type="image/jpeg", detail="low") - mock_pdf = instructor.multimodal.PDF(source="test_pdf_url", media_type="application/pdf", detail="low") - mock_audio = instructor.multimodal.Audio(source="test_audio_url", media_type="audio/mp3", detail="low") + mock_pdf = PDF(source="test_pdf_url", media_type="application/pdf", detail="low") + mock_audio = Audio(source="test_audio_url", media_type="audio/mp3", detail="low") # Add a multimodal message history.add_message( @@ -454,7 +455,7 @@ def test_process_multimodal_paths_comprehensive(): image_file = instructor.Image(source="test/image.jpg", media_type="image/jpeg") image_url = instructor.Image(source="https://example.com/image.jpg", media_type="image/jpeg") image_data = instructor.Image(source="data:image/jpeg;base64,xyz", media_type="image/jpeg") - pdf_file = instructor.multimodal.PDF(source="test/doc.pdf", media_type="application/pdf") + pdf_file = PDF(source="test/doc.pdf", media_type="application/pdf") history._process_multimodal_paths(image_file) history._process_multimodal_paths(image_url) @@ -521,8 +522,8 @@ class TestEnum(Enum): MockMultimodalSchema( instruction_text="Process this file", images=[instructor.Image(source="test/sample.jpg", media_type="image/jpeg")], - pdfs=[instructor.multimodal.PDF(source="test/doc.pdf", media_type="application/pdf")], - audio=instructor.multimodal.Audio(source="test/audio.mp3", media_type="audio/mp3"), + pdfs=[PDF(source="test/doc.pdf", media_type="application/pdf")], + audio=Audio(source="test/audio.mp3", media_type="audio/mp3"), ), ) @@ -538,3 +539,218 @@ class TestEnum(Enum): assert loaded_content.images[0].source == Path("test/sample.jpg") assert isinstance(loaded_content.pdfs[0].source, Path) assert loaded_content.pdfs[0].source == Path("test/doc.pdf") + + +# ======================================== +# Tests for nested multimodal content (GitHub Issue #141) +# ======================================== + + +class DocumentWithPDF(BaseIOSchema): + """PDF document with owner - for testing nested multimodal content.""" + + pdf: PDF = Field(..., description="The PDF data") + owner: str = Field(..., description="The PDF owner") + + +class DocumentWithImage(BaseIOSchema): + """Image document with metadata - for testing nested multimodal content.""" + + image: instructor.Image = Field(..., description="The image data") + description: str = Field(..., description="The image description") + + +class NestedDocumentsInputSchema(BaseIOSchema): + """A list of documents to analyze - for testing nested multimodal content.""" + + documents: List[DocumentWithPDF] = Field(..., description="List of documents") + query: str = Field(..., description="The query about the documents") + + +class DeeplyNestedSchema(BaseIOSchema): + """Schema with multiple levels of nesting.""" + + title: str = Field(..., description="The title") + nested_doc: DocumentWithImage = Field(..., description="A nested document with image") + more_docs: List[DocumentWithPDF] = Field(..., description="More nested documents") + + +class DictWithNestedMultimodal(BaseIOSchema): + """Schema with dict containing nested multimodal content.""" + + metadata: str = Field(..., description="Some metadata") + documents_by_name: Dict[str, DocumentWithPDF] = Field(..., description="Documents keyed by name") + + +def test_get_history_with_nested_multimodal_content(history): + """Test that get_history correctly handles nested multimodal content (Issue #141).""" + 
mock_pdf_1 = PDF(source="test_pdf_1.pdf", media_type="application/pdf", detail="low") + mock_pdf_2 = PDF(source="test_pdf_2.pdf", media_type="application/pdf", detail="low") + + # Create nested documents with PDFs + doc1 = DocumentWithPDF(pdf=mock_pdf_1, owner="Alice") + doc2 = DocumentWithPDF(pdf=mock_pdf_2, owner="Bob") + + # Add a message with nested multimodal content + history.add_message( + "user", + NestedDocumentsInputSchema( + documents=[doc1, doc2], + query="Summarize these documents", + ), + ) + + # Get history and verify format + result = history.get_history() + assert len(result) == 1 + assert result[0]["role"] == "user" + assert isinstance(result[0]["content"], list) + + # Should have JSON with the non-multimodal content + json_content = json.loads(result[0]["content"][0]) + assert json_content["query"] == "Summarize these documents" + # The nested documents should be present but without the PDF content + assert "documents" in json_content + assert len(json_content["documents"]) == 2 + assert json_content["documents"][0]["owner"] == "Alice" + assert json_content["documents"][1]["owner"] == "Bob" + # PDF fields should not be in the JSON + assert "pdf" not in json_content["documents"][0] + assert "pdf" not in json_content["documents"][1] + + # The multimodal objects should be in the content list + assert mock_pdf_1 in result[0]["content"] + assert mock_pdf_2 in result[0]["content"] + + +def test_get_history_with_deeply_nested_multimodal_content(history): + """Test that get_history handles multiple levels of nested multimodal content.""" + mock_image = instructor.Image(source="nested_image.jpg", media_type="image/jpeg", detail="low") + mock_pdf_1 = PDF(source="nested_pdf_1.pdf", media_type="application/pdf", detail="low") + mock_pdf_2 = PDF(source="nested_pdf_2.pdf", media_type="application/pdf", detail="low") + + # Create deeply nested content + history.add_message( + "user", + DeeplyNestedSchema( + title="Test Report", + nested_doc=DocumentWithImage(image=mock_image, description="A test image"), + more_docs=[ + DocumentWithPDF(pdf=mock_pdf_1, owner="Charlie"), + DocumentWithPDF(pdf=mock_pdf_2, owner="Diana"), + ], + ), + ) + + # Get history and verify format + result = history.get_history() + assert len(result) == 1 + assert isinstance(result[0]["content"], list) + + # Verify JSON content structure + json_content = json.loads(result[0]["content"][0]) + assert json_content["title"] == "Test Report" + assert json_content["nested_doc"]["description"] == "A test image" + assert "image" not in json_content["nested_doc"] # Image should be extracted + assert len(json_content["more_docs"]) == 2 + assert json_content["more_docs"][0]["owner"] == "Charlie" + assert json_content["more_docs"][1]["owner"] == "Diana" + + # All multimodal objects should be extracted + assert mock_image in result[0]["content"] + assert mock_pdf_1 in result[0]["content"] + assert mock_pdf_2 in result[0]["content"] + + +def test_get_history_with_dict_nested_multimodal_content(history): + """Test that get_history handles nested multimodal content in dicts.""" + mock_pdf_1 = PDF(source="dict_pdf_1.pdf", media_type="application/pdf", detail="low") + mock_pdf_2 = PDF(source="dict_pdf_2.pdf", media_type="application/pdf", detail="low") + + history.add_message( + "user", + DictWithNestedMultimodal( + metadata="Important documents", + documents_by_name={ + "contract": DocumentWithPDF(pdf=mock_pdf_1, owner="Legal"), + "invoice": DocumentWithPDF(pdf=mock_pdf_2, owner="Finance"), + }, + ), + ) + + result = 
history.get_history() + assert len(result) == 1 + assert isinstance(result[0]["content"], list) + + # Verify JSON content + json_content = json.loads(result[0]["content"][0]) + assert json_content["metadata"] == "Important documents" + assert "documents_by_name" in json_content + assert json_content["documents_by_name"]["contract"]["owner"] == "Legal" + assert json_content["documents_by_name"]["invoice"]["owner"] == "Finance" + + # Multimodal objects should be extracted + assert mock_pdf_1 in result[0]["content"] + assert mock_pdf_2 in result[0]["content"] + + +def test_extract_multimodal_content(): + """Test the unified multimodal content extraction function.""" + from atomic_agents.context.chat_history import _extract_multimodal_content + + mock_pdf = PDF(source="test.pdf", media_type="application/pdf", detail="low") + mock_image = instructor.Image(source="test.jpg", media_type="image/jpeg", detail="low") + + # Direct multimodal - has_multimodal=True, json_data=None + result = _extract_multimodal_content(mock_pdf) + assert result.has_multimodal is True + assert result.objects == [mock_pdf] + assert result.json_data is None + + # Non-multimodal - has_multimodal=False, json_data preserved + result = _extract_multimodal_content("string") + assert result.has_multimodal is False + assert result.json_data == "string" + + result = _extract_multimodal_content({"key": "value"}) + assert result.has_multimodal is False + assert result.json_data == {"key": "value"} + + # Nested in list + result = _extract_multimodal_content([mock_pdf, "text"]) + assert result.has_multimodal is True + assert mock_pdf in result.objects + assert result.json_data == ["text"] + + # Nested in dict + result = _extract_multimodal_content({"pdf": mock_pdf, "text": "value"}) + assert result.has_multimodal is True + assert mock_pdf in result.objects + assert result.json_data == {"text": "value"} + + # Nested Pydantic model + doc = DocumentWithPDF(pdf=mock_pdf, owner="Alice") + result = _extract_multimodal_content(doc) + assert result.has_multimodal is True + assert result.objects == [mock_pdf] + assert result.json_data == {"owner": "Alice"} + + # Deeply nested structure + mock_pdf_2 = PDF(source="test2.pdf", media_type="application/pdf", detail="low") + nested = DeeplyNestedSchema( + title="Test", + nested_doc=DocumentWithImage(image=mock_image, description="desc"), + more_docs=[ + DocumentWithPDF(pdf=mock_pdf, owner="A"), + DocumentWithPDF(pdf=mock_pdf_2, owner="B"), + ], + ) + result = _extract_multimodal_content(nested) + assert result.has_multimodal is True + assert len(result.objects) == 3 + assert mock_image in result.objects + assert mock_pdf in result.objects + assert mock_pdf_2 in result.objects + assert result.json_data["title"] == "Test" + assert result.json_data["nested_doc"]["description"] == "desc" + assert "image" not in result.json_data["nested_doc"] diff --git a/atomic-examples/nested-multimodal/README.md b/atomic-examples/nested-multimodal/README.md new file mode 100644 index 00000000..97a11515 --- /dev/null +++ b/atomic-examples/nested-multimodal/README.md @@ -0,0 +1,76 @@ +# Nested Multimodal Content Example + +Analyze multiple images (or PDFs/audio) in a single request using nested schemas. + +## What This Does + +Pass a **list of documents** - each containing an image plus metadata - to an agent that analyzes them all and provides a comparative summary. + +```python +class ImageDocument(BaseIOSchema): + image: Image = Field(...) + owner: str = Field(...) + category: str = Field(...) 
+ +class Input(BaseIOSchema): + documents: list[ImageDocument] = Field(...) # Multiple images with metadata + query: str = Field(...) + +# Analyze multiple images at once +result = agent.run(Input( + documents=[doc1, doc2, doc3], + query="Compare these images" +)) +``` + +## Setup + +```bash +cd atomic-examples/nested-multimodal +uv sync +``` + +Set your API key in `.env`: +``` +OPENAI_API_KEY=your_key_here +# or +GEMINI_API_KEY=your_key_here +``` + +## Run + +```bash +uv run python nested_multimodal/main.py +``` + +## Example Output + +``` +Using OpenAI GPT-5.1 + +Creating nested document structure... + - Document 1: Image owned by 'Marketing Team', category 'random photo' + - Document 2: Image owned by 'Content Team', category 'random photo' + +============================================================ +ANALYSIS RESULTS +============================================================ + +Image 1: + Owner: Marketing Team + Description: A black-and-white mountain valley with dramatic lighting... + Dominant Colors: black, white, gray + Key Elements: mountain slopes, valley, diagonal light beam + +Image 2: + Owner: Content Team + Description: Layered blue mountain ridges receding into distance... + Dominant Colors: various blues, soft white haze + Key Elements: overlapping ridges, atmospheric haze + +Comparative Summary: + Both images depict mountainous landscapes with atmospheric depth... + The first is high-contrast black-and-white, the second uses blue tones... + +SUCCESS: Nested multimodal content handled correctly! +``` diff --git a/atomic-examples/nested-multimodal/nested_multimodal/__init__.py b/atomic-examples/nested-multimodal/nested_multimodal/__init__.py new file mode 100644 index 00000000..3c0a0bd2 --- /dev/null +++ b/atomic-examples/nested-multimodal/nested_multimodal/__init__.py @@ -0,0 +1,2 @@ +# Nested Multimodal Example +# Demonstrates support for nested multimodal content in Atomic Agents (Issue #141) diff --git a/atomic-examples/nested-multimodal/nested_multimodal/main.py b/atomic-examples/nested-multimodal/nested_multimodal/main.py new file mode 100644 index 00000000..636fd98e --- /dev/null +++ b/atomic-examples/nested-multimodal/nested_multimodal/main.py @@ -0,0 +1,247 @@ +""" +Nested Multimodal Content Example + +This example demonstrates support for nested multimodal content in Atomic Agents, +as implemented for GitHub Issue #141: "AgentMemory: support nested multimodal data" + +The key scenario demonstrated here is: +- A Document schema containing an Image field AND metadata (owner) +- An InputSchema containing a LIST of Documents +- The agent correctly processes all nested Images and extracts information from each + +Previously, nested multimodal content like this was flattened into the message JSON by json.dumps, +so the images never reached the model as native multimodal content. Now the ChatHistory recursively detects and extracts multimodal content at any depth.
+ +This example supports both: +- OpenAI (GPT-5.1) with OPENAI_API_KEY +- Google Gemini with GEMINI_API_KEY +""" + +import os +from pathlib import Path +from typing import List + +import instructor +from openai import OpenAI +from atomic_agents import AtomicAgent, AgentConfig, BaseIOSchema +from atomic_agents.context import SystemPromptGenerator +from instructor.processing.multimodal import Image +from pydantic import Field + + +def _load_env(): + """Load .env file from current or parent directories.""" + for directory in [Path.cwd(), *Path.cwd().parents]: + env_file = directory / ".env" + if env_file.exists(): + for line in env_file.read_text().splitlines(): + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + os.environ.setdefault(key.strip(), value.strip().strip("\"'")) + break + + +_load_env() + + +# ============================================================================= +# Schema Definitions - Demonstrating Nested Multimodal Content (Issue #141) +# ============================================================================= + + +class ImageDocument(BaseIOSchema): + """ + A document with Image content and metadata. + + This is the KEY nested structure - the Image is inside this schema, + not at the top level of the InputSchema. + """ + + image: Image = Field(..., description="The image content") + owner: str = Field(..., description="The owner/author of this image") + category: str = Field(..., description="Category of the image (e.g., 'logo', 'photo', 'diagram')") + + +class NestedMultimodalInput(BaseIOSchema): + """ + Input schema with nested multimodal content. + + This demonstrates Issue #141 - multimodal content (Images) nested within + a list of ImageDocument objects, not at the top level. 
+ """ + + documents: List[ImageDocument] = Field(..., description="List of image documents to analyze") + analysis_query: str = Field(..., description="What to analyze or compare across the images") + + +class ImageAnalysis(BaseIOSchema): + """Analysis result for a single image document.""" + + owner: str = Field(..., description="The owner of this image") + category: str = Field(..., description="The category of the image") + description: str = Field(..., description="Description of what's in the image") + dominant_colors: List[str] = Field(..., description="Main colors visible in the image") + key_elements: List[str] = Field(..., description="Key visual elements identified") + + +class AnalysisResult(BaseIOSchema): + """Combined analysis of all image documents.""" + + image_analyses: List[ImageAnalysis] = Field(..., description="Analysis of each individual image") + comparative_summary: str = Field(..., description="Comparative analysis addressing the user's query") + + +# ============================================================================= +# Agent Setup with Provider Auto-Detection +# ============================================================================= + + +def create_agent(): + """Create the image analysis agent with auto-detected provider.""" + + # Try OpenAI first, then Gemini + openai_key = os.getenv("OPENAI_API_KEY") + gemini_key = os.getenv("GEMINI_API_KEY") + + if openai_key: + print("Using OpenAI GPT-5.1") + client = instructor.from_openai(OpenAI(api_key=openai_key)) + model = "gpt-5.1" + elif gemini_key: + print("Using Google Gemini") + from google import genai + + client = instructor.from_genai(client=genai.Client(api_key=gemini_key), mode=instructor.Mode.GENAI_TOOLS) + model = "gemini-2.0-flash" + else: + raise ValueError("No API key found. Please set OPENAI_API_KEY or GEMINI_API_KEY in your .env file.") + + system_prompt_generator = SystemPromptGenerator( + background=[ + "You are an image analysis expert.", + "You can analyze multiple images and provide comparative insights.", + ], + steps=[ + "For each image document in the input, analyze and describe what you see.", + "Consider the owner and category metadata provided for each image.", + "Identify key visual elements, colors, and notable features.", + "After analyzing all images, provide a comparative summary based on the user's query.", + ], + output_instructions=[ + "Return detailed analysis for each image.", + "Include a comparative summary that addresses the user's specific query.", + ], + ) + + agent = AtomicAgent[NestedMultimodalInput, AnalysisResult]( + config=AgentConfig( + client=client, + model=model, + system_prompt_generator=system_prompt_generator, + input_schema=NestedMultimodalInput, + output_schema=AnalysisResult, + ) + ) + + return agent + + +def main(): + """ + Main function demonstrating nested multimodal content handling. + + This creates multiple ImageDocument objects, each containing an Image, + and passes them as a list to the agent - the exact scenario from Issue #141. 
+ """ + print("=" * 60) + print("Nested Multimodal Content Example (Issue #141)") + print("=" * 60) + print() + + # Get the test image paths + script_directory = os.path.dirname(os.path.abspath(__file__)) + test_media_directory = os.path.join(os.path.dirname(script_directory), "test_media") + image_path1 = os.path.join(test_media_directory, "image_sample.jpg") + image_path2 = os.path.join(test_media_directory, "image_sample2.jpg") + + # Check for test images + if not os.path.exists(image_path1) or not os.path.exists(image_path2): + print(f"Error: Test images not found in {test_media_directory}") + print("Please ensure image_sample.jpg and image_sample2.jpg exist.") + return + + print(f"Using test images from: {test_media_directory}") + print() + + # Create the agent + try: + agent = create_agent() + except ValueError as e: + print(f"Setup error: {e}") + return + + print() + + # ========================================================================== + # KEY DEMONSTRATION: Nested multimodal content + # ========================================================================== + # We create MULTIPLE ImageDocument objects, each containing an Image. + # This is the exact scenario from Issue #141 that wasn't working before. + # The Images are nested inside ImageDocument schemas, inside a list. + + print("Creating nested document structure...") + print(" - Document 1: Image owned by 'Marketing Team', category 'random photo'") + print(" - Document 2: Image owned by 'Content Team', category 'random photo'") + print() + + document1 = ImageDocument(image=Image.from_path(image_path1), owner="Marketing Team", category="random photo") + document2 = ImageDocument(image=Image.from_path(image_path2), owner="Content Team", category="random photo") + + # Create the nested input - this is what Issue #141 fixed + nested_input = NestedMultimodalInput( + documents=[document1, document2], + analysis_query="Compare these images and describe what you see in each one. 
Note any similarities or differences.", + ) + + print("Sending nested multimodal content to agent...") + print("(Previously this would fail due to incorrect JSON serialization)") + print() + + try: + # Run the agent with nested multimodal content + result = agent.run(nested_input) + + # Display results + print("=" * 60) + print("ANALYSIS RESULTS") + print("=" * 60) + print() + + for i, img_analysis in enumerate(result.image_analyses, 1): + print(f"Image {i}:") + print(f" Owner: {img_analysis.owner}") + print(f" Category: {img_analysis.category}") + print(f" Description: {img_analysis.description}") + print(f" Dominant Colors: {', '.join(img_analysis.dominant_colors)}") + print(f" Key Elements: {', '.join(img_analysis.key_elements)}") + print() + + print("Comparative Summary:") + print(f" {result.comparative_summary}") + print() + + print("=" * 60) + print("SUCCESS: Nested multimodal content handled correctly!") + print("=" * 60) + + except Exception as e: + print(f"Error during analysis: {e}") + import traceback + + traceback.print_exc() + raise + + +if __name__ == "__main__": + main() diff --git a/atomic-examples/nested-multimodal/pyproject.toml b/atomic-examples/nested-multimodal/pyproject.toml new file mode 100644 index 00000000..67575f54 --- /dev/null +++ b/atomic-examples/nested-multimodal/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["nested_multimodal"] + +[project] +name = "nested-multimodal" +version = "1.0.0" +description = "Example demonstrating nested multimodal content support in Atomic Agents (Issue #141)" +readme = "README.md" +authors = [ + { name = "Atomic Agents Team" } +] +requires-python = ">=3.12,<3.14" +dependencies = [ + "atomic-agents", + "instructor>=1.7.0", + "google-genai>=1.18.0,<2.0.0", + "jsonref>=1.1.0,<2.0.0", +] + +[tool.uv.sources] +atomic-agents = { workspace = true } diff --git a/atomic-examples/nested-multimodal/test_media/image_sample.jpg b/atomic-examples/nested-multimodal/test_media/image_sample.jpg new file mode 100644 index 00000000..c9f24223 Binary files /dev/null and b/atomic-examples/nested-multimodal/test_media/image_sample.jpg differ diff --git a/atomic-examples/nested-multimodal/test_media/image_sample2.jpg b/atomic-examples/nested-multimodal/test_media/image_sample2.jpg new file mode 100644 index 00000000..8823a6d7 Binary files /dev/null and b/atomic-examples/nested-multimodal/test_media/image_sample2.jpg differ diff --git a/uv.lock b/uv.lock index 1f044391..e729d009 100644 --- a/uv.lock +++ b/uv.lock @@ -17,6 +17,7 @@ members = [ "example-mcp-server", "fastapi-memory", "hooks-example", + "nested-multimodal", "orchestration-agent", "quickstart", "rag-chatbot", @@ -2006,6 +2007,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/f6/6d61a023d758f488e36638076e8a4ec4447a2cdf86938cf6c60cf1c860e6/myst_parser-2.0.0-py3-none-any.whl", hash = "sha256:7c36344ae39c8e740dad7fdabf5aa6fc4897a813083c6cc9990044eb93656b14", size = 77158, upload-time = "2023-06-13T16:30:27.697Z" }, ] +[[package]] +name = "nested-multimodal" +version = "1.0.0" +source = { editable = "atomic-examples/nested-multimodal" } +dependencies = [ + { name = "atomic-agents" }, + { name = "google-genai" }, + { name = "instructor" }, + { name = "jsonref" }, +] + +[package.metadata] +requires-dist = [ + { name = "atomic-agents", editable = "." 
}, + { name = "google-genai", specifier = ">=1.18.0,<2.0.0" }, + { name = "instructor", specifier = ">=1.7.0" }, + { name = "jsonref", specifier = ">=1.1.0,<2.0.0" }, +] + [[package]] name = "nodeenv" version = "1.9.1"