feat: support multimodal user messages in captain v2 (#13581)

Extract and pass image attachments from the latest user message to the runner, excluding the last user message from the context for processing. Fixes #13588 # Pull Request Template ## Description Adds image support to captain v2 ## Type of change Please delete options that are not relevant. - [x] Bug fix (non-breaking change which fixes an issue) ## How Has This Been Tested? Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. specs and local testing <img width="754" height="1008" alt="image" src="https://github.com/user-attachments/assets/914cbc2c-9d30-42d0-87d4-9e5430845c87" /> langfuse also shows media correctly with the instrumentation code: <img width="1800" height="1260" alt="image" src="https://github.com/user-attachments/assets/ce0f5fa6-b1a5-42ec-a213-9a82b1751037" /> ## Checklist: - [x] My code follows the style guidelines of this project - [x] I have performed a self-review of my code - [x] I have commented on my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [x] I have added tests that prove my fix is effective or that my feature works - [x] New and existing unit tests pass locally with my changes - [x] Any dependent changes have been merged and published in downstream modules --------- Co-authored-by: Shivam Mishra <scm.mymail@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 19:37:41 +05:30
parent 6be95e79f8
commit 7cec4ebaae
6 changed files with 264 additions and 62 deletions
--- a/2
+++ b/2
@@ -191,7 +191,7 @@ gem 'reverse_markdown'

 gem 'iso-639'
 gem 'ruby-openai'
-gem 'ai-agents'
+gem 'ai-agents', '>= 0.9.1'

 # TODO: Move this gem as a dependency of ai-agents
 gem 'ruby_llm', '>= 1.8.2'
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -126,7 +126,7 @@ GEM
      jbuilder (~> 2)
      rails (>= 4.2, < 7.2)
      selectize-rails (~> 0.6)
-    ai-agents (0.9.0)
+    ai-agents (0.9.1)
      ruby_llm (~> 1.9.1)
    annotaterb (4.20.0)
      activerecord (>= 6.0.0)
@@ -1024,7 +1024,7 @@ DEPENDENCIES
  administrate (>= 0.20.1)
  administrate-field-active_storage (>= 1.0.3)
  administrate-field-belongs_to_search (>= 0.9.0)
-  ai-agents
+  ai-agents (>= 0.9.1)
  annotaterb
  attr_extras
  audited (~> 5.4, >= 5.4.1)
--- a/enterprise/app/services/captain/assistant/agent_runner_service.rb
+++ b/enterprise/app/services/captain/assistant/agent_runner_service.rb
@@ -3,6 +3,8 @@ require 'agents/instrumentation'

 class Captain::Assistant::AgentRunnerService
  include Integrations::LlmInstrumentationConstants
+  include Captain::Assistant::RunnerCallbacksHelper
+  include Captain::Assistant::TracePayloadHelper

  CONVERSATION_STATE_ATTRIBUTES = %i[
    id display_id inbox_id contact_id status priority
@@ -21,13 +23,7 @@ class Captain::Assistant::AgentRunnerService
  end

  def generate_response(message_history: [])
-    agents = build_and_wire_agents
-    context = build_context(message_history)
-    message_to_process = extract_last_user_message(message_history)
-    runner = Agents::Runner.with_agents(*agents)
-    runner = add_usage_metadata_callback(runner)
-    runner = add_callbacks_to_runner(runner) if @callbacks.any?
-    install_instrumentation(runner)
+    message_to_process, context = run_payload(message_history)
    result = runner.run(message_to_process, context: context, max_turns: 100)

    process_agent_result(result)
@@ -45,7 +41,10 @@ class Captain::Assistant::AgentRunnerService

  def build_context(message_history)
    conversation_history = message_history.map do |msg|
-      content = extract_text_from_content(msg[:content])
+      content = msg[:content]
+      # Preserve multimodal arrays (with image_url entries) as-is for the runner to restore with attachments.
+      # Only extract text from non-array formats (hashes from agent structured output, plain strings).
+      content = extract_text_from_content(content) unless content.is_a?(Array)

      {
        role: msg[:role].to_sym,
@@ -63,8 +62,22 @@ class Captain::Assistant::AgentRunnerService

  def extract_last_user_message(message_history)
    last_user_msg = message_history.reverse.find { |msg| msg[:role] == 'user' }
+    return '' if last_user_msg.blank?

-    extract_text_from_content(last_user_msg[:content])
+    content = last_user_msg[:content]
+    return extract_text_from_content(content) unless content.is_a?(Array)
+
+    text, attachments = Captain::OpenAiMessageBuilderService.extract_text_and_attachments(content)
+    return text if attachments.blank?
+
+    RubyLLM::Content.new(text, attachments)
+  end
+
+  def message_history_without_last_user_message(message_history)
+    last_user_index = message_history.rindex { |msg| msg[:role] == 'user' }
+    return message_history if last_user_index.nil?
+
+    message_history.reject.with_index { |_msg, index| index == last_user_index }
  end

  def extract_text_from_content(content)
@@ -143,28 +156,25 @@ class Captain::Assistant::AgentRunnerService
      },
      attribute_provider: ->(context_wrapper) { dynamic_trace_attributes(context_wrapper) }
    )
+    register_trace_input_callback(runner)
  end

  def dynamic_trace_attributes(context_wrapper)
    state = context_wrapper&.context&.dig(:state) || {}
    conversation = state[:conversation] || {}
+    trace_input = context_wrapper&.context&.dig(:captain_v2_trace_input)
+
    {
      ATTR_LANGFUSE_USER_ID => state[:account_id],
      format(ATTR_LANGFUSE_METADATA, 'assistant_id') => state[:assistant_id],
      format(ATTR_LANGFUSE_METADATA, 'conversation_id') => conversation[:id],
      format(ATTR_LANGFUSE_METADATA, 'conversation_display_id') => conversation[:display_id],
-      format(ATTR_LANGFUSE_METADATA, 'channel_type') => state[:channel_type]
+      format(ATTR_LANGFUSE_METADATA, 'channel_type') => state[:channel_type],
+      ATTR_LANGFUSE_TRACE_INPUT => trace_input,
+      ATTR_LANGFUSE_OBSERVATION_INPUT => trace_input
    }.compact.transform_values(&:to_s)
  end

-  def add_callbacks_to_runner(runner)
-    runner = add_agent_thinking_callback(runner) if @callbacks[:on_agent_thinking]
-    runner = add_tool_start_callback(runner) if @callbacks[:on_tool_start]
-    runner = add_tool_complete_callback(runner) if @callbacks[:on_tool_complete]
-    runner = add_agent_handoff_callback(runner) if @callbacks[:on_agent_handoff]
-    runner
-  end
-
  def add_usage_metadata_callback(runner)
    return runner unless ChatwootApp.otel_enabled?

@@ -195,35 +205,20 @@ class Captain::Assistant::AgentRunnerService
    root_span.set_attribute(format(ATTR_LANGFUSE_METADATA, 'credit_used'), credit_used.to_s)
  end

-  def add_agent_thinking_callback(runner)
-    runner.on_agent_thinking do |*args|
-      @callbacks[:on_agent_thinking].call(*args)
-    rescue StandardError => e
-      Rails.logger.warn "[Captain] Callback error for agent_thinking: #{e.message}"
+  def runner
+    @runner ||= begin
+      configured_runner = Agents::Runner.with_agents(*build_and_wire_agents)
+      configured_runner = add_usage_metadata_callback(configured_runner)
+      configured_runner = add_callbacks_to_runner(configured_runner) if @callbacks.any?
+      install_instrumentation(configured_runner)
+      configured_runner
    end
  end

-  def add_tool_start_callback(runner)
-    runner.on_tool_start do |*args|
-      @callbacks[:on_tool_start].call(*args)
-    rescue StandardError => e
-      Rails.logger.warn "[Captain] Callback error for tool_start: #{e.message}"
-    end
-  end
-
-  def add_tool_complete_callback(runner)
-    runner.on_tool_complete do |*args|
-      @callbacks[:on_tool_complete].call(*args)
-    rescue StandardError => e
-      Rails.logger.warn "[Captain] Callback error for tool_complete: #{e.message}"
-    end
-  end
-
-  def add_agent_handoff_callback(runner)
-    runner.on_agent_handoff do |*args|
-      @callbacks[:on_agent_handoff].call(*args)
-    rescue StandardError => e
-      Rails.logger.warn "[Captain] Callback error for agent_handoff: #{e.message}"
-    end
+  def run_payload(message_history)
+    message_to_process = extract_last_user_message(message_history)
+    context = build_context(message_history_without_last_user_message(message_history))
+    enrich_context_with_trace_payload!(context, message_history, message_to_process)
+    [message_to_process, context]
  end
 end
--- a/enterprise/app/services/captain/assistant/runner_callbacks_helper.rb
+++ b/enterprise/app/services/captain/assistant/runner_callbacks_helper.rb
@@ -0,0 +1,53 @@
+module Captain::Assistant::RunnerCallbacksHelper
+  private
+
+  def add_callbacks_to_runner(runner)
+    runner = add_agent_thinking_callback(runner) if @callbacks[:on_agent_thinking]
+    runner = add_tool_start_callback(runner) if @callbacks[:on_tool_start]
+    runner = add_tool_complete_callback(runner) if @callbacks[:on_tool_complete]
+    runner = add_agent_handoff_callback(runner) if @callbacks[:on_agent_handoff]
+    runner
+  end
+
+  def register_trace_input_callback(runner)
+    runner.on_agent_thinking do |_agent_name, _input, context_wrapper|
+      tracing = context_wrapper&.context&.dig(:__otel_tracing)
+      next unless tracing
+
+      trace_input = context_wrapper.context[:captain_v2_trace_current_input]
+      tracing[:pending_llm_input] = trace_input if trace_input.present?
+    end
+  end
+
+  def add_agent_thinking_callback(runner)
+    runner.on_agent_thinking do |*args|
+      @callbacks[:on_agent_thinking].call(*args)
+    rescue StandardError => e
+      Rails.logger.warn "[Captain] Callback error for agent_thinking: #{e.message}"
+    end
+  end
+
+  def add_tool_start_callback(runner)
+    runner.on_tool_start do |*args|
+      @callbacks[:on_tool_start].call(*args)
+    rescue StandardError => e
+      Rails.logger.warn "[Captain] Callback error for tool_start: #{e.message}"
+    end
+  end
+
+  def add_tool_complete_callback(runner)
+    runner.on_tool_complete do |*args|
+      @callbacks[:on_tool_complete].call(*args)
+    rescue StandardError => e
+      Rails.logger.warn "[Captain] Callback error for tool_complete: #{e.message}"
+    end
+  end
+
+  def add_agent_handoff_callback(runner)
+    runner.on_agent_handoff do |*args|
+      @callbacks[:on_agent_handoff].call(*args)
+    rescue StandardError => e
+      Rails.logger.warn "[Captain] Callback error for agent_handoff: #{e.message}"
+    end
+  end
+end
--- a/enterprise/app/services/captain/assistant/trace_payload_helper.rb
+++ b/enterprise/app/services/captain/assistant/trace_payload_helper.rb
@@ -0,0 +1,51 @@
+module Captain::Assistant::TracePayloadHelper
+  private
+
+  def enrich_context_with_trace_payload!(context, message_history, message_to_process)
+    context[:captain_v2_trace_input] = serialize_trace_messages(message_history)
+    context[:captain_v2_trace_current_input] = serialize_trace_content(message_to_process)
+  end
+
+  def serialize_trace_messages(message_history)
+    message_history.map do |message|
+      {
+        role: message[:role].to_s,
+        content: trace_content_payload(message[:content])
+      }
+    end.to_json
+  end
+
+  def serialize_trace_content(content)
+    payload = trace_content_payload(content)
+    return '' if payload.blank?
+
+    payload.is_a?(String) ? payload : payload.to_json
+  end
+
+  def trace_content_payload(content)
+    case content
+    when RubyLLM::Content
+      trace_parts_from_ruby_llm_content(content)
+    when Array, Hash
+      content
+    when NilClass
+      ''
+    else
+      content.to_s
+    end
+  end
+
+  def trace_parts_from_ruby_llm_content(content)
+    parts = []
+    parts << { type: 'text', text: content.text } if content.text.present?
+
+    content.attachments.each do |attachment|
+      parts << { type: 'image_url', image_url: { url: attachment.source.to_s } }
+    end
+
+    return '' if parts.blank?
+    return parts.first[:text] if parts.one? && parts.first[:type] == 'text'
+
+    parts
+  end
+end
--- a/spec/enterprise/services/captain/assistant/agent_runner_service_spec.rb
+++ b/spec/enterprise/services/captain/assistant/agent_runner_service_spec.rb
@@ -74,12 +74,11 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
    end

    it 'runs agent with extracted user message and context' do
-      expected_context = {
+      expected_context = hash_including(
        session_id: "#{account.id}_#{conversation.display_id}",
        conversation_history: [
          { role: :user, content: 'Hello there', agent_name: nil },
-          { role: :assistant, content: 'Hi! How can I help you?', agent_name: 'Assistant' },
-          { role: :user, content: 'I need help with my account', agent_name: nil }
+          { role: :assistant, content: 'Hi! How can I help you?', agent_name: 'Assistant' }
        ],
        state: hash_including(
          account_id: account.id,
@@ -87,7 +86,7 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
          conversation: hash_including(id: conversation.id),
          contact: hash_including(id: contact.id)
        )
-      }
+      )

      expect(mock_runner).to receive(:run).with(
        'I need help with my account',
@@ -98,6 +97,71 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
      service.generate_response(message_history: message_history)
    end

+    context 'when the latest user message is multimodal' do
+      let(:multimodal_message_history) do
+        [
+          { role: 'assistant', content: 'Please share a screenshot' },
+          {
+            role: 'user',
+            content: [
+              { type: 'text', text: 'What does this error mean?' },
+              { type: 'image_url', image_url: { url: 'https://example.com/error.png' } }
+            ]
+          }
+        ]
+      end
+
+      it 'passes image attachments to the runner input' do
+        expect(mock_runner).to receive(:run) do |input, context:, max_turns:|
+          expect(input).to be_a(RubyLLM::Content)
+          expect(input.text).to eq('What does this error mean?')
+          expect(input.attachments.first.source.to_s).to eq('https://example.com/error.png')
+          expect(context[:conversation_history]).to eq([{ role: :assistant, content: 'Please share a screenshot', agent_name: nil }])
+          expect(max_turns).to eq(100)
+        end
+
+        service.generate_response(message_history: multimodal_message_history)
+      end
+
+      it 'preserves multimodal content in earlier history messages' do
+        history_with_prior_image = [
+          {
+            role: 'user',
+            content: [
+              { type: 'text', text: 'Here is my error screenshot' },
+              { type: 'image_url', image_url: { url: 'https://example.com/error.png' } }
+            ]
+          },
+          { role: 'assistant', content: 'I see the error. Try restarting.' },
+          { role: 'user', content: 'It still does not work' }
+        ]
+
+        expect(mock_runner).to receive(:run) do |input, context:, max_turns:|
+          expect(input).to eq('It still does not work')
+          # The earlier user message with the image should preserve the multimodal array
+          first_history_msg = context[:conversation_history].first
+          expect(first_history_msg[:content]).to be_a(Array)
+          expect(first_history_msg[:content]).to include(
+            { type: 'text', text: 'Here is my error screenshot' },
+            { type: 'image_url', image_url: { url: 'https://example.com/error.png' } }
+          )
+          expect(max_turns).to eq(100)
+        end
+
+        service.generate_response(message_history: history_with_prior_image)
+      end
+
+      it 'stores multimodal trace payloads in runner context' do
+        expect(mock_runner).to receive(:run) do |_input, context:, max_turns:|
+          expect(context[:captain_v2_trace_input]).to include('image_url')
+          expect(context[:captain_v2_trace_current_input]).to include('image_url')
+          expect(max_turns).to eq(100)
+        end
+
+        service.generate_response(message_history: multimodal_message_history)
+      end
+    end
+
    it 'processes and formats agent result' do
      result = service.generate_response(message_history: message_history)

@@ -197,22 +261,21 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
    end

    context 'with multimodal content' do
-      let(:multimodal_message_history) do
+      let(:multimodal_content) do
        [
-          {
-            role: 'user',
-            content: [
-              { type: 'text', text: 'Can you help with this image?' },
-              { type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } }
-            ]
-          }
+          { type: 'text', text: 'Can you help with this image?' },
+          { type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } }
        ]
      end

-      it 'extracts text content from multimodal messages' do
+      let(:multimodal_message_history) do
+        [{ role: 'user', content: multimodal_content }]
+      end
+
+      it 'preserves multimodal arrays in conversation history for image context retention' do
        context = service.send(:build_context, multimodal_message_history)

-        expect(context[:conversation_history].first[:content]).to eq('Can you help with this image?')
+        expect(context[:conversation_history].first[:content]).to eq(multimodal_content)
      end
    end
  end
@@ -225,6 +288,24 @@ RSpec.describe Captain::Assistant::AgentRunnerService do

      expect(result).to eq('I need help with my account')
    end
+
+    it 'returns multimodal content with image attachments for the runner input' do
+      multimodal_message_history = [
+        {
+          role: 'user',
+          content: [
+            { type: 'text', text: 'Can you check this screenshot?' },
+            { type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } }
+          ]
+        }
+      ]
+
+      result = service.send(:extract_last_user_message, multimodal_message_history)
+
+      expect(result).to be_a(RubyLLM::Content)
+      expect(result.text).to eq('Can you check this screenshot?')
+      expect(result.attachments.first.source.to_s).to eq('https://example.com/image.jpg')
+    end
  end

  describe '#extract_text_from_content' do
@@ -256,6 +337,28 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
    end
  end

+  describe '#dynamic_trace_attributes' do
+    subject(:service) { described_class.new(assistant: assistant, conversation: conversation) }
+
+    it 'adds serialized trace input attributes when present in context' do
+      context = {
+        state: {
+          account_id: account.id,
+          assistant_id: assistant.id,
+          conversation: { id: conversation.id, display_id: conversation.display_id }
+        },
+        captain_v2_trace_input: '[{"role":"user","content":[{"type":"image_url","image_url":{"url":"https://example.com/image.jpg"}}]}]'
+      }
+      context_wrapper = Struct.new(:context).new(context)
+
+      attributes = service.send(:dynamic_trace_attributes, context_wrapper)
+
+      expect(attributes['langfuse.trace.input']).to include('image_url')
+      expect(attributes['langfuse.observation.input']).to include('image_url')
+      expect(attributes['langfuse.user.id']).to eq(account.id.to_s)
+    end
+  end
+
  describe '#build_state' do
    subject(:service) { described_class.new(assistant: assistant, conversation: conversation) }