feat: support multimodal user messages in captain v2 (#13581)
Extract image attachments from the latest user message and pass them to the runner as its input. The latest user message is excluded from the conversation-history context, since it is handed to the runner separately.

Fixes #13588

# Pull Request Template

## Description

Adds image support to Captain v2.

## Type of change

Please delete options that are not relevant.

- [x] Bug fix (non-breaking change which fixes an issue)

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration.

Specs and local testing.

<img width="754" height="1008" alt="image" src="https://github.com/user-attachments/assets/914cbc2c-9d30-42d0-87d4-9e5430845c87" />

Langfuse also shows the media correctly with the instrumentation code:

<img width="1800" height="1260" alt="image" src="https://github.com/user-attachments/assets/ce0f5fa6-b1a5-42ec-a213-9a82b1751037" />

## Checklist:

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have commented on my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my feature works
- [x] New and existing unit tests pass locally with my changes
- [x] Any dependent changes have been merged and published in downstream modules

---------

Co-authored-by: Shivam Mishra <scm.mymail@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -74,12 +74,11 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
|
||||
end
|
||||
|
||||
it 'runs agent with extracted user message and context' do
|
||||
expected_context = {
|
||||
expected_context = hash_including(
|
||||
session_id: "#{account.id}_#{conversation.display_id}",
|
||||
conversation_history: [
|
||||
{ role: :user, content: 'Hello there', agent_name: nil },
|
||||
{ role: :assistant, content: 'Hi! How can I help you?', agent_name: 'Assistant' },
|
||||
{ role: :user, content: 'I need help with my account', agent_name: nil }
|
||||
{ role: :assistant, content: 'Hi! How can I help you?', agent_name: 'Assistant' }
|
||||
],
|
||||
state: hash_including(
|
||||
account_id: account.id,
|
||||
@@ -87,7 +86,7 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
|
||||
conversation: hash_including(id: conversation.id),
|
||||
contact: hash_including(id: contact.id)
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
expect(mock_runner).to receive(:run).with(
|
||||
'I need help with my account',
|
||||
@@ -98,6 +97,71 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
|
||||
service.generate_response(message_history: message_history)
|
||||
end
|
||||
|
||||
# Covers runs where the newest user turn is an array of text and image_url
# parts rather than a plain string.
context 'when the latest user message is multimodal' do
  # An assistant prompt followed by a user turn holding one text part and one
  # image part.
  let(:multimodal_message_history) do
    screenshot_parts = [
      { type: 'text', text: 'What does this error mean?' },
      { type: 'image_url', image_url: { url: 'https://example.com/error.png' } }
    ]

    [
      { role: 'assistant', content: 'Please share a screenshot' },
      { role: 'user', content: screenshot_parts }
    ]
  end

  # The runner input should be a RubyLLM::Content carrying the text and the
  # image, while the history passed in context holds only the assistant turn.
  it 'passes image attachments to the runner input' do
    prior_turns = [{ role: :assistant, content: 'Please share a screenshot', agent_name: nil }]

    expect(mock_runner).to receive(:run) do |runner_input, context:, max_turns:|
      expect(runner_input).to be_a(RubyLLM::Content)
      expect(runner_input.text).to eq('What does this error mean?')

      first_attachment = runner_input.attachments.first
      expect(first_attachment.source.to_s).to eq('https://example.com/error.png')

      expect(context[:conversation_history]).to eq(prior_turns)
      expect(max_turns).to eq(100)
    end

    service.generate_response(message_history: multimodal_message_history)
  end

  # An image sent in an earlier turn must stay in the history as an array of
  # parts even though the latest user message is plain text.
  it 'preserves multimodal content in earlier history messages' do
    earlier_image_turn = {
      role: 'user',
      content: [
        { type: 'text', text: 'Here is my error screenshot' },
        { type: 'image_url', image_url: { url: 'https://example.com/error.png' } }
      ]
    }
    history_with_prior_image = [
      earlier_image_turn,
      { role: 'assistant', content: 'I see the error. Try restarting.' },
      { role: 'user', content: 'It still does not work' }
    ]

    expect(mock_runner).to receive(:run) do |runner_input, context:, max_turns:|
      expect(runner_input).to eq('It still does not work')

      # Compare against fresh literals, not the hashes handed to the service.
      earliest_content = context[:conversation_history].first[:content]
      expect(earliest_content).to be_a(Array)
      expect(earliest_content).to include(
        { type: 'text', text: 'Here is my error screenshot' },
        { type: 'image_url', image_url: { url: 'https://example.com/error.png' } }
      )
      expect(max_turns).to eq(100)
    end

    service.generate_response(message_history: history_with_prior_image)
  end

  # Both trace entries placed in the runner context should mention the image part.
  it 'stores multimodal trace payloads in runner context' do
    expect(mock_runner).to receive(:run) do |_runner_input, context:, max_turns:|
      %i[captain_v2_trace_input captain_v2_trace_current_input].each do |trace_key|
        expect(context[trace_key]).to include('image_url')
      end
      expect(max_turns).to eq(100)
    end

    service.generate_response(message_history: multimodal_message_history)
  end
end
|
||||
|
||||
it 'processes and formats agent result' do
|
||||
result = service.generate_response(message_history: message_history)
|
||||
|
||||
@@ -197,22 +261,21 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
|
||||
end
|
||||
|
||||
context 'with multimodal content' do
|
||||
let(:multimodal_message_history) do
|
||||
let(:multimodal_content) do
|
||||
[
|
||||
{
|
||||
role: 'user',
|
||||
content: [
|
||||
{ type: 'text', text: 'Can you help with this image?' },
|
||||
{ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } }
|
||||
]
|
||||
}
|
||||
{ type: 'text', text: 'Can you help with this image?' },
|
||||
{ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } }
|
||||
]
|
||||
end
|
||||
|
||||
it 'extracts text content from multimodal messages' do
|
||||
let(:multimodal_message_history) do
|
||||
[{ role: 'user', content: multimodal_content }]
|
||||
end
|
||||
|
||||
it 'preserves multimodal arrays in conversation history for image context retention' do
|
||||
context = service.send(:build_context, multimodal_message_history)
|
||||
|
||||
expect(context[:conversation_history].first[:content]).to eq('Can you help with this image?')
|
||||
expect(context[:conversation_history].first[:content]).to eq(multimodal_content)
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -225,6 +288,24 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
|
||||
|
||||
expect(result).to eq('I need help with my account')
|
||||
end
|
||||
|
||||
# A single user turn made of a text part and an image_url part should come
# back from extract_last_user_message as a RubyLLM::Content with both pieces.
it 'returns multimodal content with image attachments for the runner input' do
  screenshot_parts = [
    { type: 'text', text: 'Can you check this screenshot?' },
    { type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } }
  ]
  history = [{ role: 'user', content: screenshot_parts }]

  # extract_last_user_message is invoked via send, as in the neighbouring examples.
  extracted = service.send(:extract_last_user_message, history)

  expect(extracted).to be_a(RubyLLM::Content)
  expect(extracted.text).to eq('Can you check this screenshot?')

  first_attachment = extracted.attachments.first
  expect(first_attachment.source.to_s).to eq('https://example.com/image.jpg')
end
|
||||
end
|
||||
|
||||
describe '#extract_text_from_content' do
|
||||
@@ -256,6 +337,28 @@ RSpec.describe Captain::Assistant::AgentRunnerService do
|
||||
end
|
||||
end
|
||||
|
||||
# Checks that a serialized trace input found in the runner context is exposed
# through the Langfuse attributes returned by dynamic_trace_attributes.
describe '#dynamic_trace_attributes' do
  subject(:service) { described_class.new(assistant: assistant, conversation: conversation) }

  it 'adds serialized trace input attributes when present in context' do
    runner_state = {
      account_id: account.id,
      assistant_id: assistant.id,
      conversation: { id: conversation.id, display_id: conversation.display_id }
    }
    serialized_input =
      '[{"role":"user","content":[{"type":"image_url","image_url":{"url":"https://example.com/image.jpg"}}]}]'

    # Minimal stand-in for the wrapper argument: it only responds to #context.
    wrapper_class = Struct.new(:context)
    context_wrapper = wrapper_class.new({ state: runner_state, captain_v2_trace_input: serialized_input })

    attributes = service.send(:dynamic_trace_attributes, context_wrapper)

    %w[langfuse.trace.input langfuse.observation.input].each do |attribute_name|
      expect(attributes[attribute_name]).to include('image_url')
    end
    expect(attributes['langfuse.user.id']).to eq(account.id.to_s)
  end
end
|
||||
|
||||
describe '#build_state' do
|
||||
subject(:service) { described_class.new(assistant: assistant, conversation: conversation) }
|
||||
|
||||
|
||||
Reference in New Issue
Block a user