fix: captain assistant image comprehension (#13390)

# Pull Request Template

## Description

Fixes # (issue)

When we migrated to RubyLLM, image attachments were not being converted
to the RubyLLM content format before being sent to the model, so the
model could not understand them.

## Type of change

Please delete options that are not relevant.

- [x] Bug fix (non-breaking change which fixes an issue)

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide
instructions so we can reproduce. Please also list any relevant details
for your test configuration.

specs + local testing

Current behaviour on staging:
<img width="772" height="1012" alt="image"
src="https://github.com/user-attachments/assets/7b7d360f-dea4-48af-b20b-ee4c98a38a85"
/>

Local testing with the fix:
<img width="792" height="1216" alt="image"
src="https://github.com/user-attachments/assets/5ef82452-015e-4bda-a68f-884d00acb014"
/>


## Checklist:

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have commented on my code, particularly in hard-to-understand
areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] New and existing unit tests pass locally with my changes
- [x] Any dependent changes have been merged and published in downstream
modules

---------

Co-authored-by: Sojan Jose <sojan@pepalo.com>
This commit is contained in:
Aakash Bakhle
2026-01-29 05:37:13 +05:30
committed by GitHub
parent 0d9c0b2ed2
commit 77493c5d0f
3 changed files with 150 additions and 2 deletions

View File

@@ -10,7 +10,10 @@ module Captain::ChatHelper
add_messages_to_chat(chat)
with_agent_session do
response = chat.ask(conversation_messages.last[:content])
last_content = conversation_messages.last[:content]
text, attachments = Captain::OpenAiMessageBuilderService.extract_text_and_attachments(last_content)
response = attachments.any? ? chat.ask(text, with: attachments) : chat.ask(text)
build_response(response)
end
rescue StandardError => e
@@ -68,7 +71,9 @@ module Captain::ChatHelper
# Replays every conversation message except the last into the RubyLLM chat
# as history. Multimodal entries (OpenAI-style text + image_url part arrays)
# are unpacked and re-wrapped in RubyLLM::Content so attached images survive
# the RubyLLM migration; plain-text entries are passed through unchanged.
#
# @param chat [RubyLLM::Chat] the chat being primed with history
def add_messages_to_chat(chat)
  conversation_messages[0...-1].each do |msg|
    text, attachments = Captain::OpenAiMessageBuilderService.extract_text_and_attachments(msg[:content])
    # Only build a Content wrapper when there is something to attach;
    # RubyLLM accepts a bare string for text-only messages.
    content = attachments.any? ? RubyLLM::Content.new(text, attachments) : text
    chat.add_message(role: msg[:role].to_sym, content: content)
  end
end

View File

@@ -1,6 +1,15 @@
class Captain::OpenAiMessageBuilderService
pattr_initialize [:message!]
# Inverse of #generate_content: splits a multimodal content array back into
# its joined text (nil when blank) and the list of image URLs. Non-Array
# content (plain strings, nil) is returned untouched with no attachments.
def self.extract_text_and_attachments(content)
  return [content, []] unless content.is_a?(Array)

  text = content.filter_map { |part| part[:text] if part[:type] == 'text' }.join(' ')
  urls = content.filter_map { |part| part.dig(:image_url, :url) if part[:type] == 'image_url' }
  [text.presence, urls]
end
def generate_content
parts = []
parts << text_part(@message.content) if @message.content.present?

View File

@@ -0,0 +1,134 @@
require 'rails_helper'
# Regression specs for captain assistant image comprehension: after the
# RubyLLM migration, OpenAI-style multimodal content arrays must be converted
# into RubyLLM calls — `ask(text, with: urls)` for the current message and
# RubyLLM::Content for prior history — so the model actually receives images.
RSpec.describe Captain::Llm::AssistantChatService do
  let(:account) { create(:account) }
  let(:assistant) { create(:captain_assistant, account: account) }
  let(:conversation) { create(:conversation, account: account) }
  # Verified double of the RubyLLM chat client; no real API calls are made.
  let(:mock_chat) { instance_double(RubyLLM::Chat) }
  # Canned LLM reply in the JSON envelope the service parses.
  let(:mock_response) do
    instance_double(
      RubyLLM::Message,
      content: '{"response": "I can see the image shows a pricing table", "reasoning": "Analyzed the image"}'
    )
  end

  before do
    create(:installation_config, name: 'CAPTAIN_OPEN_AI_API_KEY', value: 'test-key')
    allow(RubyLLM).to receive(:chat).and_return(mock_chat)
    # The service configures the chat via a fluent builder chain, so each
    # setup method must return the mock to keep the chain alive.
    allow(mock_chat).to receive(:with_temperature).and_return(mock_chat)
    allow(mock_chat).to receive(:with_params).and_return(mock_chat)
    allow(mock_chat).to receive(:with_tool).and_return(mock_chat)
    allow(mock_chat).to receive(:with_instructions).and_return(mock_chat)
    allow(mock_chat).to receive(:add_message).and_return(mock_chat)
    allow(mock_chat).to receive(:on_end_message).and_return(mock_chat)
    allow(mock_chat).to receive(:on_tool_call).and_return(mock_chat)
    allow(mock_chat).to receive(:on_tool_result).and_return(mock_chat)
    allow(mock_chat).to receive(:messages).and_return([])
  end

  describe 'image analysis' do
    context 'when user sends a message with an image attachment' do
      # OpenAI-style multimodal payload: one text part plus one image_url part.
      let(:message_history) do
        [
          {
            role: 'user',
            content: [
              { type: 'text', text: 'What do you see in this image?' },
              { type: 'image_url', image_url: { url: 'https://example.com/screenshot.png' } }
            ]
          }
        ]
      end

      # The current message must be split into text + `with:` attachments.
      it 'sends the image to the LLM for analysis' do
        expect(mock_chat).to receive(:ask).with(
          'What do you see in this image?',
          with: ['https://example.com/screenshot.png']
        ).and_return(mock_response)
        service = described_class.new(assistant: assistant, conversation_id: conversation.display_id)
        service.generate_response(message_history: message_history)
      end
    end

    context 'when user sends only an image without text' do
      let(:message_history) do
        [
          {
            role: 'user',
            content: [
              { type: 'image_url', image_url: { url: 'https://example.com/photo.jpg' } }
            ]
          }
        ]
      end

      # With no text parts, the extracted text is nil but the image still flows.
      it 'sends the image to the LLM with nil text' do
        expect(mock_chat).to receive(:ask).with(
          nil,
          with: ['https://example.com/photo.jpg']
        ).and_return(mock_response)
        service = described_class.new(assistant: assistant, conversation_id: conversation.display_id)
        service.generate_response(message_history: message_history)
      end
    end

    context 'when user sends a plain text message' do
      let(:message_history) do
        [
          { role: 'user', content: 'Hello, how can you help me?' }
        ]
      end

      # Text-only content must take the single-argument ask path (no `with:`).
      it 'sends the text without attachments' do
        expect(mock_chat).to receive(:ask).with('Hello, how can you help me?').and_return(mock_response)
        service = described_class.new(assistant: assistant, conversation_id: conversation.display_id)
        service.generate_response(message_history: message_history)
      end
    end
  end

  describe 'conversation history with images' do
    context 'when previous messages contain images' do
      # Three-turn history: multimodal user message, assistant reply, then a
      # follow-up text message that becomes the current `ask`.
      let(:message_history) do
        [
          {
            role: 'user',
            content: [
              { type: 'text', text: 'Here is my error screenshot' },
              { type: 'image_url', image_url: { url: 'https://example.com/error.png' } }
            ]
          },
          { role: 'assistant', content: 'I see the error. Try restarting.' },
          { role: 'user', content: 'It still does not work' }
        ]
      end

      # `.ordered` enforces that history is replayed in sequence before asking.
      it 'includes images from conversation history in context' do
        # First historical message should include the image via RubyLLM::Content
        expect(mock_chat).to receive(:add_message) do |args|
          expect(args[:role]).to eq(:user)
          expect(args[:content]).to be_a(RubyLLM::Content)
          expect(args[:content].text).to eq('Here is my error screenshot')
          expect(args[:content].attachments.first.source.to_s).to eq('https://example.com/error.png')
        end.ordered
        # Second historical message is plain text
        expect(mock_chat).to receive(:add_message).with(
          role: :assistant,
          content: 'I see the error. Try restarting.'
        ).ordered
        # Current message asked via chat.ask
        expect(mock_chat).to receive(:ask).with('It still does not work').and_return(mock_response)
        service = described_class.new(assistant: assistant, conversation_id: conversation.display_id)
        service.generate_response(message_history: message_history)
      end
    end
  end
end