fix: captain assistant image comprehension (#13390)
# Pull Request Template ## Description Fixes # (issue) When we migrated to RubyLLM, images weren't being sent properly in RubyLLM format to the model, so it did not understand images. ## Type of change Please delete options that are not relevant. - [x] Bug fix (non-breaking change which fixes an issue) ## How Has This Been Tested? Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. specs + local testing Current behaviour on staging: <img width="772" height="1012" alt="image" src="https://github.com/user-attachments/assets/7b7d360f-dea4-48af-b20b-ee4c98a38a85" /> local testing with fix: <img width="792" height="1216" alt="image" src="https://github.com/user-attachments/assets/5ef82452-015e-4bda-a68f-884d00acb014" /> ## Checklist: - [x] My code follows the style guidelines of this project - [x] I have performed a self-review of my code - [x] I have commented on my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [x] I have added tests that prove my fix is effective or that my feature works - [x] New and existing unit tests pass locally with my changes - [x] Any dependent changes have been merged and published in downstream modules --------- Co-authored-by: Sojan Jose <sojan@pepalo.com>
This commit is contained in:
@@ -10,7 +10,10 @@ module Captain::ChatHelper
|
||||
|
||||
add_messages_to_chat(chat)
|
||||
with_agent_session do
|
||||
response = chat.ask(conversation_messages.last[:content])
|
||||
last_content = conversation_messages.last[:content]
|
||||
text, attachments = Captain::OpenAiMessageBuilderService.extract_text_and_attachments(last_content)
|
||||
|
||||
response = attachments.any? ? chat.ask(text, with: attachments) : chat.ask(text)
|
||||
build_response(response)
|
||||
end
|
||||
rescue StandardError => e
|
||||
@@ -68,7 +71,9 @@ module Captain::ChatHelper
|
||||
|
||||
# Replays all prior conversation turns (everything except the most recent
# message, which is sent via chat.ask) into the RubyLLM chat session.
#
# Each historical message is first split into plain text and image-URL
# attachments; messages that carry images are wrapped in RubyLLM::Content so
# the model actually receives the image, instead of the raw OpenAI-style
# content array it cannot interpret (the bug this change fixes).
#
# @param chat [RubyLLM::Chat] the chat session being primed with history
# @return [Array] the iterated message slice (return value unused by callers)
def add_messages_to_chat(chat)
  # [0...-1]: the last message is the live prompt, delivered through #ask.
  conversation_messages[0...-1].each do |msg|
    text, attachments = Captain::OpenAiMessageBuilderService.extract_text_and_attachments(msg[:content])
    # Only wrap in RubyLLM::Content when images are present; plain text is
    # passed through unchanged.
    content = attachments.any? ? RubyLLM::Content.new(text, attachments) : text
    chat.add_message(role: msg[:role].to_sym, content: content)
  end
end
|
||||
|
||||
|
||||
@@ -1,6 +1,15 @@
|
||||
class Captain::OpenAiMessageBuilderService
|
||||
pattr_initialize [:message!]
|
||||
|
||||
# Extracts text and image URLs from multimodal content array (reverse of generate_content)
|
||||
# Splits a multimodal OpenAI-style content array back into its parts
# (the reverse of #generate_content).
#
# @param content [Array<Hash>, Object] either a content-part array
#   (hashes with :type of 'text' or 'image_url') or any scalar content
# @return [Array(Object, Array<String>)] a [text, image_urls] pair;
#   non-array input is returned unchanged with an empty URL list, and
#   blank joined text becomes nil via #presence
def self.extract_text_and_attachments(content)
  return [content, []] unless content.is_a?(Array)

  texts = []
  urls = []
  content.each do |part|
    case part[:type]
    when 'text'
      texts << part[:text]
    when 'image_url'
      url = part.dig(:image_url, :url)
      # Mirror filter_map: keep only truthy URLs.
      urls << url if url
    end
  end

  [texts.join(' ').presence, urls]
end
|
||||
|
||||
def generate_content
|
||||
parts = []
|
||||
parts << text_part(@message.content) if @message.content.present?
|
||||
|
||||
@@ -0,0 +1,134 @@
|
||||
require 'rails_helper'

# Regression specs for captain assistant image comprehension: verifies that
# multimodal (image_url) content is forwarded to the LLM via chat.ask's
# `with:` attachments and via RubyLLM::Content for historical messages,
# instead of being passed as a raw OpenAI-style content array.
RSpec.describe Captain::Llm::AssistantChatService do
  let(:account) { create(:account) }
  let(:assistant) { create(:captain_assistant, account: account) }
  let(:conversation) { create(:conversation, account: account) }

  # Fully-stubbed RubyLLM chat; every builder-style method returns the double
  # so the service's fluent configuration chain keeps working.
  let(:mock_chat) { instance_double(RubyLLM::Chat) }
  let(:mock_response) do
    instance_double(
      RubyLLM::Message,
      content: '{"response": "I can see the image shows a pricing table", "reasoning": "Analyzed the image"}'
    )
  end

  before do
    create(:installation_config, name: 'CAPTAIN_OPEN_AI_API_KEY', value: 'test-key')

    allow(RubyLLM).to receive(:chat).and_return(mock_chat)
    allow(mock_chat).to receive(:with_temperature).and_return(mock_chat)
    allow(mock_chat).to receive(:with_params).and_return(mock_chat)
    allow(mock_chat).to receive(:with_tool).and_return(mock_chat)
    allow(mock_chat).to receive(:with_instructions).and_return(mock_chat)
    allow(mock_chat).to receive(:add_message).and_return(mock_chat)
    allow(mock_chat).to receive(:on_end_message).and_return(mock_chat)
    allow(mock_chat).to receive(:on_tool_call).and_return(mock_chat)
    allow(mock_chat).to receive(:on_tool_result).and_return(mock_chat)
    allow(mock_chat).to receive(:messages).and_return([])
  end

  describe 'image analysis' do
    context 'when user sends a message with an image attachment' do
      # Last (current) message is multimodal: text part + image_url part.
      let(:message_history) do
        [
          {
            role: 'user',
            content: [
              { type: 'text', text: 'What do you see in this image?' },
              { type: 'image_url', image_url: { url: 'https://example.com/screenshot.png' } }
            ]
          }
        ]
      end

      it 'sends the image to the LLM for analysis' do
        # Text and attachment must be split out and passed via `with:`.
        expect(mock_chat).to receive(:ask).with(
          'What do you see in this image?',
          with: ['https://example.com/screenshot.png']
        ).and_return(mock_response)

        service = described_class.new(assistant: assistant, conversation_id: conversation.display_id)
        service.generate_response(message_history: message_history)
      end
    end

    context 'when user sends only an image without text' do
      let(:message_history) do
        [
          {
            role: 'user',
            content: [
              { type: 'image_url', image_url: { url: 'https://example.com/photo.jpg' } }
            ]
          }
        ]
      end

      it 'sends the image to the LLM with nil text' do
        # No text parts -> joined text is blank -> .presence yields nil.
        expect(mock_chat).to receive(:ask).with(
          nil,
          with: ['https://example.com/photo.jpg']
        ).and_return(mock_response)

        service = described_class.new(assistant: assistant, conversation_id: conversation.display_id)
        service.generate_response(message_history: message_history)
      end
    end

    context 'when user sends a plain text message' do
      let(:message_history) do
        [
          { role: 'user', content: 'Hello, how can you help me?' }
        ]
      end

      it 'sends the text without attachments' do
        # Plain string content takes the no-attachments chat.ask branch.
        expect(mock_chat).to receive(:ask).with('Hello, how can you help me?').and_return(mock_response)

        service = described_class.new(assistant: assistant, conversation_id: conversation.display_id)
        service.generate_response(message_history: message_history)
      end
    end
  end

  describe 'conversation history with images' do
    context 'when previous messages contain images' do
      let(:message_history) do
        [
          {
            role: 'user',
            content: [
              { type: 'text', text: 'Here is my error screenshot' },
              { type: 'image_url', image_url: { url: 'https://example.com/error.png' } }
            ]
          },
          { role: 'assistant', content: 'I see the error. Try restarting.' },
          { role: 'user', content: 'It still does not work' }
        ]
      end

      it 'includes images from conversation history in context' do
        # First historical message should include the image via RubyLLM::Content
        expect(mock_chat).to receive(:add_message) do |args|
          expect(args[:role]).to eq(:user)
          expect(args[:content]).to be_a(RubyLLM::Content)
          expect(args[:content].text).to eq('Here is my error screenshot')
          expect(args[:content].attachments.first.source.to_s).to eq('https://example.com/error.png')
        end.ordered

        # Second historical message is plain text
        expect(mock_chat).to receive(:add_message).with(
          role: :assistant,
          content: 'I see the error. Try restarting.'
        ).ordered

        # Current message asked via chat.ask
        expect(mock_chat).to receive(:ask).with('It still does not work').and_return(mock_response)

        service = described_class.new(assistant: assistant, conversation_id: conversation.display_id)
        service.generate_response(message_history: message_history)
      end
    end
  end
end
|
||||
Reference in New Issue
Block a user