From 77493c5d0feefa7b913c0352bbd76530df3ea5f8 Mon Sep 17 00:00:00 2001 From: Aakash Bakhle <48802744+aakashb95@users.noreply.github.com> Date: Thu, 29 Jan 2026 05:37:13 +0530 Subject: [PATCH] fix: captain assistant image comprehension (#13390) # Pull Request Template ## Description Fixes # (issue) After the migration to RubyLLM, image attachments were no longer passed to the model in the format RubyLLM expects, so the assistant could not interpret images. ## Type of change Please delete options that are not relevant. - [x] Bug fix (non-breaking change which fixes an issue) ## How Has This Been Tested? Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. specs + local testing Current behaviour on staging: image local testing with fix: image ## Checklist: - [x] My code follows the style guidelines of this project - [x] I have performed a self-review of my code - [x] I have commented on my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation - [x] My changes generate no new warnings - [x] I have added tests that prove my fix is effective or that my feature works - [x] New and existing unit tests pass locally with my changes - [x] Any dependent changes have been merged and published in downstream modules --------- Co-authored-by: Sojan Jose --- enterprise/app/helpers/captain/chat_helper.rb | 9 +- .../open_ai_message_builder_service.rb | 9 ++ .../llm/assistant_chat_service_spec.rb | 134 ++++++++++++++++++ 3 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 spec/enterprise/services/captain/llm/assistant_chat_service_spec.rb diff --git a/enterprise/app/helpers/captain/chat_helper.rb b/enterprise/app/helpers/captain/chat_helper.rb index 9bdba5553..2734e510e 100644 --- a/enterprise/app/helpers/captain/chat_helper.rb +++ b/enterprise/app/helpers/captain/chat_helper.rb @@ -10,7 +10,10 @@ module Captain::ChatHelper 
add_messages_to_chat(chat) with_agent_session do - response = chat.ask(conversation_messages.last[:content]) + last_content = conversation_messages.last[:content] + text, attachments = Captain::OpenAiMessageBuilderService.extract_text_and_attachments(last_content) + + response = attachments.any? ? chat.ask(text, with: attachments) : chat.ask(text) build_response(response) end rescue StandardError => e @@ -68,7 +71,9 @@ module Captain::ChatHelper def add_messages_to_chat(chat) conversation_messages[0...-1].each do |msg| - chat.add_message(role: msg[:role].to_sym, content: msg[:content]) + text, attachments = Captain::OpenAiMessageBuilderService.extract_text_and_attachments(msg[:content]) + content = attachments.any? ? RubyLLM::Content.new(text, attachments) : text + chat.add_message(role: msg[:role].to_sym, content: content) end end diff --git a/enterprise/app/services/captain/open_ai_message_builder_service.rb b/enterprise/app/services/captain/open_ai_message_builder_service.rb index 4aaa64e0a..54c959761 100644 --- a/enterprise/app/services/captain/open_ai_message_builder_service.rb +++ b/enterprise/app/services/captain/open_ai_message_builder_service.rb @@ -1,6 +1,15 @@ class Captain::OpenAiMessageBuilderService pattr_initialize [:message!] + # Extracts text and image URLs from multimodal content array (reverse of generate_content) + def self.extract_text_and_attachments(content) + return [content, []] unless content.is_a?(Array) + + text_parts = content.select { |part| part[:type] == 'text' }.pluck(:text) + image_urls = content.select { |part| part[:type] == 'image_url' }.filter_map { |part| part.dig(:image_url, :url) } + [text_parts.join(' ').presence, image_urls] + end + def generate_content parts = [] parts << text_part(@message.content) if @message.content.present? 
diff --git a/spec/enterprise/services/captain/llm/assistant_chat_service_spec.rb b/spec/enterprise/services/captain/llm/assistant_chat_service_spec.rb new file mode 100644 index 000000000..bc4ff0b63 --- /dev/null +++ b/spec/enterprise/services/captain/llm/assistant_chat_service_spec.rb @@ -0,0 +1,134 @@ +require 'rails_helper' + +RSpec.describe Captain::Llm::AssistantChatService do + let(:account) { create(:account) } + let(:assistant) { create(:captain_assistant, account: account) } + let(:conversation) { create(:conversation, account: account) } + + let(:mock_chat) { instance_double(RubyLLM::Chat) } + let(:mock_response) do + instance_double( + RubyLLM::Message, + content: '{"response": "I can see the image shows a pricing table", "reasoning": "Analyzed the image"}' + ) + end + + before do + create(:installation_config, name: 'CAPTAIN_OPEN_AI_API_KEY', value: 'test-key') + + allow(RubyLLM).to receive(:chat).and_return(mock_chat) + allow(mock_chat).to receive(:with_temperature).and_return(mock_chat) + allow(mock_chat).to receive(:with_params).and_return(mock_chat) + allow(mock_chat).to receive(:with_tool).and_return(mock_chat) + allow(mock_chat).to receive(:with_instructions).and_return(mock_chat) + allow(mock_chat).to receive(:add_message).and_return(mock_chat) + allow(mock_chat).to receive(:on_end_message).and_return(mock_chat) + allow(mock_chat).to receive(:on_tool_call).and_return(mock_chat) + allow(mock_chat).to receive(:on_tool_result).and_return(mock_chat) + allow(mock_chat).to receive(:messages).and_return([]) + end + + describe 'image analysis' do + context 'when user sends a message with an image attachment' do + let(:message_history) do + [ + { + role: 'user', + content: [ + { type: 'text', text: 'What do you see in this image?' 
}, + { type: 'image_url', image_url: { url: 'https://example.com/screenshot.png' } } + ] + } + ] + end + + it 'sends the image to the LLM for analysis' do + expect(mock_chat).to receive(:ask).with( + 'What do you see in this image?', + with: ['https://example.com/screenshot.png'] + ).and_return(mock_response) + + service = described_class.new(assistant: assistant, conversation_id: conversation.display_id) + service.generate_response(message_history: message_history) + end + end + + context 'when user sends only an image without text' do + let(:message_history) do + [ + { + role: 'user', + content: [ + { type: 'image_url', image_url: { url: 'https://example.com/photo.jpg' } } + ] + } + ] + end + + it 'sends the image to the LLM with nil text' do + expect(mock_chat).to receive(:ask).with( + nil, + with: ['https://example.com/photo.jpg'] + ).and_return(mock_response) + + service = described_class.new(assistant: assistant, conversation_id: conversation.display_id) + service.generate_response(message_history: message_history) + end + end + + context 'when user sends a plain text message' do + let(:message_history) do + [ + { role: 'user', content: 'Hello, how can you help me?' } + ] + end + + it 'sends the text without attachments' do + expect(mock_chat).to receive(:ask).with('Hello, how can you help me?').and_return(mock_response) + + service = described_class.new(assistant: assistant, conversation_id: conversation.display_id) + service.generate_response(message_history: message_history) + end + end + end + + describe 'conversation history with images' do + context 'when previous messages contain images' do + let(:message_history) do + [ + { + role: 'user', + content: [ + { type: 'text', text: 'Here is my error screenshot' }, + { type: 'image_url', image_url: { url: 'https://example.com/error.png' } } + ] + }, + { role: 'assistant', content: 'I see the error. Try restarting.' 
}, + { role: 'user', content: 'It still does not work' } + ] + end + + it 'includes images from conversation history in context' do + # First historical message should include the image via RubyLLM::Content + expect(mock_chat).to receive(:add_message) do |args| + expect(args[:role]).to eq(:user) + expect(args[:content]).to be_a(RubyLLM::Content) + expect(args[:content].text).to eq('Here is my error screenshot') + expect(args[:content].attachments.first.source.to_s).to eq('https://example.com/error.png') + end.ordered + + # Second historical message is plain text + expect(mock_chat).to receive(:add_message).with( + role: :assistant, + content: 'I see the error. Try restarting.' + ).ordered + + # Current message asked via chat.ask + expect(mock_chat).to receive(:ask).with('It still does not work').and_return(mock_response) + + service = described_class.new(assistant: assistant, conversation_id: conversation.display_id) + service.generate_response(message_history: message_history) + end + end + end +end