From 811eb6661546da8ddcb4253faad25ef00344ce75 Mon Sep 17 00:00:00 2001
From: Tanmay Deep Sharma <32020192+tds-1@users.noreply.github.com>
Date: Thu, 26 Jun 2025 07:46:09 +0530
Subject: [PATCH] feat: Add support for image files in Captain (#11730)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
# Pull Request Template
## Linear links:
-
https://linear.app/chatwoot/issue/CW-4479/if-image-is-sent-by-the-customer-send-it-to-openai
## Description
This pull request adds “Captain image support” to Chatwoot. It
introduces multimodal message handling so that when a customer sends an
image, Captain can forward the file to OpenAI’s vision endpoint,
generate a caption/analysis
## Type of change
Please delete options that are not relevant.
- [x] New feature (non-breaking change which adds functionality)
## How Has This Been Tested?
## Checklist:
- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented on my code, particularly in hard-to-understand
areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged and published in downstream
modules
---------
Co-authored-by: Pranav
---
.../accounts/captain/assistants_controller.rb | 4 +-
.../conversation/response_builder_job.rb | 37 +--
.../captain/llm/assistant_chat_service.rb | 13 +-
.../open_ai_message_builder_service.rb | 59 ++++
.../captain/assistants_controller_spec.rb | 8 +-
.../conversation/response_builder_job_spec.rb | 25 ++
.../open_ai_message_builder_service_spec.rb | 309 ++++++++++++++++++
7 files changed, 418 insertions(+), 37 deletions(-)
create mode 100644 enterprise/app/services/captain/open_ai_message_builder_service.rb
create mode 100644 spec/enterprise/services/captain/open_ai_message_builder_service_spec.rb
diff --git a/enterprise/app/controllers/api/v1/accounts/captain/assistants_controller.rb b/enterprise/app/controllers/api/v1/accounts/captain/assistants_controller.rb
index e5a055836..ec8e8e653 100644
--- a/enterprise/app/controllers/api/v1/accounts/captain/assistants_controller.rb
+++ b/enterprise/app/controllers/api/v1/accounts/captain/assistants_controller.rb
@@ -25,8 +25,8 @@ class Api::V1::Accounts::Captain::AssistantsController < Api::V1::Accounts::Base
def playground
response = Captain::Llm::AssistantChatService.new(assistant: @assistant).generate_response(
- params[:message_content],
- message_history
+ additional_message: params[:message_content],
+ message_history: message_history
)
render json: response
diff --git a/enterprise/app/jobs/captain/conversation/response_builder_job.rb b/enterprise/app/jobs/captain/conversation/response_builder_job.rb
index f341a6e98..431945896 100644
--- a/enterprise/app/jobs/captain/conversation/response_builder_job.rb
+++ b/enterprise/app/jobs/captain/conversation/response_builder_job.rb
@@ -26,8 +26,7 @@ class Captain::Conversation::ResponseBuilderJob < ApplicationJob
def generate_and_process_response
@response = Captain::Llm::AssistantChatService.new(assistant: @assistant).generate_response(
- @conversation.messages.incoming.last.content,
- collect_previous_messages
+ message_history: collect_previous_messages
)
return process_action('handoff') if handoff_requested?
@@ -43,33 +42,11 @@ class Captain::Conversation::ResponseBuilderJob < ApplicationJob
.where(message_type: [:incoming, :outgoing])
.where(private: false)
.map do |message|
- {
- content: message_content(message),
- role: determine_role(message)
- }
- end
- end
-
- def message_content(message)
- return message.content if message.content.present?
- return 'User has shared a message without content' unless message.attachments.any?
-
- audio_transcriptions = extract_audio_transcriptions(message.attachments)
- return audio_transcriptions if audio_transcriptions.present?
-
- 'User has shared an attachment'
- end
-
- def extract_audio_transcriptions(attachments)
- audio_attachments = attachments.where(file_type: :audio)
- return '' if audio_attachments.blank?
-
- transcriptions = ''
- audio_attachments.each do |attachment|
- result = Messages::AudioTranscriptionService.new(attachment).perform
- transcriptions += result[:transcriptions] if result[:success]
+ {
+ content: prepare_multimodal_message_content(message),
+ role: determine_role(message)
+ }
end
- transcriptions
end
def determine_role(message)
@@ -78,6 +55,10 @@ class Captain::Conversation::ResponseBuilderJob < ApplicationJob
message.message_type == 'incoming' ? 'user' : 'system'
end
+ def prepare_multimodal_message_content(message)
+ Captain::OpenAiMessageBuilderService.new(message: message).generate_content
+ end
+
def handoff_requested?
@response['response'] == 'conversation_handoff'
end
diff --git a/enterprise/app/services/captain/llm/assistant_chat_service.rb b/enterprise/app/services/captain/llm/assistant_chat_service.rb
index 569931d44..ca8fafaa0 100644
--- a/enterprise/app/services/captain/llm/assistant_chat_service.rb
+++ b/enterprise/app/services/captain/llm/assistant_chat_service.rb
@@ -12,9 +12,16 @@ class Captain::Llm::AssistantChatService < Llm::BaseOpenAiService
register_tools
end
- def generate_response(input, previous_messages = [], role = 'user')
- @messages += previous_messages
- @messages << { role: role, content: input } if input.present?
+ # additional_message: A single message (String) from the user that should be appended to the chat.
+ # It can be an empty String or nil when you only want to supply historical messages.
+ # message_history: An Array of already formatted messages that provide the previous context.
+ # role: The role for the additional_message (defaults to `user`).
+ #
+ # NOTE: Parameters are provided as keyword arguments to improve clarity and avoid relying on
+ # positional ordering.
+ def generate_response(additional_message: nil, message_history: [], role: 'user')
+ @messages += message_history
+ @messages << { role: role, content: additional_message } if additional_message.present?
request_chat_completion
end
diff --git a/enterprise/app/services/captain/open_ai_message_builder_service.rb b/enterprise/app/services/captain/open_ai_message_builder_service.rb
new file mode 100644
index 000000000..3320ad537
--- /dev/null
+++ b/enterprise/app/services/captain/open_ai_message_builder_service.rb
@@ -0,0 +1,59 @@
+class Captain::OpenAiMessageBuilderService
+ pattr_initialize [:message!]
+
+ def generate_content
+ parts = []
+ parts << text_part(@message.content) if @message.content.present?
+ parts.concat(attachment_parts(@message.attachments)) if @message.attachments.any?
+
+ return 'Message without content' if parts.blank?
+ return parts.first[:text] if parts.one? && parts.first[:type] == 'text'
+
+ parts
+ end
+
+ private
+
+ def text_part(text)
+ { type: 'text', text: text }
+ end
+
+ def image_part(image_url)
+ { type: 'image_url', image_url: { url: image_url } }
+ end
+
+ def attachment_parts(attachments)
+ image_attachments = attachments.where(file_type: :image)
+ image_content = image_parts(image_attachments)
+
+ transcription = extract_audio_transcriptions(attachments)
+ transcription_part = text_part(transcription) if transcription.present?
+
+ attachment_part = text_part('User has shared an attachment') if attachments.where.not(file_type: %i[image audio]).exists?
+
+ [image_content, transcription_part, attachment_part].flatten.compact
+ end
+
+ def image_parts(image_attachments)
+ image_attachments.each_with_object([]) do |attachment, parts|
+ url = get_attachment_url(attachment)
+ parts << image_part(url) if url.present?
+ end
+ end
+
+ def get_attachment_url(attachment)
+ return attachment.external_url if attachment.external_url.present?
+
+ attachment.file.attached? ? attachment.file_url : nil
+ end
+
+ def extract_audio_transcriptions(attachments)
+ audio_attachments = attachments.where(file_type: :audio)
+ return '' if audio_attachments.blank?
+
+ audio_attachments.map do |attachment|
+ result = Messages::AudioTranscriptionService.new(attachment).perform
+ result[:success] ? result[:transcriptions] : ''
+ end.join
+ end
+end
\ No newline at end of file
diff --git a/spec/enterprise/controllers/api/v1/accounts/captain/assistants_controller_spec.rb b/spec/enterprise/controllers/api/v1/accounts/captain/assistants_controller_spec.rb
index 1f6d83d80..80be6f30f 100644
--- a/spec/enterprise/controllers/api/v1/accounts/captain/assistants_controller_spec.rb
+++ b/spec/enterprise/controllers/api/v1/accounts/captain/assistants_controller_spec.rb
@@ -211,8 +211,8 @@ RSpec.describe 'Api::V1::Accounts::Captain::Assistants', type: :request do
expect(response).to have_http_status(:success)
expect(chat_service).to have_received(:generate_response).with(
- valid_params[:message_content],
- valid_params[:message_history]
+ additional_message: valid_params[:message_content],
+ message_history: valid_params[:message_history]
)
expect(json_response[:content]).to eq('Assistant response')
end
@@ -232,8 +232,8 @@ RSpec.describe 'Api::V1::Accounts::Captain::Assistants', type: :request do
expect(response).to have_http_status(:success)
expect(chat_service).to have_received(:generate_response).with(
- params_without_history[:message_content],
- []
+ additional_message: params_without_history[:message_content],
+ message_history: []
)
end
end
diff --git a/spec/enterprise/jobs/captain/conversation/response_builder_job_spec.rb b/spec/enterprise/jobs/captain/conversation/response_builder_job_spec.rb
index 1e4a6e824..ca8d4a6c0 100644
--- a/spec/enterprise/jobs/captain/conversation/response_builder_job_spec.rb
+++ b/spec/enterprise/jobs/captain/conversation/response_builder_job_spec.rb
@@ -30,5 +30,30 @@ RSpec.describe Captain::Conversation::ResponseBuilderJob, type: :job do
account.reload
expect(account.usage_limits[:captain][:responses][:consumed]).to eq(1)
end
+
+ context 'when message contains an image' do
+ let(:message_with_image) { create(:message, conversation: conversation, message_type: :incoming, content: 'Can you help with this error?') }
+ let(:image_attachment) { message_with_image.attachments.create!(account: account, file_type: :image, external_url: 'https://example.com/error.jpg') }
+
+ before do
+ image_attachment
+ end
+
+ it 'includes image URL directly in the message content for OpenAI vision analysis' do
+ # Expect the generate_response to receive multimodal content with image URL
+ expect(mock_llm_chat_service).to receive(:generate_response) do |**kwargs|
+ history = kwargs[:message_history]
+ last_entry = history.last
+ expect(last_entry[:content]).to be_an(Array)
+ expect(last_entry[:content].any? { |part| part[:type] == 'text' && part[:text] == 'Can you help with this error?' }).to be true
+ expect(last_entry[:content].any? do |part|
+ part[:type] == 'image_url' && part[:image_url][:url] == 'https://example.com/error.jpg'
+ end).to be true
+ { 'response' => 'I can see the error in your image. It appears to be a database connection issue.' }
+ end
+
+ described_class.perform_now(conversation, assistant)
+ end
+ end
end
end
diff --git a/spec/enterprise/services/captain/open_ai_message_builder_service_spec.rb b/spec/enterprise/services/captain/open_ai_message_builder_service_spec.rb
new file mode 100644
index 000000000..13c29f756
--- /dev/null
+++ b/spec/enterprise/services/captain/open_ai_message_builder_service_spec.rb
@@ -0,0 +1,309 @@
+require 'rails_helper'
+
+RSpec.describe Captain::OpenAiMessageBuilderService do
+ subject(:service) { described_class.new(message: message) }
+
+ let(:message) { create(:message, content: 'Hello world') }
+
+ describe '#generate_content' do
+ context 'when message has only text content' do
+ it 'returns the text content directly' do
+ expect(service.generate_content).to eq('Hello world')
+ end
+ end
+
+ context 'when message has no content and no attachments' do
+ let(:message) { create(:message, content: nil) }
+
+ it 'returns default message' do
+ expect(service.generate_content).to eq('Message without content')
+ end
+ end
+
+ context 'when message has text content and attachments' do
+ before do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image.jpg')
+ attachment.save!
+ end
+
+ it 'returns an array of content parts' do
+ result = service.generate_content
+ expect(result).to be_an(Array)
+ expect(result).to include({ type: 'text', text: 'Hello world' })
+ expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
+ end
+ end
+
+ context 'when message has only non-text attachments' do
+ let(:message) { create(:message, content: nil) }
+
+ before do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image.jpg')
+ attachment.save!
+ end
+
+ it 'returns an array of content parts without text' do
+ result = service.generate_content
+ expect(result).to be_an(Array)
+ expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
+ expect(result).not_to include(hash_including(type: 'text', text: 'Hello world'))
+ end
+ end
+ end
+
+ describe '#attachment_parts' do
+ let(:message) { create(:message, content: nil) }
+ let(:attachments) { message.attachments }
+
+ context 'with image attachments' do
+ before do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image.jpg')
+ attachment.save!
+ end
+
+ it 'includes image parts' do
+ result = service.send(:attachment_parts, attachments)
+ expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
+ end
+ end
+
+ context 'with audio attachments' do
+ let(:audio_attachment) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
+ attachment.save!
+ attachment
+ end
+
+ before do
+ allow(Messages::AudioTranscriptionService).to receive(:new).with(audio_attachment).and_return(
+ instance_double(Messages::AudioTranscriptionService, perform: { success: true, transcriptions: 'Audio transcription text' })
+ )
+ end
+
+ it 'includes transcription text part' do
+ audio_attachment # trigger creation
+ result = service.send(:attachment_parts, attachments)
+ expect(result).to include({ type: 'text', text: 'Audio transcription text' })
+ end
+ end
+
+ context 'with other file types' do
+ before do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :file)
+ attachment.save!
+ end
+
+ it 'includes generic attachment message' do
+ result = service.send(:attachment_parts, attachments)
+ expect(result).to include({ type: 'text', text: 'User has shared an attachment' })
+ end
+ end
+
+ context 'with mixed attachment types' do
+ let(:image_attachment) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image.jpg')
+ attachment.save!
+ attachment
+ end
+
+ let(:audio_attachment) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
+ attachment.save!
+ attachment
+ end
+
+ let(:document_attachment) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :file)
+ attachment.save!
+ attachment
+ end
+
+ before do
+ allow(Messages::AudioTranscriptionService).to receive(:new).with(audio_attachment).and_return(
+ instance_double(Messages::AudioTranscriptionService, perform: { success: true, transcriptions: 'Audio text' })
+ )
+ end
+
+ it 'includes all relevant parts' do
+ image_attachment # trigger creation
+ audio_attachment # trigger creation
+ document_attachment # trigger creation
+
+ result = service.send(:attachment_parts, attachments)
+ expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
+ expect(result).to include({ type: 'text', text: 'Audio text' })
+ expect(result).to include({ type: 'text', text: 'User has shared an attachment' })
+ end
+ end
+ end
+
+ describe '#image_parts' do
+ let(:message) { create(:message, content: nil) }
+
+ context 'with valid image attachments' do
+ let(:image1) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image1.jpg')
+ attachment.save!
+ attachment
+ end
+
+ let(:image2) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: 'https://example.com/image2.jpg')
+ attachment.save!
+ attachment
+ end
+
+ it 'returns image parts for all valid images' do
+ image1 # trigger creation
+ image2 # trigger creation
+
+ image_attachments = message.attachments.where(file_type: :image)
+ result = service.send(:image_parts, image_attachments)
+
+ expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image1.jpg' } })
+ expect(result).to include({ type: 'image_url', image_url: { url: 'https://example.com/image2.jpg' } })
+ end
+ end
+
+ context 'with image attachments without URLs' do
+ let(:image_attachment) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :image, external_url: nil)
+ attachment.save!
+ attachment
+ end
+
+ before do
+ allow(image_attachment).to receive(:file).and_return(instance_double(ActiveStorage::Attached::One, attached?: false))
+ end
+
+ it 'skips images without valid URLs' do
+ image_attachment # trigger creation
+
+ image_attachments = message.attachments.where(file_type: :image)
+ result = service.send(:image_parts, image_attachments)
+
+ expect(result).to be_empty
+ end
+ end
+ end
+
+ describe '#get_attachment_url' do
+ let(:attachment) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :image)
+ attachment.save!
+ attachment
+ end
+
+ context 'when attachment has external_url' do
+ before { attachment.update(external_url: 'https://example.com/image.jpg') }
+
+ it 'returns external_url' do
+ expect(service.send(:get_attachment_url, attachment)).to eq('https://example.com/image.jpg')
+ end
+ end
+
+ context 'when attachment has attached file' do
+ before do
+ attachment.update(external_url: nil)
+ allow(attachment).to receive(:file).and_return(instance_double(ActiveStorage::Attached::One, attached?: true))
+ allow(attachment).to receive(:file_url).and_return('https://local.com/file.jpg')
+ end
+
+ it 'returns file_url' do
+ expect(service.send(:get_attachment_url, attachment)).to eq('https://local.com/file.jpg')
+ end
+ end
+
+ context 'when attachment has no URL or file' do
+ before do
+ attachment.update(external_url: nil)
+ allow(attachment).to receive(:file).and_return(instance_double(ActiveStorage::Attached::One, attached?: false))
+ end
+
+ it 'returns nil' do
+ expect(service.send(:get_attachment_url, attachment)).to be_nil
+ end
+ end
+ end
+
+ describe '#extract_audio_transcriptions' do
+ let(:message) { create(:message, content: nil) }
+
+ context 'with no audio attachments' do
+ it 'returns empty string' do
+ result = service.send(:extract_audio_transcriptions, message.attachments)
+ expect(result).to eq('')
+ end
+ end
+
+ context 'with successful audio transcriptions' do
+ let(:audio1) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
+ attachment.save!
+ attachment
+ end
+
+ let(:audio2) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
+ attachment.save!
+ attachment
+ end
+
+ before do
+ allow(Messages::AudioTranscriptionService).to receive(:new).with(audio1).and_return(
+ instance_double(Messages::AudioTranscriptionService, perform: { success: true, transcriptions: 'First audio text. ' })
+ )
+ allow(Messages::AudioTranscriptionService).to receive(:new).with(audio2).and_return(
+ instance_double(Messages::AudioTranscriptionService, perform: { success: true, transcriptions: 'Second audio text.' })
+ )
+ end
+
+ it 'concatenates all successful transcriptions' do
+ audio1 # trigger creation
+ audio2 # trigger creation
+
+ attachments = message.attachments
+ result = service.send(:extract_audio_transcriptions, attachments)
+ expect(result).to eq('First audio text. Second audio text.')
+ end
+ end
+
+ context 'with failed audio transcriptions' do
+ let(:audio_attachment) do
+ attachment = message.attachments.build(account_id: message.account_id, file_type: :audio)
+ attachment.save!
+ attachment
+ end
+
+ before do
+ allow(Messages::AudioTranscriptionService).to receive(:new).with(audio_attachment).and_return(
+ instance_double(Messages::AudioTranscriptionService, perform: { success: false, transcriptions: nil })
+ )
+ end
+
+ it 'returns empty string for failed transcriptions' do
+ audio_attachment # trigger creation
+
+ attachments = message.attachments
+ result = service.send(:extract_audio_transcriptions, attachments)
+ expect(result).to eq('')
+ end
+ end
+ end
+
+ describe 'private helper methods' do
+ describe '#text_part' do
+ it 'returns correct text part format' do
+ result = service.send(:text_part, 'Hello world')
+ expect(result).to eq({ type: 'text', text: 'Hello world' })
+ end
+ end
+
+ describe '#image_part' do
+ it 'returns correct image part format' do
+ result = service.send(:image_part, 'https://example.com/image.jpg')
+ expect(result).to eq({ type: 'image_url', image_url: { url: 'https://example.com/image.jpg' } })
+ end
+ end
+ end
+end