Files
leadchat/enterprise/app/services/captain/open_ai_message_builder_service.rb
Aakash Bakhle 77493c5d0f fix: captain assistant image comprehension (#13390)
# Pull Request Template

## Description

Fixes # (issue)

When we migrated to RubyLLM, images weren't being sent properly in
RubyLLM format to the model, so it did not understand images.

## Type of change

Please delete options that are not relevant.

- [x] Bug fix (non-breaking change which fixes an issue)

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide
instructions so we can reproduce. Please also list any relevant details
for your test configuration.

specs + local testing

Current behaviour on staging:
<img width="772" height="1012" alt="image"
src="https://github.com/user-attachments/assets/7b7d360f-dea4-48af-b20b-ee4c98a38a85"
/>

local testing with fix:
<img width="792" height="1216" alt="image"
src="https://github.com/user-attachments/assets/5ef82452-015e-4bda-a68f-884d00acb014"
/>


## Checklist:

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my code
- [x] I have commented on my code, particularly in hard-to-understand
areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] New and existing unit tests pass locally with my changes
- [x] Any dependent changes have been merged and published in downstream
modules

---------

Co-authored-by: Sojan Jose <sojan@pepalo.com>
2026-01-28 16:07:13 -08:00

70 lines
2.3 KiB
Ruby

class Captain::OpenAiMessageBuilderService
pattr_initialize [:message!]
# Extracts text and image URLs from multimodal content array (reverse of generate_content)
def self.extract_text_and_attachments(content)
return [content, []] unless content.is_a?(Array)
text_parts = content.select { |part| part[:type] == 'text' }.pluck(:text)
image_urls = content.select { |part| part[:type] == 'image_url' }.filter_map { |part| part.dig(:image_url, :url) }
[text_parts.join(' ').presence, image_urls]
end
def generate_content
parts = []
parts << text_part(@message.content) if @message.content.present?
parts.concat(attachment_parts(@message.attachments)) if @message.attachments.any?
return 'Message without content' if parts.blank?
return parts.first[:text] if parts.one? && parts.first[:type] == 'text'
parts
end
private
def text_part(text)
{ type: 'text', text: text }
end
def image_part(image_url)
{ type: 'image_url', image_url: { url: image_url } }
end
def attachment_parts(attachments)
image_attachments = attachments.where(file_type: :image)
image_content = image_parts(image_attachments)
transcription = extract_audio_transcriptions(attachments)
transcription_part = text_part(transcription) if transcription.present?
attachment_part = text_part('User has shared an attachment') if attachments.where.not(file_type: %i[image audio]).exists?
[image_content, transcription_part, attachment_part].flatten.compact
end
def image_parts(image_attachments)
image_attachments.each_with_object([]) do |attachment, parts|
url = get_attachment_url(attachment)
parts << image_part(url) if url.present?
end
end
def get_attachment_url(attachment)
return attachment.download_url if attachment.download_url.present?
return attachment.external_url if attachment.external_url.present?
attachment.file.attached? ? attachment.file_url : nil
end
def extract_audio_transcriptions(attachments)
audio_attachments = attachments.where(file_type: :audio)
return '' if audio_attachments.blank?
audio_attachments.map do |attachment|
result = Messages::AudioTranscriptionService.new(attachment).perform
result[:success] ? result[:transcriptions] : ''
end.join
end
end