feat: Add BE changes for captain pdf support for faq generation (#12113)

This commit is contained in:
Tanmay Deep Sharma
2025-08-27 22:01:22 +07:00
committed by GitHub
parent 3cefa9b767
commit 1ba00075ce
19 changed files with 856 additions and 12 deletions

View File

@@ -105,5 +105,29 @@ RSpec.describe Captain::Documents::CrawlJob, type: :job do
described_class.perform_now(document)
end
end
# PDF branch of the crawl job: the examples expect the document to be handed to
# Captain::Llm::PdfProcessingService and then marked as available.
context 'when document is a PDF' do
  # Persisted document whose pdf_document? is forced to true; update! is stubbed
  # to return true so individual examples can set expectations on it.
  let(:pdf_document) do
    create(:captain_document, external_link: 'https://example.com/document').tap do |record|
      allow(record).to receive(:pdf_document?).and_return(true)
      allow(record).to receive(:update!).and_return(true)
    end
  end

  it 'processes PDF using PdfProcessingService' do
    processor = instance_double(Captain::Llm::PdfProcessingService)

    expect(Captain::Llm::PdfProcessingService).to receive(:new).with(pdf_document).and_return(processor)
    expect(processor).to receive(:process)
    expect(pdf_document).to receive(:update!).with(status: :available)

    described_class.perform_now(pdf_document)
  end

  # The error raised while building the service is expected to propagate out of the job.
  it 'handles PDF processing errors' do
    allow(Captain::Llm::PdfProcessingService).to receive(:new).and_raise(StandardError, 'Processing failed')

    expect do
      described_class.perform_now(pdf_document)
    end.to raise_error(StandardError, 'Processing failed')
  end
end
end
end

View File

@@ -64,5 +64,41 @@ RSpec.describe Captain::Documents::ResponseBuilderJob, type: :job do
.with(spanish_document.content, 'portuguese')
end
end
# PDF branch of the response builder job: FAQ generation is expected to go through
# Captain::Llm::PaginatedFaqGeneratorService and to record pagination metadata.
context 'when processing a PDF document' do
  # Persisted document stubbed to look like an uploaded PDF with an OpenAI file id.
  let(:pdf_document) do
    create(:captain_document, assistant: assistant).tap do |record|
      allow(record).to receive(:pdf_document?).and_return(true)
      allow(record).to receive(:openai_file_id).and_return('file-123')
      allow(record).to receive(:update!).and_return(true)
      allow(record).to receive(:metadata).and_return({})
    end
  end
  let(:paginated_service) { instance_double(Captain::Llm::PaginatedFaqGeneratorService) }
  let(:pdf_faqs) { [{ 'question' => 'What is in the PDF?', 'answer' => 'Important content' }] }

  before do
    allow(Captain::Llm::PaginatedFaqGeneratorService).to receive(:new)
      .with(pdf_document, anything)
      .and_return(paginated_service)
    # Canned results for everything the examples let the job read from the generator.
    allow(paginated_service).to receive_messages(
      generate: pdf_faqs,
      total_pages_processed: 10,
      iterations_completed: 1
    )
  end

  it 'uses paginated FAQ generator for PDFs' do
    # No return value is given here, so the stub from the before block still supplies the double.
    expect(Captain::Llm::PaginatedFaqGeneratorService).to receive(:new).with(pdf_document, anything)

    described_class.new.perform(pdf_document)
  end

  it 'stores pagination metadata' do
    expected_metadata = hash_including('faq_generation')
    expect(pdf_document).to receive(:update!).with(hash_including(metadata: expected_metadata))

    described_class.new.perform(pdf_document)
  end
end
end
end

View File

@@ -0,0 +1,85 @@
require 'rails_helper'
# Model spec for the PDF-related behaviour of Captain::Document: validations,
# PDF detection, display URL, OpenAI file id storage and generated external links.
RSpec.describe Captain::Document, type: :model do
  let(:account) { create(:account) }
  let(:assistant) { create(:captain_assistant, account: account) }

  describe 'PDF support' do
    # Unsaved document with a small in-memory PDF attached as pdf_file.
    let(:pdf_document) do
      build(:captain_document, assistant: assistant, account: account).tap do |record|
        record.pdf_file.attach(io: StringIO.new('PDF content'), filename: 'test.pdf', content_type: 'application/pdf')
      end
    end

    describe 'validations' do
      it 'allows PDF file without external link' do
        pdf_document.external_link = nil

        expect(pdf_document).to be_valid
      end

      it 'validates PDF file size' do
        # An 11 MB payload is expected to be rejected with the pdf_size_error message.
        oversized = build(:captain_document, assistant: assistant, account: account)
        oversized.pdf_file.attach(io: StringIO.new('x' * 11.megabytes), filename: 'large.pdf', content_type: 'application/pdf')
        oversized.external_link = nil

        expect(oversized).not_to be_valid
        expect(oversized.errors[:pdf_file]).to include(I18n.t('captain.documents.pdf_size_error'))
      end
    end

    describe '#pdf_document?' do
      it 'returns true for attached PDF' do
        expect(pdf_document.pdf_document?).to be true
      end

      it 'returns true for .pdf external links' do
        linked_pdf = build(:captain_document, external_link: 'https://example.com/document.pdf')

        expect(linked_pdf.pdf_document?).to be true
      end

      it 'returns false for non-PDF documents' do
        web_page = build(:captain_document, external_link: 'https://example.com')

        expect(web_page.pdf_document?).to be false
      end
    end

    describe '#display_url' do
      it 'returns Rails blob URL for attached PDFs' do
        pdf_document.save!

        # NOTE(review): the original author's note says display_url goes through rails_blob_url and
        # yields a URL containing 'rails/active_storage'; only presence is asserted here - confirm.
        blob_url = pdf_document.display_url

        expect(blob_url).to be_present
      end

      it 'returns external_link for web documents' do
        web_page = create(:captain_document, external_link: 'https://example.com')

        expect(web_page.display_url).to eq('https://example.com')
      end
    end

    describe '#store_openai_file_id' do
      it 'stores the file ID in metadata' do
        pdf_document.save!
        pdf_document.store_openai_file_id('file-abc123')

        # Reload so the value is read back from the database rather than from memory.
        persisted_id = pdf_document.reload.openai_file_id

        expect(persisted_id).to eq('file-abc123')
      end
    end

    describe 'automatic external_link generation' do
      it 'generates unique external_link for PDFs' do
        pdf_document.external_link = nil
        pdf_document.save!

        expect(pdf_document.external_link).to start_with('PDF: test_')
      end
    end
  end
end

View File

@@ -0,0 +1,106 @@
require 'rails_helper'
require 'custom_exceptions/pdf_processing_error'
# Service spec for paginated FAQ generation from a PDF previously uploaded to OpenAI.
RSpec.describe Captain::Llm::PaginatedFaqGeneratorService do
  let(:document) { create(:captain_document) }
  let(:service) { described_class.new(document, pages_per_chunk: 5) }
  let(:openai_client) { instance_double(OpenAI::Client) }

  before do
    # Mock OpenAI configuration
    api_key_config = instance_double(InstallationConfig, value: 'test-api-key')
    allow(InstallationConfig).to receive(:find_by!).with(name: 'CAPTAIN_OPEN_AI_API_KEY').and_return(api_key_config)
    allow(OpenAI::Client).to receive(:new).and_return(openai_client)
  end

  describe '#generate' do
    context 'when document lacks OpenAI file ID' do
      before { allow(document).to receive(:openai_file_id).and_return(nil) }

      it 'raises an error' do
        expect { service.generate }.to raise_error(CustomExceptions::PdfFaqGenerationError)
      end
    end

    context 'when generating FAQs from PDF pages' do
      # Wraps a payload as JSON in the nested choices/message/content hash used as the stubbed chat return.
      def chat_completion(payload)
        { 'choices' => [{ 'message' => { 'content' => JSON.generate(payload) } }] }
      end

      let(:faq_response) do
        chat_completion(
          'faqs' => [{ 'question' => 'What is this document about?', 'answer' => 'It explains key concepts.' }],
          'has_content' => true
        )
      end
      let(:empty_response) { chat_completion('faqs' => [], 'has_content' => false) }

      before { allow(document).to receive(:openai_file_id).and_return('file-123') }

      it 'generates FAQs from paginated content' do
        # The first chat call yields one FAQ, every later call yields the empty payload.
        allow(openai_client).to receive(:chat).and_return(faq_response, empty_response)

        generated = service.generate

        expect(generated.size).to eq(1)
        expect(generated.first['question']).to eq('What is this document about?')
      end

      it 'stops when no more content' do
        allow(openai_client).to receive(:chat).and_return(empty_response)

        expect(service.generate).to be_empty
      end

      it 'respects max iterations limit' do
        allow(openai_client).to receive(:chat).and_return(faq_response)
        # Force max iterations
        service.instance_variable_set(:@iterations_completed, 19)

        service.generate

        expect(service.iterations_completed).to eq(20)
      end
    end
  end

  describe '#should_continue_processing?' do
    it 'stops at max iterations' do
      service.instance_variable_set(:@iterations_completed, 20)

      verdict = service.should_continue_processing?(faqs: ['faq'], has_content: true)

      expect(verdict).to be false
    end

    it 'stops when no FAQs returned' do
      verdict = service.should_continue_processing?(faqs: [], has_content: true)

      expect(verdict).to be false
    end

    it 'continues when FAQs exist and under limits' do
      verdict = service.should_continue_processing?(faqs: ['faq'], has_content: true)

      expect(verdict).to be true
    end
  end
end

View File

@@ -0,0 +1,58 @@
require 'rails_helper'
require 'custom_exceptions/pdf_processing_error'
# Service spec for uploading a document's PDF to OpenAI and recording the returned file id.
RSpec.describe Captain::Llm::PdfProcessingService do
  let(:document) { create(:captain_document) }
  let(:service) { described_class.new(document) }

  before do
    # Mock OpenAI configuration
    api_key_config = instance_double(InstallationConfig, value: 'test-api-key')
    allow(InstallationConfig).to receive(:find_by!).with(name: 'CAPTAIN_OPEN_AI_API_KEY').and_return(api_key_config)
  end

  describe '#process' do
    context 'when document already has OpenAI file ID' do
      before { allow(document).to receive(:openai_file_id).and_return('existing-file-id') }

      it 'skips upload' do
        expect(document).not_to receive(:store_openai_file_id)

        service.process
      end
    end

    context 'when uploading PDF to OpenAI' do
      let(:mock_client) { instance_double(OpenAI::Client) }
      let(:pdf_content) { 'PDF content' }

      before do
        allow(document).to receive(:openai_file_id).and_return(nil)

        # Use a simple double for ActiveStorage since it's a complex Rails object
        attachment = double('pdf_file', download: pdf_content) # rubocop:disable RSpec/VerifiedDoubles
        allow(document).to receive(:pdf_file).and_return(attachment)

        allow(OpenAI::Client).to receive(:new).and_return(mock_client)

        # Use a simple double for OpenAI::Files as it may not be loaded
        files_endpoint = double('files_api', upload: { 'id' => 'file-abc123' }) # rubocop:disable RSpec/VerifiedDoubles
        allow(mock_client).to receive(:files).and_return(files_endpoint)
      end

      it 'uploads PDF and stores file ID' do
        expect(document).to receive(:store_openai_file_id).with('file-abc123')

        service.process
      end

      it 'raises error when upload fails' do
        # An upload response without an id is expected to surface as PdfUploadError.
        allow(mock_client.files).to receive(:upload).and_return({ 'id' => nil })

        expect { service.process }.to raise_error(CustomExceptions::PdfUploadError)
      end
    end
  end
end