From 38af08534cf87225328ff98786b988cf16645294 Mon Sep 17 00:00:00 2001 From: Sojan Jose Date: Tue, 28 Oct 2025 18:31:08 -0700 Subject: [PATCH] fix: Captain response builder not getting triggered (#12729) ## Summary - Fix captain response builder not getting triggered for cases where responses are created as completed. ## Testing Instructions - Test articles with firecrawl - Test articles without firecrawl - Test PDF documents --------- Co-authored-by: Pranav --- .../captain/tools/firecrawl_parser_job.rb | 10 +- .../tools/simple_page_crawl_parser_job.rb | 10 +- enterprise/app/models/captain/document.rb | 21 ++- .../tools/firecrawl_parser_job_spec.rb | 6 +- .../simple_page_crawl_parser_job_spec.rb | 6 +- .../models/captain/document_spec.rb | 168 ++++++++++++++++++ 6 files changed, 207 insertions(+), 14 deletions(-) diff --git a/enterprise/app/jobs/captain/tools/firecrawl_parser_job.rb b/enterprise/app/jobs/captain/tools/firecrawl_parser_job.rb index 41a50a35e..f34b9a640 100644 --- a/enterprise/app/jobs/captain/tools/firecrawl_parser_job.rb +++ b/enterprise/app/jobs/captain/tools/firecrawl_parser_job.rb @@ -5,11 +5,13 @@ class Captain::Tools::FirecrawlParserJob < ApplicationJob assistant = Captain::Assistant.find(assistant_id) metadata = payload[:metadata] + canonical_url = normalize_link(metadata['url']) document = assistant.documents.find_or_initialize_by( - external_link: metadata['url'] + external_link: canonical_url ) document.update!( + external_link: canonical_url, content: payload[:markdown], name: metadata['title'], status: :available @@ -17,4 +19,10 @@ class Captain::Tools::FirecrawlParserJob < ApplicationJob rescue StandardError => e raise "Failed to parse FireCrawl data: #{e.message}" end + + private + + def normalize_link(raw_url) + raw_url.to_s.delete_suffix('/') + end end diff --git a/enterprise/app/jobs/captain/tools/simple_page_crawl_parser_job.rb b/enterprise/app/jobs/captain/tools/simple_page_crawl_parser_job.rb index ccf34adf2..89054f385 100644 --- a/enterprise/app/jobs/captain/tools/simple_page_crawl_parser_job.rb +++ b/enterprise/app/jobs/captain/tools/simple_page_crawl_parser_job.rb @@ -15,11 +15,11 @@ class Captain::Tools::SimplePageCrawlParserJob < ApplicationJob page_title = crawler.page_title || '' content = crawler.body_text_content || '' - document = assistant.documents.find_or_initialize_by( - external_link: page_link - ) + normalized_link = normalize_link(page_link) + document = assistant.documents.find_or_initialize_by(external_link: normalized_link) document.update!( + external_link: normalized_link, name: page_title[0..254], content: content[0..14_999], status: :available ) rescue StandardError => e @@ -28,6 +28,10 @@ class Captain::Tools::SimplePageCrawlParserJob < ApplicationJob private + def normalize_link(raw_link) + raw_link.to_s.delete_suffix('/') + end + def limit_exceeded?(account) limits = account.usage_limits[:captain][:documents] limits[:current_available].negative? || limits[:current_available].zero? diff --git a/enterprise/app/models/captain/document.rb b/enterprise/app/models/captain/document.rb index c278eb879..751162b28 100644 --- a/enterprise/app/models/captain/document.rb +++ b/enterprise/app/models/captain/document.rb @@ -37,6 +37,7 @@ class Captain::Document < ApplicationRecord validate :validate_file_attachment, if: -> { pdf_file.attached? } before_validation :ensure_account_id before_validation :set_external_link_for_pdf + before_validation :normalize_external_link enum status: { in_progress: 0, @@ -47,7 +48,7 @@ class Captain::Document < ApplicationRecord after_create_commit :enqueue_crawl_job after_create_commit :update_document_usage after_destroy :update_document_usage - after_commit :enqueue_response_builder_job, on: :update, if: :should_enqueue_response_builder? + after_commit :enqueue_response_builder_job scope :ordered, -> { order(created_at: :desc) } scope :for_account, ->(account_id) { where(account_id: account_id) } @@ -94,15 +95,18 @@ class Captain::Document < ApplicationRecord end def enqueue_response_builder_job - return if status != 'available' + return unless should_enqueue_response_builder? Captain::Documents::ResponseBuilderJob.perform_later(self) end def should_enqueue_response_builder? - # Only enqueue when status changes to available - # Avoid re-enqueueing when metadata is updated by the job itself - saved_change_to_status? && status == 'available' + return false if destroyed? + return false unless available? + + return saved_change_to_status? if pdf_document? + + (saved_change_to_status? || saved_change_to_content?) && content.present? end def update_document_usage @@ -140,4 +144,11 @@ class Captain::Document < ApplicationRecord timestamp = Time.current.strftime('%Y%m%d%H%M%S') self.external_link = "PDF: #{pdf_file.filename.base}_#{timestamp}" end + + def normalize_external_link + return if external_link.blank? + return if pdf_document? + + self.external_link = external_link.delete_suffix('/') + end end diff --git a/spec/enterprise/jobs/captain/tools/firecrawl_parser_job_spec.rb b/spec/enterprise/jobs/captain/tools/firecrawl_parser_job_spec.rb index f2a42c7a2..bfffe7894 100644 --- a/spec/enterprise/jobs/captain/tools/firecrawl_parser_job_spec.rb +++ b/spec/enterprise/jobs/captain/tools/firecrawl_parser_job_spec.rb @@ -23,7 +23,7 @@ RSpec.describe Captain::Tools::FirecrawlParserJob, type: :job do expect(document).to have_attributes( content: payload[:markdown], name: payload[:metadata]['title'], - external_link: payload[:metadata]['url'], + external_link: 'https://www.firecrawl.dev', status: 'available' ) end @@ -32,7 +32,7 @@ RSpec.describe Captain::Tools::FirecrawlParserJob, type: :job do existing_document = create(:captain_document, assistant: assistant, account: assistant.account, - external_link: payload[:metadata]['url'], + external_link: 'https://www.firecrawl.dev', content: 'old content', name: 'old title', status: :in_progress) @@ -42,7 +42,9 @@ RSpec.describe Captain::Tools::FirecrawlParserJob, type: :job do end.not_to change(assistant.documents, :count) existing_document.reload + # Payload URL ends with '/', but we persist the canonical URL without it. expect(existing_document).to have_attributes( + external_link: 'https://www.firecrawl.dev', content: payload[:markdown], name: payload[:metadata]['title'], status: 'available' diff --git a/spec/enterprise/jobs/captain/tools/simple_page_crawl_parser_job_spec.rb b/spec/enterprise/jobs/captain/tools/simple_page_crawl_parser_job_spec.rb index f563ba44d..3872629e5 100644 --- a/spec/enterprise/jobs/captain/tools/simple_page_crawl_parser_job_spec.rb +++ b/spec/enterprise/jobs/captain/tools/simple_page_crawl_parser_job_spec.rb @@ -3,7 +3,7 @@ require 'rails_helper' RSpec.describe Captain::Tools::SimplePageCrawlParserJob, type: :job do describe '#perform' do let(:assistant) { create(:captain_assistant) } - let(:page_link) { 'https://example.com/page' } + let(:page_link) { 'https://example.com/page/' } let(:page_title) { 'Example Page Title' } let(:content) { 'Some page content here' } let(:crawler) { instance_double(Captain::Tools::SimplePageCrawlService) } @@ -24,7 +24,7 @@ RSpec.describe Captain::Tools::SimplePageCrawlParserJob, type: :job do end.to change(assistant.documents, :count).by(1) document = assistant.documents.last - expect(document.external_link).to eq(page_link) + expect(document.external_link).to eq('https://example.com/page') expect(document.name).to eq(page_title) expect(document.content).to eq(content) expect(document.status).to eq('available') @@ -33,7 +33,7 @@ RSpec.describe Captain::Tools::SimplePageCrawlParserJob, type: :job do it 'updates existing document if one exists' do existing_document = create(:captain_document, assistant: assistant, - external_link: page_link, + external_link: 'https://example.com/page', name: 'Old Title', content: 'Old content') diff --git a/spec/enterprise/models/captain/document_spec.rb b/spec/enterprise/models/captain/document_spec.rb index 56dc1727c..e6543e0d7 100644 --- a/spec/enterprise/models/captain/document_spec.rb +++ b/spec/enterprise/models/captain/document_spec.rb @@ -4,6 +4,17 @@ RSpec.describe Captain::Document, type: :model do let(:account) { create(:account) } let(:assistant) { create(:captain_assistant, account: account) } + describe 'URL normalization' do + it 'removes a trailing slash before validation' do + document = create(:captain_document, + assistant: assistant, + account: account, + external_link: 'https://example.com/path/') + + expect(document.external_link).to eq('https://example.com/path') + end + end + describe 'PDF support' do let(:pdf_document) do doc = build(:captain_document, assistant: assistant, account: account) @@ -82,4 +93,161 @@ RSpec.describe Captain::Document, type: :model do end end end + + describe 'response builder job callback' do + before { clear_enqueued_jobs } + + describe 'non-PDF documents' do + it 'enqueues when created with available status and content' do + expect do + create(:captain_document, assistant: assistant, account: account, status: :available) + end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'does not enqueue when created available without content' do + expect do + create(:captain_document, assistant: assistant, account: account, status: :available, content: nil) + end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'enqueues when status transitions to available with existing content' do + document = create(:captain_document, assistant: assistant, account: account, status: :in_progress) + + expect do + document.update!(status: :available) + end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'does not enqueue when status transitions to available without content' do + document = create( + :captain_document, + assistant: assistant, + account: account, + status: :in_progress, + content: nil + ) + + expect do + document.update!(status: :available) + end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'enqueues when content is populated on an available document' do + document = create( + :captain_document, + assistant: assistant, + account: account, + status: :available, + content: nil + ) + clear_enqueued_jobs + + expect do + document.update!(content: 'Fresh content from crawl') + end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'enqueues when content changes on an available document' do + document = create( + :captain_document, + assistant: assistant, + account: account, + status: :available, + content: 'Initial content' + ) + clear_enqueued_jobs + + expect do + document.update!(content: 'Updated crawl content') + end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'does not enqueue when content is cleared on an available document' do + document = create( + :captain_document, + assistant: assistant, + account: account, + status: :available, + content: 'Initial content' + ) + clear_enqueued_jobs + + expect do + document.update!(content: nil) + end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'does not enqueue for metadata-only updates' do + document = create(:captain_document, assistant: assistant, account: account, status: :available) + clear_enqueued_jobs + + expect do + document.update!(metadata: { 'title' => 'Updated Again' }) + end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'does not enqueue while document remains in progress' do + document = create(:captain_document, assistant: assistant, account: account, status: :in_progress) + + expect do + document.update!(metadata: { 'title' => 'Updated' }) + end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + end + + describe 'PDF documents' do + def build_pdf_document(status:, content:) + build( + :captain_document, + assistant: assistant, + account: account, + status: status, + content: content + ).tap do |doc| + doc.pdf_file.attach( + io: StringIO.new('PDF content'), + filename: 'sample.pdf', + content_type: 'application/pdf' + ) + end + end + + it 'enqueues when created available without content' do + document = build_pdf_document(status: :available, content: nil) + + expect do + document.save! + end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'enqueues when status transitions to available' do + document = build_pdf_document(status: :in_progress, content: nil) + document.save! + clear_enqueued_jobs + + expect do + document.update!(status: :available) + end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + + it 'does not enqueue when content updates without status change' do + document = build_pdf_document(status: :available, content: nil) + document.save! + clear_enqueued_jobs + + expect do + document.update!(content: 'Extracted PDF text') + end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + end + + it 'does not enqueue when the document is destroyed' do + document = create(:captain_document, assistant: assistant, account: account, status: :available) + clear_enqueued_jobs + + expect do + document.destroy! + end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob) + end + end end