fix: Captain response builder not getting triggered (#12729)

## Summary
- Fix captain response builder not getting triggered for cases where
responses are created as completed.

## Testing Instructions 
- Test articles with firecrawl
- Test articles without firecrawl
- Test PDF documents

---------

Co-authored-by: Pranav <pranav@chatwoot.com>
This commit is contained in:
Sojan Jose
2025-10-28 18:31:08 -07:00
committed by GitHub
parent ee1ea9576b
commit 38af08534c
6 changed files with 207 additions and 14 deletions

View File

@@ -5,11 +5,13 @@ class Captain::Tools::FirecrawlParserJob < ApplicationJob
assistant = Captain::Assistant.find(assistant_id)
metadata = payload[:metadata]
canonical_url = normalize_link(metadata['url'])
document = assistant.documents.find_or_initialize_by(
external_link: metadata['url']
external_link: canonical_url
)
document.update!(
external_link: canonical_url,
content: payload[:markdown],
name: metadata['title'],
status: :available
@@ -17,4 +19,10 @@ class Captain::Tools::FirecrawlParserJob < ApplicationJob
rescue StandardError => e
raise "Failed to parse FireCrawl data: #{e.message}"
end
private
def normalize_link(raw_url)
raw_url.to_s.delete_suffix('/')
end
end

View File

@@ -15,11 +15,11 @@ class Captain::Tools::SimplePageCrawlParserJob < ApplicationJob
page_title = crawler.page_title || ''
content = crawler.body_text_content || ''
document = assistant.documents.find_or_initialize_by(
external_link: page_link
)
normalized_link = normalize_link(page_link)
document = assistant.documents.find_or_initialize_by(external_link: normalized_link)
document.update!(
external_link: normalized_link,
name: page_title[0..254], content: content[0..14_999], status: :available
)
rescue StandardError => e
@@ -28,6 +28,10 @@ class Captain::Tools::SimplePageCrawlParserJob < ApplicationJob
private
def normalize_link(raw_link)
raw_link.to_s.delete_suffix('/')
end
def limit_exceeded?(account)
limits = account.usage_limits[:captain][:documents]
limits[:current_available].negative? || limits[:current_available].zero?

View File

@@ -37,6 +37,7 @@ class Captain::Document < ApplicationRecord
validate :validate_file_attachment, if: -> { pdf_file.attached? }
before_validation :ensure_account_id
before_validation :set_external_link_for_pdf
before_validation :normalize_external_link
enum status: {
in_progress: 0,
@@ -47,7 +48,7 @@ class Captain::Document < ApplicationRecord
after_create_commit :enqueue_crawl_job
after_create_commit :update_document_usage
after_destroy :update_document_usage
after_commit :enqueue_response_builder_job, on: :update, if: :should_enqueue_response_builder?
after_commit :enqueue_response_builder_job
scope :ordered, -> { order(created_at: :desc) }
scope :for_account, ->(account_id) { where(account_id: account_id) }
@@ -94,15 +95,18 @@ class Captain::Document < ApplicationRecord
end
def enqueue_response_builder_job
return if status != 'available'
return unless should_enqueue_response_builder?
Captain::Documents::ResponseBuilderJob.perform_later(self)
end
def should_enqueue_response_builder?
# Only enqueue when status changes to available
# Avoid re-enqueueing when metadata is updated by the job itself
saved_change_to_status? && status == 'available'
return false if destroyed?
return false unless available?
return saved_change_to_status? if pdf_document?
(saved_change_to_status? || saved_change_to_content?) && content.present?
end
def update_document_usage
@@ -140,4 +144,11 @@ class Captain::Document < ApplicationRecord
timestamp = Time.current.strftime('%Y%m%d%H%M%S')
self.external_link = "PDF: #{pdf_file.filename.base}_#{timestamp}"
end
def normalize_external_link
return if external_link.blank?
return if pdf_document?
self.external_link = external_link.delete_suffix('/')
end
end

View File

@@ -23,7 +23,7 @@ RSpec.describe Captain::Tools::FirecrawlParserJob, type: :job do
expect(document).to have_attributes(
content: payload[:markdown],
name: payload[:metadata]['title'],
external_link: payload[:metadata]['url'],
external_link: 'https://www.firecrawl.dev',
status: 'available'
)
end
@@ -32,7 +32,7 @@ RSpec.describe Captain::Tools::FirecrawlParserJob, type: :job do
existing_document = create(:captain_document,
assistant: assistant,
account: assistant.account,
external_link: payload[:metadata]['url'],
external_link: 'https://www.firecrawl.dev',
content: 'old content',
name: 'old title',
status: :in_progress)
@@ -42,7 +42,9 @@ RSpec.describe Captain::Tools::FirecrawlParserJob, type: :job do
end.not_to change(assistant.documents, :count)
existing_document.reload
# Payload URL ends with '/', but we persist the canonical URL without it.
expect(existing_document).to have_attributes(
external_link: 'https://www.firecrawl.dev',
content: payload[:markdown],
name: payload[:metadata]['title'],
status: 'available'

View File

@@ -3,7 +3,7 @@ require 'rails_helper'
RSpec.describe Captain::Tools::SimplePageCrawlParserJob, type: :job do
describe '#perform' do
let(:assistant) { create(:captain_assistant) }
let(:page_link) { 'https://example.com/page' }
let(:page_link) { 'https://example.com/page/' }
let(:page_title) { 'Example Page Title' }
let(:content) { 'Some page content here' }
let(:crawler) { instance_double(Captain::Tools::SimplePageCrawlService) }
@@ -24,7 +24,7 @@ RSpec.describe Captain::Tools::SimplePageCrawlParserJob, type: :job do
end.to change(assistant.documents, :count).by(1)
document = assistant.documents.last
expect(document.external_link).to eq(page_link)
expect(document.external_link).to eq('https://example.com/page')
expect(document.name).to eq(page_title)
expect(document.content).to eq(content)
expect(document.status).to eq('available')
@@ -33,7 +33,7 @@ RSpec.describe Captain::Tools::SimplePageCrawlParserJob, type: :job do
it 'updates existing document if one exists' do
existing_document = create(:captain_document,
assistant: assistant,
external_link: page_link,
external_link: 'https://example.com/page',
name: 'Old Title',
content: 'Old content')

View File

@@ -4,6 +4,17 @@ RSpec.describe Captain::Document, type: :model do
let(:account) { create(:account) }
let(:assistant) { create(:captain_assistant, account: account) }
describe 'URL normalization' do
it 'removes a trailing slash before validation' do
document = create(:captain_document,
assistant: assistant,
account: account,
external_link: 'https://example.com/path/')
expect(document.external_link).to eq('https://example.com/path')
end
end
describe 'PDF support' do
let(:pdf_document) do
doc = build(:captain_document, assistant: assistant, account: account)
@@ -82,4 +93,161 @@ RSpec.describe Captain::Document, type: :model do
end
end
end
describe 'response builder job callback' do
before { clear_enqueued_jobs }
describe 'non-PDF documents' do
it 'enqueues when created with available status and content' do
expect do
create(:captain_document, assistant: assistant, account: account, status: :available)
end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'does not enqueue when created available without content' do
expect do
create(:captain_document, assistant: assistant, account: account, status: :available, content: nil)
end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'enqueues when status transitions to available with existing content' do
document = create(:captain_document, assistant: assistant, account: account, status: :in_progress)
expect do
document.update!(status: :available)
end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'does not enqueue when status transitions to available without content' do
document = create(
:captain_document,
assistant: assistant,
account: account,
status: :in_progress,
content: nil
)
expect do
document.update!(status: :available)
end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'enqueues when content is populated on an available document' do
document = create(
:captain_document,
assistant: assistant,
account: account,
status: :available,
content: nil
)
clear_enqueued_jobs
expect do
document.update!(content: 'Fresh content from crawl')
end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'enqueues when content changes on an available document' do
document = create(
:captain_document,
assistant: assistant,
account: account,
status: :available,
content: 'Initial content'
)
clear_enqueued_jobs
expect do
document.update!(content: 'Updated crawl content')
end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'does not enqueue when content is cleared on an available document' do
document = create(
:captain_document,
assistant: assistant,
account: account,
status: :available,
content: 'Initial content'
)
clear_enqueued_jobs
expect do
document.update!(content: nil)
end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'does not enqueue for metadata-only updates' do
document = create(:captain_document, assistant: assistant, account: account, status: :available)
clear_enqueued_jobs
expect do
document.update!(metadata: { 'title' => 'Updated Again' })
end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'does not enqueue while document remains in progress' do
document = create(:captain_document, assistant: assistant, account: account, status: :in_progress)
expect do
document.update!(metadata: { 'title' => 'Updated' })
end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
end
describe 'PDF documents' do
def build_pdf_document(status:, content:)
build(
:captain_document,
assistant: assistant,
account: account,
status: status,
content: content
).tap do |doc|
doc.pdf_file.attach(
io: StringIO.new('PDF content'),
filename: 'sample.pdf',
content_type: 'application/pdf'
)
end
end
it 'enqueues when created available without content' do
document = build_pdf_document(status: :available, content: nil)
expect do
document.save!
end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'enqueues when status transitions to available' do
document = build_pdf_document(status: :in_progress, content: nil)
document.save!
clear_enqueued_jobs
expect do
document.update!(status: :available)
end.to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
it 'does not enqueue when content updates without status change' do
document = build_pdf_document(status: :available, content: nil)
document.save!
clear_enqueued_jobs
expect do
document.update!(content: 'Extracted PDF text')
end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
end
it 'does not enqueue when the document is destroyed' do
document = create(:captain_document, assistant: assistant, account: account, status: :available)
clear_enqueued_jobs
expect do
document.destroy!
end.not_to have_enqueued_job(Captain::Documents::ResponseBuilderJob)
end
end
end