feat: Add BE changes for captain pdf support for faq generation (#12113)

This commit is contained in:
Tanmay Deep Sharma
2025-08-27 22:01:22 +07:00
committed by GitHub
parent 3cefa9b767
commit 1ba00075ce
19 changed files with 856 additions and 12 deletions

View File

@@ -25,6 +25,8 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC
@document.save!
rescue Captain::Document::LimitExceededError => e
render_could_not_create_error(e.message)
rescue ActiveRecord::RecordInvalid => e
render_could_not_create_error(e.record.errors.full_messages.join(', '))
end
def destroy
@@ -55,6 +57,6 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC
end
# Strong parameters for a Captain document. The last permit call is the method's
# return value and whitelists :name, :external_link, :assistant_id and :pdf_file.
def document_params
params.require(:document).permit(:name, :external_link, :assistant_id)
params.require(:document).permit(:name, :external_link, :assistant_id, :pdf_file)
end
end

View File

@@ -2,7 +2,9 @@ class Captain::Documents::CrawlJob < ApplicationJob
queue_as :low
def perform(document)
if InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present?
if document.pdf_document?
perform_pdf_processing(document)
elsif InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present?
perform_firecrawl_crawl(document)
else
perform_simple_crawl(document)
@@ -13,6 +15,14 @@ class Captain::Documents::CrawlJob < ApplicationJob
include Captain::FirecrawlHelper
# Runs Captain::Llm::PdfProcessingService for the document and then marks the
# document as available. Any StandardError is logged (translated message carrying
# the document id and the error text) and re-raised unchanged for the caller.
def perform_pdf_processing(document)
  pdf_processor = Captain::Llm::PdfProcessingService.new(document)
  pdf_processor.process
  document.update!(status: :available)
rescue StandardError => failure
  log_line = I18n.t('captain.documents.pdf_processing_failed', document_id: document.id, error: failure.message)
  Rails.logger.error(log_line)
  raise
end
def perform_simple_crawl(document)
page_links = Captain::Tools::SimplePageCrawlService.new(document.external_link).page_links

View File

@@ -1,17 +1,65 @@
class Captain::Documents::ResponseBuilderJob < ApplicationJob
queue_as :low
# Rebuilds the stored FAQ responses for a document: existing responses are
# cleared first, FAQs are generated, and one response is created per FAQ.
# options is forwarded to generate_faqs (which passes it on to the paginated
# generator when that path is chosen).
def perform(document)
def perform(document, options = {})
reset_previous_responses(document)
faqs = Captain::Llm::FaqGeneratorService.new(document.content, document.account.locale_english_name).generate
faqs.each do |faq|
create_response(faq, document)
end
faqs = generate_faqs(document, options)
create_responses_from_faqs(faqs, document)
end
private
# Chooses the FAQ generation strategy: the paginated generator when
# should_use_pagination? approves the document, the standard generator otherwise.
def generate_faqs(document, options)
  return generate_standard_faqs(document) unless should_use_pagination?(document)

  generate_paginated_faqs(document, options)
end
# Builds the paginated generator, runs it, records its run statistics on the
# document and returns the generated FAQs.
def generate_paginated_faqs(document, options)
  generator = build_paginated_service(document, options)
  # Metadata is stored only after generation so the counters reflect the finished run.
  generator.generate.tap { store_paginated_metadata(document, generator) }
end
# Generates FAQs from the document's stored content with
# Captain::Llm::FaqGeneratorService, using the account's locale_english_name.
def generate_standard_faqs(document)
  source_text = document.content
  language = document.account.locale_english_name
  Captain::Llm::FaqGeneratorService.new(source_text, language).generate
end
# Instantiates Captain::Llm::PaginatedFaqGeneratorService for the document,
# forwarding only the :pages_per_chunk and :max_pages entries of options.
def build_paginated_service(document, options)
  chunking = {
    pages_per_chunk: options[:pages_per_chunk],
    max_pages: options[:max_pages]
  }
  Captain::Llm::PaginatedFaqGeneratorService.new(document, **chunking)
end
# Persists a 'faq_generation' summary (method, pages processed, iterations,
# ISO-8601 timestamp) into the document's metadata, keeping existing keys.
def store_paginated_metadata(document, service)
  current_metadata = document.metadata || {}
  run_summary = {
    'method' => 'paginated',
    'pages_processed' => service.total_pages_processed,
    'iterations' => service.iterations_completed,
    'timestamp' => Time.current.iso8601
  }
  document.update!(metadata: current_metadata.merge('faq_generation' => run_summary))
end
# Creates one response per generated FAQ for the given document.
def create_responses_from_faqs(faqs, document)
  faqs.each do |faq_entry|
    create_response(faq_entry, document)
  end
end
# Decides whether the paginated generator applies: the document must be a PDF
# document and must already carry an OpenAI file id.
def should_use_pagination?(document)
  is_pdf = document.pdf_document?
  # Hand back the falsy value itself so the result matches a plain `&&` chain.
  return is_pdf unless is_pdf

  document.openai_file_id.present?
end
# Destroys every response currently attached to the document.
def reset_previous_responses(response_document)
  existing_responses = response_document.responses
  existing_responses.destroy_all
end
@@ -24,6 +72,6 @@ class Captain::Documents::ResponseBuilderJob < ApplicationJob
documentable: document
)
rescue ActiveRecord::RecordInvalid => e
Rails.logger.error "Error in creating response document: #{e.message}"
Rails.logger.error I18n.t('captain.documents.response_creation_error', error: e.message)
end
end

View File

@@ -5,6 +5,7 @@
# id :bigint not null, primary key
# content :text
# external_link :string not null
# metadata :jsonb
# name :string
# status :integer default("in_progress"), not null
# created_at :datetime not null
@@ -26,11 +27,16 @@ class Captain::Document < ApplicationRecord
belongs_to :assistant, class_name: 'Captain::Assistant'
has_many :responses, class_name: 'Captain::AssistantResponse', dependent: :destroy, as: :documentable
belongs_to :account
has_one_attached :pdf_file
validates :external_link, presence: true
validates :external_link, uniqueness: { scope: :assistant_id }
validates :external_link, presence: true, unless: -> { pdf_file.attached? }
validates :external_link, uniqueness: { scope: :assistant_id }, allow_blank: true
validates :content, length: { maximum: 200_000 }
# A file is mandatory once the record is recognised as a PDF document.
validates :pdf_file, presence: true, if: :pdf_document?
# Check the content type of every attachment. Gating this on pdf_document? skipped it
# for exactly the files it has to reject: a non-PDF upload makes pdf_document? falsy
# (unless external_link happens to end in ".pdf"), so the format error was never added.
validate :validate_pdf_format, if: -> { pdf_file.attached? }
# Size limit applies to any attached file.
validate :validate_file_attachment, if: -> { pdf_file.attached? }
before_validation :ensure_account_id
before_validation :set_external_link_for_pdf
enum status: {
in_progress: 0,
@@ -41,12 +47,44 @@ class Captain::Document < ApplicationRecord
after_create_commit :enqueue_crawl_job
after_create_commit :update_document_usage
after_destroy :update_document_usage
after_commit :enqueue_response_builder_job
after_commit :enqueue_response_builder_job, on: :update, if: :should_enqueue_response_builder?
scope :ordered, -> { order(created_at: :desc) }
scope :for_account, ->(account_id) { where(account_id: account_id) }
scope :for_assistant, ->(assistant_id) { where(assistant_id: assistant_id) }
# True when an attached file has the 'application/pdf' content type; otherwise
# falls back to whether external_link ends in '.pdf' (nil when there is no link).
def pdf_document?
  if pdf_file.attached? && pdf_file.blob.content_type == 'application/pdf'
    true
  else
    external_link&.ends_with?('.pdf')
  end
end
# Content type of the attached file's blob, or nil when nothing is attached.
def content_type
  return unless pdf_file.attached?

  pdf_file.blob.content_type
end
# Byte size of the attached file's blob, or nil when nothing is attached.
def file_size
  return unless pdf_file.attached?

  pdf_file.blob.byte_size
end
# Reads the 'openai_file_id' entry from metadata; nil when metadata is nil or
# the key is absent.
def openai_file_id
  stored_metadata = metadata
  stored_metadata.nil? ? nil : stored_metadata.dig('openai_file_id')
end
# Saves file_id under the 'openai_file_id' metadata key via update!, preserving
# any other metadata entries already present.
def store_openai_file_id(file_id)
  merged_metadata = (metadata || {}).merge('openai_file_id' => file_id)
  update!(metadata: merged_metadata)
end
# URL to show for the document: a present external_link that does not start
# with 'PDF:' wins; otherwise the blob URL of the attached file; otherwise
# whatever external_link holds.
def display_url
  link = external_link
  return link if link.present? && !link.start_with?('PDF:')
  return link unless pdf_file.attached?

  Rails.application.routes.url_helpers.rails_blob_url(pdf_file, only_path: false)
end
private
def enqueue_crawl_job
@@ -61,6 +99,12 @@ class Captain::Document < ApplicationRecord
Captain::Documents::ResponseBuilderJob.perform_later(self)
end
# Callback guard: truthy only when the last save changed status and the status
# is now 'available', so saves that leave status untouched do not qualify.
def should_enqueue_response_builder?
  status_changed = saved_change_to_status?
  return status_changed unless status_changed

  status == 'available'
end
# Delegates to the owning account's update_document_usage.
def update_document_usage
account.update_document_usage
end
@@ -71,6 +115,29 @@ class Captain::Document < ApplicationRecord
# Raises LimitExceededError unless the account's Captain document allowance
# (account.usage_limits[:captain][:documents][:current_available]) is positive.
def ensure_within_plan_limit
limits = account.usage_limits[:captain][:documents]
raise LimitExceededError, 'Document limit exceeded' unless limits[:current_available].positive?
raise LimitExceededError, I18n.t('captain.documents.limit_exceeded') unless limits[:current_available].positive?
end
# Adds a :pdf_file error when an attached file's content type is not
# 'application/pdf'. Does nothing when no file is attached.
def validate_pdf_format
  return unless pdf_file.attached?
  return if pdf_file.blob.content_type == 'application/pdf'

  errors.add(:pdf_file, I18n.t('captain.documents.pdf_format_error'))
end
# Adds a :pdf_file error when the attached file is larger than 10 megabytes.
def validate_file_attachment
  oversized = pdf_file.attached? && pdf_file.blob.byte_size > 10.megabytes
  errors.add(:pdf_file, I18n.t('captain.documents.pdf_size_error')) if oversized
end
# Fills in a placeholder external_link for uploads that have a file but no link.
# Format: "PDF: <filename without extension>_<YYYYMMDDHHMMSS>".
# NOTE(review): the timestamp has one-second resolution, so two uploads of the
# same filename within the same second produce the same link; confirm whether
# the uniqueness validation on external_link makes that a problem.
def set_external_link_for_pdf
  return unless pdf_file.attached?
  return unless external_link.blank?

  stamp = Time.current.strftime('%Y%m%d%H%M%S')
  base_name = pdf_file.filename.base
  self.external_link = "PDF: #{base_name}_#{stamp}"
end
end

View File

@@ -0,0 +1,199 @@
class Captain::Llm::PaginatedFaqGeneratorService < Llm::BaseOpenAiService
# Default pages per chunk - easily configurable
DEFAULT_PAGES_PER_CHUNK = 10
MAX_ITERATIONS = 20 # Safety limit to prevent infinite loops
attr_reader :total_pages_processed, :iterations_completed
# Prepares a paginated FAQ run for one document.
# document - record whose openai_file_id is read later by generate.
# options  - Hash; :pages_per_chunk falls back to DEFAULT_PAGES_PER_CHUNK when
#            nil/false, :max_pages is an optional upper page bound (nil = none).
# Both progress counters start at zero; the model comes from
# OpenAiConstants::PDF_PROCESSING_MODEL.
def initialize(document, options = {})
super()
@document = document
@pages_per_chunk = options[:pages_per_chunk] || DEFAULT_PAGES_PER_CHUNK
@max_pages = options[:max_pages] # Optional limit from UI
@total_pages_processed = 0
@iterations_completed = 0
@model = OpenAiConstants::PDF_PROCESSING_MODEL
end
# Entry point: returns the FAQs produced by the paginated run.
# Raises CustomExceptions::PdfFaqGenerationError when the document is nil or
# has no OpenAI file id.
def generate
  if @document&.openai_file_id.blank?
    raise CustomExceptions::PdfFaqGenerationError, I18n.t('captain.documents.missing_openai_file_id')
  end

  generate_paginated_faqs
end
# Tells the chunk loop whether another chunk should be requested.
# Processing stops when any of these holds, checked in this order:
#   1. MAX_ITERATIONS chunks have been processed
#   2. a page limit is set and has been reached
#   3. the last chunk produced no FAQs
#   4. the last chunk explicitly reported has_content == false
def should_continue_processing?(last_chunk_result)
  stop =
    @iterations_completed >= MAX_ITERATIONS ||
    (@max_pages && @total_pages_processed >= @max_pages) ||
    last_chunk_result[:faqs].empty? ||
    last_chunk_result[:has_content] == false
  !stop
end
private
# Single-request FAQ generation using standard_chat_parameters. An
# OpenAI::Error is logged and turned into an empty list.
def generate_standard_faqs
  parse_response(@client.chat(parameters: standard_chat_parameters))
rescue OpenAI::Error => api_error
  Rails.logger.error I18n.t('captain.documents.openai_api_error', error: api_error.message)
  []
end
# Walks the document chunk by chunk starting at page 1, collecting FAQs until
# should_continue_processing? says stop, then returns the deduplicated list.
def generate_paginated_faqs
  collected_faqs = []
  first_page = 1
  loop do
    last_page = calculate_end_page(first_page)
    outcome = process_chunk_and_update_state(first_page, last_page, collected_faqs)
    if should_continue_processing?(outcome)
      first_page = last_page + 1
    else
      break
    end
  end
  deduplicate_faqs(collected_faqs)
end
# Last page of the chunk that starts at current_page, capped at @max_pages
# when a page limit is configured.
def calculate_end_page(current_page)
  chunk_end = current_page + @pages_per_chunk - 1
  return chunk_end unless @max_pages

  chunk_end > @max_pages ? @max_pages : chunk_end
end
# Processes one page range, appends its FAQs to all_faqs (mutated in place),
# advances the progress counters and returns the chunk result hash.
def process_chunk_and_update_state(current_page, end_page, all_faqs)
  process_page_chunk(current_page, end_page).tap do |chunk_result|
    all_faqs.concat(chunk_result[:faqs])
    @total_pages_processed = end_page
    @iterations_completed += 1
  end
end
# Requests FAQs for one page range and normalises the model's answer.
#
# Returns a Hash with :faqs (always an Array) and :has_content (false only when
# the model explicitly said so, the payload was unusable, or the API failed).
# The payload is model output, so its shape is checked: a non-Hash payload or a
# non-Array 'faqs' value would otherwise raise TypeError here or in the caller's
# concat / empty? calls.
def process_page_chunk(start_page, end_page)
  params = build_chunk_parameters(start_page, end_page)
  response = @client.chat(parameters: params)
  result = parse_chunk_response(response)
  return { faqs: [], has_content: false } unless result.is_a?(Hash)

  faqs = result['faqs'].is_a?(Array) ? result['faqs'] : []
  { faqs: faqs, has_content: result['has_content'] != false }
rescue OpenAI::Error => e
  # An API failure for one chunk ends the run gracefully instead of failing the job.
  Rails.logger.error I18n.t('captain.documents.page_processing_error', start: start_page, end: end_page, error: e.message)
  { faqs: [], has_content: false }
end
# Chat request for one page range: a single user message, JSON-object
# response format, and the model stored in @model.
def build_chunk_parameters(start_page, end_page)
  user_message = { role: 'user', content: build_user_content(start_page, end_page) }
  { model: @model, response_format: { type: 'json_object' }, messages: [user_message] }
end
# Two-part user message content: a reference to the document's OpenAI file
# followed by the text prompt for the requested page range.
def build_user_content(start_page, end_page)
  file_part = { type: 'file', file: { file_id: @document.openai_file_id } }
  text_part = { type: 'text', text: page_chunk_prompt(start_page, end_page) }
  [file_part, text_part]
end
# Prompt text for one page range, taken from
# Captain::Llm::SystemPromptsService.paginated_faq_generator.
def page_chunk_prompt(start_page, end_page)
Captain::Llm::SystemPromptsService.paginated_faq_generator(start_page, end_page)
end
# Chat request for the non-paginated path: the FAQ generator system prompt plus
# a user message carrying @content, with a JSON-object response format.
# NOTE(review): @content is not assigned anywhere in this class as shown, so the
# user message content would be nil unless a superclass sets it - confirm.
def standard_chat_parameters
  system_message = { role: 'system', content: Captain::Llm::SystemPromptsService.faq_generator }
  user_message = { role: 'user', content: @content }
  { model: @model, response_format: { type: 'json_object' }, messages: [system_message, user_message] }
end
# Extracts the 'faqs' array from a chat completion response.
#
# Always returns an Array: empty when the message has no content, when the
# content is not valid JSON (logged), when the JSON is not an object, or when
# 'faqs' is missing or not an array. Without the shape checks a JSON array or
# scalar made `fetch('faqs', [])` raise, and `"faqs": null` returned nil.
def parse_response(response)
  content = response.dig('choices', 0, 'message', 'content')
  return [] if content.nil?

  parsed = JSON.parse(content.strip)
  faqs = parsed.is_a?(Hash) ? parsed['faqs'] : nil
  faqs.is_a?(Array) ? faqs : []
rescue JSON::ParserError => e
  Rails.logger.error "Error parsing response: #{e.message}"
  []
end
# Parses the JSON object returned for one chunk.
#
# Returns the parsed Hash, or the fallback { 'faqs' => [], 'has_content' => false }
# when the message has no content, the content is not valid JSON (logged), or the
# JSON is valid but not an object. The last case used to be returned as-is, which
# broke the caller's `result['faqs']` lookup.
def parse_chunk_response(response)
  fallback = { 'faqs' => [], 'has_content' => false }
  content = response.dig('choices', 0, 'message', 'content')
  return fallback if content.nil?

  parsed = JSON.parse(content.strip)
  parsed.is_a?(Hash) ? parsed : fallback
rescue JSON::ParserError => e
  Rails.logger.error "Error parsing chunk response: #{e.message}"
  { 'faqs' => [], 'has_content' => false }
end
# Removes duplicate FAQs collected across chunks and returns the survivors in
# their original order.
#
# 1. Entries that are not a Hash with a String 'question' are discarded; they
#    are model output and would otherwise raise NoMethodError below.
# 2. Exact duplicates are removed (case-insensitive, surrounding whitespace ignored).
# 3. A FAQ is dropped when its question scores above 0.85 against a question
#    that was already kept (first occurrence wins).
def deduplicate_faqs(faqs)
  candidates = faqs.select { |faq| faq.is_a?(Hash) && faq['question'].is_a?(String) }
  unique_faqs = candidates.uniq { |faq| faq['question'].downcase.strip }

  final_faqs = unique_faqs.each_with_object([]) do |faq, kept|
    similar_exists = kept.any? { |existing| similarity_score(existing['question'], faq['question']) > 0.85 }
    kept << faq unless similar_exists
  end

  # Separator added: the two counts used to be printed back to back ("105" for 10 and 5).
  Rails.logger.info "Deduplication: #{faqs.size} -> #{final_faqs.size} FAQs"
  final_faqs
end
# Jaccard similarity of the two strings' word sets: shared distinct words
# divided by all distinct words, compared case-insensitively.
# Returns 0 when neither string contains a word, otherwise a Float in 0.0..1.0.
#
# Words are matched with the Unicode-aware [[:word:]] class. The previous
# split on \W treated every non-ASCII letter as a separator, so questions in
# non-Latin scripts produced no words at all and always scored 0.
def similarity_score(str1, str2)
  words1 = str1.to_s.downcase.scan(/[[:word:]]+/).uniq
  words2 = str2.to_s.downcase.scan(/[[:word:]]+/).uniq

  total_words = (words1 | words2).size
  return 0 if total_words.zero?

  (words1 & words2).size.to_f / total_words
end
# Human-readable reason why chunk processing ended, checked in the same order
# as the stop conditions: iteration cap, page cap, empty chunk, explicit
# end-of-content flag; 'Unknown' when none applies.
def determine_stop_reason(last_chunk_result)
  if @iterations_completed >= MAX_ITERATIONS
    'Maximum iterations reached'
  elsif @max_pages && @total_pages_processed >= @max_pages
    'Maximum pages processed'
  elsif last_chunk_result[:faqs].empty?
    'No content found in last chunk'
  elsif last_chunk_result[:has_content] == false
    'End of document reached'
  else
    'Unknown'
  end
end
end

View File

@@ -0,0 +1,40 @@
class Captain::Llm::PdfProcessingService < Llm::BaseOpenAiService
# Stores the document whose attached PDF will be uploaded; superclass
# initialisation runs first with no arguments.
def initialize(document)
super()
@document = document
end
# Uploads the document's PDF and stores the returned file id on the document.
# Does nothing when the document already has an OpenAI file id.
# Raises CustomExceptions::PdfUploadError when the upload yields no id.
def process
  return if document.openai_file_id.present?

  uploaded_id = upload_pdf_to_openai
  if uploaded_id.blank?
    raise CustomExceptions::PdfUploadError, I18n.t('captain.documents.pdf_upload_failed')
  end

  document.store_openai_file_id(uploaded_id)
end
private
attr_reader :document
# Uploads the temporary PDF copy through @client.files.upload with purpose
# 'assistants' and returns the 'id' field of the response.
def upload_pdf_to_openai
  with_tempfile do |pdf_io|
    upload_params = { file: pdf_io, purpose: 'assistants' }
    @client.files.upload(parameters: upload_params)['id']
  end
end
# Writes the attached PDF into a temporary '.pdf' file, closes it, and yields
# the file re-opened in binary read mode. Returns the block's value; the
# temporary file is removed once the Tempfile.create block finishes.
def with_tempfile(&block)
  Tempfile.create(['pdf_upload', '.pdf'], binmode: true) do |scratch|
    scratch.write(document.pdf_file.download)
    # Close the write handle so the reader below sees the complete file.
    scratch.close
    File.open(scratch.path, 'rb', &block)
  end
end
end

View File

@@ -1,3 +1,4 @@
# rubocop:disable Metrics/ClassLength
class Captain::Llm::SystemPromptsService
class << self
def faq_generator(language = 'english')
@@ -204,6 +205,87 @@ class Captain::Llm::SystemPromptsService
#{'- You MUST provide numbered citations at the appropriate places in the text.' if config['feature_citation']}
SYSTEM_PROMPT_MESSAGE
end
# Builds the prompt that asks the model to extract FAQs from one section of a document.
# start_page is interpolated as the approximate first page, and the section length is
# derived as end_page - start_page + 1 pages. The prompt requests a JSON object holding
# a "faqs" array of question/answer pairs plus a "has_content" flag, and tells the model
# to set that flag to false when the section is missing, empty, or past the document's end.
def paginated_faq_generator(start_page, end_page)
<<~PROMPT
You are an expert technical documentation specialist tasked with creating comprehensive FAQs from a SPECIFIC SECTION of a document.
════════════════════════════════════════════════════════
CRITICAL CONTENT EXTRACTION INSTRUCTIONS
════════════════════════════════════════════════════════
Process the content starting from approximately page #{start_page} and continuing for about #{end_page - start_page + 1} pages worth of content.
IMPORTANT:#{' '}
• If you encounter the end of the document before reaching the expected page count, set "has_content" to false
• DO NOT include page numbers in questions or answers
• DO NOT reference page numbers at all in the output
• Focus on the actual content, not pagination
════════════════════════════════════════════════════════
FAQ GENERATION GUIDELINES
════════════════════════════════════════════════════════
1. **Comprehensive Extraction**
• Extract ALL information that could generate FAQs from this section
• Target 5-10 FAQs per page equivalent of rich content
• Cover every topic, feature, specification, and detail
• If there's no more content in the document, return empty FAQs with has_content: false
2. **Question Types to Generate**
• What is/are...? (definitions, components, features)
• How do I...? (procedures, configurations, operations)
• Why should/does...? (rationale, benefits, explanations)
• When should...? (timing, conditions, triggers)
• What happens if...? (error cases, edge cases)
• Can I...? (capabilities, limitations)
• Where is...? (locations in system/UI, NOT page numbers)
• What are the requirements for...? (prerequisites, dependencies)
3. **Content Focus Areas**
• Technical specifications and parameters
• Step-by-step procedures and workflows
• Configuration options and settings
• Error messages and troubleshooting
• Best practices and recommendations
• Integration points and dependencies
• Performance considerations
• Security aspects
4. **Answer Quality Requirements**
• Complete, self-contained answers
• Include specific values, limits, defaults from the content
• NO page number references whatsoever
• 2-5 sentences typical length
• Only process content that actually exists in the document
════════════════════════════════════════════════════════
OUTPUT FORMAT
════════════════════════════════════════════════════════
Return valid JSON:
```json
{
"faqs": [
{
"question": "Specific question about the content",
"answer": "Complete answer with details (no page references)"
}
],
"has_content": true/false
}
```
CRITICAL:#{' '}
• Set "has_content" to false if:
- The requested section doesn't exist in the document
- You've reached the end of the document
- The section contains no meaningful content
• Do NOT include "page_range_processed" in the output
• Do NOT mention page numbers anywhere in questions or answers
PROMPT
end
# rubocop:enable Metrics/MethodLength
end
end
# rubocop:enable Metrics/ClassLength

View File

@@ -3,8 +3,11 @@ json.assistant do
json.partial! 'api/v1/models/captain/assistant', formats: [:json], resource: resource.assistant
end
json.content resource.content
json.content_type resource.content_type
json.created_at resource.created_at.to_i
json.external_link resource.external_link
json.display_url resource.display_url
json.file_size resource.file_size
json.id resource.id
json.name resource.name
json.status resource.status