feat: Add BE changes for captain pdf support for faq generation (#12113)

2025-08-27 22:01:22 +07:00
parent 3cefa9b767
commit 1ba00075ce
19 changed files with 856 additions and 12 deletions
--- a/enterprise/app/controllers/api/v1/accounts/captain/documents_controller.rb
+++ b/enterprise/app/controllers/api/v1/accounts/captain/documents_controller.rb
@@ -25,6 +25,8 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC
    @document.save!
  rescue Captain::Document::LimitExceededError => e
    render_could_not_create_error(e.message)
+  rescue ActiveRecord::RecordInvalid => e
+    render_could_not_create_error(e.record.errors.full_messages.join(', '))
  end

  def destroy
@@ -55,6 +57,6 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC
  end

+  # Strong parameters for document requests: requires the `document` key and
+  # permits only :name, :external_link, :assistant_id and :pdf_file.
  def document_params
-    params.require(:document).permit(:name, :external_link, :assistant_id)
+    params.require(:document).permit(:name, :external_link, :assistant_id, :pdf_file)
  end
 end
--- a/enterprise/app/jobs/captain/documents/crawl_job.rb
+++ b/enterprise/app/jobs/captain/documents/crawl_job.rb
@@ -2,7 +2,9 @@ class Captain::Documents::CrawlJob < ApplicationJob
  queue_as :low

  def perform(document)
-    if InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present?
+    if document.pdf_document?
+      perform_pdf_processing(document)
+    elsif InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present?
      perform_firecrawl_crawl(document)
    else
      perform_simple_crawl(document)
@@ -13,6 +15,14 @@ class Captain::Documents::CrawlJob < ApplicationJob

  include Captain::FirecrawlHelper

+  # Runs Captain::Llm::PdfProcessingService for the document and then marks
+  # the document as available. Any failure is logged and re-raised.
+  def perform_pdf_processing(document)
+    processor = Captain::Llm::PdfProcessingService.new(document)
+    processor.process
+    document.update!(status: :available)
+  rescue StandardError => e
+    failure_message = I18n.t('captain.documents.pdf_processing_failed', document_id: document.id, error: e.message)
+    Rails.logger.error(failure_message)
+    # Propagate the original error so the job framework can apply its retry handling.
+    raise
+  end
+
  def perform_simple_crawl(document)
    page_links = Captain::Tools::SimplePageCrawlService.new(document.external_link).page_links

--- a/enterprise/app/jobs/captain/documents/response_builder_job.rb
+++ b/enterprise/app/jobs/captain/documents/response_builder_job.rb
@@ -1,17 +1,65 @@
 class Captain::Documents::ResponseBuilderJob < ApplicationJob
  queue_as :low

+  # Regenerates the stored responses for `document`: previous responses are
+  # removed, FAQs are generated, and one response is created per FAQ.
+  # `options` is handed to FAQ generation, where :pages_per_chunk and
+  # :max_pages are read when the paginated generator is used.
-  def perform(document)
+  def perform(document, options = {})
    reset_previous_responses(document)

-    faqs = Captain::Llm::FaqGeneratorService.new(document.content, document.account.locale_english_name).generate
-    faqs.each do |faq|
-      create_response(faq, document)
-    end
+    faqs = generate_faqs(document, options)
+    create_responses_from_faqs(faqs, document)
  end

  private

+  # Chooses the FAQ strategy for the document: the paginated generator when
+  # should_use_pagination? holds, otherwise the standard content-based one.
+  def generate_faqs(document, options)
+    return generate_standard_faqs(document) unless should_use_pagination?(document)
+
+    generate_paginated_faqs(document, options)
+  end
+
+  # Runs the paginated generator, records its run statistics on the document,
+  # and returns the generated FAQs.
+  def generate_paginated_faqs(document, options)
+    paginated_generator = build_paginated_service(document, options)
+    paginated_generator.generate.tap do
+      store_paginated_metadata(document, paginated_generator)
+    end
+  end
+
+  # Generates FAQs from the document's stored content, in the language named
+  # by the account's `locale_english_name`.
+  def generate_standard_faqs(document)
+    source_text = document.content
+    language = document.account.locale_english_name
+    Captain::Llm::FaqGeneratorService.new(source_text, language).generate
+  end
+
+  # Instantiates the paginated FAQ generator for the document, passing along
+  # the :pages_per_chunk and :max_pages entries of `options` (nil when absent).
+  def build_paginated_service(document, options)
+    chunking = {
+      pages_per_chunk: options[:pages_per_chunk],
+      max_pages: options[:max_pages]
+    }
+    Captain::Llm::PaginatedFaqGeneratorService.new(document, chunking)
+  end
+
+  # Merges a 'faq_generation' summary (method, pages processed, iterations,
+  # ISO 8601 timestamp) into the document's metadata and saves it.
+  def store_paginated_metadata(document, service)
+    existing_metadata = document.metadata || {}
+    generation_summary = {
+      'method' => 'paginated',
+      'pages_processed' => service.total_pages_processed,
+      'iterations' => service.iterations_completed,
+      'timestamp' => Time.current.iso8601
+    }
+
+    document.update!(metadata: existing_metadata.merge('faq_generation' => generation_summary))
+  end
+
+  # Creates one response for each generated FAQ.
+  def create_responses_from_faqs(faqs, document)
+    faqs.each do |faq_entry|
+      create_response(faq_entry, document)
+    end
+  end
+
+  # Pagination is currently used only for PDF documents that already have an
+  # OpenAI file id.
+  def should_use_pagination?(document)
+    is_pdf = document.pdf_document?
+    is_pdf && document.openai_file_id.present?
+  end
+
+  # Destroys every response currently associated with the given document.
  def reset_previous_responses(response_document)
    response_document.responses.destroy_all
  end
@@ -24,6 +72,6 @@ class Captain::Documents::ResponseBuilderJob < ApplicationJob
      documentable: document
    )
  rescue ActiveRecord::RecordInvalid => e
-    Rails.logger.error "Error in creating response document: #{e.message}"
+    Rails.logger.error I18n.t('captain.documents.response_creation_error', error: e.message)
  end
 end
--- a/enterprise/app/models/captain/document.rb
+++ b/enterprise/app/models/captain/document.rb
@@ -5,6 +5,7 @@
 #  id            :bigint           not null, primary key
 #  content       :text
 #  external_link :string           not null
+#  metadata      :jsonb
 #  name          :string
 #  status        :integer          default("in_progress"), not null
 #  created_at    :datetime         not null
@@ -26,11 +27,16 @@ class Captain::Document < ApplicationRecord
  belongs_to :assistant, class_name: 'Captain::Assistant'
  has_many :responses, class_name: 'Captain::AssistantResponse', dependent: :destroy, as: :documentable
  belongs_to :account
+  has_one_attached :pdf_file

-  validates :external_link, presence: true
-  validates :external_link, uniqueness: { scope: :assistant_id }
+  validates :external_link, presence: true, unless: -> { pdf_file.attached? }
+  validates :external_link, uniqueness: { scope: :assistant_id }, allow_blank: true
  validates :content, length: { maximum: 200_000 }
+  # A document identified as a PDF must carry an attachment.
+  validates :pdf_file, presence: true, if: :pdf_document?
+  # Check the format of every attachment, not only of documents already recognised as PDFs:
+  # `pdf_document?` is false for an attached non-PDF file, so gating on it would let such
+  # uploads bypass the format check entirely.
+  validate :validate_pdf_format, if: -> { pdf_file.attached? }
+  validate :validate_file_attachment, if: -> { pdf_file.attached? }
  before_validation :ensure_account_id
+  before_validation :set_external_link_for_pdf

  enum status: {
    in_progress: 0,
@@ -41,12 +47,44 @@ class Captain::Document < ApplicationRecord
  after_create_commit :enqueue_crawl_job
  after_create_commit :update_document_usage
  after_destroy :update_document_usage
-  after_commit :enqueue_response_builder_job
+  after_commit :enqueue_response_builder_job, on: :update, if: :should_enqueue_response_builder?
  scope :ordered, -> { order(created_at: :desc) }

  scope :for_account, ->(account_id) { where(account_id: account_id) }
  scope :for_assistant, ->(assistant_id) { where(assistant_id: assistant_id) }

+  # True when a file with the 'application/pdf' content type is attached;
+  # otherwise reports whether external_link ends with '.pdf'.
+  def pdf_document?
+    attached_pdf = pdf_file.attached? && pdf_file.blob.content_type == 'application/pdf'
+    attached_pdf || external_link&.ends_with?('.pdf')
+  end
+
+  # Content type of the attached file's blob, or nil when nothing is attached.
+  def content_type
+    return unless pdf_file.attached?
+
+    pdf_file.blob.content_type
+  end
+
+  # Byte size of the attached file's blob, or nil when nothing is attached.
+  def file_size
+    return unless pdf_file.attached?
+
+    pdf_file.blob.byte_size
+  end
+
+  # Reads the 'openai_file_id' entry from metadata; nil when metadata is nil.
+  def openai_file_id
+    return if metadata.nil?
+
+    metadata.dig('openai_file_id')
+  end
+
+  # Saves `file_id` under the 'openai_file_id' metadata key, keeping the
+  # other metadata entries.
+  def store_openai_file_id(file_id)
+    current_metadata = metadata || {}
+    update!(metadata: current_metadata.merge('openai_file_id' => file_id))
+  end
+
+  # URL to show for the document: the external link when it is present and
+  # does not start with 'PDF:'; otherwise the blob URL of the attached file,
+  # falling back to external_link when no file is attached.
+  def display_url
+    real_link = external_link.present? && !external_link.start_with?('PDF:')
+    return external_link if real_link || !pdf_file.attached?
+
+    Rails.application.routes.url_helpers.rails_blob_url(pdf_file, only_path: false)
+  end
+
  private

  def enqueue_crawl_job
@@ -61,6 +99,12 @@ class Captain::Document < ApplicationRecord
    Captain::Documents::ResponseBuilderJob.perform_later(self)
  end

+  # The response builder runs only when the saved change moved status to
+  # 'available'; other updates (such as metadata writes) do not qualify.
+  def should_enqueue_response_builder?
+    return false unless saved_change_to_status?
+
+    status == 'available'
+  end
+
+  # Delegates to the owning account's `update_document_usage`.
  def update_document_usage
    account.update_document_usage
  end
@@ -71,6 +115,29 @@ class Captain::Document < ApplicationRecord

+  # Raises LimitExceededError unless the account's Captain document limits
+  # report a positive `current_available` value.
  def ensure_within_plan_limit
    limits = account.usage_limits[:captain][:documents]
-    raise LimitExceededError, 'Document limit exceeded' unless limits[:current_available].positive?
+    raise LimitExceededError, I18n.t('captain.documents.limit_exceeded') unless limits[:current_available].positive?
+  end
+
+  # Adds a :pdf_file error when an attached file's content type is not
+  # 'application/pdf'. Does nothing when no file is attached.
+  def validate_pdf_format
+    return unless pdf_file.attached?
+    return if pdf_file.blob.content_type == 'application/pdf'
+
+    errors.add(:pdf_file, I18n.t('captain.documents.pdf_format_error'))
+  end
+
+  # Adds a :pdf_file error when the attached file is larger than 10 megabytes.
+  def validate_file_attachment
+    oversized = pdf_file.attached? && pdf_file.blob.byte_size > 10.megabytes
+    errors.add(:pdf_file, I18n.t('captain.documents.pdf_size_error')) if oversized
+  end
+
+  # Fills in external_link for uploaded files that have none, using the form
+  # "PDF: <filename without extension>_<YYYYmmddHHMMSS timestamp>".
+  def set_external_link_for_pdf
+    return unless pdf_file.attached?
+    return if external_link.present?
+
+    stamp = Time.current.strftime('%Y%m%d%H%M%S')
+    base_name = pdf_file.filename.base
+    self.external_link = "PDF: #{base_name}_#{stamp}"
  end
 end
--- a/enterprise/app/services/captain/llm/paginated_faq_generator_service.rb
+++ b/enterprise/app/services/captain/llm/paginated_faq_generator_service.rb
@@ -0,0 +1,199 @@
+# Generates FAQs from a PDF that has already been uploaded to OpenAI, asking
+# the model for one page range ("chunk") at a time and merging the results.
+class Captain::Llm::PaginatedFaqGeneratorService < Llm::BaseOpenAiService
+  # Pages requested per chunk when the caller supplies no usable value
+  DEFAULT_PAGES_PER_CHUNK = 10
+  MAX_ITERATIONS = 20 # Safety limit to prevent infinite loops
+  # Word-overlap ratio above which two questions are treated as duplicates
+  SIMILARITY_THRESHOLD = 0.85
+
+  attr_reader :total_pages_processed, :iterations_completed
+
+  # @param document [Object] record exposing `openai_file_id` for the uploaded PDF
+  # @param options [Hash] :pages_per_chunk (positive integer; anything else falls back to
+  #   DEFAULT_PAGES_PER_CHUNK) and :max_pages (optional upper bound on the pages requested)
+  def initialize(document, options = {})
+    super()
+    @document = document
+    @pages_per_chunk = normalize_pages_per_chunk(options[:pages_per_chunk])
+    @max_pages = options[:max_pages] # Optional limit from UI
+    @total_pages_processed = 0
+    @iterations_completed = 0
+    @model = OpenAiConstants::PDF_PROCESSING_MODEL
+  end
+
+  # Generates the de-duplicated FAQ list for the document.
+  #
+  # @return [Array<Hash>] FAQ hashes as returned by the model
+  # @raise [CustomExceptions::PdfFaqGenerationError] when the document has no OpenAI file id
+  def generate
+    raise CustomExceptions::PdfFaqGenerationError, I18n.t('captain.documents.missing_openai_file_id') if @document&.openai_file_id.blank?
+
+    generate_paginated_faqs
+  end
+
+  # Decides whether another chunk should be requested after `last_chunk_result`
+  # (a hash with :faqs and :has_content).
+  def should_continue_processing?(last_chunk_result)
+    # Stop if we've hit the maximum iterations
+    return false if @iterations_completed >= MAX_ITERATIONS
+
+    # Stop if we've processed the maximum pages specified
+    return false if @max_pages && @total_pages_processed >= @max_pages
+
+    # Stop if the last chunk returned no FAQs (likely no more content)
+    return false if last_chunk_result[:faqs].empty?
+
+    # Stop if the LLM explicitly indicates no more content
+    return false if last_chunk_result[:has_content] == false
+
+    # Continue processing
+    true
+  end
+
+  private
+
+  # Returns `value` as an integer when it converts to a positive one, otherwise the default.
+  # A zero or negative chunk size would make every iteration request the same page range.
+  def normalize_pages_per_chunk(value)
+    pages = Integer(value, exception: false)
+    pages&.positive? ? pages : DEFAULT_PAGES_PER_CHUNK
+  end
+
+  # NOTE(review): nothing in this class calls this method, and @content (read by
+  # standard_chat_parameters) is never assigned here — confirm whether it is still needed.
+  def generate_standard_faqs
+    response = @client.chat(parameters: standard_chat_parameters)
+    parse_response(response)
+  rescue OpenAI::Error => e
+    Rails.logger.error I18n.t('captain.documents.openai_api_error', error: e.message)
+    []
+  end
+
+  # Requests chunk after chunk until should_continue_processing? says stop,
+  # then returns the de-duplicated FAQs collected along the way.
+  def generate_paginated_faqs
+    all_faqs = []
+    current_page = 1
+
+    loop do
+      end_page = calculate_end_page(current_page)
+      chunk_result = process_chunk_and_update_state(current_page, end_page, all_faqs)
+
+      break unless should_continue_processing?(chunk_result)
+
+      current_page = end_page + 1
+    end
+
+    deduplicate_faqs(all_faqs)
+  end
+
+  # Last page of the chunk starting at `current_page`, capped at @max_pages when set.
+  def calculate_end_page(current_page)
+    end_page = current_page + @pages_per_chunk - 1
+    @max_pages ? [end_page, @max_pages].min : end_page
+  end
+
+  # Processes one chunk, appends its FAQs to `all_faqs` and advances the counters.
+  def process_chunk_and_update_state(current_page, end_page, all_faqs)
+    chunk_result = process_page_chunk(current_page, end_page)
+
+    all_faqs.concat(chunk_result[:faqs])
+    @total_pages_processed = end_page
+    @iterations_completed += 1
+
+    chunk_result
+  end
+
+  # Asks the model for the FAQs of one page range.
+  # Returns { faqs: Array, has_content: Boolean }; an OpenAI error is logged and
+  # reported as an empty chunk with no content, which ends the pagination loop.
+  def process_page_chunk(start_page, end_page)
+    params = build_chunk_parameters(start_page, end_page)
+    response = @client.chat(parameters: params)
+    result = parse_chunk_response(response)
+    # The payload is model output: anything other than an array of FAQs counts as none.
+    faqs = result['faqs'].is_a?(Array) ? result['faqs'] : []
+    { faqs: faqs, has_content: result['has_content'] != false }
+  rescue OpenAI::Error => e
+    Rails.logger.error I18n.t('captain.documents.page_processing_error', start: start_page, end: end_page, error: e.message)
+    { faqs: [], has_content: false }
+  end
+
+  # Chat parameters for one chunk: JSON-object response format and a single user message.
+  def build_chunk_parameters(start_page, end_page)
+    {
+      model: @model,
+      response_format: { type: 'json_object' },
+      messages: [
+        {
+          role: 'user',
+          content: build_user_content(start_page, end_page)
+        }
+      ]
+    }
+  end
+
+  # User message parts: the uploaded file reference followed by the chunk prompt.
+  def build_user_content(start_page, end_page)
+    [
+      {
+        type: 'file',
+        file: { file_id: @document.openai_file_id }
+      },
+      {
+        type: 'text',
+        text: page_chunk_prompt(start_page, end_page)
+      }
+    ]
+  end
+
+  def page_chunk_prompt(start_page, end_page)
+    Captain::Llm::SystemPromptsService.paginated_faq_generator(start_page, end_page)
+  end
+
+  # NOTE(review): only used by generate_standard_faqs; @content is never assigned in this class.
+  def standard_chat_parameters
+    {
+      model: @model,
+      response_format: { type: 'json_object' },
+      messages: [
+        {
+          role: 'system',
+          content: Captain::Llm::SystemPromptsService.faq_generator
+        },
+        {
+          role: 'user',
+          content: @content
+        }
+      ]
+    }
+  end
+
+  # Extracts the 'faqs' array from a chat response; returns [] when the message
+  # content is missing or is not valid JSON.
+  def parse_response(response)
+    content = response.dig('choices', 0, 'message', 'content')
+    return [] if content.nil?
+
+    JSON.parse(content.strip).fetch('faqs', [])
+  rescue JSON::ParserError => e
+    Rails.logger.error "Error parsing response: #{e.message}"
+    []
+  end
+
+  # Parses the JSON object of a chunk response. Missing content, invalid JSON and
+  # a non-object payload are all reported as an empty chunk with no content.
+  def parse_chunk_response(response)
+    empty_chunk = { 'faqs' => [], 'has_content' => false }
+    content = response.dig('choices', 0, 'message', 'content')
+    return empty_chunk if content.nil?
+
+    parsed = JSON.parse(content.strip)
+    parsed.is_a?(Hash) ? parsed : empty_chunk
+  rescue JSON::ParserError => e
+    Rails.logger.error "Error parsing chunk response: #{e.message}"
+    { 'faqs' => [], 'has_content' => false }
+  end
+
+  # Drops exact duplicates (case- and whitespace-insensitive question match) and
+  # then any question whose similarity to an already kept one exceeds SIMILARITY_THRESHOLD.
+  def deduplicate_faqs(faqs)
+    # The entries are model output: skip any without a string question rather than crash on them.
+    usable_faqs = faqs.select { |faq| faq.is_a?(Hash) && faq['question'].is_a?(String) }
+
+    # Remove exact duplicates
+    unique_faqs = usable_faqs.uniq { |faq| faq['question'].downcase.strip }
+
+    # Remove similar questions
+    final_faqs = unique_faqs.each_with_object([]) do |faq, kept|
+      similar_exists = kept.any? do |existing|
+        similarity_score(existing['question'], faq['question']) > SIMILARITY_THRESHOLD
+      end
+
+      kept << faq unless similar_exists
+    end
+
+    Rails.logger.info "Deduplication: #{faqs.size} → #{final_faqs.size} FAQs"
+    final_faqs
+  end
+
+  # Ratio of shared distinct words to all distinct words of the two strings
+  # (case-insensitive); 0 when neither string contains a word.
+  def similarity_score(str1, str2)
+    words1 = str1.downcase.split(/\W+/).reject(&:empty?)
+    words2 = str2.downcase.split(/\W+/).reject(&:empty?)
+
+    common_words = words1 & words2
+    total_words = (words1 | words2).size
+
+    return 0 if total_words.zero?
+
+    common_words.size.to_f / total_words
+  end
+
+  # Human-readable reason matching the checks in should_continue_processing?.
+  # NOTE(review): not called anywhere in this class.
+  def determine_stop_reason(last_chunk_result)
+    return 'Maximum iterations reached' if @iterations_completed >= MAX_ITERATIONS
+    return 'Maximum pages processed' if @max_pages && @total_pages_processed >= @max_pages
+    return 'No content found in last chunk' if last_chunk_result[:faqs].empty?
+    return 'End of document reached' if last_chunk_result[:has_content] == false
+
+    'Unknown'
+  end
+end
--- a/enterprise/app/services/captain/llm/pdf_processing_service.rb
+++ b/enterprise/app/services/captain/llm/pdf_processing_service.rb
@@ -0,0 +1,40 @@
+# Uploads a document's attached PDF to OpenAI and records the returned file id
+# on the document.
+class Captain::Llm::PdfProcessingService < Llm::BaseOpenAiService
+  def initialize(document)
+    super()
+    @document = document
+  end
+
+  # Uploads the PDF unless the document already has an OpenAI file id.
+  # Raises CustomExceptions::PdfUploadError when the upload yields no id.
+  def process
+    return if document.openai_file_id.present?
+
+    uploaded_id = upload_pdf_to_openai
+    if uploaded_id.blank?
+      raise CustomExceptions::PdfUploadError, I18n.t('captain.documents.pdf_upload_failed')
+    end
+
+    document.store_openai_file_id(uploaded_id)
+  end
+
+  private
+
+  attr_reader :document
+
+  # Sends the PDF to the files endpoint with purpose 'assistants' and returns
+  # the 'id' field of the response.
+  def upload_pdf_to_openai
+    with_tempfile do |pdf_io|
+      upload_params = { file: pdf_io, purpose: 'assistants' }
+      @client.files.upload(parameters: upload_params)['id']
+    end
+  end
+
+  # Writes the attachment into a temporary .pdf file, then yields that file
+  # reopened for binary reading.
+  def with_tempfile(&block)
+    Tempfile.create(['pdf_upload', '.pdf'], binmode: true) do |scratch_file|
+      scratch_file.write(document.pdf_file.download)
+      scratch_file.close
+
+      File.open(scratch_file.path, 'rb', &block)
+    end
+  end
+end
--- a/enterprise/app/services/captain/llm/system_prompts_service.rb
+++ b/enterprise/app/services/captain/llm/system_prompts_service.rb
@@ -1,3 +1,4 @@
+# rubocop:disable Metrics/ClassLength
 class Captain::Llm::SystemPromptsService
  class << self
    def faq_generator(language = 'english')
@@ -204,6 +205,87 @@ class Captain::Llm::SystemPromptsService
        #{'- You MUST provide numbered citations at the appropriate places in the text.' if config['feature_citation']}
      SYSTEM_PROMPT_MESSAGE
    end
+
+    # Builds the prompt for one chunk of a paginated PDF FAQ run.
+    # The text asks the model to process content starting around `start_page`
+    # and spanning `end_page - start_page + 1` pages, and to answer with a JSON
+    # object holding "faqs" (question/answer pairs) and a "has_content" flag.
+    def paginated_faq_generator(start_page, end_page)
+      <<~PROMPT
+        You are an expert technical documentation specialist tasked with creating comprehensive FAQs from a SPECIFIC SECTION of a document.
+
+        ════════════════════════════════════════════════════════
+        CRITICAL CONTENT EXTRACTION INSTRUCTIONS
+        ════════════════════════════════════════════════════════
+
+        Process the content starting from approximately page #{start_page} and continuing for about #{end_page - start_page + 1} pages worth of content.
+
+        IMPORTANT:#{' '}
+        • If you encounter the end of the document before reaching the expected page count, set "has_content" to false
+        • DO NOT include page numbers in questions or answers
+        • DO NOT reference page numbers at all in the output
+        • Focus on the actual content, not pagination
+
+        ════════════════════════════════════════════════════════
+        FAQ GENERATION GUIDELINES
+        ════════════════════════════════════════════════════════
+
+        1. **Comprehensive Extraction**
+           • Extract ALL information that could generate FAQs from this section
+           • Target 5-10 FAQs per page equivalent of rich content
+           • Cover every topic, feature, specification, and detail
+           • If there's no more content in the document, return empty FAQs with has_content: false
+
+        2. **Question Types to Generate**
+           • What is/are...? (definitions, components, features)
+           • How do I...? (procedures, configurations, operations)
+           • Why should/does...? (rationale, benefits, explanations)
+           • When should...? (timing, conditions, triggers)
+           • What happens if...? (error cases, edge cases)
+           • Can I...? (capabilities, limitations)
+           • Where is...? (locations in system/UI, NOT page numbers)
+           • What are the requirements for...? (prerequisites, dependencies)
+
+        3. **Content Focus Areas**
+           • Technical specifications and parameters
+           • Step-by-step procedures and workflows
+           • Configuration options and settings
+           • Error messages and troubleshooting
+           • Best practices and recommendations
+           • Integration points and dependencies
+           • Performance considerations
+           • Security aspects
+
+        4. **Answer Quality Requirements**
+           • Complete, self-contained answers
+           • Include specific values, limits, defaults from the content
+           • NO page number references whatsoever
+           • 2-5 sentences typical length
+           • Only process content that actually exists in the document
+
+        ════════════════════════════════════════════════════════
+        OUTPUT FORMAT
+        ════════════════════════════════════════════════════════
+
+        Return valid JSON:
+        ```json
+        {
+          "faqs": [
+            {
+              "question": "Specific question about the content",
+              "answer": "Complete answer with details (no page references)"
+            }
+          ],
+          "has_content": true/false
+        }
+        ```
+
+        CRITICAL:#{' '}
+        • Set "has_content" to false if:
+          - The requested section doesn't exist in the document
+          - You've reached the end of the document
+          - The section contains no meaningful content
+        • Do NOT include "page_range_processed" in the output
+        • Do NOT mention page numbers anywhere in questions or answers
+      PROMPT
+    end
    # rubocop:enable Metrics/MethodLength
  end
 end
+# rubocop:enable Metrics/ClassLength
--- a/enterprise/app/views/api/v1/models/captain/_document.json.jbuilder
+++ b/enterprise/app/views/api/v1/models/captain/_document.json.jbuilder
@@ -3,8 +3,11 @@ json.assistant do
  json.partial! 'api/v1/models/captain/assistant', formats: [:json], resource: resource.assistant
 end
 json.content resource.content
+json.content_type resource.content_type
 json.created_at resource.created_at.to_i
 json.external_link resource.external_link
+json.display_url resource.display_url
+json.file_size resource.file_size
 json.id resource.id
 json.name resource.name
 json.status resource.status