fix: Add authentication to Firecrawl API, remove unused RobinAI references (#10737)

- Fixed Firecrawl webhook payloads to ensure proper data handling and
delivery.
- Removed unused Robin AI code to improve codebase cleanliness and
maintainability.
- Implemented authentication for the Firecrawl endpoint to improve
security. A key is generated to secure the webhook URLs from Firecrawl.

---------

Co-authored-by: Pranav <pranavrajs@gmail.com>
This commit is contained in:
Sojan Jose
2025-01-23 07:44:25 +05:30
committed by GitHub
parent 3b366f43e6
commit be8205657e
28 changed files with 345 additions and 470 deletions

View File

@@ -10,7 +10,7 @@ module Enterprise::SuperAdmin::AppConfigsController
when 'internal'
@allowed_configs = internal_config_options
when 'captain'
@allowed_configs = %w[CAPTAIN_OPEN_AI_API_KEY CAPTAIN_OPEN_AI_MODEL]
@allowed_configs = %w[CAPTAIN_OPEN_AI_API_KEY CAPTAIN_OPEN_AI_MODEL CAPTAIN_FIRECRAWL_API_KEY]
else
super
end

View File

@@ -1,16 +1,32 @@
class Enterprise::Webhooks::FirecrawlController < ActionController::API
before_action :validate_token
# Handles an incoming Firecrawl webhook: enqueues Captain::Tools::FirecrawlParserJob
# when the event type is `crawl.page`, then responds with 200 OK.
# NOTE(review): two enqueue calls are visible below — a multi-line one passing the raw
# `assistant_id`/`data` params and a one-line one passing the looked-up assistant id and
# `payload`. Presumably only one is meant to be active; confirm against the real file.
def process_payload
if crawl_page_event?
Captain::Tools::FirecrawlParserJob.perform_later(
assistant_id: permitted_params[:assistant_id],
payload: permitted_params[:data]
)
end
Captain::Tools::FirecrawlParserJob.perform_later(assistant_id: assistant.id, payload: payload) if crawl_page_event?
head :ok
end
private
include Captain::FirecrawlHelper
# First entry of the permitted `data` array converted with `to_h`;
# nil when `data` is absent or has no entries.
def payload
  first_entry = permitted_params[:data]&.first
  first_entry&.to_h
end
# before_action guard: renders 401 unless the `token` param matches the token
# derived for the requested assistant.
#
# An absent expected token is always treated as a failure. `assistant_token` can be
# nil (generate_firecrawl_token returns nil when no Firecrawl API key is configured),
# and with a plain `!=` check a request that simply omitted `token` would then pass
# because nil == nil. The comparison is constant-time so the expected token cannot
# be probed through response timing.
def validate_token
  expected = assistant_token.to_s
  provided = permitted_params[:token].to_s
  return if !expected.empty? && ActiveSupport::SecurityUtils.secure_compare(expected, provided)

  render json: { error: 'Invalid access_token' }, status: :unauthorized
end
# Memoized lookup of the Captain::Assistant identified by the `assistant_id` param.
def assistant
  return @assistant if @assistant

  @assistant = Captain::Assistant.find(permitted_params[:assistant_id])
end
# Token this request is expected to carry, derived from the assistant's id and account id.
def assistant_token
  record = assistant
  generate_firecrawl_token(record.id, record.account_id)
end
# True only when the `type` param is exactly 'crawl.page'.
def crawl_page_event?
  event_type = permitted_params[:type]
  event_type == 'crawl.page'
end
@@ -19,12 +35,13 @@ class Enterprise::Webhooks::FirecrawlController < ActionController::API
params.permit(
:type,
:assistant_id,
:token,
:success,
:id,
:metadata,
:format,
:firecrawl,
{ data: {} }
data: [:markdown, { metadata: {} }]
)
end
end

View File

@@ -1,89 +0,0 @@
require 'administrate/base_dashboard'
# Administrate dashboard definition for Response records.
class ResponseDashboard < Administrate::BaseDashboard
  # Attribute name => Administrate::Field type used to render it on every page.
  ATTRIBUTE_TYPES = {
    id: Field::Number.with_options(searchable: true),
    account: Field::BelongsToSearch.with_options(
      class_name: 'Account',
      searchable_field: %i[name id],
      order: 'id DESC'
    ),
    response_source: Field::BelongsToSearch.with_options(
      class_name: 'ResponseSource',
      searchable_field: %i[name id source_link],
      order: 'id DESC'
    ),
    answer: Field::Text.with_options(searchable: true),
    question: Field::String.with_options(searchable: true),
    status: Field::Select.with_options(
      searchable: false,
      # Choices are the keys returned by the class-level method named after the
      # pluralized attribute (presumably the enum mapping — confirm on the model).
      collection: ->(field) { field.resource.class.send(field.attribute.to_s.pluralize).keys }
    ),
    response_document: Field::BelongsToSearch.with_options(
      class_name: 'ResponseDocument',
      searchable_field: %i[document_link content id],
      order: 'id DESC'
    ),
    created_at: Field::DateTime,
    updated_at: Field::DateTime
  }.freeze

  # Columns shown on the index page.
  COLLECTION_ATTRIBUTES = %i[id question answer status response_document response_source account].freeze

  # Attributes shown on the show page.
  SHOW_PAGE_ATTRIBUTES = %i[
    id status question answer response_document response_source account created_at updated_at
  ].freeze

  # Attributes editable on the `new` and `edit` forms.
  FORM_ATTRIBUTES = %i[response_source response_document question answer status].freeze

  # Search-field filters, e.g. typing "status:<value>" narrows the list by status.
  COLLECTION_FILTERS = {
    account: ->(scope, value) { scope.where(account_id: value) },
    response_source: ->(scope, value) { scope.where(response_source_id: value) },
    response_document: ->(scope, value) { scope.where(response_document_id: value) },
    status: ->(scope, value) { scope.where(status: value) }
  }.freeze

  # Label used for a response across the admin dashboard.
  def display_resource(response)
    "Response: ##{response.id} - #{response.question}"
  end
end

View File

@@ -1,84 +0,0 @@
require 'administrate/base_dashboard'
# Administrate dashboard definition for ResponseDocument records.
class ResponseDocumentDashboard < Administrate::BaseDashboard
  # Attribute name => Administrate::Field type used to render it on every page.
  ATTRIBUTE_TYPES = {
    id: Field::Number.with_options(searchable: true),
    account: Field::BelongsToSearch.with_options(
      class_name: 'Account',
      searchable_field: %i[name id],
      order: 'id DESC'
    ),
    content: Field::Text.with_options(searchable: true),
    document_id: Field::Number,
    document_link: Field::String.with_options(searchable: true),
    document_type: Field::String,
    response_source: Field::BelongsToSearch.with_options(
      class_name: 'ResponseSource',
      searchable_field: %i[name id source_link],
      order: 'id DESC'
    ),
    responses: Field::HasMany,
    created_at: Field::DateTime,
    updated_at: Field::DateTime
  }.freeze

  # Columns shown on the index page.
  COLLECTION_ATTRIBUTES = %i[id account response_source document_link].freeze

  # Attributes shown on the show page.
  SHOW_PAGE_ATTRIBUTES = %i[
    id account response_source document_link document_id document_type content
    created_at updated_at responses
  ].freeze

  # Attributes editable on the `new` and `edit` forms.
  FORM_ATTRIBUTES = %i[account response_source document_link document_id document_type content].freeze

  # Search-field filters, e.g. typing "account:<id>" narrows the list by account.
  COLLECTION_FILTERS = {
    account: ->(scope, value) { scope.where(account_id: value) },
    response_source: ->(scope, value) { scope.where(response_source_id: value) }
  }.freeze

  # Label used for a response document across the admin dashboard.
  def display_resource(response_document)
    "Document: ##{response_document.id} - #{response_document.document_link}"
  end
end

View File

@@ -1,86 +0,0 @@
require 'administrate/base_dashboard'
# Administrate dashboard definition for ResponseSource records.
class ResponseSourceDashboard < Administrate::BaseDashboard
  # Attribute name => Administrate::Field type used to render it on every page.
  ATTRIBUTE_TYPES = {
    id: Field::Number.with_options(searchable: true),
    account: Field::BelongsToSearch.with_options(
      class_name: 'Account',
      searchable_field: %i[name id],
      order: 'id DESC'
    ),
    name: Field::String.with_options(searchable: true),
    response_documents: Field::HasMany,
    responses: Field::HasMany,
    source_link: Field::String.with_options(searchable: true),
    source_model_id: Field::Number,
    source_model_type: Field::String,
    source_type: Field::Select.with_options(
      searchable: false,
      # Choices are the keys returned by the class-level method named after the
      # pluralized attribute (presumably the enum mapping — confirm on the model).
      collection: ->(field) { field.resource.class.send(field.attribute.to_s.pluralize).keys }
    ),
    created_at: Field::DateTime,
    updated_at: Field::DateTime
  }.freeze

  # Columns shown on the index page.
  COLLECTION_ATTRIBUTES = %i[id name account source_link].freeze

  # Attributes shown on the show page.
  SHOW_PAGE_ATTRIBUTES = %i[
    id name account source_link source_model_id source_model_type source_type
    created_at updated_at response_documents responses
  ].freeze

  # Attributes editable on the `new` and `edit` forms.
  FORM_ATTRIBUTES = %i[account name source_link source_model_id source_model_type source_type].freeze

  # Search-field filters, e.g. typing "account:<id>" narrows the list by account.
  COLLECTION_FILTERS = {
    account: ->(scope, value) { scope.where(account_id: value) }
  }.freeze

  # Label used for a response source across the admin dashboard.
  def display_resource(response_source)
    "Source: ##{response_source.id} - #{response_source.name}"
  end
end

View File

@@ -0,0 +1,9 @@
# Helper for deriving the token that authenticates Firecrawl webhook callbacks.
module Captain::FirecrawlHelper
  # Derives a deterministic token for an assistant/account pair from the configured
  # CAPTAIN_FIRECRAWL_API_KEY installation config.
  #
  # @param assistant_id id interpolated into the token base
  # @param account_id id interpolated into the token base
  # @return [String, nil] SHA-256 hex digest, or nil when the API key is missing or blank
  def generate_firecrawl_token(assistant_id, account_id)
    api_key = InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value
    # A blank key must not yield a token: it would be derived from the ids alone.
    return nil if api_key.blank?

    # String#[] returns nil for a key shorter than four characters; fall back to the
    # whole key so the secret part is never silently dropped from the token base.
    key_suffix = api_key[-4..] || api_key
    # NOTE(review): the ids are concatenated without a separator, so (1, 23) and (12, 3)
    # share a token base, and only the key's last four characters act as the secret.
    # Changing either would alter every issued token, so the format is kept as is.
    Digest::SHA256.hexdigest("#{key_suffix}#{assistant_id}#{account_id}")
  end
end

View File

@@ -11,6 +11,8 @@ class Captain::Documents::CrawlJob < ApplicationJob
private
include Captain::FirecrawlHelper
def perform_simple_crawl(document)
page_links = Captain::Tools::SimplePageCrawlService.new(document.external_link).page_links
@@ -28,13 +30,22 @@ class Captain::Documents::CrawlJob < ApplicationJob
end
# Starts a Firecrawl crawl of the document's external link via Captain::Tools::FirecrawlService,
# passing a webhook URL and a crawl limit.
# The limit is the `available` value under the account's captain/documents usage limits
# (10 when unset), capped at 500.
# NOTE(review): both an inline `webhook_url` string and `firecrawl_webhook_url(document)` appear
# as the webhook argument below; presumably only one belongs in the real file — confirm.
def perform_firecrawl_crawl(document)
webhook_url = Rails.application.routes.url_helpers.enterprise_webhooks_firecrawl_url
captain_usage_limits = document.account.usage_limits[:captain] || {}
document_limit = captain_usage_limits[:documents] || {}
crawl_limit = [document_limit[:available] || 10, 500].min
Captain::Tools::FirecrawlService
.new
.perform(
document.external_link,
"#{webhook_url}?assistant_id=#{document.assistant_id}"
firecrawl_webhook_url(document),
crawl_limit
)
end
# Builds the webhook URL handed to Firecrawl: the enterprise Firecrawl webhook route
# with the document's assistant id and its derived token as query parameters.
def firecrawl_webhook_url(document)
  base_url = Rails.application.routes.url_helpers.enterprise_webhooks_firecrawl_url
  token = generate_firecrawl_token(document.assistant_id, document.account_id)
  "#{base_url}?assistant_id=#{document.assistant_id}&token=#{token}"
end
end

View File

@@ -6,12 +6,12 @@ class Captain::Tools::FirecrawlParserJob < ApplicationJob
metadata = payload[:metadata]
document = assistant.documents.find_or_initialize_by(
external_link: metadata[:ogUrl]
external_link: metadata['url']
)
document.update!(
content: payload[:markdown],
name: metadata[:ogTitle],
name: metadata['title'],
status: :available
)
rescue StandardError => e

View File

@@ -29,6 +29,7 @@ class Captain::Document < ApplicationRecord
validates :external_link, presence: true
validates :external_link, uniqueness: { scope: :assistant_id }
validates :content, length: { maximum: 200_000 }
before_validation :ensure_account_id
enum status: {

View File

@@ -4,10 +4,10 @@ class Captain::Tools::FirecrawlService
raise 'Missing API key' if @api_key.nil?
end
def perform(url, webhook_url = '')
def perform(url, webhook_url, crawl_limit = 10)
HTTParty.post(
'https://api.firecrawl.dev/v1/crawl',
body: crawl_payload(url, webhook_url),
body: crawl_payload(url, webhook_url, crawl_limit),
headers: headers
)
rescue StandardError => e
@@ -16,12 +16,12 @@ class Captain::Tools::FirecrawlService
private
def crawl_payload(url, webhook_url)
def crawl_payload(url, webhook_url, crawl_limit)
{
url: url,
maxDepth: 50,
ignoreSitemap: false,
limit: 10,
limit: crawl_limit,
webhook: webhook_url,
scrapeOptions: {
onlyMainContent: false,

View File

@@ -1,5 +0,0 @@
<% content_for :title, "Robin AI playground: #{@response_source.name}" %>
<%= render_vue_component('PlaygroundIndex', {
responseSourceName: @response_source.name,
responseSourcePath: super_admin_response_source_path(@response_source)
}) %>

View File

@@ -1,71 +0,0 @@
<%#
# Show
This view is the template for the show page.
It renders the attributes of a resource,
as well as a link to its edit page.
## Local variables:
- `page`:
An instance of [Administrate::Page::Show][1].
Contains methods for accessing the resource to be displayed on the page,
as well as helpers for describing how each attribute of the resource
should be displayed.
[1]: http://www.rubydoc.info/gems/administrate/Administrate/Page/Show
%>
<% content_for(:title) { t("administrate.actions.show_resource", name: page.page_title) } %>
<header class="main-content__header">
<h1 class="main-content__page-title">
<%= content_for(:title) %>
</h1>
<div>
<%= link_to(
"Chat",
[:chat, namespace, page.resource],
class: "button"
) %>
<%= link_to(
"Edit",
[:edit, namespace, page.resource],
class: "button",
) if accessible_action?(page.resource, :edit) %>
<%= link_to(
t("administrate.actions.destroy"),
[namespace, page.resource],
class: "button button--danger",
method: :delete,
data: { confirm: t("administrate.actions.confirm") }
) if accessible_action?(page.resource, :destroy) %>
</div>
</header>
<section class="main-content__body">
<dl>
<% page.attributes.each do |title, attributes| %>
<fieldset class="<%= "field-unit--nested" if title.present? %>">
<% if title.present? %>
<legend><%= t "helpers.label.#{page.resource_name}.#{title}", default: title %></legend>
<% end %>
<% attributes.each do |attribute| %>
<dt class="attribute-label" id="<%= attribute.name %>">
<%= t(
"helpers.label.#{resource_name}.#{attribute.name}",
default: page.resource.class.human_attribute_name(attribute.name),
) %>
</dt>
<dd class="attribute-data attribute-data--<%=attribute.html_class%>"
><%= render_field attribute, page: page %></dd>
<% end %>
</fieldset>
<% end %>
</dl>
</section>