feat(ee): Add a service to fetch website content and prepare a persona of Captain Assistant (#12732)
This PR is the first of many to simplify the process of building an assistant. The new flow will only require the user’s website. We’ll automatically crawl it, identify the business name and what the business does, and then generate a suggested assistant persona, complete with a proposed name and description. This service returns the following. Example: tooljet.com <img width="795" height="217" alt="Screenshot 2025-10-25 at 2 55 04 PM" src="https://github.com/user-attachments/assets/9cb3594a-9c9c-4970-a0a1-4c9c8869c193" /> Example: replit.com <img width="797" height="176" alt="Screenshot 2025-10-25 at 2 56 42 PM" src="https://github.com/user-attachments/assets/6a1b4266-aab6-455f-a5e3-696d3a8243c9" />
This commit is contained in:
@@ -0,0 +1,129 @@
|
||||
# Analyzes a business website and proposes a Captain assistant persona for it.
#
# The service crawls the given URL with Captain::Tools::SimplePageCrawlService,
# condenses the page title, meta description and body text into a prompt, and
# asks the chat model for a business name, a suggested assistant name and a
# persona description.
#
# #analyze always returns a Hash shaped like:
#   {
#     success: true | false,
#     error:   String,            # only present when success is false
#     data: {
#       business_name:, suggested_assistant_name:, description:,
#       website_url:, favicon_url:
#     }
#   }
class Captain::Onboarding::WebsiteAnalyzerService < Llm::BaseOpenAiService
  # Maximum number of characters of website text embedded in the prompt.
  MAX_CONTENT_LENGTH = 8000

  # @param website_url [String, nil] site to analyze; a missing scheme defaults to https
  def initialize(website_url)
    super()
    @website_url = normalize_url(website_url)
    @website_content = nil
    @favicon_url = nil
  end

  # Crawls the website and asks the model to describe the business.
  #
  # Never raises: any StandardError is logged and converted into an error response.
  #
  # @return [Hash] success or error response (see class documentation)
  def analyze
    fetch_website_content
    return error_response('Failed to fetch website content') unless @website_content

    extract_business_info
  rescue StandardError => e
    Rails.logger.error "[Captain Onboarding] Website analysis error: #{e.message}"
    error_response(e.message)
  end

  private

  # Ensures the URL carries an http(s) scheme.
  #
  # Tolerates nil and surrounding whitespace, and recognises the scheme
  # case-insensitively so "HTTP://example.com" is not prefixed a second time.
  #
  # @param url [String, nil]
  # @return [String]
  def normalize_url(url)
    candidate = url.to_s.strip
    return candidate if candidate.match?(%r{\Ahttps?://}i)

    "https://#{candidate}"
  end

  # Crawls the page and stores the condensed text in @website_content and the
  # favicon in @favicon_url.
  #
  # @return [Boolean] false when nothing usable was found or the crawl raised
  def fetch_website_content
    crawler = Captain::Tools::SimplePageCrawlService.new(@website_url)

    text_content = crawler.body_text_content
    page_title = crawler.page_title
    meta_description = crawler.meta_description

    if page_title.blank? && meta_description.blank? && text_content.blank?
      Rails.logger.error "[Captain Onboarding] Failed to fetch #{@website_url}: No content found"
      return false
    end

    combined_content = []
    combined_content << "Title: #{page_title}" if page_title.present?
    combined_content << "Description: #{meta_description}" if meta_description.present?
    combined_content << text_content

    # #analyze decides whether to continue based on @website_content being set.
    @website_content = clean_and_truncate_content(combined_content.join("\n\n"))
    @favicon_url = crawler.favicon_url
    true
  rescue StandardError => e
    Rails.logger.error "[Captain Onboarding] Failed to fetch #{@website_url}: #{e.message}"
    false
  end

  # Collapses every whitespace run into a single space and caps the result at
  # MAX_CONTENT_LENGTH characters.
  #
  # @param content [String]
  # @return [String]
  def clean_and_truncate_content(content)
    content.gsub(/\s+/, ' ').strip[0...MAX_CONTENT_LENGTH]
  end

  # Sends the analysis prompt to the chat model, requesting a JSON object, and
  # parses the first choice's message content.
  #
  # @return [Hash] success or error response
  def extract_business_info
    prompt = build_analysis_prompt

    response = client.chat(
      parameters: {
        model: model,
        messages: [{ role: 'user', content: prompt }],
        response_format: { type: 'json_object' },
        temperature: 0.1,
        max_tokens: 1000
      }
    )

    parse_llm_response(response.dig('choices', 0, 'message', 'content'))
  end

  # Builds the prompt that embeds the crawled website content.
  #
  # @return [String]
  def build_analysis_prompt
    <<~PROMPT
      Analyze the following website content and extract business information. Return a JSON response with the following structure:

      {
      "business_name": "The company or business name",
      "suggested_assistant_name": "A friendly assistant name (e.g., 'Captain Assistant', 'Support Genie', etc.)",
      "description": "Persona of the assistant based on the business type"
      }

      Guidelines:
      - business_name: Extract the actual company/brand name from the content
      - suggested_assistant_name: Create a friendly, professional name that customers would want to interact with
      - description: Provide context about the business and what the assistant can help with. Keep it general and adaptable rather than overly specific. For example: "You specialize in helping customers with their orders and product questions" or "You assist customers with their account needs and general inquiries"

      Website content:
      #{@website_content}

      Return only valid JSON, no additional text.
    PROMPT
  end

  # Converts the model's JSON text into the success response.
  #
  # A nil response (JSON.parse raises TypeError), malformed JSON, or JSON that
  # is not an object all produce the same parse-failure error response instead
  # of leaking an internal exception message to the caller.
  #
  # @param response_text [String, nil] raw message content returned by the model
  # @return [Hash] success or error response
  def parse_llm_response(response_text)
    parsed_response = JSON.parse(response_text)
    return unparseable_response(response_text, 'expected a JSON object') unless parsed_response.is_a?(Hash)

    {
      success: true,
      data: {
        business_name: parsed_response['business_name'],
        suggested_assistant_name: parsed_response['suggested_assistant_name'],
        description: parsed_response['description'],
        website_url: @website_url,
        favicon_url: @favicon_url
      }
    }
  rescue JSON::ParserError, TypeError => e
    unparseable_response(response_text, e.message)
  end

  # Logs an unusable model response and returns the parse-failure error response.
  def unparseable_response(response_text, reason)
    Rails.logger.error "[Captain Onboarding] JSON parsing error: #{reason}"
    Rails.logger.error "[Captain Onboarding] Raw response: #{response_text}"
    error_response('Failed to parse business information from website')
  end

  # Builds the failure payload; data keeps the same keys as a success response
  # with empty values.
  #
  # @param message [String] reason reported under :error
  # @return [Hash]
  def error_response(message)
    {
      success: false,
      error: message,
      data: {
        business_name: '',
        suggested_assistant_name: '',
        description: '',
        website_url: @website_url,
        favicon_url: nil
      }
    }
  end
end
|
||||
@@ -19,6 +19,20 @@ class Captain::Tools::SimplePageCrawlService
|
||||
ReverseMarkdown.convert @doc.at_xpath('//body'), unknown_tags: :bypass, github_flavored: true
|
||||
end
|
||||
|
||||
# Returns the page's <meta name="description"> content with surrounding
# whitespace removed.
#
# @return [String, nil] nil when the tag is absent, has no content attribute,
#   or the content is empty/whitespace-only
def meta_description
  meta_desc = @doc.at_css('meta[name="description"]')
  content = meta_desc && meta_desc['content']
  return nil unless content

  stripped = content.strip
  # An empty description carries no information; report it like a missing tag.
  stripped.empty? ? nil : stripped
end
|
||||
|
||||
# Returns the URL of the first <link> whose rel attribute contains "icon",
# resolved through resolve_url.
#
# @return [String, nil] nil when no such link exists or its href is blank
def favicon_url
  favicon_link = @doc.at_css('link[rel*="icon"]')
  href = favicon_link && favicon_link['href']
  href = href.to_s.strip
  # A blank href would otherwise resolve to the page URL itself.
  return nil if href.empty?

  resolve_url(href)
end
|
||||
|
||||
private
|
||||
|
||||
def sitemap?
|
||||
@@ -35,4 +49,12 @@ class Captain::Tools::SimplePageCrawlService
|
||||
absolute_url
|
||||
end
|
||||
end
|
||||
|
||||
# Turns a possibly-relative URL into an absolute one using @external_link as
# the base.
#
# Only URLs that actually begin with an http:// or https:// scheme (any case)
# are treated as already absolute; a relative path that merely starts with the
# letters "http" (e.g. "http-icons/favicon.ico") is joined onto the base.
#
# @param url [String]
# @return [String] the absolute URL, or the input unchanged when it cannot be resolved
def resolve_url(url)
  return url if url.match?(%r{\Ahttps?://}i)

  URI.join(@external_link, url).to_s
rescue StandardError
  # Best effort: hand back the original value rather than fail the crawl.
  url
end
|
||||
end
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
class Llm::BaseOpenAiService
|
||||
DEFAULT_MODEL = 'gpt-4o-mini'.freeze
|
||||
attr_reader :client, :model
|
||||
|
||||
def initialize
|
||||
@client = OpenAI::Client.new(
|
||||
|
||||
Reference in New Issue
Block a user