feat(ee): Add a service to fetch website content and prepare a persona of Captain Assistant (#12732)

This PR is the first of many to simplify the process of building an
assistant. The new flow will only require the user’s website. We’ll
automatically crawl it, identify the business name and what the business
does, and then generate a suggested assistant persona, complete with a
proposed name and description.

The service returns the following results for two sample websites.
Example: tooljet.com
<img width="795" height="217" alt="Screenshot 2025-10-25 at 2 55 04 PM"
src="https://github.com/user-attachments/assets/9cb3594a-9c9c-4970-a0a1-4c9c8869c193"
/>

Example: replit.com
<img width="797" height="176" alt="Screenshot 2025-10-25 at 2 56 42 PM"
src="https://github.com/user-attachments/assets/6a1b4266-aab6-455f-a5e3-696d3a8243c9"
/>
This commit is contained in:
Pranav
2025-10-25 15:50:50 -07:00
committed by GitHub
parent b9864fe1f6
commit 5891fd6f49
5 changed files with 310 additions and 0 deletions

View File

@@ -0,0 +1,129 @@
# Analyzes a business website and suggests a Captain assistant persona.
#
# The page is crawled with Captain::Tools::SimplePageCrawlService; its title,
# meta description and body text are condensed into a prompt, and the LLM is
# asked for the business name, a suggested assistant name and a description.
#
# #analyze returns a Hash shaped like:
#   { success: Boolean,
#     data: { business_name:, suggested_assistant_name:, description:,
#             website_url:, favicon_url: } }
# with an additional :error message when success is false.
class Captain::Onboarding::WebsiteAnalyzerService < Llm::BaseOpenAiService
  # Upper bound, in characters, on the website content embedded in the prompt.
  MAX_CONTENT_LENGTH = 8000

  # @param website_url [String, nil] site to analyze; a missing http(s)
  #   scheme is filled in with https://
  def initialize(website_url)
    super()
    @website_url = normalize_url(website_url)
    @website_content = nil
    @favicon_url = nil
  end

  # Crawls the website and extracts the business information through the LLM.
  #
  # @return [Hash] success payload from the LLM, or an error payload when the
  #   crawl yields nothing or any StandardError is raised along the way
  def analyze
    fetch_website_content
    return error_response('Failed to fetch website content') unless @website_content

    extract_business_info
  rescue StandardError => e
    Rails.logger.error "[Captain Onboarding] Website analysis error: #{e.message}"
    error_response(e.message)
  end

  private

  # Ensures the URL carries an http(s) scheme.
  #
  # Surrounding whitespace is removed and the scheme check is
  # case-insensitive, so " example.com" and "HTTP://example.com" are not
  # mangled into invalid URLs. nil is treated as an empty string instead of
  # raising from the constructor.
  #
  # @param url [String, nil]
  # @return [String]
  def normalize_url(url)
    candidate = url.to_s.strip
    return candidate if candidate.match?(%r{\Ahttps?://}i)

    "https://#{candidate}"
  end

  # Crawls the website and stores the condensed content and favicon URL in
  # @website_content and @favicon_url.
  #
  # @return [Boolean] true when content was stored; false when the page had
  #   no title, description or body text, or when crawling raised
  def fetch_website_content
    crawler = Captain::Tools::SimplePageCrawlService.new(@website_url)
    text_content = crawler.body_text_content
    page_title = crawler.page_title
    meta_description = crawler.meta_description

    if [page_title, meta_description, text_content].all?(&:blank?)
      Rails.logger.error "[Captain Onboarding] Failed to fetch #{@website_url}: No content found"
      return false
    end

    # Only sections that actually carry content are added to the prompt text.
    sections = []
    sections << "Title: #{page_title}" if page_title.present?
    sections << "Description: #{meta_description}" if meta_description.present?
    sections << text_content if text_content.present?

    @website_content = clean_and_truncate_content(sections.join("\n\n"))
    @favicon_url = crawler.favicon_url
    true
  rescue StandardError => e
    Rails.logger.error "[Captain Onboarding] Failed to fetch #{@website_url}: #{e.message}"
    false
  end

  # Collapses every whitespace run into a single space and caps the result at
  # MAX_CONTENT_LENGTH characters.
  #
  # @param content [String]
  # @return [String]
  def clean_and_truncate_content(content)
    content.gsub(/\s+/, ' ').strip[0, MAX_CONTENT_LENGTH]
  end

  # Sends the analysis prompt to the chat API and parses the JSON answer.
  #
  # @return [Hash] see #parse_llm_response
  def extract_business_info
    response = client.chat(
      parameters: {
        model: model,
        messages: [{ role: 'user', content: build_analysis_prompt }],
        response_format: { type: 'json_object' },
        temperature: 0.1,
        max_tokens: 1000
      }
    )

    parse_llm_response(response.dig('choices', 0, 'message', 'content'))
  end

  # @return [String] the prompt embedding the crawled website content
  def build_analysis_prompt
    <<~PROMPT
      Analyze the following website content and extract business information. Return a JSON response with the following structure:
      {
      "business_name": "The company or business name",
      "suggested_assistant_name": "A friendly assistant name (e.g., 'Captain Assistant', 'Support Genie', etc.)",
      "description": "Persona of the assistant based on the business type"
      }
      Guidelines:
      - business_name: Extract the actual company/brand name from the content
      - suggested_assistant_name: Create a friendly, professional name that customers would want to interact with
      - description: Provide context about the business and what the assistant can help with. Keep it general and adaptable rather than overly specific. For example: "You specialize in helping customers with their orders and product questions" or "You assist customers with their account needs and general inquiries"
      Website content:
      #{@website_content}
      Return only valid JSON, no additional text.
    PROMPT
  end

  # Turns the raw LLM answer into the success payload.
  #
  # A nil answer (JSON.parse raises TypeError) and a JSON value that is not an
  # object are handled like malformed JSON, so callers get the friendly
  # parse-failure message instead of a raw Ruby error string.
  #
  # @param response_text [String, nil] message content returned by the LLM
  # @return [Hash] success payload, or an error payload when parsing fails
  def parse_llm_response(response_text)
    parsed_response = JSON.parse(response_text)
    raise JSON::ParserError, 'expected a JSON object' unless parsed_response.is_a?(Hash)

    {
      success: true,
      data: {
        business_name: parsed_response['business_name'],
        suggested_assistant_name: parsed_response['suggested_assistant_name'],
        description: parsed_response['description'],
        website_url: @website_url,
        favicon_url: @favicon_url
      }
    }
  rescue JSON::ParserError, TypeError => e
    Rails.logger.error "[Captain Onboarding] JSON parsing error: #{e.message}"
    Rails.logger.error "[Captain Onboarding] Raw response: #{response_text}"
    error_response('Failed to parse business information from website')
  end

  # Builds the failure payload; it mirrors the success shape with blank
  # fields so consumers can read the same keys in both cases.
  #
  # @param message [String] human-readable failure reason
  # @return [Hash]
  def error_response(message)
    {
      success: false,
      error: message,
      data: {
        business_name: '',
        suggested_assistant_name: '',
        description: '',
        website_url: @website_url,
        favicon_url: nil
      }
    }
  end
end

View File

@@ -19,6 +19,20 @@ class Captain::Tools::SimplePageCrawlService
ReverseMarkdown.convert @doc.at_xpath('//body'), unknown_tags: :bypass, github_flavored: true
end
# Returns the content of the page's <meta name="description"> tag with
# surrounding whitespace removed, or nil when the tag or its content
# attribute is absent.
def meta_description
  description = @doc.at_css('meta[name="description"]')&.[]('content')
  description&.strip
end
# Looks up the first <link> element whose rel attribute contains "icon" and
# passes its href through resolve_url. Returns nil when no such element
# exists or it has no href.
def favicon_url
  href = @doc.at_css('link[rel*="icon"]')&.[]('href')
  href && resolve_url(href)
end
private
def sitemap?
@@ -35,4 +49,12 @@ class Captain::Tools::SimplePageCrawlService
absolute_url
end
end
# Resolves an href found on the page into an absolute URL.
#
# Only values that really start with an http:// or https:// scheme are
# treated as absolute; a bare "http" prefix is not enough, because relative
# paths such as "http-assets/favicon.ico" would otherwise be returned
# unresolved. Everything else is joined onto @external_link.
#
# @param url [String] href value taken from the document
# @return [String] the absolute URL, or the input unchanged when it cannot be
#   resolved (best effort: resolution errors are swallowed on purpose)
def resolve_url(url)
  # Surrounding whitespace in an href is not part of the URL and would make
  # URI.join raise.
  candidate = url.strip
  return candidate if candidate.match?(%r{\Ahttps?://}i)

  URI.join(@external_link, candidate).to_s
rescue StandardError
  url
end
end

View File

@@ -1,5 +1,6 @@
class Llm::BaseOpenAiService
DEFAULT_MODEL = 'gpt-4o-mini'.freeze
attr_reader :client, :model
def initialize
@client = OpenAI::Client.new(