feat(ee): Add a service to fetch website content and prepare a persona of Captain Assistant (#12732)

This PR is the first of many to simplify the process of building an
assistant. The new flow will only require the user’s website. We’ll
automatically crawl it, identify the business name and what the business
does, and then generate a suggested assistant persona, complete with a
proposed name and description.

This service returns the following.
Example: tooljet.com
<img width="795" height="217" alt="Screenshot 2025-10-25 at 2 55 04 PM"
src="https://github.com/user-attachments/assets/9cb3594a-9c9c-4970-a0a1-4c9c8869c193"
/>

Example: replit.com
<img width="797" height="176" alt="Screenshot 2025-10-25 at 2 56 42 PM"
src="https://github.com/user-attachments/assets/6a1b4266-aab6-455f-a5e3-696d3a8243c9"
/>
This commit is contained in:
Pranav
2025-10-25 15:50:50 -07:00
committed by GitHub
parent b9864fe1f6
commit 5891fd6f49
5 changed files with 310 additions and 0 deletions

View File

@@ -0,0 +1,129 @@
class Captain::Onboarding::WebsiteAnalyzerService < Llm::BaseOpenAiService
MAX_CONTENT_LENGTH = 8000
# Prepares an analyzer for a single website.
#
# @param website_url [String] address of the site to analyze; it is passed
#   through normalize_url before being stored
def initialize(website_url)
  super()
  # Both values are filled in later by fetch_website_content.
  @website_content = @favicon_url = nil
  @website_url = normalize_url(website_url)
end
# Crawls the website and asks the LLM for a suggested assistant persona.
#
# @return [Hash] the payload produced by extract_business_info, or the
#   error_response payload when no content could be fetched or any
#   StandardError is raised along the way
def analyze
  fetch_website_content
  return extract_business_info if @website_content

  error_response('Failed to fetch website content')
rescue StandardError => e
  Rails.logger.error "[Captain Onboarding] Website analysis error: #{e.message}"
  error_response(e.message)
end
private
# Ensures the URL handed to the crawler carries an explicit HTTP(S) scheme.
#
# Surrounding whitespace is removed, nil is treated as an empty string, and
# the scheme check is case-insensitive so "HTTP://example.com" is not turned
# into "https://HTTP://example.com".
#
# @param url [String, nil] address as supplied by the caller
# @return [String] the trimmed URL, prefixed with "https://" when it has no
#   http/https scheme; an empty string when the input is blank
def normalize_url(url)
  candidate = url.to_s.strip
  return candidate if candidate.empty? || candidate.match?(%r{\Ahttps?://}i)

  "https://#{candidate}"
end
# Crawls @website_url and stores the text later sent to the LLM.
#
# On success @website_content holds the cleaned, truncated page text and
# @favicon_url holds the crawler's favicon URL. Any StandardError raised
# while crawling is logged and reported as a failure.
#
# @return [Boolean] true when content was stored, false otherwise
def fetch_website_content
  crawler = Captain::Tools::SimplePageCrawlService.new(@website_url)
  body = crawler.body_text_content
  title = crawler.page_title
  summary = crawler.meta_description

  if [title, summary, body].all?(&:blank?)
    Rails.logger.error "[Captain Onboarding] Failed to fetch #{@website_url}: No content found"
    return false
  end

  # Title and description are optional; the body text is always appended.
  sections = [
    ("Title: #{title}" if title.present?),
    ("Description: #{summary}" if summary.present?)
  ].compact
  sections << body

  @website_content = clean_and_truncate_content(sections.join("\n\n"))
  @favicon_url = crawler.favicon_url
  true
rescue StandardError => e
  Rails.logger.error "[Captain Onboarding] Failed to fetch #{@website_url}: #{e.message}"
  false
end
# Collapses every whitespace run to a single space, trims the ends, and caps
# the result at MAX_CONTENT_LENGTH characters.
#
# @param content [String] raw combined page text
# @return [String] normalized text no longer than MAX_CONTENT_LENGTH
def clean_and_truncate_content(content)
  collapsed = content.gsub(/\s+/, ' ')
  # slice returns the whole string when it is already within the limit.
  collapsed.strip.slice(0, MAX_CONTENT_LENGTH)
end
# Sends the analysis prompt to the chat client and parses the reply.
#
# @return [Hash] whatever parse_llm_response builds from the first choice's
#   message content
def extract_business_info
  user_message = { role: 'user', content: build_analysis_prompt }
  request = {
    model: model,
    messages: [user_message],
    response_format: { type: 'json_object' },
    temperature: 0.1,
    max_tokens: 1000
  }

  completion = client.chat(parameters: request)
  parse_llm_response(completion.dig('choices', 0, 'message', 'content'))
end
# Builds the single user message sent to the model: the extraction
# instructions, the JSON shape expected back (business_name,
# suggested_assistant_name, description) and the crawled page text
# interpolated from @website_content.
#
# NOTE(review): @website_content is third-party page text inserted verbatim,
# so a page could carry instructions aimed at the model — confirm whether
# delimiting or sanitizing it is wanted.
#
# @return [String] the complete prompt
def build_analysis_prompt
<<~PROMPT
Analyze the following website content and extract business information. Return a JSON response with the following structure:
{
"business_name": "The company or business name",
"suggested_assistant_name": "A friendly assistant name (e.g., 'Captain Assistant', 'Support Genie', etc.)",
"description": "Persona of the assistant based on the business type"
}
Guidelines:
- business_name: Extract the actual company/brand name from the content
- suggested_assistant_name: Create a friendly, professional name that customers would want to interact with
- description: Provide context about the business and what the assistant can help with. Keep it general and adaptable rather than overly specific. For example: "You specialize in helping customers with their orders and product questions" or "You assist customers with their account needs and general inquiries"
Website content:
#{@website_content}
Return only valid JSON, no additional text.
PROMPT
end
# Converts the raw LLM message content into the service's success payload.
#
# Missing content (nil), malformed JSON and JSON that is not an object are
# all reported through the same parse-failure response, so callers never
# receive an internal Ruby error message for a bad model reply.
#
# @param response_text [String, nil] JSON document returned by the model
# @return [Hash] { success: true, data: {...} } on success, otherwise the
#   error_response payload
def parse_llm_response(response_text)
  parsed_response = JSON.parse(response_text)
  # An array or scalar parses fine but cannot be read by key below.
  raise JSON::ParserError, 'expected a JSON object' unless parsed_response.is_a?(Hash)

  {
    success: true,
    data: {
      business_name: parsed_response['business_name'],
      suggested_assistant_name: parsed_response['suggested_assistant_name'],
      description: parsed_response['description'],
      website_url: @website_url,
      favicon_url: @favicon_url
    }
  }
rescue JSON::ParserError, TypeError => e
  # TypeError covers a nil response_text, i.e. the reply carried no content.
  Rails.logger.error "[Captain Onboarding] JSON parsing error: #{e.message}"
  Rails.logger.error "[Captain Onboarding] Raw response: #{response_text}"
  error_response('Failed to parse business information from website')
end
# Builds the failure payload, mirroring the success shape with blank fields.
#
# @param message [String] reason reported under :error
# @return [Hash] { success: false, error:, data: } where data carries empty
#   strings, the analyzed @website_url and a nil favicon
def error_response(message)
  blank_profile = {
    business_name: '',
    suggested_assistant_name: '',
    description: '',
    website_url: @website_url,
    favicon_url: nil
  }

  { success: false, error: message, data: blank_profile }
end
end

View File

@@ -19,6 +19,20 @@ class Captain::Tools::SimplePageCrawlService
ReverseMarkdown.convert @doc.at_xpath('//body'), unknown_tags: :bypass, github_flavored: true
end
# Reads the page's <meta name="description"> content.
#
# @return [String, nil] the content attribute with surrounding whitespace
#   stripped, or nil when the tag or its content attribute is absent
def meta_description
  content = @doc.at_css('meta[name="description"]')&.[]('content')
  content&.strip
end
# Locates the first <link> whose rel attribute contains "icon".
#
# @return [String, nil] the link's href passed through resolve_url, or nil
#   when no such link or href exists
def favicon_url
  href = @doc.at_css('link[rel*="icon"]')&.[]('href')
  resolve_url(href) if href
end
private
def sitemap?
@@ -35,4 +49,12 @@ class Captain::Tools::SimplePageCrawlService
absolute_url
end
end
# Turns an href found in the page into an absolute URL.
#
# Only values that start with an http:// or https:// scheme are treated as
# already absolute; a bare "http" prefix is not enough, because relative
# paths such as "httpicons/favicon.ico" must still be joined to the page URL.
# Surrounding whitespace is dropped before resolving.
#
# @param url [String] href value, absolute or relative to @external_link
# @return [String] the absolute URL, or the original value unchanged when it
#   cannot be resolved
def resolve_url(url)
  href = url.strip
  return href if href.match?(%r{\Ahttps?://}i)

  URI.join(@external_link, href).to_s
rescue StandardError
  # Best effort: hand back the raw value rather than failing the crawl.
  url
end
end

View File

@@ -1,5 +1,6 @@
class Llm::BaseOpenAiService
DEFAULT_MODEL = 'gpt-4o-mini'.freeze
attr_reader :client, :model
def initialize
@client = OpenAI::Client.new(

View File

@@ -0,0 +1,99 @@
require 'rails_helper'
# Exercises WebsiteAnalyzerService#analyze with the page crawler and the
# OpenAI client replaced by doubles.
RSpec.describe Captain::Onboarding::WebsiteAnalyzerService do
  let(:website_url) { 'https://example.com' }
  let(:service) { described_class.new(website_url) }
  let(:mock_crawler) { instance_double(Captain::Tools::SimplePageCrawlService) }
  let(:mock_client) { instance_double(OpenAI::Client) }

  before do
    create(:installation_config, name: 'CAPTAIN_OPEN_AI_API_KEY', value: 'test-key')
    allow(Captain::Tools::SimplePageCrawlService).to receive(:new).and_return(mock_crawler)
    allow(service).to receive_messages(client: mock_client, model: 'gpt-3.5-turbo')
  end

  describe '#analyze' do
    context 'when website content is available and OpenAI call is successful' do
      let(:openai_response) do
        persona = {
          'business_name' => 'Example Corp',
          'suggested_assistant_name' => 'Alex from Example Corp',
          'description' => 'You specialize in helping customers with business solutions and support'
        }

        { 'choices' => [{ 'message' => { 'content' => persona.to_json } }] }
      end

      before do
        allow(mock_crawler).to receive_messages(
          body_text_content: 'Welcome to Example Corp',
          page_title: 'Example Corp - Home',
          meta_description: 'Leading provider of business solutions',
          favicon_url: 'https://example.com/favicon.ico'
        )
        allow(mock_client).to receive(:chat).and_return(openai_response)
      end

      it 'returns success' do
        expect(service.analyze).to include(
          success: true,
          data: a_hash_including(
            business_name: 'Example Corp',
            suggested_assistant_name: 'Alex from Example Corp',
            description: 'You specialize in helping customers with business solutions and support',
            website_url: website_url,
            favicon_url: 'https://example.com/favicon.ico'
          )
        )
      end
    end

    context 'when website content is errored' do
      before do
        # The crawl fails on the very first read, so nothing else is stubbed.
        allow(mock_crawler).to receive(:body_text_content).and_raise(StandardError, 'Network error')
      end

      it 'returns error' do
        expect(service.analyze).to include(success: false, error: 'Failed to fetch website content')
      end
    end

    context 'when website content is unavailable' do
      before do
        allow(mock_crawler).to receive_messages(body_text_content: '', page_title: '', meta_description: '')
      end

      it 'returns error' do
        expect(service.analyze).to include(success: false, error: 'Failed to fetch website content')
      end
    end

    context 'when OpenAI error' do
      before do
        allow(mock_crawler).to receive_messages(
          body_text_content: 'Welcome to Example Corp',
          page_title: 'Example Corp - Home',
          meta_description: 'Leading provider of business solutions',
          favicon_url: 'https://example.com/favicon.ico'
        )
        allow(mock_client).to receive(:chat).and_raise(StandardError, 'API error')
      end

      it 'returns error' do
        expect(service.analyze).to include(success: false, error: 'API error')
      end
    end
  end
end

View File

@@ -125,4 +125,63 @@ RSpec.describe Captain::Tools::SimplePageCrawlService do
)
end
end
# Covers SimplePageCrawlService#meta_description with and without the tag.
describe '#meta_description' do
  # Each context supplies the HTML served for base_url.
  before { stub_request(:get, base_url).to_return(body: page_html) }

  context 'when meta description exists' do
    let(:page_html) do
      '<html><head><meta name="description" content="This is a test page description"></head></html>'
    end

    it 'returns the meta description content' do
      expect(service.meta_description).to eq('This is a test page description')
    end
  end

  context 'when meta description does not exist' do
    let(:page_html) { '<html><head><title>Test</title></head></html>' }

    it 'returns nil' do
      expect(service.meta_description).to be_nil
    end
  end
end
# Covers SimplePageCrawlService#favicon_url for relative, absolute and
# missing icon links.
describe '#favicon_url' do
  # Each context supplies the HTML served for base_url.
  before { stub_request(:get, base_url).to_return(body: page_html) }

  context 'when favicon exists with relative URL' do
    let(:page_html) { '<html><head><link rel="icon" href="/favicon.ico"></head></html>' }

    it 'returns the resolved absolute favicon URL' do
      expect(service.favicon_url).to eq('https://example.com/favicon.ico')
    end
  end

  context 'when favicon exists with absolute URL' do
    let(:page_html) do
      '<html><head><link rel="icon" href="https://cdn.example.com/favicon.ico"></head></html>'
    end

    it 'returns the absolute favicon URL' do
      expect(service.favicon_url).to eq('https://cdn.example.com/favicon.ico')
    end
  end

  context 'when favicon does not exist' do
    let(:page_html) { '<html><head><title>Test</title></head></html>' }

    it 'returns nil' do
      expect(service.favicon_url).to be_nil
    end
  end
end
end