From 5891fd6f4980e4b92e284e8a6669018578b34c05 Mon Sep 17 00:00:00 2001 From: Pranav Date: Sat, 25 Oct 2025 15:50:50 -0700 Subject: [PATCH] feat(ee): Add a service to fetch website content and prepare a persona of Captain Assistant (#12732) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR is the first of many to simplify the process of building an assistant. The new flow will only require the user’s website. We’ll automatically crawl it, identify the business name and what the business does, and then generate a suggested assistant persona, complete with a proposed name and description. This service returns the following. Example: tooljet.com Screenshot 2025-10-25 at 2 55 04 PM Example: replit.com Screenshot 2025-10-25 at 2 56 42 PM --- .../onboarding/website_analyzer_service.rb | 129 ++++++++++++++++++ .../tools/simple_page_crawl_service.rb | 22 +++ .../app/services/llm/base_open_ai_service.rb | 1 + .../website_analyzer_service_spec.rb | 99 ++++++++++++++ .../tools/simple_page_crawl_service_spec.rb | 59 ++++++++ 5 files changed, 310 insertions(+) create mode 100644 enterprise/app/services/captain/onboarding/website_analyzer_service.rb create mode 100644 spec/enterprise/services/captain/onboarding/website_analyzer_service_spec.rb diff --git a/enterprise/app/services/captain/onboarding/website_analyzer_service.rb b/enterprise/app/services/captain/onboarding/website_analyzer_service.rb new file mode 100644 index 000000000..d3e7d4983 --- /dev/null +++ b/enterprise/app/services/captain/onboarding/website_analyzer_service.rb @@ -0,0 +1,129 @@ +class Captain::Onboarding::WebsiteAnalyzerService < Llm::BaseOpenAiService + MAX_CONTENT_LENGTH = 8000 + + def initialize(website_url) + super() + @website_url = normalize_url(website_url) + @website_content = nil + @favicon_url = nil + end + + def analyze + fetch_website_content + return error_response('Failed to fetch website content') unless @website_content + + extract_business_info + rescue StandardError => e + Rails.logger.error "[Captain Onboarding] Website analysis error: #{e.message}" + error_response(e.message) + end + + private + + def normalize_url(url) + return url if url.match?(%r{\Ahttps?://}) + + "https://#{url}" + end + + def fetch_website_content + crawler = Captain::Tools::SimplePageCrawlService.new(@website_url) + + text_content = crawler.body_text_content + page_title = crawler.page_title + meta_description = crawler.meta_description + + if page_title.blank? && meta_description.blank? && text_content.blank? + Rails.logger.error "[Captain Onboarding] Failed to fetch #{@website_url}: No content found" + return false + end + + combined_content = [] + combined_content << "Title: #{page_title}" if page_title.present? + combined_content << "Description: #{meta_description}" if meta_description.present? + combined_content << text_content + + @website_content = clean_and_truncate_content(combined_content.join("\n\n")) + @favicon_url = crawler.favicon_url + true + rescue StandardError => e + Rails.logger.error "[Captain Onboarding] Failed to fetch #{@website_url}: #{e.message}" + false + end + + def clean_and_truncate_content(content) + cleaned = content.gsub(/\s+/, ' ').strip + cleaned.length > MAX_CONTENT_LENGTH ? cleaned[0...MAX_CONTENT_LENGTH] : cleaned + end + + def extract_business_info + prompt = build_analysis_prompt + + response = client.chat( + parameters: { + model: model, + messages: [{ role: 'user', content: prompt }], + response_format: { type: 'json_object' }, + temperature: 0.1, + max_tokens: 1000 + } + ) + + parse_llm_response(response.dig('choices', 0, 'message', 'content')) + end + + def build_analysis_prompt + <<~PROMPT + Analyze the following website content and extract business information. Return a JSON response with the following structure: + + { + "business_name": "The company or business name", + "suggested_assistant_name": "A friendly assistant name (e.g., 'Captain Assistant', 'Support Genie', etc.)", + "description": "Persona of the assistant based on the business type" + } + + Guidelines: + - business_name: Extract the actual company/brand name from the content + - suggested_assistant_name: Create a friendly, professional name that customers would want to interact with + - description: Provide context about the business and what the assistant can help with. Keep it general and adaptable rather than overly specific. For example: "You specialize in helping customers with their orders and product questions" or "You assist customers with their account needs and general inquiries" + + Website content: + #{@website_content} + + Return only valid JSON, no additional text. + PROMPT + end + + def parse_llm_response(response_text) + parsed_response = JSON.parse(response_text) + + { + success: true, + data: { + business_name: parsed_response['business_name'], + suggested_assistant_name: parsed_response['suggested_assistant_name'], + description: parsed_response['description'], + website_url: @website_url, + favicon_url: @favicon_url + } + } + rescue JSON::ParserError => e + Rails.logger.error "[Captain Onboarding] JSON parsing error: #{e.message}" + Rails.logger.error "[Captain Onboarding] Raw response: #{response_text}" + error_response('Failed to parse business information from website') + end + + def error_response(message) + { + success: false, + error: message, + data: { + business_name: '', + suggested_assistant_name: '', + description: '', + website_url: @website_url, + favicon_url: nil + } + } + end +end diff --git a/enterprise/app/services/captain/tools/simple_page_crawl_service.rb b/enterprise/app/services/captain/tools/simple_page_crawl_service.rb index d6baa1ebe..65731ad90 100644 --- a/enterprise/app/services/captain/tools/simple_page_crawl_service.rb +++ b/enterprise/app/services/captain/tools/simple_page_crawl_service.rb @@ -19,6 +19,20 @@ class Captain::Tools::SimplePageCrawlService ReverseMarkdown.convert @doc.at_xpath('//body'), unknown_tags: :bypass, github_flavored: true end + def meta_description + meta_desc = @doc.at_css('meta[name="description"]') + return nil unless meta_desc && meta_desc['content'] + + meta_desc['content'].strip + end + + def favicon_url + favicon_link = @doc.at_css('link[rel*="icon"]') + return nil unless favicon_link && favicon_link['href'] + + resolve_url(favicon_link['href']) + end + private def sitemap? @@ -35,4 +49,12 @@ class Captain::Tools::SimplePageCrawlService absolute_url end end + + def resolve_url(url) + return url if url.start_with?('http') + + URI.join(@external_link, url).to_s + rescue StandardError + url + end end diff --git a/enterprise/app/services/llm/base_open_ai_service.rb b/enterprise/app/services/llm/base_open_ai_service.rb index 2d3932246..e3e88453c 100644 --- a/enterprise/app/services/llm/base_open_ai_service.rb +++ b/enterprise/app/services/llm/base_open_ai_service.rb @@ -1,5 +1,6 @@ class Llm::BaseOpenAiService DEFAULT_MODEL = 'gpt-4o-mini'.freeze + attr_reader :client, :model def initialize @client = OpenAI::Client.new( diff --git a/spec/enterprise/services/captain/onboarding/website_analyzer_service_spec.rb b/spec/enterprise/services/captain/onboarding/website_analyzer_service_spec.rb new file mode 100644 index 000000000..a2735bd69 --- /dev/null +++ b/spec/enterprise/services/captain/onboarding/website_analyzer_service_spec.rb @@ -0,0 +1,99 @@ +require 'rails_helper' + +RSpec.describe Captain::Onboarding::WebsiteAnalyzerService do + let(:website_url) { 'https://example.com' } + let(:service) { described_class.new(website_url) } + let(:mock_crawler) { instance_double(Captain::Tools::SimplePageCrawlService) } + let(:mock_client) { instance_double(OpenAI::Client) } + + before do + create(:installation_config, name: 'CAPTAIN_OPEN_AI_API_KEY', value: 'test-key') + allow(Captain::Tools::SimplePageCrawlService).to receive(:new).and_return(mock_crawler) + allow(service).to receive(:client).and_return(mock_client) + allow(service).to receive(:model).and_return('gpt-3.5-turbo') + end + + describe '#analyze' do + context 'when website content is available and OpenAI call is successful' do + let(:openai_response) do + { + 'choices' => [{ + 'message' => { + 'content' => { + 'business_name' => 'Example Corp', + 'suggested_assistant_name' => 'Alex from Example Corp', + 'description' => 'You specialize in helping customers with business solutions and support' + }.to_json + } + }] + } + end + + before do + allow(mock_crawler).to receive(:body_text_content).and_return('Welcome to Example Corp') + allow(mock_crawler).to receive(:page_title).and_return('Example Corp - Home') + allow(mock_crawler).to receive(:meta_description).and_return('Leading provider of business solutions') + allow(mock_crawler).to receive(:favicon_url).and_return('https://example.com/favicon.ico') + allow(mock_client).to receive(:chat).and_return(openai_response) + end + + it 'returns success' do + result = service.analyze + + expect(result[:success]).to be true + expect(result[:data]).to include( + business_name: 'Example Corp', + suggested_assistant_name: 'Alex from Example Corp', + description: 'You specialize in helping customers with business solutions and support', + website_url: website_url, + favicon_url: 'https://example.com/favicon.ico' + ) + end + end + + context 'when website content is errored' do + before do + allow(mock_crawler).to receive(:body_text_content).and_raise(StandardError, 'Network error') + end + + it 'returns error' do + result = service.analyze + + expect(result[:success]).to be false + expect(result[:error]).to eq('Failed to fetch website content') + end + end + + context 'when website content is unavailable' do + before do + allow(mock_crawler).to receive(:body_text_content).and_return('') + allow(mock_crawler).to receive(:page_title).and_return('') + allow(mock_crawler).to receive(:meta_description).and_return('') + end + + it 'returns error' do + result = service.analyze + + expect(result[:success]).to be false + expect(result[:error]).to eq('Failed to fetch website content') + end + end + + context 'when OpenAI error' do + before do + allow(mock_crawler).to receive(:body_text_content).and_return('Welcome to Example Corp') + allow(mock_crawler).to receive(:page_title).and_return('Example Corp - Home') + allow(mock_crawler).to receive(:meta_description).and_return('Leading provider of business solutions') + allow(mock_crawler).to receive(:favicon_url).and_return('https://example.com/favicon.ico') + allow(mock_client).to receive(:chat).and_raise(StandardError, 'API error') + end + + it 'returns error' do + result = service.analyze + + expect(result[:success]).to be false + expect(result[:error]).to eq('API error') + end + end + end +end diff --git a/spec/enterprise/services/captain/tools/simple_page_crawl_service_spec.rb b/spec/enterprise/services/captain/tools/simple_page_crawl_service_spec.rb index 5868c0e22..5dfe7177d 100644 --- a/spec/enterprise/services/captain/tools/simple_page_crawl_service_spec.rb +++ b/spec/enterprise/services/captain/tools/simple_page_crawl_service_spec.rb @@ -125,4 +125,63 @@ RSpec.describe Captain::Tools::SimplePageCrawlService do ) end end + + describe '#meta_description' do + context 'when meta description exists' do + before do + stub_request(:get, base_url) + .to_return(body: '') + end + + it 'returns the meta description content' do + expect(service.meta_description).to eq('This is a test page description') + end + end + + context 'when meta description does not exist' do + before do + stub_request(:get, base_url) + .to_return(body: 'Test') + end + + it 'returns nil' do + expect(service.meta_description).to be_nil + end + end + end + + describe '#favicon_url' do + context 'when favicon exists with relative URL' do + before do + stub_request(:get, base_url) + .to_return(body: '') + end + + it 'returns the resolved absolute favicon URL' do + expect(service.favicon_url).to eq('https://example.com/favicon.ico') + end + end + + context 'when favicon exists with absolute URL' do + before do + stub_request(:get, base_url) + .to_return(body: '') + end + + it 'returns the absolute favicon URL' do + expect(service.favicon_url).to eq('https://cdn.example.com/favicon.ico') + end + end + + context 'when favicon does not exist' do + before do + stub_request(:get, base_url) + .to_return(body: 'Test') + end + + it 'returns nil' do + expect(service.favicon_url).to be_nil + end + end + end end