Files
leadchat/enterprise/app/services/captain/tools/simple_page_crawl_service.rb
Pranav 5891fd6f49 feat(ee): Add a service to fetch website content and prepare a persona of Captain Assistant (#12732)
This PR is the first of many to simplify the process of building an
assistant. The new flow will only require the user’s website. We’ll
automatically crawl it, identify the business name and what the business
does, and then generate a suggested assistant persona, complete with a
proposed name and description.

This service returns the following.
Example: tooljet.com
<img width="795" height="217" alt="Screenshot 2025-10-25 at 2 55 04 PM"
src="https://github.com/user-attachments/assets/9cb3594a-9c9c-4970-a0a1-4c9c8869c193"
/>

Example: replit.com
<img width="797" height="176" alt="Screenshot 2025-10-25 at 2 56 42 PM"
src="https://github.com/user-attachments/assets/6a1b4266-aab6-455f-a5e3-696d3a8243c9"
/>
2025-10-25 15:50:50 -07:00

61 lines
1.3 KiB
Ruby

class Captain::Tools::SimplePageCrawlService
attr_reader :external_link
def initialize(external_link)
@external_link = external_link
@doc = Nokogiri::HTML(HTTParty.get(external_link).body)
end
def page_links
sitemap? ? extract_links_from_sitemap : extract_links_from_html
end
def page_title
title_element = @doc.at_xpath('//title')
title_element&.text&.strip
end
def body_text_content
ReverseMarkdown.convert @doc.at_xpath('//body'), unknown_tags: :bypass, github_flavored: true
end
def meta_description
meta_desc = @doc.at_css('meta[name="description"]')
return nil unless meta_desc && meta_desc['content']
meta_desc['content'].strip
end
def favicon_url
favicon_link = @doc.at_css('link[rel*="icon"]')
return nil unless favicon_link && favicon_link['href']
resolve_url(favicon_link['href'])
end
private
def sitemap?
@external_link.end_with?('.xml')
end
def extract_links_from_sitemap
@doc.xpath('//loc').to_set(&:text)
end
def extract_links_from_html
@doc.xpath('//a/@href').to_set do |link|
absolute_url = URI.join(@external_link, link.value).to_s
absolute_url
end
end
def resolve_url(url)
return url if url.start_with?('http')
URI.join(@external_link, url).to_s
rescue StandardError
url
end
end