This PR is the first of many to simplify the process of building an assistant. The new flow will only require the user’s website. We’ll automatically crawl it, identify the business name and what the business does, and then generate a suggested assistant persona, complete with a proposed name and description.

This service returns the following.

Example: tooljet.com

<img width="795" height="217" alt="Screenshot 2025-10-25 at 2 55 04 PM" src="https://github.com/user-attachments/assets/9cb3594a-9c9c-4970-a0a1-4c9c8869c193" />

Example: replit.com

<img width="797" height="176" alt="Screenshot 2025-10-25 at 2 56 42 PM" src="https://github.com/user-attachments/assets/6a1b4266-aab6-455f-a5e3-696d3a8243c9" />
61 lines
1.3 KiB
Ruby
61 lines
1.3 KiB
Ruby
# Fetches a single URL and exposes pieces of the parsed document: the links it
# contains, its title, its body rendered through ReverseMarkdown, its meta
# description and its favicon URL.
class Captain::Tools::SimplePageCrawlService
  # Matches URLs that already carry an explicit http/https scheme.
  ABSOLUTE_HTTP_URL = %r{\Ahttps?://}.freeze

  attr_reader :external_link

  # Performs the HTTP request and parses the response body right away.
  # Errors raised by HTTParty or Nokogiri are not handled here and propagate
  # to the caller.
  #
  # @param external_link [String] URL of the page (or `.xml` sitemap) to fetch
  def initialize(external_link)
    @external_link = external_link
    @doc = Nokogiri::HTML(HTTParty.get(external_link).body)
  end

  # @return [Set<String>] `<loc>` values when the URL ends in `.xml`,
  #   otherwise the `<a href>` targets resolved against the fetched URL
  def page_links
    sitemap? ? extract_links_from_sitemap : extract_links_from_html
  end

  # @return [String, nil] stripped text of the `<title>` element, or nil when
  #   the document has none
  def page_title
    @doc.at_xpath('//title')&.text&.strip
  end

  # Converts the `<body>` element with ReverseMarkdown (GitHub-flavored,
  # unknown tags bypassed).
  def body_text_content
    ReverseMarkdown.convert @doc.at_xpath('//body'), unknown_tags: :bypass, github_flavored: true
  end

  # @return [String, nil] stripped `content` of `<meta name="description">`,
  #   or nil when the tag or the attribute is missing
  def meta_description
    meta_tag = @doc.at_css('meta[name="description"]')
    content = meta_tag && meta_tag['content']
    content&.strip
  end

  # @return [String, nil] href of the first `<link>` whose `rel` contains
  #   "icon", resolved against the fetched URL; nil when there is none
  def favicon_url
    icon_link = @doc.at_css('link[rel*="icon"]')
    href = icon_link && icon_link['href']
    return nil unless href

    resolve_url(href)
  end

  private

  # The URL is treated as a sitemap purely on the basis of its `.xml` suffix.
  def sitemap?
    @external_link.end_with?('.xml')
  end

  def extract_links_from_sitemap
    @doc.xpath('//loc').to_set(&:text)
  end

  # Collects every anchor href as an absolute URL. Hrefs that cannot be parsed
  # as URIs are skipped so that one malformed link does not abort the whole
  # extraction.
  def extract_links_from_html
    @doc.xpath('//a/@href').each_with_object(Set.new) do |href, links|
      absolute_url = join_with_base(href.value)
      links << absolute_url if absolute_url
    end
  end

  # Returns +url+ untouched when it already has an http(s) scheme; otherwise
  # resolves it against the fetched URL. Falls back to the raw value when it
  # cannot be resolved (best effort).
  def resolve_url(url)
    # Anchor on the full scheme: a bare 'http' prefix would also match
    # relative paths such as "http-icons/favicon.ico".
    return url if url.match?(ABSOLUTE_HTTP_URL)

    join_with_base(url) || url
  end

  # Joins +url+ (surrounding whitespace removed) onto the fetched URL.
  #
  # @return [String, nil] the absolute URL, or nil when URI parsing/merging
  #   raises a URI::Error
  def join_with_base(url)
    URI.join(@external_link, url.strip).to_s
  rescue URI::Error
    nil
  end
end
|