feat: firecrawl branding api [UPM-15] (#13903)
Adds `WebsiteBrandingService` (OSS) with an Enterprise override using
Firecrawl v2 to extract branding and business data from a URL for
onboarding auto-fill.
OSS version uses HTTParty + Nokogiri to extract:
- Business name (og:site_name or title)
- Language (html lang)
- Favicon
- Social links from `<a>` tags
Enterprise version makes a single Firecrawl call to fetch:
- Structured JSON (name, language, industry via LLM)
- Branding (favicon, primary color)
- Page links
Falls back to OSS if Firecrawl is unavailable or fails.
Social handles (WhatsApp, Facebook, Instagram, Telegram, TikTok, LINE)
are parsed deterministically via a shared `SocialLinkParser`.
> We parse social handles from page links because LLM extraction was unreliable:
it mostly returned empty results and, in rare cases, hallucinated handles.
## How to test
```ruby
# OSS (no Firecrawl key needed)
WebsiteBrandingService.new('chatwoot.com').perform
# Enterprise (requires CAPTAIN_FIRECRAWL_API_KEY)
WebsiteBrandingService.new('notion.so').perform
WebsiteBrandingService.new('postman.com').perform
```
Verify the returned hash includes business_name, language,
industry_category, social_handles, and branding with
favicon/primary_color.
<img width="908" height="393" alt="image"
src="https://github.com/user-attachments/assets/e3696887-d366-485a-89a0-8e1a9698a788"
/>
This commit is contained in:
65
app/services/concerns/social_link_parser.rb
Normal file
65
app/services/concerns/social_link_parser.rb
Normal file
@@ -0,0 +1,65 @@
|
||||
# Deterministic helpers for pulling social-media handles out of a list of URLs.
#
# A link is attributed to a platform when its host equals, or is a subdomain of,
# one of the hosts in SOCIAL_DOMAIN_MAP. The handle is then read from the URL
# path, except for WhatsApp where a phone number is read from the `phone` query
# parameter or from the path.
module SocialLinkParser
  extend ActiveSupport::Concern

  # Platform key => hosts that identify a link to that platform.
  SOCIAL_DOMAIN_MAP = {
    whatsapp: %w[wa.me api.whatsapp.com],
    line: %w[line.me],
    facebook: %w[facebook.com fb.com fb.me],
    instagram: %w[instagram.com],
    telegram: %w[t.me telegram.me],
    tiktok: %w[tiktok.com]
  }.freeze

  # Leading path segments that denote share/intent endpoints rather than a profile.
  SHARE_PATH_PREFIXES = %w[sharer share intent dialog].freeze

  private

  # Builds a hash with one entry per platform in SOCIAL_DOMAIN_MAP.
  #
  # @param links [Array<String>, nil] candidate URLs
  # @return [Hash{Symbol => String, nil}] handle per platform, nil when none was found
  def extract_social_from_links(links)
    SOCIAL_DOMAIN_MAP.to_h do |platform, domains|
      [platform, find_social_handle(links, platform, domains)]
    end
  end

  # Returns the first non-blank handle found among the links that belong to
  # one of +domains+, scanning +links+ in order.
  #
  # @param links [Array<String>, nil] candidate URLs; nil is treated as empty
  # @param platform [Symbol] key from SOCIAL_DOMAIN_MAP
  # @param domains [Array<String>] hosts that identify the platform
  # @return [String, nil]
  def find_social_handle(links, platform, domains)
    Array(links).each do |link|
      next unless social_link?(link, domains)

      handle = parse_social_handle(platform, link)
      return handle if handle.present?
    end
    nil
  end

  # True when +link+ parses as a URI whose host matches one of +domains+.
  # Unparseable links are treated as non-matching.
  def social_link?(link, domains)
    host = URI.parse(link).host
    domains.any? { |domain| match_social_domain?(host, domain) }
  rescue URI::InvalidURIError
    false
  end

  # True when +host+ is +domain+ itself or one of its subdomains.
  # The host is lowercased first because URI.parse preserves the original case.
  def match_social_domain?(host, domain)
    return false if host.blank?

    normalized = host.downcase
    normalized == domain || normalized.end_with?(".#{domain}")
  end

  # Extracts the handle for +platform+ from +link+.
  #
  # For WhatsApp this is a digits-only phone number; for every other platform it
  # is the URL path without its leading/trailing slash. Share/intent endpoints
  # (see SHARE_PATH_PREFIXES) and unparseable links yield nil.
  #
  # @param platform [Symbol]
  # @param link [String]
  # @return [String, nil]
  def parse_social_handle(platform, link)
    uri = URI.parse(link)
    return extract_whatsapp_phone(uri) if platform == :whatsapp

    handle = uri.path.to_s.delete_prefix('/').delete_suffix('/')
    return nil if handle.blank? || share_path?(handle)

    handle
  rescue URI::InvalidURIError
    nil
  end

  # True when the first path segment is a share endpoint such as `sharer`,
  # `sharer.php`, `share/...` or `intent/...`. Only the whole first segment is
  # compared, so real handles that merely begin with one of the prefixes
  # (e.g. `sharechat`) are not discarded.
  def share_path?(handle)
    first_segment = handle.split('/').first.to_s
    SHARE_PATH_PREFIXES.include?(first_segment.delete_suffix('.php'))
  end

  # wa.me/1234567890 or api.whatsapp.com/send?phone=1234567890
  #
  # @param uri [URI::Generic]
  # @return [String, nil] digits-only phone number, or nil when none is present
  def extract_whatsapp_phone(uri)
    phone = CGI.parse(uri.query.to_s)['phone']&.first
    phone = whatsapp_phone_from_path(uri) if phone.blank?
    phone.to_s.gsub(/\D/, '').presence
  end

  # Returns the path as a phone candidate only when it consists solely of
  # phone-number characters. This keeps paths such as `send` or
  # `message/ABC123` from being reduced to a bogus number.
  def whatsapp_phone_from_path(uri)
    candidate = CGI.unescape(uri.path.to_s.delete_prefix('/').delete_suffix('/'))
    candidate if candidate.match?(/\A[\d\s+().-]+\z/)
  end
end
|
||||
93
app/services/website_branding_service.rb
Normal file
93
app/services/website_branding_service.rb
Normal file
@@ -0,0 +1,93 @@
|
||||
# Fetches a web page and extracts basic business/branding details from its HTML:
# business name, language, social handles (via SocialLinkParser) and
# favicon / theme colour.
#
# Usage: WebsiteBrandingService.new('example.com').perform
class WebsiteBrandingService
  include SocialLinkParser

  # Matches an explicit http(s) scheme, case-insensitively.
  HTTP_SCHEME = %r{\Ahttps?://}i
  # href prefixes that never point at a page worth resolving.
  NON_NAVIGABLE_PREFIXES = %w[# javascript: mailto: tel:].freeze
  # Title separators. `|`, dashes and `·` always split; `-` and `:` split only
  # when whitespace sits on at least one side, so "Coca-Cola" stays intact.
  TITLE_SEPARATOR = /\s*[|–—·]+\s*|\s+[-:]+\s*|\s*[-:]+\s+/

  # @param url [String] page URL; `https://` is assumed when no http(s) scheme is given
  def initialize(url)
    @url = normalize_url(url)
  end

  # Fetches the page and builds the branding hash.
  #
  # @return [Hash, nil] keys :business_name, :language, :industry_category
  #   (always nil here), :social_handles and :branding; nil when the page
  #   cannot be fetched or any error is raised during extraction
  def perform
    doc = fetch_page
    return if doc.nil?

    {
      business_name: extract_business_name(doc),
      language: extract_language(doc),
      industry_category: nil,
      social_handles: extract_social_from_links(extract_links(doc)),
      branding: extract_branding(doc)
    }
  rescue StandardError => e
    Rails.logger.error "[WebsiteBranding] #{e.message}"
    nil
  end

  private

  # Trims the input and prefixes `https://` unless an http(s) scheme is present.
  # nil is tolerated (treated as an empty string) so construction never raises.
  def normalize_url(url)
    candidate = url.to_s.strip
    candidate.match?(HTTP_SCHEME) ? candidate : "https://#{candidate}"
  end

  # Downloads and parses the page. Any failure is logged and reported as nil.
  #
  # NOTE(review): @url comes from the caller and redirects are followed, so this
  # request can reach internal/private addresses (SSRF) if the input is
  # untrusted. Confirm that callers validate the target or route this through
  # an egress-restricted fetcher.
  def fetch_page
    response = HTTParty.get(@url, follow_redirects: true, timeout: 15)
    return nil unless response.success?

    Nokogiri::HTML(response.body)
  rescue StandardError => e
    Rails.logger.error "[WebsiteBranding] Failed to fetch #{@url}: #{e.message}"
    nil
  end

  # Prefers `og:site_name`; otherwise uses the part of <title> that precedes
  # the first separator (see TITLE_SEPARATOR). Returns nil when neither yields text.
  def extract_business_name(doc)
    og_site_name = doc.at_css('meta[property="og:site_name"]')&.[]('content')
    return og_site_name.strip if og_site_name.present?

    title = doc.at_xpath('//title')&.text.to_s.strip
    title.split(TITLE_SEPARATOR).first.presence
  end

  # Primary language subtag of <html lang>, lowercased ("en-US" / "en_US" => "en").
  def extract_language(doc)
    lang = doc.at_css('html')&.[]('lang').to_s.strip
    lang.split(/[-_]/).first&.downcase.presence
  end

  # Collects unique absolute URLs from the page's anchors, skipping fragment,
  # javascript:, mailto: and tel: hrefs as well as hrefs that cannot be resolved.
  def extract_links(doc)
    doc.css('a[href]').filter_map do |anchor|
      href = anchor['href']&.strip
      next if href.blank? || href.start_with?(*NON_NAVIGABLE_PREFIXES)

      resolve_url(href)
    end.uniq
  end

  # @return [Hash] :favicon (absolute URL or nil) and :primary_color (theme-color or nil)
  def extract_branding(doc)
    {
      favicon: extract_favicon(doc),
      primary_color: extract_theme_color(doc)
    }
  end

  # Uses the first <link> whose rel attribute contains "icon" (a substring
  # match, so variants such as "shortcut icon" or "apple-touch-icon" qualify).
  def extract_favicon(doc)
    favicon = doc.at_css('link[rel*="icon"]')&.[]('href')
    return nil if favicon.blank?

    resolve_url(favicon)
  end

  # Content of <meta name="theme-color">, trimmed; nil when absent or blank.
  def extract_theme_color(doc)
    doc.at_css('meta[name="theme-color"]')&.[]('content')&.strip.presence
  end

  # Returns +url+ unchanged when it already carries an http(s) scheme,
  # otherwise resolves it against @url. Blank or unresolvable input yields nil.
  def resolve_url(url)
    return nil if url.blank?
    return url if url.match?(HTTP_SCHEME)

    URI.join(@url, url).to_s
  rescue URI::InvalidURIError
    nil
  end
end
|
||||
|
||||
# Hook for the Enterprise override of this service. `prepend_mod_with` is defined
# elsewhere in the project — presumably it prepends an edition-specific module
# of the given name when one exists (TODO confirm).
WebsiteBrandingService.prepend_mod_with('WebsiteBrandingService')
|
||||
Reference in New Issue
Block a user