# --- app/services/concerns/social_link_parser.rb ---

# Extracts social-media handles from a list of URLs.
#
# The including class calls +extract_social_from_links+ with the links found
# on a page and gets back one handle (or nil) per supported platform.
module SocialLinkParser
  extend ActiveSupport::Concern

  # Platform => hostnames identifying a link to that platform.
  # Subdomains of each entry (www., m., ...) match as well.
  SOCIAL_DOMAIN_MAP = {
    whatsapp: %w[wa.me api.whatsapp.com],
    line: %w[line.me],
    facebook: %w[facebook.com fb.com fb.me],
    instagram: %w[instagram.com],
    telegram: %w[t.me telegram.me],
    tiktok: %w[tiktok.com]
  }.freeze

  # First path segments that denote a share/intent widget, not a profile.
  SHARE_PATH_PREFIXES = %w[sharer share intent dialog].freeze

  private

  # Builds a hash with one entry per key of SOCIAL_DOMAIN_MAP.
  #
  # @param links [Array<String>, nil] URLs to inspect; nil is treated as empty
  # @return [Hash{Symbol => String, nil}] platform => handle, nil when none found
  def extract_social_from_links(links)
    candidates = Array(links)
    SOCIAL_DOMAIN_MAP.to_h do |platform, domains|
      [platform, find_social_handle(candidates, platform, domains)]
    end
  end

  # Returns the first non-blank handle among the links whose host belongs to
  # +domains+, scanning in the order given. Unparseable links are skipped.
  #
  # @param links [Array<String>]
  # @param platform [Symbol] key of SOCIAL_DOMAIN_MAP
  # @param domains [Array<String>] hostnames accepted for +platform+
  # @return [String, nil]
  def find_social_handle(links, platform, domains)
    links.each do |link|
      host = URI.parse(link).host
      next unless domains.any? { |domain| match_social_domain?(host, domain) }

      handle = parse_social_handle(platform, link)
      return handle if handle.present?
    rescue URI::InvalidURIError
      next
    end
    nil
  end

  # True when +host+ equals +domain+ or is a subdomain of it.
  # URI#host keeps the case found in the link, so compare case-insensitively.
  def match_social_domain?(host, domain)
    return false if host.blank?

    normalized = host.downcase
    normalized == domain || normalized.end_with?(".#{domain}")
  end

  # Derives the handle from a link already known to belong to +platform+.
  #
  # WhatsApp links yield a phone number; every other platform yields the URL
  # path without its leading/trailing slash. Share/intent widget URLs and
  # unparseable links yield nil.
  #
  # @param platform [Symbol]
  # @param link [String]
  # @return [String, nil]
  def parse_social_handle(platform, link)
    uri = URI.parse(link)
    return extract_whatsapp_phone(uri) if platform == :whatsapp

    handle = uri.path.to_s.delete_prefix('/').delete_suffix('/')
    return nil if handle.blank?

    # Compare the whole first segment (minus a script extension such as
    # "sharer.php"), not a string prefix, so that genuine handles like
    # "shareit" or "dialogue_co" are not mistaken for share widgets.
    first_segment = handle.split('/').first.to_s.sub(/\.(?:php|html?)\z/i, '')
    return nil if SHARE_PATH_PREFIXES.include?(first_segment.downcase)

    handle
  rescue URI::InvalidURIError
    nil
  end

  # wa.me/1234567890 or api.whatsapp.com/send?phone=1234567890
  #
  # Prefers the +phone+ query parameter and falls back to the path.
  #
  # @param uri [URI::Generic]
  # @return [String, nil] digits only; nil when no digit is present
  def extract_whatsapp_phone(uri)
    phone = CGI.parse(uri.query.to_s)['phone']&.first
    phone = uri.path.to_s.delete_prefix('/').delete_suffix('/') if phone.blank?

    phone.to_s.delete('^0-9').presence
  end
end

# --- app/services/website_branding_service.rb ---

# Scrapes a website's landing page for basic branding details: business name,
# language, social handles, favicon and theme colour.
class WebsiteBrandingService
  include SocialLinkParser

  # Matches an explicit http/https scheme, in any letter case.
  HTTP_URL_PATTERN = %r{\Ahttps?://}i
  # href values that are not navigable page links.
  SKIPPED_HREF_PATTERN = /\A(?:#|javascript:|mailto:|tel:)/i
  # Separators between the site name and the rest of a <title>. Dashes and
  # colons only count when set off by whitespace, so names such as
  # "Coca-Cola" or "10:30 Cafe" stay intact.
  TITLE_SEPARATOR_PATTERN = /\s*[|·]+\s*|\s+[\-–—]+\s+|\s*:+\s+/

  # @param url [String] site address; "https://" is assumed when no http(s)
  #   scheme is given
  def initialize(url)
    @url = normalize_url(url)
  end

  # Fetches the page and extracts the branding details.
  #
  # @return [Hash, nil] hash with :business_name, :language,
  #   :industry_category (always nil here), :social_handles and :branding;
  #   nil when the page cannot be fetched or any StandardError is raised
  def perform
    doc = fetch_page
    return nil if doc.nil?

    links = extract_links(doc)

    {
      business_name: extract_business_name(doc),
      language: extract_language(doc),
      industry_category: nil,
      social_handles: extract_social_from_links(links),
      branding: extract_branding(doc)
    }
  rescue StandardError => e
    Rails.logger.error "[WebsiteBranding] #{e.message}"
    nil
  end

  private

  # Trims the input and prepends "https://" unless an http(s) scheme is
  # already present. nil is treated as an empty string.
  def normalize_url(url)
    candidate = url.to_s.strip
    candidate.match?(HTTP_URL_PATTERN) ? candidate : "https://#{candidate}"
  end

  # Downloads @url and parses the body as HTML.
  #
  # NOTE(review): if @url can come from end users this is a server-side
  # request to an arbitrary host — confirm callers restrict the targets.
  #
  # @return [Nokogiri::HTML::Document, nil] nil on a non-success response or
  #   on any error (which is logged)
  def fetch_page
    response = HTTParty.get(@url, follow_redirects: true, timeout: 15)
    return nil unless response.success?

    Nokogiri::HTML(response.body)
  rescue StandardError => e
    Rails.logger.error "[WebsiteBranding] Failed to fetch #{@url}: #{e.message}"
    nil
  end

  # Prefers the og:site_name meta tag; otherwise takes the first non-empty
  # part of <title> before a separator.
  #
  # @return [String, nil]
  def extract_business_name(doc)
    og_site_name = doc.at_css('meta[property="og:site_name"]')&.[]('content')
    return og_site_name.strip if og_site_name.present?

    title = doc.at_xpath('//title')&.text
    title&.strip&.split(TITLE_SEPARATOR_PATTERN)&.find(&:present?)
  end

  # Primary subtag of <html lang>, lower-cased ("en-US" / "en_US" => "en").
  #
  # @return [String, nil]
  def extract_language(doc)
    lang = doc.at_css('html')&.[]('lang').to_s.strip
    lang.split(/[-_]/).first&.downcase.presence
  end

  # Collects the unique absolute URLs of every <a href> on the page, skipping
  # fragment, javascript:, mailto: and tel: links and anything unresolvable.
  #
  # @return [Array<String>]
  def extract_links(doc)
    doc.css('a[href]').filter_map do |anchor|
      href = anchor['href']&.strip
      next if href.blank? || href.match?(SKIPPED_HREF_PATTERN)

      resolve_url(href)
    end.uniq
  end

  # @return [Hash] :favicon and :primary_color, either of which may be nil
  def extract_branding(doc)
    {
      favicon: extract_favicon(doc),
      primary_color: extract_theme_color(doc)
    }
  end

  # Absolute URL of the first <link> whose rel contains "icon", or nil.
  def extract_favicon(doc)
    resolve_url(doc.at_css('link[rel*="icon"]')&.[]('href')&.strip)
  end

  # Content of <meta name="theme-color">, or nil.
  def extract_theme_color(doc)
    doc.at_css('meta[name="theme-color"]')&.[]('content')
  end

  # Makes +url+ absolute relative to @url.
  #
  # @param url [String, nil]
  # @return [String, nil] nil for blank input or when URI cannot join the two
  def resolve_url(url)
    return nil if url.blank?
    return url if url.match?(HTTP_URL_PATTERN)

    URI.join(@url, url).to_s
  rescue URI::Error
    # URI::Error covers both InvalidURIError and BadURIError.
    nil
  end
end

WebsiteBrandingService.prepend_mod_with('WebsiteBrandingService')

# --- enterprise/app/services/enterprise/website_branding_service.rb ---

# Enterprise override: when a Firecrawl API key is configured, asks Firecrawl
# to scrape the site; otherwise, or on any failure, defers to the basic
# scraper via +super+.
module Enterprise::WebsiteBrandingService
  FIRECRAWL_SCRAPE_ENDPOINT = 'https://api.firecrawl.dev/v2/scrape'.freeze

  # Values offered to Firecrawl as the enum for +industry_category+.
  INDUSTRY_CATEGORIES = [
    'Technology',
    'E-commerce',
    'Healthcare',
    'Education',
    'Finance',
    'Real Estate',
    'Marketing',
    'Travel & Hospitality',
    'Food & Beverage',
    'Media & Entertainment',
    'Professional Services',
    'Non-profit',
    'Other'
  ].freeze

  # @return [Hash, nil] same shape as the basic scraper's result
  def perform
    return super unless firecrawl_enabled?

    response = perform_firecrawl_request
    process_firecrawl_response(response)
  rescue StandardError => e
    Rails.logger.error "[WebsiteBranding] Firecrawl failed: #{e.message}, falling back to basic scrape"
    super
  end

  private

  def firecrawl_enabled?
    firecrawl_api_key.present?
  end

  # Reads the key from the CAPTAIN_FIRECRAWL_API_KEY installation config.
  def firecrawl_api_key
    InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value
  end

  # POSTs the scrape payload as JSON with a bearer token.
  def perform_firecrawl_request
    HTTParty.post(
      FIRECRAWL_SCRAPE_ENDPOINT,
      body: scrape_payload.to_json,
      headers: {
        'Authorization' => "Bearer #{firecrawl_api_key}",
        'Content-Type' => 'application/json'
      }
    )
  end

  # Requests three formats: structured JSON (per extract_schema), branding
  # and links.
  def scrape_payload
    {
      url: @url,
      onlyMainContent: false,
      formats: [
        {
          type: 'json',
          schema: extract_schema,
          prompt: 'Extract the business name, primary language, and industry category from this website.'
        },
        'branding',
        'links'
      ]
    }
  end

  # JSON schema sent with the 'json' format request.
  def extract_schema
    {
      type: 'object',
      properties: {
        business_name: { type: 'string', description: 'The name of the business or company' },
        language: { type: 'string', description: 'Primary language as ISO 639-1 code (e.g., en, es, fr)' },
        industry_category: { type: 'string', enum: INDUSTRY_CATEGORIES, description: 'Industry category for this business' }
      },
      required: %w[business_name]
    }
  end

  # @raise [RuntimeError] when the HTTP response is not successful; +perform+
  #   rescues it and falls back to the basic scrape
  def process_firecrawl_response(response)
    raise "API Error: #{response.message} (Status: #{response.code})" unless response.success?

    format_firecrawl_response(response)
  end

  # Maps the 'data' section of the parsed response onto the result hash.
  # Missing sections default to empty values.
  def format_firecrawl_response(response)
    data = response.parsed_response
    extract = data.dig('data', 'json') || {}
    brand = data.dig('data', 'branding') || {}
    links = data.dig('data', 'links') || []

    {
      business_name: extract['business_name'],
      language: extract['language'],
      industry_category: extract['industry_category'],
      social_handles: extract_social_from_links(links),
      branding: extract_firecrawl_branding(brand)
    }
  end

  def extract_firecrawl_branding(brand)
    {
      favicon: url_or_nil(brand.dig('images', 'favicon')),
      primary_color: brand.dig('colors', 'primary')
    }
  end

  # Returns +value+ only when it is a String carrying an http(s) URL; any
  # other value (nil, blank, non-String, other scheme) becomes nil so that a
  # malformed favicon does not discard the rest of the Firecrawl result.
  def url_or_nil(value)
    return nil unless value.is_a?(String) && value.match?(%r{\Ahttps?://}i)

    value
  end
end

# --- spec/enterprise/services/enterprise/website_branding_service_spec.rb ---
# The spec is cut off mid-statement in this view (it stops inside the
# `fallback_html` string literal), so it is preserved verbatim below rather
# than rewritten.
#
# require 'rails_helper'
#
# # Simulate the prepend_mod_with behavior for testing
# test_klass = Class.new(WebsiteBrandingService) do
#   prepend Enterprise::WebsiteBrandingService
# end
#
# RSpec.describe Enterprise::WebsiteBrandingService do
#   describe '#perform' do
#     subject(:service) { test_klass.new(url) }
#
#     let(:url) { 'https://example.com' }
#     let(:api_key) { 'test-firecrawl-api-key' }
#     let(:scrape_endpoint) { described_class::FIRECRAWL_SCRAPE_ENDPOINT }
#     let(:fallback_html) { '