diff --git a/app/services/concerns/social_link_parser.rb b/app/services/concerns/social_link_parser.rb
new file mode 100644
index 000000000..fa0cfca06
--- /dev/null
+++ b/app/services/concerns/social_link_parser.rb
@@ -0,0 +1,89 @@
+module SocialLinkParser
+  extend ActiveSupport::Concern
+
+  # Hosts recognised for each platform; subdomains of these hosts also match.
+  SOCIAL_DOMAIN_MAP = {
+    whatsapp: %w[wa.me api.whatsapp.com],
+    line: %w[line.me],
+    facebook: %w[facebook.com fb.com fb.me],
+    instagram: %w[instagram.com],
+    telegram: %w[t.me telegram.me],
+    tiktok: %w[tiktok.com]
+  }.freeze
+
+  # First path segments that denote share/intent endpoints rather than profiles.
+  SHARE_PATH_PREFIXES = %w[sharer share intent dialog].freeze
+
+  private
+
+  # Maps every platform in SOCIAL_DOMAIN_MAP to the first handle found in +links+.
+  #
+  # @param links [Array<String>] URLs collected from a page
+  # @return [Hash{Symbol => String, nil}] nil for platforms without a usable link
+  def extract_social_from_links(links)
+    SOCIAL_DOMAIN_MAP.to_h do |platform, domains|
+      [platform, find_social_handle(links, platform, domains)]
+    end
+  end
+
+  # Returns the first non-blank handle among the links hosted on +domains+, or nil.
+  def find_social_handle(links, platform, domains)
+    links.each do |link|
+      next unless social_link?(link, domains)
+
+      handle = parse_social_handle(platform, link)
+      return handle if handle.present?
+    end
+    nil
+  end
+
+  # True when +link+ parses as a URI whose host belongs to one of +domains+.
+  # Links that URI.parse rejects never match.
+  def social_link?(link, domains)
+    host = URI.parse(link).host
+    domains.any? { |domain| match_social_domain?(host, domain) }
+  rescue URI::InvalidURIError
+    false
+  end
+
+  # True when +host+ equals +domain+ or is one of its subdomains. The host is
+  # downcased first because URI.parse preserves its casing; requiring the dot
+  # before +domain+ keeps lookalikes such as notfacebook.com from matching.
+  def match_social_domain?(host, domain)
+    return false if host.blank?
+
+    normalized = host.downcase
+    normalized == domain || normalized.end_with?(".#{domain}")
+  end
+
+  # Extracts the handle for +platform+ from +link+: the digits of the phone
+  # number for WhatsApp, otherwise the URL path without surrounding slashes.
+  # Returns nil for share/intent URLs, empty paths and unparseable links.
+  def parse_social_handle(platform, link)
+    uri = URI.parse(link)
+    return extract_whatsapp_phone(uri) if platform == :whatsapp
+
+    handle = uri.path.to_s.delete_prefix('/').delete_suffix('/')
+    return nil if handle.blank? || share_path?(handle)
+
+    handle
+  rescue URI::InvalidURIError
+    nil
+  end
+
+  # True when the first segment of +path+ is a share endpoint (e.g. "sharer",
+  # "sharer.php", "intent/tweet"). Only the whole segment is compared, so a
+  # profile such as "sharewithme" is not mistaken for a share link.
+  def share_path?(path)
+    first_segment = path.split('/').first.to_s.downcase.delete_suffix('.php')
+    SHARE_PATH_PREFIXES.include?(first_segment)
+  end
+
+  # wa.me/1234567890 or api.whatsapp.com/send?phone=1234567890
+  # Returns only the digits, or nil when the link carries no number.
+  def extract_whatsapp_phone(uri)
+    phone = CGI.parse(uri.query.to_s)['phone']&.first
+    phone = uri.path.to_s.delete_prefix('/').delete_suffix('/') if phone.blank?
+    phone.to_s.delete('^0-9').presence
+  end
+end
diff --git a/app/services/website_branding_service.rb b/app/services/website_branding_service.rb
new file mode 100644
index 000000000..89326e4b6
--- /dev/null
+++ b/app/services/website_branding_service.rb
@@ -0,0 +1,119 @@
+class WebsiteBrandingService
+  include SocialLinkParser
+
+  # Matches URLs that already carry an http(s) scheme.
+  HTTP_URL = %r{\Ahttps?://}i
+  # href prefixes that are skipped when collecting links.
+  SKIPPED_HREF = /\A(?:#|javascript:|mailto:|tel:)/i
+  # Separators between brand and page name in a <title>. Hyphens must be
+  # surrounded by whitespace and colons followed by it, so names such as
+  # "Coca-Cola" are not cut in half.
+  TITLE_SEPARATOR = /\s+-+\s+|\s*:+\s+|\s*[|–—·]+\s*/
+
+  # @param url [String] website address, with or without an http(s) scheme
+  def initialize(url)
+    @url = normalize_url(url)
+  end
+
+  # Fetches the page and extracts basic branding details from its markup.
+  #
+  # @return [Hash, nil] business_name, language, industry_category (always nil
+  #   here), social_handles and branding; nil when the page cannot be fetched
+  #   or any step raises
+  def perform
+    doc = fetch_page
+    return nil if doc.nil?
+
+    links = extract_links(doc)
+
+    {
+      business_name: extract_business_name(doc),
+      language: extract_language(doc),
+      industry_category: nil,
+      social_handles: extract_social_from_links(links),
+      branding: extract_branding(doc)
+    }
+  rescue StandardError => e
+    Rails.logger.error "[WebsiteBranding] #{e.message}"
+    nil
+  end
+
+  private
+
+  # Trims the input and prepends https:// when no http(s) scheme is present.
+  def normalize_url(url)
+    candidate = url.to_s.strip
+    candidate.match?(HTTP_URL) ? candidate : "https://#{candidate}"
+  end
+
+  # Downloads @url and parses it as HTML; returns nil on a non-success status
+  # or any error.
+  # NOTE(review): @url is caller-supplied and redirects are followed, so this
+  # request can reach internal hosts unless callers validate the address — confirm.
+  def fetch_page
+    response = HTTParty.get(@url, follow_redirects: true, timeout: 15)
+    return nil unless response.success?
+
+    Nokogiri::HTML(response.body)
+  rescue StandardError => e
+    Rails.logger.error "[WebsiteBranding] Failed to fetch #{@url}: #{e.message}"
+    nil
+  end
+
+  # Prefers og:site_name; otherwise takes the first segment of the <title>.
+  def extract_business_name(doc)
+    og_site_name = doc.at_css('meta[property="og:site_name"]')&.[]('content')
+    return og_site_name.strip if og_site_name.present?
+
+    title = doc.at_xpath('//title')&.text
+    title&.strip&.split(TITLE_SEPARATOR)&.first.presence
+  end
+
+  # Language subtag of <html lang>, lowercased (e.g. "fr-FR" -> "fr").
+  def extract_language(doc)
+    doc.at_css('html')&.[]('lang')&.split('-')&.first&.downcase
+  end
+
+  # Unique absolute URLs of every anchor, skipping fragment, javascript:,
+  # mailto: and tel: hrefs and anything that cannot be resolved.
+  def extract_links(doc)
+    doc.css('a[href]').filter_map do |anchor|
+      href = anchor['href']&.strip
+      next if href.blank? || href.match?(SKIPPED_HREF)
+
+      resolve_url(href)
+    end.uniq
+  end
+
+  def extract_branding(doc)
+    {
+      favicon: extract_favicon(doc),
+      primary_color: extract_theme_color(doc)
+    }
+  end
+
+  # href of the first <link> whose rel contains "icon", resolved against @url.
+  def extract_favicon(doc)
+    favicon = doc.at_css('link[rel*="icon"]')&.[]('href')
+    return nil if favicon.blank?
+
+    resolve_url(favicon)
+  end
+
+  def extract_theme_color(doc)
+    doc.at_css('meta[name="theme-color"]')&.[]('content')
+  end
+
+  # Returns +url+ unchanged when it is already an http(s) URL, otherwise joins
+  # it onto @url. Returns nil for blank input or when URI cannot resolve it.
+  def resolve_url(url)
+    return nil if url.blank?
+    return url if url.match?(HTTP_URL)
+
+    URI.join(@url, url).to_s
+  rescue URI::Error
+    nil
+  end
+end
+
+WebsiteBrandingService.prepend_mod_with('WebsiteBrandingService')
diff --git a/enterprise/app/services/enterprise/website_branding_service.rb b/enterprise/app/services/enterprise/website_branding_service.rb
new file mode 100644
index 000000000..6efdd5051
--- /dev/null
+++ b/enterprise/app/services/enterprise/website_branding_service.rb
@@ -0,0 +1,133 @@
+module Enterprise::WebsiteBrandingService
+  FIRECRAWL_SCRAPE_ENDPOINT = 'https://api.firecrawl.dev/v2/scrape'.freeze
+
+  INDUSTRY_CATEGORIES = [
+    'Technology',
+    'E-commerce',
+    'Healthcare',
+    'Education',
+    'Finance',
+    'Real Estate',
+    'Marketing',
+    'Travel & Hospitality',
+    'Food & Beverage',
+    'Media & Entertainment',
+    'Professional Services',
+    'Non-profit',
+    'Other'
+  ].freeze
+
+  # Uses Firecrawl when an API key is configured; otherwise, or when the
+  # Firecrawl path raises, defers to the basic scrape via super.
+  def perform
+    return super unless firecrawl_enabled?
+
+    response = perform_firecrawl_request
+    process_firecrawl_response(response)
+  rescue StandardError => e
+    Rails.logger.error "[WebsiteBranding] Firecrawl failed: #{e.message}, falling back to basic scrape"
+    super
+  end
+
+  private
+
+  def firecrawl_enabled?
+    firecrawl_api_key.present?
+  end
+
+  # Memoized (nil included) so the key is looked up once per instance rather
+  # than once for the enabled check and again for the request header.
+  def firecrawl_api_key
+    return @firecrawl_api_key if defined?(@firecrawl_api_key)
+
+    @firecrawl_api_key = InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value
+  end
+
+  def perform_firecrawl_request
+    HTTParty.post(
+      FIRECRAWL_SCRAPE_ENDPOINT,
+      body: scrape_payload.to_json,
+      headers: {
+        'Authorization' => "Bearer #{firecrawl_api_key}",
+        'Content-Type' => 'application/json'
+      }
+    )
+  end
+
+  def scrape_payload
+    {
+      url: @url,
+      onlyMainContent: false,
+      formats: [
+        {
+          type: 'json',
+          schema: extract_schema,
+          prompt: 'Extract the business name, primary language, and industry category from this website.'
+        },
+        'branding',
+        'links'
+      ]
+    }
+  end
+
+  def extract_schema
+    {
+      type: 'object',
+      properties: {
+        business_name: { type: 'string', description: 'The name of the business or company' },
+        language: { type: 'string', description: 'Primary language as ISO 639-1 code (e.g., en, es, fr)' },
+        industry_category: { type: 'string', enum: INDUSTRY_CATEGORIES, description: 'Industry category for this business' }
+      },
+      required: %w[business_name]
+    }
+  end
+
+  # Raises on a non-success status or a body without a "data" object so that
+  # #perform falls back to the basic scrape instead of returning empty fields.
+  def process_firecrawl_response(response)
+    raise "API Error: #{response.message} (Status: #{response.code})" unless response.success?
+    raise 'API Error: response has no data payload' if firecrawl_data(response).blank?
+
+    format_firecrawl_response(response)
+  end
+
+  # The "data" object of the parsed body, or {} when the body is not shaped as expected.
+  def firecrawl_data(response)
+    body = response.parsed_response
+    data = body['data'] if body.is_a?(Hash)
+    data.is_a?(Hash) ? data : {}
+  end
+
+  def format_firecrawl_response(response)
+    data = firecrawl_data(response)
+    extract = data['json'] || {}
+    brand = data['branding'] || {}
+    links = data['links'] || []
+
+    {
+      business_name: extract['business_name'],
+      language: normalize_language(extract['language']),
+      industry_category: extract['industry_category'],
+      social_handles: extract_social_from_links(links),
+      branding: extract_firecrawl_branding(brand)
+    }
+  end
+
+  # Reduces values such as "en-US" to the lowercase primary subtag, matching
+  # the format produced by the basic scrape; nil when blank.
+  def normalize_language(value)
+    value.to_s.split(/[-_]/).first&.downcase.presence
+  end
+
+  def extract_firecrawl_branding(brand)
+    {
+      favicon: url_or_nil(brand.dig('images', 'favicon')),
+      primary_color: brand.dig('colors', 'primary')
+    }
+  end
+
+  # Returns +value+ only when it is a String starting with http:// or https://.
+  def url_or_nil(value)
+    value if value.is_a?(String) && value.match?(%r{\Ahttps?://}i)
+  end
+end
diff --git a/spec/enterprise/services/enterprise/website_branding_service_spec.rb b/spec/enterprise/services/enterprise/website_branding_service_spec.rb
new file mode 100644
index 000000000..0907db518
--- /dev/null
+++ b/spec/enterprise/services/enterprise/website_branding_service_spec.rb
@@ -0,0 +1,184 @@
+require 'rails_helper'
+
+# Simulate the prepend_mod_with behavior for testing
+test_klass = Class.new(WebsiteBrandingService) do
+  prepend Enterprise::WebsiteBrandingService
+end
+
+RSpec.describe Enterprise::WebsiteBrandingService do
+  describe '#perform' do
+    subject(:service) { test_klass.new(url) }
+
+    let(:url) { 'https://example.com' }
+    let(:api_key) { 'test-firecrawl-api-key' }
+    let(:scrape_endpoint) { described_class::FIRECRAWL_SCRAPE_ENDPOINT }
+    let(:fallback_html) { '<html><head><title>Fallback</title></head><body></body></html>' }
+    let(:success_response_body) do
+      {
+        success: true,
+        data: {
+          json: {
+            business_name: 'Acme Corp',
+            language: 'en',
+            industry_category: 'Technology'
+          },
+          branding: {
+            images: { logo: 'https://example.com/logo.png', favicon: 'https://example.com/favicon.png' },
+            colors: { primary: '#FF5733' }
+          },
+          links: [
+            'https://example.com/about',
+            'https://facebook.com/acmecorp',
+            'https://instagram.com/acme_corp',
+            'https://wa.me/1234567890',
+            'https://t.me/acmecorp',
+            'https://tiktok.com/@acmetok'
+          ]
+        }
+      }.to_json
+    end
+
+    before do
+      stub_request(:get, url).to_return(status: 200, body: fallback_html, headers: { 'content-type' => 'text/html' })
+    end
+
+    context 'when firecrawl is configured and API returns success' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        stub_request(:post, scrape_endpoint)
+          .with(headers: { 'Authorization' => "Bearer #{api_key}", 'Content-Type' => 'application/json' })
+          .to_return(status: 200, body: success_response_body, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'returns business info and branding from firecrawl' do
+        result = service.perform
+
+        expect(result).to eq({
+                               business_name: 'Acme Corp',
+                               language: 'en',
+                               industry_category: 'Technology',
+                               social_handles: {
+                                 whatsapp: '1234567890',
+                                 line: nil,
+                                 facebook: 'acmecorp',
+                                 instagram: 'acme_corp',
+                                 telegram: 'acmecorp',
+                                 tiktok: '@acmetok'
+                               },
+                               branding: {
+                                 favicon: 'https://example.com/favicon.png',
+                                 primary_color: '#FF5733'
+                               }
+                             })
+      end
+    end
+
+    context 'when firecrawl API returns an error' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 422, body: '{"error": "Invalid URL"}', headers: {})
+      end
+
+      it 'falls back to basic scrape' do
+        result = service.perform
+        expect(result[:business_name]).to eq('Fallback')
+        expect(result[:industry_category]).to be_nil
+      end
+    end
+
+    context 'when firecrawl raises an exception' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        stub_request(:post, scrape_endpoint).to_raise(StandardError.new('connection refused'))
+      end
+
+      it 'falls back to basic scrape' do
+        result = service.perform
+        expect(result[:business_name]).to eq('Fallback')
+      end
+    end
+
+    context 'when firecrawl is not configured' do
+      it 'uses basic scrape' do
+        expect(HTTParty).not_to receive(:post)
+        result = service.perform
+        expect(result[:business_name]).to eq('Fallback')
+      end
+    end
+
+    context 'when WhatsApp link uses api.whatsapp.com format' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        response = {
+          success: true,
+          data: {
+            json: { business_name: 'Acme Corp' },
+            links: ['https://api.whatsapp.com/send?phone=5511999999999&text=Hello']
+          }
+        }.to_json
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 200, body: response, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'extracts phone number from query param' do
+        result = service.perform
+        expect(result[:social_handles][:whatsapp]).to eq('5511999999999')
+      end
+    end
+
+    context 'when WhatsApp link uses wa.me format' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        response = {
+          success: true,
+          data: {
+            json: { business_name: 'Acme Corp' },
+            links: ['https://wa.me/+5511999999999']
+          }
+        }.to_json
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 200, body: response, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'extracts phone number from path' do
+        result = service.perform
+        expect(result[:social_handles][:whatsapp]).to eq('5511999999999')
+      end
+    end
+
+    context 'when links contain lookalike domains' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        response = {
+          success: true,
+          data: {
+            json: { business_name: 'Acme Corp' },
+            links: ['https://notfacebook.com/page', 'https://fakeinstagram.com/user']
+          }
+        }.to_json
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 200, body: response, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'does not match lookalike domains' do
+        result = service.perform
+        expect(result[:social_handles][:facebook]).to be_nil
+        expect(result[:social_handles][:instagram]).to be_nil
+      end
+    end
+
+    context 'when firecrawl responds without a data payload' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 200, body: { success: false }.to_json, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'falls back to basic scrape' do
+        result = service.perform
+        expect(result[:business_name]).to eq('Fallback')
+      end
+    end
+  end
+end
diff --git a/spec/services/website_branding_service_spec.rb b/spec/services/website_branding_service_spec.rb
new file mode 100644
index 000000000..19598fb59
--- /dev/null
+++ b/spec/services/website_branding_service_spec.rb
@@ -0,0 +1,189 @@
+require 'rails_helper'
+
+RSpec.describe WebsiteBrandingService do
+  describe '#perform' do
+    let(:url) { 'https://example.com' }
+    let(:html_body) do
+      <<~HTML
+        <html lang="en">
+          <head>
+            <title>Acme Corp | Home</title>
+            <meta property="og:site_name" content="Acme Corp">
+            <meta name="theme-color" content="#FF5733">
+            <link rel="icon" href="/favicon.ico">
+          </head>
+          <body>
+            <a href="/">Home</a>
+            <a href="https://wa.me/1234567890"></a>
+            <a href="https://facebook.com/acmecorp"></a>
+            <a href="https://instagram.com/acme_corp"></a>
+            <a href="https://t.me/acmecorp"></a>
+            <a href="https://tiktok.com/@acmetok"></a>
+          </body>
+        </html>
+      HTML
+    end
+
+    before do
+      stub_request(:get, url).to_return(status: 200, body: html_body, headers: { 'content-type' => 'text/html' })
+    end
+
+    it 'extracts business info, branding, and social handles' do
+      result = described_class.new(url).perform
+
+      expect(result).to eq({
+                             business_name: 'Acme Corp',
+                             language: 'en',
+                             industry_category: nil,
+                             social_handles: {
+                               whatsapp: '1234567890',
+                               line: nil,
+                               facebook: 'acmecorp',
+                               instagram: 'acme_corp',
+                               telegram: 'acmecorp',
+                               tiktok: '@acmetok'
+                             },
+                             branding: {
+                               favicon: 'https://example.com/favicon.ico',
+                               primary_color: '#FF5733'
+                             }
+                           })
+    end
+
+    context 'when og:site_name is missing' do
+      let(:html_body) do
+        <<~HTML
+          <html lang="fr-FR">
+            <head><title>Mon Entreprise - Bienvenue</title></head>
+            <body></body>
+          </html>
+        HTML
+      end
+
+      it 'falls back to the first segment of the title' do
+        result = described_class.new(url).perform
+        expect(result[:business_name]).to eq('Mon Entreprise')
+        expect(result[:language]).to eq('fr')
+      end
+    end
+
+    context 'when the title contains a hyphenated name' do
+      let(:html_body) do
+        <<~HTML
+          <html>
+            <head><title>Coca-Cola | Home</title></head>
+            <body></body>
+          </html>
+        HTML
+      end
+
+      it 'keeps the hyphenated name intact' do
+        result = described_class.new(url).perform
+        expect(result[:business_name]).to eq('Coca-Cola')
+      end
+    end
+
+    context 'when the page fails to load' do
+      before { stub_request(:get, url).to_return(status: 500, body: '') }
+
+      it 'returns nil' do
+        expect(described_class.new(url).perform).to be_nil
+      end
+    end
+
+    context 'when a network error occurs' do
+      before { stub_request(:get, url).to_raise(StandardError.new('connection refused')) }
+
+      it 'logs the error and returns nil' do
+        expect(Rails.logger).to receive(:error).with(/connection refused/)
+        expect(described_class.new(url).perform).to be_nil
+      end
+    end
+
+    context 'when URL has no scheme' do
+      before do
+        stub_request(:get, 'https://example.com').to_return(status: 200, body: html_body, headers: { 'content-type' => 'text/html' })
+      end
+
+      it 'prepends https://' do
+        result = described_class.new('example.com').perform
+        expect(result[:business_name]).to eq('Acme Corp')
+      end
+    end
+
+    context 'when WhatsApp link uses api.whatsapp.com format' do
+      let(:html_body) do
+        <<~HTML
+          <html>
+            <head><title>Test</title></head>
+            <body><a href="https://api.whatsapp.com/send?phone=5511999999999&amp;text=Hello">Chat</a></body>
+          </html>
+        HTML
+      end
+
+      it 'extracts phone from query param' do
+        result = described_class.new(url).perform
+        expect(result[:social_handles][:whatsapp]).to eq('5511999999999')
+      end
+    end
+
+    context 'when links contain lookalike domains' do
+      let(:html_body) do
+        <<~HTML
+          <html>
+            <head><title>Test</title></head>
+            <body>
+              <a href="https://notfacebook.com/page">Not FB</a>
+              <a href="https://fakeinstagram.com/user">Not IG</a>
+            </body>
+          </html>
+        HTML
+      end
+
+      it 'does not match lookalike domains' do
+        result = described_class.new(url).perform
+        expect(result[:social_handles][:facebook]).to be_nil
+        expect(result[:social_handles][:instagram]).to be_nil
+      end
+    end
+
+    context 'when a profile handle starts with a share keyword' do
+      let(:html_body) do
+        <<~HTML
+          <html>
+            <head><title>Test</title></head>
+            <body>
+              <a href="https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fexample.com">Share</a>
+              <a href="https://www.Instagram.com/sharewithacme/">Instagram</a>
+            </body>
+          </html>
+        HTML
+      end
+
+      it 'skips share links but keeps the profile' do
+        result = described_class.new(url).perform
+        expect(result[:social_handles][:facebook]).to be_nil
+        expect(result[:social_handles][:instagram]).to eq('sharewithacme')
+      end
+    end
+
+    context 'when favicon uses a relative path without leading slash' do
+      let(:html_body) do
+        <<~HTML
+          <html>
+            <head>
+              <title>Test</title>
+              <link rel="icon" href="favicon.ico">
+            </head>
+            <body></body>
+          </html>
+        HTML
+      end
+
+      it 'resolves the relative favicon URL' do
+        result = described_class.new(url).perform
+        expect(result[:branding][:favicon]).to eq('https://example.com/favicon.ico')
+      end
+    end
+  end
+end