feat: firecrawl branding api [UPM-15] (#13903)

Adds `WebsiteBrandingService` (OSS) with an Enterprise override using Firecrawl v2 to extract branding and business data from a URL for onboarding auto-fill. OSS version uses HTTParty + Nokogiri to extract: - Business name (og:site_name or title) - Language (html lang) - Favicon - Social links from `<a>` tags Enterprise version makes a single Firecrawl call to fetch: - Structured JSON (name, language, industry via LLM) - Branding (favicon, primary color) - Page links Falls back to OSS if Firecrawl is unavailable or fails. Social handles (WhatsApp, Facebook, Instagram, Telegram, TikTok, LINE) are parsed deterministically via a shared `SocialLinkParser`. > We use links for socials, since the LLM extraction was unreliable, mostly returned empty, and hallucinated in some rare scenarios ## How to test ```ruby # OSS (no Firecrawl key needed) WebsiteBrandingService.new('chatwoot.com').perform # Enterprise (requires CAPTAIN_FIRECRAWL_API_KEY) WebsiteBrandingService.new('notion.so').perform WebsiteBrandingService.new('postman.com').perform ``` Verify the returned hash includes business_name, language, industry_category, social_handles, and branding with favicon/primary_color. <img width="908" height="393" alt="image" src="https://github.com/user-attachments/assets/e3696887-d366-485a-89a0-8e1a9698a788" />
2026-03-30 11:32:03 +05:30
parent 04acc16609
commit 7651c18b48
5 changed files with 597 additions and 0 deletions
--- a/app/services/concerns/social_link_parser.rb
+++ b/app/services/concerns/social_link_parser.rb
@@ -0,0 +1,65 @@
+module SocialLinkParser
+  extend ActiveSupport::Concern
+
+  SOCIAL_DOMAIN_MAP = {
+    whatsapp: %w[wa.me api.whatsapp.com],
+    line: %w[line.me],
+    facebook: %w[facebook.com fb.com fb.me],
+    instagram: %w[instagram.com],
+    telegram: %w[t.me telegram.me],
+    tiktok: %w[tiktok.com]
+  }.freeze
+
+  private
+
+  def extract_social_from_links(links)
+    handles = {}
+    SOCIAL_DOMAIN_MAP.each do |platform, domains|
+      handles[platform] = find_social_handle(links, platform, domains)
+    end
+    handles
+  end
+
+  def find_social_handle(links, platform, domains)
+    matching_links = links.select do |l|
+      uri = URI.parse(l)
+      domains.any? { |d| match_social_domain?(uri.host, d) }
+    rescue URI::InvalidURIError
+      false
+    end
+
+    matching_links.each do |link|
+      handle = parse_social_handle(platform, link)
+      return handle if handle.present?
+    end
+    nil
+  end
+
+  def match_social_domain?(host, domain)
+    return false if host.blank?
+
+    host == domain || host.end_with?(".#{domain}")
+  end
+
+  SHARE_PATH_PREFIXES = %w[sharer share intent dialog].freeze
+
+  def parse_social_handle(platform, link)
+    uri = URI.parse(link)
+    return extract_whatsapp_phone(uri) if platform == :whatsapp
+
+    handle = uri.path.to_s.delete_prefix('/').delete_suffix('/')
+    return nil if handle.blank?
+    return nil if SHARE_PATH_PREFIXES.any? { |prefix| handle.start_with?(prefix) }
+
+    handle.presence
+  rescue URI::InvalidURIError
+    nil
+  end
+
+  # wa.me/1234567890 or api.whatsapp.com/send?phone=1234567890
+  def extract_whatsapp_phone(uri)
+    phone = CGI.parse(uri.query.to_s)['phone']&.first
+    phone = uri.path.to_s.delete_prefix('/').delete_suffix('/') if phone.blank?
+    phone.presence&.gsub(/[^\d]/, '')
+  end
+end
--- a/app/services/website_branding_service.rb
+++ b/app/services/website_branding_service.rb
@@ -0,0 +1,93 @@
+class WebsiteBrandingService
+  include SocialLinkParser
+
+  def initialize(url)
+    @url = normalize_url(url)
+  end
+
+  def perform
+    doc = fetch_page
+    return nil if doc.nil?
+
+    links = extract_links(doc)
+
+    {
+      business_name: extract_business_name(doc),
+      language: extract_language(doc),
+      industry_category: nil,
+      social_handles: extract_social_from_links(links),
+      branding: extract_branding(doc)
+    }
+  rescue StandardError => e
+    Rails.logger.error "[WebsiteBranding] #{e.message}"
+    nil
+  end
+
+  private
+
+  def normalize_url(url)
+    url.match?(%r{\Ahttps?://}) ? url : "https://#{url}"
+  end
+
+  def fetch_page
+    response = HTTParty.get(@url, follow_redirects: true, timeout: 15)
+    return nil unless response.success?
+
+    Nokogiri::HTML(response.body)
+  rescue StandardError => e
+    Rails.logger.error "[WebsiteBranding] Failed to fetch #{@url}: #{e.message}"
+    nil
+  end
+
+  def extract_business_name(doc)
+    og_site_name = doc.at_css('meta[property="og:site_name"]')&.[]('content')
+    return og_site_name.strip if og_site_name.present?
+
+    title = doc.at_xpath('//title')&.text
+    title&.strip&.split(/\s*[|\-–—·:]+\s*/)&.first
+  end
+
+  def extract_language(doc)
+    doc.at_css('html')&.[]('lang')&.split('-')&.first&.downcase
+  end
+
+  def extract_links(doc)
+    doc.css('a[href]').filter_map do |a|
+      href = a['href']&.strip
+      next if href.blank? || href.start_with?('#', 'javascript:', 'mailto:', 'tel:')
+
+      href.start_with?('http') ? href : URI.join(@url, href).to_s
+    rescue URI::InvalidURIError
+      nil
+    end.uniq
+  end
+
+  def extract_branding(doc)
+    {
+      favicon: extract_favicon(doc),
+      primary_color: extract_theme_color(doc)
+    }
+  end
+
+  def extract_favicon(doc)
+    favicon = doc.at_css('link[rel*="icon"]')&.[]('href')
+    return nil if favicon.blank?
+
+    resolve_url(favicon)
+  end
+
+  def extract_theme_color(doc)
+    doc.at_css('meta[name="theme-color"]')&.[]('content')
+  end
+
+  def resolve_url(url)
+    return nil if url.blank?
+    return url if url.start_with?('http')
+
+    URI.join(@url, url).to_s
+  rescue URI::InvalidURIError
+    nil
+  end
+end
+
+WebsiteBrandingService.prepend_mod_with('WebsiteBrandingService')
--- a/enterprise/app/services/enterprise/website_branding_service.rb
+++ b/enterprise/app/services/enterprise/website_branding_service.rb
@@ -0,0 +1,112 @@
+# Enterprise override for WebsiteBrandingService#perform.
+#
+# When a Firecrawl API key is configured, one Firecrawl v2 scrape request
+# supplies the structured business details, branding and page links. Any
+# failure is logged and the base implementation is used instead via +super+.
+module Enterprise::WebsiteBrandingService
+  FIRECRAWL_SCRAPE_ENDPOINT = 'https://api.firecrawl.dev/v2/scrape'.freeze
+
+  # Allowed values for the industry_category field of the extraction schema.
+  INDUSTRY_CATEGORIES = [
+    'Technology',
+    'E-commerce',
+    'Healthcare',
+    'Education',
+    'Finance',
+    'Real Estate',
+    'Marketing',
+    'Travel & Hospitality',
+    'Food & Beverage',
+    'Media & Entertainment',
+    'Professional Services',
+    'Non-profit',
+    'Other'
+  ].freeze
+
+  # @return [Hash, nil] same shape as the base implementation's result
+  def perform
+    return super unless firecrawl_enabled?
+
+    response = perform_firecrawl_request
+    process_firecrawl_response(response)
+  rescue StandardError => e
+    Rails.logger.error "[WebsiteBranding] Firecrawl failed: #{e.message}, falling back to basic scrape"
+    super
+  end
+
+  private
+
+  def firecrawl_enabled?
+    firecrawl_api_key.present?
+  end
+
+  # Memoized so the config row is read once per service instance rather than
+  # once for the enabled check and again for the request header.
+  def firecrawl_api_key
+    @firecrawl_api_key ||= InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value
+  end
+
+  def perform_firecrawl_request
+    HTTParty.post(
+      FIRECRAWL_SCRAPE_ENDPOINT,
+      body: scrape_payload.to_json,
+      headers: {
+        'Authorization' => "Bearer #{firecrawl_api_key}",
+        'Content-Type' => 'application/json'
+      }
+    )
+  end
+
+  # Request body: structured JSON extraction plus the branding and links formats.
+  def scrape_payload
+    {
+      url: @url,
+      onlyMainContent: false,
+      formats: [
+        {
+          type: 'json',
+          schema: extract_schema,
+          prompt: 'Extract the business name, primary language, and industry category from this website.'
+        },
+        'branding',
+        'links'
+      ]
+    }
+  end
+
+  # JSON schema handed to Firecrawl for the structured extraction.
+  def extract_schema
+    {
+      type: 'object',
+      properties: {
+        business_name: { type: 'string', description: 'The name of the business or company' },
+        language: { type: 'string', description: 'Primary language as ISO 639-1 code (e.g., en, es, fr)' },
+        industry_category: { type: 'string', enum: INDUSTRY_CATEGORIES, description: 'Industry category for this business' }
+      },
+      required: %w[business_name]
+    }
+  end
+
+  # @raise [RuntimeError] on a non-2xx response, which triggers the fallback in #perform
+  def process_firecrawl_response(response)
+    raise "API Error: #{response.message} (Status: #{response.code})" unless response.success?
+
+    format_firecrawl_response(response)
+  end
+
+  # Maps the Firecrawl payload onto the hash shape returned by #perform.
+  #
+  # @raise [RuntimeError] when the body has no usable "data" object, so that
+  #   #perform falls back to the basic scrape instead of returning all-nil fields
+  def format_firecrawl_response(response)
+    payload = response.parsed_response
+    scraped = payload.is_a?(Hash) ? payload['data'] : nil
+    raise 'Firecrawl response did not include scrape data' unless scraped.is_a?(Hash) && scraped.present?
+
+    extract = scraped['json'] || {}
+    brand = scraped['branding'] || {}
+    links = scraped['links'] || []
+
+    {
+      business_name: extract['business_name'],
+      language: extract['language'],
+      industry_category: extract['industry_category'],
+      social_handles: extract_social_from_links(links),
+      branding: extract_firecrawl_branding(brand)
+    }
+  end
+
+  def extract_firecrawl_branding(brand)
+    {
+      favicon: url_or_nil(brand.dig('images', 'favicon')),
+      primary_color: brand.dig('colors', 'primary')
+    }
+  end
+
+  # Returns +value+ only when it is an absolute http(s) URL string; anything
+  # else (nil, data URIs, relative paths, non-strings) becomes nil.
+  def url_or_nil(value)
+    return nil unless value.is_a?(String) && value.start_with?('http://', 'https://')
+
+    value
+  end
+end
--- a/spec/enterprise/services/enterprise/website_branding_service_spec.rb
+++ b/spec/enterprise/services/enterprise/website_branding_service_spec.rb
@@ -0,0 +1,171 @@
+require 'rails_helper'
+
+# Simulate the prepend_mod_with behavior for testing
+test_klass = Class.new(WebsiteBrandingService) do
+  prepend Enterprise::WebsiteBrandingService
+end
+
+RSpec.describe Enterprise::WebsiteBrandingService do
+  describe '#perform' do
+    subject(:service) { test_klass.new(url) }
+
+    let(:url) { 'https://example.com' }
+    let(:api_key) { 'test-firecrawl-api-key' }
+    let(:scrape_endpoint) { described_class::FIRECRAWL_SCRAPE_ENDPOINT }
+    let(:fallback_html) { '<html lang="en"><head><title>Fallback</title></head><body></body></html>' }
+    let(:success_response_body) do
+      {
+        success: true,
+        data: {
+          json: {
+            business_name: 'Acme Corp',
+            language: 'en',
+            industry_category: 'Technology'
+          },
+          branding: {
+            images: { logo: 'https://example.com/logo.png', favicon: 'https://example.com/favicon.png' },
+            colors: { primary: '#FF5733' }
+          },
+          links: [
+            'https://example.com/about',
+            'https://facebook.com/acmecorp',
+            'https://instagram.com/acme_corp',
+            'https://wa.me/1234567890',
+            'https://t.me/acmecorp',
+            'https://tiktok.com/@acmetok'
+          ]
+        }
+      }.to_json
+    end
+
+    before do
+      stub_request(:get, url).to_return(status: 200, body: fallback_html, headers: { 'content-type' => 'text/html' })
+    end
+
+    context 'when firecrawl is configured and API returns success' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        stub_request(:post, scrape_endpoint)
+          .with(headers: { 'Authorization' => "Bearer #{api_key}", 'Content-Type' => 'application/json' })
+          .to_return(status: 200, body: success_response_body, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'returns business info and branding from firecrawl' do
+        result = service.perform
+
+        expect(result).to eq({
+                               business_name: 'Acme Corp',
+                               language: 'en',
+                               industry_category: 'Technology',
+                               social_handles: {
+                                 whatsapp: '1234567890',
+                                 line: nil,
+                                 facebook: 'acmecorp',
+                                 instagram: 'acme_corp',
+                                 telegram: 'acmecorp',
+                                 tiktok: '@acmetok'
+                               },
+                               branding: {
+                                 favicon: 'https://example.com/favicon.png',
+                                 primary_color: '#FF5733'
+                               }
+                             })
+      end
+    end
+
+    context 'when firecrawl API returns an error' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 422, body: '{"error": "Invalid URL"}', headers: {})
+      end
+
+      it 'falls back to basic scrape' do
+        result = service.perform
+        expect(result[:business_name]).to eq('Fallback')
+        expect(result[:industry_category]).to be_nil
+      end
+    end
+
+    context 'when firecrawl raises an exception' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        stub_request(:post, scrape_endpoint).to_raise(StandardError.new('connection refused'))
+      end
+
+      it 'falls back to basic scrape' do
+        result = service.perform
+        expect(result[:business_name]).to eq('Fallback')
+      end
+    end
+
+    context 'when firecrawl is not configured' do
+      it 'uses basic scrape' do
+        expect(HTTParty).not_to receive(:post)
+        result = service.perform
+        expect(result[:business_name]).to eq('Fallback')
+      end
+    end
+
+    context 'when WhatsApp link uses api.whatsapp.com format' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        response = {
+          success: true,
+          data: {
+            json: { business_name: 'Acme Corp' },
+            links: ['https://api.whatsapp.com/send?phone=5511999999999&text=Hello']
+          }
+        }.to_json
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 200, body: response, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'extracts phone number from query param' do
+        result = service.perform
+        expect(result[:social_handles][:whatsapp]).to eq('5511999999999')
+      end
+    end
+
+    context 'when WhatsApp link uses wa.me format' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        response = {
+          success: true,
+          data: {
+            json: { business_name: 'Acme Corp' },
+            links: ['https://wa.me/+5511999999999']
+          }
+        }.to_json
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 200, body: response, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'extracts phone number from path' do
+        result = service.perform
+        expect(result[:social_handles][:whatsapp]).to eq('5511999999999')
+      end
+    end
+
+    context 'when links contain lookalike domains' do
+      before do
+        create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
+        response = {
+          success: true,
+          data: {
+            json: { business_name: 'Acme Corp' },
+            links: ['https://notfacebook.com/page', 'https://fakeinstagram.com/user']
+          }
+        }.to_json
+        stub_request(:post, scrape_endpoint)
+          .to_return(status: 200, body: response, headers: { 'content-type' => 'application/json' })
+      end
+
+      it 'does not match lookalike domains' do
+        result = service.perform
+        expect(result[:social_handles][:facebook]).to be_nil
+        expect(result[:social_handles][:instagram]).to be_nil
+      end
+    end
+  end
+end
--- a/spec/services/website_branding_service_spec.rb
+++ b/spec/services/website_branding_service_spec.rb
@@ -0,0 +1,156 @@
+require 'rails_helper'
+
+RSpec.describe WebsiteBrandingService do
+  describe '#perform' do
+    let(:url) { 'https://example.com' }
+    let(:html_body) do
+      <<~HTML
+        <html lang="en">
+        <head>
+          <title>Acme Corp | Home</title>
+          <meta property="og:site_name" content="Acme Corp" />
+          <meta property="og:image" content="https://example.com/og-image.png" />
+          <meta name="theme-color" content="#FF5733" />
+          <link rel="icon" href="/favicon.ico" />
+        </head>
+        <body>
+          <header><a href="/">Home</a></header>
+          <footer>
+            <a href="https://facebook.com/acmecorp">Facebook</a>
+            <a href="https://instagram.com/acme_corp">Instagram</a>
+            <a href="https://wa.me/1234567890">WhatsApp</a>
+            <a href="https://t.me/acmecorp">Telegram</a>
+            <a href="https://tiktok.com/@acmetok">TikTok</a>
+          </footer>
+        </body>
+        </html>
+      HTML
+    end
+
+    before do
+      stub_request(:get, url).to_return(status: 200, body: html_body, headers: { 'content-type' => 'text/html' })
+    end
+
+    it 'extracts business info, branding, and social handles' do
+      result = described_class.new(url).perform
+
+      expect(result).to eq({
+                             business_name: 'Acme Corp',
+                             language: 'en',
+                             industry_category: nil,
+                             social_handles: {
+                               whatsapp: '1234567890',
+                               line: nil,
+                               facebook: 'acmecorp',
+                               instagram: 'acme_corp',
+                               telegram: 'acmecorp',
+                               tiktok: '@acmetok'
+                             },
+                             branding: {
+                               favicon: 'https://example.com/favicon.ico',
+                               primary_color: '#FF5733'
+                             }
+                           })
+    end
+
+    context 'when og:site_name is missing' do
+      let(:html_body) do
+        <<~HTML
+          <html lang="fr">
+          <head><title>Mon Entreprise - Bienvenue</title></head>
+          <body></body>
+          </html>
+        HTML
+      end
+
+      it 'falls back to the first segment of the title' do
+        result = described_class.new(url).perform
+        expect(result[:business_name]).to eq('Mon Entreprise')
+        expect(result[:language]).to eq('fr')
+      end
+    end
+
+    context 'when the page fails to load' do
+      before { stub_request(:get, url).to_return(status: 500, body: '') }
+
+      it 'returns nil' do
+        expect(described_class.new(url).perform).to be_nil
+      end
+    end
+
+    context 'when a network error occurs' do
+      before { stub_request(:get, url).to_raise(StandardError.new('connection refused')) }
+
+      it 'logs the error and returns nil' do
+        expect(Rails.logger).to receive(:error).with(/connection refused/)
+        expect(described_class.new(url).perform).to be_nil
+      end
+    end
+
+    context 'when URL has no scheme' do
+      before do
+        stub_request(:get, 'https://example.com').to_return(status: 200, body: html_body, headers: { 'content-type' => 'text/html' })
+      end
+
+      it 'prepends https://' do
+        result = described_class.new('example.com').perform
+        expect(result[:business_name]).to eq('Acme Corp')
+      end
+    end
+
+    context 'when WhatsApp link uses api.whatsapp.com format' do
+      let(:html_body) do
+        <<~HTML
+          <html lang="en">
+          <head><title>Test</title></head>
+          <body><a href="https://api.whatsapp.com/send?phone=5511999999999&text=Hello">Chat</a></body>
+          </html>
+        HTML
+      end
+
+      it 'extracts phone from query param' do
+        result = described_class.new(url).perform
+        expect(result[:social_handles][:whatsapp]).to eq('5511999999999')
+      end
+    end
+
+    context 'when links contain lookalike domains' do
+      let(:html_body) do
+        <<~HTML
+          <html lang="en">
+          <head><title>Test</title></head>
+          <body>
+            <a href="https://notfacebook.com/page">Not FB</a>
+            <a href="https://fakeinstagram.com/user">Not IG</a>
+          </body>
+          </html>
+        HTML
+      end
+
+      it 'does not match lookalike domains' do
+        result = described_class.new(url).perform
+        expect(result[:social_handles][:facebook]).to be_nil
+        expect(result[:social_handles][:instagram]).to be_nil
+      end
+    end
+
+    context 'when favicon uses a relative path without leading slash' do
+      let(:html_body) do
+        <<~HTML
+          <html lang="en">
+          <head>
+            <title>Test</title>
+            <link rel="icon" href="favicon.ico" />
+          </head>
+          <body></body>
+          </html>
+        HTML
+      end
+
+      it 'resolves the relative favicon URL' do
+        result = described_class.new(url).perform
+        expect(result[:branding][:favicon]).to eq('https://example.com/favicon.ico')
+      end
+    end
+  end
+end