feat: firecrawl branding api [UPM-15] (#13903)

Adds `WebsiteBrandingService` (OSS) with an Enterprise override using
Firecrawl v2 to extract branding and business data from a URL for
onboarding auto-fill.

OSS version uses HTTParty + Nokogiri to extract:
- Business name (og:site_name or title)
- Language (html lang)
- Favicon
- Social links from `<a>` tags

Enterprise version makes a single Firecrawl call to fetch:
- Structured JSON (name, language, industry via LLM)
- Branding (favicon, primary color)
- Page links

Falls back to OSS if Firecrawl is unavailable or fails.

Social handles (WhatsApp, Facebook, Instagram, Telegram, TikTok, LINE)
are parsed deterministically via a shared `SocialLinkParser`.

> We parse social handles from page links because the LLM extraction was
> unreliable: it mostly returned empty values and, in rare cases, hallucinated handles.

## How to test

```ruby
# OSS (no Firecrawl key needed)
WebsiteBrandingService.new('chatwoot.com').perform

# Enterprise (requires CAPTAIN_FIRECRAWL_API_KEY)
WebsiteBrandingService.new('notion.so').perform
WebsiteBrandingService.new('postman.com').perform
```

Verify the returned hash includes business_name, language,
industry_category (nil in the OSS version), social_handles, and branding
with favicon/primary_color.

<img width="908" height="393" alt="image"
src="https://github.com/user-attachments/assets/e3696887-d366-485a-89a0-8e1a9698a788"
/>
This commit is contained in:
Shivam Mishra
2026-03-30 11:32:03 +05:30
committed by GitHub
parent 04acc16609
commit 7651c18b48
5 changed files with 597 additions and 0 deletions

View File

@@ -0,0 +1,171 @@
require 'rails_helper'
# Anonymous subclass of the OSS service with the Enterprise module prepended.
# This simulates the prepend_mod_with behavior so the Enterprise override can
# be exercised directly from this spec.
test_klass = Class.new(WebsiteBrandingService) { prepend Enterprise::WebsiteBrandingService }
# Specs for the Enterprise override of WebsiteBrandingService#perform.
# Every example stubs a plain GET of `url` returning `fallback_html`; the
# fallback examples assert that its <title> ("Fallback") is what comes back
# when Firecrawl is not configured, returns an error, or raises.
RSpec.describe Enterprise::WebsiteBrandingService do
  describe '#perform' do
    subject(:service) { test_klass.new(url) }

    let(:url) { 'https://example.com' }
    let(:api_key) { 'test-firecrawl-api-key' }
    let(:scrape_endpoint) { described_class::FIRECRAWL_SCRAPE_ENDPOINT }
    let(:fallback_html) { '<html lang="en"><head><title>Fallback</title></head><body></body></html>' }

    # Full Firecrawl payload: structured JSON, branding, and page links.
    let(:success_response_body) do
      extracted = { business_name: 'Acme Corp', language: 'en', industry_category: 'Technology' }
      branding = {
        images: { logo: 'https://example.com/logo.png', favicon: 'https://example.com/favicon.png' },
        colors: { primary: '#FF5733' }
      }
      page_links = %w[
        https://example.com/about
        https://facebook.com/acmecorp
        https://instagram.com/acme_corp
        https://wa.me/1234567890
        https://t.me/acmecorp
        https://tiktok.com/@acmetok
      ]

      { success: true, data: { json: extracted, branding: branding, links: page_links } }.to_json
    end

    # Stores the Firecrawl API key as an installation config.
    def configure_firecrawl
      create(:installation_config, name: 'CAPTAIN_FIRECRAWL_API_KEY', value: api_key)
    end

    # Stubs a successful Firecrawl scrape whose payload carries only a
    # business name and the given page links.
    def stub_firecrawl_links(page_links)
      payload = {
        success: true,
        data: {
          json: { business_name: 'Acme Corp' },
          links: page_links
        }
      }.to_json

      stub_request(:post, scrape_endpoint)
        .to_return(status: 200, body: payload, headers: { 'content-type' => 'application/json' })
    end

    before do
      stub_request(:get, url).to_return(status: 200, body: fallback_html, headers: { 'content-type' => 'text/html' })
    end

    context 'when firecrawl is configured and API returns success' do
      before do
        configure_firecrawl

        # The stub only matches requests carrying the bearer token and JSON content type.
        stub_request(:post, scrape_endpoint)
          .with(headers: { 'Authorization' => "Bearer #{api_key}", 'Content-Type' => 'application/json' })
          .to_return(status: 200, body: success_response_body, headers: { 'content-type' => 'application/json' })
      end

      it 'returns business info and branding from firecrawl' do
        expected_handles = {
          whatsapp: '1234567890',
          line: nil,
          facebook: 'acmecorp',
          instagram: 'acme_corp',
          telegram: 'acmecorp',
          tiktok: '@acmetok'
        }
        expected = {
          business_name: 'Acme Corp',
          language: 'en',
          industry_category: 'Technology',
          social_handles: expected_handles,
          branding: {
            favicon: 'https://example.com/favicon.png',
            primary_color: '#FF5733'
          }
        }

        expect(service.perform).to eq(expected)
      end
    end

    context 'when firecrawl API returns an error' do
      before do
        configure_firecrawl
        stub_request(:post, scrape_endpoint)
          .to_return(status: 422, body: '{"error": "Invalid URL"}', headers: {})
      end

      it 'falls back to basic scrape' do
        scraped = service.perform

        expect(scraped[:business_name]).to eq('Fallback')
        expect(scraped[:industry_category]).to be_nil
      end
    end

    context 'when firecrawl raises an exception' do
      before do
        configure_firecrawl
        stub_request(:post, scrape_endpoint).to_raise(StandardError.new('connection refused'))
      end

      it 'falls back to basic scrape' do
        scraped = service.perform

        expect(scraped[:business_name]).to eq('Fallback')
      end
    end

    context 'when firecrawl is not configured' do
      it 'uses basic scrape' do
        # No API key is created in this context, so no POST should be attempted.
        expect(HTTParty).not_to receive(:post)

        scraped = service.perform

        expect(scraped[:business_name]).to eq('Fallback')
      end
    end

    context 'when WhatsApp link uses api.whatsapp.com format' do
      before do
        configure_firecrawl
        stub_firecrawl_links(['https://api.whatsapp.com/send?phone=5511999999999&text=Hello'])
      end

      it 'extracts phone number from query param' do
        handles = service.perform[:social_handles]

        expect(handles[:whatsapp]).to eq('5511999999999')
      end
    end

    context 'when WhatsApp link uses wa.me format' do
      before do
        configure_firecrawl
        stub_firecrawl_links(['https://wa.me/+5511999999999'])
      end

      it 'extracts phone number from path' do
        handles = service.perform[:social_handles]

        expect(handles[:whatsapp]).to eq('5511999999999')
      end
    end

    context 'when links contain lookalike domains' do
      before do
        configure_firecrawl
        stub_firecrawl_links(['https://notfacebook.com/page', 'https://fakeinstagram.com/user'])
      end

      it 'does not match lookalike domains' do
        handles = service.perform[:social_handles]

        expect(handles[:facebook]).to be_nil
        expect(handles[:instagram]).to be_nil
      end
    end
  end
end

View File

@@ -0,0 +1,156 @@
require 'rails_helper'
# Specs for the OSS WebsiteBrandingService#perform.
# Every example stubs a GET of `url` that returns `html_body`; individual
# contexts override `html_body` (or the stub itself) to cover one scenario each.
RSpec.describe WebsiteBrandingService do
  describe '#perform' do
    let(:url) { 'https://example.com' }

    # Default page: og:site_name, theme-color, a root-relative favicon, and
    # footer links to several social platforms.
    let(:html_body) do
      <<~HTML
        <html lang="en">
        <head>
        <title>Acme Corp | Home</title>
        <meta property="og:site_name" content="Acme Corp" />
        <meta property="og:image" content="https://example.com/og-image.png" />
        <meta name="theme-color" content="#FF5733" />
        <link rel="icon" href="/favicon.ico" />
        </head>
        <body>
        <header><a href="/">Home</a></header>
        <footer>
        <a href="https://facebook.com/acmecorp">Facebook</a>
        <a href="https://instagram.com/acme_corp">Instagram</a>
        <a href="https://wa.me/1234567890">WhatsApp</a>
        <a href="https://t.me/acmecorp">Telegram</a>
        <a href="https://tiktok.com/@acmetok">TikTok</a>
        </footer>
        </body>
        </html>
      HTML
    end

    # Runs the service against the given target (the shared `url` by default).
    def scrape(target = url)
      described_class.new(target).perform
    end

    before do
      stub_request(:get, url).to_return(status: 200, body: html_body, headers: { 'content-type' => 'text/html' })
    end

    it 'extracts business info, branding, and social handles' do
      expected_handles = {
        whatsapp: '1234567890',
        line: nil,
        facebook: 'acmecorp',
        instagram: 'acme_corp',
        telegram: 'acmecorp',
        tiktok: '@acmetok'
      }
      expected = {
        business_name: 'Acme Corp',
        language: 'en',
        industry_category: nil,
        social_handles: expected_handles,
        branding: {
          favicon: 'https://example.com/favicon.ico',
          primary_color: '#FF5733'
        }
      }

      expect(scrape).to eq(expected)
    end

    context 'when og:site_name is missing' do
      let(:html_body) do
        <<~HTML
          <html lang="fr">
          <head><title>Mon Entreprise - Bienvenue</title></head>
          <body></body>
          </html>
        HTML
      end

      it 'falls back to the first segment of the title' do
        scraped = scrape

        expect(scraped[:business_name]).to eq('Mon Entreprise')
        expect(scraped[:language]).to eq('fr')
      end
    end

    context 'when the page fails to load' do
      before do
        stub_request(:get, url).to_return(status: 500, body: '')
      end

      it 'returns nil' do
        expect(scrape).to be_nil
      end
    end

    context 'when a network error occurs' do
      before do
        stub_request(:get, url).to_raise(StandardError.new('connection refused'))
      end

      it 'logs the error and returns nil' do
        # The logger expectation must be registered before the service runs.
        expect(Rails.logger).to receive(:error).with(/connection refused/)
        expect(scrape).to be_nil
      end
    end

    context 'when URL has no scheme' do
      before do
        stub_request(:get, 'https://example.com').to_return(status: 200, body: html_body, headers: { 'content-type' => 'text/html' })
      end

      it 'prepends https://' do
        scraped = scrape('example.com')

        expect(scraped[:business_name]).to eq('Acme Corp')
      end
    end

    context 'when WhatsApp link uses api.whatsapp.com format' do
      let(:html_body) do
        <<~HTML
          <html lang="en">
          <head><title>Test</title></head>
          <body><a href="https://api.whatsapp.com/send?phone=5511999999999&text=Hello">Chat</a></body>
          </html>
        HTML
      end

      it 'extracts phone from query param' do
        handles = scrape[:social_handles]

        expect(handles[:whatsapp]).to eq('5511999999999')
      end
    end

    context 'when links contain lookalike domains' do
      let(:html_body) do
        <<~HTML
          <html lang="en">
          <head><title>Test</title></head>
          <body>
          <a href="https://notfacebook.com/page">Not FB</a>
          <a href="https://fakeinstagram.com/user">Not IG</a>
          </body>
          </html>
        HTML
      end

      it 'does not match lookalike domains' do
        handles = scrape[:social_handles]

        expect(handles[:facebook]).to be_nil
        expect(handles[:instagram]).to be_nil
      end
    end

    context 'when favicon uses a relative path without leading slash' do
      let(:html_body) do
        <<~HTML
          <html lang="en">
          <head>
          <title>Test</title>
          <link rel="icon" href="favicon.ico" />
          </head>
          <body></body>
          </html>
        HTML
      end

      it 'resolves the relative favicon URL' do
        branding = scrape[:branding]

        expect(branding[:favicon]).to eq('https://example.com/favicon.ico')
      end
    end
  end
end