From 42a457ff5d9a51cfb8a76077bb3e4ed21b65c9dc Mon Sep 17 00:00:00 2001 From: Sojan Jose Date: Mon, 15 Apr 2024 16:35:23 -0700 Subject: [PATCH] feat: Use embeddings in help center search (#9227) --- .../api/v1/portals/articles_controller.rb | 8 ++- app/models/article.rb | 1 + config/features.yml | 3 + .../monkey_patches/schema_dumper.rb | 1 + .../api/v1/portals/articles_controller.rb | 11 +++ enterprise/app/models/article_embedding.rb | 23 ++++++ .../app/models/enterprise/concerns/article.rb | 71 +++++++++++++++++++ .../app/services/features/base_service.rb | 7 ++ .../helpcenter_embedding_search_service.rb | 42 +++++++++++ .../services/features/response_bot_service.rb | 8 +-- .../app/services/openai/embeddings_service.rb | 8 +-- .../v1/portals/articles_controller_spec.rb | 19 +++++ 12 files changed, 190 insertions(+), 12 deletions(-) create mode 100644 enterprise/app/controllers/enterprise/public/api/v1/portals/articles_controller.rb create mode 100644 enterprise/app/models/article_embedding.rb create mode 100644 enterprise/app/models/enterprise/concerns/article.rb create mode 100644 enterprise/app/services/features/base_service.rb create mode 100644 enterprise/app/services/features/helpcenter_embedding_search_service.rb create mode 100644 spec/enterprise/controllers/enterprise/public/api/v1/portals/articles_controller_spec.rb diff --git a/app/controllers/public/api/v1/portals/articles_controller.rb b/app/controllers/public/api/v1/portals/articles_controller.rb index 4e967cfbb..46ecf19a7 100644 --- a/app/controllers/public/api/v1/portals/articles_controller.rb +++ b/app/controllers/public/api/v1/portals/articles_controller.rb @@ -7,7 +7,7 @@ class Public::Api::V1::Portals::ArticlesController < Public::Api::V1::Portals::B def index @articles = @portal.articles - @articles = @articles.search(list_params) if list_params.present? + search_articles order_by_sort_param @articles.page(list_params[:page]) if list_params[:page].present? 
end @@ -16,6 +16,10 @@ class Public::Api::V1::Portals::ArticlesController < Public::Api::V1::Portals::B private + def search_articles + @articles = @articles.search(list_params) if list_params.present? + end + def order_by_sort_param @articles = if list_params[:sort].present? && list_params[:sort] == 'views' @articles.order_by_views @@ -51,3 +55,5 @@ class Public::Api::V1::Portals::ArticlesController < Public::Api::V1::Portals::B ChatwootMarkdownRenderer.new(content).render_article end end + +Public::Api::V1::Portals::ArticlesController.prepend_mod_with('Public::Api::V1::Portals::ArticlesController') diff --git a/app/models/article.rb b/app/models/article.rb index 9abe87857..77903aeb7 100644 --- a/app/models/article.rb +++ b/app/models/article.rb @@ -170,3 +170,4 @@ class Article < ApplicationRecord self.slug ||= "#{Time.now.utc.to_i}-#{title.underscore.parameterize(separator: '-')}" if title.present? end end +Article.include_mod_with('Concerns::Article') diff --git a/config/features.yml b/config/features.yml index 37439c31f..42714bc73 100644 --- a/config/features.yml +++ b/config/features.yml @@ -80,3 +80,6 @@ - name: sla enabled: false premium: true +- name: help_center_embedding_search + enabled: false + premium: true diff --git a/config/initializers/monkey_patches/schema_dumper.rb b/config/initializers/monkey_patches/schema_dumper.rb index a790f34d0..e4182d27b 100644 --- a/config/initializers/monkey_patches/schema_dumper.rb +++ b/config/initializers/monkey_patches/schema_dumper.rb @@ -34,3 +34,4 @@ ActiveRecord::SchemaDumper.ignore_tables << 'responses' ActiveRecord::SchemaDumper.ignore_tables << 'response_sources' ActiveRecord::SchemaDumper.ignore_tables << 'response_documents' ActiveRecord::SchemaDumper.ignore_tables << 'inbox_response_sources' +ActiveRecord::SchemaDumper.ignore_tables << 'article_embeddings' diff --git a/enterprise/app/controllers/enterprise/public/api/v1/portals/articles_controller.rb 
b/enterprise/app/controllers/enterprise/public/api/v1/portals/articles_controller.rb new file mode 100644 index 000000000..ec305bfd1 --- /dev/null +++ b/enterprise/app/controllers/enterprise/public/api/v1/portals/articles_controller.rb @@ -0,0 +1,11 @@ +module Enterprise::Public::Api::V1::Portals::ArticlesController + private + + def search_articles + if @portal.account.feature_enabled?('help_center_embedding_search') + @articles = @articles.vector_search(list_params) if list_params.present? + else + super + end + end +end diff --git a/enterprise/app/models/article_embedding.rb b/enterprise/app/models/article_embedding.rb new file mode 100644 index 000000000..cc339e304 --- /dev/null +++ b/enterprise/app/models/article_embedding.rb @@ -0,0 +1,23 @@ +# == Schema Information +# +# Table name: article_embeddings +# +# id :bigint not null, primary key +# embedding :vector(1536) +# term :text not null +# created_at :datetime not null +# updated_at :datetime not null +# article_id :bigint not null +# +class ArticleEmbedding < ApplicationRecord + belongs_to :article + has_neighbors :embedding, normalize: true + + before_save :update_response_embedding + + private + + def update_response_embedding + self.embedding = Openai::EmbeddingsService.new.get_embedding(term, 'text-embedding-3-small') + end +end diff --git a/enterprise/app/models/enterprise/concerns/article.rb b/enterprise/app/models/enterprise/concerns/article.rb new file mode 100644 index 000000000..aa67ee7a5 --- /dev/null +++ b/enterprise/app/models/enterprise/concerns/article.rb @@ -0,0 +1,71 @@ +module Enterprise::Concerns::Article + extend ActiveSupport::Concern + + included do + after_save :add_article_embedding, if: -> { saved_change_to_title? || saved_change_to_description? || saved_change_to_content? 
} + + def self.add_article_embedding_association + has_many :article_embeddings, dependent: :destroy_async + end + + add_article_embedding_association if Features::HelpcenterEmbeddingSearchService.new.feature_enabled? + + def self.vector_search(params) + embedding = Openai::EmbeddingsService.new.get_embedding(params['query'], 'text-embedding-3-small') + records = joins( + :category + ).search_by_category_slug( + params[:category_slug] + ).search_by_category_locale(params[:locale]).search_by_author(params[:author_id]).search_by_status(params[:status]) + filtered_article_ids = records.pluck(:id) + + # Fetch nearest neighbors and their distances, then filter directly + + # experimenting with filtering results based on result threshold + # distance_threshold = 0.2 + # if using add the filter block to the below query + # .filter { |ae| ae.neighbor_distance <= distance_threshold } + + article_ids = ArticleEmbedding.where(article_id: filtered_article_ids) + .nearest_neighbors(:embedding, embedding, distance: 'cosine') + .limit(5) + .pluck(:article_id) + + # Fetch the articles by the IDs obtained from the nearest neighbors search + where(id: article_ids) + end + end + + def add_article_embedding + return unless account.feature_enabled?('help_center_embedding_search') + + terms = generate_article_search_terms + article_embeddings.destroy_all + terms.each { |term| article_embeddings.create!(term: term) } + end + + def article_to_search_terms_prompt + <<~SYSTEM_PROMPT_MESSAGE + For the provided article content, generate potential search query keywords and snippets that can be used to generate the embeddings. + Ensure the search terms are as diverse as possible but capture the essence of the article and are super related to the articles. + Don't return any terms if there aren't any terms of relevance. 
+ Always return results in valid JSON of the following format + { + "search_terms": [] + } + SYSTEM_PROMPT_MESSAGE + end + + def generate_article_search_terms + messages = [ + { role: 'system', content: article_to_search_terms_prompt }, + { role: 'user', content: "title: #{title} \n description: #{description} \n content: #{content}" } + ] + headers = { 'Content-Type' => 'application/json', 'Authorization' => "Bearer #{ENV.fetch('OPENAI_API_KEY', nil)}" } + body = { model: 'gpt-4-turbo', messages: messages, response_format: { type: 'json_object' } }.to_json + Rails.logger.info "Requesting Chat GPT with body: #{body}" + response = HTTParty.post('https://api.openai.com/v1/chat/completions', headers: headers, body: body) + Rails.logger.info "Chat GPT response: #{response.body}" + JSON.parse(response.parsed_response['choices'][0]['message']['content'])['search_terms'] + end +end diff --git a/enterprise/app/services/features/base_service.rb b/enterprise/app/services/features/base_service.rb new file mode 100644 index 000000000..0e0bc9b7b --- /dev/null +++ b/enterprise/app/services/features/base_service.rb @@ -0,0 +1,7 @@ +class Features::BaseService + MIGRATION_VERSION = ActiveRecord::Migration[7.0] + + def vector_extension_enabled? + ActiveRecord::Base.connection.extension_enabled?('vector') + end +end diff --git a/enterprise/app/services/features/helpcenter_embedding_search_service.rb b/enterprise/app/services/features/helpcenter_embedding_search_service.rb new file mode 100644 index 000000000..23c6de98f --- /dev/null +++ b/enterprise/app/services/features/helpcenter_embedding_search_service.rb @@ -0,0 +1,42 @@ +# ensure vector extension is enabled via response bot service +class Features::HelpcenterEmbeddingSearchService < Features::BaseService + def enable_in_installation + create_tables + end + + def disable_in_installation + drop_tables + end + + def feature_enabled? + vector_extension_enabled? 
&& MIGRATION_VERSION.table_exists?(:article_embeddings) + end + + def create_tables + return unless vector_extension_enabled? + + %i[article_embeddings].each do |table| + send("create_#{table}_table") + end + end + + def drop_tables + %i[article_embeddings].each do |table| + MIGRATION_VERSION.drop_table table if MIGRATION_VERSION.table_exists?(table) + end + end + + private + + def create_article_embeddings_table + return if MIGRATION_VERSION.table_exists?(:article_embeddings) + + MIGRATION_VERSION.create_table :article_embeddings do |t| + t.bigint :article_id, null: false + t.text :term, null: false + t.vector :embedding, limit: 1536 + t.timestamps + end + MIGRATION_VERSION.add_index :article_embeddings, :embedding, using: :ivfflat, opclass: :vector_l2_ops + end +end diff --git a/enterprise/app/services/features/response_bot_service.rb b/enterprise/app/services/features/response_bot_service.rb index 9162c085e..a3c5ec1b0 100644 --- a/enterprise/app/services/features/response_bot_service.rb +++ b/enterprise/app/services/features/response_bot_service.rb @@ -1,6 +1,4 @@ -class Features::ResponseBotService - MIGRATION_VERSION = ActiveRecord::Migration[7.0] - +class Features::ResponseBotService < Features::BaseService def enable_in_installation enable_vector_extension create_tables @@ -21,10 +19,6 @@ class Features::ResponseBotService MIGRATION_VERSION.disable_extension 'vector' end - def vector_extension_enabled? - ActiveRecord::Base.connection.extension_enabled?('vector') - end - def create_tables return unless vector_extension_enabled? 
diff --git a/enterprise/app/services/openai/embeddings_service.rb b/enterprise/app/services/openai/embeddings_service.rb index b79167203..d12871e98 100644 --- a/enterprise/app/services/openai/embeddings_service.rb +++ b/enterprise/app/services/openai/embeddings_service.rb @@ -1,11 +1,11 @@ class Openai::EmbeddingsService - def get_embedding(content) - fetch_embeddings(content) + def get_embedding(content, model = 'text-embedding-ada-002') + fetch_embeddings(content, model) end private - def fetch_embeddings(input) + def fetch_embeddings(input, model) url = 'https://api.openai.com/v1/embeddings' headers = { 'Authorization' => "Bearer #{ENV.fetch('OPENAI_API_KEY', '')}", @@ -13,7 +13,7 @@ class Openai::EmbeddingsService } data = { input: input, - model: 'text-embedding-ada-002' + model: model } response = Net::HTTP.post(URI(url), data.to_json, headers) diff --git a/spec/enterprise/controllers/enterprise/public/api/v1/portals/articles_controller_spec.rb b/spec/enterprise/controllers/enterprise/public/api/v1/portals/articles_controller_spec.rb new file mode 100644 index 000000000..6fa94c55d --- /dev/null +++ b/spec/enterprise/controllers/enterprise/public/api/v1/portals/articles_controller_spec.rb @@ -0,0 +1,19 @@ +require 'rails_helper' + +RSpec.describe 'Public Articles API', type: :request do + let!(:portal) { create(:portal, slug: 'test-portal', config: { allowed_locales: %w[en es] }, custom_domain: 'www.example.com') } + + describe 'GET /public/api/v1/portals/:slug/articles' do + before do + portal.account.enable_features!(:help_center_embedding_search) + end + + context 'with help_center_embedding_search feature' do + it 'get all articles with searched text query using vector search if enabled' do + allow(Article).to receive(:vector_search) + get "/hc/#{portal.slug}/en/articles.json", params: { query: 'funny' } + expect(Article).to have_received(:vector_search) + end + end + end +end