# Contents of the patch that adds the TestData bulk-generation tooling.
# Each section below corresponds to one new file in the patch.

# ==============================================================================
# File: lib/tasks/generate_test_data.rake
# ==============================================================================
require_relative '../test_data'

namespace :data do
  desc 'Generate large, distributed test data'
  task generate_distributed_data: :environment do
    # Never run against production: the generator inserts millions of rows and
    # temporarily alters table settings (see TestData::DatabaseOptimizer).
    if Rails.env.production?
      puts 'Generating large amounts of data in production can have serious performance implications.'
      puts 'Exiting to avoid impacting a live environment.'
      exit
    end

    # Configure logger: send everything to stdout with millisecond timestamps
    # so progress is visible while the task runs.
    Rails.logger = ActiveSupport::Logger.new($stdout)
    Rails.logger.formatter = proc do |severity, datetime, _progname, msg|
      "#{datetime.strftime('%Y-%m-%d %H:%M:%S.%L')} #{severity}: #{msg}\n"
    end

    begin
      TestData::DatabaseOptimizer.setup
      TestData.generate
    ensure
      # Always undo the database tweaks, even when generation fails midway.
      TestData::DatabaseOptimizer.restore
    end
  end

  desc 'Clean up existing test data'
  task cleanup_test_data: :environment do
    TestData.cleanup
  end
end

# ==============================================================================
# File: lib/test_data.rb
# ==============================================================================

# Entry points for generating and removing bulk test data.
module TestData
  # Runs the full generation pipeline (see TestData::Orchestrator.call).
  def self.generate
    Orchestrator.call
  end

  # Removes accounts recorded by a previous generation run
  # (see TestData::CleanupService.call).
  def self.cleanup
    CleanupService.call
  end
end

require_relative 'test_data/constants'
require_relative 'test_data/database_optimizer'
require_relative 'test_data/cleanup_service'
require_relative 'test_data/account_creator'
require_relative 'test_data/inbox_creator'
require_relative 'test_data/display_id_tracker'
require_relative 'test_data/contact_batch_service'
require_relative 'test_data/orchestrator'

# ==============================================================================
# File: lib/test_data/account_creator.rb
# ==============================================================================

# Creates a test account with an explicit id and records that id on disk so
# TestData::CleanupService can find and remove it later.
class TestData::AccountCreator
  # Comma-separated list of generated account ids (path is relative to the
  # process working directory). Must match TestData::CleanupService::DATA_FILE.
  DATA_FILE = 'tmp/test_data_account_ids.txt'.freeze

  # @param id [Integer] primary key to assign to the new account
  # @return [Account] the persisted account
  # @raise [ActiveRecord::RecordInvalid] when the account fails validation
  def self.create!(id)
    company_name = generate_company_name
    account = Account.create!(
      id: id,
      name: company_name,
      domain: generate_domain(company_name),
      created_at: Faker::Time.between(from: 2.years.ago, to: 6.months.ago)
    )
    persist_account_id(account.id)
    account
  end

  # @return [String] a Faker company name suffixed with a random company type
  def self.generate_company_name
    "#{Faker::Company.name} #{TestData::Constants::COMPANY_TYPES.sample}"
  end

  # @param company_name [String]
  # @return [String] the parameterized company name plus a random extension
  def self.generate_domain(company_name)
    "#{company_name.parameterize}.#{TestData::Constants::DOMAIN_EXTENSIONS.sample}"
  end

  # Appends "<id>," to DATA_FILE, creating its directory when missing.
  #
  # @param account_id [Integer]
  def self.persist_account_id(account_id)
    FileUtils.mkdir_p(File.dirname(DATA_FILE))
    File.write(DATA_FILE, "#{account_id},", mode: 'a')
  end
end

# ==============================================================================
# File: lib/test_data/cleanup_service.rb
# ==============================================================================

# Destroys the accounts whose ids were recorded in DATA_FILE by
# TestData::AccountCreator, then removes the file.
class TestData::CleanupService
  # Must match TestData::AccountCreator::DATA_FILE.
  DATA_FILE = 'tmp/test_data_account_ids.txt'.freeze

  class << self
    # Runs the cleanup. Does nothing (besides logging) when DATA_FILE is absent.
    def call
      Rails.logger.info 'Cleaning up any existing test data...'

      return log_no_file_found unless file_exists?

      account_ids = parse_account_ids_from_file

      if account_ids.any?
        delete_accounts(account_ids)
      else
        log_no_accounts_found
      end

      delete_data_file
      Rails.logger.info '==> Cleanup complete!'
    end

    private

    def file_exists?
      File.exist?(DATA_FILE)
    end

    def log_no_file_found
      Rails.logger.info 'No test data file found, skipping cleanup'
    end

    # Reads the comma-separated id list. Only purely numeric tokens are kept so
    # a corrupted entry can never turn into id 0 via String#to_i.
    #
    # @return [Array<Integer>] unique account ids
    def parse_account_ids_from_file
      File.read(DATA_FILE)
          .split(',')
          .map(&:strip)
          .grep(/\A\d+\z/)
          .map(&:to_i)
          .uniq
    end

    # Destroys the listed accounts and logs how many records were actually
    # destroyed (ids that no longer exist are not counted).
    def delete_accounts(account_ids)
      Rails.logger.info "Found #{account_ids.size} test accounts to clean up: #{account_ids.join(', ')}"
      started_at = Time.current
      destroyed = Account.where(id: account_ids).destroy_all
      Rails.logger.info "Deleted #{destroyed.size} accounts in #{Time.current - started_at}s"
    end

    def log_no_accounts_found
      Rails.logger.info 'No test account IDs found in the data file'
    end

    def delete_data_file
      File.delete(DATA_FILE)
    end
  end
end

# ==============================================================================
# File: lib/test_data/constants.rb
# ==============================================================================

# Tunables shared by the test-data generators.
module TestData::Constants
  NUM_ACCOUNTS = 20
  MIN_MESSAGES = 1_000_000 # 1M
  MAX_MESSAGES = 10_000_000 # 10M
  BATCH_SIZE = 5_000

  MAX_CONVERSATIONS_PER_CONTACT = 20
  INBOXES_PER_ACCOUNT = 5
  STATUSES = %w[open resolved pending].freeze
  MESSAGE_TYPES = %w[incoming outgoing].freeze

  MIN_MESSAGES_PER_CONVO = 5
  MAX_MESSAGES_PER_CONVO = 50

  COMPANY_TYPES = %w[Retail Healthcare Finance Education Manufacturing].freeze
  DOMAIN_EXTENSIONS = %w[com io tech ai].freeze
  COUNTRY_CODES = %w[1 44 91 61 81 86 49 33 34 39].freeze # US, UK, India, Australia, Japan, China, Germany, France, Spain, Italy
end

# ==============================================================================
# File: lib/test_data/contact_batch_service.rb
# ==============================================================================

# Bulk-inserts one batch of contacts together with their contact inboxes,
# conversations and messages for a single account.
class TestData::ContactBatchService
  # @param account [Account] owner of the generated records
  # @param inboxes [Array<Inbox>] every contact is linked to each of these
  # @param batch_size [Integer] number of contacts to create
  # @param display_id_tracker [TestData::DisplayIdTracker] source of
  #   conversation display ids for this account
  def initialize(account:, inboxes:, batch_size:, display_id_tracker:)
    @account = account
    @inboxes = inboxes
    @batch_size = batch_size
    @display_id_tracker = display_id_tracker
    @total_messages = 0
  end

  # Generates contacts, contact_inboxes, conversations, and messages
  # Returns the total number of messages created in this batch
  def generate!
    Rails.logger.info { "Starting batch generation for account ##{@account.id} with #{@batch_size} contacts" }

    create_contacts
    create_contact_inboxes
    create_conversations
    create_messages

    Rails.logger.info { "Completed batch with #{@total_messages} messages for account ##{@account.id}" }
    @total_messages
  end

  private

  # rubocop:disable Rails/SkipsModelValidations
  def create_contacts
    Rails.logger.info { "Creating #{@batch_size} contacts for account ##{@account.id}" }
    start_time = Time.current

    @contacts_data = Array.new(@batch_size) { build_contact_data }

    # Load exactly the rows inserted by this batch, using the ids returned by
    # the insert. Contact timestamps are randomised, so "most recent created_at"
    # would not identify this batch once the account has earlier batches.
    inserted_ids =
      if @contacts_data.any?
        Contact.insert_all!(@contacts_data, returning: %w[id]).rows.flatten
      else
        []
      end
    @contacts = Contact.where(id: inserted_ids).to_a

    Rails.logger.info { "Contacts created in #{Time.current - start_time}s" }
  end
  # rubocop:enable Rails/SkipsModelValidations

  # @return [Hash] attributes for one contact row
  def build_contact_data
    created_at = Faker::Time.between(from: 1.year.ago, to: Time.current)
    {
      account_id: @account.id,
      name: Faker::Name.name,
      email: "#{SecureRandom.uuid}@example.com",
      phone_number: generate_e164_phone_number,
      additional_attributes: maybe_add_additional_attributes,
      created_at: created_at,
      updated_at: created_at
    }
  end

  # @return [Hash, nil] company/city/country for roughly 30% of contacts,
  #   nil for the rest
  def maybe_add_additional_attributes
    return unless rand < 0.3

    {
      company: Faker::Company.name,
      city: Faker::Address.city,
      country: Faker::Address.country_code
    }
  end

  # @return [String, nil] a "+<country code><digits>" number of at most 15
  #   digits for roughly 70% of contacts, nil for the rest
  def generate_e164_phone_number
    return nil unless rand < 0.7

    country_code = TestData::Constants::COUNTRY_CODES.sample
    subscriber_number = rand(1_000_000..9_999_999_999).to_s
    subscriber_number = subscriber_number[0...(15 - country_code.length)]
    "+#{country_code}#{subscriber_number}"
  end

  # rubocop:disable Rails/SkipsModelValidations
  def create_contact_inboxes
    Rails.logger.info { "Creating contact inboxes for #{@contacts.size} contacts" }
    start_time = Time.current

    # One contact inbox per (contact, inbox) pair.
    contact_inboxes_data = @contacts.flat_map do |contact|
      @inboxes.map do |inbox|
        {
          inbox_id: inbox.id,
          contact_id: contact.id,
          source_id: SecureRandom.uuid,
          created_at: contact.created_at,
          updated_at: contact.created_at
        }
      end
    end

    count = contact_inboxes_data.size
    ContactInbox.insert_all!(contact_inboxes_data) if contact_inboxes_data.any?
    @contact_inboxes = ContactInbox.where(contact_id: @contacts.map(&:id))

    Rails.logger.info { "Created #{count} contact inboxes in #{Time.current - start_time}s" }
  end
  # rubocop:enable Rails/SkipsModelValidations

  # rubocop:disable Rails/SkipsModelValidations
  def create_conversations
    Rails.logger.info { "Creating conversations for account ##{@account.id}" }
    start_time = Time.current

    conversations_data = @contact_inboxes.flat_map do |contact_inbox|
      num_convos = rand(1..TestData::Constants::MAX_CONVERSATIONS_PER_CONTACT)
      Array.new(num_convos) { build_conversation(contact_inbox) }
    end

    count = conversations_data.size
    Rails.logger.info { "Preparing to insert #{count} conversations" }

    Conversation.insert_all!(conversations_data) if conversations_data.any?
    @conversations = inserted_conversations_scope(conversations_data)

    Rails.logger.info { "Created #{count} conversations in #{Time.current - start_time}s" }
  end
  # rubocop:enable Rails/SkipsModelValidations

  # Scope covering the conversations inserted by this batch.
  #
  # TestData::DisplayIdTracker#next_id hands out consecutive integers, so the
  # batch occupies a contiguous display_id range. A range condition keeps the
  # query small; it is re-issued for every find_in_batches page.
  def inserted_conversations_scope(conversations_data)
    return Conversation.none if conversations_data.empty?

    first_id = conversations_data.first[:display_id]
    last_id = conversations_data.last[:display_id]
    Conversation.where(account_id: @account.id, display_id: first_id..last_id)
  end

  # @param contact_inbox [ContactInbox]
  # @return [Hash] attributes for one conversation row
  def build_conversation(contact_inbox)
    created_at = Faker::Time.between(from: contact_inbox.created_at, to: Time.current)
    {
      account_id: @account.id,
      inbox_id: contact_inbox.inbox_id,
      contact_id: contact_inbox.contact_id,
      contact_inbox_id: contact_inbox.id,
      status: TestData::Constants::STATUSES.sample,
      created_at: created_at,
      updated_at: created_at,
      display_id: @display_id_tracker.next_id
    }
  end

  # rubocop:disable Rails/SkipsModelValidations
  def create_messages
    Rails.logger.info { "Creating messages for #{@conversations.size} conversations" }
    start_time = Time.current

    @conversations.find_in_batches(batch_size: 1000).with_index(1) do |batch, batch_count|
      batch_start = Time.current

      messages_data = batch.flat_map { |convo| build_messages_for_conversation(convo) }

      batch_message_count = messages_data.size
      Rails.logger.info { "Preparing to insert #{batch_message_count} messages (batch #{batch_count})" }

      Message.insert_all!(messages_data) if messages_data.any?
      @total_messages += batch_message_count

      Rails.logger.info { "Created batch #{batch_count} with #{batch_message_count} messages in #{Time.current - batch_start}s" }
    end

    Rails.logger.info { "Created total of #{@total_messages} messages in #{Time.current - start_time}s" }
  end
  # rubocop:enable Rails/SkipsModelValidations

  # Picks a random message count and starting direction for a conversation.
  #
  # @return [Array<Hash>] message rows for the conversation
  def build_messages_for_conversation(conversation)
    num_messages = rand(TestData::Constants::MIN_MESSAGES_PER_CONVO..TestData::Constants::MAX_MESSAGES_PER_CONVO)
    message_type = TestData::Constants::MESSAGE_TYPES.sample
    time_range = [conversation.created_at, Time.current]
    generate_messages(conversation, num_messages, message_type, time_range)
  end

  # Builds num_messages rows whose direction alternates on every message.
  # The type is flipped before each row is built, so the first row carries the
  # opposite of initial_message_type.
  def generate_messages(conversation, num_messages, initial_message_type, time_range)
    message_type = initial_message_type

    Array.new(num_messages) do
      message_type = (message_type == 'incoming' ? 'outgoing' : 'incoming')
      created_at = Faker::Time.between(from: time_range.first, to: time_range.last)
      build_message_data(conversation, message_type, created_at)
    end
  end

  # @return [Hash] attributes for one message row
  def build_message_data(conversation, message_type, created_at)
    {
      account_id: @account.id,
      inbox_id: conversation.inbox_id,
      conversation_id: conversation.id,
      message_type: message_type,
      content: Faker::Lorem.paragraph(sentence_count: 2),
      created_at: created_at,
      updated_at: created_at,
      private: false,
      status: 'sent',
      content_type: 'text',
      source_id: SecureRandom.uuid
    }
  end
end

# ==============================================================================
# File: lib/test_data/database_optimizer.rb
# ==============================================================================

# Applies PostgreSQL session and table settings that speed up bulk inserts
# (setup) and reverts them afterwards (restore).
class TestData::DatabaseOptimizer
  # Tables that need trigger management
  TABLES_WITH_TRIGGERS = %w[conversations messages].freeze

  # Memory settings in MB
  # Increased work_mem for better query performance with complex operations
  WORK_MEM = 256

  class << self
    # Removes the statement timeout, raises work_mem, switches the managed
    # tables to UNLOGGED and disables their triggers.
    def setup
      Rails.logger.info '==> Setting up database optimizations for improved performance'

      # Remove statement timeout to allow long-running operations to complete
      Rails.logger.info ' Removing statement timeout'
      ActiveRecord::Base.connection.execute('SET statement_timeout = 0')

      # Increase working memory for better query performance
      Rails.logger.info " Increasing work_mem to #{WORK_MEM}MB"
      ActiveRecord::Base.connection.execute("SET work_mem = '#{WORK_MEM}MB'")

      # Set tables to UNLOGGED mode for better write performance
      # This disables WAL completely for these tables
      Rails.logger.info ' Setting tables to UNLOGGED mode'
      set_tables_unlogged

      # Disable triggers on specified tables to avoid overhead
      Rails.logger.info ' Disabling triggers on specified tables'
      disable_triggers

      Rails.logger.info '==> Database optimizations complete, data generation will run faster'
    end

    # Reverts everything setup changed: triggers, LOGGED mode, work_mem and
    # statement_timeout.
    def restore
      Rails.logger.info '==> Restoring database settings to normal'

      Rails.logger.info ' Re-enabling triggers on specified tables'
      enable_triggers

      Rails.logger.info ' Setting tables back to LOGGED mode'
      set_tables_logged

      # Reset the session settings changed by setup back to their defaults
      Rails.logger.info ' Resetting memory settings to defaults'
      ActiveRecord::Base.connection.execute('RESET work_mem')

      Rails.logger.info ' Restoring statement timeout'
      ActiveRecord::Base.connection.execute('RESET statement_timeout')

      Rails.logger.info '==> Database settings restored to normal operation'
    end

    private

    def disable_triggers
      TABLES_WITH_TRIGGERS.each do |table|
        Rails.logger.info " Disabling triggers on #{table} table"
        ActiveRecord::Base.connection.execute("ALTER TABLE #{table} DISABLE TRIGGER ALL")
      end
    end

    def enable_triggers
      TABLES_WITH_TRIGGERS.each do |table|
        Rails.logger.info " Enabling triggers on #{table} table"
        ActiveRecord::Base.connection.execute("ALTER TABLE #{table} ENABLE TRIGGER ALL")
      end
    end

    def set_tables_unlogged
      TABLES_WITH_TRIGGERS.each do |table|
        Rails.logger.info " Setting #{table} table as UNLOGGED"
        ActiveRecord::Base.connection.execute("ALTER TABLE #{table} SET UNLOGGED")
      end
    end

    def set_tables_logged
      TABLES_WITH_TRIGGERS.each do |table|
        Rails.logger.info " Setting #{table} table as LOGGED"
        ActiveRecord::Base.connection.execute("ALTER TABLE #{table} SET LOGGED")
      end
    end
  end
end

# ==============================================================================
# File: lib/test_data/display_id_tracker.rb
# ==============================================================================

# Hands out consecutive conversation display ids for one account, starting
# after the highest display_id the account already has (or after 0).
class TestData::DisplayIdTracker
  # @return [Integer] the most recently issued (or initial maximum) display id
  attr_reader :current

  # @param account [Account]
  def initialize(account:)
    max_display_id = Conversation.where(account_id: account.id).maximum(:display_id) || 0
    @current = max_display_id
  end

  # @return [Integer] the next unused display id
  def next_id
    @current += 1
  end
end

# ==============================================================================
# File: lib/test_data/inbox_creator.rb
# ==============================================================================

# Creates the API-channel inboxes for a test account.
class TestData::InboxCreator
  # @param account [Account]
  # @return [Array<Inbox>] INBOXES_PER_ACCOUNT inboxes, each with its own
  #   Channel::Api
  def self.create_for(account)
    Array.new(TestData::Constants::INBOXES_PER_ACCOUNT) do
      channel = Channel::Api.create!(account: account)
      Inbox.create!(
        account_id: account.id,
        name: "API Inbox #{SecureRandom.hex(4)}",
        channel: channel
      )
    end
  end
end

# ==============================================================================
# File: lib/test_data/orchestrator.rb
# ==============================================================================

# Drives a full generation run: cleans up earlier test data, then creates
# NUM_ACCOUNTS accounts and fills each with inboxes, contacts, conversations
# and messages.
class TestData::Orchestrator
  class << self
    def call
      Rails.logger.info { '========== STARTING TEST DATA GENERATION ==========' }

      cleanup_existing_data
      set_start_id

      Rails.logger.info { "Starting to generate distributed test data across #{TestData::Constants::NUM_ACCOUNTS} accounts..." }
      Rails.logger.info do
        "Each account have between #{TestData::Constants::MIN_MESSAGES / 1_000_000}M and #{TestData::Constants::MAX_MESSAGES / 1_000_000}M messages"
      end

      begin
        TestData::Constants::NUM_ACCOUNTS.times do |account_index|
          Rails.logger.info { "Processing account #{account_index + 1} of #{TestData::Constants::NUM_ACCOUNTS}" }
          process_account(account_index)
        end
      ensure
        # Runs even after a partial failure, since some accounts may already
        # have been created with explicit ids.
        reset_account_id_sequence
      end

      Rails.logger.info { "========== ALL DONE! Created #{TestData::Constants::NUM_ACCOUNTS} accounts with distributed test data ==========" }
    end

    private

    # Simple value object to group generation parameters
    class DataGenerationParams
      attr_reader :account, :inboxes, :total_contacts_needed, :target_message_count, :display_id_tracker

      def initialize(account:, inboxes:, total_contacts_needed:, target_message_count:, display_id_tracker:)
        @account = account
        @inboxes = inboxes
        @total_contacts_needed = total_contacts_needed
        @target_message_count = target_message_count
        @display_id_tracker = display_id_tracker
      end
    end

    # 1. Remove existing data for old test accounts
    def cleanup_existing_data
      Rails.logger.info { 'Cleaning up existing test data...' }
      TestData::CleanupService.call
      Rails.logger.info { 'Cleanup complete' }
    end

    # 2. Find the max Account ID to avoid conflicts
    def set_start_id
      max_id = Account.maximum(:id) || 0
      @start_id = max_id + 1
      Rails.logger.info { "Setting start ID to #{@start_id}" }
    end

    # 3. Create an account, its inboxes, and some data
    def process_account(account_index)
      account_id = @start_id + account_index
      Rails.logger.info { "Creating account with ID #{account_id}" }
      account = TestData::AccountCreator.create!(account_id)

      inboxes = TestData::InboxCreator.create_for(account)
      target_messages = rand(TestData::Constants::MIN_MESSAGES..TestData::Constants::MAX_MESSAGES)
      avg_per_convo = rand(15..50)
      total_convos = (target_messages / avg_per_convo.to_f).ceil
      total_contacts = (total_convos / TestData::Constants::MAX_CONVERSATIONS_PER_CONTACT.to_f).ceil

      log_account_details(account, target_messages, total_contacts, total_convos)
      display_id_tracker = TestData::DisplayIdTracker.new(account: account)

      params = DataGenerationParams.new(
        account: account,
        inboxes: inboxes,
        total_contacts_needed: total_contacts,
        target_message_count: target_messages,
        display_id_tracker: display_id_tracker
      )

      Rails.logger.info { "Starting data generation for account ##{account.id}" }
      generate_data_for_account(params)
    end

    # Generates contacts in batches of at most BATCH_SIZE until either the
    # planned number of contacts exists or the account's message target has
    # been reached, whichever comes first. The message check happens between
    # batches, so the final total can exceed the target by up to one batch.
    def generate_data_for_account(params)
      contact_count = 0
      message_count = 0
      batch_number = 0

      while contact_count < params.total_contacts_needed && message_count < params.target_message_count
        batch_number += 1
        batch_size = [TestData::Constants::BATCH_SIZE, params.total_contacts_needed - contact_count].min
        Rails.logger.info { "Processing batch ##{batch_number} (#{batch_size} contacts) for account ##{params.account.id}" }

        batch_service = TestData::ContactBatchService.new(
          account: params.account,
          inboxes: params.inboxes,
          batch_size: batch_size,
          display_id_tracker: params.display_id_tracker
        )
        batch_created_messages = batch_service.generate!

        contact_count += batch_size
        message_count += batch_created_messages
      end

      Rails.logger.info { "==> Completed Account ##{params.account.id} with #{message_count} messages" }
    end

    # Accounts are inserted with explicit ids, which does not advance the
    # primary-key sequence in PostgreSQL. Realign it so accounts created
    # afterwards through the normal path do not collide with generated ids.
    def reset_account_id_sequence
      Rails.logger.info { 'Resetting accounts primary key sequence' }
      ActiveRecord::Base.connection.reset_pk_sequence!(Account.table_name)
    end

    def log_account_details(account, target_messages, total_contacts, total_convos)
      Rails.logger.info { "==> Account ##{account.id} plan: target of #{target_messages / 1_000_000.0}M messages" }
      Rails.logger.info { " Planning for #{total_contacts} contacts and #{total_convos} conversations" }
    end
  end
end