feat: Instrument captain (#12949)

Co-authored-by: aakashb95 <aakash@chatwoot.com>
This commit is contained in:
Aakash Bakhle
2025-11-28 15:12:55 +05:30
committed by GitHub
parent 92fc286ecb
commit 1ef945de7b
12 changed files with 346 additions and 116 deletions

View File

@@ -88,7 +88,7 @@ Metrics/ModuleLength:
Rails/HelperInstanceVariable:
Exclude:
- enterprise/app/helpers/captain/chat_helper.rb
- 'enterprise/app/helpers/captain/tool_execution_helper.rb'
Rails/ApplicationController:
Exclude:
- 'app/controllers/api/v1/widget/messages_controller.rb'

View File

@@ -1,94 +1,78 @@
module Captain::ChatHelper
include Integrations::LlmInstrumentation
include Captain::ToolExecutionHelper
def request_chat_completion
log_chat_completion_request
response = @client.chat(
parameters: {
model: @model,
messages: @messages,
tools: @tool_registry&.registered_tools || [],
response_format: { type: 'json_object' },
temperature: @assistant&.config&.[]('temperature').to_f || 1
}
)
handle_response(response)
with_agent_session do
response = instrument_llm_call(instrumentation_params) do
@client.chat(
parameters: chat_parameters
)
end
handle_response(response)
end
rescue StandardError => e
Rails.logger.error "#{self.class.name} Assistant: #{@assistant.id}, Error in chat completion: #{e}"
raise e
end
def instrumentation_params
{
span_name: "llm.captain.#{feature_name}",
account_id: resolved_account_id,
conversation_id: @conversation_id,
feature_name: feature_name,
model: @model,
messages: @messages,
temperature: temperature,
metadata: {
assistant_id: @assistant&.id
}
}
end
def chat_parameters
{
model: @model,
messages: @messages,
tools: @tool_registry&.registered_tools || [],
response_format: { type: 'json_object' },
temperature: temperature
}
end
def temperature
@assistant&.config&.[]('temperature').to_f || 1
end
def resolved_account_id
@account&.id || @assistant&.account_id
end
private
def handle_response(response)
Rails.logger.debug { "#{self.class.name} Assistant: #{@assistant.id}, Received response #{response}" }
message = response.dig('choices', 0, 'message')
if message['tool_calls']
process_tool_calls(message['tool_calls'])
else
message = JSON.parse(message['content'].strip)
persist_message(message, 'assistant')
message
end
# Ensures all LLM calls and tool executions within an agentic loop
# are grouped under a single trace/session in Langfuse.
#
# Without this guard, each recursive call to request_chat_completion
# (triggered by tool calls) would create a separate trace instead of
# nesting within the existing session span.
def with_agent_session(&)
already_active = @agent_session_active
return yield if already_active
@agent_session_active = true
instrument_agent_session(instrumentation_params, &)
ensure
@agent_session_active = false unless already_active
end
def process_tool_calls(tool_calls)
append_tool_calls(tool_calls)
tool_calls.each do |tool_call|
process_tool_call(tool_call)
end
request_chat_completion
end
def process_tool_call(tool_call)
arguments = JSON.parse(tool_call['function']['arguments'])
function_name = tool_call['function']['name']
tool_call_id = tool_call['id']
if @tool_registry.respond_to?(function_name)
execute_tool(function_name, arguments, tool_call_id)
else
process_invalid_tool_call(function_name, tool_call_id)
end
end
def execute_tool(function_name, arguments, tool_call_id)
persist_message(
{
content: I18n.t('captain.copilot.using_tool', function_name: function_name),
function_name: function_name
},
'assistant_thinking'
)
result = @tool_registry.send(function_name, arguments)
persist_message(
{
content: I18n.t('captain.copilot.completed_tool_call', function_name: function_name),
function_name: function_name
},
'assistant_thinking'
)
append_tool_response(result, tool_call_id)
end
def append_tool_calls(tool_calls)
@messages << {
role: 'assistant',
tool_calls: tool_calls
}
end
def process_invalid_tool_call(function_name, tool_call_id)
persist_message({ content: I18n.t('captain.copilot.invalid_tool_call'), function_name: function_name }, 'assistant_thinking')
append_tool_response(I18n.t('captain.copilot.tool_not_available'), tool_call_id)
end
def append_tool_response(content, tool_call_id)
@messages << {
role: 'tool',
tool_call_id: tool_call_id,
content: content
}
# Must be implemented by including class to identify the feature for instrumentation.
# Used for Langfuse tagging and span naming.
def feature_name
raise NotImplementedError, "#{self.class.name} must implement #feature_name"
end
def log_chat_completion_request

View File

@@ -0,0 +1,83 @@
# Shared plumbing for running LLM-requested tool calls inside an agentic
# chat loop (extracted from Captain::ChatHelper).
#
# Including classes are expected to provide (not visible here — verify):
#   - @tool_registry : responds to one method per registered tool
#   - @messages      : running chat transcript (array of role hashes)
#   - @assistant     : used only for log context
#   - #persist_message, #request_chat_completion, #instrument_tool_call
module Captain::ToolExecutionHelper
  private

  # Entry point for an OpenAI-style completion payload. Either continues the
  # tool loop (model asked for tools) or parses, persists and returns the
  # final assistant message as a Hash.
  def handle_response(response)
    Rails.logger.debug { "#{self.class.name} Assistant: #{@assistant.id}, Received response #{response}" }
    choice = response.dig('choices', 0, 'message')
    return process_tool_calls(choice['tool_calls']) if choice['tool_calls']

    parsed = JSON.parse(choice['content'].strip)
    persist_message(parsed, 'assistant')
    parsed
  end

  # Records the model's tool-call request on the transcript, executes each
  # requested call in order, then re-enters the chat loop so the model can
  # incorporate the results.
  def process_tool_calls(tool_calls)
    append_tool_calls(tool_calls)
    tool_calls.each do |tool_call|
      process_tool_call(tool_call)
    end
    request_chat_completion
  end

  # Routes one tool call to its implementation, or down the invalid-call
  # path when the registry does not expose the requested function.
  #
  # NOTE(review): respond_to? also matches methods inherited from Object
  # (`freeze`, `send`, ...), so a model-chosen name could invoke a non-tool
  # public method on @tool_registry — an explicit allowlist of registered
  # tool names would be tighter; verify against the registry implementation.
  def process_tool_call(tool_call)
    function = tool_call['function']
    function_name = function['name']
    parsed_args = JSON.parse(function['arguments'])
    call_id = tool_call['id']

    if @tool_registry.respond_to?(function_name)
      execute_tool(function_name, parsed_args, call_id)
    else
      process_invalid_tool_call(function_name, call_id)
    end
  end

  # Runs one tool end-to-end: announce it, invoke it, announce completion,
  # and push the result back onto the transcript for the model.
  def execute_tool(function_name, arguments, tool_call_id)
    persist_tool_status(function_name, 'captain.copilot.using_tool')
    outcome = perform_tool_call(function_name, arguments)
    persist_tool_status(function_name, 'captain.copilot.completed_tool_call')
    append_tool_response(outcome, tool_call_id)
  end

  # Invokes the tool inside an instrumentation span. Failures are converted
  # into an error string rather than raised, so the model can see the
  # failure and factor it into its next response.
  def perform_tool_call(function_name, arguments)
    instrument_tool_call(function_name, arguments) do
      @tool_registry.send(function_name, arguments)
    end
  rescue StandardError => e
    Rails.logger.error "Tool #{function_name} failed: #{e.message}"
    "Error executing #{function_name}: #{e.message}"
  end

  # Persists a transient "thinking" message describing tool progress.
  # translation_key selects the I18n copy (using_tool / completed_tool_call).
  def persist_tool_status(function_name, translation_key)
    status = {
      content: I18n.t(translation_key, function_name: function_name),
      function_name: function_name
    }
    persist_message(status, 'assistant_thinking')
  end

  # Mirrors the model's tool-call request into the transcript; the OpenAI
  # chat format requires this assistant message before tool-role replies.
  def append_tool_calls(tool_calls)
    @messages << { role: 'assistant', tool_calls: tool_calls }
  end

  # Surfaces an unknown-tool request to the user and feeds a "not available"
  # result back to the model so the loop can continue.
  def process_invalid_tool_call(function_name, tool_call_id)
    notice = { content: I18n.t('captain.copilot.invalid_tool_call'), function_name: function_name }
    persist_message(notice, 'assistant_thinking')
    append_tool_response(I18n.t('captain.copilot.tool_not_available'), tool_call_id)
  end

  # Appends a tool-role message so the model receives the tool's output on
  # the next completion request.
  def append_tool_response(content, tool_call_id)
    @messages << {
      role: 'tool',
      tool_call_id: tool_call_id,
      content: content
    }
  end
end

View File

@@ -30,7 +30,7 @@ class Captain::Conversation::ResponseBuilderJob < ApplicationJob
delegate :account, :inbox, to: :@conversation
def generate_and_process_response
@response = Captain::Llm::AssistantChatService.new(assistant: @assistant).generate_response(
@response = Captain::Llm::AssistantChatService.new(assistant: @assistant, conversation_id: @conversation.id).generate_response(
message_history: collect_previous_messages
)
process_response

View File

@@ -15,11 +15,14 @@ class Captain::Copilot::ResponseJob < ApplicationJob
private
def generate_chat_response(assistant:, conversation_id:, user_id:, copilot_thread_id:, message:)
Captain::Copilot::ChatService.new(
service = Captain::Copilot::ChatService.new(
assistant,
user_id: user_id,
copilot_thread_id: copilot_thread_id,
conversation_id: conversation_id
).generate_response(message)
)
# When using copilot_thread, message is already in previous_history
# Pass nil to avoid duplicate
service.generate_response(copilot_thread_id.present? ? nil : message)
end
end

View File

@@ -13,6 +13,7 @@ class Captain::Copilot::ChatService < Llm::BaseOpenAiService
@user = nil
@copilot_thread = nil
@previous_history = []
@conversation_id = config[:conversation_id]
setup_user(config)
setup_message_history(config)
register_tools
@@ -113,4 +114,8 @@ class Captain::Copilot::ChatService < Llm::BaseOpenAiService
message_type: message_type
)
end
def feature_name
'copilot'
end
end

View File

@@ -3,10 +3,11 @@ require 'openai'
class Captain::Llm::AssistantChatService < Llm::BaseOpenAiService
include Captain::ChatHelper
def initialize(assistant: nil)
def initialize(assistant: nil, conversation_id: nil)
super()
@assistant = assistant
@conversation_id = conversation_id
@messages = [system_message]
@response = ''
register_tools
@@ -42,4 +43,8 @@ class Captain::Llm::AssistantChatService < Llm::BaseOpenAiService
def persist_message(message, message_type = 'assistant')
# No need to implement
end
def feature_name
'assistant'
end
end

View File

@@ -1,28 +1,10 @@
# frozen_string_literal: true
require 'opentelemetry_config'
require_relative 'llm_instrumentation_constants'
module Integrations::LlmInstrumentation
# OpenTelemetry attribute names following GenAI semantic conventions
# https://opentelemetry.io/docs/specs/semconv/gen-ai/
ATTR_GEN_AI_PROVIDER = 'gen_ai.provider.name'
ATTR_GEN_AI_REQUEST_MODEL = 'gen_ai.request.model'
ATTR_GEN_AI_REQUEST_TEMPERATURE = 'gen_ai.request.temperature'
ATTR_GEN_AI_PROMPT_ROLE = 'gen_ai.prompt.%d.role'
ATTR_GEN_AI_PROMPT_CONTENT = 'gen_ai.prompt.%d.content'
ATTR_GEN_AI_COMPLETION_ROLE = 'gen_ai.completion.0.role'
ATTR_GEN_AI_COMPLETION_CONTENT = 'gen_ai.completion.0.content'
ATTR_GEN_AI_USAGE_INPUT_TOKENS = 'gen_ai.usage.input_tokens'
ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = 'gen_ai.usage.output_tokens'
ATTR_GEN_AI_USAGE_TOTAL_TOKENS = 'gen_ai.usage.total_tokens'
ATTR_GEN_AI_RESPONSE_ERROR = 'gen_ai.response.error'
ATTR_GEN_AI_RESPONSE_ERROR_CODE = 'gen_ai.response.error_code'
# Langfuse-specific attributes
# https://langfuse.com/integrations/native/opentelemetry#property-mapping
ATTR_LANGFUSE_USER_ID = 'langfuse.user.id'
ATTR_LANGFUSE_SESSION_ID = 'langfuse.session.id'
ATTR_LANGFUSE_TAGS = 'langfuse.trace.tags'
include Integrations::LlmInstrumentationConstants
def tracer
@tracer ||= OpentelemetryConfig.tracer
@@ -42,6 +24,37 @@ module Integrations::LlmInstrumentation
yield
end
def instrument_agent_session(params)
return yield unless ChatwootApp.otel_enabled?
tracer.in_span(params[:span_name]) do |span|
set_metadata_attributes(span, params)
# By default, the input and output of a trace are set from the root observation
span.set_attribute(ATTR_LANGFUSE_OBSERVATION_INPUT, params[:messages].to_json)
result = yield
span.set_attribute(ATTR_LANGFUSE_OBSERVATION_OUTPUT, result.to_json)
result
end
rescue StandardError => e
ChatwootExceptionTracker.new(e, account: params[:account]).capture_exception
yield
end
def instrument_tool_call(tool_name, arguments)
# There is no error handling because tools can fail and LLMs should be
# aware of those failures and factor them into their response.
return yield unless ChatwootApp.otel_enabled?
tracer.in_span(format(TOOL_SPAN_NAME, tool_name)) do |span|
span.set_attribute(ATTR_LANGFUSE_OBSERVATION_INPUT, arguments.to_json)
result = yield
span.set_attribute(ATTR_LANGFUSE_OBSERVATION_OUTPUT, result.to_json)
result
end
end
private
def setup_span_attributes(span, params)
@@ -62,8 +75,11 @@ module Integrations::LlmInstrumentation
def set_prompt_messages(span, messages)
messages.each_with_index do |msg, idx|
span.set_attribute(format(ATTR_GEN_AI_PROMPT_ROLE, idx), msg['role'])
span.set_attribute(format(ATTR_GEN_AI_PROMPT_CONTENT, idx), msg['content'])
role = msg[:role] || msg['role']
content = msg[:content] || msg['content']
span.set_attribute(format(ATTR_GEN_AI_PROMPT_ROLE, idx), role)
span.set_attribute(format(ATTR_GEN_AI_PROMPT_CONTENT, idx), content.to_s)
end
end
@@ -72,6 +88,12 @@ module Integrations::LlmInstrumentation
span.set_attribute(ATTR_LANGFUSE_USER_ID, params[:account_id].to_s) if params[:account_id]
span.set_attribute(ATTR_LANGFUSE_SESSION_ID, session_id) if session_id.present?
span.set_attribute(ATTR_LANGFUSE_TAGS, [params[:feature_name]].to_json)
return unless params[:metadata].is_a?(Hash)
params[:metadata].each do |key, value|
span.set_attribute(format(ATTR_LANGFUSE_METADATA, key), value.to_s)
end
end
def set_completion_attributes(span, result)
@@ -81,26 +103,29 @@ module Integrations::LlmInstrumentation
end
def set_completion_message(span, result)
return if result[:message].blank?
message = result[:message] || result.dig('choices', 0, 'message', 'content')
return if message.blank?
span.set_attribute(ATTR_GEN_AI_COMPLETION_ROLE, 'assistant')
span.set_attribute(ATTR_GEN_AI_COMPLETION_CONTENT, result[:message])
span.set_attribute(ATTR_GEN_AI_COMPLETION_CONTENT, message)
end
def set_usage_metrics(span, result)
return if result[:usage].blank?
usage = result[:usage] || result['usage']
return if usage.blank?
usage = result[:usage]
span.set_attribute(ATTR_GEN_AI_USAGE_INPUT_TOKENS, usage['prompt_tokens']) if usage['prompt_tokens']
span.set_attribute(ATTR_GEN_AI_USAGE_OUTPUT_TOKENS, usage['completion_tokens']) if usage['completion_tokens']
span.set_attribute(ATTR_GEN_AI_USAGE_TOTAL_TOKENS, usage['total_tokens']) if usage['total_tokens']
end
def set_error_attributes(span, result)
return if result[:error].blank?
error = result[:error] || result['error']
return if error.blank?
span.set_attribute(ATTR_GEN_AI_RESPONSE_ERROR, result[:error].to_json)
span.set_attribute(ATTR_GEN_AI_RESPONSE_ERROR_CODE, result[:error_code]) if result[:error_code]
span.status = OpenTelemetry::Trace::Status.error("API Error: #{result[:error_code]}")
error_code = result[:error_code] || result['error_code']
span.set_attribute(ATTR_GEN_AI_RESPONSE_ERROR, error.to_json)
span.set_attribute(ATTR_GEN_AI_RESPONSE_ERROR_CODE, error_code) if error_code
span.status = OpenTelemetry::Trace::Status.error("API Error: #{error_code}")
end
end

View File

@@ -0,0 +1,31 @@
# frozen_string_literal: true

# Attribute-name constants for LLM observability spans, shared via
# Integrations::LlmInstrumentation. Extracted to their own module so the
# instrumentation mixin stays focused on behavior.
module Integrations::LlmInstrumentationConstants
  # OpenTelemetry attribute names following GenAI semantic conventions
  # https://opentelemetry.io/docs/specs/semconv/gen-ai/
  ATTR_GEN_AI_PROVIDER = 'gen_ai.provider.name'
  ATTR_GEN_AI_REQUEST_MODEL = 'gen_ai.request.model'
  ATTR_GEN_AI_REQUEST_TEMPERATURE = 'gen_ai.request.temperature'
  # Indexed per prompt message: filled via format(..., idx).
  ATTR_GEN_AI_PROMPT_ROLE = 'gen_ai.prompt.%d.role'
  ATTR_GEN_AI_PROMPT_CONTENT = 'gen_ai.prompt.%d.content'
  # Only the first completion choice is recorded (fixed index 0).
  ATTR_GEN_AI_COMPLETION_ROLE = 'gen_ai.completion.0.role'
  ATTR_GEN_AI_COMPLETION_CONTENT = 'gen_ai.completion.0.content'
  ATTR_GEN_AI_USAGE_INPUT_TOKENS = 'gen_ai.usage.input_tokens'
  ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = 'gen_ai.usage.output_tokens'
  ATTR_GEN_AI_USAGE_TOTAL_TOKENS = 'gen_ai.usage.total_tokens'
  ATTR_GEN_AI_RESPONSE_ERROR = 'gen_ai.response.error'
  ATTR_GEN_AI_RESPONSE_ERROR_CODE = 'gen_ai.response.error_code'
  # Span name template for tool executions: filled via format(..., tool_name).
  TOOL_SPAN_NAME = 'tool.%s'

  # Langfuse-specific attributes
  # https://langfuse.com/integrations/native/opentelemetry#property-mapping
  ATTR_LANGFUSE_USER_ID = 'langfuse.user.id'
  ATTR_LANGFUSE_SESSION_ID = 'langfuse.session.id'
  ATTR_LANGFUSE_TAGS = 'langfuse.trace.tags'
  # Per-key metadata attribute: filled via format(..., metadata_key).
  ATTR_LANGFUSE_METADATA = 'langfuse.trace.metadata.%s'
  ATTR_LANGFUSE_TRACE_INPUT = 'langfuse.trace.input'
  ATTR_LANGFUSE_TRACE_OUTPUT = 'langfuse.trace.output'
  ATTR_LANGFUSE_OBSERVATION_INPUT = 'langfuse.observation.input'
  ATTR_LANGFUSE_OBSERVATION_OUTPUT = 'langfuse.observation.output'
end

View File

@@ -28,7 +28,7 @@ RSpec.describe Captain::Conversation::ResponseBuilderJob, type: :job do
end
it 'uses Captain::Llm::AssistantChatService' do
expect(Captain::Llm::AssistantChatService).to receive(:new).with(assistant: assistant)
expect(Captain::Llm::AssistantChatService).to receive(:new).with(assistant: assistant, conversation_id: conversation.id)
expect(Captain::Assistant::AgentRunnerService).not_to receive(:new)
described_class.perform_now(conversation, assistant)

View File

@@ -18,7 +18,9 @@ RSpec.describe Captain::Copilot::ResponseJob, type: :job do
copilot_thread_id: copilot_thread.id,
conversation_id: conversation_id
).and_return(chat_service)
allow(chat_service).to receive(:generate_response).with(message)
# When copilot_thread_id is present, message is already in previous_history
# so nil is passed to avoid duplicate
allow(chat_service).to receive(:generate_response).with(nil)
end
it 'initializes ChatService with correct parameters and calls generate_response' do
@@ -28,7 +30,9 @@ RSpec.describe Captain::Copilot::ResponseJob, type: :job do
copilot_thread_id: copilot_thread.id,
conversation_id: conversation_id
)
expect(chat_service).to receive(:generate_response).with(message)
# Message is already persisted in copilot_thread.previous_history,
# so we pass nil to prevent duplicate user messages
expect(chat_service).to receive(:generate_response).with(nil)
described_class.perform_now(
assistant: assistant,
conversation_id: conversation_id,

View File

@@ -213,5 +213,95 @@ RSpec.describe Integrations::LlmInstrumentation do
expect(OpenTelemetry::Trace::Status).to have_received(:error).with('API Error: rate_limit_exceeded')
end
end
# Covers the session-level span that groups an entire agentic loop.
# Relies on outer lets (not visible here): `instance`, `params`, `otel_config`.
describe '#instrument_agent_session' do
  context 'when OTEL provider is not configured' do
    # Blank provider value disables tracing (ChatwootApp.otel_enabled? is false).
    before { otel_config.update(value: '') }

    it 'executes the block without tracing' do
      result = instance.instrument_agent_session(params) { 'my_result' }
      expect(result).to eq('my_result')
    end
  end

  context 'when OTEL provider is configured' do
    let(:mock_span) { instance_double(OpenTelemetry::Trace::Span) }
    let(:mock_tracer) { instance_double(OpenTelemetry::Trace::Tracer) }

    before do
      # Stub the tracer so no real OTEL exporter is needed; in_span yields
      # the mocked span to the instrumented block.
      allow(mock_span).to receive(:set_attribute)
      allow(instance).to receive(:tracer).and_return(mock_tracer)
      allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
    end

    it 'executes the block and returns the result' do
      result = instance.instrument_agent_session(params) { 'my_result' }
      expect(result).to eq('my_result')
    end

    # Instrumentation failures must never break the underlying LLM call:
    # instrument_agent_session rescues StandardError and re-yields.
    it 'returns the block result even if instrumentation has errors' do
      allow(mock_tracer).to receive(:in_span).and_raise(StandardError.new('Instrumentation failed'))
      result = instance.instrument_agent_session(params) { 'my_result' }
      expect(result).to eq('my_result')
    end

    it 'sets trace input and output attributes' do
      result_data = { content: 'AI response' }
      instance.instrument_agent_session(params) { result_data }
      # Root-observation input/output drive the Langfuse trace input/output.
      expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.input', params[:messages].to_json)
      expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.output', result_data.to_json)
    end
  end
end
# Covers the per-tool child span. Relies on outer lets (not visible here):
# `instance`, `otel_config`.
describe '#instrument_tool_call' do
  let(:tool_name) { 'search_documents' }
  let(:arguments) { { query: 'test query' } }

  context 'when OTEL provider is not configured' do
    # Blank provider value disables tracing entirely.
    before { otel_config.update(value: '') }

    it 'executes the block without tracing' do
      result = instance.instrument_tool_call(tool_name, arguments) { 'tool_result' }
      expect(result).to eq('tool_result')
    end
  end

  context 'when OTEL provider is configured' do
    let(:mock_span) { instance_double(OpenTelemetry::Trace::Span) }
    let(:mock_tracer) { instance_double(OpenTelemetry::Trace::Tracer) }

    before do
      # Stub the tracer so no real OTEL exporter is needed.
      allow(mock_span).to receive(:set_attribute)
      allow(instance).to receive(:tracer).and_return(mock_tracer)
      allow(mock_tracer).to receive(:in_span).and_yield(mock_span)
    end

    it 'executes the block and returns the result' do
      result = instance.instrument_tool_call(tool_name, arguments) { 'tool_result' }
      expect(result).to eq('tool_result')
    end

    # Unlike instrument_agent_session, instrument_tool_call has no rescue:
    # tool failures (and instrumentation failures) bubble up to the caller,
    # which handles them in perform_tool_call.
    it 'propagates instrumentation errors' do
      allow(mock_tracer).to receive(:in_span).and_raise(StandardError.new('Instrumentation failed'))
      expect do
        instance.instrument_tool_call(tool_name, arguments) { 'tool_result' }
      end.to raise_error(StandardError, 'Instrumentation failed')
    end

    it 'creates a span with tool name and sets observation attributes' do
      tool_result = { documents: ['doc1'] }
      instance.instrument_tool_call(tool_name, arguments) { tool_result }
      # Span name comes from the 'tool.%s' template; input/output are the
      # JSON-encoded arguments and result.
      expect(mock_tracer).to have_received(:in_span).with('tool.search_documents')
      expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.input', arguments.to_json)
      expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.output', tool_result.to_json)
    end
  end
end
end
end