From bddf06907b622068fb679665e76930a70fe0cd4e Mon Sep 17 00:00:00 2001
From: Aakash Bakhle <48802744+aakashb95@users.noreply.github.com>
Date: Tue, 13 Jan 2026 18:52:38 +0530
Subject: [PATCH] fix: double counting in langfuse instrumentation (#13202)

---
 .../captain/chat_generation_recorder.rb       | 47 +++++++++++++++++++
 enterprise/app/helpers/captain/chat_helper.rb |  8 +++-
 lib/integrations/llm_instrumentation.rb       |  1 -
 .../integrations/llm_instrumentation_spec.rb  | 11 +++++
 4 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 enterprise/app/helpers/captain/chat_generation_recorder.rb

diff --git a/enterprise/app/helpers/captain/chat_generation_recorder.rb b/enterprise/app/helpers/captain/chat_generation_recorder.rb
new file mode 100644
index 000000000..cd631fb16
--- /dev/null
+++ b/enterprise/app/helpers/captain/chat_generation_recorder.rb
@@ -0,0 +1,47 @@
+module Captain::ChatGenerationRecorder
+  extend ActiveSupport::Concern
+  include Integrations::LlmInstrumentationConstants
+
+  private
+
+  def record_llm_generation(chat, message)
+    return unless valid_llm_message?(message)
+
+    # Create a generation span with model and token info for Langfuse cost calculation.
+    # Note: span duration will be near-zero since we create and end it immediately, but token counts are what Langfuse uses for cost calculation.
+    tracer.in_span("llm.captain.#{feature_name}.generation") do |span|
+      set_generation_span_attributes(span, chat, message)
+    end
+  rescue StandardError => e
+    Rails.logger.warn "Failed to record LLM generation: #{e.message}"
+  end
+
+  # Skip non-LLM messages (e.g., tool results that RubyLLM processes internally).
+  # Check for assistant role rather than token presence - some providers/streaming modes
+  # may not return token counts, but we still want to capture the generation for evals.
+  def valid_llm_message?(message)
+    message.respond_to?(:role) && message.role.to_s == 'assistant'
+  end
+
+  def set_generation_span_attributes(span, chat, message)
+    generation_attributes(chat, message).each do |key, value|
+      span.set_attribute(key, value) if value
+    end
+  end
+
+  def generation_attributes(chat, message)
+    {
+      ATTR_GEN_AI_PROVIDER => determine_provider(model),
+      ATTR_GEN_AI_REQUEST_MODEL => model,
+      ATTR_GEN_AI_REQUEST_TEMPERATURE => temperature,
+      ATTR_GEN_AI_USAGE_INPUT_TOKENS => message.input_tokens,
+      ATTR_GEN_AI_USAGE_OUTPUT_TOKENS => message.respond_to?(:output_tokens) ? message.output_tokens : nil,
+      ATTR_LANGFUSE_OBSERVATION_INPUT => format_input_messages(chat),
+      ATTR_LANGFUSE_OBSERVATION_OUTPUT => message.respond_to?(:content) ? message.content.to_s : nil
+    }
+  end
+
+  def format_input_messages(chat)
+    chat.messages[0...-1].map { |m| { role: m.role.to_s, content: m.content.to_s } }.to_json
+  end
+end
diff --git a/enterprise/app/helpers/captain/chat_helper.rb b/enterprise/app/helpers/captain/chat_helper.rb
index 9bbbd86a5..9bdba5553 100644
--- a/enterprise/app/helpers/captain/chat_helper.rb
+++ b/enterprise/app/helpers/captain/chat_helper.rb
@@ -1,6 +1,7 @@
 module Captain::ChatHelper
   include Integrations::LlmInstrumentation
   include Captain::ChatResponseHelper
+  include Captain::ChatGenerationRecorder
 
   def request_chat_completion
     log_chat_completion_request
@@ -42,8 +43,11 @@ module Captain::ChatHelper
   end
 
   def setup_event_handlers(chat)
-    chat.on_new_message { start_llm_turn_span(instrumentation_params(chat)) }
-    chat.on_end_message { |message| end_llm_turn_span(message) }
+    # NOTE: We only use on_end_message to record the generation with token counts.
+    # RubyLLM callbacks fire after chunks arrive, not around the API call, so
+    # span timing won't reflect actual API latency. But Langfuse calculates costs
+    # from model + token counts, so this is sufficient for cost tracking.
+    chat.on_end_message { |message| record_llm_generation(chat, message) }
     chat.on_tool_call { |tool_call| handle_tool_call(tool_call) }
     chat.on_tool_result { |result| handle_tool_result(result) }
 
diff --git a/lib/integrations/llm_instrumentation.rb b/lib/integrations/llm_instrumentation.rb
index 34281c738..c3baf291e 100644
--- a/lib/integrations/llm_instrumentation.rb
+++ b/lib/integrations/llm_instrumentation.rb
@@ -30,7 +30,6 @@ module Integrations::LlmInstrumentation
     result = nil
     executed = false
     tracer.in_span(params[:span_name]) do |span|
-      set_request_attributes(span, params)
       set_metadata_attributes(span, params)
 
       # By default, the input and output of a trace are set from the root observation
diff --git a/spec/lib/integrations/llm_instrumentation_spec.rb b/spec/lib/integrations/llm_instrumentation_spec.rb
index 97a45a414..0be62f437 100644
--- a/spec/lib/integrations/llm_instrumentation_spec.rb
+++ b/spec/lib/integrations/llm_instrumentation_spec.rb
@@ -252,6 +252,17 @@ RSpec.describe Integrations::LlmInstrumentation do
           expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.input', params[:messages].to_json)
           expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.output', result_data.to_json)
         end
+
+        # Regression test for Langfuse double-counting bug.
+        # Setting gen_ai.request.model on parent spans causes Langfuse to classify them as
+        # GENERATIONs instead of SPANs, resulting in cost being counted multiple times
+        # (once for the parent, once for each child GENERATION).
+        # See: https://github.com/langfuse/langfuse/issues/7549
+        it 'does NOT set gen_ai.request.model to avoid being classified as a GENERATION' do
+          instance.instrument_agent_session(params) { 'result' }
+
+          expect(mock_span).not_to have_received(:set_attribute).with('gen_ai.request.model', anything)
+        end
       end
     end
   end