From bddf06907b622068fb679665e76930a70fe0cd4e Mon Sep 17 00:00:00 2001 From: Aakash Bakhle <48802744+aakashb95@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:52:38 +0530 Subject: [PATCH] fix: double counting in langfuse instrumentation (#13202) --- .../captain/chat_generation_recorder.rb | 47 +++++++++++++++++++ enterprise/app/helpers/captain/chat_helper.rb | 8 +++- lib/integrations/llm_instrumentation.rb | 1 - .../integrations/llm_instrumentation_spec.rb | 11 +++++ 4 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 enterprise/app/helpers/captain/chat_generation_recorder.rb diff --git a/enterprise/app/helpers/captain/chat_generation_recorder.rb b/enterprise/app/helpers/captain/chat_generation_recorder.rb new file mode 100644 index 000000000..cd631fb16 --- /dev/null +++ b/enterprise/app/helpers/captain/chat_generation_recorder.rb @@ -0,0 +1,47 @@ +module Captain::ChatGenerationRecorder + extend ActiveSupport::Concern + include Integrations::LlmInstrumentationConstants + + private + + def record_llm_generation(chat, message) + return unless valid_llm_message?(message) + + # Create a generation span with model and token info for Langfuse cost calculation. + # Note: span duration will be near-zero since we create and end it immediately, but token counts are what Langfuse uses for cost calculation. + tracer.in_span("llm.captain.#{feature_name}.generation") do |span| + set_generation_span_attributes(span, chat, message) + end + rescue StandardError => e + Rails.logger.warn "Failed to record LLM generation: #{e.message}" + end + + # Skip non-LLM messages (e.g., tool results that RubyLLM processes internally). + # Check for assistant role rather than token presence - some providers/streaming modes + # may not return token counts, but we still want to capture the generation for evals. + def valid_llm_message?(message) + message.respond_to?(:role) && message.role.to_s == 'assistant' + end + + def set_generation_span_attributes(span, chat, message) + generation_attributes(chat, message).each do |key, value| + span.set_attribute(key, value) if value + end + end + + def generation_attributes(chat, message) + { + ATTR_GEN_AI_PROVIDER => determine_provider(model), + ATTR_GEN_AI_REQUEST_MODEL => model, + ATTR_GEN_AI_REQUEST_TEMPERATURE => temperature, + ATTR_GEN_AI_USAGE_INPUT_TOKENS => message.input_tokens, + ATTR_GEN_AI_USAGE_OUTPUT_TOKENS => message.respond_to?(:output_tokens) ? message.output_tokens : nil, + ATTR_LANGFUSE_OBSERVATION_INPUT => format_input_messages(chat), + ATTR_LANGFUSE_OBSERVATION_OUTPUT => message.respond_to?(:content) ? message.content.to_s : nil + } + end + + def format_input_messages(chat) + chat.messages[0...-1].map { |m| { role: m.role.to_s, content: m.content.to_s } }.to_json + end +end diff --git a/enterprise/app/helpers/captain/chat_helper.rb b/enterprise/app/helpers/captain/chat_helper.rb index 9bbbd86a5..9bdba5553 100644 --- a/enterprise/app/helpers/captain/chat_helper.rb +++ b/enterprise/app/helpers/captain/chat_helper.rb @@ -1,6 +1,7 @@ module Captain::ChatHelper include Integrations::LlmInstrumentation include Captain::ChatResponseHelper + include Captain::ChatGenerationRecorder def request_chat_completion log_chat_completion_request @@ -42,8 +43,11 @@ module Captain::ChatHelper end def setup_event_handlers(chat) - chat.on_new_message { start_llm_turn_span(instrumentation_params(chat)) } - chat.on_end_message { |message| end_llm_turn_span(message) } + # NOTE: We only use on_end_message to record the generation with token counts. + # RubyLLM callbacks fire after chunks arrive, not around the API call, so + # span timing won't reflect actual API latency. But Langfuse calculates costs + # from model + token counts, so this is sufficient for cost tracking. + chat.on_end_message { |message| record_llm_generation(chat, message) } chat.on_tool_call { |tool_call| handle_tool_call(tool_call) } chat.on_tool_result { |result| handle_tool_result(result) } diff --git a/lib/integrations/llm_instrumentation.rb b/lib/integrations/llm_instrumentation.rb index 34281c738..c3baf291e 100644 --- a/lib/integrations/llm_instrumentation.rb +++ b/lib/integrations/llm_instrumentation.rb @@ -30,7 +30,6 @@ module Integrations::LlmInstrumentation result = nil executed = false tracer.in_span(params[:span_name]) do |span| - set_request_attributes(span, params) set_metadata_attributes(span, params) # By default, the input and output of a trace are set from the root observation diff --git a/spec/lib/integrations/llm_instrumentation_spec.rb b/spec/lib/integrations/llm_instrumentation_spec.rb index 97a45a414..0be62f437 100644 --- a/spec/lib/integrations/llm_instrumentation_spec.rb +++ b/spec/lib/integrations/llm_instrumentation_spec.rb @@ -252,6 +252,17 @@ RSpec.describe Integrations::LlmInstrumentation do expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.input', params[:messages].to_json) expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.output', result_data.to_json) end + + # Regression test for Langfuse double-counting bug. + # Setting gen_ai.request.model on parent spans causes Langfuse to classify them as + # GENERATIONs instead of SPANs, resulting in cost being counted multiple times + # (once for the parent, once for each child GENERATION). + # See: https://github.com/langfuse/langfuse/issues/7549 + it 'does NOT set gen_ai.request.model to avoid being classified as a GENERATION' do + instance.instrument_agent_session(params) { 'result' } + + expect(mock_span).not_to have_received(:set_attribute).with('gen_ai.request.model', anything) + end end end end