fix: double counting in langfuse instrumentation (#13202)
This commit is contained in:
47
enterprise/app/helpers/captain/chat_generation_recorder.rb
Normal file
47
enterprise/app/helpers/captain/chat_generation_recorder.rb
Normal file
@@ -0,0 +1,47 @@
|
||||
# Records an LLM "generation" observation so Langfuse can compute token cost.
#
# Host classes are expected to provide: +tracer+, +feature_name+, +model+,
# +temperature+ and +determine_provider+ (supplied by the instrumentation
# concern this module is included alongside).
module Captain::ChatGenerationRecorder
  extend ActiveSupport::Concern
  include Integrations::LlmInstrumentationConstants

  private

  # Emit a generation span carrying model + token usage for the given
  # assistant +message+ of +chat+. Instrumentation must never break the chat
  # flow, so any failure is logged and swallowed.
  def record_llm_generation(chat, message)
    return unless valid_llm_message?(message)

    # Create a generation span with model and token info for Langfuse cost calculation.
    # Note: span duration will be near-zero since we create and end it immediately,
    # but token counts are what Langfuse uses for cost calculation.
    tracer.in_span("llm.captain.#{feature_name}.generation") do |span|
      set_generation_span_attributes(span, chat, message)
    end
  rescue StandardError => e
    Rails.logger.warn "Failed to record LLM generation: #{e.message}"
  end

  # Skip non-LLM messages (e.g., tool results that RubyLLM processes internally).
  # Check for assistant role rather than token presence - some providers/streaming modes
  # may not return token counts, but we still want to capture the generation for evals.
  def valid_llm_message?(message)
    message.respond_to?(:role) && message.role.to_s == 'assistant'
  end

  # Copy each non-nil generation attribute onto +span+ (nil values are
  # skipped so absent token counts don't produce empty attributes).
  def set_generation_span_attributes(span, chat, message)
    generation_attributes(chat, message).each do |key, value|
      span.set_attribute(key, value) if value
    end
  end

  # Attribute hash for the generation span. Every optional accessor on
  # +message+ is guarded with respond_to?: some providers/streaming modes omit
  # token counts, and previously only output_tokens was guarded, so a message
  # without input_tokens raised NoMethodError and the rescue in
  # record_llm_generation silently dropped the entire generation record.
  def generation_attributes(chat, message)
    {
      ATTR_GEN_AI_PROVIDER => determine_provider(model),
      ATTR_GEN_AI_REQUEST_MODEL => model,
      ATTR_GEN_AI_REQUEST_TEMPERATURE => temperature,
      ATTR_GEN_AI_USAGE_INPUT_TOKENS => message.respond_to?(:input_tokens) ? message.input_tokens : nil,
      ATTR_GEN_AI_USAGE_OUTPUT_TOKENS => message.respond_to?(:output_tokens) ? message.output_tokens : nil,
      ATTR_LANGFUSE_OBSERVATION_INPUT => format_input_messages(chat),
      ATTR_LANGFUSE_OBSERVATION_OUTPUT => message.respond_to?(:content) ? message.content.to_s : nil
    }
  end

  # Serialize every message except the last (the assistant reply being
  # recorded) as the observation input for Langfuse.
  def format_input_messages(chat)
    chat.messages[0...-1].map { |m| { role: m.role.to_s, content: m.content.to_s } }.to_json
  end
end
|
||||
@@ -1,6 +1,7 @@
|
||||
module Captain::ChatHelper
|
||||
include Integrations::LlmInstrumentation
|
||||
include Captain::ChatResponseHelper
|
||||
include Captain::ChatGenerationRecorder
|
||||
|
||||
def request_chat_completion
|
||||
log_chat_completion_request
|
||||
@@ -42,8 +43,11 @@ module Captain::ChatHelper
|
||||
end
|
||||
|
||||
def setup_event_handlers(chat)
|
||||
chat.on_new_message { start_llm_turn_span(instrumentation_params(chat)) }
|
||||
chat.on_end_message { |message| end_llm_turn_span(message) }
|
||||
# NOTE: We only use on_end_message to record the generation with token counts.
|
||||
# RubyLLM callbacks fire after chunks arrive, not around the API call, so
|
||||
# span timing won't reflect actual API latency. But Langfuse calculates costs
|
||||
# from model + token counts, so this is sufficient for cost tracking.
|
||||
chat.on_end_message { |message| record_llm_generation(chat, message) }
|
||||
chat.on_tool_call { |tool_call| handle_tool_call(tool_call) }
|
||||
chat.on_tool_result { |result| handle_tool_result(result) }
|
||||
|
||||
|
||||
@@ -30,7 +30,6 @@ module Integrations::LlmInstrumentation
|
||||
result = nil
|
||||
executed = false
|
||||
tracer.in_span(params[:span_name]) do |span|
|
||||
set_request_attributes(span, params)
|
||||
set_metadata_attributes(span, params)
|
||||
|
||||
# By default, the input and output of a trace are set from the root observation
|
||||
|
||||
@@ -252,6 +252,17 @@ RSpec.describe Integrations::LlmInstrumentation do
|
||||
expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.input', params[:messages].to_json)
|
||||
expect(mock_span).to have_received(:set_attribute).with('langfuse.observation.output', result_data.to_json)
|
||||
end
|
||||
|
||||
# Regression test for Langfuse double-counting bug.
|
||||
# Setting gen_ai.request.model on parent spans causes Langfuse to classify them as
|
||||
# GENERATIONs instead of SPANs, resulting in cost being counted multiple times
|
||||
# (once for the parent, once for each child GENERATION).
|
||||
# See: https://github.com/langfuse/langfuse/issues/7549
|
||||
it 'does NOT set gen_ai.request.model to avoid being classified as a GENERATION' do
|
||||
instance.instrument_agent_session(params) { 'result' }
|
||||
|
||||
expect(mock_span).not_to have_received(:set_attribute).with('gen_ai.request.model', anything)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user