From 68f0b66a85c911aab2ea48b00b16fce477497744 Mon Sep 17 00:00:00 2001 From: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:00:40 -0500 Subject: [PATCH 1/3] update confidence -> trustworthiness --- tests/integration/test_inference.py | 2 +- tlm/api.py | 4 ++-- tlm/components/__init__.py | 4 ++-- tlm/components/response_assembly.py | 18 +++++++++--------- ...py => trustworthiness_score_computation.py} | 17 +++++++++++------ tlm/inference.py | 6 +++--- tlm/pipeline/factory.py | 8 ++++---- tlm/utils/explainability_utils.py | 9 ++++++--- tlm/utils/scoring/confidence_scoring_utils.py | 4 ++-- 9 files changed, 40 insertions(+), 32 deletions(-) rename tlm/components/scores/{confidence_score_computation.py => trustworthiness_score_computation.py} (69%) diff --git a/tests/integration/test_inference.py b/tests/integration/test_inference.py index dc0a6fd..6a99d67 100644 --- a/tests/integration/test_inference.py +++ b/tests/integration/test_inference.py @@ -67,7 +67,7 @@ async def run_inference_test(kwargs: dict, enabled=True) -> bool: response_str = response["response"] print(f" - Response: {response_str}") - print(f" - Confidence score: {response['confidence_score']}") + print(f" - Confidence score: {response['trustworthiness_score']}") print(f" - Usage: {response['usage']}") print(f" - Metadata: {response['metadata']}") print(f" - RAG evals: {response['evals']}") diff --git a/tlm/api.py b/tlm/api.py index 6278bc6..a2305b3 100644 --- a/tlm/api.py +++ b/tlm/api.py @@ -83,7 +83,7 @@ def create( Returns: InferenceResult object containing: - response: The generated response (string or dict for structured outputs) - - confidence_score: Confidence score between 0 and 1 + - trustworthiness_score: Confidence score between 0 and 1 - usage: Token usage information - metadata: Additional metadata (e.g., per-field scores for structured outputs) - evals: Dictionary of additional evaluation scores (if evals are provided) @@ -123,7 +123,7 @@ def score( Returns: InferenceResult containing: - response: The original response (preserved from input) - - confidence_score: Confidence score between 0 and 1 + - trustworthiness_score: Confidence score between 0 and 1 - usage: Token usage information - metadata: Additional metadata (e.g., per-field scores for structured outputs) - evals: Dictionary of additional evaluation scores (if evals are provided) diff --git a/tlm/components/__init__.py b/tlm/components/__init__.py index c1f37d8..b29189e 100644 --- a/tlm/components/__init__.py +++ b/tlm/components/__init__.py @@ -5,7 +5,7 @@ from .completions.self_reflection_completion_generator import SelfReflectionCompletionGenerator from .semantic_evaluation_score_generator import SemanticEvaluationScoreGenerator from .response_assembly import ResponseAssembly -from .scores.confidence_score_computation import ConfidenceScoreComputation +from .scores.trustworthiness_score_computation import TrustworthinessScoreComputation from .scores.consistency_score_computation import ConsistencyScoreComputation from .scores.perplexity_score_computation import PerplexityScoreComputation from .scores.prompt_evaluation_score_extraction import PromptEvaluationScoreExtraction @@ -18,7 +18,7 @@ "ObservedConsistencyCompletionGenerator", "SelfReflectionCompletionGenerator", "ConsistencyScoreComputation", - "ConfidenceScoreComputation", + "TrustworthinessScoreComputation", "PerplexityScoreComputation", "SelfReflectionScoreComputation", "ResponseAssembly", diff --git a/tlm/components/response_assembly.py b/tlm/components/response_assembly.py index dbd3cf6..5334629 100644 --- a/tlm/components/response_assembly.py +++ b/tlm/components/response_assembly.py @@ -30,24 +30,24 @@ def __init__( super().__init__(depends_on=depends_on) async def execute(self) -> None: - confidence_scores = self.execution_context.get("confidence_scores") + trustworthiness_scores = self.execution_context.get("trustworthiness_scores") reference_answers = self.execution_context.get("reference_answers") reference_completions: list[Completion] = self.execution_context.get("reference_completions") best_answer_idx: int - if np.isnan(confidence_scores).all(): + if np.isnan(trustworthiness_scores).all(): best_answer_idx = 0 - average_confidence_score = None + average_trustworthiness_score = None else: - best_answer_idx = np.nanargmax(confidence_scores, axis=0) - average_confidence_score = np.nanmean(confidence_scores) + best_answer_idx = np.nanargmax(trustworthiness_scores, axis=0) + average_trustworthiness_score = np.nanmean(trustworthiness_scores) best_answer = reference_answers[best_answer_idx] best_completion = reference_completions[best_answer_idx] - if average_confidence_score is not None: - make_score_asymptotic(average_confidence_score) + if average_trustworthiness_score is not None: + make_score_asymptotic(average_trustworthiness_score) self.execution_context.add("best_answer_idx", best_answer_idx) @@ -56,7 +56,7 @@ async def execute(self) -> None: else: self.execution_context.add("best_response", get_cleaned_chat_completion(best_completion)) - self.execution_context.add("confidence_score", average_confidence_score) + self.execution_context.add("trustworthiness_score", average_trustworthiness_score) if self.inference_type == InferenceType.PROMPT: if best_completion.usage is None: @@ -92,7 +92,7 @@ async def execute(self) -> None: mean_consistency_score = float(np.nanmean(consistency_scores)) explainability_message = get_explainability_message( - average_confidence_score, + average_trustworthiness_score, self_reflection_completions, observed_consistency_completions, mean_consistency_score, diff --git a/tlm/components/scores/confidence_score_computation.py b/tlm/components/scores/trustworthiness_score_computation.py similarity index 69% rename from tlm/components/scores/confidence_score_computation.py rename to tlm/components/scores/trustworthiness_score_computation.py index 9bab32a..d372774 100644 --- a/tlm/components/scores/confidence_score_computation.py +++ b/tlm/components/scores/trustworthiness_score_computation.py @@ -3,13 +3,18 @@ from tlm.components import Component from tlm.config.presets import WorkflowType -from tlm.utils.scoring.confidence_scoring_utils import get_confidence_scores +from tlm.utils.scoring.confidence_scoring_utils import get_trustworthiness_scores logger = logging.getLogger(__name__) -class ConfidenceScoreComputation(Component): - def __init__(self, workflow_type: WorkflowType, model: str, depends_on: list[Component] | None = None): +class TrustworthinessScoreComputation(Component): + def __init__( + self, + workflow_type: WorkflowType, + model: str, + depends_on: list[Component] | None = None, + ): self.workflow_type = workflow_type self.model = model super().__init__(depends_on=depends_on) @@ -22,7 +27,7 @@ async def execute(self): use_perplexity_score = self.execution_context.get("use_perplexity_score") prompt_evaluation_scores = self.execution_context.get("prompt_evaluation_scores", []) - confidence_scores = get_confidence_scores( + trustworthiness_scores = get_trustworthiness_scores( self.workflow_type, self.model, consistency_scores, @@ -33,6 +38,6 @@ async def execute(self): prompt_evaluation_scores, ) - logger.info(f"Calculated confidence scores: {confidence_scores}") + logger.info(f"Calculated trustworthiness scores: {trustworthiness_scores}") - self.execution_context.add("confidence_scores", confidence_scores) + self.execution_context.add("trustworthiness_scores", trustworthiness_scores) diff --git a/tlm/inference.py b/tlm/inference.py index f6d71ce..92e1d45 100644 --- a/tlm/inference.py +++ b/tlm/inference.py @@ -9,7 +9,7 @@ class InferenceResult(TypedDict): response: str | dict[str, Any] # either a response string or OpenAI chat completion dict - confidence_score: float + trustworthiness_score: float usage: dict[str, Any] metadata: dict[str, Any] | None evals: dict[str, float] | None @@ -37,7 +37,7 @@ async def tlm_inference( results = await pipeline.run() best_response = results["best_response"] - confidence_score = results["confidence_score"] + trustworthiness_score = results["trustworthiness_score"] usage = results.get("usage", {}) explanation = results.get("explanation") evals_not_requiring_response: dict[str, float] = results.get("evals_not_requiring_response", {}) @@ -48,7 +48,7 @@ async def tlm_inference( return InferenceResult( response=best_response, - confidence_score=confidence_score, + trustworthiness_score=trustworthiness_score, usage=usage, metadata=metadata, evals={ diff --git a/tlm/pipeline/factory.py b/tlm/pipeline/factory.py index ca284af..17361e4 100644 --- a/tlm/pipeline/factory.py +++ b/tlm/pipeline/factory.py @@ -1,7 +1,7 @@ from typing import Any, Dict from tlm.components import ( - ConfidenceScoreComputation, + TrustworthinessScoreComputation, ConsistencyScoreComputation, ObservedConsistencyCompletionGenerator, PerplexityScoreComputation, @@ -140,8 +140,8 @@ def create( else: prompt_evaluation_score_extraction = None - confidence_score_computation = pipeline.add( - ConfidenceScoreComputation( + trustworthiness_score_computation = pipeline.add( + TrustworthinessScoreComputation( workflow_type=config.workflow_type, model=config.model, depends_on=[ @@ -165,7 +165,7 @@ def create( depends_on=[ component for component in [ - confidence_score_computation, + trustworthiness_score_computation, evals_not_requiring_response_generator, evals_requiring_response_generator, ] diff --git a/tlm/utils/explainability_utils.py b/tlm/utils/explainability_utils.py index f418ff7..ac119cc 100644 --- a/tlm/utils/explainability_utils.py +++ b/tlm/utils/explainability_utils.py @@ -10,7 +10,7 @@ def get_explainability_message( - average_confidence_score: float | None, + average_trustworthiness_score: float | None, self_reflection_completions: list[list[Completion]], observed_consistency_completions: list[Completion], average_consistency_score: float, @@ -20,10 +20,13 @@ def get_explainability_message( ) -> str: explainability_message = "" - if average_confidence_score is None: + if average_trustworthiness_score is None: return explainability_message - if not np.isnan(average_confidence_score) and average_confidence_score < defaults.EXPLAINABILITY_THRESHOLD: + if ( + not np.isnan(average_trustworthiness_score) + and average_trustworthiness_score < defaults.EXPLAINABILITY_THRESHOLD + ): self_reflection_completions_flat = [ completion for sublist in self_reflection_completions for completion in sublist ] diff --git a/tlm/utils/scoring/confidence_scoring_utils.py b/tlm/utils/scoring/confidence_scoring_utils.py index 8f3b87c..11835b4 100644 --- a/tlm/utils/scoring/confidence_scoring_utils.py +++ b/tlm/utils/scoring/confidence_scoring_utils.py @@ -19,7 +19,7 @@ def __init__(self, score: np.float64 | None, weight: float): self.weight = weight -def get_confidence_scores( +def get_trustworthiness_scores( workflow_type: WorkflowType, model: str, consistency_scores: npt.NDArray[np.float64], @@ -53,7 +53,7 @@ def _generate_total_scores( ) -> npt.NDArray[np.float64]: """Generates total score for each reference answer (row) in scores dataframe. - The weights used to calculate total score are different depending on if prompt or get_confidence_score is called and perplexity score is calculated or not. + The weights used to calculate total score are different depending on if prompt or get_trustworthiness_score is called and perplexity score is calculated or not. If just self reflection score couldn't be computed (value is nan), that value is omitted from the total score calculation. If just observed consistency score couldn't be computed (value is nan), that value is omitted from the total score calculation. From aec3429b430504b5c246651d9c617aa1999db184 Mon Sep 17 00:00:00 2001 From: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:04:39 -0500 Subject: [PATCH 2/3] filename --- .../scores/trustworthiness_score_computation.py | 2 +- ...tils.py => trustworthiness_scoring_utils.py} | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) rename tlm/utils/scoring/{confidence_scoring_utils.py => trustworthiness_scoring_utils.py} (94%) diff --git a/tlm/components/scores/trustworthiness_score_computation.py b/tlm/components/scores/trustworthiness_score_computation.py index d372774..68bad95 100644 --- a/tlm/components/scores/trustworthiness_score_computation.py +++ b/tlm/components/scores/trustworthiness_score_computation.py @@ -3,7 +3,7 @@ from tlm.components import Component from tlm.config.presets import WorkflowType -from tlm.utils.scoring.confidence_scoring_utils import get_trustworthiness_scores +from tlm.utils.scoring.trustworthiness_scoring_utils import get_trustworthiness_scores logger = logging.getLogger(__name__) diff --git a/tlm/utils/scoring/confidence_scoring_utils.py b/tlm/utils/scoring/trustworthiness_scoring_utils.py similarity index 94% rename from tlm/utils/scoring/confidence_scoring_utils.py rename to tlm/utils/scoring/trustworthiness_scoring_utils.py index 11835b4..40833bb 100644 --- a/tlm/utils/scoring/confidence_scoring_utils.py +++ b/tlm/utils/scoring/trustworthiness_scoring_utils.py @@ -6,7 +6,11 @@ import logging from tlm.config.presets import WorkflowType -from tlm.config.score_weights import COMPONENT_SCORE_WEIGHTS, DEFAULT_MODEL, PERPLEXITY_SCORE_WEIGHT +from tlm.config.score_weights import ( + COMPONENT_SCORE_WEIGHTS, + DEFAULT_MODEL, + PERPLEXITY_SCORE_WEIGHT, +) logger = logging.getLogger(__name__) @@ -73,7 +77,7 @@ def _generate_total_scores( total_scores: List[float] = [] - logger.info("Generating confidence scores with scores:") + logger.info("Generating trustworthiness scores with scores:") logger.info(f"-- Consistency scores: {consistency_scores}") logger.info(f"-- Indicator scores: {indicator_scores}") logger.info(f"-- Self reflection scores: {self_reflection_scores}") @@ -96,7 +100,11 @@ def _generate_total_scores( self_reflection_score_weight, prompt_eval_score_weight, perplexity_score_weight, - ) = get_score_weights(use_perplexity_score=use_perplexity_score, workflow_type=workflow_type, model=model).values() + ) = get_score_weights( + use_perplexity_score=use_perplexity_score, + workflow_type=workflow_type, + model=model, + ).values() for _, row in scores.iterrows(): weighted_score_parts: List[WeightedScore] = [ @@ -146,7 +154,8 @@ def _generate_total_scores( def get_score_weights(use_perplexity_score: bool, workflow_type: WorkflowType, model: str) -> Dict[str, float]: """Determines which weights to use for the total score calculation and returns appropriate weights dictionary. - Weights are dependent on the model used and if we have a perplexity score calculated.""" + Weights are dependent on the model used and if we have a perplexity score calculated. + """ # First get weights for the current workflow type workflow_weights = COMPONENT_SCORE_WEIGHTS.get(workflow_type, COMPONENT_SCORE_WEIGHTS[WorkflowType.DEFAULT]) From 7cf468f1c34c1f7dee39727f9fa071165d2bcab6 Mon Sep 17 00:00:00 2001 From: huiwengoh <45724323+huiwengoh@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:08:47 -0500 Subject: [PATCH 3/3] update tutorial --- docs/tutorials/quickstart/index.ipynb | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/tutorials/quickstart/index.ipynb b/docs/tutorials/quickstart/index.ipynb index d706709..77a6a2d 100644 --- a/docs/tutorials/quickstart/index.ipynb +++ b/docs/tutorials/quickstart/index.ipynb @@ -66,7 +66,7 @@ "data": { "text/plain": [ "{'response': ModelResponse(id='chatcmpl-Cvp7cQSXi0AYaYHhLMtY6Pr1pVDkf', created=1767897244, model='gpt-4.1-mini-2025-04-14', object='chat.completion', system_fingerprint='fp_376a7ccef1', choices=[Choices(finish_reason='stop', index=0, message=Message(content='The capital of France is Paris.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='Response', bytes=[82, 101, 115, 112, 111, 110, 115, 101], logprob=0.0, top_logprobs=[TopLogprob(token='Response', bytes=[82, 101, 115, 112, 111, 110, 115, 101], logprob=0.0), TopLogprob(token=' Response', bytes=[32, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-24.5), TopLogprob(token='\\tResponse', bytes=[9, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-24.5), TopLogprob(token='