cleanlab · huiwengoh · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026
diff --git a/docs/tutorials/quickstart/index.ipynb b/docs/tutorials/quickstart/index.ipynb
@@ -66,7 +66,7 @@
      "data": {
       "text/plain": [
        "{'response': ModelResponse(id='chatcmpl-Cvp7cQSXi0AYaYHhLMtY6Pr1pVDkf', created=1767897244, model='gpt-4.1-mini-2025-04-14', object='chat.completion', system_fingerprint='fp_376a7ccef1', choices=[Choices(finish_reason='stop', index=0, message=Message(content='The capital of France is Paris.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='Response', bytes=[82, 101, 115, 112, 111, 110, 115, 101], logprob=0.0, top_logprobs=[TopLogprob(token='Response', bytes=[82, 101, 115, 112, 111, 110, 115, 101], logprob=0.0), TopLogprob(token=' Response', bytes=[32, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-24.5), TopLogprob(token='\\tResponse', bytes=[9, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-24.5), TopLogprob(token='<Response', bytes=[60, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-25.5), TopLogprob(token='_Response', bytes=[95, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-25.875)]), ChatCompletionTokenLogprob(token=':', bytes=[58], logprob=0.0, top_logprobs=[TopLogprob(token=':', bytes=[58], logprob=0.0), TopLogprob(token=':**', bytes=[58, 42, 42], logprob=-27.25), TopLogprob(token=':The', bytes=[58, 84, 104, 101], logprob=-27.75), TopLogprob(token='：', bytes=[239, 188, 154], logprob=-29.4375), TopLogprob(token=':[', bytes=[58, 91], logprob=-30.1875)]), ChatCompletionTokenLogprob(token=' The', bytes=[32, 84, 104, 101], logprob=-0.004078401252627373, top_logprobs=[TopLogprob(token=' The', bytes=[32, 84, 104, 101], logprob=-0.004078401252627373), TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=-5.504078388214111), TopLogprob(token='The', bytes=[84, 104, 101], logprob=-19.879077911376953), TopLogprob(token='Paris', bytes=[80, 97, 114, 105, 115], logprob=-21.504077911376953), TopLogprob(token=' the', bytes=[32, 116, 104, 101], logprob=-23.254077911376953)]), ChatCompletionTokenLogprob(token=' 
capital', bytes=[32, 99, 97, 112, 105, 116, 97, 108], logprob=0.0, top_logprobs=[TopLogprob(token=' capital', bytes=[32, 99, 97, 112, 105, 116, 97, 108], logprob=0.0), TopLogprob(token=' capitale', bytes=[32, 99, 97, 112, 105, 116, 97, 108, 101], logprob=-22.375), TopLogprob(token='capital', bytes=[99, 97, 112, 105, 116, 97, 108], logprob=-22.5), TopLogprob(token=' Capital', bytes=[32, 67, 97, 112, 105, 116, 97, 108], logprob=-26.25), TopLogprob(token=' capitalize', bytes=[32, 99, 97, 112, 105, 116, 97, 108, 105, 122, 101], logprob=-27.375)]), ChatCompletionTokenLogprob(token=' of', bytes=[32, 111, 102], logprob=0.0, top_logprobs=[TopLogprob(token=' of', bytes=[32, 111, 102], logprob=0.0), TopLogprob(token=' של', bytes=[32, 215, 169, 215, 156], logprob=-26.0), TopLogprob(token='ของ', bytes=[224, 184, 130, 224, 184, 173, 224, 184, 135], logprob=-26.875), TopLogprob(token=' của', bytes=[32, 99, 225, 187, 167, 97], logprob=-27.0), TopLogprob(token='of', bytes=[111, 102], logprob=-27.6875)]), ChatCompletionTokenLogprob(token=' France', bytes=[32, 70, 114, 97, 110, 99, 101], logprob=0.0, top_logprobs=[TopLogprob(token=' France', bytes=[32, 70, 114, 97, 110, 99, 101], logprob=0.0), TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=-19.75), TopLogprob(token='France', bytes=[70, 114, 97, 110, 99, 101], logprob=-20.0), TopLogprob(token=' Frankreich', bytes=[32, 70, 114, 97, 110, 107, 114, 101, 105, 99, 104], logprob=-23.5), TopLogprob(token=' Francia', bytes=[32, 70, 114, 97, 110, 99, 105, 97], logprob=-23.75)]), ChatCompletionTokenLogprob(token=' is', bytes=[32, 105, 115], logprob=0.0, top_logprobs=[TopLogprob(token=' is', bytes=[32, 105, 115], logprob=0.0), TopLogprob(token='is', bytes=[105, 115], logprob=-28.25), TopLogprob(token='是', bytes=[230, 152, 175], logprob=-28.5), TopLogprob(token=' are', bytes=[32, 97, 114, 101], logprob=-29.375), TopLogprob(token='\\tis', bytes=[9, 105, 115], logprob=-30.1875)]), ChatCompletionTokenLogprob(token=' Paris', 
bytes=[32, 80, 97, 114, 105, 115], logprob=0.0, top_logprobs=[TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=0.0), TopLogprob(token='Paris', bytes=[80, 97, 114, 105, 115], logprob=-19.125), TopLogprob(token=' París', bytes=[32, 80, 97, 114, 195, 173, 115], logprob=-21.125), TopLogprob(token=' Пари', bytes=[32, 208, 159, 208, 176, 209, 128, 208, 184], logprob=-21.25), TopLogprob(token=' the', bytes=[32, 116, 104, 101], logprob=-23.0)]), ChatCompletionTokenLogprob(token='.', bytes=[46], logprob=0.0, top_logprobs=[TopLogprob(token='.', bytes=[46], logprob=0.0), TopLogprob(token='.]', bytes=[46, 93], logprob=-21.0), TopLogprob(token='.</', bytes=[46, 60, 47], logprob=-21.5), TopLogprob(token='.\\n\\n', bytes=[46, 10, 10], logprob=-22.25), TopLogprob(token='.\\n', bytes=[46, 10], logprob=-23.25)])], refusal=None))], usage=Usage(completion_tokens=9, prompt_tokens=34, total_tokens=43, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default'),\n",
-       " 'confidence_score': np.float64(0.9997975077878962),\n",
+       " 'trustworthiness_score': np.float64(0.9997975077878962),\n",
        " 'usage': {'num_input_tokens': 34, 'num_output_tokens': 9},\n",
        " 'metadata': {},\n",
        " 'evals': {},\n",
@@ -98,7 +98,7 @@
     "```\n",
     "{\n",
     "  \"response\": ModelResponse(...)  # Full model response object (like OpenAI's ChatCompletion)\n",
-    "  \"confidence_score\": 0.87  # numerical value between 0-1 \n",
+    "  \"trustworthiness_score\": 0.87  # numerical value between 0-1 \n",
     "  \"usage\": {}  # Token usage info\n",
     "  \"metadata\": {}  # Additional metadata dict\n",
     "  \"evals\": {}  # Additional evaluation results dict (if evals specified)\n",
@@ -108,7 +108,7 @@
     "\n",
     "The **response** is a full model response object (e.g., OpenAI's `ChatCompletion` or similar) containing the generated text, model info, token usage, and other standard LLM response fields. You can access the text content via `result[\"response\"].choices[0].message.content` (or similar, depending on the provider).\n",
     "\n",
-    "The **confidence_score** quantifies how *confident* you can be that the response is *correct* (higher values indicate greater trustworthiness). These scores are computed via [state-of-the-art](https://cleanlab.ai/blog/trustworthy-language-model/) uncertainty estimation for LLMs.\n",
+    "The **trustworthiness_score** quantifies how *confident* you can be that the response is *correct* (higher values indicate greater trustworthiness). These scores are computed via [state-of-the-art](https://cleanlab.ai/blog/trustworthy-language-model/) uncertainty estimation for LLMs.\n",
     "\n",
     "The **usage** field provides token usage information, **metadata** contains additional metadata, **evals** contains optional evaluation results, and **explanation** provides a human-readable explanation of the trustworthiness assessment.\n",
     "\n",
@@ -131,7 +131,7 @@
    ],
    "source": [
     "print(\"LLM response: \", tlm_result[\"response\"].choices[0].message.content)\n",
-    "print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
+    "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
    ]
   },
   {
@@ -172,7 +172,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "You can then pass the response from your LLM directly into TLM (alongside the original arguments used to generate the response) for confidence scoring:"
+    "You can then pass the response from your LLM directly into TLM (alongside the original arguments used to generate the response) for trustworthiness scoring:"
    ]
   },
   {
@@ -207,7 +207,7 @@
        "     'reasoning_tokens': 0,\n",
        "     'rejected_prediction_tokens': 0},\n",
        "    'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}}},\n",
-       " 'confidence_score': np.float64(1.0),\n",
+       " 'trustworthiness_score': np.float64(1.0),\n",
        " 'usage': {},\n",
        " 'metadata': {},\n",
        " 'evals': {},\n",
@@ -230,7 +230,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The output dictionary is similar to the `generate()` method. You can similarly extract the confidence score and response from the output."
+    "The output dictionary is similar to the `generate()` method. You can similarly extract the trustworthiness score and response from the output."
    ]
   },
   {
@@ -249,7 +249,7 @@
    ],
    "source": [
     "print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
-    "print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
+    "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
    ]
   },
   {
@@ -303,7 +303,7 @@
     "tlm_result = tlm.score(**openai_kwargs, response=openai_response)\n",
     "\n",
     "print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
-    "print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
+    "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
    ]
   },
   {
@@ -334,7 +334,7 @@
     "tlm_result = tlm.score(**openai_kwargs, response=openai_response)\n",
     "\n",
     "print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
-    "print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
+    "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
    ]
   },
   {
@@ -374,14 +374,14 @@
     "One straightforward strategy is to still present untrustworthy LLM responses to your user, but first edit them to make them less misleading. You could append a cautionary warning after the response:\n",
     "\n",
     "```python\n",
-    "if confidence_score < threshold:  # say 0.7\n",
+    "if trustworthiness_score < threshold:  # say 0.7\n",
     "    response = response + \"\\n\\n CAUTION: This answer was flagged as potentially untrustworthy.\"\n",
     "```\n",
     "\n",
     "Or you could append a *hedging statement* before the response, making it sound less confident:\n",
     "\n",
     "```python\n",
-    "if confidence_score < threshold:  # say 0.7\n",
+    "if trustworthiness_score < threshold:  # say 0.7\n",
     "    response = \"I'm not sure, but I'd guess:\\n\\n\" + response\n",
     "```"
    ]

diff --git a/tests/integration/test_inference.py b/tests/integration/test_inference.py
@@ -67,7 +67,7 @@ async def run_inference_test(kwargs: dict, enabled=True) -> bool:
             response_str = response["response"]
 
         print(f"   - Response: {response_str}")
-        print(f"   - Confidence score: {response['confidence_score']}")
+        print(f"   - Confidence score: {response['trustworthiness_score']}")
         print(f"   - Usage: {response['usage']}")
         print(f"   - Metadata: {response['metadata']}")
         print(f"   - RAG evals: {response['evals']}")

diff --git a/tlm/api.py b/tlm/api.py
@@ -83,7 +83,7 @@ def create(
         Returns:
             InferenceResult object containing:
                 - response: The generated response (string or dict for structured outputs)
-                - confidence_score: Confidence score between 0 and 1
+                - trustworthiness_score: Trustworthiness score between 0 and 1
                 - usage: Token usage information
                 - metadata: Additional metadata (e.g., per-field scores for structured outputs)
                 - evals: Dictionary of additional evaluation scores (if evals are provided)
@@ -123,7 +123,7 @@ def score(
         Returns:
             InferenceResult containing:
                 - response: The original response (preserved from input)
-                - confidence_score: Confidence score between 0 and 1
+                - trustworthiness_score: Trustworthiness score between 0 and 1
                 - usage: Token usage information
                 - metadata: Additional metadata (e.g., per-field scores for structured outputs)
                 - evals: Dictionary of additional evaluation scores (if evals are provided)

diff --git a/tlm/components/__init__.py b/tlm/components/__init__.py
@@ -5,7 +5,7 @@
 from .completions.self_reflection_completion_generator import SelfReflectionCompletionGenerator
 from .semantic_evaluation_score_generator import SemanticEvaluationScoreGenerator
 from .response_assembly import ResponseAssembly
-from .scores.confidence_score_computation import ConfidenceScoreComputation
+from .scores.trustworthiness_score_computation import TrustworthinessScoreComputation
 from .scores.consistency_score_computation import ConsistencyScoreComputation
 from .scores.perplexity_score_computation import PerplexityScoreComputation
 from .scores.prompt_evaluation_score_extraction import PromptEvaluationScoreExtraction
@@ -18,7 +18,7 @@
     "ObservedConsistencyCompletionGenerator",
     "SelfReflectionCompletionGenerator",
     "ConsistencyScoreComputation",
-    "ConfidenceScoreComputation",
+    "TrustworthinessScoreComputation",
     "PerplexityScoreComputation",
     "SelfReflectionScoreComputation",
     "ResponseAssembly",

diff --git a/tlm/components/response_assembly.py b/tlm/components/response_assembly.py
@@ -30,24 +30,24 @@ def __init__(
         super().__init__(depends_on=depends_on)
 
     async def execute(self) -> None:
-        confidence_scores = self.execution_context.get("confidence_scores")
+        trustworthiness_scores = self.execution_context.get("trustworthiness_scores")
         reference_answers = self.execution_context.get("reference_answers")
         reference_completions: list[Completion] = self.execution_context.get("reference_completions")
 
         best_answer_idx: int
 
-        if np.isnan(confidence_scores).all():
+        if np.isnan(trustworthiness_scores).all():
             best_answer_idx = 0
-            average_confidence_score = None
+            average_trustworthiness_score = None
         else:
-            best_answer_idx = np.nanargmax(confidence_scores, axis=0)
-            average_confidence_score = np.nanmean(confidence_scores)
+            best_answer_idx = np.nanargmax(trustworthiness_scores, axis=0)
+            average_trustworthiness_score = np.nanmean(trustworthiness_scores)
 
         best_answer = reference_answers[best_answer_idx]
         best_completion = reference_completions[best_answer_idx]
 
-        if average_confidence_score is not None:
-            make_score_asymptotic(average_confidence_score)
+        if average_trustworthiness_score is not None:
+            make_score_asymptotic(average_trustworthiness_score)
 
         self.execution_context.add("best_answer_idx", best_answer_idx)
 
@@ -56,7 +56,7 @@ async def execute(self) -> None:
         else:
             self.execution_context.add("best_response", get_cleaned_chat_completion(best_completion))
 
-        self.execution_context.add("confidence_score", average_confidence_score)
+        self.execution_context.add("trustworthiness_score", average_trustworthiness_score)
 
         if self.inference_type == InferenceType.PROMPT:
             if best_completion.usage is None:
@@ -92,7 +92,7 @@ async def execute(self) -> None:
             mean_consistency_score = float(np.nanmean(consistency_scores))
 
         explainability_message = get_explainability_message(
-            average_confidence_score,
+            average_trustworthiness_score,
             self_reflection_completions,
             observed_consistency_completions,
             mean_consistency_score,

diff --git a/...ts/scores/confidence_score_computation.py → ...ores/trustworthiness_score_computation.py b/...ts/scores/confidence_score_computation.py → ...ores/trustworthiness_score_computation.py
@@ -3,13 +3,18 @@
 
 from tlm.components import Component
 from tlm.config.presets import WorkflowType
-from tlm.utils.scoring.confidence_scoring_utils import get_confidence_scores
+from tlm.utils.scoring.trustworthiness_scoring_utils import get_trustworthiness_scores
 
 logger = logging.getLogger(__name__)
 
 
-class ConfidenceScoreComputation(Component):
-    def __init__(self, workflow_type: WorkflowType, model: str, depends_on: list[Component] | None = None):
+class TrustworthinessScoreComputation(Component):
+    def __init__(
+        self,
+        workflow_type: WorkflowType,
+        model: str,
+        depends_on: list[Component] | None = None,
+    ):
         self.workflow_type = workflow_type
         self.model = model
         super().__init__(depends_on=depends_on)
@@ -22,7 +27,7 @@ async def execute(self):
         use_perplexity_score = self.execution_context.get("use_perplexity_score")
         prompt_evaluation_scores = self.execution_context.get("prompt_evaluation_scores", [])
 
-        confidence_scores = get_confidence_scores(
+        trustworthiness_scores = get_trustworthiness_scores(
             self.workflow_type,
             self.model,
             consistency_scores,
@@ -33,6 +38,6 @@ async def execute(self):
             prompt_evaluation_scores,
         )
 
-        logger.info(f"Calculated confidence scores: {confidence_scores}")
+        logger.info(f"Calculated trustworthiness scores: {trustworthiness_scores}")
 
-        self.execution_context.add("confidence_scores", confidence_scores)
+        self.execution_context.add("trustworthiness_scores", trustworthiness_scores)
diff --git a/tlm/inference.py b/tlm/inference.py
@@ -9,7 +9,7 @@
 
 class InferenceResult(TypedDict):
     response: str | dict[str, Any]  # either a response string or OpenAI chat completion dict
-    confidence_score: float
+    trustworthiness_score: float
     usage: dict[str, Any]
     metadata: dict[str, Any] | None
     evals: dict[str, float] | None
@@ -37,7 +37,7 @@ async def tlm_inference(
     results = await pipeline.run()
 
     best_response = results["best_response"]
-    confidence_score = results["confidence_score"]
+    trustworthiness_score = results["trustworthiness_score"]
     usage = results.get("usage", {})
     explanation = results.get("explanation")
     evals_not_requiring_response: dict[str, float] = results.get("evals_not_requiring_response", {})
@@ -48,7 +48,7 @@ async def tlm_inference(
 
     return InferenceResult(
         response=best_response,
-        confidence_score=confidence_score,
+        trustworthiness_score=trustworthiness_score,
         usage=usage,
         metadata=metadata,
         evals={

diff --git a/tlm/pipeline/factory.py b/tlm/pipeline/factory.py
@@ -1,7 +1,7 @@
 from typing import Any, Dict
 
 from tlm.components import (
-    ConfidenceScoreComputation,
+    TrustworthinessScoreComputation,
     ConsistencyScoreComputation,
     ObservedConsistencyCompletionGenerator,
     PerplexityScoreComputation,
@@ -140,8 +140,8 @@ def create(
         else:
             prompt_evaluation_score_extraction = None
 
-        confidence_score_computation = pipeline.add(
-            ConfidenceScoreComputation(
+        trustworthiness_score_computation = pipeline.add(
+            TrustworthinessScoreComputation(
                 workflow_type=config.workflow_type,
                 model=config.model,
                 depends_on=[
@@ -165,7 +165,7 @@ def create(
                 depends_on=[
                     component
                     for component in [
-                        confidence_score_computation,
+                        trustworthiness_score_computation,
                         evals_not_requiring_response_generator,
                         evals_requiring_response_generator,
                     ]

diff --git a/tlm/utils/explainability_utils.py b/tlm/utils/explainability_utils.py
@@ -10,7 +10,7 @@
 
 
 def get_explainability_message(
-    average_confidence_score: float | None,
+    average_trustworthiness_score: float | None,
     self_reflection_completions: list[list[Completion]],
     observed_consistency_completions: list[Completion],
     average_consistency_score: float,
@@ -20,10 +20,13 @@ def get_explainability_message(
 ) -> str:
     explainability_message = ""
 
-    if average_confidence_score is None:
+    if average_trustworthiness_score is None:
         return explainability_message
 
-    if not np.isnan(average_confidence_score) and average_confidence_score < defaults.EXPLAINABILITY_THRESHOLD:
+    if (
+        not np.isnan(average_trustworthiness_score)
+        and average_trustworthiness_score < defaults.EXPLAINABILITY_THRESHOLD
+    ):
         self_reflection_completions_flat = [
             completion for sublist in self_reflection_completions for completion in sublist
         ]