From 68f0b66a85c911aab2ea48b00b16fce477497744 Mon Sep 17 00:00:00 2001
From: huiwengoh <45724323+huiwengoh@users.noreply.github.com>
Date: Thu, 15 Jan 2026 12:00:40 -0500
Subject: [PATCH 1/3] update confidence -> trustworthiness

---
 tests/integration/test_inference.py            |  2 +-
 tlm/api.py                                     |  4 ++--
 tlm/components/__init__.py                     |  4 ++--
 tlm/components/response_assembly.py            | 18 +++++++++---------
 ...py => trustworthiness_score_computation.py} | 17 +++++++++++------
 tlm/inference.py                               |  6 +++---
 tlm/pipeline/factory.py                        |  8 ++++----
 tlm/utils/explainability_utils.py              |  9 ++++++---
 tlm/utils/scoring/confidence_scoring_utils.py  |  4 ++--
 9 files changed, 40 insertions(+), 32 deletions(-)
 rename tlm/components/scores/{confidence_score_computation.py => trustworthiness_score_computation.py} (69%)

diff --git a/tests/integration/test_inference.py b/tests/integration/test_inference.py
index dc0a6fd..6a99d67 100644
--- a/tests/integration/test_inference.py
+++ b/tests/integration/test_inference.py
@@ -67,7 +67,7 @@ async def run_inference_test(kwargs: dict, enabled=True) -> bool:
             response_str = response["response"]
 
         print(f"   - Response: {response_str}")
-        print(f"   - Confidence score: {response['confidence_score']}")
+        print(f"   - Confidence score: {response['trustworthiness_score']}")
         print(f"   - Usage: {response['usage']}")
         print(f"   - Metadata: {response['metadata']}")
         print(f"   - RAG evals: {response['evals']}")
diff --git a/tlm/api.py b/tlm/api.py
index 6278bc6..a2305b3 100644
--- a/tlm/api.py
+++ b/tlm/api.py
@@ -83,7 +83,7 @@ def create(
         Returns:
             InferenceResult object containing:
                 - response: The generated response (string or dict for structured outputs)
-                - confidence_score: Confidence score between 0 and 1
+                - trustworthiness_score: Trustworthiness score between 0 and 1
                 - usage: Token usage information
                 - metadata: Additional metadata (e.g., per-field scores for structured outputs)
                 - evals: Dictionary of additional evaluation scores (if evals are provided)
@@ -123,7 +123,7 @@ def score(
         Returns:
             InferenceResult containing:
                 - response: The original response (preserved from input)
-                - confidence_score: Confidence score between 0 and 1
+                - trustworthiness_score: Trustworthiness score between 0 and 1
                 - usage: Token usage information
                 - metadata: Additional metadata (e.g., per-field scores for structured outputs)
                 - evals: Dictionary of additional evaluation scores (if evals are provided)
diff --git a/tlm/components/__init__.py b/tlm/components/__init__.py
index c1f37d8..b29189e 100644
--- a/tlm/components/__init__.py
+++ b/tlm/components/__init__.py
@@ -5,7 +5,7 @@
 from .completions.self_reflection_completion_generator import SelfReflectionCompletionGenerator
 from .semantic_evaluation_score_generator import SemanticEvaluationScoreGenerator
 from .response_assembly import ResponseAssembly
-from .scores.confidence_score_computation import ConfidenceScoreComputation
+from .scores.trustworthiness_score_computation import TrustworthinessScoreComputation
 from .scores.consistency_score_computation import ConsistencyScoreComputation
 from .scores.perplexity_score_computation import PerplexityScoreComputation
 from .scores.prompt_evaluation_score_extraction import PromptEvaluationScoreExtraction
@@ -18,7 +18,7 @@
     "ObservedConsistencyCompletionGenerator",
     "SelfReflectionCompletionGenerator",
     "ConsistencyScoreComputation",
-    "ConfidenceScoreComputation",
+    "TrustworthinessScoreComputation",
     "PerplexityScoreComputation",
     "SelfReflectionScoreComputation",
     "ResponseAssembly",
diff --git a/tlm/components/response_assembly.py b/tlm/components/response_assembly.py
index dbd3cf6..5334629 100644
--- a/tlm/components/response_assembly.py
+++ b/tlm/components/response_assembly.py
@@ -30,24 +30,24 @@ def __init__(
         super().__init__(depends_on=depends_on)
 
     async def execute(self) -> None:
-        confidence_scores = self.execution_context.get("confidence_scores")
+        trustworthiness_scores = self.execution_context.get("trustworthiness_scores")
         reference_answers = self.execution_context.get("reference_answers")
         reference_completions: list[Completion] = self.execution_context.get("reference_completions")
 
         best_answer_idx: int
 
-        if np.isnan(confidence_scores).all():
+        if np.isnan(trustworthiness_scores).all():
             best_answer_idx = 0
-            average_confidence_score = None
+            average_trustworthiness_score = None
         else:
-            best_answer_idx = np.nanargmax(confidence_scores, axis=0)
-            average_confidence_score = np.nanmean(confidence_scores)
+            best_answer_idx = np.nanargmax(trustworthiness_scores, axis=0)
+            average_trustworthiness_score = np.nanmean(trustworthiness_scores)
 
         best_answer = reference_answers[best_answer_idx]
         best_completion = reference_completions[best_answer_idx]
 
-        if average_confidence_score is not None:
-            make_score_asymptotic(average_confidence_score)
+        if average_trustworthiness_score is not None:
+            make_score_asymptotic(average_trustworthiness_score)
 
         self.execution_context.add("best_answer_idx", best_answer_idx)
 
@@ -56,7 +56,7 @@ async def execute(self) -> None:
         else:
             self.execution_context.add("best_response", get_cleaned_chat_completion(best_completion))
 
-        self.execution_context.add("confidence_score", average_confidence_score)
+        self.execution_context.add("trustworthiness_score", average_trustworthiness_score)
 
         if self.inference_type == InferenceType.PROMPT:
             if best_completion.usage is None:
@@ -92,7 +92,7 @@ async def execute(self) -> None:
             mean_consistency_score = float(np.nanmean(consistency_scores))
 
         explainability_message = get_explainability_message(
-            average_confidence_score,
+            average_trustworthiness_score,
             self_reflection_completions,
             observed_consistency_completions,
             mean_consistency_score,
diff --git a/tlm/components/scores/confidence_score_computation.py b/tlm/components/scores/trustworthiness_score_computation.py
similarity index 69%
rename from tlm/components/scores/confidence_score_computation.py
rename to tlm/components/scores/trustworthiness_score_computation.py
index 9bab32a..d372774 100644
--- a/tlm/components/scores/confidence_score_computation.py
+++ b/tlm/components/scores/trustworthiness_score_computation.py
@@ -3,13 +3,18 @@
 
 from tlm.components import Component
 from tlm.config.presets import WorkflowType
-from tlm.utils.scoring.confidence_scoring_utils import get_confidence_scores
+from tlm.utils.scoring.confidence_scoring_utils import get_trustworthiness_scores
 
 logger = logging.getLogger(__name__)
 
 
-class ConfidenceScoreComputation(Component):
-    def __init__(self, workflow_type: WorkflowType, model: str, depends_on: list[Component] | None = None):
+class TrustworthinessScoreComputation(Component):
+    def __init__(
+        self,
+        workflow_type: WorkflowType,
+        model: str,
+        depends_on: list[Component] | None = None,
+    ):
         self.workflow_type = workflow_type
         self.model = model
         super().__init__(depends_on=depends_on)
@@ -22,7 +27,7 @@ async def execute(self):
         use_perplexity_score = self.execution_context.get("use_perplexity_score")
         prompt_evaluation_scores = self.execution_context.get("prompt_evaluation_scores", [])
 
-        confidence_scores = get_confidence_scores(
+        trustworthiness_scores = get_trustworthiness_scores(
             self.workflow_type,
             self.model,
             consistency_scores,
@@ -33,6 +38,6 @@ async def execute(self):
             prompt_evaluation_scores,
         )
 
-        logger.info(f"Calculated confidence scores: {confidence_scores}")
+        logger.info(f"Calculated trustworthiness scores: {trustworthiness_scores}")
 
-        self.execution_context.add("confidence_scores", confidence_scores)
+        self.execution_context.add("trustworthiness_scores", trustworthiness_scores)
diff --git a/tlm/inference.py b/tlm/inference.py
index f6d71ce..92e1d45 100644
--- a/tlm/inference.py
+++ b/tlm/inference.py
@@ -9,7 +9,7 @@
 
 class InferenceResult(TypedDict):
     response: str | dict[str, Any]  # either a response string or OpenAI chat completion dict
-    confidence_score: float
+    trustworthiness_score: float
     usage: dict[str, Any]
     metadata: dict[str, Any] | None
     evals: dict[str, float] | None
@@ -37,7 +37,7 @@ async def tlm_inference(
     results = await pipeline.run()
 
     best_response = results["best_response"]
-    confidence_score = results["confidence_score"]
+    trustworthiness_score = results["trustworthiness_score"]
     usage = results.get("usage", {})
     explanation = results.get("explanation")
     evals_not_requiring_response: dict[str, float] = results.get("evals_not_requiring_response", {})
@@ -48,7 +48,7 @@ async def tlm_inference(
 
     return InferenceResult(
         response=best_response,
-        confidence_score=confidence_score,
+        trustworthiness_score=trustworthiness_score,
         usage=usage,
         metadata=metadata,
         evals={
diff --git a/tlm/pipeline/factory.py b/tlm/pipeline/factory.py
index ca284af..17361e4 100644
--- a/tlm/pipeline/factory.py
+++ b/tlm/pipeline/factory.py
@@ -1,7 +1,7 @@
 from typing import Any, Dict
 
 from tlm.components import (
-    ConfidenceScoreComputation,
+    TrustworthinessScoreComputation,
     ConsistencyScoreComputation,
     ObservedConsistencyCompletionGenerator,
     PerplexityScoreComputation,
@@ -140,8 +140,8 @@ def create(
         else:
             prompt_evaluation_score_extraction = None
 
-        confidence_score_computation = pipeline.add(
-            ConfidenceScoreComputation(
+        trustworthiness_score_computation = pipeline.add(
+            TrustworthinessScoreComputation(
                 workflow_type=config.workflow_type,
                 model=config.model,
                 depends_on=[
@@ -165,7 +165,7 @@ def create(
                 depends_on=[
                     component
                     for component in [
-                        confidence_score_computation,
+                        trustworthiness_score_computation,
                         evals_not_requiring_response_generator,
                         evals_requiring_response_generator,
                     ]
diff --git a/tlm/utils/explainability_utils.py b/tlm/utils/explainability_utils.py
index f418ff7..ac119cc 100644
--- a/tlm/utils/explainability_utils.py
+++ b/tlm/utils/explainability_utils.py
@@ -10,7 +10,7 @@
 
 
 def get_explainability_message(
-    average_confidence_score: float | None,
+    average_trustworthiness_score: float | None,
     self_reflection_completions: list[list[Completion]],
     observed_consistency_completions: list[Completion],
     average_consistency_score: float,
@@ -20,10 +20,13 @@ def get_explainability_message(
 ) -> str:
     explainability_message = ""
 
-    if average_confidence_score is None:
+    if average_trustworthiness_score is None:
         return explainability_message
 
-    if not np.isnan(average_confidence_score) and average_confidence_score < defaults.EXPLAINABILITY_THRESHOLD:
+    if (
+        not np.isnan(average_trustworthiness_score)
+        and average_trustworthiness_score < defaults.EXPLAINABILITY_THRESHOLD
+    ):
         self_reflection_completions_flat = [
             completion for sublist in self_reflection_completions for completion in sublist
         ]
diff --git a/tlm/utils/scoring/confidence_scoring_utils.py b/tlm/utils/scoring/confidence_scoring_utils.py
index 8f3b87c..11835b4 100644
--- a/tlm/utils/scoring/confidence_scoring_utils.py
+++ b/tlm/utils/scoring/confidence_scoring_utils.py
@@ -19,7 +19,7 @@ def __init__(self, score: np.float64 | None, weight: float):
         self.weight = weight
 
 
-def get_confidence_scores(
+def get_trustworthiness_scores(
     workflow_type: WorkflowType,
     model: str,
     consistency_scores: npt.NDArray[np.float64],
@@ -53,7 +53,7 @@ def _generate_total_scores(
 ) -> npt.NDArray[np.float64]:
     """Generates total score for each reference answer (row) in scores dataframe.
 
-    The weights used to calculate total score are different depending on if prompt or get_confidence_score is called and perplexity score is calculated or not.
+    The weights used to calculate the total score differ depending on whether prompt or get_trustworthiness_score is called and whether a perplexity score is calculated.
 
     If just self reflection score couldn't be computed (value is nan), that value is omitted from the total score calculation.
     If just observed consistency score couldn't be computed (value is nan), that value is omitted from the total score calculation.

From aec3429b430504b5c246651d9c617aa1999db184 Mon Sep 17 00:00:00 2001
From: huiwengoh <45724323+huiwengoh@users.noreply.github.com>
Date: Thu, 15 Jan 2026 12:04:39 -0500
Subject: [PATCH 2/3] filename

---
 .../scores/trustworthiness_score_computation.py |  2 +-
 ...tils.py => trustworthiness_scoring_utils.py} | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)
 rename tlm/utils/scoring/{confidence_scoring_utils.py => trustworthiness_scoring_utils.py} (94%)

diff --git a/tlm/components/scores/trustworthiness_score_computation.py b/tlm/components/scores/trustworthiness_score_computation.py
index d372774..68bad95 100644
--- a/tlm/components/scores/trustworthiness_score_computation.py
+++ b/tlm/components/scores/trustworthiness_score_computation.py
@@ -3,7 +3,7 @@
 
 from tlm.components import Component
 from tlm.config.presets import WorkflowType
-from tlm.utils.scoring.confidence_scoring_utils import get_trustworthiness_scores
+from tlm.utils.scoring.trustworthiness_scoring_utils import get_trustworthiness_scores
 
 logger = logging.getLogger(__name__)
 
diff --git a/tlm/utils/scoring/confidence_scoring_utils.py b/tlm/utils/scoring/trustworthiness_scoring_utils.py
similarity index 94%
rename from tlm/utils/scoring/confidence_scoring_utils.py
rename to tlm/utils/scoring/trustworthiness_scoring_utils.py
index 11835b4..40833bb 100644
--- a/tlm/utils/scoring/confidence_scoring_utils.py
+++ b/tlm/utils/scoring/trustworthiness_scoring_utils.py
@@ -6,7 +6,11 @@
 import logging
 
 from tlm.config.presets import WorkflowType
-from tlm.config.score_weights import COMPONENT_SCORE_WEIGHTS, DEFAULT_MODEL, PERPLEXITY_SCORE_WEIGHT
+from tlm.config.score_weights import (
+    COMPONENT_SCORE_WEIGHTS,
+    DEFAULT_MODEL,
+    PERPLEXITY_SCORE_WEIGHT,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -73,7 +77,7 @@ def _generate_total_scores(
 
     total_scores: List[float] = []
 
-    logger.info("Generating confidence scores with scores:")
+    logger.info("Generating trustworthiness scores with scores:")
     logger.info(f"-- Consistency scores: {consistency_scores}")
     logger.info(f"-- Indicator scores: {indicator_scores}")
     logger.info(f"-- Self reflection scores: {self_reflection_scores}")
@@ -96,7 +100,11 @@ def _generate_total_scores(
         self_reflection_score_weight,
         prompt_eval_score_weight,
         perplexity_score_weight,
-    ) = get_score_weights(use_perplexity_score=use_perplexity_score, workflow_type=workflow_type, model=model).values()
+    ) = get_score_weights(
+        use_perplexity_score=use_perplexity_score,
+        workflow_type=workflow_type,
+        model=model,
+    ).values()
 
     for _, row in scores.iterrows():
         weighted_score_parts: List[WeightedScore] = [
@@ -146,7 +154,8 @@ def _generate_total_scores(
 
 def get_score_weights(use_perplexity_score: bool, workflow_type: WorkflowType, model: str) -> Dict[str, float]:
     """Determines which weights to use for the total score calculation and returns appropriate weights dictionary.
-    Weights are dependent on the model used and if we have a perplexity score calculated."""
+    Weights are dependent on the model used and if we have a perplexity score calculated.
+    """
 
     # First get weights for the current workflow type
     workflow_weights = COMPONENT_SCORE_WEIGHTS.get(workflow_type, COMPONENT_SCORE_WEIGHTS[WorkflowType.DEFAULT])

From 7cf468f1c34c1f7dee39727f9fa071165d2bcab6 Mon Sep 17 00:00:00 2001
From: huiwengoh <45724323+huiwengoh@users.noreply.github.com>
Date: Thu, 15 Jan 2026 12:08:47 -0500
Subject: [PATCH 3/3] update tutorial

---
 docs/tutorials/quickstart/index.ipynb | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/tutorials/quickstart/index.ipynb b/docs/tutorials/quickstart/index.ipynb
index d706709..77a6a2d 100644
--- a/docs/tutorials/quickstart/index.ipynb
+++ b/docs/tutorials/quickstart/index.ipynb
@@ -66,7 +66,7 @@
      "data": {
       "text/plain": [
        "{'response': ModelResponse(id='chatcmpl-Cvp7cQSXi0AYaYHhLMtY6Pr1pVDkf', created=1767897244, model='gpt-4.1-mini-2025-04-14', object='chat.completion', system_fingerprint='fp_376a7ccef1', choices=[Choices(finish_reason='stop', index=0, message=Message(content='The capital of France is Paris.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='Response', bytes=[82, 101, 115, 112, 111, 110, 115, 101], logprob=0.0, top_logprobs=[TopLogprob(token='Response', bytes=[82, 101, 115, 112, 111, 110, 115, 101], logprob=0.0), TopLogprob(token=' Response', bytes=[32, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-24.5), TopLogprob(token='\\tResponse', bytes=[9, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-24.5), TopLogprob(token='<Response', bytes=[60, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-25.5), TopLogprob(token='_Response', bytes=[95, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-25.875)]), ChatCompletionTokenLogprob(token=':', bytes=[58], logprob=0.0, top_logprobs=[TopLogprob(token=':', bytes=[58], logprob=0.0), TopLogprob(token=':**', bytes=[58, 42, 42], logprob=-27.25), TopLogprob(token=':The', bytes=[58, 84, 104, 101], logprob=-27.75), TopLogprob(token='：', bytes=[239, 188, 154], logprob=-29.4375), TopLogprob(token=':[', bytes=[58, 91], logprob=-30.1875)]), ChatCompletionTokenLogprob(token=' The', bytes=[32, 84, 104, 101], logprob=-0.004078401252627373, top_logprobs=[TopLogprob(token=' The', bytes=[32, 84, 104, 101], logprob=-0.004078401252627373), TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=-5.504078388214111), TopLogprob(token='The', bytes=[84, 104, 101], logprob=-19.879077911376953), TopLogprob(token='Paris', bytes=[80, 97, 114, 105, 115], logprob=-21.504077911376953), TopLogprob(token=' the', bytes=[32, 116, 104, 101], logprob=-23.254077911376953)]), ChatCompletionTokenLogprob(token=' 
capital', bytes=[32, 99, 97, 112, 105, 116, 97, 108], logprob=0.0, top_logprobs=[TopLogprob(token=' capital', bytes=[32, 99, 97, 112, 105, 116, 97, 108], logprob=0.0), TopLogprob(token=' capitale', bytes=[32, 99, 97, 112, 105, 116, 97, 108, 101], logprob=-22.375), TopLogprob(token='capital', bytes=[99, 97, 112, 105, 116, 97, 108], logprob=-22.5), TopLogprob(token=' Capital', bytes=[32, 67, 97, 112, 105, 116, 97, 108], logprob=-26.25), TopLogprob(token=' capitalize', bytes=[32, 99, 97, 112, 105, 116, 97, 108, 105, 122, 101], logprob=-27.375)]), ChatCompletionTokenLogprob(token=' of', bytes=[32, 111, 102], logprob=0.0, top_logprobs=[TopLogprob(token=' of', bytes=[32, 111, 102], logprob=0.0), TopLogprob(token=' של', bytes=[32, 215, 169, 215, 156], logprob=-26.0), TopLogprob(token='ของ', bytes=[224, 184, 130, 224, 184, 173, 224, 184, 135], logprob=-26.875), TopLogprob(token=' của', bytes=[32, 99, 225, 187, 167, 97], logprob=-27.0), TopLogprob(token='of', bytes=[111, 102], logprob=-27.6875)]), ChatCompletionTokenLogprob(token=' France', bytes=[32, 70, 114, 97, 110, 99, 101], logprob=0.0, top_logprobs=[TopLogprob(token=' France', bytes=[32, 70, 114, 97, 110, 99, 101], logprob=0.0), TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=-19.75), TopLogprob(token='France', bytes=[70, 114, 97, 110, 99, 101], logprob=-20.0), TopLogprob(token=' Frankreich', bytes=[32, 70, 114, 97, 110, 107, 114, 101, 105, 99, 104], logprob=-23.5), TopLogprob(token=' Francia', bytes=[32, 70, 114, 97, 110, 99, 105, 97], logprob=-23.75)]), ChatCompletionTokenLogprob(token=' is', bytes=[32, 105, 115], logprob=0.0, top_logprobs=[TopLogprob(token=' is', bytes=[32, 105, 115], logprob=0.0), TopLogprob(token='is', bytes=[105, 115], logprob=-28.25), TopLogprob(token='是', bytes=[230, 152, 175], logprob=-28.5), TopLogprob(token=' are', bytes=[32, 97, 114, 101], logprob=-29.375), TopLogprob(token='\\tis', bytes=[9, 105, 115], logprob=-30.1875)]), ChatCompletionTokenLogprob(token=' Paris', 
bytes=[32, 80, 97, 114, 105, 115], logprob=0.0, top_logprobs=[TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=0.0), TopLogprob(token='Paris', bytes=[80, 97, 114, 105, 115], logprob=-19.125), TopLogprob(token=' París', bytes=[32, 80, 97, 114, 195, 173, 115], logprob=-21.125), TopLogprob(token=' Пари', bytes=[32, 208, 159, 208, 176, 209, 128, 208, 184], logprob=-21.25), TopLogprob(token=' the', bytes=[32, 116, 104, 101], logprob=-23.0)]), ChatCompletionTokenLogprob(token='.', bytes=[46], logprob=0.0, top_logprobs=[TopLogprob(token='.', bytes=[46], logprob=0.0), TopLogprob(token='.]', bytes=[46, 93], logprob=-21.0), TopLogprob(token='.</', bytes=[46, 60, 47], logprob=-21.5), TopLogprob(token='.\\n\\n', bytes=[46, 10, 10], logprob=-22.25), TopLogprob(token='.\\n', bytes=[46, 10], logprob=-23.25)])], refusal=None))], usage=Usage(completion_tokens=9, prompt_tokens=34, total_tokens=43, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default'),\n",
-       " 'confidence_score': np.float64(0.9997975077878962),\n",
+       " 'trustworthiness_score': np.float64(0.9997975077878962),\n",
        " 'usage': {'num_input_tokens': 34, 'num_output_tokens': 9},\n",
        " 'metadata': {},\n",
        " 'evals': {},\n",
@@ -98,7 +98,7 @@
     "```\n",
     "{\n",
     "  \"response\": ModelResponse(...)  # Full model response object (like OpenAI's ChatCompletion)\n",
-    "  \"confidence_score\": 0.87  # numerical value between 0-1 \n",
+    "  \"trustworthiness_score\": 0.87  # numerical value between 0-1 \n",
     "  \"usage\": {}  # Token usage info\n",
     "  \"metadata\": {}  # Additional metadata dict\n",
     "  \"evals\": {}  # Additional evaluation results dict (if evals specified)\n",
@@ -108,7 +108,7 @@
     "\n",
     "The **response** is a full model response object (e.g., OpenAI's `ChatCompletion` or similar) containing the generated text, model info, token usage, and other standard LLM response fields. You can access the text content via `result[\"response\"].choices[0].message.content` (or similar, depending on the provider).\n",
     "\n",
-    "The **confidence_score** quantifies how *confident* you can be that the response is *correct* (higher values indicate greater trustworthiness). These scores are computed via [state-of-the-art](https://cleanlab.ai/blog/trustworthy-language-model/) uncertainty estimation for LLMs.\n",
+    "The **trustworthiness_score** quantifies how *confident* you can be that the response is *correct* (higher values indicate greater trustworthiness). These scores are computed via [state-of-the-art](https://cleanlab.ai/blog/trustworthy-language-model/) uncertainty estimation for LLMs.\n",
     "\n",
     "The **usage** field provides token usage information, **metadata** contains additional metadata, **evals** contains optional evaluation results, and **explanation** provides a human-readable explanation of the trustworthiness assessment.\n",
     "\n",
@@ -131,7 +131,7 @@
    ],
    "source": [
     "print(\"LLM response: \", tlm_result[\"response\"].choices[0].message.content)\n",
-    "print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
+    "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
    ]
   },
   {
@@ -172,7 +172,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "You can then pass the response from your LLM directly into TLM (alongside the original arguments used to generate the response) for confidence scoring:"
+    "You can then pass the response from your LLM directly into TLM (alongside the original arguments used to generate the response) for trustworthiness scoring:"
    ]
   },
   {
@@ -207,7 +207,7 @@
        "     'reasoning_tokens': 0,\n",
        "     'rejected_prediction_tokens': 0},\n",
        "    'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}}},\n",
-       " 'confidence_score': np.float64(1.0),\n",
+       " 'trustworthiness_score': np.float64(1.0),\n",
        " 'usage': {},\n",
        " 'metadata': {},\n",
        " 'evals': {},\n",
@@ -230,7 +230,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The output dictionary is similar to the `generate()` method. You can similarly extract the confidence score and response from the output."
+    "The output dictionary is similar to the one returned by the `generate()` method. You can similarly extract the trustworthiness score and response from the output."
    ]
   },
   {
@@ -249,7 +249,7 @@
    ],
    "source": [
     "print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
-    "print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
+    "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
    ]
   },
   {
@@ -303,7 +303,7 @@
     "tlm_result = tlm.score(**openai_kwargs, response=openai_response)\n",
     "\n",
     "print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
-    "print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
+    "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
    ]
   },
   {
@@ -334,7 +334,7 @@
     "tlm_result = tlm.score(**openai_kwargs, response=openai_response)\n",
     "\n",
     "print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
-    "print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
+    "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
    ]
   },
   {
@@ -374,14 +374,14 @@
     "One straightforward strategy is to still present untrustworthy LLM responses to your user, but first edit them to make them less misleading. You could append a cautionary warning after the response:\n",
     "\n",
     "```python\n",
-    "if confidence_score < threshold:  # say 0.7\n",
+    "if trustworthiness_score < threshold:  # say 0.7\n",
     "    response = response + \"\\n\\n CAUTION: This answer was flagged as potentially untrustworthy.\"\n",
     "```\n",
     "\n",
     "Or you could append a *hedging statement* before the response, making it sound less confident:\n",
     "\n",
     "```python\n",
-    "if confidence_score < threshold:  # say 0.7\n",
+    "if trustworthiness_score < threshold:  # say 0.7\n",
     "    response = \"I'm not sure, but I'd guess:\\n\\n\" + response\n",
     "```"
    ]