Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions docs/tutorials/quickstart/index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
"data": {
"text/plain": [
"{'response': ModelResponse(id='chatcmpl-Cvp7cQSXi0AYaYHhLMtY6Pr1pVDkf', created=1767897244, model='gpt-4.1-mini-2025-04-14', object='chat.completion', system_fingerprint='fp_376a7ccef1', choices=[Choices(finish_reason='stop', index=0, message=Message(content='The capital of France is Paris.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='Response', bytes=[82, 101, 115, 112, 111, 110, 115, 101], logprob=0.0, top_logprobs=[TopLogprob(token='Response', bytes=[82, 101, 115, 112, 111, 110, 115, 101], logprob=0.0), TopLogprob(token=' Response', bytes=[32, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-24.5), TopLogprob(token='\\tResponse', bytes=[9, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-24.5), TopLogprob(token='<Response', bytes=[60, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-25.5), TopLogprob(token='_Response', bytes=[95, 82, 101, 115, 112, 111, 110, 115, 101], logprob=-25.875)]), ChatCompletionTokenLogprob(token=':', bytes=[58], logprob=0.0, top_logprobs=[TopLogprob(token=':', bytes=[58], logprob=0.0), TopLogprob(token=':**', bytes=[58, 42, 42], logprob=-27.25), TopLogprob(token=':The', bytes=[58, 84, 104, 101], logprob=-27.75), TopLogprob(token=':', bytes=[239, 188, 154], logprob=-29.4375), TopLogprob(token=':[', bytes=[58, 91], logprob=-30.1875)]), ChatCompletionTokenLogprob(token=' The', bytes=[32, 84, 104, 101], logprob=-0.004078401252627373, top_logprobs=[TopLogprob(token=' The', bytes=[32, 84, 104, 101], logprob=-0.004078401252627373), TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=-5.504078388214111), TopLogprob(token='The', bytes=[84, 104, 101], logprob=-19.879077911376953), TopLogprob(token='Paris', bytes=[80, 97, 114, 105, 115], logprob=-21.504077911376953), TopLogprob(token=' the', bytes=[32, 116, 104, 101], logprob=-23.254077911376953)]), ChatCompletionTokenLogprob(token=' 
capital', bytes=[32, 99, 97, 112, 105, 116, 97, 108], logprob=0.0, top_logprobs=[TopLogprob(token=' capital', bytes=[32, 99, 97, 112, 105, 116, 97, 108], logprob=0.0), TopLogprob(token=' capitale', bytes=[32, 99, 97, 112, 105, 116, 97, 108, 101], logprob=-22.375), TopLogprob(token='capital', bytes=[99, 97, 112, 105, 116, 97, 108], logprob=-22.5), TopLogprob(token=' Capital', bytes=[32, 67, 97, 112, 105, 116, 97, 108], logprob=-26.25), TopLogprob(token=' capitalize', bytes=[32, 99, 97, 112, 105, 116, 97, 108, 105, 122, 101], logprob=-27.375)]), ChatCompletionTokenLogprob(token=' of', bytes=[32, 111, 102], logprob=0.0, top_logprobs=[TopLogprob(token=' of', bytes=[32, 111, 102], logprob=0.0), TopLogprob(token=' של', bytes=[32, 215, 169, 215, 156], logprob=-26.0), TopLogprob(token='ของ', bytes=[224, 184, 130, 224, 184, 173, 224, 184, 135], logprob=-26.875), TopLogprob(token=' của', bytes=[32, 99, 225, 187, 167, 97], logprob=-27.0), TopLogprob(token='of', bytes=[111, 102], logprob=-27.6875)]), ChatCompletionTokenLogprob(token=' France', bytes=[32, 70, 114, 97, 110, 99, 101], logprob=0.0, top_logprobs=[TopLogprob(token=' France', bytes=[32, 70, 114, 97, 110, 99, 101], logprob=0.0), TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=-19.75), TopLogprob(token='France', bytes=[70, 114, 97, 110, 99, 101], logprob=-20.0), TopLogprob(token=' Frankreich', bytes=[32, 70, 114, 97, 110, 107, 114, 101, 105, 99, 104], logprob=-23.5), TopLogprob(token=' Francia', bytes=[32, 70, 114, 97, 110, 99, 105, 97], logprob=-23.75)]), ChatCompletionTokenLogprob(token=' is', bytes=[32, 105, 115], logprob=0.0, top_logprobs=[TopLogprob(token=' is', bytes=[32, 105, 115], logprob=0.0), TopLogprob(token='is', bytes=[105, 115], logprob=-28.25), TopLogprob(token='是', bytes=[230, 152, 175], logprob=-28.5), TopLogprob(token=' are', bytes=[32, 97, 114, 101], logprob=-29.375), TopLogprob(token='\\tis', bytes=[9, 105, 115], logprob=-30.1875)]), ChatCompletionTokenLogprob(token=' Paris', 
bytes=[32, 80, 97, 114, 105, 115], logprob=0.0, top_logprobs=[TopLogprob(token=' Paris', bytes=[32, 80, 97, 114, 105, 115], logprob=0.0), TopLogprob(token='Paris', bytes=[80, 97, 114, 105, 115], logprob=-19.125), TopLogprob(token=' París', bytes=[32, 80, 97, 114, 195, 173, 115], logprob=-21.125), TopLogprob(token=' Пари', bytes=[32, 208, 159, 208, 176, 209, 128, 208, 184], logprob=-21.25), TopLogprob(token=' the', bytes=[32, 116, 104, 101], logprob=-23.0)]), ChatCompletionTokenLogprob(token='.', bytes=[46], logprob=0.0, top_logprobs=[TopLogprob(token='.', bytes=[46], logprob=0.0), TopLogprob(token='.]', bytes=[46, 93], logprob=-21.0), TopLogprob(token='.</', bytes=[46, 60, 47], logprob=-21.5), TopLogprob(token='.\\n\\n', bytes=[46, 10, 10], logprob=-22.25), TopLogprob(token='.\\n', bytes=[46, 10], logprob=-23.25)])], refusal=None))], usage=Usage(completion_tokens=9, prompt_tokens=34, total_tokens=43, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default'),\n",
" 'confidence_score': np.float64(0.9997975077878962),\n",
" 'trustworthiness_score': np.float64(0.9997975077878962),\n",
" 'usage': {'num_input_tokens': 34, 'num_output_tokens': 9},\n",
" 'metadata': {},\n",
" 'evals': {},\n",
Expand Down Expand Up @@ -98,7 +98,7 @@
"```\n",
"{\n",
" \"response\": ModelResponse(...) # Full model response object (like OpenAI's ChatCompletion)\n",
" \"confidence_score\": 0.87 # numerical value between 0-1 \n",
" \"trustworthiness_score\": 0.87 # numerical value between 0-1 \n",
" \"usage\": {} # Token usage info\n",
" \"metadata\": {} # Additional metadata dict\n",
" \"evals\": {} # Additional evaluation results dict (if evals specified)\n",
Expand All @@ -108,7 +108,7 @@
"\n",
"The **response** is a full model response object (e.g., OpenAI's `ChatCompletion` or similar) containing the generated text, model info, token usage, and other standard LLM response fields. You can access the text content via `result[\"response\"].choices[0].message.content` (or similar, depending on the provider).\n",
"\n",
"The **confidence_score** quantifies how *confident* you can be that the response is *correct* (higher values indicate greater trustworthiness). These scores are computed via [state-of-the-art](https://cleanlab.ai/blog/trustworthy-language-model/) uncertainty estimation for LLMs.\n",
"The **trustworthiness_score** quantifies how *confident* you can be that the response is *correct* (higher values indicate greater trustworthiness). These scores are computed via [state-of-the-art](https://cleanlab.ai/blog/trustworthy-language-model/) uncertainty estimation for LLMs.\n",
"\n",
"The **usage** field provides token usage information, **metadata** contains additional metadata, **evals** contains optional evaluation results, and **explanation** provides a human-readable explanation of the trustworthiness assessment.\n",
"\n",
Expand All @@ -131,7 +131,7 @@
],
"source": [
"print(\"LLM response: \", tlm_result[\"response\"].choices[0].message.content)\n",
"print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
"print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
]
},
{
Expand Down Expand Up @@ -172,7 +172,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"You can then pass the response from your LLM directly into TLM (alongside the original arguments used to generate the response) for confidence scoring:"
"You can then pass the response from your LLM directly into TLM (alongside the original arguments used to generate the response) for trustworthiness scoring:"
]
},
{
Expand Down Expand Up @@ -207,7 +207,7 @@
" 'reasoning_tokens': 0,\n",
" 'rejected_prediction_tokens': 0},\n",
" 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}}},\n",
" 'confidence_score': np.float64(1.0),\n",
" 'trustworthiness_score': np.float64(1.0),\n",
" 'usage': {},\n",
" 'metadata': {},\n",
" 'evals': {},\n",
Expand All @@ -230,7 +230,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The output dictionary is similar to the `generate()` method. You can similarly extract the confidence score and response from the output."
"The output dictionary is similar to the `generate()` method. You can similarly extract the trustworthiness score and response from the output."
]
},
{
Expand All @@ -249,7 +249,7 @@
],
"source": [
"print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
"print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
"print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
]
},
{
Expand Down Expand Up @@ -303,7 +303,7 @@
"tlm_result = tlm.score(**openai_kwargs, response=openai_response)\n",
"\n",
"print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
"print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
"print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
]
},
{
Expand Down Expand Up @@ -334,7 +334,7 @@
"tlm_result = tlm.score(**openai_kwargs, response=openai_response)\n",
"\n",
"print(\"LLM response: \", tlm_result[\"response\"][\"chat_completion\"][\"choices\"][0][\"message\"][\"content\"])\n",
"print(\"Trustworthiness score: \", tlm_result[\"confidence_score\"])"
"print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
]
},
{
Expand Down Expand Up @@ -374,14 +374,14 @@
"One straightforward strategy is to still present untrustworthy LLM responses to your user, but first edit them to make them less misleading. You could append a cautionary warning after the response:\n",
"\n",
"```python\n",
"if confidence_score < threshold: # say 0.7\n",
"if trustworthiness_score < threshold: # say 0.7\n",
" response = response + \"\\n\\n CAUTION: This answer was flagged as potentially untrustworthy.\"\n",
"```\n",
"\n",
"Or you could append a *hedging statement* before the response, making it sound less confident:\n",
"\n",
"```python\n",
"if confidence_score < threshold: # say 0.7\n",
"if trustworthiness_score < threshold: # say 0.7\n",
" response = \"I'm not sure, but I'd guess:\\n\\n\" + response\n",
"```"
]
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ async def run_inference_test(kwargs: dict, enabled=True) -> bool:
response_str = response["response"]

print(f" - Response: {response_str}")
print(f" - Confidence score: {response['confidence_score']}")
print(f" - Confidence score: {response['trustworthiness_score']}")
print(f" - Usage: {response['usage']}")
print(f" - Metadata: {response['metadata']}")
print(f" - RAG evals: {response['evals']}")
Expand Down
4 changes: 2 additions & 2 deletions tlm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def create(
Returns:
InferenceResult object containing:
- response: The generated response (string or dict for structured outputs)
- confidence_score: Confidence score between 0 and 1
- trustworthiness_score: Trustworthiness score between 0 and 1
- usage: Token usage information
- metadata: Additional metadata (e.g., per-field scores for structured outputs)
- evals: Dictionary of additional evaluation scores (if evals are provided)
Expand Down Expand Up @@ -123,7 +123,7 @@ def score(
Returns:
InferenceResult containing:
- response: The original response (preserved from input)
- confidence_score: Confidence score between 0 and 1
- trustworthiness_score: Trustworthiness score between 0 and 1
- usage: Token usage information
- metadata: Additional metadata (e.g., per-field scores for structured outputs)
- evals: Dictionary of additional evaluation scores (if evals are provided)
Expand Down
4 changes: 2 additions & 2 deletions tlm/components/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .completions.self_reflection_completion_generator import SelfReflectionCompletionGenerator
from .semantic_evaluation_score_generator import SemanticEvaluationScoreGenerator
from .response_assembly import ResponseAssembly
from .scores.confidence_score_computation import ConfidenceScoreComputation
from .scores.trustworthiness_score_computation import TrustworthinessScoreComputation
from .scores.consistency_score_computation import ConsistencyScoreComputation
from .scores.perplexity_score_computation import PerplexityScoreComputation
from .scores.prompt_evaluation_score_extraction import PromptEvaluationScoreExtraction
Expand All @@ -18,7 +18,7 @@
"ObservedConsistencyCompletionGenerator",
"SelfReflectionCompletionGenerator",
"ConsistencyScoreComputation",
"ConfidenceScoreComputation",
"TrustworthinessScoreComputation",
"PerplexityScoreComputation",
"SelfReflectionScoreComputation",
"ResponseAssembly",
Expand Down
18 changes: 9 additions & 9 deletions tlm/components/response_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,24 @@ def __init__(
super().__init__(depends_on=depends_on)

async def execute(self) -> None:
confidence_scores = self.execution_context.get("confidence_scores")
trustworthiness_scores = self.execution_context.get("trustworthiness_scores")
reference_answers = self.execution_context.get("reference_answers")
reference_completions: list[Completion] = self.execution_context.get("reference_completions")

best_answer_idx: int

if np.isnan(confidence_scores).all():
if np.isnan(trustworthiness_scores).all():
best_answer_idx = 0
average_confidence_score = None
average_trustworthiness_score = None
else:
best_answer_idx = np.nanargmax(confidence_scores, axis=0)
average_confidence_score = np.nanmean(confidence_scores)
best_answer_idx = np.nanargmax(trustworthiness_scores, axis=0)
average_trustworthiness_score = np.nanmean(trustworthiness_scores)

best_answer = reference_answers[best_answer_idx]
best_completion = reference_completions[best_answer_idx]

if average_confidence_score is not None:
make_score_asymptotic(average_confidence_score)
if average_trustworthiness_score is not None:
make_score_asymptotic(average_trustworthiness_score)

self.execution_context.add("best_answer_idx", best_answer_idx)

Expand All @@ -56,7 +56,7 @@ async def execute(self) -> None:
else:
self.execution_context.add("best_response", get_cleaned_chat_completion(best_completion))

self.execution_context.add("confidence_score", average_confidence_score)
self.execution_context.add("trustworthiness_score", average_trustworthiness_score)

if self.inference_type == InferenceType.PROMPT:
if best_completion.usage is None:
Expand Down Expand Up @@ -92,7 +92,7 @@ async def execute(self) -> None:
mean_consistency_score = float(np.nanmean(consistency_scores))

explainability_message = get_explainability_message(
average_confidence_score,
average_trustworthiness_score,
self_reflection_completions,
observed_consistency_completions,
mean_consistency_score,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@

from tlm.components import Component
from tlm.config.presets import WorkflowType
from tlm.utils.scoring.confidence_scoring_utils import get_confidence_scores
from tlm.utils.scoring.trustworthiness_scoring_utils import get_trustworthiness_scores

logger = logging.getLogger(__name__)


class ConfidenceScoreComputation(Component):
def __init__(self, workflow_type: WorkflowType, model: str, depends_on: list[Component] | None = None):
class TrustworthinessScoreComputation(Component):
def __init__(
self,
workflow_type: WorkflowType,
model: str,
depends_on: list[Component] | None = None,
):
self.workflow_type = workflow_type
self.model = model
super().__init__(depends_on=depends_on)
Expand All @@ -22,7 +27,7 @@ async def execute(self):
use_perplexity_score = self.execution_context.get("use_perplexity_score")
prompt_evaluation_scores = self.execution_context.get("prompt_evaluation_scores", [])

confidence_scores = get_confidence_scores(
trustworthiness_scores = get_trustworthiness_scores(
self.workflow_type,
self.model,
consistency_scores,
Expand All @@ -33,6 +38,6 @@ async def execute(self):
prompt_evaluation_scores,
)

logger.info(f"Calculated confidence scores: {confidence_scores}")
logger.info(f"Calculated trustworthiness scores: {trustworthiness_scores}")

self.execution_context.add("confidence_scores", confidence_scores)
self.execution_context.add("trustworthiness_scores", trustworthiness_scores)
6 changes: 3 additions & 3 deletions tlm/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class InferenceResult(TypedDict):
response: str | dict[str, Any] # either a response string or OpenAI chat completion dict
confidence_score: float
trustworthiness_score: float
usage: dict[str, Any]
metadata: dict[str, Any] | None
evals: dict[str, float] | None
Expand Down Expand Up @@ -37,7 +37,7 @@ async def tlm_inference(
results = await pipeline.run()

best_response = results["best_response"]
confidence_score = results["confidence_score"]
trustworthiness_score = results["trustworthiness_score"]
usage = results.get("usage", {})
explanation = results.get("explanation")
evals_not_requiring_response: dict[str, float] = results.get("evals_not_requiring_response", {})
Expand All @@ -48,7 +48,7 @@ async def tlm_inference(

return InferenceResult(
response=best_response,
confidence_score=confidence_score,
trustworthiness_score=trustworthiness_score,
usage=usage,
metadata=metadata,
evals={
Expand Down
8 changes: 4 additions & 4 deletions tlm/pipeline/factory.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Any, Dict

from tlm.components import (
ConfidenceScoreComputation,
TrustworthinessScoreComputation,
ConsistencyScoreComputation,
ObservedConsistencyCompletionGenerator,
PerplexityScoreComputation,
Expand Down Expand Up @@ -140,8 +140,8 @@ def create(
else:
prompt_evaluation_score_extraction = None

confidence_score_computation = pipeline.add(
ConfidenceScoreComputation(
trustworthiness_score_computation = pipeline.add(
TrustworthinessScoreComputation(
workflow_type=config.workflow_type,
model=config.model,
depends_on=[
Expand All @@ -165,7 +165,7 @@ def create(
depends_on=[
component
for component in [
confidence_score_computation,
trustworthiness_score_computation,
evals_not_requiring_response_generator,
evals_requiring_response_generator,
]
Expand Down
9 changes: 6 additions & 3 deletions tlm/utils/explainability_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


def get_explainability_message(
average_confidence_score: float | None,
average_trustworthiness_score: float | None,
self_reflection_completions: list[list[Completion]],
observed_consistency_completions: list[Completion],
average_consistency_score: float,
Expand All @@ -20,10 +20,13 @@ def get_explainability_message(
) -> str:
explainability_message = ""

if average_confidence_score is None:
if average_trustworthiness_score is None:
return explainability_message

if not np.isnan(average_confidence_score) and average_confidence_score < defaults.EXPLAINABILITY_THRESHOLD:
if (
not np.isnan(average_trustworthiness_score)
and average_trustworthiness_score < defaults.EXPLAINABILITY_THRESHOLD
):
self_reflection_completions_flat = [
completion for sublist in self_reflection_completions for completion in sublist
]
Expand Down
Loading