Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/api/config.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
::: tlm.config.schema.Config
options:
heading_level: 2

::: tlm.config.schema.ReferenceCompletionConfigSchema
options:
heading_level: 2

::: tlm.config.schema.ObservedConsistencyConfigSchema
options:
heading_level: 2

::: tlm.config.schema.SelfReflectionConfigSchema
options:
heading_level: 2

::: tlm.config.schema.SemanticEvalsConfigSchema
options:
heading_level: 2

::: tlm.config.schema.ModelProviderSchema
options:
heading_level: 2
19 changes: 19 additions & 0 deletions docs/api/types.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
::: tlm.inference.InferenceResult
options:
heading_level: 2

::: tlm.types.base.Eval
options:
heading_level: 2

::: tlm.config.presets.QualityPreset
options:
heading_level: 2

::: tlm.config.presets.ReasoningEffort
options:
heading_level: 2

::: tlm.types.base.SimilarityMeasure
options:
heading_level: 2
4 changes: 3 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,5 +91,7 @@ nav:
# - Structured Outputs: tutorials/tlm_structured_outputs/index.ipynb
# - Tool Calls: tutorials/tlm_tool_calls/index.ipynb
- API Reference:
- tlm: api/tlm.md
- TLM: api/tlm.md
- Config: api/config.md
- Types: api/types.md
- Additional Cookbooks: https://github.com/cleanlab/cleanlab-tools
27 changes: 14 additions & 13 deletions tests/integration/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@
tlm_core_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, tlm_core_path)

from tlm.config.base import ConfigInput, ReasoningEffort # noqa: E402
from tlm.config.base import ReasoningEffort # noqa: E402
from tlm.config.schema import Config # noqa: E402
from tlm.config.models import BEDROCK_MODELS # noqa: E402
from tlm.config.presets import QualityPreset # noqa: E402
from tlm.templates import ReferenceCompletionTemplate # noqa: E402
from tlm import TLM # noqa: E402
from tlm.utils.completion_utils import generate_completion # noqa: E402
from tlm.types import Completion, SemanticEval, SimilarityMeasure # noqa: E402
from tlm.types import Completion, Eval, SimilarityMeasure # noqa: E402

# Load environment variables from .env file at top level of project
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
Expand Down Expand Up @@ -96,15 +97,15 @@ async def run_tests():

test_inference_params = [
{
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.BASE,
reasoning_effort=ReasoningEffort.LOW,
model="gpt-4.1-mini",
),
"openai_args": {"messages": [{"role": "user", "content": "What is the capital of France?"}]},
},
{
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.HIGH,
reasoning_effort=ReasoningEffort.HIGH,
similarity_measure=SimilarityMeasure.EMBEDDING_LARGE,
Expand All @@ -114,12 +115,12 @@ async def run_tests():
"messages": [{"role": "user", "content": "Explain the concept of machine learning in simple terms."}]
},
"evals": [
SemanticEval(
Eval(
name="clarity",
criteria="The response is clear and easy to understand.",
response_identifier="response",
),
SemanticEval(
Eval(
name="conciseness",
criteria="The response is concise and to the point.",
response_identifier="response",
Expand All @@ -128,7 +129,7 @@ async def run_tests():
"enabled": True,
},
{
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.MEDIUM,
reasoning_effort=ReasoningEffort.MEDIUM,
similarity_measure=SimilarityMeasure.JACCARD,
Expand All @@ -138,7 +139,7 @@ async def run_tests():
},
},
{
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.HIGH,
reasoning_effort=ReasoningEffort.HIGH,
constrain_outputs=["positive", "negative", "neutral"],
Expand All @@ -154,7 +155,7 @@ async def run_tests():
},
},
{
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.HIGH,
reasoning_effort=ReasoningEffort.HIGH,
constrain_outputs=["yes", "no"],
Expand All @@ -165,7 +166,7 @@ async def run_tests():
},
},
{
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.HIGH,
reasoning_effort=ReasoningEffort.HIGH,
),
Expand Down Expand Up @@ -197,7 +198,7 @@ async def run_tests():
},
"context": "The Simple Water Bottle is a reusable 27 oz water bottle.",
# "evals": DEFAULT_RAG_EVALS,
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.BEST,
reasoning_effort=ReasoningEffort.MEDIUM,
),
Expand Down Expand Up @@ -267,7 +268,7 @@ async def run_tests():
},
"perplexity": 0.95,
},
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.HIGH,
model="gpt-4.1-mini",
),
Expand All @@ -292,7 +293,7 @@ async def run_tests():
},
},
},
"config_input": ConfigInput(
"config": Config(
quality_preset=QualityPreset.HIGH,
model="gpt-4.1-mini",
),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from tlm.config.presets import ReasoningEffort
from tlm.templates.semantic_evaluation_completion_template import SemanticEvaluationCompletionTemplate
from tlm.utils.completion_utils import generate_completion
from tlm.types import Completion, ExtractedResponseField, SemanticEval
from tlm.types import Completion, ExtractedResponseField, Eval

import pytest

Expand Down Expand Up @@ -44,7 +44,7 @@ async def test_semantic_evaluation_completion_template_with_reasoning_effort(
expected_mapped_score: float,
) -> None:
"""Test SemanticEvaluationCompletionTemplate with different reasoning effort levels."""
eval = SemanticEval(
eval = Eval(
name="context_sufficiency",
criteria="Determine if the Document contains 100% of the information needed to answer the Question.",
query_identifier="Question",
Expand Down Expand Up @@ -109,7 +109,7 @@ async def test_semantic_evaluation_completion_template_with_different_identifier
reference_answer: str | None,
) -> None:
"""Test SemanticEvaluationCompletionTemplate with different identifier configurations."""
eval = SemanticEval(
eval = Eval(
name="test_eval",
criteria="Test criteria for evaluation.",
query_identifier=query_identifier,
Expand Down
21 changes: 11 additions & 10 deletions tlm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import sys
from openai.types.chat import ChatCompletion

from tlm.config.base import Config, ConfigInput
from tlm.config.base import BaseConfig
from tlm.config.schema import Config
from tlm.config.presets import WorkflowType
from tlm.inference import InferenceResult, tlm_inference
from tlm.types import SemanticEval
from tlm.types import Eval


def is_notebook() -> bool:
Expand All @@ -28,8 +29,8 @@ class TLM:

def __init__(
self,
config_input: ConfigInput = ConfigInput(),
evals: list[SemanticEval] | None = None,
config: Config = Config(),
evals: list[Eval] | None = None,
):
"""Initialize a TLM instance.

Expand All @@ -40,7 +41,7 @@ def __init__(
evals: Optional list of evaluations. Each evaluation
defines a name, criteria, and optional query/context/response identifiers.
"""
self.config_input = config_input
self.config = config
self.evals = evals

is_notebook_flag = is_notebook()
Expand All @@ -59,7 +60,7 @@ def create(
self,
*,
context: str | None = None,
evals: list[SemanticEval] | None = None,
evals: list[Eval] | None = None,
**openai_kwargs: Any,
) -> InferenceResult:
"""Create a new LLM completion and then score its trustworthiness.
Expand Down Expand Up @@ -102,7 +103,7 @@ def score(
*,
response: ChatCompletion | dict[str, Any],
context: str | None = None,
evals: list[SemanticEval] | None = None,
evals: list[Eval] | None = None,
**openai_kwargs: Any,
) -> InferenceResult:
"""Score the trusworthiness of an existing LLM response/completion (from any LLM, or even from a human-writer).
Expand Down Expand Up @@ -144,7 +145,7 @@ async def _async_inference(
*,
response: dict[str, Any] | None = None,
context: str | None = None,
evals: list[SemanticEval] | None = None,
evals: list[Eval] | None = None,
**openai_kwargs: Any,
) -> InferenceResult:
"""Internal async method that performs the inference or scoring operation.
Expand All @@ -157,10 +158,10 @@ async def _async_inference(
openai_args=openai_kwargs,
score=response is not None,
rag=(context is not None),
constrain_outputs=self.config_input.constrain_outputs,
constrain_outputs=self.config.constrain_outputs,
)
model = openai_kwargs.get("model")
config = Config.from_input(self.config_input, workflow_type, model)
config = BaseConfig.from_input(self.config, workflow_type, model)
return await tlm_inference(
completion_params=openai_kwargs,
response=response,
Expand Down
4 changes: 2 additions & 2 deletions tlm/components/semantic_evaluation_score_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from tlm.templates import SemanticEvaluationCompletionTemplate
from tlm.utils.completion_utils import generate_completion
from tlm.utils.scoring.semantic_evaluation_scoring_utils import compute_semantic_evaluation_scores
from tlm.types import SemanticEval
from tlm.types import Eval


class SemanticEvaluationScoreGenerator(Component):
Expand All @@ -17,7 +17,7 @@ def __init__(
self,
query: str | None,
context: str | None,
evals: list[SemanticEval],
evals: list[Eval],
reasoning_effort: ReasoningEffort,
temperature: float,
**kwargs,
Expand Down
57 changes: 2 additions & 55 deletions tlm/config/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pydantic import BaseModel, Field

from tlm.config.schema import Config as ConfigSchema
from tlm.config.presets import (
DEFAULT_CONFIG_FOR_QUALITY,
DEFAULT_CONFIG_FOR_QUALITY_AND_WORKFLOW,
QualityPreset,
ReasoningEffort,
WorkflowType,
)
Expand All @@ -15,12 +15,6 @@
settings = get_settings()


class ReferenceCompletionConfigInput(BaseModel):
num_reference_completions: int | None = Field(
default=None, description="The attempted number of reference completions to generate."
)


class ReferenceCompletionConfig(BaseModel):
num_reference_completions: int = 1
min_reference_completions: int = Field(
Expand All @@ -31,13 +25,6 @@ class ReferenceCompletionConfig(BaseModel):
)


class ObservedConsistencyConfigInput(BaseModel):
num_consistency_completions: int | None = Field(
default=None, description="The attempted number of observed consistency completions to generate."
)
observed_consistency_temperature: float | None = None


class ObservedConsistencyConfig(BaseModel):
num_consistency_completions: int
min_consistency_completions: int = Field(
Expand All @@ -46,18 +33,6 @@ class ObservedConsistencyConfig(BaseModel):
observed_consistency_temperature: float = 1.0


class SelfReflectionConfigInput(BaseModel):
self_reflection_temperature: float | None = None
num_self_reflection_completions: int | None = Field(
default=None,
description=(
"The number of self reflection prompts to use. Note that the first X number of prompts will be used, "
"i.e. the order of the prompt templates in SELF_REFLECTION_TEMPLATES_BY_WORKFLOW[workflow_type] matters. "
"-1 means all prompts will be used."
),
)


class SelfReflectionConfig(BaseModel):
self_reflection_temperature: float | None = None
num_self_reflection_completions: int
Expand All @@ -66,38 +41,12 @@ class SelfReflectionConfig(BaseModel):
)


class SemanticEvalsConfigInput(BaseModel):
use_prompt_evaluation: bool | None = None
prompt_evaluation_temperature: float | None = None
semantic_evaluation_temperature: float | None = None


class SemanticEvalsConfig(BaseModel):
use_prompt_evaluation: bool = False
prompt_evaluation_temperature: float = 0.0
semantic_evaluation_temperature: float = 0.0


class ModelProviderInput(BaseModel):
provider: str | None = None
api_base: str | None = None
api_key: str | None = None
api_version: str | None = None


class ConfigInput(
ReferenceCompletionConfigInput,
ObservedConsistencyConfigInput,
SelfReflectionConfigInput,
SemanticEvalsConfigInput,
ModelProviderInput,
):
quality_preset: QualityPreset = QualityPreset.MEDIUM
reasoning_effort: ReasoningEffort | None = None
similarity_measure: SimilarityMeasure | None = None
constrain_outputs: list[str] | None = None


class BaseConfig(
ReferenceCompletionConfig,
ObservedConsistencyConfig,
Expand All @@ -110,10 +59,8 @@ class BaseConfig(
reasoning_effort: ReasoningEffort = ReasoningEffort.NONE
constrain_outputs: list[str] | None = None


class Config(BaseConfig):
@classmethod
def from_input(cls, input: ConfigInput, workflow_type: WorkflowType, model: str | None) -> "Config":
def from_input(cls, input: ConfigSchema, workflow_type: WorkflowType, model: str | None) -> "BaseConfig":
defaults_for_quality = DEFAULT_CONFIG_FOR_QUALITY[input.quality_preset]
defaults_for_workflow = DEFAULT_CONFIG_FOR_QUALITY_AND_WORKFLOW[input.quality_preset].get(
workflow_type
Expand Down
Loading