diff --git a/docs/api/config.md b/docs/api/config.md new file mode 100644 index 0000000..865877a --- /dev/null +++ b/docs/api/config.md @@ -0,0 +1,23 @@ +::: tlm.config.schema.Config + options: + heading_level: 2 + +::: tlm.config.schema.ReferenceCompletionConfigSchema + options: + heading_level: 2 + +::: tlm.config.schema.ObservedConsistencyConfigSchema + options: + heading_level: 2 + +::: tlm.config.schema.SelfReflectionConfigSchema + options: + heading_level: 2 + +::: tlm.config.schema.SemanticEvalsConfigSchema + options: + heading_level: 2 + +::: tlm.config.schema.ModelProviderSchema + options: + heading_level: 2 diff --git a/docs/api/types.md b/docs/api/types.md new file mode 100644 index 0000000..2d0b9ea --- /dev/null +++ b/docs/api/types.md @@ -0,0 +1,19 @@ +::: tlm.inference.InferenceResult + options: + heading_level: 2 + +::: tlm.types.base.Eval + options: + heading_level: 2 + +::: tlm.config.presets.QualityPreset + options: + heading_level: 2 + +::: tlm.config.presets.ReasoningEffort + options: + heading_level: 2 + +::: tlm.types.base.SimilarityMeasure + options: + heading_level: 2 diff --git a/mkdocs.yml b/mkdocs.yml index 7c3828e..f859c27 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -91,5 +91,7 @@ nav: # - Structured Outputs: tutorials/tlm_structured_outputs/index.ipynb # - Tool Calls: tutorials/tlm_tool_calls/index.ipynb - API Reference: - - tlm: api/tlm.md + - TLM: api/tlm.md + - Config: api/config.md + - Types: api/types.md - Additional Cookbooks: https://github.com/cleanlab/cleanlab-tools diff --git a/tests/integration/test_inference.py b/tests/integration/test_inference.py index 6a99d67..2ada4df 100644 --- a/tests/integration/test_inference.py +++ b/tests/integration/test_inference.py @@ -15,13 +15,14 @@ tlm_core_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) sys.path.insert(0, tlm_core_path) -from tlm.config.base import ConfigInput, ReasoningEffort # noqa: E402 +from tlm.config.base import ReasoningEffort # noqa: E402 
+from tlm.config.schema import Config # noqa: E402 from tlm.config.models import BEDROCK_MODELS # noqa: E402 from tlm.config.presets import QualityPreset # noqa: E402 from tlm.templates import ReferenceCompletionTemplate # noqa: E402 from tlm import TLM # noqa: E402 from tlm.utils.completion_utils import generate_completion # noqa: E402 -from tlm.types import Completion, SemanticEval, SimilarityMeasure # noqa: E402 +from tlm.types import Completion, Eval, SimilarityMeasure # noqa: E402 # Load environment variables from .env file at top level of project project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) @@ -96,7 +97,7 @@ async def run_tests(): test_inference_params = [ { - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.BASE, reasoning_effort=ReasoningEffort.LOW, model="gpt-4.1-mini", @@ -104,7 +105,7 @@ async def run_tests(): "openai_args": {"messages": [{"role": "user", "content": "What is the capital of France?"}]}, }, { - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.HIGH, reasoning_effort=ReasoningEffort.HIGH, similarity_measure=SimilarityMeasure.EMBEDDING_LARGE, @@ -114,12 +115,12 @@ async def run_tests(): "messages": [{"role": "user", "content": "Explain the concept of machine learning in simple terms."}] }, "evals": [ - SemanticEval( + Eval( name="clarity", criteria="The response is clear and easy to understand.", response_identifier="response", ), - SemanticEval( + Eval( name="conciseness", criteria="The response is concise and to the point.", response_identifier="response", @@ -128,7 +129,7 @@ async def run_tests(): "enabled": True, }, { - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.MEDIUM, reasoning_effort=ReasoningEffort.MEDIUM, similarity_measure=SimilarityMeasure.JACCARD, @@ -138,7 +139,7 @@ async def run_tests(): }, }, { - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.HIGH, 
reasoning_effort=ReasoningEffort.HIGH, constrain_outputs=["positive", "negative", "neutral"], @@ -154,7 +155,7 @@ async def run_tests(): }, }, { - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.HIGH, reasoning_effort=ReasoningEffort.HIGH, constrain_outputs=["yes", "no"], @@ -165,7 +166,7 @@ async def run_tests(): }, }, { - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.HIGH, reasoning_effort=ReasoningEffort.HIGH, ), @@ -197,7 +198,7 @@ async def run_tests(): }, "context": "The Simple Water Bottle is a reusable 27 oz water bottle.", # "evals": DEFAULT_RAG_EVALS, - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.BEST, reasoning_effort=ReasoningEffort.MEDIUM, ), @@ -267,7 +268,7 @@ async def run_tests(): }, "perplexity": 0.95, }, - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.HIGH, model="gpt-4.1-mini", ), @@ -292,7 +293,7 @@ async def run_tests(): }, }, }, - "config_input": ConfigInput( + "config": Config( quality_preset=QualityPreset.HIGH, model="gpt-4.1-mini", ), diff --git a/tests/unit/templates/test_semantic_evaluation_completion_template.py b/tests/unit/templates/test_semantic_evaluation_completion_template.py index 92beaf8..1c3f414 100644 --- a/tests/unit/templates/test_semantic_evaluation_completion_template.py +++ b/tests/unit/templates/test_semantic_evaluation_completion_template.py @@ -1,7 +1,7 @@ from tlm.config.presets import ReasoningEffort from tlm.templates.semantic_evaluation_completion_template import SemanticEvaluationCompletionTemplate from tlm.utils.completion_utils import generate_completion -from tlm.types import Completion, ExtractedResponseField, SemanticEval +from tlm.types import Completion, ExtractedResponseField, Eval import pytest @@ -44,7 +44,7 @@ async def test_semantic_evaluation_completion_template_with_reasoning_effort( expected_mapped_score: float, ) -> None: """Test SemanticEvaluationCompletionTemplate with 
different reasoning effort levels.""" - eval = SemanticEval( + eval = Eval( name="context_sufficiency", criteria="Determine if the Document contains 100% of the information needed to answer the Question.", query_identifier="Question", @@ -109,7 +109,7 @@ async def test_semantic_evaluation_completion_template_with_different_identifier reference_answer: str | None, ) -> None: """Test SemanticEvaluationCompletionTemplate with different identifier configurations.""" - eval = SemanticEval( + eval = Eval( name="test_eval", criteria="Test criteria for evaluation.", query_identifier=query_identifier, diff --git a/tlm/api.py b/tlm/api.py index 007d7f1..1e92ef2 100644 --- a/tlm/api.py +++ b/tlm/api.py @@ -4,10 +4,11 @@ import sys from openai.types.chat import ChatCompletion -from tlm.config.base import Config, ConfigInput +from tlm.config.base import BaseConfig +from tlm.config.schema import Config from tlm.config.presets import WorkflowType from tlm.inference import InferenceResult, tlm_inference -from tlm.types import SemanticEval +from tlm.types import Eval def is_notebook() -> bool: @@ -28,8 +29,8 @@ class TLM: def __init__( self, - config_input: ConfigInput = ConfigInput(), - evals: list[SemanticEval] | None = None, + config: Config = Config(), + evals: list[Eval] | None = None, ): """Initialize a TLM instance. @@ -40,7 +41,7 @@ def __init__( evals: Optional list of evaluations. Each evaluation defines a name, criteria, and optional query/context/response identifiers. """ - self.config_input = config_input + self.config = config self.evals = evals is_notebook_flag = is_notebook() @@ -59,7 +60,7 @@ def create( self, *, context: str | None = None, - evals: list[SemanticEval] | None = None, + evals: list[Eval] | None = None, **openai_kwargs: Any, ) -> InferenceResult: """Create a new LLM completion and then score its trustworthiness. 
@@ -102,7 +103,7 @@ def score( *, response: ChatCompletion | dict[str, Any], context: str | None = None, - evals: list[SemanticEval] | None = None, + evals: list[Eval] | None = None, **openai_kwargs: Any, ) -> InferenceResult: """Score the trusworthiness of an existing LLM response/completion (from any LLM, or even from a human-writer). @@ -144,7 +145,7 @@ async def _async_inference( *, response: dict[str, Any] | None = None, context: str | None = None, - evals: list[SemanticEval] | None = None, + evals: list[Eval] | None = None, **openai_kwargs: Any, ) -> InferenceResult: """Internal async method that performs the inference or scoring operation. @@ -157,10 +158,10 @@ async def _async_inference( openai_args=openai_kwargs, score=response is not None, rag=(context is not None), - constrain_outputs=self.config_input.constrain_outputs, + constrain_outputs=self.config.constrain_outputs, ) model = openai_kwargs.get("model") - config = Config.from_input(self.config_input, workflow_type, model) + config = BaseConfig.from_input(self.config, workflow_type, model) return await tlm_inference( completion_params=openai_kwargs, response=response, diff --git a/tlm/components/semantic_evaluation_score_generator.py b/tlm/components/semantic_evaluation_score_generator.py index 6c7cf23..040b67b 100644 --- a/tlm/components/semantic_evaluation_score_generator.py +++ b/tlm/components/semantic_evaluation_score_generator.py @@ -5,7 +5,7 @@ from tlm.templates import SemanticEvaluationCompletionTemplate from tlm.utils.completion_utils import generate_completion from tlm.utils.scoring.semantic_evaluation_scoring_utils import compute_semantic_evaluation_scores -from tlm.types import SemanticEval +from tlm.types import Eval class SemanticEvaluationScoreGenerator(Component): @@ -17,7 +17,7 @@ def __init__( self, query: str | None, context: str | None, - evals: list[SemanticEval], + evals: list[Eval], reasoning_effort: ReasoningEffort, temperature: float, **kwargs, diff --git a/tlm/config/base.py 
b/tlm/config/base.py index 83f9ac3..17d7c80 100644 --- a/tlm/config/base.py +++ b/tlm/config/base.py @@ -1,9 +1,9 @@ from pydantic import BaseModel, Field +from tlm.config.schema import Config as ConfigSchema from tlm.config.presets import ( DEFAULT_CONFIG_FOR_QUALITY, DEFAULT_CONFIG_FOR_QUALITY_AND_WORKFLOW, - QualityPreset, ReasoningEffort, WorkflowType, ) @@ -15,12 +15,6 @@ settings = get_settings() -class ReferenceCompletionConfigInput(BaseModel): - num_reference_completions: int | None = Field( - default=None, description="The attempted number of reference completions to generate." - ) - - class ReferenceCompletionConfig(BaseModel): num_reference_completions: int = 1 min_reference_completions: int = Field( @@ -31,13 +25,6 @@ class ReferenceCompletionConfig(BaseModel): ) -class ObservedConsistencyConfigInput(BaseModel): - num_consistency_completions: int | None = Field( - default=None, description="The attempted number of observed consistency completions to generate." - ) - observed_consistency_temperature: float | None = None - - class ObservedConsistencyConfig(BaseModel): num_consistency_completions: int min_consistency_completions: int = Field( @@ -46,18 +33,6 @@ class ObservedConsistencyConfig(BaseModel): observed_consistency_temperature: float = 1.0 -class SelfReflectionConfigInput(BaseModel): - self_reflection_temperature: float | None = None - num_self_reflection_completions: int | None = Field( - default=None, - description=( - "The number of self reflection prompts to use. Note that the first X number of prompts will be used, " - "i.e. the order of the prompt templates in SELF_REFLECTION_TEMPLATES_BY_WORKFLOW[workflow_type] matters. " - "-1 means all prompts will be used." 
- ), - ) - - class SelfReflectionConfig(BaseModel): self_reflection_temperature: float | None = None num_self_reflection_completions: int @@ -66,38 +41,12 @@ class SelfReflectionConfig(BaseModel): ) -class SemanticEvalsConfigInput(BaseModel): - use_prompt_evaluation: bool | None = None - prompt_evaluation_temperature: float | None = None - semantic_evaluation_temperature: float | None = None - - class SemanticEvalsConfig(BaseModel): use_prompt_evaluation: bool = False prompt_evaluation_temperature: float = 0.0 semantic_evaluation_temperature: float = 0.0 -class ModelProviderInput(BaseModel): - provider: str | None = None - api_base: str | None = None - api_key: str | None = None - api_version: str | None = None - - -class ConfigInput( - ReferenceCompletionConfigInput, - ObservedConsistencyConfigInput, - SelfReflectionConfigInput, - SemanticEvalsConfigInput, - ModelProviderInput, -): - quality_preset: QualityPreset = QualityPreset.MEDIUM - reasoning_effort: ReasoningEffort | None = None - similarity_measure: SimilarityMeasure | None = None - constrain_outputs: list[str] | None = None - - class BaseConfig( ReferenceCompletionConfig, ObservedConsistencyConfig, @@ -110,10 +59,8 @@ class BaseConfig( reasoning_effort: ReasoningEffort = ReasoningEffort.NONE constrain_outputs: list[str] | None = None - -class Config(BaseConfig): @classmethod - def from_input(cls, input: ConfigInput, workflow_type: WorkflowType, model: str | None) -> "Config": + def from_input(cls, input: ConfigSchema, workflow_type: WorkflowType, model: str | None) -> "BaseConfig": defaults_for_quality = DEFAULT_CONFIG_FOR_QUALITY[input.quality_preset] defaults_for_workflow = DEFAULT_CONFIG_FOR_QUALITY_AND_WORKFLOW[input.quality_preset].get( workflow_type diff --git a/tlm/config/presets.py b/tlm/config/presets.py index ad25e81..6d04d3e 100644 --- a/tlm/config/presets.py +++ b/tlm/config/presets.py @@ -3,6 +3,15 @@ class QualityPreset(str, Enum): + """Quality presets that control the trade-off between speed 
and accuracy. + + Higher quality presets generate more completions and use more advanced techniques, + resulting in more accurate trustworthiness scores but slower inference and higher costs. + + Values: + `BASE`, `LOW`, `MEDIUM` (default), `HIGH`, `BEST` + """ + BASE = "base" LOW = "low" MEDIUM = "medium" @@ -11,7 +20,14 @@ class QualityPreset(str, Enum): class ReasoningEffort(str, Enum): - """Enum for different levels of reasoning effort supported by TLM.""" + """Reasoning effort levels that control explanation generation for trustworthiness scores. + + Higher reasoning effort generates longer explanations that provide more detailed + reasoning about why a particular trustworthiness score was assigned. + + Values: + `NONE` (default), `LOW`, `MEDIUM`, `HIGH` + """ NONE = "none" LOW = "low" diff --git a/tlm/config/schema.py b/tlm/config/schema.py new file mode 100644 index 0000000..e83a8db --- /dev/null +++ b/tlm/config/schema.py @@ -0,0 +1,110 @@ +from tlm.config.presets import QualityPreset, ReasoningEffort +from tlm.types import SimilarityMeasure + +from pydantic import BaseModel, Field + + +class ReferenceCompletionConfigSchema(BaseModel): + """ + Configuration for reference completion generation. + + Attributes: + num_reference_completions: The attempted number of reference completions to generate. + """ + + num_reference_completions: int | None = Field( + default=None, description="The attempted number of reference completions to generate." + ) + + +class ObservedConsistencyConfigSchema(BaseModel): + """ + Configuration for generating additional completions against which to score consistency of reference completions. + + Attributes: + num_consistency_completions: The attempted number of observed consistency completions to generate. + observed_consistency_temperature: The temperature to use for generating comparison completions. 
+ """ + + num_consistency_completions: int | None = Field( + default=None, description="The attempted number of observed consistency completions to generate." + ) + observed_consistency_temperature: float | None = None + + +class SelfReflectionConfigSchema(BaseModel): + """ + Configuration for prompting LLM-as-judge to score the trustworthiness of reference completions using self-reflection prompts. + + Attributes: + self_reflection_temperature: The temperature to use for self reflection completions. + num_self_reflection_completions: The attempted number of self reflection completions to generate. + """ + + self_reflection_temperature: float | None = None + num_self_reflection_completions: int | None = Field( + default=None, + description=( + "The number of self reflection prompts to use. Note that the first X number of prompts will be used, " + "i.e. the order of the prompt templates in SELF_REFLECTION_TEMPLATES_BY_WORKFLOW[workflow_type] matters. " + "-1 means all prompts will be used." + ), + ) + + +class SemanticEvalsConfigSchema(BaseModel): + """ + Configuration for semantic evaluation of reference completions. + + Attributes: + use_prompt_evaluation: Whether to incorporate prompt evaluation scores into the final trustworthiness score. + prompt_evaluation_temperature: The temperature to use for prompt evaluation completions. + semantic_evaluation_temperature: The temperature to use when generating completions to score the Evals. + """ + + use_prompt_evaluation: bool | None = None + prompt_evaluation_temperature: float | None = None + semantic_evaluation_temperature: float | None = None + + +class ModelProviderSchema(BaseModel): + """ + Configuration for the model provider in alignment with the LiteLLM API. + + Attributes: + provider: The name of the model provider. + api_base: The base URL of the model provider's API. 
+ api_key: The API key to use for the model provider. + api_version: The version of the model provider's API. + """ + + provider: str | None = None + api_base: str | None = None + api_key: str | None = None + api_version: str | None = None + + +class Config( + ReferenceCompletionConfigSchema, + ObservedConsistencyConfigSchema, + SelfReflectionConfigSchema, + SemanticEvalsConfigSchema, + ModelProviderSchema, +): + """Configuration for TLM inference. + + This class combines multiple configuration schemas to provide comprehensive + control over TLM's inference behavior, including reference completions, + consistency checking, self-reflection, semantic evaluation, and model provider settings. + + Attributes: + quality_preset: Quality preset controlling the trade-off between speed and accuracy. + reasoning_effort: Optional reasoning effort level for models that support it. + similarity_measure: Optional similarity measure to use for comparing consistency across responses. + constrain_outputs: Optional list of allowed output values to constrain responses, for example in multiple choice questions. 
+ """ + + quality_preset: QualityPreset = QualityPreset.MEDIUM + reasoning_effort: ReasoningEffort | None = None + similarity_measure: SimilarityMeasure | None = None + constrain_outputs: list[str] | None = None diff --git a/tlm/inference.py b/tlm/inference.py index 92e1d45..4ccba9d 100644 --- a/tlm/inference.py +++ b/tlm/inference.py @@ -1,14 +1,25 @@ from typing import Any, TypedDict -from tlm.config.base import Config +from tlm.config.base import BaseConfig from tlm.config.presets import WorkflowType from tlm.pipeline import PipelineFactory -from tlm.types import SemanticEval, CompletionParams +from tlm.types import Eval, CompletionParams from tlm.utils.scoring.semantic_evaluation_scoring_utils import DEFAULT_RAG_EVALS class InferenceResult(TypedDict): - response: str | dict[str, Any] # either a response string or OpenAI chat completion dict + """Result returned from TLM inference. + + Attributes: + response: Either a response string or dictionary representation of an OpenAI chat completion. + trustworthiness_score: Score indicating the trustworthiness of the response, between 0 and 1. + usage: Token usage information for the inference, including prompt and completion tokens. + metadata: Optional metadata, e.g. per-field scores for structured outputs. + evals: Optional dictionary of Eval scores, keyed by evaluation name. + explanation: Explanation for the trustworthiness score. 
+ """ + + response: str | dict[str, Any] trustworthiness_score: float usage: dict[str, Any] metadata: dict[str, Any] | None @@ -20,9 +31,9 @@ async def tlm_inference( *, completion_params: CompletionParams, response: dict[str, Any] | None, - evals: list[SemanticEval] | None, + evals: list[Eval] | None, context: str | None, - config: Config, + config: BaseConfig, ) -> InferenceResult: if evals is None and config.workflow_type == WorkflowType.RAG: evals = DEFAULT_RAG_EVALS diff --git a/tlm/pipeline/factory.py b/tlm/pipeline/factory.py index 17361e4..426cb63 100644 --- a/tlm/pipeline/factory.py +++ b/tlm/pipeline/factory.py @@ -14,12 +14,12 @@ SelfReflectionCompletionGenerator, SelfReflectionScoreComputation, ) -from tlm.config.base import Config +from tlm.config.base import BaseConfig from tlm.config.presets import WorkflowType from tlm.pipeline import InferencePipeline from tlm.utils.prompt_utils import format_user_request, extract_user_prompt from tlm.utils.eval_utils import group_evals -from tlm.types import SemanticEval, CompletionParams, InferenceType +from tlm.types import Eval, CompletionParams, InferenceType class PipelineFactory: @@ -27,9 +27,9 @@ class PipelineFactory: def create( *, completion_params: CompletionParams, - config: Config, + config: BaseConfig, response: Dict[str, Any] | None, - evals: list[SemanticEval] | None, + evals: list[Eval] | None, context: str | None, ) -> InferencePipeline: pipeline = InferencePipeline() diff --git a/tlm/templates/semantic_evaluation_completion_template.py b/tlm/templates/semantic_evaluation_completion_template.py index f2d1c4e..abd2a9e 100644 --- a/tlm/templates/semantic_evaluation_completion_template.py +++ b/tlm/templates/semantic_evaluation_completion_template.py @@ -13,7 +13,7 @@ ) from tlm.templates.parsers import RATING_XML_PARSER, THINK_RATING_XML_PARSER from tlm.templates.score_mapping import score_5_mapping -from tlm.types import SemanticEval, CompletionTemplate +from tlm.types import Eval, 
CompletionTemplate class SemanticEvaluationCompletionTemplate(CompletionTemplate): @@ -57,9 +57,7 @@ class SemanticEvaluationCompletionTemplate(CompletionTemplate): ) @classmethod - def create( - cls, eval: SemanticEval, reasoning_effort: ReasoningEffort, **kwargs - ) -> "SemanticEvaluationCompletionTemplate": + def create(cls, eval: Eval, reasoning_effort: ReasoningEffort, **kwargs) -> "SemanticEvaluationCompletionTemplate": prompt_parts = [cls._PREFIX] input_information = [] diff --git a/tlm/types/__init__.py b/tlm/types/__init__.py index 5c9241e..0f9b21a 100644 --- a/tlm/types/__init__.py +++ b/tlm/types/__init__.py @@ -7,7 +7,7 @@ CompletionFailureType, CompletionParams, FieldMetadata, - SemanticEval, + Eval, RegexPattern, AnswerChoiceToken, CompletionUsage, @@ -22,7 +22,7 @@ "SimilarityMeasure", "CompletionFailureType", "FieldMetadata", - "SemanticEval", + "Eval", "RegexPattern", "AnswerChoiceToken", "CompletionUsage", diff --git a/tlm/types/base.py b/tlm/types/base.py index 8404b94..cdb5371 100644 --- a/tlm/types/base.py +++ b/tlm/types/base.py @@ -21,6 +21,12 @@ class ExtractedResponseField(str, Enum): class SimilarityMeasure(str, Enum): + """Strategies for scoring the similarity of two generated responses. + + Values: + `JACCARD`, `EMBEDDING_SMALL`, `EMBEDDING_LARGE`, `CODE`, `STATEMENT` + """ + JACCARD = "jaccard" # formerly STRING EMBEDDING_SMALL = "embedding_small" EMBEDDING_LARGE = "embedding_large" @@ -55,7 +61,18 @@ class FieldMetadata(BaseModel): explanation: str -class SemanticEval(BaseModel): +class Eval(BaseModel): + """Criteria for performing a semantic evaluation of the query, context, and/or response. + At least one of query_identifier, context_identifier, and response_identifier must be provided. + + Attributes: + name: The name of the evaluation. + criteria: Semantic description of the criteria to assess. + query_identifier: Identifier for the user query to be provided in the prompt passed to the LLM, e.g. "User Query". 
Should be `None` if the evaluation does not require the query. + context_identifier: Identifier for the context to be provided in the prompt passed to the LLM, e.g. "Context". Should be `None` if the evaluation does not require the context. + response_identifier: Identifier for the response to be provided in the prompt passed to the LLM, e.g. "Response". Should be `None` if the evaluation does not require the response. + """ + name: str criteria: str query_identifier: str | None = None diff --git a/tlm/utils/eval_utils.py b/tlm/utils/eval_utils.py index a9a3e95..9c22661 100644 --- a/tlm/utils/eval_utils.py +++ b/tlm/utils/eval_utils.py @@ -1,7 +1,7 @@ -from tlm.types import SemanticEval +from tlm.types import Eval -def group_evals(evals: list[SemanticEval] | None) -> tuple[list[SemanticEval], list[SemanticEval]]: +def group_evals(evals: list[Eval] | None) -> tuple[list[Eval], list[Eval]]: if evals is None: return [], [] diff --git a/tlm/utils/scoring/semantic_evaluation_scoring_utils.py b/tlm/utils/scoring/semantic_evaluation_scoring_utils.py index fa93731..ca83a8b 100644 --- a/tlm/utils/scoring/semantic_evaluation_scoring_utils.py +++ b/tlm/utils/scoring/semantic_evaluation_scoring_utils.py @@ -1,7 +1,7 @@ import numpy as np from tlm.utils.math_utils import get_nan_safe_mean, make_score_asymptotic -from tlm.types import Completion, CompletionFailure, ExtractedResponseField, SemanticEval +from tlm.types import Completion, CompletionFailure, ExtractedResponseField, Eval from tlm.utils.parse_utils import compute_score_expected_value @@ -42,12 +42,12 @@ }, ] -DEFAULT_RAG_EVALS = [SemanticEval(**eval_dict) for eval_dict in _DEFAULT_EVALS_DICT] # type: ignore +DEFAULT_RAG_EVALS = [Eval(**eval_dict) for eval_dict in _DEFAULT_EVALS_DICT] # type: ignore def compute_semantic_evaluation_scores( reference_answers: list[str | None], - evals: list[SemanticEval], + evals: list[Eval], semantic_evaluation_completions: list[Completion | CompletionFailure], ) -> dict[str, float]: """