diff --git a/docs/api/config.md b/docs/api/config.md
new file mode 100644
index 0000000..865877a
--- /dev/null
+++ b/docs/api/config.md
@@ -0,0 +1,23 @@
+::: tlm.config.schema.Config
+    options:
+      heading_level: 2
+
+::: tlm.config.schema.ReferenceCompletionConfigSchema
+    options:
+      heading_level: 2
+
+::: tlm.config.schema.ObservedConsistencyConfigSchema
+    options:
+      heading_level: 2
+
+::: tlm.config.schema.SelfReflectionConfigSchema
+    options:
+      heading_level: 2
+
+::: tlm.config.schema.SemanticEvalsConfigSchema
+    options:
+      heading_level: 2
+
+::: tlm.config.schema.ModelProviderSchema
+    options:
+      heading_level: 2
diff --git a/docs/api/types.md b/docs/api/types.md
new file mode 100644
index 0000000..2d0b9ea
--- /dev/null
+++ b/docs/api/types.md
@@ -0,0 +1,19 @@
+::: tlm.inference.InferenceResult
+    options:
+      heading_level: 2
+
+::: tlm.types.base.Eval
+    options:
+      heading_level: 2
+
+::: tlm.config.presets.QualityPreset
+    options:
+      heading_level: 2
+
+::: tlm.config.presets.ReasoningEffort
+    options:
+      heading_level: 2
+
+::: tlm.types.base.SimilarityMeasure
+    options:
+      heading_level: 2
diff --git a/mkdocs.yml b/mkdocs.yml
index 7c3828e..f859c27 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -91,5 +91,7 @@ nav:
     # - Structured Outputs: tutorials/tlm_structured_outputs/index.ipynb
     # - Tool Calls: tutorials/tlm_tool_calls/index.ipynb
   - API Reference:
-    - tlm: api/tlm.md
+    - TLM: api/tlm.md
+    - Config: api/config.md
+    - Types: api/types.md
   - Additional Cookbooks: https://github.com/cleanlab/cleanlab-tools
diff --git a/tests/integration/test_inference.py b/tests/integration/test_inference.py
index 6a99d67..2ada4df 100644
--- a/tests/integration/test_inference.py
+++ b/tests/integration/test_inference.py
@@ -15,13 +15,14 @@
 tlm_core_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
 sys.path.insert(0, tlm_core_path)
 
-from tlm.config.base import ConfigInput, ReasoningEffort  # noqa: E402
+from tlm.config.base import ReasoningEffort  # noqa: E402
+from tlm.config.schema import Config  # noqa: E402
 from tlm.config.models import BEDROCK_MODELS  # noqa: E402
 from tlm.config.presets import QualityPreset  # noqa: E402
 from tlm.templates import ReferenceCompletionTemplate  # noqa: E402
 from tlm import TLM  # noqa: E402
 from tlm.utils.completion_utils import generate_completion  # noqa: E402
-from tlm.types import Completion, SemanticEval, SimilarityMeasure  # noqa: E402
+from tlm.types import Completion, Eval, SimilarityMeasure  # noqa: E402
 
 # Load environment variables from .env file at top level of project
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -96,7 +97,7 @@ async def run_tests():
 
     test_inference_params = [
         {
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.BASE,
                 reasoning_effort=ReasoningEffort.LOW,
                 model="gpt-4.1-mini",
@@ -104,7 +105,7 @@ async def run_tests():
             "openai_args": {"messages": [{"role": "user", "content": "What is the capital of France?"}]},
         },
         {
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.HIGH,
                 reasoning_effort=ReasoningEffort.HIGH,
                 similarity_measure=SimilarityMeasure.EMBEDDING_LARGE,
@@ -114,12 +115,12 @@ async def run_tests():
                 "messages": [{"role": "user", "content": "Explain the concept of machine learning in simple terms."}]
             },
             "evals": [
-                SemanticEval(
+                Eval(
                     name="clarity",
                     criteria="The response is clear and easy to understand.",
                     response_identifier="response",
                 ),
-                SemanticEval(
+                Eval(
                     name="conciseness",
                     criteria="The response is concise and to the point.",
                     response_identifier="response",
@@ -128,7 +129,7 @@ async def run_tests():
             "enabled": True,
         },
         {
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.MEDIUM,
                 reasoning_effort=ReasoningEffort.MEDIUM,
                 similarity_measure=SimilarityMeasure.JACCARD,
@@ -138,7 +139,7 @@ async def run_tests():
             },
         },
         {
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.HIGH,
                 reasoning_effort=ReasoningEffort.HIGH,
                 constrain_outputs=["positive", "negative", "neutral"],
@@ -154,7 +155,7 @@ async def run_tests():
             },
         },
         {
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.HIGH,
                 reasoning_effort=ReasoningEffort.HIGH,
                 constrain_outputs=["yes", "no"],
@@ -165,7 +166,7 @@ async def run_tests():
             },
         },
         {
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.HIGH,
                 reasoning_effort=ReasoningEffort.HIGH,
             ),
@@ -197,7 +198,7 @@ async def run_tests():
             },
             "context": "The Simple Water Bottle is a reusable 27 oz water bottle.",
             # "evals": DEFAULT_RAG_EVALS,
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.BEST,
                 reasoning_effort=ReasoningEffort.MEDIUM,
             ),
@@ -267,7 +268,7 @@ async def run_tests():
                 },
                 "perplexity": 0.95,
             },
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.HIGH,
                 model="gpt-4.1-mini",
             ),
@@ -292,7 +293,7 @@ async def run_tests():
                     },
                 },
             },
-            "config_input": ConfigInput(
+            "config": Config(
                 quality_preset=QualityPreset.HIGH,
                 model="gpt-4.1-mini",
             ),
diff --git a/tests/unit/templates/test_semantic_evaluation_completion_template.py b/tests/unit/templates/test_semantic_evaluation_completion_template.py
index 92beaf8..1c3f414 100644
--- a/tests/unit/templates/test_semantic_evaluation_completion_template.py
+++ b/tests/unit/templates/test_semantic_evaluation_completion_template.py
@@ -1,7 +1,7 @@
 from tlm.config.presets import ReasoningEffort
 from tlm.templates.semantic_evaluation_completion_template import SemanticEvaluationCompletionTemplate
 from tlm.utils.completion_utils import generate_completion
-from tlm.types import Completion, ExtractedResponseField, SemanticEval
+from tlm.types import Completion, ExtractedResponseField, Eval
 
 import pytest
 
@@ -44,7 +44,7 @@ async def test_semantic_evaluation_completion_template_with_reasoning_effort(
     expected_mapped_score: float,
 ) -> None:
     """Test SemanticEvaluationCompletionTemplate with different reasoning effort levels."""
-    eval = SemanticEval(
+    eval = Eval(
         name="context_sufficiency",
         criteria="Determine if the Document contains 100% of the information needed to answer the Question.",
         query_identifier="Question",
@@ -109,7 +109,7 @@ async def test_semantic_evaluation_completion_template_with_different_identifier
     reference_answer: str | None,
 ) -> None:
     """Test SemanticEvaluationCompletionTemplate with different identifier configurations."""
-    eval = SemanticEval(
+    eval = Eval(
         name="test_eval",
         criteria="Test criteria for evaluation.",
         query_identifier=query_identifier,
diff --git a/tlm/api.py b/tlm/api.py
index 007d7f1..1e92ef2 100644
--- a/tlm/api.py
+++ b/tlm/api.py
@@ -4,10 +4,11 @@
 import sys
 from openai.types.chat import ChatCompletion
 
-from tlm.config.base import Config, ConfigInput
+from tlm.config.base import BaseConfig
+from tlm.config.schema import Config
 from tlm.config.presets import WorkflowType
 from tlm.inference import InferenceResult, tlm_inference
-from tlm.types import SemanticEval
+from tlm.types import Eval
 
 
 def is_notebook() -> bool:
@@ -28,8 +29,8 @@ class TLM:
 
     def __init__(
         self,
-        config_input: ConfigInput = ConfigInput(),
-        evals: list[SemanticEval] | None = None,
+        config: Config = Config(),
+        evals: list[Eval] | None = None,
     ):
         """Initialize a TLM instance.
 
@@ -40,7 +41,7 @@ def __init__(
             evals: Optional list of evaluations. Each evaluation
                 defines a name, criteria, and optional query/context/response identifiers.
         """
-        self.config_input = config_input
+        self.config = config
         self.evals = evals
 
         is_notebook_flag = is_notebook()
@@ -59,7 +60,7 @@ def create(
         self,
         *,
         context: str | None = None,
-        evals: list[SemanticEval] | None = None,
+        evals: list[Eval] | None = None,
         **openai_kwargs: Any,
     ) -> InferenceResult:
         """Create a new LLM completion and then score its trustworthiness.
@@ -102,7 +103,7 @@ def score(
         *,
         response: ChatCompletion | dict[str, Any],
         context: str | None = None,
-        evals: list[SemanticEval] | None = None,
+        evals: list[Eval] | None = None,
         **openai_kwargs: Any,
     ) -> InferenceResult:
         """Score the trusworthiness of an existing LLM response/completion (from any LLM, or even from a human-writer).
@@ -144,7 +145,7 @@ async def _async_inference(
         *,
         response: dict[str, Any] | None = None,
         context: str | None = None,
-        evals: list[SemanticEval] | None = None,
+        evals: list[Eval] | None = None,
         **openai_kwargs: Any,
     ) -> InferenceResult:
         """Internal async method that performs the inference or scoring operation.
@@ -157,10 +158,10 @@ async def _async_inference(
             openai_args=openai_kwargs,
             score=response is not None,
             rag=(context is not None),
-            constrain_outputs=self.config_input.constrain_outputs,
+            constrain_outputs=self.config.constrain_outputs,
         )
         model = openai_kwargs.get("model")
-        config = Config.from_input(self.config_input, workflow_type, model)
+        config = BaseConfig.from_input(self.config, workflow_type, model)
         return await tlm_inference(
             completion_params=openai_kwargs,
             response=response,
diff --git a/tlm/components/semantic_evaluation_score_generator.py b/tlm/components/semantic_evaluation_score_generator.py
index 6c7cf23..040b67b 100644
--- a/tlm/components/semantic_evaluation_score_generator.py
+++ b/tlm/components/semantic_evaluation_score_generator.py
@@ -5,7 +5,7 @@
 from tlm.templates import SemanticEvaluationCompletionTemplate
 from tlm.utils.completion_utils import generate_completion
 from tlm.utils.scoring.semantic_evaluation_scoring_utils import compute_semantic_evaluation_scores
-from tlm.types import SemanticEval
+from tlm.types import Eval
 
 
 class SemanticEvaluationScoreGenerator(Component):
@@ -17,7 +17,7 @@ def __init__(
         self,
         query: str | None,
         context: str | None,
-        evals: list[SemanticEval],
+        evals: list[Eval],
         reasoning_effort: ReasoningEffort,
         temperature: float,
         **kwargs,
diff --git a/tlm/config/base.py b/tlm/config/base.py
index 83f9ac3..17d7c80 100644
--- a/tlm/config/base.py
+++ b/tlm/config/base.py
@@ -1,9 +1,9 @@
 from pydantic import BaseModel, Field
 
+from tlm.config.schema import Config as ConfigSchema
 from tlm.config.presets import (
     DEFAULT_CONFIG_FOR_QUALITY,
     DEFAULT_CONFIG_FOR_QUALITY_AND_WORKFLOW,
-    QualityPreset,
     ReasoningEffort,
     WorkflowType,
 )
@@ -15,12 +15,6 @@
 settings = get_settings()
 
 
-class ReferenceCompletionConfigInput(BaseModel):
-    num_reference_completions: int | None = Field(
-        default=None, description="The attempted number of reference completions to generate."
-    )
-
-
 class ReferenceCompletionConfig(BaseModel):
     num_reference_completions: int = 1
     min_reference_completions: int = Field(
@@ -31,13 +25,6 @@ class ReferenceCompletionConfig(BaseModel):
     )
 
 
-class ObservedConsistencyConfigInput(BaseModel):
-    num_consistency_completions: int | None = Field(
-        default=None, description="The attempted number of observed consistency completions to generate."
-    )
-    observed_consistency_temperature: float | None = None
-
-
 class ObservedConsistencyConfig(BaseModel):
     num_consistency_completions: int
     min_consistency_completions: int = Field(
@@ -46,18 +33,6 @@ class ObservedConsistencyConfig(BaseModel):
     observed_consistency_temperature: float = 1.0
 
 
-class SelfReflectionConfigInput(BaseModel):
-    self_reflection_temperature: float | None = None
-    num_self_reflection_completions: int | None = Field(
-        default=None,
-        description=(
-            "The number of self reflection prompts to use. Note that the first X number of prompts will be used, "
-            "i.e. the order of the prompt templates in SELF_REFLECTION_TEMPLATES_BY_WORKFLOW[workflow_type] matters. "
-            "-1 means all prompts will be used."
-        ),
-    )
-
-
 class SelfReflectionConfig(BaseModel):
     self_reflection_temperature: float | None = None
     num_self_reflection_completions: int
@@ -66,38 +41,12 @@ class SelfReflectionConfig(BaseModel):
     )
 
 
-class SemanticEvalsConfigInput(BaseModel):
-    use_prompt_evaluation: bool | None = None
-    prompt_evaluation_temperature: float | None = None
-    semantic_evaluation_temperature: float | None = None
-
-
 class SemanticEvalsConfig(BaseModel):
     use_prompt_evaluation: bool = False
     prompt_evaluation_temperature: float = 0.0
     semantic_evaluation_temperature: float = 0.0
 
 
-class ModelProviderInput(BaseModel):
-    provider: str | None = None
-    api_base: str | None = None
-    api_key: str | None = None
-    api_version: str | None = None
-
-
-class ConfigInput(
-    ReferenceCompletionConfigInput,
-    ObservedConsistencyConfigInput,
-    SelfReflectionConfigInput,
-    SemanticEvalsConfigInput,
-    ModelProviderInput,
-):
-    quality_preset: QualityPreset = QualityPreset.MEDIUM
-    reasoning_effort: ReasoningEffort | None = None
-    similarity_measure: SimilarityMeasure | None = None
-    constrain_outputs: list[str] | None = None
-
-
 class BaseConfig(
     ReferenceCompletionConfig,
     ObservedConsistencyConfig,
@@ -110,10 +59,8 @@ class BaseConfig(
     reasoning_effort: ReasoningEffort = ReasoningEffort.NONE
     constrain_outputs: list[str] | None = None
 
-
-class Config(BaseConfig):
     @classmethod
-    def from_input(cls, input: ConfigInput, workflow_type: WorkflowType, model: str | None) -> "Config":
+    def from_input(cls, input: ConfigSchema, workflow_type: WorkflowType, model: str | None) -> "BaseConfig":
         defaults_for_quality = DEFAULT_CONFIG_FOR_QUALITY[input.quality_preset]
         defaults_for_workflow = DEFAULT_CONFIG_FOR_QUALITY_AND_WORKFLOW[input.quality_preset].get(
             workflow_type
diff --git a/tlm/config/presets.py b/tlm/config/presets.py
index ad25e81..6d04d3e 100644
--- a/tlm/config/presets.py
+++ b/tlm/config/presets.py
@@ -3,6 +3,15 @@
 
 
 class QualityPreset(str, Enum):
+    """Quality presets that control the trade-off between speed and accuracy.
+
+    Higher quality presets generate more completions and use more advanced techniques,
+    resulting in higher trustworthiness scores but slower inference and higher costs.
+
+    Values:
+        `BASE`, `LOW`, `MEDIUM` (default), `HIGH`, `BEST`
+    """
+
     BASE = "base"
     LOW = "low"
     MEDIUM = "medium"
@@ -11,7 +20,14 @@ class QualityPreset(str, Enum):
 
 
 class ReasoningEffort(str, Enum):
-    """Enum for different levels of reasoning effort supported by TLM."""
+    """Reasoning effort levels that control explanation generation for trustworthiness scores.
+
+    Higher reasoning effort generates longer explanations that provide more detailed
+    reasoning about why a particular trustworthiness score was assigned.
+
+    Values:
+        `NONE` (default), `LOW`, `MEDIUM`, `HIGH`
+    """
 
     NONE = "none"
     LOW = "low"
diff --git a/tlm/config/schema.py b/tlm/config/schema.py
new file mode 100644
index 0000000..e83a8db
--- /dev/null
+++ b/tlm/config/schema.py
@@ -0,0 +1,110 @@
+from tlm.config.presets import QualityPreset, ReasoningEffort
+from tlm.types import SimilarityMeasure
+
+from pydantic import BaseModel, Field
+
+
+class ReferenceCompletionConfigSchema(BaseModel):
+    """
+    Configuration for reference completion generation.
+
+    Attributes:
+        num_reference_completions: The attempted number of reference completions to generate.
+    """
+
+    num_reference_completions: int | None = Field(
+        default=None, description="The attempted number of reference completions to generate."
+    )
+
+
+class ObservedConsistencyConfigSchema(BaseModel):
+    """
+    Configuration for generating additional completions against which to score consistency of reference completions.
+
+    Attributes:
+        num_consistency_completions: The attempted number of observed consistency completions to generate.
+        observed_consistency_temperature: The temperature to use for generating comparison completions.
+    """
+
+    num_consistency_completions: int | None = Field(
+        default=None, description="The attempted number of observed consistency completions to generate."
+    )
+    observed_consistency_temperature: float | None = None
+
+
+class SelfReflectionConfigSchema(BaseModel):
+    """
+    Configuration for prompting LLM-as-judge to score the trustworthiness of reference completions using self-reflection prompts.
+
+    Attributes:
+        self_reflection_temperature: The temperature to use for self reflection completions.
+        num_self_reflection_completions: The number of self reflection prompts to use (first N, in order); -1 uses all.
+    """
+
+    self_reflection_temperature: float | None = None
+    num_self_reflection_completions: int | None = Field(
+        default=None,
+        description=(
+            "The number of self reflection prompts to use. Note that the first X number of prompts will be used, "
+            "i.e. the order of the prompt templates in SELF_REFLECTION_TEMPLATES_BY_WORKFLOW[workflow_type] matters. "
+            "-1 means all prompts will be used."
+        ),
+    )
+
+
+class SemanticEvalsConfigSchema(BaseModel):
+    """
+    Configuration for semantic evaluation of reference completions.
+
+    Attributes:
+        use_prompt_evaluation: Whether to incorporate prompt evaluation scores into the final trustworthiness score.
+        prompt_evaluation_temperature: The temperature to use for prompt evaluation completions.
+        semantic_evaluation_temperature: The temperature to use when generating completions to score the Evals.
+    """
+
+    use_prompt_evaluation: bool | None = None
+    prompt_evaluation_temperature: float | None = None  # TODO: rename (intended new name not recorded)
+    semantic_evaluation_temperature: float | None = None  # TODO: rename (intended new name not recorded)
+
+
+class ModelProviderSchema(BaseModel):
+    """
+    Configuration for the model provider in alignment with the LiteLLM API.
+
+    Attributes:
+        provider: The name of the model provider.
+        api_base: The base URL of the model provider's API.
+        api_key: The API key to use for the model provider.
+        api_version: The version of the model provider's API.
+    """
+
+    provider: str | None = None
+    api_base: str | None = None
+    api_key: str | None = None
+    api_version: str | None = None
+
+
+class Config(
+    ReferenceCompletionConfigSchema,
+    ObservedConsistencyConfigSchema,
+    SelfReflectionConfigSchema,
+    SemanticEvalsConfigSchema,
+    ModelProviderSchema,
+):
+    """Configuration for TLM inference.
+
+    This class combines multiple configuration schemas to provide comprehensive
+    control over TLM's inference behavior, including reference completions,
+    consistency checking, self-reflection, semantic evaluation, and model provider settings.
+
+    Attributes:
+        quality_preset: Quality preset controlling the trade-off between speed and accuracy.
+        reasoning_effort: Optional `ReasoningEffort` level controlling explanation generation for trustworthiness scores.
+        similarity_measure: Optional similarity measure to use for comparing consistency across responses.
+        constrain_outputs: Optional list of allowed output values to constrain responses, for example in multiple choice questions.
+    """
+
+    quality_preset: QualityPreset = QualityPreset.MEDIUM
+    reasoning_effort: ReasoningEffort | None = None
+    similarity_measure: SimilarityMeasure | None = None
+    constrain_outputs: list[str] | None = None
diff --git a/tlm/inference.py b/tlm/inference.py
index 92e1d45..4ccba9d 100644
--- a/tlm/inference.py
+++ b/tlm/inference.py
@@ -1,14 +1,25 @@
 from typing import Any, TypedDict
 
-from tlm.config.base import Config
+from tlm.config.base import BaseConfig
 from tlm.config.presets import WorkflowType
 from tlm.pipeline import PipelineFactory
-from tlm.types import SemanticEval, CompletionParams
+from tlm.types import Eval, CompletionParams
 from tlm.utils.scoring.semantic_evaluation_scoring_utils import DEFAULT_RAG_EVALS
 
 
 class InferenceResult(TypedDict):
-    response: str | dict[str, Any]  # either a response string or OpenAI chat completion dict
+    """Result returned from TLM inference.
+
+    Attributes:
+        response: Either a response string or dictionary representation of an OpenAI chat completion.
+        trustworthiness_score: Score indicating the trustworthiness of the response, between 0 and 1.
+        usage: Token usage information for the inference, including prompt and completion tokens.
+        metadata: Optional metadata, e.g. per-field scores for structured outputs.
+        evals: Optional dictionary of Eval scores, keyed by evaluation name.
+        explanation: Explanation for the trustworthiness score.
+    """
+
+    response: str | dict[str, Any]
     trustworthiness_score: float
     usage: dict[str, Any]
     metadata: dict[str, Any] | None
@@ -20,9 +31,9 @@ async def tlm_inference(
     *,
     completion_params: CompletionParams,
     response: dict[str, Any] | None,
-    evals: list[SemanticEval] | None,
+    evals: list[Eval] | None,
     context: str | None,
-    config: Config,
+    config: BaseConfig,
 ) -> InferenceResult:
     if evals is None and config.workflow_type == WorkflowType.RAG:
         evals = DEFAULT_RAG_EVALS
diff --git a/tlm/pipeline/factory.py b/tlm/pipeline/factory.py
index 17361e4..426cb63 100644
--- a/tlm/pipeline/factory.py
+++ b/tlm/pipeline/factory.py
@@ -14,12 +14,12 @@
     SelfReflectionCompletionGenerator,
     SelfReflectionScoreComputation,
 )
-from tlm.config.base import Config
+from tlm.config.base import BaseConfig
 from tlm.config.presets import WorkflowType
 from tlm.pipeline import InferencePipeline
 from tlm.utils.prompt_utils import format_user_request, extract_user_prompt
 from tlm.utils.eval_utils import group_evals
-from tlm.types import SemanticEval, CompletionParams, InferenceType
+from tlm.types import Eval, CompletionParams, InferenceType
 
 
 class PipelineFactory:
@@ -27,9 +27,9 @@ class PipelineFactory:
     def create(
         *,
         completion_params: CompletionParams,
-        config: Config,
+        config: BaseConfig,
         response: Dict[str, Any] | None,
-        evals: list[SemanticEval] | None,
+        evals: list[Eval] | None,
         context: str | None,
     ) -> InferencePipeline:
         pipeline = InferencePipeline()
diff --git a/tlm/templates/semantic_evaluation_completion_template.py b/tlm/templates/semantic_evaluation_completion_template.py
index f2d1c4e..abd2a9e 100644
--- a/tlm/templates/semantic_evaluation_completion_template.py
+++ b/tlm/templates/semantic_evaluation_completion_template.py
@@ -13,7 +13,7 @@
 )
 from tlm.templates.parsers import RATING_XML_PARSER, THINK_RATING_XML_PARSER
 from tlm.templates.score_mapping import score_5_mapping
-from tlm.types import SemanticEval, CompletionTemplate
+from tlm.types import Eval, CompletionTemplate
 
 
 class SemanticEvaluationCompletionTemplate(CompletionTemplate):
@@ -57,9 +57,7 @@ class SemanticEvaluationCompletionTemplate(CompletionTemplate):
     )
 
     @classmethod
-    def create(
-        cls, eval: SemanticEval, reasoning_effort: ReasoningEffort, **kwargs
-    ) -> "SemanticEvaluationCompletionTemplate":
+    def create(cls, eval: Eval, reasoning_effort: ReasoningEffort, **kwargs) -> "SemanticEvaluationCompletionTemplate":
         prompt_parts = [cls._PREFIX]
         input_information = []
 
diff --git a/tlm/types/__init__.py b/tlm/types/__init__.py
index 5c9241e..0f9b21a 100644
--- a/tlm/types/__init__.py
+++ b/tlm/types/__init__.py
@@ -7,7 +7,7 @@
     CompletionFailureType,
     CompletionParams,
     FieldMetadata,
-    SemanticEval,
+    Eval,
     RegexPattern,
     AnswerChoiceToken,
     CompletionUsage,
@@ -22,7 +22,7 @@
     "SimilarityMeasure",
     "CompletionFailureType",
     "FieldMetadata",
-    "SemanticEval",
+    "Eval",
     "RegexPattern",
     "AnswerChoiceToken",
     "CompletionUsage",
diff --git a/tlm/types/base.py b/tlm/types/base.py
index 8404b94..cdb5371 100644
--- a/tlm/types/base.py
+++ b/tlm/types/base.py
@@ -21,6 +21,12 @@ class ExtractedResponseField(str, Enum):
 
 
 class SimilarityMeasure(str, Enum):
+    """Strategies for scoring the similarity of two generated responses.
+
+    Values:
+        `JACCARD`, `EMBEDDING_SMALL`, `EMBEDDING_LARGE`, `CODE`, `STATEMENT`
+    """
+
     JACCARD = "jaccard"  # formerly STRING
     EMBEDDING_SMALL = "embedding_small"
     EMBEDDING_LARGE = "embedding_large"
@@ -55,7 +61,18 @@ class FieldMetadata(BaseModel):
     explanation: str
 
 
-class SemanticEval(BaseModel):
+class Eval(BaseModel):
+    """Criteria for performing a semantic evaluation of the query, context, and/or response.
+    At least one of query_identifier, context_identifier, and response_identifier must be provided.
+
+    Attributes:
+        name: The name of the evaluation.
+        criteria: Semantic description of the criteria to assess.
+        query_identifier: Identifier for the user query to be provided in the prompt passed to the LLM, e.g. "User Query". Should be `None` if the evaluation does not require the query.
+        context_identifier: Identifier for the context to be provided in the prompt passed to the LLM, e.g. "Context". Should be `None` if the evaluation does not require the context.
+        response_identifier: Identifier for the response to be provided in the prompt passed to the LLM, e.g. "Response". Should be `None` if the evaluation does not require the response.
+    """
+
     name: str
     criteria: str
     query_identifier: str | None = None
diff --git a/tlm/utils/eval_utils.py b/tlm/utils/eval_utils.py
index a9a3e95..9c22661 100644
--- a/tlm/utils/eval_utils.py
+++ b/tlm/utils/eval_utils.py
@@ -1,7 +1,7 @@
-from tlm.types import SemanticEval
+from tlm.types import Eval
 
 
-def group_evals(evals: list[SemanticEval] | None) -> tuple[list[SemanticEval], list[SemanticEval]]:
+def group_evals(evals: list[Eval] | None) -> tuple[list[Eval], list[Eval]]:
     if evals is None:
         return [], []
 
diff --git a/tlm/utils/scoring/semantic_evaluation_scoring_utils.py b/tlm/utils/scoring/semantic_evaluation_scoring_utils.py
index fa93731..ca83a8b 100644
--- a/tlm/utils/scoring/semantic_evaluation_scoring_utils.py
+++ b/tlm/utils/scoring/semantic_evaluation_scoring_utils.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from tlm.utils.math_utils import get_nan_safe_mean, make_score_asymptotic
-from tlm.types import Completion, CompletionFailure, ExtractedResponseField, SemanticEval
+from tlm.types import Completion, CompletionFailure, ExtractedResponseField, Eval
 from tlm.utils.parse_utils import compute_score_expected_value
 
 
@@ -42,12 +42,12 @@
     },
 ]
 
-DEFAULT_RAG_EVALS = [SemanticEval(**eval_dict) for eval_dict in _DEFAULT_EVALS_DICT]  # type: ignore
+DEFAULT_RAG_EVALS = [Eval(**eval_dict) for eval_dict in _DEFAULT_EVALS_DICT]  # type: ignore
 
 
 def compute_semantic_evaluation_scores(
     reference_answers: list[str | None],
-    evals: list[SemanticEval],
+    evals: list[Eval],
     semantic_evaluation_completions: list[Completion | CompletionFailure],
 ) -> dict[str, float]:
     """