From f9bb98951865aad5e9d3cd9eb94e57b787dd58d3 Mon Sep 17 00:00:00 2001
From: Vilhjalmur Thorsteinsson <vt@extrada.com>
Date: Thu, 11 Dec 2025 16:58:35 +0000
Subject: [PATCH 1/3] Remove deprecated --handle_kludgy_ordinals flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

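Remove the outdated handle_kludgy_ordinals option from the CLI and
tokenization API, along with the public KLUDGY_ORDINALS_PASS_THROUGH,
KLUDGY_ORDINALS_MODIFY and KLUDGY_ORDINALS_TRANSLATE constants and the
ORDINAL_NUMBERS mapping. Kludgy ordinals (e.g. '1sti', '3ja') are now
always passed through unchanged as word tokens, which was the default
behavior.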

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 README.md                      | 35 +++----------------
 src/tokenizer/__init__.py      |  6 ----
 src/tokenizer/definitions.py   | 32 ++---------------
 src/tokenizer/main.py          | 16 ---------
 src/tokenizer/tokenizer.py     | 43 ++++-------------------
 test/test_cli.py               |  9 -----
 test/test_index_calculation.py | 20 -----------
 test/test_tokenizer.py         | 63 ----------------------------------
 8 files changed, 13 insertions(+), 211 deletions(-)

diff --git a/README.md b/README.md
index 2ff0267..f3de20b 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,6 @@ Other options can be specified on the command line:
 | `-g`, `--keep_composite_glyphs` | Do not replace composite glyphs using Unicode COMBINING codes with their accented/umlaut counterparts. |
 | `-e`, `--replace_html_escapes` | HTML escape codes replaced by their meaning, such as `&aacute;` -> `á`. |
 | `-c`, `--convert_numbers`      | English-style decimal points and thousands separators in numbers changed to Icelandic style. |
-| `-k N`, `--handle_kludgy_ordinals N` | Kludgy ordinal handling defined. 0: Returns the original mixed word form, 1. Kludgy ordinal returned as pure word forms, 2: Kludgy ordinals returned as pure numbers. |
 
 Type `tokenize -h` or `tokenize --help` to get a short help message.
 
@@ -453,31 +452,6 @@ functions:
 
   The default value for the `replace_html_escapes` option is `False`.
 
-* `handle_kludgy_ordinals=[value]`
-
-  This options controls the way Tokenizer handles 'kludgy' ordinals, such as
-  *1sti*, *4ðu*, or *2ja*. By default, such ordinals are returned unmodified
-  ('passed through') as word tokens (`TOK.WORD`).
-  However, this can be modified as follows:
-
-  * `tokenizer.KLUDGY_ORDINALS_MODIFY`: Kludgy ordinals are corrected
-    to become 'proper' word tokens, i.e. *1sti* becomes *fyrsti* and
-    *2ja* becomes *tveggja*.
-
-  * `tokenizer.KLUDGY_ORDINALS_TRANSLATE`: Kludgy ordinals that represent
-    proper ordinal numbers are translated to ordinal tokens (`TOK.ORDINAL`),
-    with their original text and their ordinal value. *1sti* thus
-    becomes a `TOK.ORDINAL` token with a value of 1, and *3ja* becomes
-    a `TOK.ORDINAL` with a value of 3.
-
-  * `tokenizer.KLUDGY_ORDINALS_PASS_THROUGH` is the default value of
-    the option. It causes kludgy ordinals to be returned unmodified as
-    word tokens.
-
-  Note that versions of Tokenizer prior to 1.4 behaved as if
-  `handle_kludgy_ordinals` were set to
-  `tokenizer.KLUDGY_ORDINALS_TRANSLATE`.
-
 ## Dash and Hyphen Handling
 
 Tokenizer distinguishes between three dash types and handles them contextually:
@@ -578,9 +552,8 @@ with the following exceptions:
   can be disabled; see the `replace_composite_glyphs` option described
   above.)
 
-* If the appropriate options are specified (see above), it converts
-  kludgy ordinals (*3ja*) to proper ones (*þriðja*), and English-style
-  thousand and decimal separators to Icelandic ones
+* If the `convert_numbers` option is specified (see above), English-style
+  thousand and decimal separators are converted to Icelandic ones
   (*10,345.67* becomes *10.345,67*).
 
 * If the `replace_html_escapes` option is set, Tokenizer replaces
@@ -812,8 +785,8 @@ can be found in the file `test/toktest_normal_gold_expected.txt`.
   `TOK.SERIALNUMBER` token kinds; abbreviations can now have multiple
   meanings.
 * Version 1.4.0: Added the `**options` parameter to the
-  `tokenize()` function, giving control over the handling of numbers,
-  telephone numbers, and 'kludgy' ordinals.
+  `tokenize()` function, giving control over the handling of numbers
+  and telephone numbers.
 * Version 1.3.0: Added `TOK.DOMAIN` and `TOK.HASHTAG` token types; 
   improved handling of capitalized month name *Ágúst*, which is
   now recognized when following an ordinal number; improved recognition
diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py
index 47b0d3a..1c7f229 100644
--- a/src/tokenizer/__init__.py
+++ b/src/tokenizer/__init__.py
@@ -36,9 +36,6 @@
     TP_WORD,
     EN_DASH,
     EM_DASH,
-    KLUDGY_ORDINALS_PASS_THROUGH,
-    KLUDGY_ORDINALS_MODIFY,
-    KLUDGY_ORDINALS_TRANSLATE,
     BIN_Tuple,
     BIN_TupleList,
 )
@@ -80,9 +77,6 @@
     "EM_DASH",
     "EN_DASH",
     "generate_raw_tokens",
-    "KLUDGY_ORDINALS_MODIFY",
-    "KLUDGY_ORDINALS_PASS_THROUGH",
-    "KLUDGY_ORDINALS_TRANSLATE",
     "mark_paragraphs",
     "normalized_text_from_tokens",
     "normalized_text",
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index 278188b..cea723e 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -605,20 +605,8 @@ class PersonNameTuple(NamedTuple):
 )
 
 
-# If the handle_kludgy_ordinals option is set to
-# KLUDGY_ORDINALS_PASS_THROUGH, we do not convert
-# kludgy ordinals but pass them through as word tokens.
-KLUDGY_ORDINALS_PASS_THROUGH = 0
-# If the handle_kludgy_ordinals option is set to
-# KLUDGY_ORDINALS_MODIFY, we convert '1sti' to 'fyrsti', etc.,
-# and return the modified word as a token.
-KLUDGY_ORDINALS_MODIFY = 1
-# If the handle_kludgy_ordinals option is set to
-# KLUDGY_ORDINALS_TRANSLATE, we convert '1sti' to TOK.Ordinal('1sti', 1), etc.,
-# but otherwise pass the original word through as a word token ('2ja').
-KLUDGY_ORDINALS_TRANSLATE = 2
-
-# Incorrectly written ('kludgy') ordinals
+# Incorrectly written ('kludgy') ordinals: these are passed through unchanged
+# as word tokens, but they need to be recognized so they are not parsed as numbers
 ORDINAL_ERRORS: Mapping[str, str] = {
     "1sti": "fyrsti",
     "1sta": "fyrsta",
@@ -639,22 +627,6 @@ class PersonNameTuple(NamedTuple):
     "4ra": "fjögurra",
 }
 
-# Translations of kludgy ordinal words into numbers
-ORDINAL_NUMBERS: Mapping[str, int] = {
-    "1sti": 1,
-    "1sta": 1,
-    "1stu": 1,
-    "3ji": 3,
-    "3ja": 3,
-    "3ju": 3,
-    "4ði": 4,
-    "4ða": 4,
-    "4ðu": 4,
-    "5ti": 5,
-    "5ta": 5,
-    "5tu": 5,
-}
-
 # Handling of Roman numerals
 
 RE_ROMAN_NUMERAL = re.compile(
diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
index f7a34a5..c9a61e7 100755
--- a/src/tokenizer/main.py
+++ b/src/tokenizer/main.py
@@ -149,19 +149,6 @@
     ),
 )
 
-parser.add_argument(
-    "-k",
-    "--handle_kludgy_ordinals",
-    type=int,
-    default=0,
-    help=(
-        "Kludgy ordinal handling defined.\n"
-        "\t0: Returns the original word form.\n"
-        "\t1: Ordinals returned as pure words.\n"
-        "\t2: Ordinals returned as numbers."
-    ),
-)
-
 parser.add_argument(
     "-v",
     "--version",
@@ -263,9 +250,6 @@ def val(t: Tok, quote_word: bool = False) -> Any:
     if args.one_sent_per_line:
         options["one_sent_per_line"] = True
 
-    if args.handle_kludgy_ordinals:
-        options["handle_kludgy_ordinals"] = args.handle_kludgy_ordinals
-
     if args.original:
         options["original"] = args.original
 
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 879f919..4f41657 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -1730,42 +1730,22 @@ def _is_letter(self, char: str) -> bool:
 class NumberParser:
     """Parses a sequence of digits off the front of a raw token"""
 
-    def __init__(
-        self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool
-    ) -> None:
+    def __init__(self, rt: Tok, convert_numbers: bool) -> None:
         self.rt = rt
-        self.handle_kludgy_ordinals = handle_kludgy_ordinals
         self.convert_numbers = convert_numbers
 
     def parse(self) -> Iterable[Tok]:
         """Parse the raw token, yielding result tokens"""
         # Handle kludgy ordinals: '3ji', '5ti', etc.
+        # Yield them unchanged as word tokens (pass-through behavior)
         rt = self.rt
-        handle_kludgy_ordinals = self.handle_kludgy_ordinals
         convert_numbers = self.convert_numbers
-        for key, val in ORDINAL_ERRORS.items():
+        for key in ORDINAL_ERRORS:
             rtxt = rt.txt
             if rtxt.startswith(key):
-                # This is a kludgy ordinal
+                # This is a kludgy ordinal: yield it unchanged as a word token
                 key_tok, rt = rt.split(len(key))
-                if handle_kludgy_ordinals == KLUDGY_ORDINALS_MODIFY:
-                    # Convert ordinals to corresponding word tokens:
-                    # '1sti' -> 'fyrsti', '3ji' -> 'þriðji', etc.
-                    key_tok.substitute_longer((0, len(key)), val)
-                    yield TOK.Word(key_tok)
-                elif (
-                    handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE
-                    and key in ORDINAL_NUMBERS
-                ):
-                    # Convert word-form ordinals into ordinal tokens,
-                    # i.e. '1sti' -> TOK.Ordinal('1sti', 1),
-                    # but leave other kludgy constructs ('2ja')
-                    # as word tokens
-                    yield TOK.Ordinal(key_tok, ORDINAL_NUMBERS[key])
-                else:
-                    # No special handling of kludgy ordinals:
-                    # yield them unchanged as word tokens
-                    yield TOK.Word(key_tok)
+                yield TOK.Word(key_tok)
                 break  # This skips the for loop 'else'
         else:
             # Not a kludgy ordinal: eat tokens starting with a digit
@@ -1898,7 +1878,6 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
 
 def parse_mixed(
     rt: Tok,
-    handle_kludgy_ordinals: int,
     convert_numbers: bool,
     replace_composite_glyphs: bool = True,
 ) -> Iterable[Tok]:
@@ -1994,7 +1973,7 @@ def parse_mixed(
             rtxt[0] in DIGITS_PREFIX
             or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX)
         ):
-            np = NumberParser(rt, handle_kludgy_ordinals, convert_numbers)
+            np = NumberParser(rt, convert_numbers)
             yield from np.parse()
             rt = np.rt
             ate = True
@@ -2072,12 +2051,6 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok
     replace_html_escapes: bool = options.get("replace_html_escapes", False)
     one_sent_per_line: bool = options.get("one_sent_per_line", False)
 
-    # The default behavior for kludgy ordinals is to pass them
-    # through as word tokens
-    handle_kludgy_ordinals: int = options.get(
-        "handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH
-    )
-
     # This code proceeds roughly as follows:
     # 1) The text is split into raw tokens on whitespace boundaries.
     # 2) (By far the most common case:) Raw tokens that are purely
@@ -2178,9 +2151,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok
                 yield TOK.Punctuation(punct, normalized="‚")
 
         # More complex case of mixed punctuation, letters and numbers
-        yield from parse_mixed(
-            rt, handle_kludgy_ordinals, convert_numbers, replace_composite_glyphs
-        )
+        yield from parse_mixed(rt, convert_numbers, replace_composite_glyphs)
 
     # Yield a sentinel token at the end that will be cut off by the final generator
     yield TOK.End_Sentinel()
diff --git a/test/test_cli.py b/test/test_cli.py
index 0cd826e..9dd900d 100644
--- a/test/test_cli.py
+++ b/test/test_cli.py
@@ -200,13 +200,4 @@ def test_cli(capsys: CaptureFixture[str], monkeypatch: MonkeyPatch) -> None:
         == "Hann fékk 7,5 í meðaleinkunn en bara 3,3 í íþróttum , og hlaut 2.000,5 USD fyrir ."
     )
 
-    # Handle kludgy ordinals
-    # --handle_kludgy_ordinals flag
-    t = "Hann var 1sti maðurinn til að heimsækja tunglið."
-    r = run_cli(c, m, ["-", "-", "--handle_kludgy_ordinals", "1"], t)
-    assert r == "Hann var fyrsti maðurinn til að heimsækja tunglið ."
-    # TODO: Broken functionality, needs to be fixed
-    # r = run_cli(c, m, ["-", "-", "--handle_kludgy_ordinals", "2"], t)
-    # assert r == "Hann var 1. maðurinn til að heimsækja tunglið ."
-
     # TODO: Add more tests for the CLI to achieve 100% coverage
diff --git a/test/test_index_calculation.py b/test/test_index_calculation.py
index 52a7b08..7b91e00 100644
--- a/test/test_index_calculation.py
+++ b/test/test_index_calculation.py
@@ -638,26 +638,6 @@ def test_composite_phrases() -> None:
     assert byte_indexes == [0, 25, 26]
 
 
-def test_lengthening_substitutions() -> None:
-    s = "Þetta er 3ji báturinn!"
-    #    0123456789012345678901
-    #    ^    ^  ^   ^        ^
-    #    x             x
-    #             !             lengthening happens here (3ji->þriðji)
-    toks = tokenizer.parse_tokens(
-        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
-    )
-    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
-    assert char_indexes == [0, 5, 8, 12, 21]
-    assert byte_indexes == [0, 6, 9, 13, 23]
-    toks = tokenizer.parse_tokens(
-        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
-    )
-    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
-    assert char_indexes == [0, 5, 8, 12, 21, 22]
-    assert byte_indexes == [0, 6, 9, 13, 23, 24]
-
-
 def test_converted_measurements() -> None:
     s = "Stillið ofninn á 12° C til að baka kökuna."
     #    012345678901234567890123456789012345678901
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
index 99a9b6f..d5ad84e 100644
--- a/test/test_tokenizer.py
+++ b/test/test_tokenizer.py
@@ -498,21 +498,6 @@ def test_single_tokens() -> None:
         ("1-800-1234-545566", TOK.SERIALNUMBER),
     ]
 
-    TEST_CASES_KLUDGY_MODIFY = [
-        ("1sti", [Tok(TOK.WORD, "fyrsti", None)]),
-        ("4ðu", [Tok(TOK.WORD, "fjórðu", None)]),
-        ("2svar", [Tok(TOK.WORD, "tvisvar", None)]),
-        ("4ra", [Tok(TOK.WORD, "fjögurra", None)]),
-        ("2ja", [Tok(TOK.WORD, "tveggja", None)]),
-    ]
-
-    TEST_CASES_KLUDGY_TRANSLATE = [
-        ("1sti", [Tok(TOK.ORDINAL, "1sti", 1)]),
-        ("4ðu", [Tok(TOK.ORDINAL, "4ðu", 4)]),
-        ("2svar", [Tok(TOK.WORD, "2svar", None)]),
-        ("4ra", [Tok(TOK.WORD, "4ra", None)]),
-    ]
-
     TEST_CASES_CONVERT_TELNOS: List[TestCase] = [
         ("525-4764", TOK.TELNO),
         ("4204200", [Tok(TOK.TELNO, "4204200", ("420-4200", "354"))]),
@@ -602,10 +587,6 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None:
 
     run_test(cast(Iterable[TestCase], TEST_CASES))
     run_test(cast(Iterable[TestCase], TEST_CASES_CONVERT_TELNOS))
-    run_test(TEST_CASES_KLUDGY_MODIFY, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY)
-    run_test(
-        TEST_CASES_KLUDGY_TRANSLATE, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE
-    )
     run_test(TEST_CASES_CONVERT_NUMBERS, convert_numbers=True)
     run_test(
         cast(Iterable[TestCase], TEST_CASES_COALESCE_PERCENT), coalesce_percent=True
@@ -1051,42 +1032,6 @@ def test_correction() -> None:
             """Hann „gaf“ mér €10.780,65.""",
         ),
     ]
-    SENT_KLUDGY_ORDINALS_MODIFY = [
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja herbergja íbúð.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu í þriggja herbergja íbúð.""",
-        ),
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu í fyrsta sinn.""",
-        ),
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu tvisvar í bað.""",
-        ),
-        (
-            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
-            """Ég keypti fjögurra herbergja íbúð á verði tveggja herbergja.""",
-        ),
-    ]
-    SENT_KLUDGY_ORDINALS_TRANSLATE = [
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
-        ),
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
-        ),
-        (
-            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
-            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
-        ),
-        (
-            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
-            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
-        ),
-    ]
     SENT_CONVERT_NUMBERS = [
         (
             """Hann "gaf" mér 10,780.65 dollara.""",
@@ -1102,14 +1047,6 @@ def test_correction() -> None:
         s = t.tokenize(sent)
         txt = t.detokenize(s, normalize=True)
         assert txt == correct
-    for sent, correct in SENT_KLUDGY_ORDINALS_MODIFY:
-        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY)
-        txt = t.detokenize(s, normalize=True)
-        assert txt == correct
-    for sent, correct in SENT_KLUDGY_ORDINALS_TRANSLATE:
-        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE)
-        txt = t.detokenize(s, normalize=True)
-        assert txt == correct
     for sent, correct in SENT_CONVERT_NUMBERS:
         s = t.tokenize(sent, convert_numbers=True)
         txt = t.detokenize(s, normalize=True)

From 39a39e5b724f6de25d543b8a79ff94241e79e579 Mon Sep 17 00:00:00 2001
From: Vilhjalmur Thorsteinsson <vt@extrada.com>
Date: Thu, 11 Dec 2025 17:03:47 +0000
Subject: [PATCH 2/3] CI: Fix PyPy 3.11 build by avoiding mypy dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

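Add a minimal `test` extra (pytest only) to pyproject.toml and install
it instead of `dev` on non-3.9 Python versions, since mypy's librt
dependency doesn't work on PyPy 3.11. The dev dependencies (including
mypy) are only needed for type checking on 3.9, so the separate
`pip install mypy` step in the type-check job is dropped as well.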

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .github/workflows/python-package.yml | 7 +++++--
 pyproject.toml                       | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 13bd7be..fa1c2a8 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -29,11 +29,14 @@ jobs:
       run: |
         python -m pip install --upgrade uv
         uv pip install --system --upgrade wheel setuptools
-        uv pip install --system ".[dev]"
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then
+          uv pip install --system ".[dev]"
+        else
+          uv pip install --system ".[test]"
+        fi
 
     - name: Type check with mypy (only on oldest supported Python version)
       run: |
-        if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
         if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi
 
     - name: Test with pytest
diff --git a/pyproject.toml b/pyproject.toml
index d681bd3..6fcc689 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,9 @@ Issues = "https://github.com/mideind/Tokenizer/issues"
 Changelog = "https://github.com/mideind/Tokenizer#changelog"
 
 [project.optional-dependencies]
+test = [
+    "pytest>=7.0",
+]
 dev = [
     "pytest>=7.0",
     "pytest-cov>=4.0",

From ceabab6218a0bccc76d31769d0e10c1666a8553b Mon Sep 17 00:00:00 2001
From: Vilhjalmur Thorsteinsson <vt@extrada.com>
Date: Thu, 11 Dec 2025 17:09:35 +0000
Subject: [PATCH 3/3] Simplify KLUDGY_ORDINALS to a tuple for use with
 startswith()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

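Replace the ORDINAL_ERRORS mapping with a plain tuple of the same keys,
renamed KLUDGY_ORDINALS. The mapped values (the corrected word forms)
are no longer used, and a tuple enables efficient prefix matching with
str.startswith(tuple).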

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/tokenizer/definitions.py | 37 ++++++++++++++++++------------------
 src/tokenizer/tokenizer.py   | 15 ++++++++-------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index cea723e..9eaabed 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -607,25 +607,24 @@ class PersonNameTuple(NamedTuple):
 
 # Incorrectly written ('kludgy') ordinals: these are passed through unchanged
 # as word tokens, but they need to be recognized so they are not parsed as numbers
-ORDINAL_ERRORS: Mapping[str, str] = {
-    "1sti": "fyrsti",
-    "1sta": "fyrsta",
-    "1stu": "fyrstu",
-    "3ji": "þriðji",
-    # "3ja": "þriðja",  # þriggja
-    "3ju": "þriðju",
-    "4ði": "fjórði",
-    "4ða": "fjórða",
-    "4ðu": "fjórðu",
-    "5ti": "fimmti",
-    "5ta": "fimmta",
-    "5tu": "fimmtu",
-    "2svar": "tvisvar",
-    "3svar": "þrisvar",
-    "2ja": "tveggja",
-    "3ja": "þriggja",
-    "4ra": "fjögurra",
-}
+KLUDGY_ORDINALS: tuple[str, ...] = (
+    "1sti",
+    "1sta",
+    "1stu",
+    "2svar",
+    "3svar",
+    "2ja",
+    "3ja",
+    "3ji",
+    "3ju",
+    "4ði",
+    "4ða",
+    "4ðu",
+    "4ra",
+    "5ti",
+    "5ta",
+    "5tu",
+)
 
 # Handling of Roman numerals
 
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 4f41657..2199143 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -1740,13 +1740,14 @@ def parse(self) -> Iterable[Tok]:
         # Yield them unchanged as word tokens (pass-through behavior)
         rt = self.rt
         convert_numbers = self.convert_numbers
-        for key in ORDINAL_ERRORS:
-            rtxt = rt.txt
-            if rtxt.startswith(key):
-                # This is a kludgy ordinal: yield it unchanged as a word token
-                key_tok, rt = rt.split(len(key))
-                yield TOK.Word(key_tok)
-                break  # This skips the for loop 'else'
+        rtxt = rt.txt
+        if rtxt.startswith(KLUDGY_ORDINALS):
+            # This is a kludgy ordinal: find which one matched and yield as word token
+            for key in KLUDGY_ORDINALS:
+                if rtxt.startswith(key):
+                    key_tok, rt = rt.split(len(key))
+                    yield TOK.Word(key_tok)
+                    break
         else:
             # Not a kludgy ordinal: eat tokens starting with a digit
             t, rt = parse_digits(rt, convert_numbers)