From f9bb98951865aad5e9d3cd9eb94e57b787dd58d3 Mon Sep 17 00:00:00 2001 From: Vilhjalmur Thorsteinsson Date: Thu, 11 Dec 2025 16:58:35 +0000 Subject: [PATCH 1/3] Remove deprecated --handle_kludgy_ordinals flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the outdated handle_kludgy_ordinals option from the CLI and tokenization API. Kludgy ordinals (e.g. '1sti', '3ja') are now always passed through unchanged as word tokens, which was the default behavior. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- README.md | 35 +++---------------- src/tokenizer/__init__.py | 6 ---- src/tokenizer/definitions.py | 32 ++--------------- src/tokenizer/main.py | 16 --------- src/tokenizer/tokenizer.py | 43 ++++------------------- test/test_cli.py | 9 ----- test/test_index_calculation.py | 20 ----------- test/test_tokenizer.py | 63 ---------------------------------- 8 files changed, 13 insertions(+), 211 deletions(-) diff --git a/README.md b/README.md index 2ff0267..f3de20b 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,6 @@ Other options can be specified on the command line: | `-g`, `--keep_composite_glyphs` | Do not replace composite glyphs using Unicode COMBINING codes with their accented/umlaut counterparts. | | `-e`, `--replace_html_escapes` | HTML escape codes replaced by their meaning, such as `&aacute;` -> `á`. | | `-c`, `--convert_numbers` | English-style decimal points and thousands separators in numbers changed to Icelandic style. | -| `-k N`, `--handle_kludgy_ordinals N` | Kludgy ordinal handling defined. 0: Returns the original mixed word form, 1. Kludgy ordinal returned as pure word forms, 2: Kludgy ordinals returned as pure numbers. | Type `tokenize -h` or `tokenize --help` to get a short help message. @@ -453,31 +452,6 @@ functions: The default value for the `replace_html_escapes` option is `False`. 
-* `handle_kludgy_ordinals=[value]` - - This options controls the way Tokenizer handles 'kludgy' ordinals, such as - *1sti*, *4ðu*, or *2ja*. By default, such ordinals are returned unmodified - ('passed through') as word tokens (`TOK.WORD`). - However, this can be modified as follows: - - * `tokenizer.KLUDGY_ORDINALS_MODIFY`: Kludgy ordinals are corrected - to become 'proper' word tokens, i.e. *1sti* becomes *fyrsti* and - *2ja* becomes *tveggja*. - - * `tokenizer.KLUDGY_ORDINALS_TRANSLATE`: Kludgy ordinals that represent - proper ordinal numbers are translated to ordinal tokens (`TOK.ORDINAL`), - with their original text and their ordinal value. *1sti* thus - becomes a `TOK.ORDINAL` token with a value of 1, and *3ja* becomes - a `TOK.ORDINAL` with a value of 3. - - * `tokenizer.KLUDGY_ORDINALS_PASS_THROUGH` is the default value of - the option. It causes kludgy ordinals to be returned unmodified as - word tokens. - - Note that versions of Tokenizer prior to 1.4 behaved as if - `handle_kludgy_ordinals` were set to - `tokenizer.KLUDGY_ORDINALS_TRANSLATE`. - ## Dash and Hyphen Handling Tokenizer distinguishes between three dash types and handles them contextually: @@ -578,9 +552,8 @@ with the following exceptions: can be disabled; see the `replace_composite_glyphs` option described above.) -* If the appropriate options are specified (see above), it converts - kludgy ordinals (*3ja*) to proper ones (*þriðja*), and English-style - thousand and decimal separators to Icelandic ones +* If the `convert_numbers` option is specified (see above), English-style + thousand and decimal separators are converted to Icelandic ones (*10,345.67* becomes *10.345,67*). * If the `replace_html_escapes` option is set, Tokenizer replaces @@ -812,8 +785,8 @@ can be found in the file `test/toktest_normal_gold_expected.txt`. `TOK.SERIALNUMBER` token kinds; abbreviations can now have multiple meanings. 
* Version 1.4.0: Added the `**options` parameter to the - `tokenize()` function, giving control over the handling of numbers, - telephone numbers, and 'kludgy' ordinals. + `tokenize()` function, giving control over the handling of numbers + and telephone numbers. * Version 1.3.0: Added `TOK.DOMAIN` and `TOK.HASHTAG` token types; improved handling of capitalized month name *Ágúst*, which is now recognized when following an ordinal number; improved recognition diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py index 47b0d3a..1c7f229 100644 --- a/src/tokenizer/__init__.py +++ b/src/tokenizer/__init__.py @@ -36,9 +36,6 @@ TP_WORD, EN_DASH, EM_DASH, - KLUDGY_ORDINALS_PASS_THROUGH, - KLUDGY_ORDINALS_MODIFY, - KLUDGY_ORDINALS_TRANSLATE, BIN_Tuple, BIN_TupleList, ) @@ -80,9 +77,6 @@ "EM_DASH", "EN_DASH", "generate_raw_tokens", - "KLUDGY_ORDINALS_MODIFY", - "KLUDGY_ORDINALS_PASS_THROUGH", - "KLUDGY_ORDINALS_TRANSLATE", "mark_paragraphs", "normalized_text_from_tokens", "normalized_text", diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py index 278188b..cea723e 100644 --- a/src/tokenizer/definitions.py +++ b/src/tokenizer/definitions.py @@ -605,20 +605,8 @@ class PersonNameTuple(NamedTuple): ) -# If the handle_kludgy_ordinals option is set to -# KLUDGY_ORDINALS_PASS_THROUGH, we do not convert -# kludgy ordinals but pass them through as word tokens. -KLUDGY_ORDINALS_PASS_THROUGH = 0 -# If the handle_kludgy_ordinals option is set to -# KLUDGY_ORDINALS_MODIFY, we convert '1sti' to 'fyrsti', etc., -# and return the modified word as a token. -KLUDGY_ORDINALS_MODIFY = 1 -# If the handle_kludgy_ordinals option is set to -# KLUDGY_ORDINALS_TRANSLATE, we convert '1sti' to TOK.Ordinal('1sti', 1), etc., -# but otherwise pass the original word through as a word token ('2ja'). 
-KLUDGY_ORDINALS_TRANSLATE = 2 - -# Incorrectly written ('kludgy') ordinals +# Incorrectly written ('kludgy') ordinals: these are passed through unchanged +# as word tokens, but they need to be recognized so they are not parsed as numbers ORDINAL_ERRORS: Mapping[str, str] = { "1sti": "fyrsti", "1sta": "fyrsta", @@ -639,22 +627,6 @@ class PersonNameTuple(NamedTuple): "4ra": "fjögurra", } -# Translations of kludgy ordinal words into numbers -ORDINAL_NUMBERS: Mapping[str, int] = { - "1sti": 1, - "1sta": 1, - "1stu": 1, - "3ji": 3, - "3ja": 3, - "3ju": 3, - "4ði": 4, - "4ða": 4, - "4ðu": 4, - "5ti": 5, - "5ta": 5, - "5tu": 5, -} - # Handling of Roman numerals RE_ROMAN_NUMERAL = re.compile( diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py index f7a34a5..c9a61e7 100755 --- a/src/tokenizer/main.py +++ b/src/tokenizer/main.py @@ -149,19 +149,6 @@ ), ) -parser.add_argument( - "-k", - "--handle_kludgy_ordinals", - type=int, - default=0, - help=( - "Kludgy ordinal handling defined.\n" - "\t0: Returns the original word form.\n" - "\t1: Ordinals returned as pure words.\n" - "\t2: Ordinals returned as numbers." 
- ), -) - parser.add_argument( "-v", "--version", @@ -263,9 +250,6 @@ def val(t: Tok, quote_word: bool = False) -> Any: if args.one_sent_per_line: options["one_sent_per_line"] = True - if args.handle_kludgy_ordinals: - options["handle_kludgy_ordinals"] = args.handle_kludgy_ordinals - if args.original: options["original"] = args.original diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 879f919..4f41657 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -1730,42 +1730,22 @@ def _is_letter(self, char: str) -> bool: class NumberParser: """Parses a sequence of digits off the front of a raw token""" - def __init__( - self, rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool - ) -> None: + def __init__(self, rt: Tok, convert_numbers: bool) -> None: self.rt = rt - self.handle_kludgy_ordinals = handle_kludgy_ordinals self.convert_numbers = convert_numbers def parse(self) -> Iterable[Tok]: """Parse the raw token, yielding result tokens""" # Handle kludgy ordinals: '3ji', '5ti', etc. + # Yield them unchanged as word tokens (pass-through behavior) rt = self.rt - handle_kludgy_ordinals = self.handle_kludgy_ordinals convert_numbers = self.convert_numbers - for key, val in ORDINAL_ERRORS.items(): + for key in ORDINAL_ERRORS: rtxt = rt.txt if rtxt.startswith(key): - # This is a kludgy ordinal + # This is a kludgy ordinal: yield it unchanged as a word token key_tok, rt = rt.split(len(key)) - if handle_kludgy_ordinals == KLUDGY_ORDINALS_MODIFY: - # Convert ordinals to corresponding word tokens: - # '1sti' -> 'fyrsti', '3ji' -> 'þriðji', etc. - key_tok.substitute_longer((0, len(key)), val) - yield TOK.Word(key_tok) - elif ( - handle_kludgy_ordinals == KLUDGY_ORDINALS_TRANSLATE - and key in ORDINAL_NUMBERS - ): - # Convert word-form ordinals into ordinal tokens, - # i.e. 
'1sti' -> TOK.Ordinal('1sti', 1), - # but leave other kludgy constructs ('2ja') - # as word tokens - yield TOK.Ordinal(key_tok, ORDINAL_NUMBERS[key]) - else: - # No special handling of kludgy ordinals: - # yield them unchanged as word tokens - yield TOK.Word(key_tok) + yield TOK.Word(key_tok) break # This skips the for loop 'else' else: # Not a kludgy ordinal: eat tokens starting with a digit @@ -1898,7 +1878,6 @@ def parse(self, rt: Tok) -> Iterable[Tok]: def parse_mixed( rt: Tok, - handle_kludgy_ordinals: int, convert_numbers: bool, replace_composite_glyphs: bool = True, ) -> Iterable[Tok]: @@ -1994,7 +1973,7 @@ def parse_mixed( rtxt[0] in DIGITS_PREFIX or (rtxt[0] in SIGN_PREFIX and len(rtxt) >= 2 and rtxt[1] in DIGITS_PREFIX) ): - np = NumberParser(rt, handle_kludgy_ordinals, convert_numbers) + np = NumberParser(rt, convert_numbers) yield from np.parse() rt = np.rt ate = True @@ -2072,12 +2051,6 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok replace_html_escapes: bool = options.get("replace_html_escapes", False) one_sent_per_line: bool = options.get("one_sent_per_line", False) - # The default behavior for kludgy ordinals is to pass them - # through as word tokens - handle_kludgy_ordinals: int = options.get( - "handle_kludgy_ordinals", KLUDGY_ORDINALS_PASS_THROUGH - ) - # This code proceeds roughly as follows: # 1) The text is split into raw tokens on whitespace boundaries. 
# 2) (By far the most common case:) Raw tokens that are purely @@ -2178,9 +2151,7 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok yield TOK.Punctuation(punct, normalized="‚") # More complex case of mixed punctuation, letters and numbers - yield from parse_mixed( - rt, handle_kludgy_ordinals, convert_numbers, replace_composite_glyphs - ) + yield from parse_mixed(rt, convert_numbers, replace_composite_glyphs) # Yield a sentinel token at the end that will be cut off by the final generator yield TOK.End_Sentinel() diff --git a/test/test_cli.py b/test/test_cli.py index 0cd826e..9dd900d 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -200,13 +200,4 @@ def test_cli(capsys: CaptureFixture[str], monkeypatch: MonkeyPatch) -> None: == "Hann fékk 7,5 í meðaleinkunn en bara 3,3 í íþróttum , og hlaut 2.000,5 USD fyrir ." ) - # Handle kludgy ordinals - # --handle_kludgy_ordinals flag - t = "Hann var 1sti maðurinn til að heimsækja tunglið." - r = run_cli(c, m, ["-", "-", "--handle_kludgy_ordinals", "1"], t) - assert r == "Hann var fyrsti maðurinn til að heimsækja tunglið ." - # TODO: Broken functionality, needs to be fixed - # r = run_cli(c, m, ["-", "-", "--handle_kludgy_ordinals", "2"], t) - # assert r == "Hann var 1. maðurinn til að heimsækja tunglið ." - # TODO: Add more tests for the CLI to achieve 100% coverage diff --git a/test/test_index_calculation.py b/test/test_index_calculation.py index 52a7b08..7b91e00 100644 --- a/test/test_index_calculation.py +++ b/test/test_index_calculation.py @@ -638,26 +638,6 @@ def test_composite_phrases() -> None: assert byte_indexes == [0, 25, 26] -def test_lengthening_substitutions() -> None: - s = "Þetta er 3ji báturinn!" - # 0123456789012345678901 - # ^ ^ ^ ^ ^ - # x x - # ! 
lengthening happens here (3ji->þriðji) - toks = tokenizer.parse_tokens( - s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY - ) - char_indexes, byte_indexes = tokenizer.calculate_indexes(toks) - assert char_indexes == [0, 5, 8, 12, 21] - assert byte_indexes == [0, 6, 9, 13, 23] - toks = tokenizer.parse_tokens( - s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY - ) - char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True) - assert char_indexes == [0, 5, 8, 12, 21, 22] - assert byte_indexes == [0, 6, 9, 13, 23, 24] - - def test_converted_measurements() -> None: s = "Stillið ofninn á 12° C til að baka kökuna." # 012345678901234567890123456789012345678901 diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index 99a9b6f..d5ad84e 100644 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -498,21 +498,6 @@ def test_single_tokens() -> None: ("1-800-1234-545566", TOK.SERIALNUMBER), ] - TEST_CASES_KLUDGY_MODIFY = [ - ("1sti", [Tok(TOK.WORD, "fyrsti", None)]), - ("4ðu", [Tok(TOK.WORD, "fjórðu", None)]), - ("2svar", [Tok(TOK.WORD, "tvisvar", None)]), - ("4ra", [Tok(TOK.WORD, "fjögurra", None)]), - ("2ja", [Tok(TOK.WORD, "tveggja", None)]), - ] - - TEST_CASES_KLUDGY_TRANSLATE = [ - ("1sti", [Tok(TOK.ORDINAL, "1sti", 1)]), - ("4ðu", [Tok(TOK.ORDINAL, "4ðu", 4)]), - ("2svar", [Tok(TOK.WORD, "2svar", None)]), - ("4ra", [Tok(TOK.WORD, "4ra", None)]), - ] - TEST_CASES_CONVERT_TELNOS: List[TestCase] = [ ("525-4764", TOK.TELNO), ("4204200", [Tok(TOK.TELNO, "4204200", ("420-4200", "354"))]), @@ -602,10 +587,6 @@ def run_test(test_cases: Iterable[TestCase], **options: Any) -> None: run_test(cast(Iterable[TestCase], TEST_CASES)) run_test(cast(Iterable[TestCase], TEST_CASES_CONVERT_TELNOS)) - run_test(TEST_CASES_KLUDGY_MODIFY, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY) - run_test( - TEST_CASES_KLUDGY_TRANSLATE, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE - ) run_test(TEST_CASES_CONVERT_NUMBERS, 
convert_numbers=True) run_test( cast(Iterable[TestCase], TEST_CASES_COALESCE_PERCENT), coalesce_percent=True @@ -1051,42 +1032,6 @@ def test_correction() -> None: """Hann „gaf“ mér €10.780,65.""", ), ] - SENT_KLUDGY_ORDINALS_MODIFY = [ - ( - """Hann sagði: ´Þú ert fífl´! Farðu í 3ja herbergja íbúð.""", - """Hann sagði: ‚Þú ert fífl‘! Farðu í þriggja herbergja íbúð.""", - ), - ( - """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""", - """Hann sagði: ‚Þú ert fífl‘! Farðu í fyrsta sinn.""", - ), - ( - """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""", - """Hann sagði: ‚Þú ert fífl‘! Farðu tvisvar í bað.""", - ), - ( - """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""", - """Ég keypti fjögurra herbergja íbúð á verði tveggja herbergja.""", - ), - ] - SENT_KLUDGY_ORDINALS_TRANSLATE = [ - ( - """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""", - """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""", - ), - ( - """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""", - """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""", - ), - ( - """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""", - """Hann sagði: ‚Þú ert fífl‘! 
Farðu 2svar í bað.""", - ), - ( - """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""", - """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""", - ), - ] SENT_CONVERT_NUMBERS = [ ( """Hann "gaf" mér 10,780.65 dollara.""", @@ -1102,14 +1047,6 @@ def test_correction() -> None: s = t.tokenize(sent) txt = t.detokenize(s, normalize=True) assert txt == correct - for sent, correct in SENT_KLUDGY_ORDINALS_MODIFY: - s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY) - txt = t.detokenize(s, normalize=True) - assert txt == correct - for sent, correct in SENT_KLUDGY_ORDINALS_TRANSLATE: - s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE) - txt = t.detokenize(s, normalize=True) - assert txt == correct for sent, correct in SENT_CONVERT_NUMBERS: s = t.tokenize(sent, convert_numbers=True) txt = t.detokenize(s, normalize=True) From 39a39e5b724f6de25d543b8a79ff94241e79e579 Mon Sep 17 00:00:00 2001 From: Vilhjalmur Thorsteinsson Date: Thu, 11 Dec 2025 17:03:47 +0000 Subject: [PATCH 2/3] CI: Fix PyPy 3.11 build by avoiding mypy dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Install only test dependencies (pytest) on non-3.9 Python versions, since mypy's librt dependency doesn't work on PyPy 3.11. The dev dependencies (including mypy) are only needed for type checking on 3.9. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .github/workflows/python-package.yml | 7 +++++-- pyproject.toml | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 13bd7be..fa1c2a8 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -29,11 +29,14 @@ jobs: run: | python -m pip install --upgrade uv uv pip install --system --upgrade wheel setuptools - uv pip install --system ".[dev]" + if [ "${{ matrix.python-version }}" == "3.9" ]; then + uv pip install --system ".[dev]" + else + uv pip install --system ".[test]" + fi - name: Type check with mypy (only on oldest supported Python version) run: | - if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi - name: Test with pytest diff --git a/pyproject.toml b/pyproject.toml index d681bd3..6fcc689 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,9 @@ Issues = "https://github.com/mideind/Tokenizer/issues" Changelog = "https://github.com/mideind/Tokenizer#changelog" [project.optional-dependencies] +test = [ + "pytest>=7.0", +] dev = [ "pytest>=7.0", "pytest-cov>=4.0", From ceabab6218a0bccc76d31769d0e10c1666a8553b Mon Sep 17 00:00:00 2001 From: Vilhjalmur Thorsteinsson Date: Thu, 11 Dec 2025 17:09:35 +0000 Subject: [PATCH 3/3] Simplify KLUDGY_ORDINALS to a tuple for use with startswith() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the ORDINAL_ERRORS Mapping, whose values are no longer used now that kludgy ordinals are always passed through, with a plain KLUDGY_ORDINALS tuple of its keys, enabling efficient prefix matching with str.startswith(tuple). 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/tokenizer/definitions.py | 37 ++++++++++++++++++------------------ src/tokenizer/tokenizer.py | 15 ++++++++------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py index cea723e..9eaabed 100644 --- a/src/tokenizer/definitions.py +++ b/src/tokenizer/definitions.py @@ -607,25 +607,24 @@ class PersonNameTuple(NamedTuple): # Incorrectly written ('kludgy') ordinals: these are passed through unchanged # as word tokens, but they need to be recognized so they are not parsed as numbers -ORDINAL_ERRORS: Mapping[str, str] = { - "1sti": "fyrsti", - "1sta": "fyrsta", - "1stu": "fyrstu", - "3ji": "þriðji", - # "3ja": "þriðja", # þriggja - "3ju": "þriðju", - "4ði": "fjórði", - "4ða": "fjórða", - "4ðu": "fjórðu", - "5ti": "fimmti", - "5ta": "fimmta", - "5tu": "fimmtu", - "2svar": "tvisvar", - "3svar": "þrisvar", - "2ja": "tveggja", - "3ja": "þriggja", - "4ra": "fjögurra", -} +KLUDGY_ORDINALS: tuple[str, ...] 
= ( + "1sti", + "1sta", + "1stu", + "2svar", + "3svar", + "2ja", + "3ja", + "3ji", + "3ju", + "4ði", + "4ða", + "4ðu", + "4ra", + "5ti", + "5ta", + "5tu", +) # Handling of Roman numerals diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 4f41657..2199143 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -1740,13 +1740,14 @@ def parse(self) -> Iterable[Tok]: # Yield them unchanged as word tokens (pass-through behavior) rt = self.rt convert_numbers = self.convert_numbers - for key in ORDINAL_ERRORS: - rtxt = rt.txt - if rtxt.startswith(key): - # This is a kludgy ordinal: yield it unchanged as a word token - key_tok, rt = rt.split(len(key)) - yield TOK.Word(key_tok) - break # This skips the for loop 'else' + rtxt = rt.txt + if rtxt.startswith(KLUDGY_ORDINALS): + # This is a kludgy ordinal: find which one matched and yield as word token + for key in KLUDGY_ORDINALS: + if rtxt.startswith(key): + key_tok, rt = rt.split(len(key)) + yield TOK.Word(key_tok) + break else: # Not a kludgy ordinal: eat tokens starting with a digit t, rt = parse_digits(rt, convert_numbers)