diff --git a/README.md b/README.md index e923e72..c60bb8e 100644 --- a/README.md +++ b/README.md @@ -29,15 +29,15 @@ uses Tokenizer on its input. Tokenizer is licensed under the MIT license. -## Performance +## Indicative performance -Tokenization of 1 MB of a wide selection of texts from the Icelandic Gigaword Corpus -using a 64-bit 2.6 GHz Intel Core i9. +Time to tokenize 1 MB of a wide selection of texts from the Icelandic Gigaword Corpus +using a 64-bit 2.6 GHz Intel Core i9: -| | Time (sec) | -|---------------|--------------| -| CPython 3.12 | 25.27 | -| PyPy 3.11 | 8.08 | +| | Time (sec) | +|---------------|------------:| +| CPython 3.12 | 25.27 | +| PyPy 3.11 | 8.08 | Running tokenization with PyPy is about 3x faster than with CPython. @@ -88,8 +88,8 @@ the command line: $ tokenize input.txt output.txt ``` -Input and output files are in UTF-8 encoding. If the files are not -given explicitly, `stdin` and `stdout` are used for input and output, +Input and output files are assumed to be UTF-8 encoded. If the file names +are not given explicitly, `stdin` and `stdout` are used for input and output, respectively. Empty lines in the input are treated as hard sentence boundaries. @@ -103,14 +103,14 @@ on the command line: | Option | Description | |-------------|-----------------------------------------------------------| -| `--csv` | Deep tokenization. Output token objects in CSV format, one per line. Sentences are separated by lines containing `0,"",""`. | +| `--csv` | Deep tokenization. Output token objects in CSV format, one per line. Each line contains: token kind (number), normalized text, value (if applicable), original text with preserved whitespace, and character span indices. Sentences are separated by lines containing `0,"","","",""`. | | `--json` | Deep tokenization. Output token objects in JSON format, one per line. 
Each JSON object contains: `k` (token kind), `t` (normalized text), `v` (value if applicable), `o` (original text with preserved whitespace), `s` (character span indices). | Other options can be specified on the command line: | Option | Description | |------------------------------|-----------------------------------------------------------| -| `-n`, `--normalize` | Normalize punctuation, causing e.g. quotes to be output in Icelandic form and hyphens to be regularized. This option is only applicable to shallow tokenization. | +| `-n`, `--normalize` | Normalize punctuation: quotes output in Icelandic form („these“), ellipsis as single character (…), year ranges with en-dash (1914–1918), and em-dashes centered with spaces ( — ). This option is only applicable to shallow tokenization. | | `-s`, `--one_sent_per_line` | Input contains strictly one sentence per line, i.e. every newline is a sentence boundary. | | `-o`, `--original` | Output original token text, i.e. bypass shallow tokenization. This effectively runs the tokenizer as a sentence splitter only. | | `-m`, `--convert_measurements` | Degree signal in tokens denoting temperature normalized (200° C -> 200 °C). | @@ -130,19 +130,19 @@ $ echo "3.janúar sl. keypti ég 64kWst rafbíl. Hann kostaði € 30.000." | Hann kostaði €30.000 . $ echo "3.janúar sl. keypti ég 64kWst rafbíl. Hann kostaði € 30.000." | tokenize --csv -19,"3. janúar","0|1|3" -6,"sl.","síðastliðinn" -6,"keypti","" -6,"ég","" -22,"64kWst","J|230400000.0" -6,"rafbíl","" -1,".",".", -0,"","" -6,"Hann","" -6,"kostaði","" -13,"€30.000","30000|EUR" -1,".",".", -0,"","" +19,"3. 
janúar","0|1|3","3.janúar","0-1-2-2-3-4-5-6-7" +6,"sl.","síðastliðinn"," sl.","1-2-3" +6,"keypti",""," keypti","1-2-3-4-5-6" +6,"ég",""," ég","3-4" +22,"64kWst","J|230400000.0"," 64kWst","1-2-3-4-5-6" +6,"rafbíl",""," rafbíl","1-2-3-4-5-6" +1,".",".",".","0" +0,"","","","" +6,"Hann",""," Hann","1-2-3-4" +6,"kostaði",""," kostaði","1-2-3-4-5-6-7" +13,"€30.000","30000|EUR"," € 30.000","1-3-4-5-6-7-8" +1,".",".",".","0" +0,"","","","" $ echo "3.janúar sl. keypti ég 64kWst rafbíl. Hann kostaði € 30.000." | tokenize --json {"k":"BEGIN SENT"} @@ -162,6 +162,18 @@ $ echo "3.janúar sl. keypti ég 64kWst rafbíl. Hann kostaði € 30.000." | {"k":"END SENT"} ``` +#### CSV Output Format + +When using `--csv`, each token is output as a CSV row with the following five fields: + +1. **Token kind** (number): Numeric code representing the token type (e.g., 6 for WORD, 19 for DATEREL, 1 for PUNCTUATION) +2. **Normalized text**: The processed text of the token +3. **Value**: The parsed value, if applicable (e.g., date tuples, amounts, abbreviation expansions), or empty string +4. **Original text**: The original text including preserved whitespace +5. **Span indices**: Character indices mapping each character in the normalized text to its position in the original text, separated by hyphens + +Sentences are separated by rows containing `0,"","","",""`. + #### JSON Output Format When using `--json`, each token is output as a JSON object on a separate line with the following fields: @@ -306,7 +318,8 @@ You can pass the option `normalize=True` to the function if you want the normalized form of punctuation tokens. Normalization outputs Icelandic single and double quotes („these“) instead of English-style ones ("these"), converts three-dot ellipsis ... to single character -ellipsis …, and casts en-dashes – and em-dashes — to regular hyphens. +ellipsis …, normalizes year ranges to use en-dash (1914–1918), and +ensures em-dashes are centered with spaces ( — ). 
The `tokenizer.split_into_sentences()` function is typically called in a `for` loop: @@ -361,8 +374,9 @@ The `tokenizer.normalized_text(token)` function returns the normalized text for a token. This means that the original token text is returned except for certain punctuation tokens, where a normalized form is returned instead. Specifically, English-type quotes -are converted to Icelandic ones, and en- and em-dashes are converted -to regular hyphens. +are converted to Icelandic ones („these“), hyphens in year ranges are +converted to en-dash (1914–1918), and consecutive identical dashes are +preserved as multi-character tokens. ## The `text_from_tokens()` function @@ -374,7 +388,7 @@ with spaces between tokens. Example: >>> import tokenizer >>> toklist = list(tokenizer.tokenize("Hann sagði: \"Þú ert ágæt!\".")) >>> tokenizer.text_from_tokens(toklist) -'Hann sagði : \" Þú ert ágæt ! \" .' +'Hann sagði : " Þú ert ágæt ! " .' ``` ## The `normalized_text_from_tokens()` function @@ -387,7 +401,7 @@ token list, with spaces between tokens. Example (note the double quotes): >>> import tokenizer >>> toklist = list(tokenizer.tokenize("Hann sagði: \"Þú ert ágæt!\".")) >>> tokenizer.normalized_text_from_tokens(toklist) -'Hann sagði : \xE2\x80\x9e Þú ert ágæt ! \xE2\x80\x9c .' +'Hann sagði : „ Þú ert ágæt ! “ .' ``` ## Tokenization options @@ -464,6 +478,37 @@ functions: `handle_kludgy_ordinals` were set to `tokenizer.KLUDGY_ORDINALS_TRANSLATE`. +## Dash and Hyphen Handling + +Tokenizer distinguishes between three dash types and handles them contextually: + +- **Hyphen (-)**: Regular hyphen, used e.g. in compound words +- **En-dash (–)**: Longer dash, preferred in Icelandic for year/date ranges +- **Em-dash (—)**: Longest dash, used for emphasis or parenthetical remarks + +### Context-Specific Behavior + +**Year ranges**: Hyphens between years are normalized to en-dash when +`normalize=True`, following Icelandic spelling rules: `1914-1918` → `1914–1918`. 
+ +**Free-standing dashes**: Hyphens and en-dashes with spaces around them +preserve those spaces: `word - word` remains `word - word`, not `word-word`. + +**Composite word continuations**: Hyphens stay attached to preceding words +in patterns like `fjölskyldu- og húsdýragarðurinn`, and to succeeding words +in cases like `eldhúsborð og -stólar`. + +**Em-dashes**: Treated as centered punctuation with spaces on both +sides: `word—word` → `word — word`. + +**Consecutive dashes**: Multiple identical dashes (`--`, `––`, `——`) are +treated as single tokens and preserve their spacing. + +### Edge Cases + +The tokenizer correctly handles `1914 -1918` where `-1918` might appear to +be a negative number but is actually part of a year range. + ## The token object Each token is an instance of the class `Tok` that has three main properties: @@ -509,8 +554,9 @@ defines within the `TOK` class: | S_BEGIN | 11001 | Start of sentence | | | S_END | 11002 | End of sentence | | -(*) The token types marked with an asterisk are reserved for the GreynirEngine package -and not currently returned by the tokenizer. +(*) The token types marked with an asterisk are reserved for +the [GreynirEngine package](https://github.com/mideind/GreynirEngine) and +not currently returned by the tokenizer. To obtain a descriptive text for a token kind, use `TOK.descr[token.kind]` (see example above). @@ -628,7 +674,7 @@ An example is *o.s.frv.*, which results in a `val` field equal to `[('og svo framvegis', 0, 'ao', 'frasi', 'o.s.frv.', '-')]`. The tuple format is designed to be compatible with the -*Database of Icelandic Morphology* (*DIM*), +[*Database of Icelandic Morphology* (*DIM*)](https://bin.arnastofnun.is/DMII/), *Beygingarlýsing íslensks nútímamáls*, i.e. the so-called *Sigrúnarsnið*. ## Development installation @@ -693,6 +739,12 @@ can be found in the file `test/toktest_normal_gold_expected.txt`. 
## Changelog +* Version 3.5.4: Improved dash and hyphen handling: free-standing hyphens + between words now preserve spaces, year ranges normalize to en-dash (with + `normalize=True`), em-dashes are centered with spaces, and consecutive + identical dashes are handled as single tokens. Fixed edge case where negative + years in ranges (e.g., "1914 -1918") were incorrectly parsed as negative + numbers. * Version 3.5.3: Fixed PyPI package description display (README.md reference in pyproject.toml) * Version 3.5.2: Improved JSON output format and BIN_Tuple representation; documentation updates * Version 3.5.1: Fixed bug in composite glyph handling diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py index 950afb0..278188b 100644 --- a/src/tokenizer/definitions.py +++ b/src/tokenizer/definitions.py @@ -184,8 +184,8 @@ class PersonNameTuple(NamedTuple): # Recognized punctuation LEFT_PUNCTUATION = "([„‚«#$€£¥₽<" RIGHT_PUNCTUATION = ".,:;)]!%‰?“»”’‛‘…>°" -CENTER_PUNCTUATION = '"*•&+=@©|' -NONE_PUNCTUATION = "^/±'´~\\" + HYPHEN + EN_DASH + EM_DASH +CENTER_PUNCTUATION = '"*•&+=@©|' + EM_DASH +NONE_PUNCTUATION = "^/±'´~\\" + HYPHEN + EN_DASH PUNCTUATION = ( LEFT_PUNCTUATION + CENTER_PUNCTUATION + RIGHT_PUNCTUATION + NONE_PUNCTUATION ) @@ -254,7 +254,7 @@ class PersonNameTuple(NamedTuple): # Characters that can start a numeric token DIGITS_PREFIX = frozenset([d for d in "0123456789"]) -SIGN_PREFIX = frozenset(("+", "-")) +SIGN_PREFIX = frozenset(("+", HYPHEN)) # Month names and numbers MONTHS: Mapping[str, int] = { @@ -1212,7 +1212,7 @@ def roman_to_int(s: str) -> int: def valid_ssn(kt: str) -> bool: """Validate Icelandic social security number ("kennitala")""" - if not kt or len(kt) != 11 or kt[6] != "-": + if not kt or len(kt) != 11 or (kt[6] != HYPHEN and kt[6] != EN_DASH): return False m = 11 - sum((ord(kt[i]) - 48) * KT_MAGIC[i] for i in range(9)) % 11 c = ord(kt[9]) - 48 @@ -1274,8 +1274,8 @@ def valid_ssn(kt: str) -> bool: "emsp": " ", "thinsp": " ", 
# Dashes and hyphens - "ndash": "–", - "mdash": "—", + "ndash": EN_DASH, + "mdash": EM_DASH, # The soft hyphen ­ is mapped to an empty string "shy": "", # Other non-ASCII letters diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py index b729df1..f7a34a5 100755 --- a/src/tokenizer/main.py +++ b/src/tokenizer/main.py @@ -39,7 +39,6 @@ Dict, Iterable, List, - Optional, TextIO, Iterator, Callable, diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py index 10f27ff..6dd571d 100644 --- a/src/tokenizer/tokenizer.py +++ b/src/tokenizer/tokenizer.py @@ -74,6 +74,36 @@ SPAN_START = 0 SPAN_END = 1 +ISO_DATE_HYPHEN = r"(\d{4}-\d\d-\d\d)" +ISO_DATE_SLASH = r"(\d{4}/\d\d/\d\d)" +ISO_DATE_EN_DASH = ISO_DATE_HYPHEN.replace(HYPHEN, EN_DASH) +ISO_DATE = ( + r"(" + ISO_DATE_HYPHEN + "|" + ISO_DATE_SLASH + "|" + ISO_DATE_EN_DASH + r")(?!\d)" +) + +DMY_DATE_DOT = r"\d{1,2}\.\d{1,2}\.\d{2,4}" +DMY_DATE_HYPHEN = r"\d{1,2}-\d{1,2}-\d{2,4}" +DMY_DATE_EN_DASH = DMY_DATE_HYPHEN.replace(HYPHEN, EN_DASH) +DMY_DATE_SLASH = r"\d{1,2}/\d{1,2}/\d{2,4}" +DMY_DATE = ( + r"(" + + DMY_DATE_DOT + + "|" + + DMY_DATE_SLASH + + "|" + + DMY_DATE_HYPHEN + + "|" + + DMY_DATE_EN_DASH + + r")(?!\d)" +) + +SINGLE_DASH_OR_DOT_DATE = r"(\d{2})[-.](\d{4})(?!\d)".replace("-", HYPHEN + EN_DASH) + +HYPHEN_OG = f"{HYPHEN}og" +HYPHEN_EÐA = f"{HYPHEN}eða" +EN_DASH_OG = f"{EN_DASH}og" +EN_DASH_EÐA = f"{EN_DASH}eða" + class Tok: """Information about a single token""" @@ -1034,12 +1064,14 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: t, rest = tok.split(s.end()) return TOK.Time(t, h, m, 0), rest - s = re.match(r"((\d{4}-\d\d-\d\d)|(\d{4}/\d\d/\d\d))(?!\d)", w) + s = re.match(ISO_DATE, w) if s: # Looks like an ISO format date: YYYY-MM-DD or YYYY/MM/DD g = s.group() - if "-" in g: - p = g.split("-") + if HYPHEN in g: + p = g.split(HYPHEN) + elif EN_DASH in g: + p = g.split(EN_DASH) else: p = g.split("/") y = int(p[0]) @@ -1049,18 +1081,16 @@ def parse_digits(tok: Tok, 
convert_numbers: bool) -> Tuple[Tok, Tok]: t, rest = tok.split(s.end()) return TOK.Date(t, y, m, d), rest - s = ( - re.match(r"\d{1,2}\.\d{1,2}\.\d{2,4}(?!\d)", w) - or re.match(r"\d{1,2}/\d{1,2}/\d{2,4}(?!\d)", w) - or re.match(r"\d{1,2}-\d{1,2}-\d{2,4}(?!\d)", w) - ) + s = re.match(DMY_DATE, w) if s: # Looks like a date with day, month and year parts g = s.group() if "/" in g: p = g.split("/") - elif "-" in g: - p = g.split("-") + elif HYPHEN in g: + p = g.split(HYPHEN) + elif EN_DASH in g: + p = g.split(EN_DASH) else: p = g.split(".") y = int(p[2]) @@ -1088,7 +1118,7 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: t, rest = tok.split(s.end()) return TOK.Daterel(t, y=0, m=m, d=d), rest - s = re.match(r"(\d{2})[-.](\d{4})(?!\d)", w) + s = re.match(SINGLE_DASH_OR_DOT_DATE, w) if s: # A date in the form of mm.yyyy or mm-yyyy g = s.group() @@ -1287,7 +1317,7 @@ def parse_digits(tok: Tok, convert_numbers: bool) -> Tuple[Tok, Tok]: s = re.match(r"\d\d\d\d\d\d\d(?!\d)", w) if s and w[0] in TELNO_PREFIXES: # Looks like a telephone number - telno = w[0:3] + "-" + w[3:7] + telno = w[0:3] + HYPHEN + w[3:7] t, rest = tok.split(7) return TOK.Telno(t, telno), rest @@ -1605,7 +1635,7 @@ class LetterParser: def __init__(self, rt: Tok) -> None: self.rt = rt - + def _is_letter(self, char: str) -> bool: """Test if character is alphabetic - fast path.""" return char.isalpha() @@ -1661,10 +1691,10 @@ def parse(self) -> Iterable[Tok]: yield TOK.Punctuation(punct) yield TOK.Word(word2) - elif ww.endswith("-og") or ww.endswith("-eða"): + elif ww.endswith((HYPHEN_OG, HYPHEN_EÐA, EN_DASH_OG, EN_DASH_EÐA)): # Handle missing space before 'og'/'eða', # such as 'fjármála-og efnahagsráðuneyti' - a = ww.split("-") + a = ww.split(EN_DASH if EN_DASH in ww else HYPHEN) word1, rt = rt.split(len(a[0])) punct, rt = rt.split(1) @@ -1690,11 +1720,11 @@ def parse(self) -> Iterable[Tok]: class LetterParserComposite(LetterParser): """Parses a sequence of alphabetic characters off 
the front of a raw token. Handles combining characters when --keep_composite_glyphs is specified.""" - + def _is_letter(self, char: str) -> bool: """Test if character is alphabetic or a combining mark.""" cat = unicodedata.category(char) - return cat.startswith(('L', 'M')) + return cat.startswith(("L", "M")) class NumberParser: @@ -1807,9 +1837,21 @@ def parse(self, rt: Tok) -> Iterable[Tok]: punct, rt = rt.split(numcommas) yield TOK.Punctuation(punct, normalized=",") elif rtxt[0] in HYPHENS: - # Normalize all hyphens the same way - punct, rt = rt.split(1) - yield TOK.Punctuation(punct, normalized=HYPHEN) + # Coalesce a sequence of identical hyphens into a single token + numhyphens = 1 + hyphen_char = rtxt[0] + for c in rtxt[1:]: + if c == hyphen_char: + numhyphens += 1 + else: + break + punct, rt = rt.split(numhyphens) + if numhyphens == 2 and hyphen_char == HYPHEN: + # Normalize exactly two hyphens to an em dash + yield TOK.Punctuation(punct, normalized=EM_DASH) + else: + # Normalize to a regular hyphen in other cases + yield TOK.Punctuation(punct, normalized=HYPHEN * numhyphens) elif rtxt[0] in DQUOTES: # Convert to a proper closing double quote punct, rt = rt.split(1) @@ -1855,12 +1897,17 @@ def parse(self, rt: Tok) -> Iterable[Tok]: def parse_mixed( - rt: Tok, handle_kludgy_ordinals: int, convert_numbers: bool, replace_composite_glyphs: bool = True + rt: Tok, + handle_kludgy_ordinals: int, + convert_numbers: bool, + replace_composite_glyphs: bool = True, ) -> Iterable[Tok]: """Parse a mixed raw token string, from the token rt""" # Select the appropriate letter parser class based on composite glyph handling - LetterParserClass = LetterParser if replace_composite_glyphs else LetterParserComposite + LetterParserClass = ( + LetterParser if replace_composite_glyphs else LetterParserComposite + ) # Initialize a singleton parser for punctuation pp = PunctuationParser() @@ -2009,8 +2056,10 @@ def is_word_with_composites(txt: str) -> bool: it can contain composite 
characters (combining accents, etc.). However, the word must start with a proper alphabetic character, since combining accents musth *follow* a letter - they can't *precede* it.""" - return len(txt) > 1 and txt[0].isalpha() and all( - unicodedata.category(char).startswith(('L', 'M')) for char in txt[1:] + return ( + len(txt) > 1 + and txt[0].isalpha() + and all(unicodedata.category(char).startswith(("L", "M")) for char in txt[1:]) ) @@ -2129,7 +2178,9 @@ def parse_tokens(txt: Union[str, Iterable[str]], **options: Any) -> Iterator[Tok yield TOK.Punctuation(punct, normalized="‚") # More complex case of mixed punctuation, letters and numbers - yield from parse_mixed(rt, handle_kludgy_ordinals, convert_numbers, replace_composite_glyphs) + yield from parse_mixed( + rt, handle_kludgy_ordinals, convert_numbers, replace_composite_glyphs + ) # Yield a sentinel token at the end that will be cut off by the final generator yield TOK.End_Sentinel() @@ -2349,7 +2400,7 @@ def lookup(abbrev: str) -> Optional[list[BIN_Tuple]]: and re.search(r"^\d\d\d$", token.txt) and re.search(r"^\d\d\d\d$", next_token.txt) ): - telno = token.txt + "-" + next_token.txt + telno = token.txt + HYPHEN + next_token.txt token = TOK.Telno(token.concatenate(next_token, separator=" "), telno) next_token = next(token_stream) @@ -3026,6 +3077,62 @@ def parse_phrases_2( # Eat the percent word token next_token = next(token_stream) + # Check for year range with negative year: [YEAR] [NUMBER(negative year)] + # This handles the edge case "1914 -1918" where -1918 is parsed as a negative number + # but should be treated as a year in a year range + if ( + token.kind == TOK.YEAR + and next_token.kind == TOK.NUMBER + and next_token.number < 0 # Negative number + and 1776 + <= -next_token.number + <= 2100 # Looks like a year when positive + and next_token.txt.startswith( + "-" + ) # Text starts with regular hyphen (not EN_DASH) + ): + # Split the token "-1918" into a hyphen punctuation and a year "1918" + # According 
to Icelandic spelling rules, normalize to EN_DASH between years + hyphen_tok, year_tok = next_token.split(1) + hyphen_tok = TOK.Punctuation(hyphen_tok, normalized=EN_DASH) + year_tok = TOK.Year(year_tok, int(-next_token.number)) + # Yield the current year, then the hyphen, then continue with the new year + yield token + yield hyphen_tok + token = year_tok + next_token = next(token_stream) + + # Check for year range with hyphen: [YEAR] [PUNCTUATION(hyphen)] [YEAR] + # According to Icelandic spelling rules, normalize hyphens to EN_DASH between years + if ( + token.kind == TOK.YEAR + and next_token.kind == TOK.PUNCTUATION + and next_token.punctuation + == HYPHEN # Only normalize if it's a regular hyphen + ): + # Peek ahead to see if there's another year + try: + third_token = next(token_stream) + if third_token.kind == TOK.YEAR: + # This is a year range, normalize the hyphen to EN_DASH + next_token = TOK.Punctuation(next_token, normalized=EN_DASH) + # Yield the current year and hyphen, then continue with the second year + yield token + yield next_token + token = third_token + next_token = next(token_stream) + else: + # Not a year range, put the third token back by yielding current token + # and setting up the lookahead to be hyphen and third_token + yield token + token = next_token + next_token = third_token + # Don't continue - fall through to normal flow + except StopIteration: + # The token stream ran out while looking ahead; nothing more is + # yielded here and control falls through to the normal loop flow + pass + # Check for composites: # 'stjórnskipunar- og eftirlitsnefnd' # 'dómsmála-, viðskipta- og iðnaðarráðherra' @@ -3067,7 +3174,8 @@ def parse_phrases_2( _acc = _acc.concatenate( t, separator=" ", metadata_from_other=True ) - _acc.substitute_all(" -", "-") + _acc.substitute_all(f" {HYPHEN}", HYPHEN) + _acc.substitute_all(f" {EN_DASH}", EN_DASH) _acc.substitute_all(" ,", ",") token = _acc next_token = next(token_stream) @@ -3126,12 +3234,20 @@ def split_into_sentences( 
"""Shallow tokenization of the input text, which can be either a text string or a generator of lines of text (such as a file). This function returns a generator of strings, where each string - is a sentence, and tokens are separated by spaces.""" + is a sentence, and tokens are separated by spaces. + The function accepts 'original' and 'normalize' boolean options: + - If 'original' is True, the original text of each token is returned, + with the original whitespace. + - If 'normalize' is True, the normalized text of each token is returned, + where certain modifications may have been applied (see README). + This option takes precedence over 'original' if both are True.""" to_text: Callable[[Tok], str] - og = options.pop("original", False) - if options.pop("normalize", False): + original = options.pop("original", False) + normalize = options.pop("normalize", False) + if normalize: to_text = normalized_text - elif og: + original = False # normalize takes precedence over original + elif original: to_text = lambda t: t.original or t.txt else: to_text = lambda t: t.txt @@ -3142,7 +3258,7 @@ def split_into_sentences( # Note that curr_sent can be an empty list, # and in that case we yield an empty string if t.kind == TOK.S_END or t.kind == TOK.S_SPLIT: - if og: + if original: yield "".join(curr_sent) else: yield " ".join(curr_sent) @@ -3152,7 +3268,7 @@ def split_into_sentences( if txt: curr_sent.append(txt) if curr_sent: - if og: + if original: yield "".join(curr_sent) else: yield " ".join(curr_sent) @@ -3225,8 +3341,12 @@ def valid_sent(sent: Optional[list[Tok]]) -> bool: r"|([^\W\d_]+\.(?:[^\W\d_]+\.)+)(?![^\W\d_]+\s)" # The following regex catches degree characters, i.e. 
°C, °F r"|(°[CF])" + # The following regex catches consecutive identical hyphens/dashes + r"|(--+|––+|——+)" # --, ––, —— # Finally, space and punctuation - r"|([~\s" + "".join("\\" + c for c in PUNCTUATION) + r"])" + r"|([~\s" + + "".join("\\" + c for c in PUNCTUATION) + + r"])" ) RE_SPLIT = re.compile(RE_SPLIT_STR) @@ -3239,33 +3359,67 @@ def correct_spaces(s: str) -> str: r: List[str] = [] last = TP_NONE double_quote_count = 0 - for w in RE_SPLIT.split(s): - if w is None: + last_was_word = False # Track if previous non-empty token was a word + last_word_text = "" # Track the actual text of the last word token + had_space_before = False # Track if there was whitespace before current token + for w_raw in RE_SPLIT.split(s): + if w_raw is None: continue - w = w.strip() + # Check for whitespace before stripping + w = w_raw.strip() if not w: + if w_raw: # Non-empty but all whitespace + had_space_before = True continue + # Now w is a non-empty token if len(w) > 1: this = TP_WORD + last_was_word = w.isalpha() + last_word_text = w.lower() if last_was_word else "" elif w == '"': # For English-type double quotes, we glue them alternatively # to the right and to the left token this = (TP_LEFT, TP_RIGHT)[double_quote_count % 2] double_quote_count += 1 + last_was_word = False + last_word_text = "" elif w in LEFT_PUNCTUATION: this = TP_LEFT + last_was_word = False + last_word_text = "" elif w in RIGHT_PUNCTUATION: this = TP_RIGHT + last_was_word = False + last_word_text = "" elif w in NONE_PUNCTUATION: - this = TP_NONE + # Special case: free-standing hyphens/en-dashes after words + # If hyphen had space before it and follows a word (not year/number), + # BUT not "og"/"eða" (which are part of composite word patterns), + # treat as TP_CENTER to preserve spaces on both sides + if ( + w in COMPOSITE_HYPHENS + and had_space_before + and last_was_word + and last_word_text not in ("og", "eða") + ): + this = TP_CENTER + else: + this = TP_NONE + last_was_word = False + last_word_text = 
"" elif w in CENTER_PUNCTUATION: this = TP_CENTER + last_was_word = False + last_word_text = "" else: this = TP_WORD + last_was_word = True + last_word_text = w.lower() + had_space_before = False if ( (w == "og" or w == "eða") and len(r) >= 2 - and r[-1] == "-" + and r[-1] in COMPOSITE_HYPHENS and r[-2].lstrip().isalpha() ): # Special case for compounds such as "fjármála- og efnahagsráðuneytið" @@ -3275,13 +3429,13 @@ def correct_spaces(s: str) -> str: elif ( this == TP_WORD and len(r) >= 2 - and r[-1] == "-" + and r[-1] in COMPOSITE_HYPHENS and w.isalpha() and (r[-2] == "," or r[-2].lstrip() in ("og", "eða")) ): # Special case for compounds such as # "bensínstöðvar, -dælur og -tankar" - r[-1] = " -" + r[-1] = " " + r[-1] r.append(w) elif ( TP_SPACE[last - 1][this - 1] @@ -3311,10 +3465,15 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: to_text: Callable[[Tok], str] = normalized_text if normalize else lambda t: t.txt r: List[str] = [] last = TP_NONE + last_kind = None # Track the previous token kind double_quote_count = 0 for t in tokens: w = to_text(t) if not w: + if t.kind == TOK.S_END: + # Reset the double quote counter at the end of a sentence + double_quote_count = 0 + last_kind = None continue this = TP_WORD if t.kind == TOK.PUNCTUATION: @@ -3329,6 +3488,15 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: this = TP_LEFT elif w in RIGHT_PUNCTUATION: this = TP_RIGHT + elif w in COMPOSITE_HYPHENS: + if last_kind == TOK.WORD and t.original and t.original.startswith(" "): + # Special case: free-standing hyphens/en-dashes between words + # If the hyphen had a space before it in the original text, + # and follows a WORD (not YEAR or NUMBER), treat as TP_CENTER + # to preserve spaces on both sides + this = TP_CENTER + else: + this = TP_NONE elif w in NONE_PUNCTUATION: this = TP_NONE elif w in CENTER_PUNCTUATION: @@ -3338,6 +3506,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str: else: 
r.append(w) last = this + last_kind = t.kind return "".join(r) diff --git a/test/test_dashes.py b/test/test_dashes.py new file mode 100644 index 0000000..9b0392b --- /dev/null +++ b/test/test_dashes.py @@ -0,0 +1,386 @@ +# type: ignore + +""" + +test_dashes.py + +Tests for dash handling in Tokenizer module + +Copyright (C) 2016-2025 by Miðeind ehf. +Original author: Vilhjálmur Þorsteinsson + +This software is licensed under the MIT License: + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +""" + +import pytest +import tokenizer as t + +EN = t.EN_DASH +EM = t.EM_DASH + +YEAR_RANGES = [ + ("1914-1918", "1914-1918"), # Regular hyphen + ("1914 - 1918", "1914-1918"), # Regular hyphen + ("1914- 1918", "1914-1918"), # Regular hyphen + ("1914 -1918", "1914-1918"), # Regular hyphen - negative year converted to positive + ("1914- 1918", "1914-1918"), # Regular hyphen + ("1914 - 1918", "1914-1918"), # Regular hyphen + ( + "1914 -1918", + "1914-1918", + ), # Regular hyphen - negative year converted to positive + (f"1914{EN}1918", f"1914{EN}1918"), # En dash + (f"1914 {EN}1918", f"1914{EN}1918"), # En dash + (f"1914{EN} 1918", f"1914{EN}1918"), # En dash + (f"1914 {EN} 1918", f"1914{EN}1918"), # En dash + (f"1914{EM}1918", f"1914 {EM} 1918"), # Em dash + (f"1914 {EM}1918", f"1914 {EM} 1918"), # Em dash + (f"1914{EM} 1918", f"1914 {EM} 1918"), # Em dash + (f"1914 {EM} 1918", f"1914 {EM} 1918"), # Em dash +] + +THOUGHT_PAUSES = [ + ( + "Ég elska ketti - þeir eru svo sætir!", + "Ég elska ketti - þeir eru svo sætir!", + ), + ( + "Ég elska ketti - þeir eru svo sætir!", + "Ég elska ketti - þeir eru svo sætir!", + ), + ( + "Ég elska ketti - þeir eru svo sætir!", + "Ég elska ketti - þeir eru svo sætir!", + ), + ( + f"Ég elska ketti {EN} þeir eru svo sætir!", + f"Ég elska ketti {EN} þeir eru svo sætir!", + ), + ( + f"Ég elska ketti {EM} þeir eru svo sætir!", + f"Ég elska ketti {EM} þeir eru svo sætir!", + ), +] + +COMPOSITE_WORD_CONTINUATIONS = [ + ( + "Ég fór í fjölskyldu- og húsdýragarðinn", + "Ég fór í fjölskyldu- og húsdýragarðinn", + ), + ( + "Ég fór í fjölskyldu - og húsdýragarðinn", + "Ég fór í fjölskyldu - og húsdýragarðinn", + ), + ( + f"Ég fór í fjölskyldu{EN} og húsdýragarðinn", + f"Ég fór í fjölskyldu{EN} og húsdýragarðinn", + ), + ( + f"Ég fór í fjölskyldu {EN} og húsdýragarðinn", + f"Ég fór í fjölskyldu {EN} og húsdýragarðinn", + ), + ( + f"Ég fór í fjölskyldu{EM} og húsdýragarðinn", + f"Ég fór í fjölskyldu {EM} og húsdýragarðinn", + ), + ( + f"Ég fór í 
fjölskyldu {EM} og húsdýragarðinn", + f"Ég fór í fjölskyldu {EM} og húsdýragarðinn", + ), + ( + f"Ég fór í fjölskyldu {EM}og húsdýragarðinn", + f"Ég fór í fjölskyldu {EM} og húsdýragarðinn", + ), + ( + "Forstjóri Barna- og fjölskyldustofu segir dæmi um að…", + "Forstjóri Barna- og fjölskyldustofu segir dæmi um að…", + ), + ( + f"Forstjóri Barna{EN} og fjölskyldustofu segir dæmi um að…", + f"Forstjóri Barna{EN} og fjölskyldustofu segir dæmi um að…", + ), + ( + f"Innflutningur bensín-, dísel- og rafmagnsbíla hefur aukist.", + f"Innflutningur bensín-, dísel- og rafmagnsbíla hefur aukist.", + ), + ( + f"Innflutningur bensín{EN}, dísel{EN} og rafmagnsbíla hefur aukist.", + f"Innflutningur bensín{EN}, dísel{EN} og rafmagnsbíla hefur aukist.", + ), +] + +COMPOUND_WORDS = [ + ("Austur-Skaftafellssýsla", "Austur-Skaftafellssýsla"), # Regular hyphen + (f"Austur{EN}Skaftafellssýsla", f"Austur{EN}Skaftafellssýsla"), # En dash + (f"Austur{EM}Skaftafellssýsla", f"Austur {EM} Skaftafellssýsla"), # Em dash +] + +BEGIN_DASHES = [ + ("- Byrjar á bandstriki", "-Byrjar á bandstriki"), + (f"{EN} Byrjar á en striki", f"{EN}Byrjar á en striki"), + ( + f"{EM} Byrjar á em striki", + f"{EM} Byrjar á em striki", + ), # Should preserve space after +] + +END_DASHES = [ + ("Endar á bandstriki-", "Endar á bandstriki-"), + ("Endar á bandstriki -", "Endar á bandstriki -"), + (f"Endar á en striki{EN}", f"Endar á en striki{EN}"), + (f"Endar á en striki {EN}", f"Endar á en striki {EN}"), + ( + f"Endar á em striki{EM}", + f"Endar á em striki {EM}", + ), # Should preserve space before + ( + f"Endar á em striki {EM}", + f"Endar á em striki {EM}", + ), # Should preserve space before +] + +MULTIPLE_DASHES_IN_SEQUENCE = [ + ("This is -- a test", "This is -- a test"), # Multiple hyphens + (f"This is {EN}{EN} a test", f"This is {EN}{EN} a test"), # Multiple en dashes + ( + f"This is {EM}{EM} a test", + f"This is {EM}{EM} a test", + ), # Multiple em dashes - MUST preserve spaces +] + + 
+@pytest.mark.parametrize("test_pair", YEAR_RANGES) +def test_year_ranges_tokenize(test_pair: tuple[str, str]) -> None: + """Test year ranges without spaces using different dash types.""" + text_in, text_out = test_pair + # Test tokenize + detokenize preserves text + tokens = list(t.tokenize(text_in)) + detok = t.detokenize(tokens) + assert ( + detok == text_out + ), f"detokenize failed for {repr(text_in)}: got {repr(detok)}" + + +@pytest.mark.parametrize("test_pair", YEAR_RANGES) +def test_year_ranges_correct_spaces(test_pair: tuple[str, str]) -> None: + """Test year ranges without spaces using different dash types.""" + text_in, text_out = test_pair + # Test split_into_sentences + correct_spaces preserves text + sentences = list(t.split_into_sentences(text_in)) + if sentences: + corrected = t.correct_spaces(sentences[0]) + assert ( + corrected == text_out + ), f"correct_spaces failed for {repr(text_in)}: got {repr(corrected)}" + + +def test_year_ranges_normalize_to_en_dash() -> None: + """Test that year ranges with hyphens normalize to EN_DASH when normalize=True. + + According to Icelandic spelling rules, EN_DASH is preferred between years/dates. 
+ """ + test_cases = [ + ("1914-1918", f"1914{EN}1918"), # Hyphen normalized to EN_DASH + ("1914 -1918", f"1914{EN}1918"), # Negative year edge case + ("1914 - 1918", f"1914{EN}1918"), # Hyphen with spaces + (f"1914{EN}1918", f"1914{EN}1918"), # EN_DASH unchanged + (f"1914 {EN}1918", f"1914{EN}1918"), # Hyphen normalized to EN_DASH + (f"1914 {EN} 1918", f"1914{EN}1918"), # Hyphen normalized to EN_DASH + ] + + for text_in, expected in test_cases: + tokens = list(t.tokenize(text_in)) + detok = t.detokenize(tokens, normalize=True) + assert ( + detok == expected + ), f"normalize=True failed for {repr(text_in)}: expected {repr(expected)}, got {repr(detok)}" + + +@pytest.mark.parametrize("test_pair", THOUGHT_PAUSES) +def test_thought_pauses_tokenize(test_pair: tuple[str, str]) -> None: + """Test thought pauses/parenthetical remarks with spaces using different dash types. + + Em dashes inside text should ALWAYS have spaces on both sides. + """ + text_in, text_out = test_pair + # Test tokenize + detokenize preserves text + tokens = list(t.tokenize(text_in)) + detok = t.detokenize(tokens) + assert ( + detok == text_out + ), f"detokenize failed for {repr(text_in)}: got {repr(detok)}" + + +@pytest.mark.parametrize("test_pair", THOUGHT_PAUSES) +def test_thought_pauses_correct_spaces(test_pair: tuple[str, str]) -> None: + """Test thought pauses/parenthetical remarks with spaces using different dash types. + + Em dashes inside text should ALWAYS have spaces on both sides. 
+ """ + text_in, text_out = test_pair + # Test split_into_sentences + correct_spaces preserves text + sentences = list(t.split_into_sentences(text_in)) + if sentences: + corrected = t.correct_spaces(sentences[0]) + assert ( + corrected == text_out + ), f"correct_spaces failed for {repr(text_in)}: got {repr(corrected)}" + + +@pytest.mark.parametrize("test_pair", COMPOSITE_WORD_CONTINUATIONS) +def test_composite_word_continuation_tokenize(test_pair: tuple[str, str]) -> None: + """Test continuation of composite words using different dash types. + + Em dashes should ALWAYS preserve spaces, even in composite word continuations. + """ + text_in, text_out = test_pair + # Test tokenize + detokenize preserves text + tokens = list(t.tokenize(text_in)) + detok = t.detokenize(tokens) + assert ( + detok == text_out + ), f"detokenize failed for {repr(text_in)}: got {repr(detok)}" + + +@pytest.mark.parametrize("test_pair", COMPOSITE_WORD_CONTINUATIONS) +def test_composite_word_continuation_correct_spaces(test_pair: tuple[str, str]) -> None: + """Test continuation of composite words using different dash types. + + Em dashes should ALWAYS preserve spaces, even in composite word continuations. 
+ """ + text_in, text_out = test_pair + # Test split_into_sentences + correct_spaces preserves text + sentences = list(t.split_into_sentences(text_in, original=True)) + if sentences: + corrected = t.correct_spaces(sentences[0]) + assert ( + corrected == text_out + ), f"correct_spaces failed for {repr(text_in)}: got {repr(corrected)}" + + +@pytest.mark.parametrize("test_pair", COMPOUND_WORDS) +def test_compound_words(test_pair: tuple[str, str]) -> None: + """Test compound words joined with dashes.""" + text_in, text_out = test_pair + # Test tokenize + detokenize preserves text + tokens = list(t.tokenize(text_in)) + detok = t.detokenize(tokens) + assert ( + detok == text_out + ), f"detokenize failed for {repr(text_in)}: got {repr(detok)}" + + +@pytest.mark.parametrize("test_pair", COMPOUND_WORDS) +def test_compound_words_correct_spaces(test_pair: tuple[str, str]) -> None: + """Test compound words joined with dashes.""" + text_in, text_out = test_pair + # Test split_into_sentences + correct_spaces preserves text + sentences = list(t.split_into_sentences(text_in)) + if sentences: + corrected = t.correct_spaces(sentences[0]) + assert ( + corrected == text_out + ), f"correct_spaces failed for {repr(text_in)}: got {repr(corrected)}" + + +@pytest.mark.parametrize("test_pair", MULTIPLE_DASHES_IN_SEQUENCE) +def test_multiple_dashes_in_sequence_tokenize(test_pair: tuple[str, str]) -> None: + """Test multiple dashes in sequence. + + Em dashes should ALWAYS preserve spaces, even when doubled. 
+ """ + text_in, text_out = test_pair + # Test tokenize + detokenize preserves text + tokens = list(t.tokenize(text_in)) + detok = t.detokenize(tokens) + assert ( + detok == text_out + ), f"detokenize failed for {repr(text_in)}: got {repr(detok)}" + + +@pytest.mark.parametrize("test_pair", MULTIPLE_DASHES_IN_SEQUENCE) +def test_multiple_dashes_in_sequence_correct_spaces(test_pair: tuple[str, str]) -> None: + """Test multiple dashes in sequence.""" + text_in, text_out = test_pair + # Test split_into_sentences + correct_spaces preserves text + sentences = list(t.split_into_sentences(text_in, original=True)) + if sentences: + corrected = t.correct_spaces(sentences[0]) + assert ( + corrected == text_out + ), f"correct_spaces failed for {repr(text_in)}: got {repr(corrected)}" + + +@pytest.mark.parametrize("test_pair", BEGIN_DASHES) +def test_dashes_at_start_tokenize(test_pair: tuple[str, str]) -> None: + """Test dashes at the start of text. + + Em dashes at sentence start should preserve the space after them. + """ + text_in, text_out = test_pair + # Test tokenize + detokenize preserves text + tokens = list(t.tokenize(text_in)) + detok = t.detokenize(tokens) + assert ( + detok == text_out + ), f"detokenize failed for {repr(text_in)}: got {repr(detok)}" + + +@pytest.mark.parametrize("test_pair", BEGIN_DASHES) +def test_dashes_at_start_correct_spaces(test_pair: tuple[str, str]) -> None: + """Test dashes at the start of text.""" + text_in, text_out = test_pair + # Test split_into_sentences + correct_spaces preserves text + sentences = list(t.split_into_sentences(text_in)) + if sentences: + corrected = t.correct_spaces(sentences[0]) + assert ( + corrected == text_out + ), f"correct_spaces failed for {repr(text_in)}: got {repr(corrected)}" + + +@pytest.mark.parametrize("test_pair", END_DASHES) +def test_dashes_at_end_tokenize(test_pair: tuple[str, str]) -> None: + """Test dashes at the end of text. + + Em dashes at sentence end should preserve the space before them. 
+ """ + text_in, text_out = test_pair + # Test tokenize + detokenize preserves text + tokens = list(t.tokenize(text_in)) + detok = t.detokenize(tokens) + assert ( + detok == text_out + ), f"detokenize failed for {repr(text_in)}: got {repr(detok)}" + + +@pytest.mark.parametrize("test_pair", END_DASHES) +def test_dashes_at_end_correct_spaces(test_pair: tuple[str, str]) -> None: + """Test dashes at the end of text.""" + text_in, text_out = test_pair + # Test split_into_sentences + correct_spaces preserves text + sentences = list(t.split_into_sentences(text_in, original=True)) + if sentences: + corrected = t.correct_spaces(sentences[0]) + assert ( + corrected == text_out + ), f"correct_spaces failed for {repr(text_in)}: got {repr(corrected)}" diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index 72afafa..60fa490 100644 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -34,7 +34,7 @@ from typing import Any, Iterable, Iterator, Union, cast import tokenizer as t -from tokenizer.definitions import BIN_Tuple, ValType +from tokenizer.definitions import BIN_Tuple, ValType, EN_DASH, EM_DASH TOK = t.TOK Tok = t.Tok @@ -875,6 +875,21 @@ def test_sentence(text: str, expected: str, **options: Any) -> None: "B W P W P W W W P E", ) + test_sentence( + f"Þingmenn og {EN_DASH}konur versluðu marg{EN_DASH}ítrekað í Tösku{EN_DASH} og hanskabúðinni.", + "B W W W W W W W P E", + ) + + test_sentence( + f"Tösku{EN_DASH} og hanskabúðin, sálug, var á Lauga{EN_DASH} eða Skothúsvegi.", + "B W P W P W W W P E", + ) + + test_sentence( + f"Tösku{EN_DASH}og hanskabúðin, sálug, var á Lauga{EN_DASH}eða Skothúsvegi.", + "B W P W P W W W P E", + ) + test_sentence( "Friðgeir fór út kl. hálf átta en var hálf slompaður.", "B W W W T W W W W P E", @@ -1125,8 +1140,8 @@ def test_correct_spaces() -> None: "\n Breytingin var +4,10 þingmenn \t en dollarinn er nú á €1,3455 ." ) assert s == "Breytingin var +4,10 þingmenn en dollarinn er nú á €1,3455." 
- s = t.correct_spaces("Jón- sem var formaður — mótmælti málinu.") - assert s == "Jón-sem var formaður—mótmælti málinu." + s = t.correct_spaces("Jón- sem var formaður—mótmælti málinu.") + assert s == "Jón-sem var formaður — mótmælti málinu." s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C") assert s == "Það á að geyma mjólkina við 20±3 °C" s = t.correct_spaces("Við förum t.d. til Íslands o.s.frv.") @@ -2267,12 +2282,12 @@ def gen(s: Iterable[str]) -> Iterator[str]: sents = list(g) assert len(sents) == 2 assert sents[0] == "Hún sagði : „ Þú ert leiðinlegur “ !" - assert sents[1] == "Hann svaraði engu - - en hætti við ferðina ." + assert sents[1] == f"Hann svaraði engu {EM_DASH} en hætti við ferðina ." g = t.split_into_sentences(s, normalize=False) sents = list(g) assert len(sents) == 2 assert sents[0] == 'Hún sagði : " Þú ert leiðinlegur " !' - assert sents[1] == "Hann svaraði engu - - en hætti við ferðina ." + assert sents[1] == "Hann svaraði engu -- en hætti við ferðina ." g = t.split_into_sentences( "Aðalsteinn Jónsson SU á leið til hafnar í "