diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b27ab891..f156695c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,4 +34,10 @@ jobs: uv run ruff check medcat2 --preview - name: Test run: | - timeout 10m uv run python -m unittest discover + timeout 20m uv run python -m unittest discover + - name: Model regression + run: | + uv run bash tests/backwards_compatibility/run_current.sh + - name: Backwards compatibility + run: | + uv run bash tests/backwards_compatibility/check_backwards_compatibility.sh diff --git a/.release/README.md b/.release/README.md new file mode 100644 index 00000000..363fa794 --- /dev/null +++ b/.release/README.md @@ -0,0 +1,21 @@ +# Releases + +The scripts within here are designed to help prepare for and deal with releases. + +The main idea is to use the `prepare_release.sh` script from within the root of the project, and it will delegate to either `prepare_minor_release.sh` or `prepare_patch_release.sh` as necessary. +The workflow within the scripts is as follows: +- Create or check out the release branch (`release/v<major>.<minor>`) +- Update the version in `pyproject.toml` +- Create a tag based on the version +- Push both the branch and the tag to `origin` + +The general usage for a minor release based on the `main` branch from within the **root of the project** is simply: +``` +bash .release/prepare_release.sh <major>.<minor>.0 +``` +and the usage for a patch release (from within the **root of the project**) is in the format +``` +bash .release/prepare_release.sh <major>.<minor>.<patch> <hash 1> <hash 2> ... +``` +where `<hash 1>` and `<hash 2>` (and so on) refer to the commit hashes that need to be included / cherry-picked in the patch release. 
+ diff --git a/medcat2/cat.py b/medcat2/cat.py index e997dc46..c5d2cf1f 100644 --- a/medcat2/cat.py +++ b/medcat2/cat.py @@ -1,6 +1,10 @@ -from typing import Optional, Union, Any, overload, Literal +from typing import Optional, Union, Any, overload, Literal, Iterable, Iterator +from typing import cast import os import json +from datetime import date +from concurrent.futures import ProcessPoolExecutor, as_completed, Future +import itertools import shutil import logging @@ -23,6 +27,7 @@ from medcat2.components.addons.addons import AddonComponent from medcat2.utils.legacy.identifier import is_legacy_model_pack from medcat2.utils.defaults import AVOID_LEGACY_CONVERSION_ENVIRON +from medcat2.utils.usage_monitoring import UsageMonitor logger = logging.getLogger(__name__) @@ -51,6 +56,8 @@ def __init__(self, self._trainer: Optional[Trainer] = None self._pipeline = self._recreate_pipe(model_load_path) + self.usage_monitor = UsageMonitor( + self._get_hash, self.config.general.usage_monitor) def _recreate_pipe(self, model_load_path: Optional[str] = None ) -> Pipeline: @@ -75,7 +82,10 @@ def ignore_attrs(cls) -> list[str]: ] def __call__(self, text: str) -> Optional[MutableDocument]: - return self._pipeline.get_doc(text) + doc = self._pipeline.get_doc(text) + if self.usage_monitor.should_monitor: + self.usage_monitor.log_inference(len(text), len(doc.final_ents)) + return doc def _ensure_not_training(self) -> None: """Method to ensure config is not set to train. 
@@ -139,6 +149,188 @@ def get_entities(self, return {} return self._doc_to_out(doc, only_cui=only_cui) + def _mp_worker_func( + self, + texts_and_indices: list[tuple[str, str, bool]] + ) -> list[tuple[str, str, Union[dict, Entities, OnlyCUIEntities]]]: + return [ + (text, text_index, self.get_entities(text, only_cui=only_cui)) + for text, text_index, only_cui in texts_and_indices] + + def _generate_batches_by_char_length( + self, + text_iter: Union[Iterator[str], Iterator[tuple[str, str]]], + batch_size_chars: int, + only_cui: bool, + ) -> Iterator[list[tuple[str, str, bool]]]: + docs: list[tuple[str, str, bool]] = [] + char_count = 0 + for i, _doc in enumerate(text_iter): + # NOTE: not sure why mypy is complaining here + doc = cast( + str, _doc[1] if isinstance(_doc, tuple) else _doc) + doc_index: str = _doc[0] if isinstance(_doc, tuple) else str(i) + clen = len(doc) + char_count += clen + if char_count > batch_size_chars: + yield docs + docs = [] + char_count = clen + docs.append((doc_index, doc, only_cui)) + + if len(docs) > 0: + yield docs + + def _generate_batches( + self, + text_iter: Union[Iterator[str], Iterator[tuple[str, str]]], + batch_size: int, + batch_size_chars: int, + only_cui: bool, + ) -> Iterator[list[tuple[str, str, bool]]]: + if batch_size_chars < 1 and batch_size < 1: + raise ValueError("Either `batch_size` or `batch_size_chars` " + "must be greater than 0.") + if batch_size > 0 and batch_size_chars > 0: + raise ValueError( + "Cannot specify both `batch_size` and `batch_size_chars`. 
" + "Please use one of them.") + if batch_size_chars > 0: + return self._generate_batches_by_char_length( + text_iter, batch_size_chars, only_cui) + else: + return self._generate_simple_batches( + text_iter, batch_size, only_cui) + + def _generate_simple_batches( + self, + text_iter: Union[Iterator[str], Iterator[tuple[str, str]]], + batch_size: int, + only_cui: bool, + ) -> Iterator[list[tuple[str, str, bool]]]: + text_index = 0 + while True: + # Take a small batch from the iterator + batch = list(itertools.islice(text_iter, batch_size)) + if not batch: + break + # NOTE: typing is correct: + # - if str, then (str, int, bool) + # - if tuple, then (str, int, bool) + # but for some reason mypy complains + yield [ + (text, str(text_index + i), only_cui) # type: ignore + if isinstance(text, str) else + (text[1], text[0], only_cui) + for i, text in enumerate(batch) + ] + text_index += len(batch) + + def _mp_one_batch_per_process( + self, + executor: ProcessPoolExecutor, + batch_iter: Iterator[list[tuple[str, str, bool]]], + external_processes: int + ) -> Iterator[tuple[str, Union[dict, Entities, OnlyCUIEntities]]]: + futures: list[Future] = [] + # submit batches, one for each external processes + for _ in range(external_processes): + try: + batch = next(batch_iter) + futures.append( + executor.submit(self._mp_worker_func, batch)) + except StopIteration: + break + # Main process works on next batch while workers are busy + main_batch: Optional[list[tuple[str, str, bool]]] + try: + main_batch = next(batch_iter) + main_results = self._mp_worker_func(main_batch) + + # Yield main process results immediately + for result in main_results: + yield result[1], result[2] + + except StopIteration: + main_batch = None + # since the main process did around the same amount of work + # we would expect all subprocess to have finished by now + # so we're going to wait for them to finish, yield their results, + # and subsequently submit the next batch to keep them busy + for _ in 
range(external_processes): + # Wait for any future to complete + done_future = next(as_completed(futures)) + futures.remove(done_future) + + # Yield all results from this batch + for result in done_future.result(): + yield result[1], result[2] + + # Submit next batch to keep workers busy + try: + batch = next(batch_iter) + futures.append( + executor.submit(self._mp_worker_func, batch)) + except StopIteration: + # NOTE: if there's nothing to batch, we've got nothing + # to submit in terms of new work to the workers, + # but we may still have some futures to wait for + pass + + def get_entities_multi_texts( + self, + texts: Union[Iterable[str], Iterable[tuple[str, str]]], + only_cui: bool = False, + n_process: int = 1, + batch_size: int = -1, + batch_size_chars: int = 1_000_000, + ) -> Iterator[tuple[str, Union[dict, Entities, OnlyCUIEntities]]]: + """Get entities from multiple texts (potentially in parallel). + + If `n_process` > 1, `n_process - 1` new processes will be created + and data will be processed on those as well as the main process in + parallel. + + Args: + texts (Union[Iterable[str], Iterable[tuple[str, str]]]): + The input text. Either an iterable of raw text or one + with in the format of `(text_index, text)`. + only_cui (bool): + Whether to only return CUIs rather than other information + like start/end and annotated value. Defaults to False. + n_process (int): + Number of processes to use. Defaults to 1. + batch_size (int): + The number of texts to batch at a time. A batch of the + specified size will be given to each worker process. + Defaults to -1 and in this case the character count will + be used instead. + batch_size_chars (int): + The maximum number of characters to process in a batch. + Each process will be given batch of texts with a total + number of characters not exceeding this value. Defaults + to 1,000,000 characters. Set to -1 to disable. 
+ + Yields: + Iterator[tuple[str, Union[dict, Entities, OnlyCUIEntities]]]: + The results in the format of (text_index, entities). + """ + text_iter = cast( + Union[Iterator[str], Iterator[tuple[str, str]]], iter(texts)) + batch_iter = self._generate_batches( + text_iter, batch_size, batch_size_chars, only_cui) + if n_process == 1: + # just do in series + for batch in batch_iter: + for text_index, _, result in self._mp_worker_func(batch): + yield text_index, result + return + + external_processes = n_process - 1 + with ProcessPoolExecutor(max_workers=external_processes) as executor: + yield from self._mp_one_batch_per_process( + executor, batch_iter, external_processes) + def _get_entity(self, ent: MutableEntity, doc_tokens: list[str], cui: str) -> Entity: @@ -253,6 +445,9 @@ def save_model_pack( self, target_folder: str, pack_name: str = DEFAULT_PACK_NAME, serialiser_type: Union[str, AvailableSerialisers] = 'dill', make_archive: bool = True, + only_archive: bool = False, + add_hash_to_pack_name: bool = True, + change_description: Optional[str] = None, ) -> str: """Save model pack. @@ -268,14 +463,22 @@ def save_model_pack( The serialiser type. Defaults to 'dill'. make_archive (bool): Whether to make the arhive /.zip file. Defaults to True. + only_archive (bool): + Whether to clear the non-compressed folder. Defaults to False. + add_hash_to_pack_name (bool): + Whether to add the hash to the pack name. This is only relevant + if pack_name is specified. Defaults to True. + change_description (Optional[str]): + If provided, this the description will be added to the + model description. Defaults to None. Returns: str: The final model pack path. 
""" self.config.meta.mark_saved_now() # figure out the location/folder of the saved files - hex_hash = self._versioning() - if pack_name == DEFAULT_PACK_NAME: + hex_hash = self._versioning(change_description) + if pack_name == DEFAULT_PACK_NAME or add_hash_to_pack_name: pack_name = f"{pack_name}_{hex_hash}" model_pack_path = os.path.join(target_folder, pack_name) # ensure target folder and model pack folder exist @@ -294,9 +497,16 @@ def save_model_pack( if make_archive: shutil.make_archive(model_pack_path, 'zip', root_dir=model_pack_path) + if only_archive: + logger.info("Removing the non-archived model pack folder: %s", + model_pack_path) + shutil.rmtree(model_pack_path, ignore_errors=True) + # change the model pack path to the zip file so that we + # refer to an existing file + model_pack_path += ".zip" return model_pack_path - def _versioning(self) -> str: + def _get_hash(self) -> str: hasher = Hasher() logger.debug("Hashing the CDB") hasher.update(self.cdb.get_hash()) @@ -306,6 +516,14 @@ def _versioning(self) -> str: type(component).__name__) hasher.update(component.get_hash()) hex_hash = self.config.meta.hash = hasher.hexdigest() + return hex_hash + + def _versioning(self, change_description: Optional[str]) -> str: + date_today = date.today().strftime("%d %B %Y") + if change_description is not None: + self.config.meta.description += ( + f"\n[{date_today}] {change_description}") + hex_hash = self._get_hash() history = self.config.meta.history if not history or history[-1] != hex_hash: history.append(hex_hash) diff --git a/medcat2/components/addons/meta_cat/data_utils.py b/medcat2/components/addons/meta_cat/data_utils.py index cf64ab77..1a2837aa 100644 --- a/medcat2/components/addons/meta_cat/data_utils.py +++ b/medcat2/components/addons/meta_cat/data_utils.py @@ -1,4 +1,6 @@ from typing import Optional +import copy + from medcat2.components.addons.meta_cat.mctokenizers.tokenizers import ( TokenizerWrapperBase) import logging @@ -189,7 +191,9 @@ def 
prepare_for_oversampled_data(data: list, def encode_category_values(data: dict, existing_category_value2id: Optional[dict] = None, - category_undersample=None) -> tuple: + category_undersample=None, + alternative_class_names: list[list[str]] = [] + ) -> tuple: """Converts the category values in the data outputted by `prepare_from_json` into integer values. @@ -201,6 +205,10 @@ def encode_category_values(data: dict, category_undersample: Name of class that should be used to undersample the data (for 2 phase learning) + alternative_class_names (list[list[str]]): + A list of lists of strings, where each list contains variations + of a class name. Usually read from the config at + `config.general.alternative_class_names`. Returns: dict: @@ -210,6 +218,10 @@ def encode_category_values(data: dict, inplace of strings for category values dict: Map from category value to ID for all categories in the data. + + Raises: + Exception: If categoryvalue2id is pre-defined and its labels do + not match the labels found in the data """ data_list = list(data) if existing_category_value2id is not None: @@ -218,9 +230,61 @@ def encode_category_values(data: dict, category_value2id = {} category_values = set([x[2] for x in data_list]) - for c in category_values: - if c not in category_value2id: - category_value2id[c] = len(category_value2id) + + if (len(category_value2id) != 0 and + set(category_value2id.keys()) != category_values): + # if categoryvalue2id doesn't match the labels in the data, + # then 'alternative_class_names' has to be defined to check + # for variations + if len(alternative_class_names) == 0: + # Raise an exception since the labels don't match + raise Exception( + "The classes set in the config are not the same as the one " + "found in the data. The classes present in the config vs the " + "ones found in the data - {set(category_value2id.keys())}, " + f"{category_values}. 
Additionally, ensure the populate the " + "'alternative_class_names' attribute to accommodate for " + "variations.") + updated_category_value2id = {} + for _class in category_value2id.keys(): + if _class in category_values: + updated_category_value2id[_class] = category_value2id[_class] + else: + found_in = [sub_map for sub_map in alternative_class_names + if _class in sub_map] + failed_to_find = False + if len(found_in) != 0: + class_name_matched = [label for label in found_in[0] + if label in category_values] + if len(class_name_matched) != 0: + updated_category_value2id[class_name_matched[0] + ] = category_value2id[_class] + logger.info( + "Class name '%s' does not exist in the data; " + "however a variation of it '%s' is present; " + "updating it...", _class, class_name_matched[0]) + else: + failed_to_find = True + else: + failed_to_find = True + if failed_to_find: + raise Exception( + "The classes set in the config are not the same as " + "the one found in the data. The classes present in " + "the config vs the ones found in the data - " + f"{set(category_value2id.keys())}, {category_values}. 
" + "Additionally, ensure the populate the " + "'alternative_class_names' attribute to accommodate " + "for variations.") + category_value2id = copy.deepcopy(updated_category_value2id) + logger.info("Updated categoryvalue2id mapping - %s", category_value2id) + # Else create the mapping from the labels found in the data + else: + for c in category_values: + if c not in category_value2id: + category_value2id[c] = len(category_value2id) + logger.info("Categoryvalue2id mapping created with labels found " + "in the data - %s", category_value2id) # Map values to numbers for i in range(len(data_list)): @@ -232,7 +296,7 @@ def encode_category_values(data: dict, if data_list[i][2] in category_value2id.values(): label_data_[data_list[i][2]] = label_data_[data_list[i][2]] + 1 - logger.info("Original label_data: %s", label_data_) + logger.info("Original number of samples per label: %s", label_data_) # Undersampling data if category_undersample is None or category_undersample == '': min_label = min(label_data_.values()) @@ -257,6 +321,7 @@ def encode_category_values(data: dict, if data_undersampled[i][2] in category_value2id.values(): label_data[data_undersampled[i][2]] = label_data[ data_undersampled[i][2]] + 1 - logger.info("Updated label_data: %s", label_data) + logger.info("Updated number of samples per label (for 2-phase learning): " + "%s", label_data) return data_list, data_undersampled, category_value2id diff --git a/medcat2/components/addons/meta_cat/meta_cat.py b/medcat2/components/addons/meta_cat/meta_cat.py index 0c7f325d..a6d54a4e 100644 --- a/medcat2/components/addons/meta_cat/meta_cat.py +++ b/medcat2/components/addons/meta_cat/meta_cat.py @@ -454,12 +454,16 @@ def train_raw(self, data_loaded: dict, save_dir_path: Optional[str] = None, prerequisites=t_config.prerequisites, lowercase=g_config.lowercase) # Check is the name present - category_name = g_config.category_name - if category_name not in data_in: + category_name = category_name = 
g_config.get_applicable_category_name( + data_in) + if category_name is None: raise Exception( "The category name does not exist in this json file. " f"You've provided '{category_name}', while the possible " - f"options are: {' | '.join(list(data_in.keys()))}") + f"options are: {' | '.join(list(data_in.keys()))}. " + "Additionally, ensure the populate the " + "'alternative_category_names' attribute to accommodate " + "for variations.") data = data_in[category_name] if data_oversampled: @@ -473,15 +477,17 @@ def train_raw(self, data_loaded: dict, save_dir_path: Optional[str] = None, (full_data, data_undersampled, category_value2id) = encode_category_values( data, - category_undersample=self.config.model.category_undersample) - g_config.category_value2id = category_value2id + category_undersample=self.config.model.category_undersample, + alternative_class_names=g_config.alternative_class_names) else: # We already have everything, just get the data (full_data, data_undersampled, category_value2id) = encode_category_values( data, existing_category_value2id=category_value2id, - category_undersample=self.config.model.category_undersample) + category_undersample=self.config.model.category_undersample, + alternative_class_names=g_config.alternative_class_names) g_config.category_value2id = category_value2id + self.config.model.nclasses = len(category_value2id) # Make sure the config number of classes is the same # as the one found in the data if len(category_value2id) != self.config.model.nclasses: @@ -499,9 +505,8 @@ def train_raw(self, data_loaded: dict, save_dir_path: Optional[str] = None, try: self.model.load_state_dict(torch.load( model_save_path, map_location=device)) - logger.info( - "Model state loaded from dict for 2 phase learning") - + logger.info("Training model for Phase 2, with model dict " + "loaded from disk") except FileNotFoundError: raise FileNotFoundError( f"\nError: Model file not found at path: {model_save_path}" @@ -524,6 +529,7 @@ def 
train_raw(self, data_loaded: dict, save_dir_path: Optional[str] = None, logger.info("For phase 1, model state has to be saved. " "Saving model...") t_config.auto_save_model = True + logger.info("Training model for Phase 1 now...") report = train_model(self.model, data=data, config=self.config, save_dir_path=save_dir_path) @@ -576,8 +582,8 @@ def eval(self, json_path: str) -> dict: prerequisites=t_config.prerequisites, lowercase=g_config.lowercase) # Check is the name there - category_name = g_config.category_name - if category_name not in data_in: + category_name = g_config.get_applicable_category_name(data_in) + if category_name is None: raise Exception( "The category name does not exist in this json file.") diff --git a/medcat2/components/addons/meta_cat/ml_utils.py b/medcat2/components/addons/meta_cat/ml_utils.py index be4c3ff0..527ad483 100644 --- a/medcat2/components/addons/meta_cat/ml_utils.py +++ b/medcat2/components/addons/meta_cat/ml_utils.py @@ -360,7 +360,8 @@ def initialize_model(classifier, data_, batch_size_, lr_, epochs=4): name='Test') _report = classification_report(y_test, np.argmax(np.concatenate( - all_logits_test, axis=0), axis=1), output_dict=True) + all_logits_test, axis=0), axis=1), output_dict=True, + zero_division=0) if not winner_report or _report[config.train.metric['base']][ config.train.metric['score']] > \ winner_report['report'][config.train.metric['base']][ @@ -371,7 +372,8 @@ def initialize_model(classifier, data_, batch_size_, lr_, epochs=4): cm = confusion_matrix(y_test, np.argmax(np.concatenate( all_logits_test, axis=0), axis=1), normalize='true') report_train = classification_report(y_train, np.argmax( - np.concatenate(all_logits, axis=0), axis=1), output_dict=True) + np.concatenate(all_logits, axis=0), axis=1), output_dict=True, + zero_division=0) winner_report['confusion_matrix'] = cm winner_report['report'] = report diff --git a/medcat2/components/ner/trf/deid.py b/medcat2/components/ner/trf/deid.py index 8bdc68e9..eee03d0f 
100644 --- a/medcat2/components/ner/trf/deid.py +++ b/medcat2/components/ner/trf/deid.py @@ -35,6 +35,7 @@ - cdb """ from typing import Union, Any, Optional +import re import logging from medcat2.cat import CAT @@ -44,6 +45,7 @@ from medcat2.components.ner.trf.model import NerModel from medcat2.components.ner.trf.helpers import replace_entities_in_text from medcat2.components.ner.trf.transformers_ner import TransformersNER +from medcat2.data.entities import Entity logger = logging.getLogger(__name__) @@ -141,3 +143,136 @@ def create(cls, cdb: CDB, cnf: ConfigTransformersNER): cdb.config.components.linking.comp_name = 'no_action' cat = CAT(cdb=cdb, vocab=None, config=cdb.config) return cls(cat) + + +def match_rules(rules: list[tuple[str, str]], texts: list[str], + cui2preferred_name: dict[str, str]) -> list[list[Entity]]: + """Match a set of rules - pat / cui combos as post processing labels. + Uses a cat DeID model for pretty name mapping. + Args: + rules (list[tuple[str, str]]): List of tuples of pattern and cui + texts (list[str]): List of texts to match rules on + cui2preferred_name (dict[str, str]): Dictionary of CUI to + preferred name, likely to be cat.cdb.cui2preferred_name. + Examples: + >>> cat = CAT.load_model_pack(model_pack_path) + ... 
+ >>> rules = [ + ('(123) 456-7890', '134'), + ('1234567890', '134'), + ('123.456.7890', '134'), + ('1234567890', '134'), + ('1234567890', '134'), + ] + >>> texts = [ + 'My phone number is (123) 456-7890', + 'My phone number is 1234567890', + 'My phone number is 123.456.7890', + 'My phone number is 1234567890', + ] + >>> matches = match_rules(rules, texts, cat.cdb.cui2preferred_name) + Returns: + List[List[Dict]]: List of lists of predictions from `match_rules` + """ + # Iterate through each text and pattern combination + rule_matches_per_text: list[list[Entity]] = [] + for i, text in enumerate(texts): + matches_in_text: list[Entity] = [] + for pattern, concept in rules: + # Find all matches of current pattern in current text + text_matches = re.finditer(pattern, text, flags=re.M) + # Add each match with its pattern and text info + for match in text_matches: + matches_in_text.append({ + 'source_value': match.group(), + 'pretty_name': cui2preferred_name[concept], + 'start': match.start(), + 'end': match.end(), + 'cui': concept, + 'acc': 1.0 + }) + rule_matches_per_text.append(matches_in_text) + return rule_matches_per_text + + +def merge_all_preds(model_preds_by_text: list[list[Entity]], + rule_matches_per_text: list[list[Entity]], + accept_preds: bool = True) -> list[list[Entity]]: + """Conveniance method to merge predictions from rule based and deID model + predictions. + + Args: + model_preds_by_text (list[list[Entity]]): + List of predictions from `cat.get_entities()`, then + `[list(m['entities'].values()) for m in model_preds]` + rule_matches_per_text (list[list[Entity]]): + List of predictions from output of running `match_rules` + accept_preds (bool): + Uses the predicted label from the model, + model_preds_by_text, over the rule matches if they overlap. + Defaults to using model preds over rules. 
+ Returns: + list[list[Entity]]: List of lists of predictions from `merge_all_preds` + """ + assert len(model_preds_by_text) == len(rule_matches_per_text), ( + "model_preds_by_text and rule_matches_per_text must have the same " + "length as they should be CAT.get_entities and match_rules outputs of " + "the same text") + return [ + merge_preds(model_preds_by_text[i], + rule_matches_per_text[i], + accept_preds) + for i in range(len(model_preds_by_text))] + + +def merge_preds(model_preds: list[Entity], + rule_matches: list[Entity], + accept_preds: bool = True) -> list[Entity]: + """Merge predictions from rule based and deID model predictions. + Args: + model_preds (list[Entity]): predictions from `cat.get_entities()` + rule_matches (list[Entity]): predictions from output of running + `match_rules` on the same text + accept_preds (bool): uses the predicted label from the model, + model_preds, over the rule matches if they overlap. + Defaults to using model preds over rules. + Examples: + >>> # a list of predictions from `cat.get_entities()` + >>> model_preds = [ + [ + {'cui': '134', 'start': 10, 'end': 20, 'acc': 1.0, + 'pretty_name': 'Phone Number'}, + {'cui': '134', 'start': 25, 'end': 35, 'acc': 1.0, + 'pretty_name': 'Phone Number'} + ] + ] + >>> # a list of predictions from `match_rules` + >>> rule_matches = [ + [ + {'cui': '134', 'start': 10, 'end': 20, 'acc': 1.0, + 'pretty_name': 'Phone Number'}, + {'cui': '134', 'start': 25, 'end': 35, 'acc': 1.0, + 'pretty_name': 'Phone Number'} + ] + ] + >>> merged_preds = merge_preds(model_preds, rule_matches) + Returns: + list[Entity]: List of predictions from `merge_preds` + """ + if accept_preds: + labels1 = model_preds + labels2 = rule_matches + else: + labels1 = rule_matches + labels2 = model_preds + + # Keep only non-overlapping model predictions + labels2 = [span2 for span2 in labels2 + if not any(not (span2['end'] <= span1['start'] or + span1['end'] <= span2['start']) + for span1 in labels1)] + # merge preds and 
sort on start + merged_preds = labels1 + labels2 + merged_preds.sort(key=lambda x: x['start']) + merged_preds + return merged_preds diff --git a/medcat2/components/ner/trf/transformers_ner.py b/medcat2/components/ner/trf/transformers_ner.py index 0a3260af..d520256f 100644 --- a/medcat2/components/ner/trf/transformers_ner.py +++ b/medcat2/components/ner/trf/transformers_ner.py @@ -379,6 +379,8 @@ def train(self, ignore_extra_labels=False, dataset=None, meta_requirements=None, + train_json_path: Union[str, list, None] = None, + test_json_path: Union[str, list, None] = None, trainer_callbacks: Optional[list[TrCBCreator]] = None ) -> tuple: """Train or continue training a model give a json_path containing a @@ -395,6 +397,10 @@ def train(self, in the old model. dataset: Defaults to None. meta_requirements: Defaults to None + train_json_path (Union[str, list, None]): + The json path for the training data. Defaults to None. + test_json_path (Union[str, list, None]): + The json path for the test data. Defaults to None. trainer_callbacks (list[TrCBCreator]): A list of trainer callbacks for collecting metrics during the training at the client side. 
The transformers Trainer object @@ -404,12 +410,22 @@ def train(self, Tuple: The dataframe, examples, and the dataset """ - if dataset is None and json_path is not None: + if dataset is None: # Load the medcattrainer export - json_path = self._prepare_dataset( - json_path, ignore_extra_labels=ignore_extra_labels, - meta_requirements=meta_requirements, - file_name='data_eval.json') + if json_path is not None: + json_path = self._prepare_dataset( + json_path, ignore_extra_labels=ignore_extra_labels, + meta_requirements=meta_requirements, + file_name='data_eval.json') + elif test_json_path is not None and train_json_path is not None: + train_json_path = self._prepare_dataset( + train_json_path, ignore_extra_labels=ignore_extra_labels, + meta_requirements=meta_requirements, + file_name='data_train.json') + test_json_path = self._prepare_dataset( + test_json_path, ignore_extra_labels=ignore_extra_labels, + meta_requirements=meta_requirements, + file_name='data_test.json') # Load dataset # NOTE: The following is for backwards comppatibility @@ -424,15 +440,27 @@ def train(self, trust_remote_code=True) else: ds_load_dataset = datasets.load_dataset - dataset = ds_load_dataset(os.path.abspath( - transformers_ner.__file__), - data_files={'train': json_path}, # type: ignore - split='train', - cache_dir='/tmp/') - # We split before encoding so the split is document level, - # as encoding does the document splitting into max_seq_len - dataset = dataset.train_test_split( - test_size=self.config.general.test_size) # type: ignore + if json_path: + dataset = ds_load_dataset(os.path.abspath( + transformers_ner.__file__), + data_files={'train': json_path}, # type: ignore + split='train', + cache_dir='/tmp/') + # We split before encoding so the split is document level, + # as encoding does the document splitting into max_seq_len + dataset = dataset.train_test_split( + test_size=self.config.general.test_size) # type: ignore + elif train_json_path and test_json_path: + dataset = 
ds_load_dataset( + os.path.abspath(transformers_ner.__file__), + data_files={ + 'train': train_json_path, + 'test': test_json_path}, # type: ignore + cache_dir='/tmp/') + else: + raise ValueError( + "Either json_path or train_json_path and test_json_path " + "must be provided when no dataset is provided") # Update labelmap in case the current dataset has more labels # than what we had before @@ -448,7 +476,8 @@ def train(self, len(self.tokenizer.label_map)) self.model = AutoModelForTokenClassification.from_pretrained( self.config.general.model_name, - num_labels=len(self.tokenizer.label_map)) + num_labels=len(self.tokenizer.label_map), + ignore_mismatched_sizes=True) self.tokenizer.cui2name = { k: self.cdb.get_name(k) for k in self.tokenizer.label_map.keys()} diff --git a/medcat2/config/config.py b/medcat2/config/config.py index 1cae59be..a81d7ced 100644 --- a/medcat2/config/config.py +++ b/medcat2/config/config.py @@ -1,4 +1,5 @@ -from typing import Optional, Iterator, Iterable, TypeVar, cast, Type, Any +from typing import (Optional, Iterator, Iterable, TypeVar, cast, Type, Any, + Literal) from typing import Protocol, runtime_checkable import logging from datetime import datetime @@ -189,11 +190,35 @@ class Config: validate_assignment = True +class UsageMonitor(SerialisableBaseModel): + enabled: Literal[True, False, 'auto'] = False + r"""Whether usage monitoring is enabled (True), disabled (False), + or automatic ('auto'). + If set to False, no logging is performed. + If set to True, logs are saved in the location specified by `log_folder`. + If set to 'auto', logs will be automatically enabled or disabled based on + environmenta variable (`MEDCAT_LOGS` - setting it to False or 0 + disabled logging) and distributed according to the OS preferred logs + location (`MEDCAT_LOGS_LOCATION`). 
+ The defaults for the location are: + - For Linux: ~/.local/share/medcat/logs/ + - For Windows: C:\Users\%USERNAME%\.cache\medcat\logs\ + """ + batch_size: int = 100 + """Number of logged events to write at once.""" + file_prefix: str = "usage_" + """The prefix for logged files. The suffix will be the model hash.""" + log_folder: str = "." + """The folder which contains the usage logs. In certain situations, + it may make sense to keep this separate from the overall logs. + NOTE: Does not take affect if `enabled` is set to 'auto'""" + + class General(SerialisableBaseModel): """The general part of the config""" nlp: NLPConfig = NLPConfig() # checkpoint: CheckPoint = CheckPoint() - # usage_monitor = UsageMonitor() + usage_monitor: UsageMonitor = UsageMonitor() """Checkpointing config""" log_level: int = logging.INFO """Logging config for everything | 'tagger' can be disabled, diff --git a/medcat2/config/config_meta_cat.py b/medcat2/config/config_meta_cat.py index 2d7a0343..27983646 100644 --- a/medcat2/config/config_meta_cat.py +++ b/medcat2/config/config_meta_cat.py @@ -1,8 +1,14 @@ from typing import Any, Optional +from collections.abc import Container from medcat2.config.config import DirtiableBaseModel, ComponentConfig from medcat2.storage.serialisers import AvailableSerialisers +import logging + + +logger = logging.getLogger(__name__) + class General(DirtiableBaseModel): """The General part of the MetaCAT config""" @@ -34,9 +40,28 @@ class General(DirtiableBaseModel): NB! For these changes to take effect, the pipe would need to be recreated. 
""" - category_value2id: dict = {} + alternative_category_names: list[str] = [] + """List that stores the variations of possible category names + Example: For Experiencer, the alternate name is Subject + alternative_category_names: ['Experiencer','Subject'] + In the case that one specified in `category_name` parameter does not match + the data, this ensures no error is raised and it is automatically mapped + """ + category_value2id: dict[str, int] = {} """Map from category values to ID, if empty it will be autocalculated during training""" + alternative_class_names: list[list[str]] = [[]] + """List of lists that stores the variations of possible class names + for each class mentioned in self.general.category_value2id. + Example: For Presence task, the class names vary across NHS sites. + To accommodate for this, alternative_class_names is populated as: + [ + ["Hypothetical (N/A)","Hypothetical"], + ["Not present (False)","False"], + ["Present (True)","True"] + ] + Each sub list contains the possible variations of the given class. + """ vocab_size: int = -1 """Will be set automatically if the tokenizer is provided during meta_cat init""" @@ -75,6 +100,22 @@ class General(DirtiableBaseModel): serialiser: AvailableSerialisers = AvailableSerialisers.dill """The serialiser to use when saving.""" + def get_applicable_category_name( + self, available_names: Container[str]) -> Optional[str]: + if self.category_name in available_names: + return self.category_name + matches = [cat for cat in self.alternative_category_names + if cat in available_names] + if len(matches) > 0: + logger.info( + "The category name provided in the config - '%s' is not " + "present in the data. However, the corresponding name - '%s' " + "from the category_name_mapping has been found. 
Updating the " + "category name...", self.category_name, *matches) + self.category_name = matches[0] + return self.category_name + return None + class Config: extra = 'allow' validate_assignment = True diff --git a/medcat2/config/config_rel_cat.py b/medcat2/config/config_rel_cat.py index 37074a50..225dd3e7 100644 --- a/medcat2/config/config_rel_cat.py +++ b/medcat2/config/config_rel_cat.py @@ -118,6 +118,9 @@ def __setattr__(self, key: str, value: Any): value = self.convert_keys_to_int(value) # Ensure conversion super().__setattr__(key, value) + class Config: + protected_namespaces = () + class Model(SerialisableBaseModel): """The model part of the RelCAT config""" @@ -159,6 +162,7 @@ class Model(SerialisableBaseModel): class Config: extra = 'allow' validate_assignment = True + protected_namespaces = () class Train(SerialisableBaseModel): diff --git a/medcat2/config/config_transformers_ner.py b/medcat2/config/config_transformers_ner.py index a0be9f25..23fb15dd 100644 --- a/medcat2/config/config_transformers_ner.py +++ b/medcat2/config/config_transformers_ner.py @@ -25,6 +25,7 @@ class General(SerialisableBaseModel): class Config: extra = 'allow' validate_assignment = True + protected_namespaces = () class ConfigTransformersNER(SerialisableBaseModel): diff --git a/medcat2/model_creation/preprocess_snomed.py b/medcat2/model_creation/preprocess_snomed.py new file mode 100644 index 00000000..70bffac1 --- /dev/null +++ b/medcat2/model_creation/preprocess_snomed.py @@ -0,0 +1,676 @@ +import os +import json +import re +import hashlib +import pandas as pd +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass, field +from enum import Enum, auto + + +def parse_file(filename, first_row_header=True, columns=None): + with open(filename, encoding='utf-8') as f: + entities = [[n.strip() for n in line.split('\t')] for line in f] + return pd.DataFrame( + entities[1:], columns=entities[0] if first_row_header else columns) + + +def get_all_children(sctid, 
pt2ch): + """ + Retrieves all the children of a given SNOMED CT ID (SCTID) from a given + parent-to-child mapping (pt2ch) via the "IS A" relationship. + pt2ch can be found in a MedCAT model in the additional info + via the call: cat.cdb.addl_info['pt2ch'] + + Args: + sctid (int): The SCTID whose children need to be retrieved. + pt2ch (dict): A dictionary containing the parent-to-child + relationships in the form {parent_sctid: [list of child sctids]}. + + Returns: + list: Unique SCTIDs: the given SCTID and all of its descendants. + """ + result = [] + stack = [sctid] + while len(stack) != 0: + # remove the last element from the stack + current_snomed = stack.pop() + current_snomed_children = pt2ch.get(current_snomed, []) + stack.extend(current_snomed_children) + result.append(current_snomed) + result = list(set(result)) + return result + + +def get_direct_refset_mapping(in_dict: dict) -> dict: + """This method uses the output from Snomed.map_snomed2icd10 or + Snomed.map_snomed2opcs4 and removes the metadata and maps each + SNOMED CUI to the prioritised list of the target ontology CUIs. + + The input dict is expected to be in the following format: + - Keys are SnomedCT CUIs + - The values are lists of dictionaries, each list item (at least) + - Has a key 'code' that specifies the target ontology CUI + - Has a key 'mapPriority' that specifies the priority + + Args: + in_dict (dict): The input dict. + + Returns: + dict: The map from Snomed CUI to the prioritised list of target + ontology CUIs. 
+ """ + ret_dict = dict() + for k, vals in in_dict.items(): + # sort such that highest priority values are first + svals = sorted(vals, key=lambda el: el['mapPriority'], reverse=True) + # only keep the code / CUI + ret_dict[k] = [v['code'] for v in svals] + return ret_dict + + +_IGNORE_TAG = '##IGNORE-THIS##' + + +class RefSetFileType(Enum): + concept = auto() + description = auto() + relationship = auto() + refset = auto() + + +@dataclass +class FileFormatDescriptor: + concept: str + description: str + relationship: str + refset: str + # for concept, description, and relationship (but not refset) + common_prefix: str = "sct2_" + + @classmethod + def ignore_all(cls) -> 'FileFormatDescriptor': + return cls(concept=_IGNORE_TAG, description=_IGNORE_TAG, + relationship=_IGNORE_TAG, refset=_IGNORE_TAG) + + def get_file_per_type(self, file_type: RefSetFileType) -> str: + raw = self._get_raw(file_type) + return (raw + if file_type == RefSetFileType.refset else + self.common_prefix + raw) + + def _get_raw(self, file_type: RefSetFileType) -> str: + return getattr(self, file_type.name) + + def get_concept(self) -> str: + return self.get_file_per_type(RefSetFileType.concept) + + def get_description(self) -> str: + return self.get_file_per_type(RefSetFileType.description) + + def get_relationship(self) -> str: + return self.get_file_per_type(RefSetFileType.relationship) + + def get_refset(self) -> str: + return self.get_file_per_type(RefSetFileType.refset) + + +@dataclass +class ExtensionDescription: + exp_name_in_folder: str + exp_files: FileFormatDescriptor + exp_2nd_part_in_folder: Optional[str] = None + + +SNOMED_FOLDER_NAME_PATTERN = re.compile( + # within: EXTENSION PRODUCTION RELEASE + r"^SnomedCT_([A-Za-z0-9]+)_([A-Za-z0-9]+)_(\d{8}T\d{6}Z$)") +PER_FILE_TYPE_PATHS = { + RefSetFileType.concept: os.path.join("Snapshot", "Terminology"), + RefSetFileType.description: os.path.join("Snapshot", "Terminology"), + RefSetFileType.relationship: os.path.join("Snapshot", 
"Terminology"), + RefSetFileType.refset: os.path.join("Snapshot", "Refset", "Map"), +} + + +class SupportedExtension(Enum): + INTERNATIONAL = ExtensionDescription( + exp_name_in_folder="InternationalRF2", + exp_files=FileFormatDescriptor( + concept="Concept_Snapshot", + description="Description_Snapshot-en", + relationship="Relationship_Snapshot", + # NOTE: the below will be ignored for UK_CLIN bundle + refset="der2_iisssccRefset_ExtendedMapSnapshot" + ), + ) + UK_CLINICAL = ExtensionDescription( + exp_name_in_folder="UKClinicalRF2", + exp_files=FileFormatDescriptor( + concept="Concept_UKCLSnapshot", + description="Description_UKCLSnapshot-en", + relationship="Relationship_UKCLSnapshot", + refset="der2_iisssciRefset_ExtendedMapUKCLSnapshot" + ), + ) + UK_CLINICAL_REFSET = ExtensionDescription( + exp_name_in_folder="UKClinicalRefsetsRF2", + exp_files=FileFormatDescriptor.ignore_all() + ) + UK_EDITION = ExtensionDescription( + exp_name_in_folder="UKEditionRF2", + exp_files=FileFormatDescriptor( + concept="Concept_UKEDSnapshot", + description="Description_UKEDSnapshot-en", + relationship="Relationship_UKEDSnapshot", + refset="der2_iisssciRefset_ExtendedMapUKEDSnapshot" + ), + ) + UK_DRUG = ExtensionDescription( + exp_name_in_folder="UKDrugRF2", + exp_files=FileFormatDescriptor( + concept="Concept_UKDGSnapshot", + description="Description_UKDGSnapshot-en", + relationship="Relationship_UKDGSnapshot", + refset="der2_iisssciRefset_ExtendedMapUKDGSnapshot", + ), + ) + AU = ExtensionDescription( + exp_name_in_folder="Release", + exp_2nd_part_in_folder="AU1000036", + exp_files=FileFormatDescriptor( + concept="Concept_Snapshot", + description="Description_Snapshot-en-AU", + relationship="Relationship_Snapshot", + refset=_IGNORE_TAG, + ), + ) + + +@dataclass +class BundleDescriptor: + extensions: List[SupportedExtension] + ignores: Dict[RefSetFileType, List[SupportedExtension]] = field( + default_factory=dict) + + def has_invalid(self, ext: SupportedExtension, + file_types: 
Tuple[RefSetFileType]) -> bool: + for ft in file_types: + if ft not in self.ignores: + continue + exts2ignore = self.ignores[ft] + if ext in exts2ignore: + return True + return False + + +class SupportedBundles(Enum): + UK_CLIN = BundleDescriptor( + extensions=[ + SupportedExtension.INTERNATIONAL, SupportedExtension.UK_CLINICAL, + SupportedExtension.UK_CLINICAL_REFSET, + SupportedExtension.UK_EDITION], + ignores={RefSetFileType.refset: [SupportedExtension.INTERNATIONAL]} + ) + UK_DRUG_EXT = BundleDescriptor( + extensions=[SupportedExtension.UK_DRUG, SupportedExtension.UK_EDITION], + ) + + +def match_partials_with_folders(exp_names: List[Tuple[str, Optional[str]]], + folder_names: List[str], + _group_nr1: int = 1, _group_nr2: int = 2 + ) -> bool: + if len(exp_names) > len(folder_names): + return False + available_folders = [os.path.basename(f) for f in folder_names] + for exp_name, exp_name_p2 in exp_names: + found_cur_name = False + for fi, folder in enumerate(available_folders): + m = SNOMED_FOLDER_NAME_PATTERN.match(folder) + if not m: + continue + if m.group(_group_nr1) != exp_name: + continue + if exp_name_p2 and m.group(_group_nr2) != exp_name_p2: + continue + found_cur_name = True + break + if found_cur_name: + available_folders.pop(fi) + else: + return False + return True + + +class Snomed: + """ + Pre-process SNOMED CT release files. + + This class is used to create a SNOMED CT concept DataFrame ready for + MedCAT CDB creation. + + Attributes: + data_path (str): Path to the unzipped SNOMED CT folder. + release (str): Release of SNOMED CT folder. + uk_ext (bool, optional): Specifies whether the version is a + SNOMED UK extension released after 2021. Defaults to False. + uk_drug_ext (bool, optional): Specifies whether the version + is a SNOMED UK drug extension. Defaults to False. + au_ext (bool, optional): Specifies whether the version is a + AU release. Defaults to False. 
+ """ + NO_VERSION_DETECTED = 'N/A' + + def __init__(self, data_path): + self.data_path = data_path + self.bundle = self._determine_bundle(self.data_path) + self.paths, self.snomed_releases, self.exts = ( + self._check_path_and_release()) + + @classmethod + def _determine_bundle(cls, data_path) -> Optional[SupportedBundles]: + if not os.path.exists(data_path) or not os.path.isdir(data_path): + return None + for bundle in SupportedBundles: + folder_names = list(os.listdir(data_path)) + exp_names = [(ext.value.exp_name_in_folder, + ext.value.exp_2nd_part_in_folder) + for ext in bundle.value.extensions] + if match_partials_with_folders(exp_names, folder_names): + return bundle + return None + + def _set_extension(self, release: str, extension: SupportedExtension + ) -> None: + # NOTE: now using the later refset ID by default + # NOTE: the OPCS4 refset ID is only relevant for UK releases + self.opcs_refset_id = '1382401000000109' + if (extension in (SupportedExtension.UK_CLINICAL, + SupportedExtension.UK_DRUG) and + # using lexicographical comparison below + # e.g "20240101" > "20231122" results in True + # yet "20231121" > "20231122" results in False + len(release) == len("20231122") and release < "20231122"): + # NOTE for UK extensions starting from 20231122 the + # OPCS4 refset ID seems to be different + self.opcs_refset_id = "1126441000000105" + self._extension = extension + + @classmethod + def _determine_extension(cls, folder_path: str, + _group_nr1: int = 1, _group_nr2: int = 2 + ) -> SupportedExtension: + folder_basename = os.path.basename(folder_path) + m = SNOMED_FOLDER_NAME_PATTERN.match(folder_basename) + if not m: + raise UnkownSnomedReleaseException( + f"Unable to determine extension for path {repr(folder_path)}. 
" + f"Checking against pattern {SNOMED_FOLDER_NAME_PATTERN}") + ext_str = m.group(_group_nr1) + ext_str2 = m.group(_group_nr2) + for extension in SupportedExtension: + if extension.value.exp_name_in_folder != ext_str: + continue + if (extension.value.exp_2nd_part_in_folder and + extension.value.exp_2nd_part_in_folder != ext_str2): + continue + return extension + ext_names_folders = ",".join( + [f"{ext.name} ({ext.value.exp_name_in_folder})" + for ext in SupportedExtension]) + raise UnkownSnomedReleaseException( + f"Cannot Find the extension for {folder_path}. " + f"Tried the following extensions: {ext_names_folders}") + + @classmethod + def _determine_release(cls, folder_path: str, strict: bool = True, + _group_nr: int = 3, _keep_chars: int = 8) -> str: + folder_basename = os.path.basename(folder_path) + match = SNOMED_FOLDER_NAME_PATTERN.match(folder_basename) + if match is None and strict: + raise UnkownSnomedReleaseException( + f"No version found in '{folder_path}'") + elif match is None: + return cls.NO_VERSION_DETECTED + return match.group(_group_nr)[:_keep_chars] + + def to_concept_df(self): + """ + Create a SNOMED CT concept DataFrame. + + Creates a SNOMED CT concept DataFrame ready for MEDCAT CDB creation. + Checks if the version is a UK extension release and sets the correct + file names for the concept and description snapshots accordingly. + Additionally, handles the divergent release format of the UK Drug + Extension >v2021 with the `uk_drug_ext` variable. + + Returns: + pandas.DataFrame: SNOMED CT concept DataFrame. 
+ """ + + df2merge = [] + for i, snomed_release in enumerate(self.snomed_releases): + self._set_extension(snomed_release, self.exts[i]) + contents_path = os.path.join( + self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) + exp_files = self._extension.value.exp_files + concept_snapshot = exp_files.get_concept() + description_snapshot = exp_files.get_description() + if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or ( + self.bundle and self.bundle.value.has_invalid( + self._extension, [RefSetFileType.concept, + RefSetFileType.description])): + continue + + for f in os.listdir(contents_path): + m = re.search(f'{concept_snapshot}' + r'_(.*)_\d*.txt', f) + if m: + snomed_v = m.group(1) + + int_terms = parse_file( + f'{contents_path}/' + f'{concept_snapshot}_{snomed_v}_{snomed_release}.txt') + active_terms = int_terms[int_terms.active == '1'] + del int_terms + + int_desc = parse_file( + f'{contents_path}/{description_snapshot}_{snomed_v}_' + f'{snomed_release}.txt') + active_descs = int_desc[int_desc.active == '1'] + del int_desc + + _ = pd.merge(active_terms, active_descs, + left_on=['id'], right_on=['conceptId'], + how='inner') + del active_terms + del active_descs + + active_with_primary_desc = _[ + _['typeId'] == '900000000000003001'] # active description + active_with_synonym_desc = _[ + _['typeId'] == '900000000000013009'] # active synonym + del _ + active_with_all_desc = pd.concat( + [active_with_primary_desc, active_with_synonym_desc]) + + active_snomed_df = active_with_all_desc[['id_x', 'term', 'typeId']] + del active_with_all_desc + + active_snomed_df = active_snomed_df.rename( + columns={'id_x': 'cui', 'term': 'name', + 'typeId': 'name_status'}) + active_snomed_df['ontologies'] = 'SNOMED-CT' + active_snomed_df['name_status'] = active_snomed_df[ + 'name_status'].replace( + ['900000000000003001', '900000000000013009'], + ['P', 'A']) + active_snomed_df = active_snomed_df.reset_index(drop=True) + + temp_df = active_snomed_df[ + 
active_snomed_df['name_status'] == 'P'][['cui', 'name']] + temp_df['description_type_ids'] = temp_df['name'].str.extract( + r"\((\w+\s?.?\s?\w+.?\w+.?\w+.?)\)$") + active_snomed_df = pd.merge( + active_snomed_df, + temp_df.loc[:, ['cui', 'description_type_ids']], + on='cui', + how='left') + del temp_df + + # Hash semantic tag to get a 8 digit type_id code + active_snomed_df['type_ids'] = ( + active_snomed_df['description_type_ids'].apply( + lambda x: int( + hashlib.sha256(str(x).encode('utf-8')).hexdigest(), + 16) % 10 ** 8)) + df2merge.append(active_snomed_df) + + return pd.concat(df2merge).reset_index(drop=True) + + def list_all_relationships(self): + """ + List all SNOMED CT relationships. + + SNOMED CT provides a rich set of inter-relationships between concepts. + + Returns: + list: List of all SNOMED CT relationships. + """ + all_rela = [] + for i, snomed_release in enumerate(self.snomed_releases): + self._set_extension(snomed_release, self.exts[i]) + contents_path = os.path.join( + self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) + exp_files = self._extension.value.exp_files + concept_snapshot = exp_files.get_concept() + relationship_snapshot = exp_files.get_relationship() + if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or ( + self.bundle and self.bundle.value.has_invalid( + self._extension, [RefSetFileType.concept, + RefSetFileType.description])): + continue + + for f in os.listdir(contents_path): + m = re.search(f'{concept_snapshot}' + r'_(.*)_\d*.txt', f) + if m: + snomed_v = m.group(1) + int_relat = parse_file( + f'{contents_path}/' + f'{relationship_snapshot}_{snomed_v}_{snomed_release}.txt') + active_relat = int_relat[int_relat.active == '1'] + del int_relat + + all_rela.extend( + [relationship for + relationship in active_relat["typeId"].unique()]) + return all_rela + + def relationship2json(self, relationshipcode, output_jsonfile): + """ + Convert a single relationship map structure to JSON file. 
+ + Args: + relationshipcode (str): A single SCTID or unique concept identifier + of the relationship type. + output_jsonfile (str): Name of JSON file output. + + Returns: + file: JSON file of relationship mapping. + """ + output_dict = {} + for i, snomed_release in enumerate(self.snomed_releases): + self._set_extension(snomed_release, self.exts[i]) + contents_path = os.path.join( + self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) + exp_files = self._extension.value.exp_files + concept_snapshot = exp_files.get_concept() + relationship_snapshot = exp_files.get_relationship() + if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or ( + self.bundle and self.bundle.value.has_invalid( + self._extension, [RefSetFileType.concept, + RefSetFileType.description])): + continue + + for f in os.listdir(contents_path): + m = re.search(f'{concept_snapshot}' + r'_(.*)_\d*.txt', f) + if m: + snomed_v = m.group(1) + int_relat = parse_file( + f'{contents_path}/' + f'{relationship_snapshot}_{snomed_v}_{snomed_release}.txt') + active_relat = int_relat[int_relat.active == '1'] + del int_relat + + relationship = dict( + [(key, []) for key in active_relat["destinationId"].unique()]) + for _, v in active_relat.iterrows(): + if v['typeId'] == str(relationshipcode): + _ = v['destinationId'] + relationship[_].append(v['sourceId']) + else: + pass + output_dict = { + key: output_dict.get(key, []) + relationship.get(key, []) + for key in + set(list(output_dict.keys()) + list(relationship.keys()))} + with open(output_jsonfile, 'w') as json_file: + json.dump(output_dict, json_file) + return + + def map_snomed2icd10(self): + """ + This function maps SNOMED CT concepts to ICD-10 codes using the refset + mappings provided in the SNOMED CT release package. + + Returns: + dict: A dictionary containing the SNOMED CT to ICD-10 mappings + including metadata. 
+ """ + snomed2icd10df = self._map_snomed2refset() + return self._refset_df2dict(snomed2icd10df[0]) + + def map_snomed2opcs4(self) -> dict: + """ + This function maps SNOMED CT concepts to OPCS-4 codes using the refset + mappings provided in the SNOMED CT release package. + + Then it calls the internal function _map_snomed2refset() to get the + DataFrame containing the OPCS-4 mappings. + The function then converts the DataFrame to a dictionary using the + internal function _refset_df2dict() + + Raises: + AttributeError: If OPCS-4 mappings aren't available. + + Returns: + dict: A dictionary containing the SNOMED CT to OPCS-4 mappings + including metadata. + """ + if all(ext not in (SupportedExtension.UK_CLINICAL, + SupportedExtension.UK_DRUG) + for ext in self.exts): + raise AttributeError( + "OPCS-4 mapping does not exist in this edition") + snomed2opcs4df = self._map_snomed2refset()[1] + return self._refset_df2dict(snomed2opcs4df) + + def _check_path_and_release(self): + """ + This function checks the path and release of the SNOMED CT data + provided. + + It looks for the "Snapshot" folder within the data path, and if + it's not found, it looks for any folder containing the name "SnomedCT". + It then stores the path and release in separate lists. + If no valid paths are found, it raises a FileNotFoundError. + + Returns: + tuple: a tuple containing two lists, the first one is a list of + the paths where the data is located and the second is a list + of the releases of the data. + + Raises: + FileNotFoundError: If the path to the SNOMED CT directory is + incorrect. 
+ """ + snomed_releases = [] + paths = [] + exts = [] + if "Snapshot" in os.listdir(self.data_path): + paths.append(self.data_path) + snomed_releases.append( + self._determine_release(self.data_path, strict=True)) + exts.append(self._determine_extension(self.data_path)) + else: + for folder in os.listdir(self.data_path): + if "SnomedCT" in folder: + paths.append(os.path.join(self.data_path, folder)) + rel = self._determine_release(folder, strict=True) + snomed_releases.append(rel) + exts.append(self._determine_extension(paths[-1])) + if len(paths) == 0: + raise FileNotFoundError('Incorrect path to SNOMED CT directory') + return paths, snomed_releases, exts + + def _refset_df2dict(self, refset_df: pd.DataFrame) -> dict: + """ + This function takes a SNOMED refset DataFrame as an input and converts + it into a dictionary. + + The DataFrame should contain the columns: + 'referencedComponentId','mapTarget','mapGroup','mapPriority','mapRule','mapAdvice'. + + Args: + refset_df (pd.DataFrame) : DataFrame containing the refset data + + Returns: + dict: mapping from SNOMED CT codes as key and the refset metadata + list of dictionaries as values. + """ + refset_dict = refset_df.groupby('referencedComponentId').apply( + lambda group: [{'code': row['mapTarget'], + 'mapGroup': row['mapPriority'], + 'mapPriority': row['mapPriority'], + 'mapRule': row['mapRule'], + 'mapAdvice': row['mapAdvice']} + for _, row in group.iterrows()]).to_dict() + return refset_dict + + def _map_snomed2refset(self): + """ + Maps SNOMED CT concepts to refset mappings provided in the SNOMED CT + release package. + + This function maps SNOMED CT concepts using the refset mappings in + the Snapshot/Refset/Map directory. + The refset mappings can either be ICD-10 codes in international + releases or OPCS4 codes for SNOMED UK_extension, if available. + + Returns: + pd.DataFrame: Dataframe containing SNOMED CT to refset mappings + and metadata. 
+ OR + tuple: Tuple of dataframes containing SNOMED CT to refset mappings + and metadata (ICD-10, OPCS4), if uk_ext is True. + """ + dfs2merge = [] + for i, snomed_release in enumerate(self.snomed_releases): + self._set_extension(snomed_release, self.exts[i]) + refset_terminology = os.path.join( + self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.refset]) + icd10_ref_set = self._extension.value.exp_files.get_refset() + if icd10_ref_set is None or _IGNORE_TAG in icd10_ref_set or ( + self.bundle and self.bundle.value.has_invalid( + self._extension, [RefSetFileType.concept, + RefSetFileType.description])): + continue + for f in os.listdir(refset_terminology): + m = re.search(f'{icd10_ref_set}' + r'_(.*)_\d*.txt', f) + if m: + snomed_v = m.group(1) + mappings = parse_file( + f'{refset_terminology}' + f'/{icd10_ref_set}_{snomed_v}_{snomed_release}.txt') + mappings = mappings[mappings.active == '1'] + icd_mappings = mappings.sort_values( + by=['referencedComponentId', 'mapPriority', + 'mapGroup']).reset_index(drop=True) + dfs2merge.append(icd_mappings) + mapping_df = pd.concat(dfs2merge) + del dfs2merge + if any(ext in (SupportedExtension.UK_CLINICAL, + SupportedExtension.UK_DRUG) + for ext in self.exts): + opcs_df = mapping_df[mapping_df['refsetId'] == self.opcs_refset_id] + icd10_df = mapping_df[mapping_df['refsetId'] + == '999002271000000101'] + return icd10_df, opcs_df + else: + return mapping_df, None + + +class UnkownSnomedReleaseException(ValueError): + + def __init__(self, *args) -> None: + super().__init__(*args) diff --git a/medcat2/model_creation/preprocess_umls.py b/medcat2/model_creation/preprocess_umls.py new file mode 100644 index 00000000..264dee53 --- /dev/null +++ b/medcat2/model_creation/preprocess_umls.py @@ -0,0 +1,311 @@ + +from typing import List, Union +import pandas as pd +import tqdm +import os +from typing import Dict + +_DEFAULT_COLUMNS: list = [ + "CUI", + "LAT", + "TS", + "LUI", + "STT", + "SUI", + "ISPREF", + "AUI", + "SAUI", + "SCUI", + 
"SDUI", + "SAB", + "TTY", + "CODE", + "STR", + "SRL", + "SUPPRESS", + "CVF", +] + +_DEFAULT_SEM_TYPE_COLUMNS: list = [ + "CUI", + "TUI", + "STN", + "STY", + "ATUI", + "CVF", +] + +_DEFAULT_MRHIER_COLUMNS: list = [ + "CUI", + "AUI", + "CXN", + "PAUI", + "SAB", + "RELA", + "PTR", + "HCD", + "CVF", +] + +medcat_csv_mapper: dict = { + 'CUI': 'cui', + 'STR': 'name', + 'SAB': 'ontologies', + 'ISPREF': 'name_status', + 'TUI': 'type_ids', # from MRSTY.RRF +} + + +class UMLS: + """Pre-process UMLS release files: + Args: + main_file_name (str): + Path to the main file name (probably MRCONSO.RRF) + sem_types_file (str): + Path to the semantic types file name (probably MRSTY.RRF) + allow_langugages (list): + Languages to filter out. Defaults to just English (['ENG']). + sep (str): + The separator used within the files. Defaults to '|'. + """ + + def __init__(self, main_file_name: str, sem_types_file: str, + allow_languages: list = ['ENG'], sep: str = '|'): + self.main_file_name = main_file_name + self.sem_types_file = sem_types_file + self.main_columns = list(_DEFAULT_COLUMNS) # copy + self.sem_types_columns = list(_DEFAULT_SEM_TYPE_COLUMNS) # copy + self.mrhier_columns = list(_DEFAULT_MRHIER_COLUMNS) # copy + self.sep = sep + # copy in case of default list + self.allow_langugages = list( + allow_languages) if allow_languages else allow_languages + + def to_concept_df(self) -> pd.DataFrame: + """Create a concept DataFrame. + The default column names are expected. + + Returns: + pd.DataFrame: The resulting DataFrame + """ + # target columns: + # cui, name, name_status, ontologies, description_type_ids, type_ids + df = pd.read_csv(self.main_file_name, + names=self.main_columns, sep=self.sep, + index_col=False) + + # filter languages + if self.allow_langugages: + df = df[df["LAT"].isin(self.allow_langugages)] + + # TODO filter by activity ? 
+ + # get TUI + + sem_types = pd.read_csv( + self.sem_types_file, names=self.sem_types_columns, sep=self.sep, + index_col=False) + df = df.merge(sem_types) + + # rename columns + + df = df.rename(columns=medcat_csv_mapper) + + # pop all unnecessary columns + + # all initial columns should have been renamed + for col_name in self.main_columns + self.sem_types_columns: + if col_name in df.columns: + df.pop(col_name) + + # looks like description_type_ids is not really used anywhere, + # so I won't look for it + + return df + + def map_umls2snomed(self) -> pd.DataFrame: + """Map to SNOMED-CT. + + Currently, uses the SCUI column. At the time of writing, this is equal + to the CODE column. + But this may not be the case in the future. + + Returns: + pd.DataFrame: Dataframe that contains the SCUI (source CUI) as + well as the UMLS CUI for each applicable concept + """ + df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False, dtype={'SCUI': 'str'}) + # get only SNOMED-CT US based concepts that have a SNOMED-CT (source) + # CUI + df = df[df.SAB == 'SNOMEDCT_US'][df.SCUI.notna()] + # sort by SCUI + df = df.sort_values(by='SCUI').reset_index(drop=True) + # rearrange with SCUI as the first column + df = df[['SCUI',] + [ + col for col in df.columns.values if col != 'SCUI']] + return df + + def map_umls2icd10(self) -> pd.DataFrame: + """Map to ICD-10. 
+ + Available SAB's that contain 'ICD10': + - CCSR_ICD10CM - CCSR_ICD10CM (Clinical Classifications Software Refined for ICD-10-CM) - Synopsis + - CCSR_ICD10PCS - CCSR_ICD10PCS (Clinical Classifications Software Refined for ICD-10-PCS) - Synopsis + - DMDICD10 - DMDICD10 (ICD-10 German) - Statistics + - ICD10AE - ICD10AE (ICD-10, American English Equivalents) - Synopsis + - ICD10AMAE - ICD10AMAE (ICD-10, Australian Modification, Americanized English Equivalents) - Synopsis + - ICD10AM - ICD10AM (ICD-10, Australian Modification) - Synopsis + - ICD10DUT - ICD10DUT (ICD10, Dutch Translation) - Synopsis + - ICD10PCS - ICD10PCS (ICD-10 Procedure Coding System) - Synopsis + - ICD10 - ICD10 (International Classification of Diseases and Related Health Problems, Tenth Revision) - Synopsis + - ICPC2ICD10DUT - ICPC2ICD10DUT (ICPC2-ICD10 Thesaurus, Dutch Translation) - Synopsis + - ICPC2ICD10ENG - ICPC2ICD10ENG (ICPC2-ICD10 Thesaurus) - Synopsis + - MTHICPC2ICD10AE - MTHICPC2ICD10AE (ICPC2E-ICD10 Thesaurus, American English Equivalents) - Synopsis + + Currently only using 'ICD10'. But others may be relevant as well. + + If one wants to use one of the other sources listed above, + they would need to use the map_umls2source method. + + Returns: + pd.DataFrame: DataFrame that has the ICD-10 codes + """ # noqa + return self.map_umls2source(sources='ICD10') + + def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame: + """Allows mapping to an arbitrary source (or sources). + + Args: + sources (Union[str, List[str]]): The source or sources to include. 
+ + Returns: + pd.DataFrame: DataFrame that has the target source codes + """ + df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False, dtype={'CODE': 'str'}) + # get the specified source(s) + if isinstance(sources, list): + df = df[df.SAB.isin(sources)][df.CODE.notna()] + else: + df = df[df.SAB == sources][df.CODE.notna()] + # sort by CODE + df = df.sort_values(by='CODE').reset_index(drop=True) + # rearrange columns starting with CODE + df = df[['CODE',] + [ + col for col in df.columns.values if col != 'CODE']] + return df + + def get_pt2ch(self) -> dict: + """Generates a parent to children dict. + + It goes through all the < # TODO + + The resulting dictionary maps a CUI to a list of CUIs that + consider that CUI as their parent. + + PS: + This expects the MRHIER.RRF file to also exist in the same folder + as the MRCONSO.RRF file. + + Raises: + ValueError: If the MRHIER.RRF file wasn't found + + Returns: + dict: The dictionary of parent CUI and their children. 
+ """ + path = self.main_file_name.rsplit('/', 1)[0] + hier_file = f"{path}/MRHIER.RRF" + + if not os.path.exists(hier_file): + raise ValueError( + 'Expected MRHIER.RRF to exist within the same parent folder ' + f'({path})') + + conso_df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False) + + hier_df = pd.read_csv(hier_file, sep=self.sep, index_col=False, + header=None, names=self.mrhier_columns) + + # filter languages + if self.allow_langugages: + conso_df = conso_df[conso_df["LAT"].isin(self.allow_langugages)] + + # create a AUI -> CUI map + aui_cui = dict(zip(conso_df["AUI"], conso_df["CUI"])) + + # remove non-preferred from conso + conso_df = conso_df[conso_df['ISPREF'] == 'Y'] + + # filter ISA relationships + hier_df = hier_df[hier_df['RELA'] == 'isa'] + + # merge dataframes + merged_df = pd.merge(conso_df, hier_df, on=['AUI', 'CUI']) + + # only keep CUI and parent AUI + cui_parent = merged_df[['CUI', 'PAUI']] + # only include CUIs with a parent + cui_parent = cui_parent[cui_parent['PAUI'].notna()] + + # create dict + pt2ch: dict = {} + for _, row in tqdm.tqdm(cui_parent.iterrows(), + total=len(cui_parent.index)): + cur_cui = row['CUI'] + paui = row['PAUI'] + parent_cui = aui_cui[paui] + # avoid self as parent/child + if parent_cui == cur_cui: + continue + if parent_cui not in pt2ch: + pt2ch[parent_cui] = set() + pt2ch[parent_cui].add(cur_cui) + # move from set to list for consistency with SNOMED + pt2ch: Dict[str, List[str]] = pt2ch # type: ignore + for k, v in pt2ch.items(): + pt2ch[k] = list(v) + return pt2ch + + +if __name__ == '__main__': + import sys + if len(sys.argv) < 3: + print('Need to specify two file locations: MRCONSO.RRF and MRSTY.RRF') + sys.exit(1) + umls = UMLS(sys.argv[1], sys.argv[2]) + df = umls.to_concept_df() + save_file = "preprocessed_umls.csv" + print(f"Saving to {save_file}") + df.to_csv(save_file, index=False) + print('Converting to SNOMED') + to_snomed = umls.map_umls2snomed() + print('As 
SNOMED:') + print(to_snomed.head()) + to_ICD10 = umls.map_umls2icd10() + print('As ICD-10:') + print(to_ICD10.head()) + to_ICD10_man = umls.map_umls2source(sources=['ICD10']) + print('As ICD-10(MAN):') + print(to_ICD10_man.head()) + pt2ch = umls.get_pt2ch() + print('Get parent-child dict', len(pt2ch), + '' if len(pt2ch) > 1_000 else pt2ch) + all_vals = [len(v) for v in pt2ch.values()] + print('LEN of VALS:', sum(all_vals), 'max', + max(all_vals), 'min', min(all_vals), 'mean', + sum(all_vals) / len(all_vals)) + import random + random_4_keys = random.sample(list(pt2ch.keys()), k=4) + + def _get_name(cui: str) -> str: + matches = df[df['cui'] == cui] + if len(matches.index) == 0: + return 'N/A' # UNKNOWN + return matches['name'].iloc[0] + print('FF RAW ', [f"{k}:{pt2ch[k]}" for k in random_4_keys]) + print('FIRST FEW', [ + (f"{_get_name(key)} ({key})", [f"{_get_name(child)} ({child})" + for child in pt2ch[key]]) + for key in random_4_keys]) diff --git a/medcat2/storage/schema.py b/medcat2/storage/schema.py index 6a720657..83328db3 100644 --- a/medcat2/storage/schema.py +++ b/medcat2/storage/schema.py @@ -1,5 +1,9 @@ from typing import Type import json +import logging + + +logger = logging.getLogger(__name__) _CLASS_PATH = "serialised-class" @@ -41,7 +45,15 @@ def load_schema(file_name: str) -> tuple[str, list[str]]: """ with open(file_name) as f: data = json.load(f) - return data[_CLASS_PATH], data[_INIT_PARTS_PATH] + class_name, init_parts = data[_CLASS_PATH], data[_INIT_PARTS_PATH] + if __package__.startswith("medcat.") and class_name.startswith("medcat2."): + # if we're loading a beta-release (medcat2.* namespaced) schema + # we need to convert it to the stable one + logger.info( + "Loading beta-release medcat2 schema at '%s'. 
" + "Converting to release schema.", file_name) + class_name = class_name.replace("medcat2.", "medcat.") + return class_name, init_parts class IllegalSchemaException(ValueError): diff --git a/medcat2/trainer.py b/medcat2/trainer.py index dbf11029..8fba3617 100644 --- a/medcat2/trainer.py +++ b/medcat2/trainer.py @@ -1,4 +1,4 @@ -from typing import Iterable, Callable, Optional, Union +from typing import Iterable, Callable, Optional, Union, cast import logging from itertools import chain, repeat, islice from tqdm import trange @@ -10,11 +10,12 @@ from medcat2.utils.config_utils import temp_changed_config from medcat2.utils.data_utils import make_mc_train_test, get_false_positives from medcat2.utils.filters import project_filters -from medcat2.data.mctexport import (MedCATTrainerExport, - MedCATTrainerExportProject, - MedCATTrainerExportDocument) +from medcat2.data.mctexport import ( + MedCATTrainerExport, MedCATTrainerExportProject, + MedCATTrainerExportDocument, count_all_annotations, iter_anns) from medcat2.preprocessors.cleaners import prepare_name, NameDescriptor from medcat2.components.types import CoreComponentType, TrainableComponent +from medcat2.components.addons.addons import AddonComponent from medcat2.pipeline.pipeline import Pipeline @@ -132,6 +133,7 @@ def train_supervised_raw(self, extra_cui_filter: Optional[set[str]] = None, # checkpoint: Optional[Checkpoint] = None, disable_progress: bool = False, + train_addons: bool = False, ) -> tuple: """Train supervised based on the raw data provided. @@ -213,6 +215,9 @@ def train_supervised_raw(self, disable_progress (bool): Whether to disable the progress output (tqdm). Defaults to False. + train_addons (bool): + Whether to also train the addons (e.g MetaCATs). Defaults + to False. 
Returns: tuple: Consisting of the following parts @@ -304,8 +309,42 @@ def train_supervised_raw(self, # # reset the state of filters # self.config.linking.filters = orig_filters + if (train_addons and + # NOTE if no annotations, no point + count_all_annotations(data) > 0): + self._train_addons(data) + return fp, fn, tp, p, r, f1, cui_counts, examples + def _train_meta_cat(self, addon: AddonComponent, + data: MedCATTrainerExport) -> None: + # NOTE: dynamic import to avoid circular imports + from medcat2.components.addons.meta_cat.meta_cat import ( + MetaCATAddon) + _, _, ann0 = next(iter_anns(data)) + if not isinstance(addon, MetaCATAddon): + raise TypeError( + f"Expected MetaCATAddon, got {type(addon)}") + if 'meta_anns' not in ann0: + logger.info("No Meta Annotations found to train MetaCATs") + return + # only consider meta-cats that have been defined + # for the category + ann_names = ann0['meta_anns'].keys() # type: ignore + # adapt to alternative names if applicable + cnf = addon.config + cat_name = cnf.general.get_applicable_category_name(ann_names) + if cat_name in ann_names: + logger.debug("Training MetaCAT %s", cnf.general.category_name) + # NOTE: this is a mypy quirk - the types are compatible + addon.mc.train_raw(cast(dict, data)) + + def _train_addons(self, data: MedCATTrainerExport): + logger.info("Training addons within train_supervised_raw") + for addon in self._pipeline._addons: + if addon.addon_type == "meta_cat": + self._train_meta_cat(addon, data) + def _perform_epoch(self, current_project: int, current_document: int, train_set: MedCATTrainerExport, diff --git a/medcat2/utils/data_utils.py b/medcat2/utils/data_utils.py index 0feae24f..3ae49ebd 100644 --- a/medcat2/utils/data_utils.py +++ b/medcat2/utils/data_utils.py @@ -121,8 +121,9 @@ def _split_doc_train_test(self, document: MedCATTrainerExportDocument, def _should_add_to_test(self, _cnts: dict[str, int]) -> bool: # Did we get more than 30% of concepts for any CUI with >=10 cnt return any( -
self.cnts[cui] >= 10 and - (v + self.test_cnts.get(cui, 0)) / self.cnts[cui] < 0.3 + self.cnts[cui] >= self.MIN_CNT_FOR_TEST and + (v + self.test_cnts.get(cui, 0) + ) / self.cnts[cui] < self.MAX_TEST_FRACTION for cui, v in _cnts.items() ) diff --git a/medcat2/utils/ner/metrics.py b/medcat2/utils/ner/metrics.py index 102c0318..a85a34ec 100644 --- a/medcat2/utils/ner/metrics.py +++ b/medcat2/utils/ner/metrics.py @@ -13,7 +13,24 @@ def metrics(p, return_df=False, plus_recall=0, tokenizer=None, dataset=None, merged_negative={0, 1, -100}, padding_label=-100, csize=15, subword_label=1, verbose=False): - """TODO: This could be done better, for sure. But it works.""" # noqa + """ + Calculate metrics for a model's predictions, based off the tokenized + output of a MedCATTrainer project. + + Args: + p: The model's predictions. + return_df: Whether to return a DataFrame of metrics. + plus_recall: The recall to add to the model's predictions. + tokenizer: The tokenizer used to tokenize the texts. + dataset: The dataset used to train the model. + merged_negative: The negative labels to merge. + padding_label: The padding label. + csize: The size of the context window. + subword_label: The subword label. + verbose: Whether to print the metrics. + Returns: + Dict: A dictionary of metrics. 
+ """ predictions = np.array(p.predictions) predictions = softmax(predictions, axis=2) examples = None @@ -146,3 +163,107 @@ def metrics(p, return_df=False, plus_recall=0, tokenizer=None, dataset=None, if pd.notna(x)])} else: return df, examples + + +def _anno_within_pred_list(label: dict, preds: list[dict]) -> bool: + """ + Check if a label is within a list of predictions, + + Args: + label (Dict): an annotation likely from a MedCATTrainer project + preds (List[Dict]): a list of predictions likely from a cat.__call__ + Returns: + bool: True if the label is within the list of predictions, + False otherwise + """ + return any(label['start'] >= p['start'] and label['end'] <= p['end'] + for p in preds) + + +def evaluate_predictions(true_annotations: list[list[dict]], + all_preds: list[list[dict]], texts: list[str], + cui2preferred_name: dict[str, str] + ) -> tuple[pd.DataFrame, dict]: + """ + Evaluate predictions against sets of collected labels as collected and + output from a MedCATTrainer project. + Counts predictions as correct if the prediction fully encloses the label. + + Args: + true_annotations (list[list[dict]]): Ground truth predictions by text + all_preds (list[list[dict]]): Model predictions by text + texts (list[str]): Original list of texts + cui2preferred_name (dict[str, str]): Dictionary of CUI to preferred + name, likely to be cat.cdb.cui2preferred_name. + Returns: + tuple[pd.DataFrame, Dict]: A tuple containing a DataFrame of + evaluation metrics and a dictionary of missed annotations per CUI.
+ """ + per_cui_recall = {} + per_cui_prec = {} + per_cui_recall_merged = {} + per_cui_anno_counts = {} + per_cui_annos_missed = defaultdict(list) + uniq_labels = set([p['cui'] for ap in true_annotations for p in ap]) + + for cui in uniq_labels: + # annos in test set + anno_count = sum([len([p for p in cui_annos if p['cui'] == cui]) + for cui_annos in true_annotations]) + pred_counts = sum([len([p for p in d if p['cui'] == cui]) + for d in all_preds]) + + # print(anno_count) + # print(pred_counts) + + # print(f'pred_count: {pred_counts}, anno_count:{anno_count}') + per_cui_anno_counts[cui] = anno_count + + doc_annos_left, preds_left, doc_annos_left_any_cui = [], [], [] + + for doc_preds, doc_labels, text in zip( + all_preds, true_annotations, texts): + # num of annos that are not found - recall + cui_labels = [label for label in doc_labels if label['cui'] == cui] + cui_doc_preds = [pred for pred in doc_preds if pred['cui'] == cui] + + labels_not_found = [ + label for label in cui_labels + if not _anno_within_pred_list(label, cui_doc_preds)] + doc_annos_left.append(len(labels_not_found)) + + # num of annos that are not found across any cui prediction + # - recall_merged + any_labels_not_found = [ + label for label in cui_labels + if not _anno_within_pred_list(label, doc_preds)] + doc_annos_left_any_cui.append(len(any_labels_not_found)) + + per_cui_annos_missed[cui].append(any_labels_not_found) + + # num of preds that are incorrect - precision + preds_left.append(len([ + label for label in cui_doc_preds + if not _anno_within_pred_list(label, cui_labels)])) + + if anno_count != 0 and pred_counts != 0: + per_cui_recall[cui] = ( + anno_count - sum(doc_annos_left)) / anno_count + per_cui_recall_merged[cui] = ( + anno_count - sum(doc_annos_left_any_cui)) / anno_count + per_cui_prec[cui] = ( + pred_counts - sum(preds_left)) / pred_counts + else: + per_cui_recall[cui] = 0 + per_cui_recall_merged[cui] = 0 + per_cui_prec[cui] = 0 + + res_df = pd.DataFrame({ + 'cui': 
per_cui_recall_merged.keys(), + 'recall_merged': per_cui_recall_merged.values(), + 'recall': per_cui_recall.values(), + 'precision': per_cui_prec.values(), + 'label_count': per_cui_anno_counts.values()}, + index=[cui2preferred_name[k] for k in per_cui_recall_merged]) + + return res_df, per_cui_annos_missed diff --git a/medcat2/utils/ner/transformers_ner.py b/medcat2/utils/ner/transformers_ner.py index 0399b41e..fcf3c8a0 100644 --- a/medcat2/utils/ner/transformers_ner.py +++ b/medcat2/utils/ner/transformers_ner.py @@ -63,7 +63,7 @@ def _info(self): def _split_generators(self, dl_manager): # noqa """Returns SplitGenerators.""" # noqa - return [ + splits = [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ @@ -72,6 +72,20 @@ def _split_generators(self, dl_manager): # noqa ), ] + # Only add test split if test data files are provided + if 'test' in self.config.data_files: + splits.append( + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepaths": self.config.data_files['test'], + }, + ) + ) + + return splits + + def _generate_examples(self, filepaths): # noqa cnt = 0 for filepath in filepaths: diff --git a/medcat2/utils/usage_monitoring.py b/medcat2/utils/usage_monitoring.py new file mode 100644 index 00000000..660e4602 --- /dev/null +++ b/medcat2/utils/usage_monitoring.py @@ -0,0 +1,113 @@ +import os +from datetime import datetime +from typing import Union, Callable +import platform +import logging + +from medcat2.config.config import UsageMonitor as UsageMonitorConfig + + +LOGS_ENV = "MEDCAT_USAGE_LOGS" +LOGS_LOC_ENV = "MEDCAT_USAGE_LOGS_LOCATION" + +DEFAULT_LOGS_WINDOWS = os.path.join(os.environ.get('APPDATA', "NOT WINDOWS"), + 'medcat', 'logs') +DEFAULT_LOGS_LINUX = os.path.expanduser("~/.local/share/medcat/logs/") +DEFAULT_LOGS_MACOS = os.path.expanduser( + "~/Library/Application Support/medcat/logs/") + + +logger = logging.getLogger(__name__) + + +class UsageMonitor: + + def __init__(self, model_hash: Union[Callable[[], 
str], str], + config: UsageMonitorConfig) -> None: + self.config = config + self.log_buffer: list[str] = [] + # NOTE: if the model hash changes (i.e model is trained) + # then this does not immediately take effect + self._model_hash = model_hash + + @property + def model_hash(self) -> str: + return (self._model_hash() if callable(self._model_hash) + else self._model_hash) + + @property + def log_file(self): + return os.path.join( + self.config.log_folder, + f"{self.config.file_prefix}{self.model_hash}.csv") + + def _get_auto_logs_location(self): + system = platform.system().lower() + if system == "windows": + return DEFAULT_LOGS_WINDOWS + elif system == "linux": + return DEFAULT_LOGS_LINUX + elif system == "darwin": # macOS + return DEFAULT_LOGS_MACOS + else: + raise OSError(f"Unsupported operating system: {system}") + + def _setup_auto_logs(self): + # NOTE: os.environ is a snapshot of the environmental variables + # from the time that the process was started. + # However, someone could still change os.environ manually + log_dir = os.environ.get(LOGS_LOC_ENV, self._get_auto_logs_location()) + if not os.path.exists(log_dir): + os.makedirs(log_dir) + if log_dir != self.config.log_folder: + self.config.log_folder = log_dir + + @property + def should_monitor(self) -> bool: + return self.config.enabled in (True, 'auto') + + def _should_log(self) -> bool: + if not self.config.enabled: + logger.warning("Trying to log to file when the usage monitor is " + "disabled. This should generally not happen unless " + "the config kept track of by the CAT object has is " + "different from the one kept track of by the usage " + "monitor. So if this keeps coming up, make sure " + "everything is up to date") + return False + elif self.config.enabled is True: + return True + elif self.config.enabled != 'auto': + raise ValueError("Unknown UsageMonitor enabled status: " + f"{self.config.enabled}.
Expected one of: " + f"True, False, 'auto'") + # enabled == 'auto' + env_enabled = os.environ.get(LOGS_ENV, "false").lower() + if env_enabled in ("false", "0"): + return False + self._setup_auto_logs() + return True + + def log_inference(self, + input_text_len: int, + nr_of_ents_found: int) -> None: + if not self._should_log(): + return + timestamp = datetime.now().isoformat() + log_entry = f"{timestamp},{input_text_len},{nr_of_ents_found}" + self.log_buffer.append(log_entry) + if len(self.log_buffer) >= self.config.batch_size: + self._flush_logs() + + def _flush_logs(self) -> None: + if not self.log_buffer: + return + with open(self.log_file, 'a') as f: + for log_entry in self.log_buffer: + f.write(log_entry + '\n') + self.log_buffer = [] + + def __del__(self): + # fail safe for when buffer is non-empty upon application stop + # (i.e exit call) + self._flush_logs() diff --git a/tests/backwards_compatibility/check_backwards_compatibility.sh b/tests/backwards_compatibility/check_backwards_compatibility.sh new file mode 100644 index 00000000..7a2e6511 --- /dev/null +++ b/tests/backwards_compatibility/check_backwards_compatibility.sh @@ -0,0 +1,43 @@ +# CONSTANTs/ shouldn't change +REGRESSION_MODULE="medcat2.utils.regression.regression_checker" +REGRESSION_OPTIONS="--strictness STRICTEST --require-fully-correct" + +# CHANGABLES +# target models +DL_LINK="https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/all_fake_medcat_v2_models.zip" +ZIP_FILE_NAME="all_fake_medcat_v2_models.zip" +# target regression set +REGRESSION_TEST_SET="tests/backwards_compatibility/testing/example_regression_suite.yml" +# folder to house models under test +MODEL_FOLDER="fake_models" + +# START WORK + +echo "Downloading models" +wget $DL_LINK +# Create folder if it doesn't exist +mkdir -p "$MODEL_FOLDER" +echo "Uncompressing files" +unzip $ZIP_FILE_NAME -d $MODEL_FOLDER +echo "Cleaning up the overall zip" +rm $ZIP_FILE_NAME +for model_path in `ls
$MODEL_FOLDER/*.zip`; do + if [ -f "$model_path" ]; then + echo "Processing $model_path" + python -m $REGRESSION_MODULE \ + "$model_path" \ + $REGRESSION_TEST_SET \ + $REGRESSION_OPTIONS + # this is a sanity check - needs to run after so that the folder has been created + grep "MedCAT Version" "${model_path%.*}/model_card.json" + # clean up here so we don't leave both the .zip'ed model + # and the folder so we don't fill the disk + echo "Cleaning up at: ${model_path%.*}" + rm -rf ${model_path%.*}* + else + echo "No files found matching the pattern: $file" + fi +done + +# Remove the fake model folder +rm -r "$MODEL_FOLDER" diff --git a/tests/backwards_compatibility/creation/cat_creation.py b/tests/backwards_compatibility/creation/cat_creation.py new file mode 100644 index 00000000..94ec3e65 --- /dev/null +++ b/tests/backwards_compatibility/creation/cat_creation.py @@ -0,0 +1,71 @@ +import os +import sys +import pandas as pd +import json + +from medcat2 import __version__ as MCT_VER +from medcat2.vocab import Vocab +from medcat2.config import Config +from medcat2.model_creation.cdb_maker import CDBMaker +from medcat2.cdb import CDB +from medcat2.cat import CAT + + +vi = sys.version_info +PY_VER = f"{vi.major}.{vi.minor}" + + +# paths +VOCAB_DATA_PATH = os.path.join( + os.path.dirname(__file__), 'vocab_data.txt' + # os.path.dirname(__file__), 'vocab_data_auto.txt' +) +CDB_PREPROCESSED_PATH = os.path.join( + os.path.dirname(__file__), 'preprocessed4cdb.txt' +) +SELF_SUPERVISED_DATA_PATH = os.path.join( + os.path.dirname(__file__), 'selfsupervised_data.txt' +) +SUPERVISED_DATA_PATH = os.path.join( + os.path.dirname(__file__), 'supervised_mct_export.json' +) +SAVE_PATH = os.path.dirname(__file__) +SAVE_NAME = f"simple_model4test-{PY_VER}-{MCT_VER}" + +# vocab + +vocab = Vocab() +vocab.add_words(VOCAB_DATA_PATH) + +# CDB +config = Config() +config.general.nlp.provider = "spacy" + +maker = CDBMaker(config) + +cdb: CDB = maker.prepare_csvs([CDB_PREPROCESSED_PATH]) + +# CAT 
+cat = CAT(cdb, vocab) + +# training +# self-supervised +unsup_data = pd.read_csv(SELF_SUPERVISED_DATA_PATH) +cat.trainer.train_unsupervised(unsup_data.text.values) + +print("[sst] cui2count_train", cat.cdb.get_cui2count_train()) + +# supervised + +with open(SUPERVISED_DATA_PATH) as f: + sup_data = json.load(f) + +cat.trainer.train_supervised_raw(sup_data) + +print("[sup] cui2count_train", cat.cdb.get_cui2count_train()) + +# save +full_path = cat.save_model_pack(SAVE_PATH, pack_name=SAVE_NAME, + only_archive=True) +print("Saved to") +print(full_path) diff --git a/tests/backwards_compatibility/creation/preprocessed4cdb.txt b/tests/backwards_compatibility/creation/preprocessed4cdb.txt new file mode 100644 index 00000000..113805b2 --- /dev/null +++ b/tests/backwards_compatibility/creation/preprocessed4cdb.txt @@ -0,0 +1,11 @@ +cui,name +C01,kidney failure +C01,loss of kidney function +C02,diabetes +C02,diabetes mellitus +C03,fever +C03,high temperature +C04,seizure +C04,fittest +C05,healthy +C05,fittest \ No newline at end of file diff --git a/tests/backwards_compatibility/creation/selfsupervised_data.txt b/tests/backwards_compatibility/creation/selfsupervised_data.txt new file mode 100644 index 00000000..382a3804 --- /dev/null +++ b/tests/backwards_compatibility/creation/selfsupervised_data.txt @@ -0,0 +1,10 @@ +id,text +FD0,"Patient presented with severe diabetes and had also been diagnosed with acute kidney failure. +Prior to visit the patient had also complained about a light fever" +FD1,"50yo RHM with light fever admitted to hospital. +Tests conducted and acute kidney failure discovered. +Tests also show signes of severe diabetes, though there are no other symptoms." +FD2,"102yo LHF presented with acute seizure after long day of work. +No further complications" +FD3,"Patient is a healthy male in their 20s. +No health complications were noted." 
\ No newline at end of file diff --git a/tests/backwards_compatibility/creation/supervised_mct_export.json b/tests/backwards_compatibility/creation/supervised_mct_export.json new file mode 100644 index 00000000..d6cdcd82 --- /dev/null +++ b/tests/backwards_compatibility/creation/supervised_mct_export.json @@ -0,0 +1,136 @@ +{ + "projects": [ + { + "cuis": "", + "documents": [ + { + "annotations": [ + { + "cui": "C01", + "start": 38, + "end": 52, + "value": "kidney failure" + }, + { + "cui": "C01", + "start": 122, + "end": 145, + "value": "loss of kidney function" + }, + { + "cui": "C02", + "start": 192, + "end": 200, + "value": "diabetes" + }, + { + "cui": "C02", + "start": 279, + "end": 296, + "value": "diabetes mellitus" + }, + { + "cui": "C03", + "start": 390, + "end": 395, + "value": "fever" + }, + { + "cui": "C03", + "start": 454, + "end": 470, + "value": "high temperature" + } + ], + "id": "ID-0", + "last_modified": "2024-08-21", + "name": "Doc#0", + "text": "Patient had been diagnosed with acute kidney failure the week before. The current complaint was related to the same acute loss of kidney function as the diagnosis. The patient also has severe diabetes even though they have never consumed any sugar. The prior diagnosis of severe diabetes mellitus was confirmed by doctor. Due to the previous issues, patient had been suffering from a light fever all day. They took some paracetamol but still had a light high temperature afterwards." + }, + { + "annotations": [ + { + "cui": "C04", + "start": 20, + "end": 27, + "value": "seizure" + }, + { + "cui": "C04", + "start": 81, + "end": 87, + "value": "fittest" + } + ], + "id": "ID-1", + "last_modified": "2024-08-21", + "name": "Doc#1", + "text": "Patient had a acute seizure during visit with GP. This is the first time a minor fittest was observed for this patient. 
" + }, + { + "annotations": [ + { + "cui": "C05", + "start": 26, + "end": 33, + "value": "healthy" + }, + { + "cui": "C05", + "start": 84, + "end": 91, + "value": "fittest" + } + ], + "id": "ID-2", + "last_modified": "2024-08-21", + "name": "Doc#2", + "text": "The patient is considered healthy as per tests run. The patient would be considered fittest according to any standard known." + }, + { + "annotations": [ + { + "cui": "C04", + "start": 24, + "end": 31, + "value": "seizure" + }, + { + "cui": "C04", + "start": 65, + "end": 72, + "value": "fittest" + } + ], + "id": "ID-3", + "last_modified": "2024-08-21", + "name": "Doc#3", + "text": "The patient has a minor seizure every day. The presence of daily fittest is extremely problematic." + }, + { + "annotations": [ + { + "cui": "C05", + "start": 16, + "end": 23, + "value": "healthy" + }, + { + "cui": "C05", + "start": 111, + "end": 118, + "value": "fittest" + } + ], + "id": "ID-3", + "last_modified": "2024-08-21", + "name": "Doc#4", + "text": "The RHS male is healthy as considered by all available tests. There are no indications that the patient is not fittest." 
+ } + ], + "id": "Project#0", + "name": "Project-0", + "tuis": "" + } + ] +} diff --git a/tests/backwards_compatibility/creation/vocab_data.txt b/tests/backwards_compatibility/creation/vocab_data.txt new file mode 100644 index 00000000..0a3ca805 --- /dev/null +++ b/tests/backwards_compatibility/creation/vocab_data.txt @@ -0,0 +1,18 @@ +severe 10000 1.0 0 0 1 0 0 0 +minor 10000 -1.0 0 0 1 0 0 0 +acute 6500 0 1.0 0 1 0 0 0 +chronic 6500 0 -1.0 0 0 1 0 0 +heavy 4000 0 0 1.0 1 0 0 0 +light 4000 0 0 -1.0 1 0 0 0 +considered 1000 0.1 -0.2 0 0 0.9 0 0 +with 20000 0 0 0 0 0 0.8 0 +of 22000 0 0 0 0 0 1 0 +to 19000 0 0 0 0 0 0.9 0 +were 12000 0 0 0 0 0.95 0 0 +was 11000 0 0 0 0 0.94 0 0 +is 12000 0 0 0 0 1 0 0 +are 12000 0 0 0 0 1.1 0 0 +has 11000 0 0 0 0 0.98 0 0 +presence 1000 0 0 0 0 0 0 0.4 +indication 500 0 0 0 0 0 0 0.3 +time 450 0 0 0 0 0 0 0.1 diff --git a/tests/backwards_compatibility/run_current.sh b/tests/backwards_compatibility/run_current.sh new file mode 100644 index 00000000..79997cc9 --- /dev/null +++ b/tests/backwards_compatibility/run_current.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# exit immediately upon non-zero exit status +set -e + +# create and train model and capture output +# this will create a model pack based on some data included within the tests/resources/regression/creation/ folder, +# it will then train on some self-supervised as well as supervised training data and save the model. +output=$(python tests/backwards_compatibility/creation/cat_creation.py) +# make sure the user sees the output +echo "$output" + +# extract the last line of the output which contains the full model path +model_path=$(echo "$output" | tail -n 1) +# NOTE: this file should be tagged with the python version we're using + +# test the vocab to make sure it's all good +python tests/backwards_compatibility/testing/test_vocab.py +# TODO: test other things as well? 
+ +# run the regression_checker with the captured file path +# if any of the regression cases fail, this will return a non-zero exit status +python -m medcat2.utils.regression.regression_checker \ + "$model_path" \ + tests/backwards_compatibility/testing/example_regression_suite.yml \ + --strictness STRICTEST \ + --require-fully-correct + +# Step 4: Clean up the generated file(s) +rm -rf "$model_path"* \ No newline at end of file diff --git a/tests/backwards_compatibility/testing/example_regression_suite.yml b/tests/backwards_compatibility/testing/example_regression_suite.yml new file mode 100644 index 00000000..ad1da300 --- /dev/null +++ b/tests/backwards_compatibility/testing/example_regression_suite.yml @@ -0,0 +1,63 @@ +# this is only mean for the test "model pack" in the examples folder +unambiguous-works: # this uses the exact same context that was used during training + targeting: + placeholders: + - placeholder: '[CONCEPT1]' + cuis: [ + 'CO1', # kidney failure + ] + - placeholder: '[CONCEPT2]' + cuis: [ + 'C02', # diabetes + ] + - placeholder: '[CONCEPT3]' + cuis: [ + 'C03', # fever + ] + phrases: # The list of phrases + - Man was diagnosed with severe [CONCEPT1] and acute [CONCEPT2] and presented with a light [CONCEPT3] +unambiguous-works-rnd: # these use the random word that one of the concepts WAS trained for + targeting: + placeholders: + - placeholder: '[CONCEPT]' + cuis: [ + 'CO1', # kidney failure + 'C02', # diabetes + 'C03', # fever + ] + phrases: # The list of phrases + - Patient was diagnosed with severe [CONCEPT]. + - Patient was diagnosed with acute [CONCEPT]. + - Patient presented with light [CONCEPT]. +unambiguous-works-rnd-reverse: # these use the OPPOSITE random word that one of the concepts WAS trained for + targeting: + placeholders: + - placeholder: '[CONCEPT]' + cuis: [ + 'CO1', # kidney failure + 'C02', # diabetes + 'C03', # fever + ] + phrases: # The list of phrases + - Patient was diagnosed with minor [CONCEPT]. 
+ - Patient was diagnosed with chronic [CONCEPT]. + - Patient presented with heavy [CONCEPT]. +ambiguous-works-trained-1: # Uses AMBIGUOUS concepts in the trained context + targeting: + placeholders: + - placeholder: '[CONCEPT]' + cuis: [ + 'C04', # seizure/fit + ] + phrases: # The list of phrases + - Patient presented with acute [CONCEPT]. + - Patient had a minor [CONCEPT] during visit. +ambiguous-works-trained-2: # Uses AMBIGUOUS concepts in the trained context + targeting: + placeholders: + - placeholder: '[CONCEPT]' + cuis: [ + 'C05', # healthy/fit + ] + phrases: # The list of phrases + - Patient is a 50yo RHM considered [CONCEPT]. diff --git a/tests/backwards_compatibility/testing/test_vocab.py b/tests/backwards_compatibility/testing/test_vocab.py new file mode 100644 index 00000000..0d12fee8 --- /dev/null +++ b/tests/backwards_compatibility/testing/test_vocab.py @@ -0,0 +1,32 @@ +import os + +from medcat2.vocab import Vocab + +import unittest + + +class RegressionModelVocabTests(unittest.TestCase): + VOCAB_DATA_PATH = os.path.join( + os.path.dirname(__file__), '..', 'creation', 'vocab_data.txt') + + @classmethod + def setUpClass(cls): + cls.vocab = Vocab() + cls.vocab.add_words(cls.VOCAB_DATA_PATH) + + def test_has_same_vector_lengths(self): + all_lengths = set() + for w in self.vocab.vec_index2word.values(): + all_lengths.add(len(self.vocab.vec(w))) + self.assertEqual(len(all_lengths), 1, + f"Expected equal lengths. 
Got: {all_lengths}") + + def test_all_words_have_vectors(self): + for w in self.vocab.vocab: + with self.subTest(f"Word: {repr(w)}"): + # NOTE: if not there, will raise an exception + self.assertIsNotNone(self.vocab.vec(w)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/components/ner/trf/test_transformers_ner.py b/tests/components/ner/trf/test_transformers_ner.py index 240d4637..f5daf849 100644 --- a/tests/components/ner/trf/test_transformers_ner.py +++ b/tests/components/ner/trf/test_transformers_ner.py @@ -1,14 +1,28 @@ +import tempfile +import json +import os +import shutil + from medcat2.components.ner.trf import transformers_ner from medcat2.storage.serialisables import ManualSerialisable +from medcat2.cdb import CDB +from medcat2.components.ner.trf.transformers_ner import ( + TransformersNER, TransformersNERComponent, _save_component) +from medcat2.config.config_transformers_ner import ConfigTransformersNER +from medcat2.model_creation.cdb_maker import CDBMaker +from transformers import TrainerCallback + from unittest import TestCase +import unittest.mock from ...addons.meta_cat.test_meta_cat import FakeTokenizer from ....pipeline.test_pipeline import FakeCDB, Config +from .... 
import RESOURCES_PATH -class TransformersNERTestS(TestCase): +class TransformersNERTests(TestCase): @classmethod def setUpClass(cls): @@ -18,3 +32,287 @@ def setUpClass(cls): def test_is_manually_serialisable(self): self.assertIsInstance(self.tner, ManualSerialisable) + + +class TestTransformersNER(TestCase): + + @classmethod + def setUpClass(cls): + cls.base_tokenizer = FakeTokenizer() + + def setUp(self): + # Create a temporary directory for the test + self.tmp_dir = tempfile.TemporaryDirectory() + # Create results dir for training outputs + self.results_dir = './results' + os.makedirs(self.results_dir, exist_ok=True) + + # Create a minimal CDB + self.cdb = CDB(Config()) + + # Create initial training data with 2 labels and multiple examples + self.initial_data = { + "projects": [{ + "documents": [ + { + "text": "Patient has diabetes and hypertension.", + "annotations": [ + { + "cui": "C0011849", # Diabetes + "start": 14, + "end": 22, + "value": "diabetes" + }, + { + "cui": "C0020538", # Hypertension + "start": 27, + "end": 39, + "value": "hypertension" + } + ] + }, + { + "text": "History of diabetes with hypertension.", + "annotations": [ + { + "cui": "C0011849", # Diabetes + "start": 12, + "end": 20, + "value": "diabetes" + }, + { + "cui": "C0020538", # Hypertension + "start": 26, + "end": 38, + "value": "hypertension" + } + ] + }, + { + "text": "Diagnosed with hypertension and diabetes.", + "annotations": [ + { + "cui": "C0020538", # Hypertension + "start": 15, + "end": 27, + "value": "hypertension" + }, + { + "cui": "C0011849", # Diabetes + "start": 32, + "end": 40, + "value": "diabetes" + } + ] + } + ] + }] + } + + # Create new training data with an extra label + self.new_data = { + "projects": [{ + "documents": [ + { + "text": + "Patient has diabetes, hypertension, and asthma.", + "annotations": [ + { + "cui": "C0011849", # Diabetes + "start": 14, + "end": 22, + "value": "diabetes" + }, + { + "cui": "C0020538", # Hypertension + "start": 24, + "end": 36, + 
"value": "hypertension" + }, + { + "cui": "C0004096", # Asthma + "start": 42, + "end": 48, + "value": "asthma" + } + ] + }, + { + "text": + "History of asthma with diabetes and hypertension.", + "annotations": [ + { + "cui": "C0004096", # Asthma + "start": 12, + "end": 18, + "value": "asthma" + }, + { + "cui": "C0011849", # Diabetes + "start": 24, + "end": 32, + "value": "diabetes" + }, + { + "cui": "C0020538", # Hypertension + "start": 37, + "end": 49, + "value": "hypertension" + } + ] + }, + { + "text": + "Diagnosed with asthma, diabetes, and hypertension.", + "annotations": [ + { + "cui": "C0004096", # Asthma + "start": 15, + "end": 21, + "value": "asthma" + }, + { + "cui": "C0011849", # Diabetes + "start": 23, + "end": 31, + "value": "diabetes" + }, + { + "cui": "C0020538", # Hypertension + "start": 37, + "end": 49, + "value": "hypertension" + } + ] + } + ] + }] + } + + # Save initial training data + self.initial_data_path = os.path.join( + self.tmp_dir.name, 'initial_data.json') + with open(self.initial_data_path, 'w') as f: + json.dump(self.initial_data, f) + + # Save new training data + self.new_data_path = os.path.join(self.tmp_dir.name, 'new_data.json') + with open(self.new_data_path, 'w') as f: + json.dump(self.new_data, f) + + def tearDown(self): + # Clean up the temporary directory + self.tmp_dir.cleanup() + # Clean up results directory if it exists + if os.path.exists(self.results_dir): + shutil.rmtree(self.results_dir) + # Clean up logs directory if it exists + if os.path.exists('./logs'): + shutil.rmtree('./logs') + + def test_ignore_extra_labels(self): + # Create and train initial model with tiny BERT + config = ConfigTransformersNER() + config.general.model_name = 'prajjwal1/bert-tiny' + # Set to single epoch and small test size for faster testing + config.general.test_size = 0.1 + + # Create training arguments with reduced epochs + from transformers import TrainingArguments + training_args = TrainingArguments( + output_dir=self.results_dir, # Use 
the class results_dir + num_train_epochs=1 + ) + + ner = TransformersNERComponent( + self.cdb, self.base_tokenizer, config=config, + training_arguments=training_args) + ner.train(self.initial_data_path) + + # Save the model + model_path = os.path.join(self.tmp_dir.name, 'model') + _save_component(ner, model_path) + + # Load the saved model + loaded_ner = TransformersNER.deserialise_from( + model_path, + cdb=self.cdb, + base_tokenizer=self.base_tokenizer)._component + + # Get initial number of labels + initial_num_labels = len(loaded_ner.tokenizer.label_map) + + # Train with ignore_extra_labels=True + loaded_ner.train(self.new_data_path, ignore_extra_labels=True) + + # Verify number of labels hasn't changed + self.assertEqual( + len(loaded_ner.tokenizer.label_map), + initial_num_labels, + "Number of labels changed despite ignore_extra_labels=True" + ) + + # Verify only original labels are present (including special tokens) + expected_labels = {"C0011849", "C0020538", "O", "X"} + self.assertEqual( + set(loaded_ner.tokenizer.label_map.keys()), + expected_labels, + "Label map contains unexpected labels" + ) + + # Train with ignore_extra_labels=False + loaded_ner.train(self.new_data_path, ignore_extra_labels=False) + + # Verify new label was added + self.assertEqual( + len(loaded_ner.tokenizer.label_map), + initial_num_labels + 1, + "New label was not added when ignore_extra_labels=False" + ) + + # Verify all labels are present (including special tokens) + expected_labels = {"C0011849", "C0020538", "C0004096", "O", "X"} + self.assertEqual( + set(loaded_ner.tokenizer.label_map.keys()), + expected_labels, + "Label map missing expected labels" + ) + + +class AdditionalTransfromersNERTests(TestCase): + TOKENIZER = FakeTokenizer() + CNF = ConfigTransformersNER() + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general.nlp.modelname = "en_core_web_md" + cdb_maker = CDBMaker(config) + cdb_csv = os.path.join(RESOURCES_PATH, "cdb_example.csv") + cdb = 
cdb_maker.prepare_csvs([cdb_csv], full_build=True) + cls.undertest = TransformersNER(cdb, base_tokenizer=cls.TOKENIZER, + component=TransformersNERComponent( + cdb, cls.TOKENIZER, cls.CNF), + config=cls.CNF) + cls.undertest._component.create_eval_pipeline() + + def test_train_with_test_file(self): + tracker = unittest.mock.Mock() + + class _DummyCallback(TrainerCallback): + def __init__(self, trainer) -> None: + self._trainer = trainer + + def on_epoch_end(self, *args, **kwargs) -> None: + tracker.call() + + train_data = os.path.join(RESOURCES_PATH, "deid_train_data.json") + test_data = os.path.join(RESOURCES_PATH, "deid_test_data.json") + self.undertest._component.training_arguments.num_train_epochs = 1 + df, examples, dataset = self.undertest._component.train( + train_json_path=train_data, test_json_path=test_data, + trainer_callbacks=[_DummyCallback]) + assert "fp" in examples + assert "fn" in examples + assert dataset["train"].num_rows == 60 + self.assertEqual(tracker.call.call_count, 1) diff --git a/tests/resources/cdb_example.csv b/tests/resources/cdb_example.csv new file mode 100644 index 00000000..259e2f0d --- /dev/null +++ b/tests/resources/cdb_example.csv @@ -0,0 +1,6 @@ +cui,name,ontologies,name_status,type_ids,description +C0000039,"Virus",MSH,P,T109|T123,Synthetic phospholipid used in liposomes and lipid bilayers to study biological membranes. It is also a major constituent of PULMONARY SURFACTANTS. 
+C0000039,"Virus M",,,T234, +C0000039,"Virus M |Virus K|Virus Z",,,, +C0000139,"Virus M|Virus K|Virus Z",,P,, +C0000139,"Virus",,A,, diff --git a/tests/resources/deid_test_data.json b/tests/resources/deid_test_data.json new file mode 100644 index 00000000..1ed443a6 --- /dev/null +++ b/tests/resources/deid_test_data.json @@ -0,0 +1 @@ +{"projects": [{"name": "/Users/martratas/Documents/CogStack/.MedCAT.nosync/MedCAT/temp/deid/testing-PHI-Gold-fixed.tar.gz", "documents": [{"text": "\n\n\nRecord date: 2090-07-16\n\n\n\n\nNAME: Curtis, Om \nMRN: 7682941\n \nHe is feeling great. He is all done with his radiation to the left axilla for metastatic\nsquamous cell cancer. He is following closely with the radiation oncologist and the\nmedical oncologist. He is seeing them both later this month. He has had no\nproblems with chest pains or shortness of breath. All in all, things are going well.\n\nPHYSICAL EXAM: On exam, no acute distress. Lungs are clear. Heart is regular\nrate and rhythm. No murmurs, gallops or rubs. He does have some skin\ndiscoloration around the left axilla but I feel no mass. He has a well-healed incision. \nThere is no hair noted in or around the axilla. Extremities with no edema.\n\nASSESSMENT AND PLAN: \n\n(1) CAD/hypertension/diabetes mellitus. This is stable. Check glycosylated\n hemoglobin. \n\n(2) Metastatic squamous cell cancer. He is being followed closely by Oncology for\n this. Follow-up with me in the spring.\n\nWilliam V. Geiger, M.D.\n\nWVG/xin/quilici\n\n\n\n\n", "name": "119-03.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2090-07-16"}, {"start": "40", "end": "50", "cui": "PATIENT", "value": "Curtis, Om"}, {"start": "61", "end": "68", "cui": "MEDICALRECORD", "value": "7682941"}, {"start": "972", "end": "978", "cui": "DATE", "value": "spring"}, {"start": "981", "end": "998", "cui": "DOCTOR", "value": "William V. 
Geiger"}, {"start": "1006", "end": "1009", "cui": "DOCTOR", "value": "WVG"}, {"start": "1010", "end": "1013", "cui": "DOCTOR", "value": "xin"}, {"start": "1014", "end": "1021", "cui": "DOCTOR", "value": "quilici"}]}, {"text": "\n\n\nRecord date: 2078-03-17\n\n\n\nPatient Name: JORGENSON,VIVIANLEE [ 47190847(JMH) ] Date of Visit: 03/17/2078\n\n\nCC: Syncope, Afib\n\n\nHPI: \n71 year old lady with a history of A fib was seen in clinic today for complaints of DOE and increased wt. She was found to be in mild to moderate CHF and an increase in her torsemide dose was recommended. On her way to a blood draw, while in the elevator, she had syncope and hit her head. The fall was witnessed by her daughter. There was no prodrome, no nausea, no incontinence with the fall. The pt does not have any seizure or hypoglycemia history. She notes an 8 lb wt gain over 2-3 weeks and denies dietary indiscretions. She takes her medications faithfully which include an escalating dose of torsemide in over the last several months. She's been hospitalized twice for CHF - once in 2075 and again in 11/77.\n\n\nThe pt was brought to the ED. 
In the ED the patient had a head CT was done which did not reveal any acute pathology.\n\n\nPMH:Cardiomyopathy : Nonischemic, 6/19/74 cath no significant coronary disease, 11/26/77 EF 20% with and global hypokinesis \nCoronary artery disease : 6/19/74 cath: RCA 30%\nThyroid cancer : Papillary nodule Ca, 2071, a/p thyroidectomy\nDiabetes: Had been on glyburide, currently diet controlled, 11/25 A1c 6.3\nHypothyroidism: 11/25: 1.4 normal\nElevated cholesterol: 11/26: LDL of 36 and an HDL of 26\nHypertension \nAtrial fibrillation on Coumadin\nNon sustained ventricular tachycardia \n\n\nMedications\nAmbien (ZOLPIDEM TARTRATE) 5 MG (5MG TABLET take 1) PO QHS PRN \nAsa (ACETYLSALICYLIC ACID) 81 MG (81MG TABLET take 1) PO QD \nCelexa (CITALOPRAM) 20MG TABLET take 1 Tablet(s) PO QD \nCoreg (CARVEDILOL) 12.5 MG (12.5MG TABLET take 1) PO BID \nCoumadin (WARFARIN SODIUM) 1 MG (2.5MG TABLET take 1) PO QPM \nDigoxin 0.0612 MG (125MCG TABLET take 1) PO QD \nLipitor (ATORVASTATIN) 20 MG (20MG TABLET take 1) PO QD \nLisinopril 40 MG PO QD \nPotassium CHLORIDE SLOW REL. (KCL SLOW RELEASE) 20 MEQ (20MEQ TAB PRT SR take 1) PO BID \nSynthroid (LEVOTHYROXINE SODIUM) 150MCG TABLET PO variable \nTorsemide 200 MG (20MG TABLET take 1) PO QD \n\n\n\n\n\nAllergies\nNKA \n \nFH: Mom had MI, age 50\n\n\nSH: \n Lives with husband (Blacksmith, recently had CABG) and has 4 grown children.\n\nHealth-Related Behaviors\nAlcohol-social only\nTobacco-prior use, 60 ppy, quit 2067Drug use-no illicit drugs \n\n\nPE: \nT 98, BP 100/65, HR 131, RR 18, 96%RA\nGEN: NAD\nHEENT: PERRL, EOMI, mm moist\nCV: irreg irreg, no murmurs appreciated. JVP 15 cm\nLUNG: Rales in bases bilaterally\nABD: soft, non-tender, non-distended\nEXT: no c/c/e\nNeuro: A&Ox3, moves all extremiteis.\n\n\nCXR: enlarged heart. Minimal pulmonary infiltrate\nEKG: A fib, no ischemic changes, poor RWP. 
Unchanged from previous.\n\n\nLabs\nResultsDate/Time NA K CL CO2 03/17/2078 [1] 139 4.6 103 28 03/17/2078 139 4.2 102 26 Date/Time BUN CRE EGFR GLU 03/17/2078 [2] 50 (*) 1.73 (*) 29 [3] 110 03/17/2078 50 (*) 1.66 (*#) 30 [4] 106 Date/Time ANION 03/17/2078 [5] 8 03/17/2078 11 Date/Time CA MG TBILI TP 03/17/2078 [6] 9.7 3.0 (*) 6.9 03/17/2078 9.7 2.2 2.9 (*) 7.0 Date/Time ALB GLOB LIPS 03/17/2078 [7] 4.0 2.9 62 (*)[8] 03/17/2078 4.0 3.0 Date/Time ALT/SGPT AST/SGOT ALKP TBILI 03/17/2078 [9] 8 24 106 3.0 (*) 03/17/2078 8 (#) 22 106 2.9 (*) Date/Time CK CK-MB TROP-I 03/17/2078 [10] 56 2.1 SEE DETAIL[11] Date/Time TSH 03/17/2078 4.091 Date/Time WBC RBC HGB HCT 03/17/2078 [12] 5.68 (#) 4.67 15.2 45.8 Date/Time MCV MCH MCHC PLT 03/17/2078 [13] 98.2 (*#) 32.6 (*#) 33.1 136 (*) Date/Time RDW 03/17/2078 [14] 14.6 (*) Date/Time %POLY-A %LYMPH-A %MONO-A %EOS-A 03/17/2078 [15] 76.4 (*) 15.4 (*) 6.6 1.3 Date/Time %BASO-A 03/17/2078 [16] 0.3 Date/Time ANEUT-A ALYMP-A AMONO-A AEOS-A 03/17/2078 [17] 4.34 .88 0.37 0.07 Date/Time ABASO-A 03/17/2078 [18] 0.02 Date/Time HYPO MACRO 03/17/2078 [19] + + Date/Time PT PT-INR PTT 03/17/2078 [20] 24.1 (*) 2.1 (*) 39.2 (*) A/P: 71 year old lady with syncope of unclear eitiology with CHF/Afib. Ddx includes cardiac arrhythmia, hypovolemia, vaso-vagal. Neurologic causes less likely from history.\n\n#CV - i. Low suspicion ACS- Rule out with serial CE\n- Continue asa, statin#CV - p. Decompensated CHF- Diuresis with IV lasix. Consider lasix drip if inadequate response.\n- Daily wt. Strict I/O.- Cont. digoxin- Will start amiodarone 400 bid- Discuss possible role of AICD with patient and family.#CV - r. - Telemetry#AFR - acute on chronic - likely from poor forward flow- reduce lisinopril- urine electrolytes, eos.\n#Psych\n- continue celexa\n#Endocrine\n- continue home thyroid regimen\n#FEN\n- low sodium, 2L fluid restricted diet#CODE: Full\n\n\n\n\n\n\n\n\n____________________________________\nXavier B. 
Nix, M.D., Ph.D.\n\n\n\n", "name": "132-04.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2078-03-17"}, {"start": "44", "end": "63", "cui": "PATIENT", "value": "JORGENSON,VIVIANLEE"}, {"start": "66", "end": "74", "cui": "MEDICALRECORD", "value": "47190847"}, {"start": "75", "end": "78", "cui": "HOSPITAL", "value": "JMH"}, {"start": "97", "end": "107", "cui": "DATE", "value": "03/17/2078"}, {"start": "136", "end": "138", "cui": "AGE", "value": "71"}, {"start": "828", "end": "832", "cui": "DATE", "value": "2075"}, {"start": "846", "end": "851", "cui": "DATE", "value": "11/77"}, {"start": "1010", "end": "1017", "cui": "DATE", "value": "6/19/74"}, {"start": "1056", "end": "1064", "cui": "DATE", "value": "11/26/77"}, {"start": "1127", "end": "1134", "cui": "DATE", "value": "6/19/74"}, {"start": "1187", "end": "1191", "cui": "DATE", "value": "2071"}, {"start": "1272", "end": "1277", "cui": "DATE", "value": "11/25"}, {"start": "1302", "end": "1307", "cui": "DATE", "value": "11/25"}, {"start": "1344", "end": "1349", "cui": "DATE", "value": "11/26"}, {"start": "2145", "end": "2147", "cui": "AGE", "value": "50"}, {"start": "2177", "end": "2187", "cui": "PROFESSION", "value": "Blacksmith"}, {"start": "2312", "end": "2316", "cui": "DATE", "value": "2067"}, {"start": "2814", "end": "2824", "cui": "DATE", "value": "03/17/2078"}, {"start": "2892", "end": "2902", "cui": "DATE", "value": "03/17/2078"}, {"start": "3048", "end": "3058", "cui": "DATE", "value": "03/17/2078"}, {"start": "3126", "end": "3136", "cui": "DATE", "value": "03/17/2078"}, {"start": "3237", "end": "3247", "cui": "DATE", "value": "03/17/2078"}, {"start": "3270", "end": "3280", "cui": "DATE", "value": "03/17/2078"}, {"start": "3381", "end": "3391", "cui": "DATE", "value": "03/17/2078"}, {"start": "3459", "end": "3469", "cui": "DATE", "value": "03/17/2078"}, {"start": "3600", "end": "3610", "cui": "DATE", "value": "03/17/2078"}, {"start": "3663", "end": "3673", "cui": "DATE", "value": 
"03/17/2078"}, {"start": "3804", "end": "3814", "cui": "DATE", "value": "03/17/2078"}, {"start": "3882", "end": "3892", "cui": "DATE", "value": "03/17/2078"}, {"start": "4023", "end": "4033", "cui": "DATE", "value": "03/17/2078"}, {"start": "4119", "end": "4129", "cui": "DATE", "value": "03/17/2078"}, {"start": "4230", "end": "4240", "cui": "DATE", "value": "03/17/2078"}, {"start": "4386", "end": "4396", "cui": "DATE", "value": "03/17/2078"}, {"start": "4497", "end": "4507", "cui": "DATE", "value": "03/17/2078"}, {"start": "4608", "end": "4618", "cui": "DATE", "value": "03/17/2078"}, {"start": "4719", "end": "4729", "cui": "DATE", "value": "03/17/2078"}, {"start": "4830", "end": "4840", "cui": "DATE", "value": "03/17/2078"}, {"start": "4941", "end": "4951", "cui": "DATE", "value": "03/17/2078"}, {"start": "5022", "end": "5032", "cui": "DATE", "value": "03/17/2078"}, {"start": "5133", "end": "5143", "cui": "DATE", "value": "03/17/2078"}, {"start": "5223", "end": "5225", "cui": "AGE", "value": "71"}, {"start": "5976", "end": "5989", "cui": "DOCTOR", "value": "Xavier B. Nix"}]}, {"text": "\n\n\nRecord date: 2092-10-29\n\n \n\nTeam 3 Intern Admit Note\n\nName: Walton, Levi\n\nMR#: 2554172\n\nDate: 10/29/92\nPCP: Paul Eggleston\n\nCardiology: Youmans\n\n\n\nCC: emesis, light-headedness \n\n\n\nHPI: 85 y/o with CAD and active ischemia on 6/87 stress test (medically managed) who was feeling well until last night. During the day yesterday, he went on his daily walk with his wife and felt fine. In the evening around 8:30 pm, he experienced dull, 8/10 pain under both arm pits which resolved within 20 minutes after taking 3 SL TNG 5 mins apart. He denies SOB, N/V, radiation or diaphoresis associated with this arm pain. He states that he gets similar bilateral underarm pain about twice per week which is usually relieved by a single SL TNG within 10-15 mins. 
The arm pain is not exertional.\n\n\tAfter the arm pain relieved last night, the pt took 3 Aleve tabs which his daughter bought for him for arthritis pain. A couple of hours later, he became nauseated and had a single episode of dark brown emesis. After vomiting, he felt light-headed while walking around the house. He had a bowel movement this AM which he states was darker than usual, almost black looking. He denies BRBPR, abdominal pain, CP, SOB, palpitations, headache, fever or chills.\n\n\tIn the setting of significant fatigue and light-headedness, the pt presented to the ED this morning. Admission vitals: 97.2, 120/78, HR 80's, RR12, sats 97% RA. He was noted to be in A fib. Hct was 22 (down from 38 last week). NG lavage was not performed in the ED. However, he did have another episode of bilateral arm pain which was relieved by SL TNG x 3. HR at the time of arm pain was in the 80's and ECG showed A fib with possible TWI in I,L and V6 but relatively unchanged from admission ECG without pain. He subsequently received lopressor 2.5 IV/12.5 PO, isordil 10 PO and zantac 50 IV. \n\n\t \n\nPMHx:\n\n1.\tCAD\n\na.\tS/p anterior MI 2082 - pt does not recall having chest pain associated with MI\n\nb.\tCath 2082 - 3 vessel CAD, PTCA to mid-LAD lesion, 90% RCA, occluded OM2\n\nc.\tETT-mibi 6/87 - 75% MPHR, 8 METS. 
0.5-1 mm ST depressions in inferior and percordial (V4, V5) leads; imaging showed inferior and posterior ischemia\n\nd.\tECHO 3/87 - EF 67%, LA 37mm, no WMA \n\n2.\tBPH\n\n3.\tOA/DJD bilateral shoulders\n\n4.\tcolonoscopy 2087 - hemorrhoids, diverticulosis, single tubular adenoma excised; no endoscopy since then \n\n5.\tDM II - diet controlled; last A1c 6.6 (10/92)\n\n6.\thyperlipidemia - chol 111, LDL 55 HDL 33, tri 115 (10/92 on treatment)\n\n\n\nMeds:\n\n1.\tatenolol 100 QD\n\n2.\tisordil 30 QID\n\n3.\tASA 81 QD\n\n4.\tlisinopril 20 QD\n\n5.\tSL TNG prn\n\n6.\tMVI\n\n7.\tlipitor 10 QD\n\n\n\nAllergies: NKDA\n\n\n\nSHx: lives with wife; walks around the mall daily\n\nTobacco - quit >35 years ago\n\nEtOH - 2 mixed drinks (vodka) per night; denies anything more\n\nDrugs - none\n\n\n\nPE: vitals 95.0 115/58 87 18 98% RA\n\nGeneral: no acute distress\n\nHEENT: sclera anicteric; EOMI, PERRLA, OP without masses or infiltrate\n\nNECK: JVP flat; carotid pulses brisk and symmetric; no carotid bruits; no thyromegaly; no cervical or supraclav. LAD\n\nLUNGS: CTA B\n\nCARDIAC: irreg irreg, nl s1s2, no M/R/G\n\nABD: soft, NT/ND, pos. 
BS, no HSM\n\nRECTAL: guaiac positive, normal tone (per ED resident)\n\nEXT: no axillary or inguinal LAD; no c/c/e\n\nMSK: significantly limited and painful ROM at both shoulders\n\nNEURO: A&O x 3; CN II-XII intact; otherwise non-focal\n\n\n\nLabs:\n\nSodium (Stat Lab) 134 L (135-145) mmol/L\n\nPotassium (Stat Lab) 3.9 (3.4-4.8) mmol/L\n\nChloride (Stat Lab) 102 (100-108) mmol/L\n\nCO2 (Stat Lab) 22.5 L (23.0-31.9) mmol/L\n\nBUN (Stat Lab) 67 H (8-25) mg/dl\n\nCreatinine (Stat Lab) 1.4 (0.6-1.5) mg/dl\n\nGlucose (Stat Lab) 291 H (70-110) mg/dl\n\n\n\nCalcium 9.2 (8.5-10.5) mg/dl\n\nPhosphorus 3.9 (2.6-4.5) mg/dl\n\nMagnesium 1.6 (1.4-2.0) meq/L\n\nTotal Protein 6.5 (6.0-8.3) g/dl\n\nAlbumin 3.5 (3.3-5.0) g/dl\n\nGlobulin 3.0 (2.6-4.1) g/dl\n\nDirect Bilirubin 0.1 (0-0.4) mg/dl\n\nTotal Bilirubin 0.3 (0-1.0) mg/dl\n\nAlkaline Phosphatase 67 (45-115) U/L\n\nTransaminase-SGPT 16 (10-55) U/L\n\nAmylase 16 (3-100) units/L\n\nLipase 2.5 (1.3-6.0) U/dl\n\nTransaminase-SGOT 28 (10-40) U/L\n\n\n\nCreatine Kinase Isoenz BORDERLINE (NEG)\n\nTroponin-I NEGATIVE (NEG)\n\n\n\nCreatine Kinase Isoenz 13.2 H (0.0-6.9) ng/ml\n\nCPK Isoenzymes Index 12.8 H (0.0-3.5) %\n\nTroponin-T 0.06 (0.00-0.09) ng/ml\n\nCreatine Kinase 103 (60-400) U/L\n\n\n\nAdmission:\n\nWBC 18.4 H (4.5-11.0) th/cmm\n\nHCT 22.2 L (41.0-53.0) %\n\nHGB 7.4 L (13.5-17.5) gm/dl\n\nRBC 2.25 L (4.50-5.90) mil/cmm\n\nPLT 243 (150-350) th/cumm\n\nMCV 99 (80-100) fl\n\nMCH 33.0 (26.0-34.0) pg/rbc\n\nMCHC 33.4 (31.0-37.0) g/dl\n\nRDW 15.6 H (11.5-14.5) %\n\nSuperstat PT 14.0 H (11.1-13.1) sec\n\nSuperstat APTT 25.3 (22.1-35.1) sec\n\n\n\n\n\nUA-Specific Gravity <1.005 (1.001-1.035)\n\nUA-pH 5.0 (5.0-9.0)\n\nUA-WBC Screen NEGATIVE (NEG)\n\nUA-Nitrite NEGATIVE (NEG)\n\nUA-Albumin NEGATIVE (NEG)\n\nUA-Glucose Trace (NEG)\n\nUA-Ketones NEGATIVE (NEG)\n\nUA-Occult Blood NEGATIVE (NEG)\n\n\n\nWBC 17.2 H (4.5-11.0) th/cmm\n\nHCT 24.7 L (41.0-53.0) %\n\nHGB 8.6 L (13.5-17.5) gm/dl\n\nRBC 2.65 L (4.50-5.90) mil/cmm\n\nPLT 210 
(150-350) th/cumm\n\nMCV 93 (80-100) fl\n\nMCH 32.5 (26.0-34.0) pg/rbc\n\nMCHC 35.0 (31.0-37.0) g/dl\n\nRDW 16.8 H (11.5-14.5) %\n\n\n\nStudies:\n\nEKG AF (92) LAD TWI I,L,V6\n\nCXR IMPRESSION:\n\n\tThere are linear opacities at both bases consistent with\n\n\tsubsegmental atelectasis. Bilateral calcified pleural plaques are\n\n\tconsistent with asbestos exposure. There is no pneumothorax. No\n\npleural effusions are visualized. There is extensive change including both shoulders.\n\n\n\nAssessment: 85 y/o with medically management multivessel CAD who presents with light-headedness, fatigue and anemia. Likely coffee gound emesis, melana and increased BUN concerning for UGIB. Given history of alcohol consumption (suspect that consumption may be slightly more than given by history) and NSAID use, gastritis is a possibility. No clear reason for other sources of UGIB. Pt also found to be in Afib of unknown duration. Unclear if Afib was prompted by anemia due to UGIB.\n\n\n\nPlan:\n\n\n\n1.\tUGIB\n\na.\tActive blood back sample, IV access\n\nb.\tHct increased from 22.2 to 24.7 after 2u pRBCs. Given CAD history, will transfuse one more unit\n\nc.\tGI consult for possible EGD (pt likely also needs colonoscopy given h/o tubular adenoma)\n\nd.\tIV zantac \n\ne.\tHold aspirin for now\n\n2.\tCV\n\na.\tIschemia - TVI on admission on ECG from ED. Will cycle cardiac enzymes. Hold aspirin for now given GIB. Continue beta blocker both for ischemia protection and AF rate control; will hold other anti-hypertensives for now\n\nb.\tRhythm - unclear duration of Afib so immediate cardioversion not an option (avoid TEE in the setting of UGIB); lopressor for rate control; no anticoagulation for now; central telmetry\n\nc.\tPump - no active issues \n\n3.\tincreased WBC \n\na.\tunclear etiology ? 
reactive\n\nb.\tAfebrile, no signs of infection; will follow\n\n4.\tdiabetes\n\na.\tSSI\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJohn Kirk, MD\n\nIntern in Medicine \n\nPager 92915\n\n\n\n", "name": "218-01.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2092-10-29"}, {"start": "63", "end": "75", "cui": "PATIENT", "value": "Walton, Levi"}, {"start": "82", "end": "89", "cui": "MEDICALRECORD", "value": "2554172"}, {"start": "97", "end": "105", "cui": "DATE", "value": "10/29/92"}, {"start": "111", "end": "125", "cui": "DOCTOR", "value": "Paul Eggleston"}, {"start": "139", "end": "146", "cui": "DOCTOR", "value": "Youmans"}, {"start": "188", "end": "190", "cui": "AGE", "value": "85"}, {"start": "227", "end": "231", "cui": "DATE", "value": "6/87"}, {"start": "1906", "end": "1910", "cui": "DATE", "value": "2082"}, {"start": "1978", "end": "1982", "cui": "DATE", "value": "2082"}, {"start": "2058", "end": "2062", "cui": "DATE", "value": "6/87"}, {"start": "2206", "end": "2210", "cui": "DATE", "value": "3/87"}, {"start": "2293", "end": "2297", "cui": "DATE", "value": "2087"}, {"start": "2429", "end": "2434", "cui": "DATE", "value": "10/92"}, {"start": "2491", "end": "2496", "cui": "DATE", "value": "10/92"}, {"start": "7749", "end": "7751", "cui": "AGE", "value": "85"}, {"start": "9107", "end": "9116", "cui": "DOCTOR", "value": "John Kirk"}, {"start": "9149", "end": "9154", "cui": "PHONE", "value": "92915"}]}, {"text": "\n\n\nRecord date: 2094-12-26\n\nCARDIOLOGY\n\nCOQUILLE VALLEY HOSPITAL\n\n\n\nReason for visit:\n\n NSTEMI\n\n\n\nInterval History:\n\n Multiple risk factors for CAD including DM, HTN, CRI. Developed complaints of exertional dyspnea and fatigue during the spring of 2094. Progressed and she was evaluated by Bonnie Eaves. Bruce protocol was postive at 9 minutes, 64% PMHR. Complaints of dyspnea. EKG with borderline changes. Nuclear images with moderate sized defect of anteroapical zone with partial reperfusion. Some scar. 
Mild LV dysfunction with EF 45-50% and apical dyskinesis.\n\n\n\n8/2094 Admitted to CVH for elective cath. Cath with Dr Vitale found nl LM. 99% subtotal LAD with TIMI I flow. Circ was patent. RCA with proximal 30% lesions and mid 80% lesion. Nl LV gram. Treated with PTCA and stent to mid LAD with mini-vision non-drug-eluding stent (2.5 X 18 mm). Excellent results. Distal LAD with residual diffuse 30-40% lesion. Plan to return for staged intervention to RCA.\n\n\n\n9/29/2094 she was readmitted for elective RCA PCI. A relook at her LAD stent found >90% restenosis to proximal edge of LAD stent. Treated with 2.75 X 12 mm TAXUS stent. Tolerated procedure well. Discharged the following day with creat 3.3. Plan was to again return for RCA PCI.\n\n\n\n10/94 Cath: patent LAD stents. Mid RCA lesion treated with 2.5 X 13 mm cypher stent. Discharge delayed for neurologic changes (LOC and unresponsive) but EEG, CNIS, CT, all without new infarct (old PCA CVA). \n\n\n\n12/25 Developed SOB and arrived in EW at SMM with chest pain. EKG without changes. Trop +/- but continued to have chest pain. Given previous stents, she was transferred urgently to CVH for cath. \n\n\n\nPast medical history:\n\n CVA of left PCA territory 2093. No residual. Rx with ASA, folate, niaspan\n\nComplete neurologic evaluation in LOC in 10/94 and previously in 2093\n\nIDDM diagnosed 30 yrs ago on insulin pump\n\nCRI with creat baseline 3.7 (followed by Orlando Ernst) and recent eval by Dr Ratliff for transplant. +proteinuria with nephrotic syndrome. Donor kidney (sister)lined up. 
No date for transplant made...needs to complete Plavix course first.\n\nHTN\n\nHigh cholesterol\n\nDiabetic retinopathy\n\nAnemia\n\n\n\nMedications (Confirmed):\n\n\n\n\t\t\t\n\nColace 100 mg po qd\n\nDiovan 160mg po qd\n\nEpogen 10000u sc sundays\n\nerythromycin 333mg po TID\n\nfolic acid 1 mg po qd\n\nInsulin pump\n\niron supplement 325mg po am\n\nisosorbide dinitrate\n\nLasix 80 mg po qd\n\nmultivitamins 1 tab po qd\n\nNiaspan 1000mg SR po qhs\n\nNorvasc 5mg po q pm\n\nPhoslo and vitamin B 1\n\nPlavix 75mg po qd\n\nToprol XL 25 mg po qd\n\nVytorin 10mg/80mg po qhs\n\n\n\nAllergies:\n\nNo known drug allergy \n\n\n\nFamily history:\n\n Mom A&W in her 60's with HTN. Dad A&W. Sister is planning to be donor kidney. Other sister is back-up donor. \n\n\n\nSocial history:\n\n Works as Patternmaker at IMN. \n\nvolunteer firefighter. Single and lives with her parents. \n\nNever smoked. No ETOH. \n\n\n\nReview of systems:\n\n no peripheral edema currently (but had it in the past). No fever, chills, sweating. Problems with gastroparesis and is planning to have a gastric pacemaker inserted (has been having delayed spikes in insulin). \n\n\n\nPhysical examination:\n\n-BP: 110/50 \n\n-Pulse: 70 \n\n-resp. rate: 16 \n\n-weight: 153 \n\n-General appearance: No acute distress.\n\n-Skin: No rashes, anicteric.\n\n-Heent: Unremarkable\n\n-Neck: Carotids 2+ without bruits. JVP no jugular venous distention\n\n-Chest: Clear to auscultation and percussion.\n\n-Cardiac: Left ventricular impulse discrete and nondisplaced. Regular rate and rhythm, normal S1 and S2, with no S3 or S4. There were no murmurs, clicks or rubs.\n\n-Abdomen: Normal bowel sounds, soft and nontender, with no hepatosplenomegaly or masses appreciated.\n\n-Extremities: No cyanosis, clubbing or edema. 2+ femoral pulses without bruits. 2+ pedal pulses.\n\n-Neuro: A&O x3, CN 2-12 grossly intact. Reflexes 2+ and symmetric x 4 extremities. Toes B downgoing.\n\n\n\nSelected recent labs:\n\n K 4.4, Creat 4.2. 
WBC 7.5, Creat 35.8, Plts 401. INR 0.9\n\n\n\nAssessment and plan:\n\n 40 y.o. with multiple risk factors for CAD. HTN, high chol, IDDM. Previous LAD stent with restenosis. Then second LAD stent and finally RCA stent. Now with admit for NSTEMI. Transferred for urgent cath. Plan for eventual renal transplant at CVH (had been waiting for Plavix to be completed). Pre-treated with mucomyst. \n\n\n\nGiven her previous restenosis, and the fact that further Plavix courses are postponing her renal transplant....she would be best served with CABG if she has restenosis. \n\n\n\nFurther plan per Dr Rollins\n\nFollow up with Dr Eaves\n\n\n\nFrances Travis Potts NP\n\n\n\n\n\nChanges to Medications this visit\n\nDiovan 160mg po qd Start: 09/28/2094\n\nLasix 80 mg po qd Start: 09/28/2094\n\nToprol XL 25 mg po qd Start: 09/28/2094 just increased to 50\n\nNorvasc 5mg po q pm Start: 09/28/2094\n\nPlavix 75mg po qd Start: 09/28/2094\n\nNiaspan 1000mg SR po qhs Start: 09/28/2094\n\nVytorin 10mg/80mg po qhs Start: 09/28/2094\n\nfolic acid 1 mg po qd Start: 09/28/2094\n\nEpogen 10000u sc sundays Start: 09/28/2094\n\nmultivitamins 1 tab po qd Start: 09/28/2094\n\niron supplement 325mg po am Start: 09/28/2094\n\nColace 100 mg po qd Start: 09/28/2094\n\nisosorbide dinitrate Start: 09/28/2094\n\nerythromycin 333mg po TID Start: 09/28/2094 for gastroparesis\n\nPhoslo and vitamin B 1 Start: 09/28/2094\n\nInsulin pump \n\n\n\nSigned electronically by Frances T Potts NP on Dec 26, 2094 \n\n\n\n", "name": "231-02.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2094-12-26"}, {"start": "40", "end": "64", "cui": "HOSPITAL", "value": "COQUILLE VALLEY HOSPITAL"}, {"start": "242", "end": "256", "cui": "DATE", "value": "spring of 2094"}, {"start": "294", "end": "306", "cui": "DOCTOR", "value": "Bonnie Eaves"}, {"start": "572", "end": "578", "cui": "DATE", "value": "8/2094"}, {"start": "591", "end": "594", "cui": "HOSPITAL", "value": "CVH"}, {"start": "627", "end": "633", 
"cui": "DOCTOR", "value": "Vitale"}, {"start": "972", "end": "981", "cui": "DATE", "value": "9/29/2094"}, {"start": "1253", "end": "1258", "cui": "DATE", "value": "10/94"}, {"start": "1464", "end": "1469", "cui": "DATE", "value": "12/25"}, {"start": "1505", "end": "1508", "cui": "HOSPITAL", "value": "SMM"}, {"start": "1645", "end": "1648", "cui": "HOSPITAL", "value": "CVH"}, {"start": "1715", "end": "1719", "cui": "DATE", "value": "2093"}, {"start": "1805", "end": "1810", "cui": "DATE", "value": "10/94"}, {"start": "1829", "end": "1833", "cui": "DATE", "value": "2093"}, {"start": "1919", "end": "1932", "cui": "DOCTOR", "value": "Orlando Ernst"}, {"start": "1956", "end": "1963", "cui": "DOCTOR", "value": "Ratliff"}, {"start": "2265", "end": "2272", "cui": "DATE", "value": "sundays"}, {"start": "2651", "end": "2655", "cui": "AGE", "value": "60's"}, {"start": "2778", "end": "2790", "cui": "PROFESSION", "value": "Patternmaker"}, {"start": "2794", "end": "2797", "cui": "ORGANIZATION", "value": "IMN"}, {"start": "2801", "end": "2822", "cui": "PROFESSION", "value": "volunteer firefighter"}, {"start": "4015", "end": "4017", "cui": "AGE", "value": "40"}, {"start": "4256", "end": "4259", "cui": "HOSPITAL", "value": "CVH"}, {"start": "4531", "end": "4538", "cui": "DOCTOR", "value": "Rollins"}, {"start": "4558", "end": "4563", "cui": "DOCTOR", "value": "Eaves"}, {"start": "4567", "end": "4587", "cui": "DOCTOR", "value": "Frances Travis Potts"}, {"start": "4658", "end": "4668", "cui": "DATE", "value": "09/28/2094"}, {"start": "4696", "end": "4706", "cui": "DATE", "value": "09/28/2094"}, {"start": "4738", "end": "4748", "cui": "DATE", "value": "09/28/2094"}, {"start": "4799", "end": "4809", "cui": "DATE", "value": "09/28/2094"}, {"start": "4837", "end": "4847", "cui": "DATE", "value": "09/28/2094"}, {"start": "4882", "end": "4892", "cui": "DATE", "value": "09/28/2094"}, {"start": "4927", "end": "4937", "cui": "DATE", "value": "09/28/2094"}, {"start": "4969", "end": "4979", 
"cui": "DATE", "value": "09/28/2094"}, {"start": "4999", "end": "5006", "cui": "DATE", "value": "sundays"}, {"start": "5014", "end": "5024", "cui": "DATE", "value": "09/28/2094"}, {"start": "5060", "end": "5070", "cui": "DATE", "value": "09/28/2094"}, {"start": "5108", "end": "5118", "cui": "DATE", "value": "09/28/2094"}, {"start": "5148", "end": "5158", "cui": "DATE", "value": "09/28/2094"}, {"start": "5192", "end": "5202", "cui": "DATE", "value": "09/28/2094"}, {"start": "5238", "end": "5248", "cui": "DATE", "value": "09/28/2094"}, {"start": "5301", "end": "5311", "cui": "DATE", "value": "09/28/2094"}, {"start": "5360", "end": "5375", "cui": "DOCTOR", "value": "Frances T Potts"}, {"start": "5384", "end": "5396", "cui": "DATE", "value": "Dec 26, 2094"}]}, {"text": "\n\n\nRecord date: 2069-11-18\n\nHPI\n\n54 yo F with h/o CRI, DM on insulin, HTN, obesity, GERD, glaucoma, anemia, p/f for f/u. \n\nShe is generally doing well and has no complaints. She lost her glucometer and hasnt check her BS for 2 weeks. Last A1C after starting insulin down to 7.5, only on 5 glyburide bc of BS lows.\n\nFor her CRI, which is multifactorial (DM, HTN and L renal artery stenosis) she sees Dr Uriarte in Internal Medicine, saw him yesterday, Cr stable and PTH up a bit to 226. 
\n\n \n\n================ Problems ================\n\nESSENTIAL HYPERTENSION \n\nMODERATE OBESITY \n\nDiabetes mellitus \n\nChronic renal dysfunction cre cl estimate 23 in 12/67\n\nGlaucoma \n\nLactose intolerance \n\nAtypical chest pain negative ETT 4/8/67, negative ETT 2064, ETT w/ SPECT 2062\n\nIron-deficiency anemia \n\nGastritis EGD 9/67, also GERD symptoms\n\n\n\n================ Medications ================\n\nAsa 81 MG PO QD \n\nProcardia XL 60 MG (60MG TABLET take 1) PO QD , may use adalat instead\n\nRanitidine HCL 150 MG PO QD \n\nLosartan 100 MG PO QD \n\nGlyburide 5MG TABLET take 1 Tablet(s) PO QD \n\nFerrous GLUCONATE 325MG TABLET take 1 Tablet(s) PO TID , Take one tablet with every meal for iron deficiency\n\nHydrochlorothiazide 25MG TABLET take 1 Tablet(s) PO QD , Take every day for blood pressure\n\nToprol XL (METOPROLOL SUCCINATE EXTENDED ... 50MG TABLET CR 24HR take 1 Tablet(s) PO QD \n\nCalcitriol 0.25MCG CAPSULE take 1 Capsule(s) PO QD \n\nZocor (SIMVASTATIN) 20 MG (20MG TABLET take 1) PO QHS \n\nLantus (INSULIN GLARGINE) 16 UNITS SC QHS , dose increase\n\nTimolol XE 0.25% 1 DROP OU QD , must keep appt\n\nXalatan (LATANOPROST) 1 DROP OU QPM , must keep scheduled appt \n\n\n\n\n\nALLERGIES NKDA \n\n\n\nPHYSICAL EXAM\n\nVS: BP 110/60 \n\nCor: RRR, nl S1S2, 1/6 sys M, no rubs, gallops\n\nLungs: CTA b/l, no rales, rhonchi or wheezes\n\nAbd: Soft, NTND. Normal active bowel sounds. \n\nNo c/c/e\n\n\n\nA/P\n\n54 yo F with h/o CRI, DM, HTN, obesity, GERD, glaucoma, anemia for f/u. \n\n1. DM: Last A1C 7.5 from 9.3 after starting insulin, still on Lantus 16 and now Glyburide 5 qam. She sees Optho 2x/yr as she has glaucoma, saw them 2mos ago. \n\n--Prescribed glucometer\n\n--Cont current regimen\n\n--check A1C (pt left without getting labs, will get at next visit)\n\n--2/69 urine mcalb/cr 200\n\n--on statin\n\n\n\n2. CRI: Multifactorial, followed by Dr Uriarte. Last Cr 2.8 on 5/69. She has 2ndary hyperparathyroidism as well, PTH yest 226. On Calcitriol. 
\n\n--Cont ARB. \n\n--F/u with Dr Uriarte. \n\n\n\n3. Anemia, h/o, Fe deficiency, on Fe supp. Last Hct 36.\n\n\n\n4. H/o bowel ischemia but most recent CT wnl so more likely it was an infectious etiology.\n\n\n\n5. HTN: Cont on ARB, Toprol, Procardia, HCTZ. Today somewhat high, increased Procardia to 60 qd. On ASA for 1ry prevention of CAD. Statin.\n\n\n\n6.GERD: Cont Zantac, well controlled sx.\n\n\n\n7. Galucoma: followed by Optho, cont eye drops\n\n\n\n8. HM:\n\n--lipids 8/69much improved with addition of statin TC 121, TG 309, HDL 28, LDL 31. Lfts ok. Will check again in Feb\n\n--BMD 10/68 wnl\n\n--colonoscopy 2/68 wnl\n\n--Mammo will be scheduled for 12/69\n\n--PAP overdue, will reschedule a PAP appt\n\n\n\n\n\n\n\n\n\n______________________________ \n\n\n\nQuiana Gagnon, M.D.\n\n \n\n\n\n========================== Preceptor's Note ==========================\n\n\n\nI have discussed the evaluation and care of this patient with Dr. Gagnon.\n\n\n\n\n\n______________________________ \n\n\n\nDavid R. 
Quintin, M.D.\n\n\n\n", "name": "313-03.xml", "annotations": [{"start": "16", "end": "26", "cui": "DATE", "value": "2069-11-18"}, {"start": "33", "end": "35", "cui": "AGE", "value": "54"}, {"start": "402", "end": "409", "cui": "DOCTOR", "value": "Uriarte"}, {"start": "683", "end": "688", "cui": "DATE", "value": "12/67"}, {"start": "783", "end": "789", "cui": "DATE", "value": "4/8/67"}, {"start": "804", "end": "808", "cui": "DATE", "value": "2064"}, {"start": "823", "end": "827", "cui": "DATE", "value": "2062"}, {"start": "885", "end": "889", "cui": "DATE", "value": "9/67"}, {"start": "1938", "end": "1940", "cui": "AGE", "value": "54"}, {"start": "2292", "end": "2296", "cui": "DATE", "value": "2/69"}, {"start": "2371", "end": "2378", "cui": "DOCTOR", "value": "Uriarte"}, {"start": "2396", "end": "2400", "cui": "DATE", "value": "5/69"}, {"start": "2505", "end": "2512", "cui": "DOCTOR", "value": "Uriarte"}, {"start": "2924", "end": "2928", "cui": "DATE", "value": "8/69"}, {"start": "3027", "end": "3030", "cui": "DATE", "value": "Feb"}, {"start": "3038", "end": "3043", "cui": "DATE", "value": "10/68"}, {"start": "3063", "end": "3067", "cui": "DATE", "value": "2/68"}, {"start": "3103", "end": "3108", "cui": "DATE", "value": "12/69"}, {"start": "3225", "end": "3238", "cui": "DOCTOR", "value": "Quiana Gagnon"}, {"start": "3396", "end": "3402", "cui": "DOCTOR", "value": "Gagnon"}, {"start": "3473", "end": "3489", "cui": "DOCTOR", "value": "David R. Quintin"}]}]}]} \ No newline at end of file diff --git a/tests/test_cat.py b/tests/test_cat.py index 1cb83050..bc5fe04d 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -18,6 +18,7 @@ from medcat2.utils.defaults import AVOID_LEGACY_CONVERSION_ENVIRON import unittest +import tempfile from . import EXAMPLE_MODEL_PACK_ZIP from . 
import V1_MODEL_PACK_PATH, UNPACKED_V1_MODEL_PACK_PATH @@ -120,10 +121,21 @@ def setUpClass(cls): cls.cdb: CDB = maker.prepare_csvs([cls.CDB_PREPROCESSED_PATH]) + # usage monitoring + cls._temp_logs_folder = tempfile.TemporaryDirectory() + config.general.usage_monitor.enabled = True + config.general.usage_monitor.log_folder = cls._temp_logs_folder.name + # CAT cls.cat = cat.CAT(cls.cdb, vocab) cls.cat.config.components.linking.train = False + def tearDown(self): + # remove existing contents / empty file log file + log_file_path = self.cat.usage_monitor.log_file + if os.path.exists(log_file_path): + os.remove(log_file_path) + class CATCreationTests(CATIncludingTests): # should be persistent as long as we don't change the underlying model @@ -149,7 +161,7 @@ def test_versioning_updates_config_hash(self): self.assert_hashes_to(self.EXPECTED_HASH) def assert_hashes_to(self, exp_hash: str) -> None: - self.cat._versioning() + self.cat._versioning(None) new_hash = self.cat.config.meta.hash self.assertNotEqual(self.prev_hash, new_hash) self.assertEqual(new_hash, exp_hash) @@ -157,8 +169,8 @@ def assert_hashes_to(self, exp_hash: str) -> None: def test_versioning_does_not_overpopulate_history(self): # run multiple times - self.cat._versioning() - self.cat._versioning() + self.cat._versioning(None) + self.cat._versioning(None) # and expect it not to append multiple times in the history # if there were multiple instances, the set would remove duplicates sorted_set = sorted(set(self.cat.config.meta.history)) @@ -187,7 +199,7 @@ def test_model_card_has_no_extra_keys(self): class CatWithMetaCATTests(CATCreationTests): - EXPECTED_HASH = "04095f95f5f7c222" + EXPECTED_HASH = "9104103a2f191822" EXPECT_SAME_INSTANCES = True @classmethod @@ -216,9 +228,34 @@ def test_can_recreate_pipe(self): # otherwise they should differ self.assertNotEqual(self.init_addons, addons_after) + def test_get_entities_gets_monitored(self, + text="Some text"): + repeats = 
self.cat.config.general.usage_monitor.batch_size + # ensure something gets written to the file + for _ in range(repeats): + self.cat.get_entities(text) + log_file_path = self.cat.usage_monitor.log_file + self.assertTrue(os.path.exists(log_file_path)) + with open(log_file_path) as f: + contents = f.readline() + self.assertTrue(contents) + + def test_get_entities_logs_usage( + self, + text="The dog is sitting outside the house."): + # clear usage monitor buffer + self.cat.usage_monitor.log_buffer.clear() + self.cat.get_entities(text) + self.assertTrue(self.cat.usage_monitor.log_buffer) + self.assertEqual(len(self.cat.usage_monitor.log_buffer), 1) + line = self.cat.usage_monitor.log_buffer[0] + # the 1st element is the input text length + input_text_length = line.split(",")[1] + self.assertEqual(str(len(text)), input_text_length) + class CatWithChangesMetaCATTests(CatWithMetaCATTests): - EXPECTED_HASH = "7206cc91ed3424ac" + EXPECTED_HASH = "28f20b1460960b1e" EXPECT_SAME_INSTANCES = False @classmethod @@ -330,6 +367,19 @@ def test_can_get_entities(self, self.assertEqual(len(ents), len(expected_cuis)) self.assertEqual(set(ents.values()), set(expected_cuis)) + def test_can_get_multiple_entities(self): + texts = [ + "The fittest most fit of chronic kidney failure", + "The dog is sitting outside the house." 
+ ] + ents = list(self.cat.get_entities_multi_texts(texts)) + self.assertEqual(len(ents), len(texts)) + # NOTE: text IDs are integers starting from 0 + exp_ids = set(str(i) for i in range(len(texts))) + for ent_id_str, ent in ents: + with self.subTest(f"Entity: {ent_id_str} [{ent}]"): + self.assertIn(ent_id_str, exp_ids) + class CATWithDocAddonTests(CATIncludingTests): EXAMPLE_TEXT = "Example text to tokenize" @@ -433,3 +483,136 @@ def test_cannot_load_legacy_with_environ_set(self): AVOID_LEGACY_CONVERSION_ENVIRON: "true"}, clear=True): with self.assertRaises(ValueError): cat.CAT.load_model_pack(V1_MODEL_PACK_PATH) + + +class CATSaveTests(CATIncludingTests): + DESCRIPTION = "Test CAT save functionality" + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.temp_folder = tempfile.TemporaryDirectory() + cls.saved_path = cls.cat.save_model_pack( + cls.temp_folder.name, change_description=cls.DESCRIPTION) + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + cls.temp_folder.cleanup() + + def test_can_save_model_pack(self): + self.assertTrue(os.path.exists(self.saved_path)) + + def test_model_adds_description(self): + self.assertIn(self.DESCRIPTION, self.cat.config.meta.description) + + +class BatchingTests(unittest.TestCase): + NUM_TEXTS = 100 + all_texts = [ + f"Text {num:04d} -> " + "a" * num + for num in range(NUM_TEXTS) + ] + total_text_length = sum(len(text) for text in all_texts) + + @classmethod + def setUpClass(cls): + cnf = Config() + cls.cat = cat.CAT(cdb=CDB(cnf), vocab=Vocab()) + + # per doc batching tests + + def test_batching_gets_full(self): + batches = list(self.cat._generate_simple_batches( + iter(self.all_texts), batch_size=self.NUM_TEXTS, + only_cui=False)) + self.assertEqual(len(batches), 1) + self.assertEqual(len(batches[0]), self.NUM_TEXTS) + # NOTE: the contents has the text and the index and the only_cui bool + # so can't check equality directly + # self.assertEqual(batches[0], self.all_texts) + + def 
test_batching_gets_in_sequence(self): + batches = list(self.cat._generate_simple_batches( + iter(self.all_texts), batch_size=self.NUM_TEXTS // 2, + only_cui=False)) + self.assertEqual(len(batches), 2) + self.assertEqual(len(batches[0]), self.NUM_TEXTS // 2) + self.assertEqual(len(batches[1]), self.NUM_TEXTS // 2) + # self.assertEqual(batches[0] + batches[1], self.all_texts) + + def test_batching_gets_all_1_at_a_time(self): + batches = list(self.cat._generate_simple_batches( + iter(self.all_texts), batch_size=1, only_cui=False)) + self.assertEqual(len(batches), self.NUM_TEXTS) + for num, batch in enumerate(batches): + with self.subTest(f"Batch {num}"): + self.assertEqual(len(batch), 1) + # self.assertEqual(batch[0], f"Text {num}") + + # per character batching tests + + def test_batching_gets_full_char(self): + batches = list(self.cat._generate_batches_by_char_length( + iter(self.all_texts), batch_size_chars=self.total_text_length, + only_cui=False)) + self.assertEqual(len(batches), 1) + # has all texts + self.assertEqual(sum(len(batch) for batch in batches), self.NUM_TEXTS) + # has all characters + self.assertEqual(sum(len(text[1]) for text in batches[0]), + self.total_text_length) + + def test_batching_gets_all_half_at_a_time(self): + exp_chars = int(0.7 * self.total_text_length) + batches = list(self.cat._generate_batches_by_char_length( + iter(self.all_texts), batch_size_chars=exp_chars, + only_cui=False)) + # NOTE: should have 2 batches at 40% overlap + self.assertEqual(len(batches), 2) + # each batch should have less than expected characters + for batch_num, batch in enumerate(batches): + with self.subTest(f"Batch {batch_num}"): + cur_total_chars = sum(len(text[1]) for text in batch) + self.assertLessEqual(cur_total_chars, exp_chars) + # has all texts + self.assertEqual(sum(len(batch) for batch in batches), self.NUM_TEXTS) + # has all characters + self.assertEqual(sum(len(text[1]) + for batch in batches for text in batch), + self.total_text_length) + + # overal 
batching (i.e joint methods) + + def test_cannot_set_both_neg(self): + with self.assertRaises(ValueError): + list(self.cat._generate_batches( + iter(self.all_texts), batch_size_chars=-1, + batch_size=-1, only_cui=False)) + + def test_cannot_set_both_pos(self): + with self.assertRaises(ValueError): + list(self.cat._generate_batches( + iter(self.all_texts), batch_size_chars=100, + batch_size=10, only_cui=False)) + + def test_can_do_char_based(self): + exp_chars = int(0.3 * self.total_text_length) + batches = list(self.cat._generate_batches( + iter(self.all_texts), batch_size_chars=exp_chars, + batch_size=-1, only_cui=False)) + self.assertGreater(len(batches), 0) + batch_lens = [len(batch) for batch in batches] + # has different number of texts in some batches -> not doc based + self.assertGreater(max(batch_lens), min(batch_lens)) + + def test_can_set_batch_size_per_doc(self): + exp_batches = 10 + batches = list(self.cat._generate_batches( + iter(self.all_texts), batch_size=exp_batches, + batch_size_chars=-1, only_cui=False)) + self.assertGreater(len(batches), 0) + batch_lens = [len(batch) for batch in batches] + # has same number of texts in each batch -> doc based + self.assertEqual(max(batch_lens), min(batch_lens)) + self.assertEqual(max(batch_lens), exp_batches) diff --git a/tests/utils/test_usage_monitoring.py b/tests/utils/test_usage_monitoring.py new file mode 100644 index 00000000..7ddefde7 --- /dev/null +++ b/tests/utils/test_usage_monitoring.py @@ -0,0 +1,129 @@ +import os + +from medcat2.config.config import UsageMonitor as UsageMonitorConfig +from medcat2.utils import usage_monitoring + +import tempfile + +from unittest import TestCase +from unittest.mock import patch + + +class UsageMonitorBaseTests(TestCase): + MODEL_HASH = "MODEL_HASH" + BATCH_SIZE = 10 + ALL_DATA = [ + (10, 2), (100, 4), (110, 0) + ] + + @classmethod + def setUpClass(cls) -> None: + cls.config = UsageMonitorConfig(enabled=True, + batch_size=cls.BATCH_SIZE) + + def setUp(self) -> None: 
+ self._temp_dir = tempfile.TemporaryDirectory() + self.config.log_folder = self._temp_dir.name + self.monitor = usage_monitoring.UsageMonitor(self.MODEL_HASH, + self.config) + for data in self.ALL_DATA: + self.monitor.log_inference(*data) + + def _get_saved_lines(self) -> list: + if not os.path.exists(self.monitor.log_file): + return [] + with open(self.monitor.log_file) as f: + return f.readlines() + + def tearDown(self) -> None: + self.monitor.log_buffer.clear() + self._temp_dir.cleanup() + + +class UsageMonitorInBufferTests(UsageMonitorBaseTests): + BATCH_SIZE = 100 + + def test_nothing_in_file(self): + self.assertFalse(self._get_saved_lines()) + + def test_all_in_buffer(self): + lines = self.monitor.log_buffer + self.assertEqual(len(lines), len(self.ALL_DATA)) + for data_nr, (data, line) in enumerate(zip(self.ALL_DATA, lines)): + for sub_nr, nr in enumerate(data): + with self.subTest(f"{data_nr}-{sub_nr} ({nr})"): + self.assertIn(str(nr), line) + + +class UsageMonitorInFileTests(UsageMonitorBaseTests): + BATCH_SIZE = 1 + + def test_nothing_in_buffer(self): + self.assertFalse(self.monitor.log_buffer) + + def test_all_in_file(self): + lines = self._get_saved_lines() + self.assertEqual(len(lines), len(self.ALL_DATA)) + for data_nr, (data, line) in enumerate(zip(self.ALL_DATA, lines)): + for sub_nr, nr in enumerate(data): + with self.subTest(f"{data_nr}-{sub_nr} ({nr})"): + self.assertIn(str(nr), line) + + +class InterMediateUsageMonitorTests(UsageMonitorBaseTests): + BATCH_SIZE = 2 + + def setUp(self) -> None: + super().setUp() + total_items = len(self.ALL_DATA) + self.expected_in_buffer = total_items % self.BATCH_SIZE + self.expected_in_file = total_items - self.expected_in_buffer + + def test_some_in_buffer(self): + self.assertTrue(self.monitor.log_buffer) + self.assertEqual(len(self.monitor.log_buffer), self.expected_in_buffer) + + def test_some_in_file(self): + lines = self._get_saved_lines() + self.assertTrue(lines) + self.assertEqual(len(lines), 
self.expected_in_file) + + +class UMT(UsageMonitorBaseTests): + ENABLED_DICT = { + "MEDCAT_USAGE_LOGS": "True", + "MEDCAT_USAGE_LOGS_LOCATION": "." + } + DISABLED_DICT_1 = { + "MEDCAT_USAGE_LOGS": "False", + # should not change anything + "MEDCAT_USAGE_LOGS_LOCATION": "FAIL" + } + DISABLED_DICT_2 = { + "MEDCAT_USAGE_LOGS": "0", + "MEDCAT_USAGE_LOGS_LOCATION": "." + } + + def setUp(self) -> None: + super().setUp() + self.config.enabled = "auto" + self.config.log_folder = self._temp_dir.name + self.monitor = usage_monitoring.UsageMonitor(self.MODEL_HASH, + self.config) + + @patch.dict(os.environ, ENABLED_DICT) + def test_listens_to_os_environ_enabled(self): + self.assertTrue(self.monitor._should_log()) + self.assertNotEqual(self.config.log_folder, self._temp_dir.name) + self.assertEqual(self.config.log_folder, + self.ENABLED_DICT["MEDCAT_USAGE_LOGS_LOCATION"]) + + @patch.dict(os.environ, DISABLED_DICT_1) + def test_listens_to_os_environ_disabled_1(self): + self.assertFalse(self.monitor._should_log()) + self.assertEqual(self.config.log_folder, self._temp_dir.name) + + @patch.dict(os.environ, DISABLED_DICT_2) + def test_listens_to_os_environ_disabled_2(self): + self.assertFalse(self.monitor._should_log()) + self.assertEqual(self.config.log_folder, self._temp_dir.name)