The spans produced by the NER model are incorrect.
Code to reproduce:
from glob import glob
import pandas as pd
import re
from pprint import pprint
import pkg_resources
from pymedextcore.document import Document
from pymedext_eds.annotators import Endlines, SentenceTokenizer, SectionSplitter
from pymedext_eds.utils import rawtext_loader
from pymedext_eds.med import MedicationAnnotator, MedicationNormalizer
# Reproduction script: build the annotation pipeline, run it over the demo
# documents shipped with pymedext_eds, then inspect the drug entities.

# Pre-processing annotators.
# NOTE(review): the arguments look like (input annotation types, output
# annotation type, ID) -- confirm against the pymedext_eds annotator API.
endlines = Endlines(["raw_text"], "clean_text", ID="endlines")
sections = SectionSplitter(['clean_text'], "section", ID='sections')
sentenceSplitter = SentenceTokenizer(["section"], "sentence", ID="sentences")

# One entry per tagger: path of the serialized model and the tag name
# associated with it.
models_param = [
    {
        'tagger_path': 'data/models/apmed5/entities/final-model.pt',
        'tag_name': 'entity_pred',
    },
    {
        'tagger_path': 'data/models/apmed5/events/final-model.pt',
        'tag_name': 'event_pred',
    },
    {
        'tagger_path': "data/models/apmed5/drugblob/final-model.pt",
        'tag_name': 'drugblob_pred',
    },
]

# NOTE(review): the device is hard-coded to 'cuda:1'; change it if that
# device is not available on the machine running the reproduction.
med = MedicationAnnotator(
    ['sentence'],
    'med',
    ID='med:v2',
    models_param=models_param,
    device='cuda:1',
)

# Use the first '*.p' file found in the packaged 'data/romedi' directory.
# The [0] raises IndexError if the directory contains no such file.
data_path = pkg_resources.resource_filename('pymedext_eds', 'data/romedi')
romedi_path = glob(data_path + '/*.p')[0]
norm = MedicationNormalizer(
    ['ENT/DRUG', 'ENT/CLASS'],
    'normalized_mention',
    ID='norm',
    romedi_path=romedi_path,
)

pipeline = [endlines, sections, sentenceSplitter, med, norm]

# Load every '*.txt' file of the packaged demo data (data_path is rebound here).
data_path = pkg_resources.resource_filename('pymedext_eds', 'data/demo')
file_list = glob(data_path + '/*.txt')
docs = [rawtext_loader(x) for x in file_list]

# The loop body must be indented; without the indentation the script fails
# with an IndentationError before anything runs.
for doc in docs:
    doc.annotate(pipeline)

# Notebook-style inspection: these are bare expressions, so their values are
# displayed only in an interactive session (REPL / notebook cell).
[t.value for t in docs[0].get_annotations('ENT/DRUG')]
# Characters 5687..5690 of the first 'clean_text' annotation of the first
# document -- presumably the span reported for one of the entities above;
# compare this slice with the entity values to observe the mismatch.
docs[0].get_annotations('clean_text')[0].value[5687:5691]
The spans produced by the NER model are incorrect.
Code to reproduce: