diff --git a/tests/test_tools.py b/tests/test_tools.py index 6e8aaea..0fbc06b 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -542,6 +542,102 @@ def test_nquads_export(self, pl_eq, tmp_path): assert "' in l for l in length_lines) + + def test_reference_keys_stay_iris(self, nquads_lines): + # xsd:anyURI keys (e.g. Model.DependentOn) are references, not typed literals + dep_lines = [l for l in nquads_lines if "Model.DependentOn" in l] + assert dep_lines + assert all("^^" not in l for l in dep_lines) + + def test_rdflib_parses_export(self, nquads_lines, tmp_path): + rdflib = pytest.importorskip("rdflib") + path = tmp_path / "validate.nq" + path.write_text("".join(nquads_lines)) + + dataset = rdflib.Dataset() + dataset.parse(str(path), format="nquads") + assert len(dataset) == len(nquads_lines) + + # typed literals round-trip through rdflib with the right python type + length_predicate = rdflib.URIRef("http://iec.ch/TC57/CIM100#Conductor.length") + lengths = [obj for _, _, obj, _ in dataset.quads((None, length_predicate, None, None))] + assert lengths + for literal in lengths: + assert literal.datatype == rdflib.XSD.float + assert isinstance(literal.toPython(), float) + + def test_references_resolve_within_dataset(self, svedala_eq, nquads_lines, tmp_path): + """Every urn:uuid reference resolves to a subject — except the references + the source data itself knows are dangling (boundary objects, other models).""" + rdflib = pytest.importorskip("rdflib") + from triplets import cgmes_tools + + path = tmp_path / "refs.nq" + path.write_text("".join(nquads_lines)) + dataset = rdflib.Dataset() + dataset.parse(str(path), format="nquads") + + subjects = set() + uuid_objects = set() + for s, _, o, _ in dataset.quads((None, None, None, None)): + subjects.add(str(s)) + if isinstance(o, rdflib.URIRef) and str(o).startswith("urn:uuid:"): + uuid_objects.add(str(o)) + unresolved = {o.removeprefix("urn:uuid:") for o in uuid_objects - subjects} + + dangling = cgmes_tools.get_dangling_references(svedala_eq, detailed=True) + known_dangling = set(dangling["VALUE_FROM"].astype(str)) + + assert unresolved, "single EQ file must have boundary references" + assert unresolved == unresolved & known_dangling, \ + f"references neither resolved nor known-dangling: {sorted(unresolved - known_dangling)[:5]}" + + def test_string_literal_stays_plain(self, nquads_lines): + # xsd:string is the RDF 1.1 default — no annotation + name_lines = [l for l in nquads_lines if "IdentifiedObject.name>" in l] + assert name_lines + assert all("^^" not in l for l in name_lines) + + def test_mrid_is_literal_not_reference(self, nquads_lines): + # mRID is a string attribute by schema; the UUID heuristic must not turn it into a urn:uuid reference + mrid_lines = [l for l in nquads_lines if "IdentifiedObject.mRID>" in l] + assert mrid_lines + for line in mrid_lines: + obj = line.split("> ", 2)[2] # object + graph part after subject and predicate + assert obj.startswith('"'), line + + def test_without_schema_no_datatypes(self, svedala_eq, tmp_path): + output = str(tmp_path / "untyped.nq") + triplets.export.export_to_nquads(svedala_eq, output) + with open(output) as f: + content = f.read() + assert "^^<" not in content + + def test_polars_engine_matches_pandas(self, svedala_eq, tmp_path, nquads_lines): + polars = pytest.importorskip("polars") + from triplets.export_schema import schemas + output = str(tmp_path / "typed_pl.nq") + triplets.export.export_to_nquads(polars.from_pandas(svedala_eq), output, rdf_map=schemas.ENTSOE_CGMES_3_0_0_552_ED1) + with open(output) as f: + pl_lines = f.readlines() + assert sorted(pl_lines) == sorted(nquads_lines) + + # ── Roundtrip test (export CIM XML → reimport → compare) ──────────────────── class TestCimxmlRoundtrip: diff --git a/triplets/export/nquads_pandas.py b/triplets/export/nquads_pandas.py index 2fd1766..af495e3 100644 --- a/triplets/export/nquads_pandas.py +++ b/triplets/export/nquads_pandas.py @@ -18,10 +18,11 @@ def export_to_nquads(data, path, rdf_map=None): path : str Output file path (.nq). rdf_map : dict or str, optional - Export schema for proper enum/association detection. - If None, enumerations won't get namespace (exported as literals). + Export schema for proper enum/association detection and literal + datatype annotations ("400"^^<...XMLSchema#float>). If None, + enumerations won't get namespace and literals stay untyped. """ - enum_keys, key_namespaces = build_key_metadata(rdf_map) if rdf_map else (set(), {}) + enum_keys, key_namespaces, key_datatypes = build_key_metadata(rdf_map) if rdf_map else (set(), {}, {}) id_col = data["ID"].astype(str) key_col = data["KEY"].astype(str) @@ -31,7 +32,7 @@ def export_to_nquads(data, path, rdf_map=None): subjects = id_col.apply(make_subject) predicates = key_col.apply(lambda k: make_predicate(k, key_namespaces)) objects = pandas.Series( - [make_object(k, v, enum_keys) for k, v in zip(key_col, val_col)], + [make_object(k, v, enum_keys, key_datatypes) for k, v in zip(key_col, val_col)], index=data.index, ) graphs = inst_col.apply(make_graph) diff --git a/triplets/export/nquads_polars.py b/triplets/export/nquads_polars.py index 4cf51a2..3e21b07 100644 --- a/triplets/export/nquads_polars.py +++ b/triplets/export/nquads_polars.py @@ -22,16 +22,17 @@ def export_to_nquads(data, path, rdf_map=None): path : str Output file path (.nq). rdf_map : dict or str, optional - Export schema for proper enum/association detection. + Export schema for proper enum/association detection and literal + datatype annotations ("400"^^<...XMLSchema#float>). """ - enum_keys, key_namespaces = build_key_metadata(rdf_map) if rdf_map else (set(), {}) + enum_keys, key_namespaces, key_datatypes = build_key_metadata(rdf_map) if rdf_map else (set(), {}, {}) # Build quads row by row (complex classification can't be fully vectorized) quads = [] for row in data.iter_rows(named=True): s = make_subject(str(row["ID"])) p = make_predicate(str(row["KEY"]), key_namespaces) - o = make_object(str(row["KEY"]), str(row["VALUE"]), enum_keys) + o = make_object(str(row["KEY"]), str(row["VALUE"]), enum_keys, key_datatypes) g = make_graph(str(row["INSTANCE_ID"])) quads.append(f"{s} {p} {o} {g} .") diff --git a/triplets/export/nquads_utils.py b/triplets/export/nquads_utils.py index 9c2e355..2b8adf2 100644 --- a/triplets/export/nquads_utils.py +++ b/triplets/export/nquads_utils.py @@ -8,12 +8,13 @@ CIM_NS = "http://iec.ch/TC57/CIM100#" RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" +XSD_NS = "http://www.w3.org/2001/XMLSchema#" UUID_RE = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$') def build_key_metadata(rdf_map): - """Extract enum keys and key→namespace mapping from export schema. + """Extract enum keys, key→namespace, and key→datatype mappings from export schema. Parameters ---------- @@ -26,6 +27,11 @@ def build_key_metadata(rdf_map): KEY names whose values are enumerations (need namespace on VALUE). key_namespaces : dict KEY name → namespace URI for predicate construction. + key_datatypes : dict + KEY name → full xsd datatype URI (from the schema's "xsd:type", + e.g. "xsd:float" → "http://www.w3.org/2001/XMLSchema#float"). + A key present here is a literal attribute by schema. xsd:string + keys map to None: literal, but no annotation (RDF 1.1 default). """ if not isinstance(rdf_map, dict): with open(str(rdf_map)) as f: @@ -33,6 +39,7 @@ def build_key_metadata(rdf_map): enum_keys = set() key_namespaces = {} + key_datatypes = {} for profile_name, profile_data in rdf_map.items(): if not isinstance(profile_data, dict): @@ -42,13 +49,19 @@ def build_key_metadata(rdf_map): continue prop_type = prop_data.get("type") namespace = prop_data.get("namespace", CIM_NS) + xsd_type = prop_data.get("xsd:type") if prop_type == "Enumeration": enum_keys.add(prop_name) if namespace: key_namespaces[prop_name] = namespace + if xsd_type and xsd_type.startswith("xsd:"): + datatype = xsd_type.removeprefix("xsd:") + if datatype == "anyURI": + continue # references (e.g. Model.DependentOn) — keep IRI handling + key_datatypes[prop_name] = None if datatype == "string" else f"{XSD_NS}{datatype}" - return enum_keys, key_namespaces + return enum_keys, key_namespaces, key_datatypes def make_subject(id_val): @@ -68,13 +81,16 @@ def make_predicate(key, key_namespaces=None): return f"<{ns}{key}>" -def make_object(key, value, enum_keys=None): +def make_object(key, value, enum_keys=None, key_datatypes=None): """Convert VALUE to object (URI or literal). Rules: - Type row → - Already starts with http/https/urn → (pass through) - Enum KEY → + - KEY with schema datatype → "literal"^^ (plain for xsd:string); + takes precedence over the UUID heuristic (e.g. IdentifiedObject.mRID is + a string attribute, not a reference) - UUID pattern → - Everything else → "literal" (with escaping) """ @@ -91,6 +107,12 @@ def make_object(key, value, enum_keys=None): if enum_keys and key in enum_keys: return f"<{CIM_NS}{value}>" + # Literal attribute by schema — annotate with its xsd datatype + if key_datatypes and key in key_datatypes: + escaped = value.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n') + datatype = key_datatypes[key] + return f'"{escaped}"^^<{datatype}>' if datatype else f'"{escaped}"' + # UUID reference if UUID_RE.match(value): return f""