diff --git a/lib/termium.rb b/lib/termium.rb index a6d19a2..48cd130 100644 --- a/lib/termium.rb +++ b/lib/termium.rb @@ -3,10 +3,10 @@ require "glossarist" require "lutaml/model" -require "lutaml/model/xml_adapter/nokogiri_adapter" +require "lutaml/model/xml/nokogiri_adapter" Lutaml::Model::Config.configure do |config| - config.xml_adapter = Lutaml::Model::XmlAdapter::NokogiriAdapter + config.xml_adapter = Lutaml::Model::Xml::NokogiriAdapter end module Termium diff --git a/lib/termium/abbreviation.rb b/lib/termium/abbreviation.rb index c1b482c..f7150a9 100644 --- a/lib/termium/abbreviation.rb +++ b/lib/termium/abbreviation.rb @@ -52,5 +52,17 @@ def to_h set end + + def to_designation + attrs = { + designation: value, + normative_status: deprecated ? "deprecated" : "preferred", + } + + attrs[:gender] = gender if gender + attrs[:part_of_speech] = part_of_speech if part_of_speech + + Glossarist::Designation::Abbreviation.new(attrs) + end end end diff --git a/lib/termium/core.rb b/lib/termium/core.rb index 69f302b..3a3df09 100644 --- a/lib/termium/core.rb +++ b/lib/termium/core.rb @@ -47,10 +47,13 @@ def uuid(str = identification_number) # details="Compartment - ISO/IEC JTC 1 Information Technology Vocabulary" /> def to_concept(options = {}) Glossarist::ManagedConcept.new.tap do |concept| - # The way to set the universal concept's identifier: data.identifier - concept.id = identification_number + # V2: Create new data object to ensure it's serialized (not marked as default) + concept.data = Glossarist::ManagedConceptData.new( + id: identification_number, + sources: concept_sources + ) - concept.uuid = uuid + concept.id = uuid # Assume no related concepts concept.related = [] @@ -60,19 +63,19 @@ def to_concept(options = {}) concept.date_accepted = options[:date_accepted] end - language_module.map do |lang_mod| + language_module.each do |lang_mod| localized_concept = lang_mod.to_concept(options) # TODO: This is needed to skip the empty french entries of 10031781 and 10031778 next if localized_concept.nil? - localized_concept.id = identification_number - localized_concept.uuid = uuid("#{identification_number}-#{lang_mod.language}") + localized_concept.data.id = identification_number + localized_concept.id = uuid("#{identification_number}-#{lang_mod.language}") universal_entry.each do |entry| - localized_concept.notes << Glossarist::DetailedDefinition.new(content: entry.value) + localized_concept.data.notes << Glossarist::DetailedDefinition.new(content: entry.value) end - localized_concept.sources = concept_sources + localized_concept.data.sources = concept_sources concept.add_localization(localized_concept) end end diff --git a/lib/termium/entry_term.rb b/lib/termium/entry_term.rb index 288df5d..485acce 100644 --- a/lib/termium/entry_term.rb +++ b/lib/termium/entry_term.rb @@ -86,5 +86,19 @@ def to_h set end + + def to_designation + attrs = { + designation: value, + normative_status: normative_status, + } + + attrs[:geographical_area] = geographical_area if geographical_area + attrs[:plurality] = plurality if plurality + attrs[:gender] = gender if gender + attrs[:part_of_speech] = part_of_speech if part_of_speech + + Glossarist::Designation::Expression.new(attrs) + end end end diff --git a/lib/termium/extract.rb b/lib/termium/extract.rb index 9177061..501ea70 100644 --- a/lib/termium/extract.rb +++ b/lib/termium/extract.rb @@ -30,8 +30,9 @@ class Extract < Lutaml::Model::Serializable def to_concept(options = {}) coll = Glossarist::ManagedConceptCollection.new - coll.managed_concepts = core.map do |managed_concept| - managed_concept.to_concept(options) + core.each do |managed_concept| + concept = managed_concept.to_concept(options) + coll.store(concept) end coll end diff --git a/lib/termium/language_module.rb b/lib/termium/language_module.rb index 7b6c346..53cf667 100644 --- a/lib/termium/language_module.rb +++ b/lib/termium/language_module.rb @@ -47,7 +47,7 @@ def abbreviations def designations # NOTE: entry_term is a collection - entry_term + abbreviations + (entry_term + abbreviations).compact end def to_h @@ -69,17 +69,22 @@ def to_h end def to_concept(options = {}) - x = to_h - return nil unless x + return nil unless definition + + Glossarist::LocalizedConcept.new.tap do |concept| + concept.data = Glossarist::ConceptData.new( + language_code: LANGUAGE_CODE_MAPPING[language.downcase], + terms: designations.map(&:to_designation), + definition: [Glossarist::DetailedDefinition.new(content: definition)], + notes: notes.map { |n| Glossarist::DetailedDefinition.new(content: n) }, + examples: examples.map { |e| Glossarist::DetailedDefinition.new(content: e) }, + entry_status: "valid", + domain: domain + ) - Glossarist::LocalizedConcept.new(x).tap do |concept| - # Fill in register parameters if options[:date_accepted] - puts options[:date_accepted].inspect concept.date_accepted = options[:date_accepted] end - - puts concept.inspect end end end diff --git a/lib/termium/source.rb b/lib/termium/source.rb index 1dfe9a0..78a8414 100644 --- a/lib/termium/source.rb +++ b/lib/termium/source.rb @@ -25,11 +25,11 @@ def content end def to_concept_source - Glossarist::ConceptSource.new({ - "type" => "lineage", - "ref" => content, - "status" => "identical", - }) + Glossarist::ConceptSource.new( + type: "lineage", + status: "identical", + origin: Glossarist::Citation.new(ref: content) + ) end end end diff --git a/spec/fixtures/expected_v2_output/concept/4d2d4ac4-af47-545c-9d41-19285d785fc5.yaml b/spec/fixtures/expected_v2_output/concept/4d2d4ac4-af47-545c-9d41-19285d785fc5.yaml new file mode 100644 index 0000000..d78d408 --- /dev/null +++ b/spec/fixtures/expected_v2_output/concept/4d2d4ac4-af47-545c-9d41-19285d785fc5.yaml @@ -0,0 +1,17 @@ +--- +data: + identifier: '2123225' + localized_concepts: + eng: 33f0da1d-b8af-511c-8a7a-2f777419fa95 + fre: 848965b7-2090-5f93-af30-8cba405c248c + sources: + - origin: + ref: ISO/IEC 2382-16:1996 + status: identical + type: lineage + - origin: + ref: Ranger, Natalie * 2006 * Bureau de la traduction + status: identical + type: lineage +id: 4d2d4ac4-af47-545c-9d41-19285d785fc5 +status: valid diff --git a/spec/fixtures/expected_v2_output/localized_concept/33f0da1d-b8af-511c-8a7a-2f777419fa95.yaml b/spec/fixtures/expected_v2_output/localized_concept/33f0da1d-b8af-511c-8a7a-2f777419fa95.yaml new file mode 100644 index 0000000..bd264f4 --- /dev/null +++ b/spec/fixtures/expected_v2_output/localized_concept/33f0da1d-b8af-511c-8a7a-2f777419fa95.yaml @@ -0,0 +1,25 @@ +--- +data: + definition: + - content: quotient of the character mean entropy by the mean duration of a character + examples: [] + id: '2123225' + notes: + - content: The average information rate may be expressed in shannons per second. + - content: 16.04.07 (2382) + sources: + - origin: + ref: ISO/IEC 2382-16:1996 + status: identical + type: lineage + - origin: + ref: Ranger, Natalie * 2006 * Bureau de la traduction + status: identical + type: lineage + terms: + - type: expression + normative_status: preferred + designation: average information rate + language_code: eng + entry_status: valid +id: 33f0da1d-b8af-511c-8a7a-2f777419fa95 diff --git a/spec/fixtures/expected_v2_output/localized_concept/848965b7-2090-5f93-af30-8cba405c248c.yaml b/spec/fixtures/expected_v2_output/localized_concept/848965b7-2090-5f93-af30-8cba405c248c.yaml new file mode 100644 index 0000000..0ee6057 --- /dev/null +++ b/spec/fixtures/expected_v2_output/localized_concept/848965b7-2090-5f93-af30-8cba405c248c.yaml @@ -0,0 +1,26 @@ +--- +data: + definition: + - content: quotient de l'entropie moyenne par caractère par la durée moyenne d'un + caractère + examples: [] + id: '2123225' + notes: + - content: Le débit moyen d'entropie peut s'exprimer en shannons par seconde. + - content: 16.04.07 (2382) + sources: + - origin: + ref: ISO/IEC 2382-16:1996 + status: identical + type: lineage + - origin: + ref: Ranger, Natalie * 2006 * Bureau de la traduction + status: identical + type: lineage + terms: + - type: expression + normative_status: preferred + designation: débit moyen d'entropie + language_code: fre + entry_status: valid +id: 848965b7-2090-5f93-af30-8cba405c248c diff --git a/spec/fixtures/single_entry.xml b/spec/fixtures/single_entry.xml new file mode 100644 index 0000000..b6ec5f2 --- /dev/null +++ b/spec/fixtures/single_entry.xml @@ -0,0 +1,45 @@ + + + + + + + + + + + + + quotient of the character mean entropy by the mean duration of a character + + + + The average information rate may be expressed in shannons per second. + + + + + + + + + + + + quotient de l'entropie moyenne par caractère par la durée moyenne d'un caractère + + + + Le débit moyen d'entropie peut s'exprimer en shannons par seconde. + + + + + 16.04.07 (2382) + + + + + + + diff --git a/spec/termium_spec.rb b/spec/termium_spec.rb index caea42d..9984f6e 100644 --- a/spec/termium_spec.rb +++ b/spec/termium_spec.rb @@ -1,17 +1,137 @@ # frozen_string_literal: true +require 'yaml' + RSpec.describe Termium do + let(:termium_extract_file) { fixtures_path('Characters.xml') } + let(:glossarist_output_dir) { fixtures_path('Characters-Glossarist') } + + before do + FileUtils.mkdir_p(glossarist_output_dir) + end + + after do + FileUtils.rm_rf(glossarist_output_dir) + end + + describe 'V2 format conversion' do + let(:termium_extract) { Termium::Extract.from_xml(File.read(termium_extract_file)) } + let(:glossarist_col) { termium_extract.to_concept } + + before do + glossarist_col.save_to_files(glossarist_output_dir) + end + + it 'creates concept and localized_concept directories' do + expect(Dir.exist?(File.join(glossarist_output_dir, 'concept'))).to be true + expect(Dir.exist?(File.join(glossarist_output_dir, 'localized_concept'))).to be true + end + + it 'creates concept files with V2 structure' do + concept_files = Dir.glob(File.join(glossarist_output_dir, 'concept', '*.yaml')) + expect(concept_files).not_to be_empty + + concept_files.each do |file| + concept = YAML.safe_load(File.read(file), permitted_classes: [Date, Time]) + + # V2: concept must have id (UUID) at root level + expect(concept).to have_key('id') + expect(concept['id']).to match(/^[0-9a-f-]{36}$/) + + # V2: concept must have data with identifier and localized_concepts + expect(concept).to have_key('data') + expect(concept['data']).to have_key('identifier') + expect(concept['data']).to have_key('localized_concepts') + expect(concept['data']['localized_concepts']).to be_a(Hash) + end + end + + it 'creates localized_concept files with V2 structure' do + localized_files = Dir.glob(File.join(glossarist_output_dir, 'localized_concept', '*.yaml')) + expect(localized_files).not_to be_empty + + localized_files.each do |file| + localized = YAML.safe_load(File.read(file), permitted_classes: [Date, Time]) + + # V2: localized concept must have id (UUID) at root level + expect(localized).to have_key('id') + expect(localized['id']).to match(/^[0-9a-f-]{36}$/) + + # V2: localized concept must have data with language_code and terms + expect(localized).to have_key('data') + expect(localized['data']).to have_key('language_code') + expect(localized['data']['language_code']).to match(/^[a-z]{3}$/) + expect(localized['data']).to have_key('terms') + expect(localized['data']['terms']).to be_an(Array) + end + end + + it 'links concepts to localized concepts via UUID' do + concept_files = Dir.glob(File.join(glossarist_output_dir, 'concept', '*.yaml')) + localized_files = Dir.glob(File.join(glossarist_output_dir, 'localized_concept', '*.yaml')) + + localized_uuids = localized_files.map do |file| + YAML.safe_load(File.read(file), permitted_classes: [Date, Time])['id'] + end + + concept_files.each do |file| + concept = YAML.safe_load(File.read(file), permitted_classes: [Date, Time]) + referenced_uuids = concept['data']['localized_concepts'].values + + referenced_uuids.each do |uuid| + expect(localized_uuids).to include(uuid) + end + end + end + end + + describe 'V2 file comparison' do + let(:single_entry_file) { fixtures_path('single_entry.xml') } + let(:expected_output_dir) { fixtures_path('expected_v2_output') } + let(:actual_output_dir) { fixtures_path('actual_v2_output') } + + before do + FileUtils.mkdir_p(actual_output_dir) + + extract = Termium::Extract.from_xml(File.read(single_entry_file)) + collection = extract.to_concept + collection.save_to_files(actual_output_dir) + end + + after do + FileUtils.rm_rf(actual_output_dir) + end + + it 'generates V2 output identical to expected fixtures' do + %w[concept localized_concept].each do |subdir| + expected_files = Dir.glob(File.join(expected_output_dir, subdir, '*.yaml')) + actual_files = Dir.glob(File.join(actual_output_dir, subdir, '*.yaml')) + + expect(actual_files.size).to eq(expected_files.size), + "Expected #{expected_files.size} #{subdir} files, got #{actual_files.size}" + + expected_files.each do |expected_file| + filename = File.basename(expected_file) + actual_file = File.join(actual_output_dir, subdir, filename) + + expect(File.exist?(actual_file)).to be(true), + "Expected #{subdir} file #{filename} not found in actual output" + + expected_raw = File.read(expected_file) + actual_raw = File.read(actual_file) + + expected_content = YAML.safe_load(expected_raw, permitted_classes: [Date, Time]) + actual_content = YAML.safe_load(actual_raw, permitted_classes: [Date, Time]) + + expect(actual_raw).to eq(expected_raw), + "Raw YAML content differs for #{subdir}/#{filename}" - # let(:concept_folder) { "concept_collection_v2" } - # let(:concept_files) { Dir.glob(File.join(fixtures_path(concept_folder), "concept", "*.{yaml,yml}")) } - # let(:localized_concepts_folder) { File.join(fixtures_path(concept_folder), "localized_concept") } - - let(:termium_extract_file) { fixtures_path("Characters.xml") } - let(:glossarist_output_file) { fixtures_path("Characters-Glossarist") } - it "does something useful" do - termium_extract = Termium::Extract.from_xml(IO.read(termium_extract_file)) - glossarist_col = termium_extract.to_concept - FileUtils.mkdir_p(glossarist_output_file) - glossarist_col.save_to_files(glossarist_output_file) + expect(actual_content).to eq(expected_content), + "#{subdir}/#{filename} differs from expected:\n" \ + "Expected:\n#{expected_content.to_yaml}\n" \ + "Actual:\n#{actual_content.to_yaml}" + end + end + end end end