Skip to content

Commit 53db137

Browse files
author
kristjan.vilgo
committed
Added support for REDFS to cim-diff.py
1 parent 16b4d1d commit 53db137

File tree

2 files changed

+59
-25
lines changed

2 files changed

+59
-25
lines changed

Tools/RDF_PARSER/RDF_parser.py

Lines changed: 56 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from lxml.etree import QName
1818

1919
import pandas
20+
#import dask as pandas
2021
import datetime
2122
import zipfile
2223
import uuid
@@ -90,7 +91,7 @@ def load_RDF_objects_from_XML(path_or_fileobject, debug=False):
9091

9192
# Get unique ID for loaded instance
9293
# instance_id = clean_ID(parsed_xml.find("./").attrib.values()[0]) # Let's assume that the first RDF element describes the whole document - TODO replace it with hash of whole XML
93-
instance_id = str(uuid.uuid4()) # Guarantees unique ID for each loaded instance of data
94+
instance_id = str(uuid.uuid4()) # Guarantees a unique ID for each loaded instance of data; in erroneous data the same UUID is sometimes used for multiple files
9495

9596
if debug:
9697
_, start_time = print_duration("XML loaded to tree object", start_time)
@@ -158,7 +159,7 @@ def find_all_xml(list_of_paths_to_zip_globalzip_xml, debug=False):
158159
return xml_files_list
159160

160161

161-
def load_RDF_to_list(path_or_fileobject, debug=False):
162+
def load_RDF_to_list(path_or_fileobject, debug=False, keep_ns=False):
162163
"""Parse single file to triplestore list"""
163164

164165
file_name = path_or_fileobject
@@ -173,31 +174,38 @@ def load_RDF_to_list(path_or_fileobject, debug=False):
173174
if debug:
174175
start_time = datetime.datetime.now()
175176

176-
# Lets generate list for RDF data and store the original filename under rdf:label
177-
data_list = [(str(uuid.uuid4()), "label", file_name, INSTANCE_ID)]
177+
# Lets generate list for RDF data and store the original filename under rdf:label in dcat:Distribution object
178+
ID = str(uuid.uuid4())
179+
data_list = [
180+
(ID, "Type", "Distribution", INSTANCE_ID),
181+
(ID, "label", file_name, INSTANCE_ID)
182+
]
178183

179184
# lets create all variables, so that in loops they are reused, rather than new ones are created, green thinking
180-
ID = ""
185+
#ID = ""
181186
KEY = ""
182187
VALUE = ""
188+
NS = ""
183189

184190
for RDF_object in RDF_objects:
185191

186192
ID = clean_ID(RDF_object.attrib.values()[0])
187-
# KEY = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Type' # If we would like to keep all with correct namespace
193+
# KEY = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type' # If we would like to keep all with correct namespace
188194
KEY = 'Type'
189-
VALUE = RDF_object.tag.split("}")[1]
195+
KEY_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
196+
VALUE_NS, VALUE = RDF_object.tag.split("}") #TODO - case where there is no namespace will fail, but is it realistic for RDF file?
190197
# VALUE = etree.QName(object).localname
191-
# ID_TYPE = object.attrib.keys()[0].split("}")[1] # Adds column to identifi "ID" and "about" types of ID
198+
# ID_TYPE = object.attrib.keys()[0].split("}")[1] # Adds column to identify "ID" and "about" types of ID
192199

193200
# data_list.append([ID, ID_TYPE, KEY, VALUE]) # If using ID TYPE, maybe also namespace should be kept?
194201
data_list.append((ID, KEY, VALUE, INSTANCE_ID))
195202

196203
for element in RDF_object.iterchildren():
197204

198-
KEY = element.tag.split("}")[1]
205+
KEY_NS, KEY = element.tag.split("}") #TODO - case where there is no namespace will fail, but is it realistic for RDF file?
199206
# KEY = etree.QName(element).localname
200207
VALUE = element.text
208+
VALUE_NS = ""
201209

202210
if VALUE is None and len(element.attrib.values()) > 0:
203211
VALUE = clean_ID(element.attrib.values()[0])
@@ -809,24 +817,49 @@ def update_triplet_from_tableview(data, tableview, update=True, add=True, instan
809817
pandas.DataFrame.update_triplet_from_tableview = update_triplet_from_tableview
810818

811819

812-
def get_diff(left_data, right_data, print_diff=False, file_id_key="label"):
813-
diff = left_data.merge(right_data, on=["ID", "KEY", "VALUE"], how='outer', indicator=True, suffixes=("_OLD", "_NEW"), sort=False).query("_merge != 'both'")
820+
def remove_triplet_from_triplet(from_triplet, what_triplet, columns=("ID", "KEY", "VALUE")):
    """Return the rows of from_triplet that do not appear in what_triplet.

    Rows are compared only on ``columns``; every other column of
    ``from_triplet`` is carried through untouched and the original index
    labels of the surviving rows are preserved.

    :param from_triplet: DataFrame to remove rows from (not modified).
    :param what_triplet: DataFrame holding the rows to remove; must contain ``columns``.
    :param columns: column names used to decide whether two rows are the same.
    :return: new DataFrame equal to ``from_triplet`` minus the matching rows.
    """
    columns = list(columns)  # a tuple would be treated as a single column key by pandas

    # De-duplicate the rows to remove so the left merge below stays exactly
    # one output row per input row, in the original order.
    rows_to_remove = what_triplet[columns].drop_duplicates()

    marked = from_triplet[columns].merge(rows_to_remove, on=columns, how="left", indicator="_removal_marker")

    # Select positionally instead of dropping by index label: this works for
    # named indexes and does not remove unrelated rows that happen to share
    # an index label with a matched row.
    keep_mask = (marked["_removal_marker"] == "left_only").to_numpy()

    return from_triplet.loc[keep_mask]
814823

815-
if print_diff:
816-
changes = diff.replace({'_merge': {"left_only": "-", "right_only": "+"}}).sort_values(by=['ID', 'KEY']).query("KEY != 'label'")
817-
changes_on_left = len(changes.query("_merge == '-'"))
818-
changes_on_right = len(changes.query("_merge == '+'"))
819824

820-
for _, file_id in left_data.query("KEY == @file_id_key").VALUE.iteritems():
821-
print(f"--- {file_id}")# from-file-modification-time")
825+
def filter_triplet_by_type(triplet, type):
    """Select all rows belonging to objects of the given rdf:type.

    An object is every row sharing an ID with a row where KEY is 'Type' and
    VALUE equals ``type``. The result has a fresh 0..n-1 index.

    :param triplet: DataFrame with at least ID, KEY and VALUE columns.
    :param type: type name to select, as stored in VALUE (no namespace or prefix).
    :return: DataFrame with all rows of the matching objects, in their original order.
    """
    # NOTE: the parameter shadows the builtin `type`; the name is kept because
    # callers may pass it by keyword.
    is_wanted_type = (triplet["KEY"] == "Type") & (triplet["VALUE"] == type)

    # The same ID can carry the same Type row more than once (e.g. one object
    # present in several loaded instances); without de-duplication the inner
    # merge would multiply every row of that object.
    wanted_ids = triplet.loc[is_wanted_type, "ID"].drop_duplicates()

    return triplet.merge(wanted_ids.to_frame(), on="ID", how="inner")
822828

823-
for _, file_id in right_data.query("KEY == @file_id_key").VALUE.iteritems():
824-
print(f"+++ {file_id}")# to-file-modification-time")
825-
print(f"@@ -1,{changes_on_left} +1,{changes_on_right} @@")
826-
for _, change in (changes._merge + changes.ID + " " + changes.KEY + " " + changes.VALUE).iteritems():
827-
print(change)
828829

829-
return diff
830+
def triplet_diff(left_data, right_data):
    """Return the triplets that are present in only one of the two datasets.

    Both inputs are outer-joined on ID, KEY and VALUE. Rows found on both
    sides are discarded; the remaining rows carry a ``_merge`` column that is
    'left_only' or 'right_only'. Other columns shared by both inputs get the
    suffixes ``_OLD`` (left) and ``_NEW`` (right).
    """
    triplet_columns = ["ID", "KEY", "VALUE"]

    merged = left_data.merge(
        right_data,
        how="outer",
        on=triplet_columns,
        suffixes=("_OLD", "_NEW"),
        indicator=True,
        sort=False,
    )

    only_on_one_side = merged["_merge"] != "both"
    return merged[only_on_one_side]
833+
834+
835+
def print_triplet_diff(left_data, right_data, file_id_object="Distribution", file_id_key="label", exclude_objects=None):
    """Print the difference between two triplet datasets in a unified-diff like format.

    The diff is per triplet (ID KEY VALUE), not per line of the source file.
    Objects of type ``file_id_object`` are taken out of the diff body and used
    only for the ``---`` / ``+++`` header lines.

    :param left_data: original triplet DataFrame (its rows are printed with "-").
    :param right_data: changed triplet DataFrame (its rows are printed with "+").
    :param file_id_object: rdf:type of the object that describes the source file.
    :param file_id_key: KEY of that object whose VALUE is printed as the file name.
    :param exclude_objects: optional iterable of rdf:type names to leave out of the diff; None or empty excludes nothing.
    :return: None, the report is written to stdout.
    """
    diff = triplet_diff(left_data, right_data)

    # The merge indicator is categorical; turn it into plain strings before
    # relabelling so it can be compared and concatenated with the text columns.
    diff_markers = diff["_merge"].astype(str).replace({"left_only": "-", "right_only": "+"})
    changes = diff.assign(_merge=diff_markers).sort_values(by=["ID", "KEY"])

    # The file describing objects always differ (new UUID on every load), keep them for the header only
    file_id_data = filter_triplet_by_type(changes, file_id_object)
    changes = remove_triplet_from_triplet(changes, file_id_data)
    print(f"INFO - removed {file_id_object} from diff")

    for object_name in exclude_objects or ():
        excluded_data = filter_triplet_by_type(changes, object_name)
        changes = remove_triplet_from_triplet(changes, excluded_data)
        print(f"INFO - removed {object_name} from diff")

    # Header: original file name(s) first, then changed file name(s)
    file_names = file_id_data[file_id_data["KEY"] == file_id_key]
    for marker in ("-", "+"):
        for file_id in file_names.loc[file_names["_merge"] == marker, "VALUE"]:
            print(f"{marker * 3} {file_id}")

    changes_on_left = int((changes["_merge"] == "-").sum())
    changes_on_right = int((changes["_merge"] == "+").sum())
    print(f"@@ -1,{changes_on_left} +1,{changes_on_right} @@")

    # Iterating the Series directly replaces Series.iteritems(), which no longer exists in pandas 2.x
    for change in changes["_merge"] + changes["ID"] + " " + changes["KEY"] + " " + changes["VALUE"]:
        print(change)
862+
830863
# changes = changes.replace({'_merge': {"left_only": "-", "right_only": "+"}})
831864

832865
def export_to_networkx(data):

Tools/RDF_PARSER/examples/cim-diff.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,17 @@
33
import argparse
44

55
sys.path.append("..")
6-
from RDF_parser import get_diff, load_all_to_dataframe
6+
from RDF_parser import print_triplet_diff, load_all_to_dataframe
77

88
parser = argparse.ArgumentParser(description="""Create diff in Unified format for XML RDF CIM files. Diff is per object (ID KEY VALUE) not per XML line in file. The input can be xml, zip(xml), zip(zip(xml))""",
99
epilog="""Copyright (c) Kristjan Vilgo 2021; Licence: GPL 2.0""")
1010
parser.add_argument('original_file', type=str, help='Original file path')
1111
parser.add_argument('changed_file', type=str, help='Changed file path')
12+
parser.add_argument('-ex', '--exclude_objects', nargs='+', help='Names of rdf:Description rdf:type-s without namespace or prefix to be excluded from diff')
1213

1314
arg = parser.parse_args()
1415

15-
get_diff(load_all_to_dataframe([arg.original_file]), load_all_to_dataframe([arg.changed_file]), print_diff=True)
16+
print_triplet_diff(load_all_to_dataframe([arg.original_file]), load_all_to_dataframe([arg.changed_file]), exclude_objects=arg.exclude_objects)
1617

1718
# Example Use
1819
# python cim-diff.py K:\PROJEKT\ER_EJK_FSYSTEM\TSM_models\eq\20210512T2330Z_ELERING_EQ_001.zip K:\PROJEKT\ER_EJK_FSYSTEM\TSM_models\eq\20210516T2330Z_ELERING_EQ_001.zip

0 commit comments

Comments
 (0)