 from lxml.etree import QName

 import pandas
+#import dask as pandas
 import datetime
 import zipfile
 import uuid
@@ -90,7 +91,7 @@ def load_RDF_objects_from_XML(path_or_fileobject, debug=False):

     # Get unique ID for loaded instance
     # instance_id = clean_ID(parsed_xml.find("./").attrib.values()[0]) # Lets assume that the first RDF element describes the whole document - TODO replace it with hash of whole XML
-    instance_id = str(uuid.uuid4())  # Guarantees unique ID for each loaded instance of data
+    instance_id = str(uuid.uuid4())  # Guarantees unique ID for each loaded instance of data; in erroneous data it happens that the same UUID is used for multiple files

     if debug:
         _, start_time = print_duration("XML loaded to tree object", start_time)
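A minimal sketch (not part of the diff) of the idea behind this hunk: the instance id is generated with uuid.uuid4() at load time rather than read from the document, so even two files that erroneously carry the same document UUID end up under distinct instance ids.

import uuid

# Hypothetical illustration: the id is generated per load, not taken from the file,
# so duplicated document UUIDs in the source data cannot collide here.
instance_id_first_load = str(uuid.uuid4())
instance_id_second_load = str(uuid.uuid4())
assert instance_id_first_load != instance_id_second_load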
@@ -158,7 +159,7 @@ def find_all_xml(list_of_paths_to_zip_globalzip_xml, debug=False):
     return xml_files_list


-def load_RDF_to_list(path_or_fileobject, debug=False):
+def load_RDF_to_list(path_or_fileobject, debug=False, keep_ns=False):
     """Parse single file to triplestore list"""

     file_name = path_or_fileobject
@@ -173,31 +174,38 @@ def load_RDF_to_list(path_or_fileobject, debug=False):
     if debug:
         start_time = datetime.datetime.now()

-    # Lets generate list for RDF data and store the original filename under rdf:label
-    data_list = [(str(uuid.uuid4()), "label", file_name, INSTANCE_ID)]
+    # Lets generate list for RDF data and store the original filename under rdf:label in dcat:Distribution object
+    ID = str(uuid.uuid4())
+    data_list = [
+        (ID, "Type", "Distribution", INSTANCE_ID),
+        (ID, "label", file_name, INSTANCE_ID)
+    ]

     # lets create all variables, so that in loops they are reused, rather than new ones are created, green thinking
-    ID = ""
+    # ID = ""
     KEY = ""
     VALUE = ""
+    NS = ""

     for RDF_object in RDF_objects:

         ID = clean_ID(RDF_object.attrib.values()[0])
-        # KEY = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Type' # If we would like to keep all with correct namespace
+        # KEY = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type' # If we would like to keep all with correct namespace
         KEY = 'Type'
-        VALUE = RDF_object.tag.split("}")[1]
+        KEY_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+        VALUE_NS, VALUE = RDF_object.tag.split("}")  # TODO - case where there is no namespace will fail, but is it realistic for RDF file?
         # VALUE = etree.QName(object).localname
-        # ID_TYPE = object.attrib.keys()[0].split("}")[1] # Adds column to identifi "ID" and "about" types of ID
+        # ID_TYPE = object.attrib.keys()[0].split("}")[1] # Adds column to identify "ID" and "about" types of ID

         # data_list.append([ID, ID_TYPE, KEY, VALUE]) # If using ID TYPE, maybe also namespace should be kept?
         data_list.append((ID, KEY, VALUE, INSTANCE_ID))

         for element in RDF_object.iterchildren():

-            KEY = element.tag.split("}")[1]
+            KEY_NS, KEY = element.tag.split("}")  # TODO - case where there is no namespace will fail, but is it realistic for RDF file?
             # KEY = etree.QName(element).localname
             VALUE = element.text
+            VALUE_NS = ""

             if VALUE is None and len(element.attrib.values()) > 0:
                 VALUE = clean_ID(element.attrib.values()[0])
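To make the new triplet layout concrete, here is a small, self-contained sketch (illustrative values only, not part of the diff) of the rows load_RDF_to_list now produces: a generated Distribution/label pair describing the file itself, plus a Type row per RDF object whose namespace and local name come from splitting the lxml tag on "}".

import uuid

INSTANCE_ID = str(uuid.uuid4())  # one generated id per loaded file
tag = "{http://iec.ch/TC57/2013/CIM-schema-cim16#}ACLineSegment"  # hypothetical lxml element tag
VALUE_NS, VALUE = tag.split("}")  # note: VALUE_NS keeps the leading "{"

file_id = str(uuid.uuid4())
data_list = [
    (file_id, "Type", "Distribution", INSTANCE_ID),    # file header object added by this change
    (file_id, "label", "example_file.xml", INSTANCE_ID),
    ("_example-mrid", "Type", VALUE, INSTANCE_ID),     # one Type row per RDF object
]
print(data_list)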
@@ -809,24 +817,49 @@ def update_triplet_from_tableview(data, tableview, update=True, add=True, instan
 pandas.DataFrame.update_triplet_from_tableview = update_triplet_from_tableview


-def get_diff(left_data, right_data, print_diff=False, file_id_key="label"):
-    diff = left_data.merge(right_data, on=["ID", "KEY", "VALUE"], how='outer', indicator=True, suffixes=("_OLD", "_NEW"), sort=False).query("_merge != 'both'")
+def remove_triplet_from_triplet(from_triplet, what_triplet, columns=["ID", "KEY", "VALUE"]):
+    """Returns from_triplet - what_triplet"""
+    return from_triplet.drop(from_triplet.reset_index().merge(what_triplet[columns], on=columns, how="inner")["index"], axis=0)

-    if print_diff:
-        changes = diff.replace({'_merge': {"left_only": "-", "right_only": "+"}}).sort_values(by=['ID', 'KEY']).query("KEY != 'label'")
-        changes_on_left = len(changes.query("_merge == '-'"))
-        changes_on_right = len(changes.query("_merge == '+'"))

-        for _, file_id in left_data.query("KEY == @file_id_key").VALUE.iteritems():
-            print(f"--- {file_id}")  # from-file-modification-time")
+def filter_triplet_by_type(triplet, type):
+    """Filter out all objects data by rdf:type"""
+    return triplet.merge(triplet.query("KEY == 'Type' and VALUE == @type").ID)

-        for _, file_id in right_data.query("KEY == @file_id_key").VALUE.iteritems():
-            print(f"+++ {file_id}")  # to-file-modification-time")
-        print(f"@@ -1,{changes_on_left} +1,{changes_on_right} @@")
-        for _, change in (changes._merge + changes.ID + " " + changes.KEY + " " + changes.VALUE).iteritems():
-            print(change)

-    return diff
+def triplet_diff(left_data, right_data):
+
+    return left_data.merge(right_data, on=["ID", "KEY", "VALUE"], how='outer', indicator=True, suffixes=("_OLD", "_NEW"), sort=False).query("_merge != 'both'")
+
+
+def print_triplet_diff(left_data, right_data, file_id_object="Distribution", file_id_key="label", exclude_objects=[]):
+
+    diff = triplet_diff(left_data, right_data)
+
+    changes = diff.replace({'_merge': {"left_only": "-", "right_only": "+"}}).sort_values(by=['ID', 'KEY'])
+
+    file_id_data = filter_triplet_by_type(changes, file_id_object)
+    changes = remove_triplet_from_triplet(changes, file_id_data)
+    print(f"INFO - removed {file_id_object} from diff")
+
+    if exclude_objects:
+        for object_name in exclude_objects:
+            excluded_data = filter_triplet_by_type(changes, object_name)
+            changes = remove_triplet_from_triplet(changes, excluded_data)
+            print(f"INFO - removed {object_name} from diff")
+
+    for _, file_id in file_id_data.query("KEY == @file_id_key and _merge == '-'").VALUE.iteritems():
+        print(f"--- {file_id}")  # from-file-modification-time")
+
+    for _, file_id in file_id_data.query("KEY == @file_id_key and _merge == '+'").VALUE.iteritems():
+        print(f"+++ {file_id}")  # to-file-modification-time")
+
+    changes_on_left = len(changes.query("_merge == '-'"))
+    changes_on_right = len(changes.query("_merge == '+'"))
+    print(f"@@ -1,{changes_on_left} +1,{changes_on_right} @@")
+    for _, change in (changes._merge + changes.ID + " " + changes.KEY + " " + changes.VALUE).iteritems():
+        print(change)
+

 # changes = changes.replace({'_merge': {"left_only": "-", "right_only": "+"}})

 def export_to_networkx(data):
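As a usage note, here is a minimal self-contained sketch (made-up data, not from the diff) of what triplet_diff computes: an outer merge of two ID/KEY/VALUE triplet frames that keeps only rows present on one side, which print_triplet_diff then formats in a unified-diff style.

import pandas

left = pandas.DataFrame([
    ("a1", "Type", "ACLineSegment", "old"),
    ("a1", "name", "Line 1", "old"),
], columns=["ID", "KEY", "VALUE", "INSTANCE_ID"])

right = pandas.DataFrame([
    ("a1", "Type", "ACLineSegment", "new"),
    ("a1", "name", "Line 1 renamed", "new"),
], columns=["ID", "KEY", "VALUE", "INSTANCE_ID"])

# Same outer-merge-with-indicator pattern used by triplet_diff() above
diff = left.merge(right, on=["ID", "KEY", "VALUE"], how="outer",
                  indicator=True, suffixes=("_OLD", "_NEW"), sort=False).query("_merge != 'both'")
print(diff)  # the changed 'name' value appears once as left_only and once as right_only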