1
+ """A script to populate the 'validated_runs' field in data records."""
1
2
import json
2
3
import os
3
4
import re
4
5
6
+
5
7
def build_validation_lookup (directory ):
6
- """
7
- Scans all 'validated-runs' files to build a lookup table mapping
8
- a record's recid to its validation type ('full' or 'muonsonly').
8
+ """Scan all 'validated-runs' files to build a lookup table.
9
+
10
+ This function maps a record's recid to its validation type ('full' or 'muonsonly').
9
11
"""
10
12
validation_lookup = {}
11
- validated_files_regex = re .compile (r' .*validated-runs.*\.json' , re .IGNORECASE )
12
- muons_only_regex = re .compile (r' only valid muons' , re .IGNORECASE )
13
+ validated_files_regex = re .compile (r" .*validated-runs.*\.json" , re .IGNORECASE )
14
+ muons_only_regex = re .compile (r" only valid muons" , re .IGNORECASE )
13
15
14
16
for filename in os .listdir (directory ):
15
17
if validated_files_regex .match (filename ):
16
18
filepath = os .path .join (directory , filename )
17
19
try :
18
- with open (filepath , 'r' , encoding = ' utf-8' ) as f :
20
+ with open (filepath , "r" , encoding = " utf-8" ) as f :
19
21
data = json .load (f )
20
22
21
23
records_to_process = []
@@ -28,13 +30,15 @@ def build_validation_lookup(directory):
28
30
if not isinstance (record , dict ):
29
31
continue
30
32
31
- recid = record .get (' recid' )
32
- title = record .get (' title' , '' )
33
- title_additional = record .get (' title_additional' , '' )
33
+ recid = record .get (" recid" )
34
+ title = record .get (" title" , "" )
35
+ title_additional = record .get (" title_additional" , "" )
34
36
35
37
if recid and (title or title_additional ):
36
38
validation_type = "full"
37
- if muons_only_regex .search (title ) or muons_only_regex .search (title_additional ):
39
+ if muons_only_regex .search (title ) or muons_only_regex .search (
40
+ title_additional
41
+ ):
38
42
validation_type = "muonsonly"
39
43
validation_lookup [str (recid )] = validation_type
40
44
@@ -45,18 +49,22 @@ def build_validation_lookup(directory):
45
49
46
50
return validation_lookup , validated_files_regex
47
51
52
+
48
53
def fix_and_add_validated_runs (directory , validation_lookup , validated_files_regex ):
54
+ """Add a 'validated_runs' field to records based on a lookup table.
55
+
56
+ This function uses a pre-built validation lookup table and skips all
57
+ validated-run files themselves.
49
58
"""
50
- Adds a 'validated_runs' field to records based on the pre-built
51
- validation lookup table.
52
- """
53
- validated_description_regex = re .compile (r'validated (runs|lumi sections)' , re .IGNORECASE )
59
+ validated_description_regex = re .compile (
60
+ r"validated (runs|lumi sections)" , re .IGNORECASE
61
+ )
54
62
55
63
for filename in os .listdir (directory ):
56
64
if not validated_files_regex .match (filename ):
57
65
filepath = os .path .join (directory , filename )
58
66
try :
59
- with open (filepath , 'r' , encoding = ' utf-8' ) as f :
67
+ with open (filepath , "r" , encoding = " utf-8" ) as f :
60
68
data = json .load (f )
61
69
62
70
records_to_process = []
@@ -67,35 +75,48 @@ def fix_and_add_validated_runs(directory, validation_lookup, validated_files_reg
67
75
68
76
modified = False
69
77
for record in records_to_process :
70
- if not isinstance (record , dict ) or ' validated_runs' in record :
78
+ if not isinstance (record , dict ) or " validated_runs" in record :
71
79
continue
72
80
73
- if ('abstract' in record and
74
- isinstance (record .get ('abstract' ), dict ) and
75
- validated_description_regex .search (record ['abstract' ].get ('description' , '' ))):
81
+ if (
82
+ "abstract" in record
83
+ and isinstance (record .get ("abstract" ), dict )
84
+ and validated_description_regex .search (
85
+ record ["abstract" ].get ("description" , "" )
86
+ )
87
+ ):
76
88
77
- links = record [' abstract' ].get (' links' , [])
89
+ links = record [" abstract" ].get (" links" , [])
78
90
if links :
79
- record [' validated_runs' ] = []
91
+ record [" validated_runs" ] = []
80
92
for link in links :
81
- link_recid = link .get (' recid' )
93
+ link_recid = link .get (" recid" )
82
94
if link_recid :
83
- validation_type = validation_lookup .get (str (link_recid ), "full" )
84
- record ['validated_runs' ].append ({
85
- "recid" : link_recid ,
86
- "validation" : validation_type
87
- })
95
+ validation_type = validation_lookup .get (
96
+ str (link_recid ), "full"
97
+ )
98
+ record ["validated_runs" ].append (
99
+ {
100
+ "recid" : link_recid ,
101
+ "validation" : validation_type ,
102
+ }
103
+ )
88
104
modified = True
89
105
if modified :
90
- final_data = records_to_process if isinstance (data , list ) else records_to_process [0 ]
91
- with open (filepath , 'w' , encoding = 'utf-8' ) as f :
106
+ final_data = (
107
+ records_to_process
108
+ if isinstance (data , list )
109
+ else records_to_process [0 ]
110
+ )
111
+ with open (filepath , "w" , encoding = "utf-8" ) as f :
92
112
json .dump (final_data , f , indent = 2 , ensure_ascii = False )
93
- f .write (' \n ' )
113
+ f .write (" \n " )
94
114
print (f"Updated validated runs in { filename } " )
95
115
96
116
except (json .JSONDecodeError , IOError ) as e :
97
117
print (f"An error occurred with { filename } : { e } " )
98
118
119
+
99
120
if __name__ == "__main__" :
100
121
data_directory = "data/records"
101
122
print ("Building validation lookup table..." )
@@ -104,4 +125,4 @@ def fix_and_add_validated_runs(directory, validation_lookup, validated_files_reg
104
125
105
126
print ("\n Processing dataset records..." )
106
127
fix_and_add_validated_runs (data_directory , validation_map , validated_files_pattern )
107
- print ("\n Script finished." )
128
+ print ("\n Script finished." )
0 commit comments