Commit 1778d3f

feat(records): populate validated_runs field in records
A data migration script was run to parse the existing `abstract` field of each dataset record and add the new structured `validated_runs` data. It targets records whose abstract description mentions either "validated runs" or "validated lumi sections", and tags every linked validated-runs record with a validation type of "full" or "muonsonly", so the field is populated consistently across all dataset records. Closes #3746
1 parent f4ccdea commit 1778d3f
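
For context, a minimal sketch of the data shape the migration produces. The record content below is hypothetical (the recid values and description text are made up for illustration), but the validated_runs entry shape of recid plus validation type is the one the script writes:

# Hypothetical dataset record before the migration: the abstract merely
# links to validated-runs records, with no structured validation info.
record_before = {
    "recid": 24100,
    "abstract": {
        "description": "Use only the validated runs listed in the linked records.",
        "links": [{"recid": 14210}, {"recid": 14211}],
    },
}

# After the migration, each linked recid is tagged with its validation
# type: "full" by default, or "muonsonly" when the linked validated-runs
# record's title mentions "only valid muons".
record_after = {
    **record_before,
    "validated_runs": [
        {"recid": 14210, "validation": "full"},
        {"recid": 14211, "validation": "muonsonly"},
    ],
}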

File tree

1 file changed: +52 -31 lines

scripts/add_validated_runs.py

Lines changed: 52 additions & 31 deletions
@@ -1,21 +1,23 @@
+"""A script to populate the 'validated_runs' field in data records."""
 import json
 import os
 import re
 
+
 def build_validation_lookup(directory):
-    """
-    Scans all 'validated-runs' files to build a lookup table mapping
-    a record's recid to its validation type ('full' or 'muonsonly').
+    """Scan all 'validated-runs' files to build a lookup table.
+
+    This function maps a record's recid to its validation type ('full' or 'muonsonly').
     """
     validation_lookup = {}
-    validated_files_regex = re.compile(r'.*validated-runs.*\.json', re.IGNORECASE)
-    muons_only_regex = re.compile(r'only valid muons', re.IGNORECASE)
+    validated_files_regex = re.compile(r".*validated-runs.*\.json", re.IGNORECASE)
+    muons_only_regex = re.compile(r"only valid muons", re.IGNORECASE)
 
     for filename in os.listdir(directory):
         if validated_files_regex.match(filename):
             filepath = os.path.join(directory, filename)
             try:
-                with open(filepath, 'r', encoding='utf-8') as f:
+                with open(filepath, "r", encoding="utf-8") as f:
                     data = json.load(f)
 
                 records_to_process = []
@@ -28,13 +30,15 @@ def build_validation_lookup(directory):
                     if not isinstance(record, dict):
                         continue
 
-                    recid = record.get('recid')
-                    title = record.get('title', '')
-                    title_additional = record.get('title_additional', '')
+                    recid = record.get("recid")
+                    title = record.get("title", "")
+                    title_additional = record.get("title_additional", "")
 
                     if recid and (title or title_additional):
                         validation_type = "full"
-                        if muons_only_regex.search(title) or muons_only_regex.search(title_additional):
+                        if muons_only_regex.search(title) or muons_only_regex.search(
+                            title_additional
+                        ):
                             validation_type = "muonsonly"
                         validation_lookup[str(recid)] = validation_type
 
@@ -45,18 +49,22 @@ def build_validation_lookup(directory):
 
     return validation_lookup, validated_files_regex
 
+
 def fix_and_add_validated_runs(directory, validation_lookup, validated_files_regex):
+    """Add a 'validated_runs' field to records based on a lookup table.
+
+    This function uses a pre-built validation lookup table and skips all
+    validated-run files themselves.
     """
-    Adds a 'validated_runs' field to records based on the pre-built
-    validation lookup table.
-    """
-    validated_description_regex = re.compile(r'validated (runs|lumi sections)', re.IGNORECASE)
+    validated_description_regex = re.compile(
+        r"validated (runs|lumi sections)", re.IGNORECASE
+    )
 
     for filename in os.listdir(directory):
         if not validated_files_regex.match(filename):
             filepath = os.path.join(directory, filename)
             try:
-                with open(filepath, 'r', encoding='utf-8') as f:
+                with open(filepath, "r", encoding="utf-8") as f:
                     data = json.load(f)
 
                 records_to_process = []
@@ -67,35 +75,48 @@ def fix_and_add_validated_runs(directory, validation_lookup, validated_files_reg
 
                 modified = False
                 for record in records_to_process:
-                    if not isinstance(record, dict) or 'validated_runs' in record:
+                    if not isinstance(record, dict) or "validated_runs" in record:
                         continue
 
-                    if ('abstract' in record and
-                        isinstance(record.get('abstract'), dict) and
-                        validated_description_regex.search(record['abstract'].get('description', ''))):
+                    if (
+                        "abstract" in record
+                        and isinstance(record.get("abstract"), dict)
+                        and validated_description_regex.search(
+                            record["abstract"].get("description", "")
+                        )
+                    ):
 
-                        links = record['abstract'].get('links', [])
+                        links = record["abstract"].get("links", [])
                         if links:
-                            record['validated_runs'] = []
+                            record["validated_runs"] = []
                            for link in links:
-                                link_recid = link.get('recid')
+                                link_recid = link.get("recid")
                                 if link_recid:
-                                    validation_type = validation_lookup.get(str(link_recid), "full")
-                                    record['validated_runs'].append({
-                                        "recid": link_recid,
-                                        "validation": validation_type
-                                    })
+                                    validation_type = validation_lookup.get(
+                                        str(link_recid), "full"
+                                    )
+                                    record["validated_runs"].append(
+                                        {
+                                            "recid": link_recid,
+                                            "validation": validation_type,
+                                        }
+                                    )
                                     modified = True
                 if modified:
-                    final_data = records_to_process if isinstance(data, list) else records_to_process[0]
-                    with open(filepath, 'w', encoding='utf-8') as f:
+                    final_data = (
+                        records_to_process
+                        if isinstance(data, list)
+                        else records_to_process[0]
+                    )
+                    with open(filepath, "w", encoding="utf-8") as f:
                         json.dump(final_data, f, indent=2, ensure_ascii=False)
-                        f.write('\n')
+                        f.write("\n")
                         print(f"Updated validated runs in {filename}")
 
             except (json.JSONDecodeError, IOError) as e:
                 print(f"An error occurred with {filename}: {e}")
 
+
 if __name__ == "__main__":
     data_directory = "data/records"
     print("Building validation lookup table...")
@@ -104,4 +125,4 @@ def fix_and_add_validated_runs(directory, validation_lookup, validated_files_reg
 
     print("\nProcessing dataset records...")
     fix_and_add_validated_runs(data_directory, validation_map, validated_files_pattern)
-    print("\nScript finished.")
+    print("\nScript finished.")
