Skip to content

Commit 78caa2a

Browse files
committed
ci(check-mapping): generalised license checker and added cli params
1 parent 0bf9dc5 commit 78caa2a

File tree

5 files changed

+199
-112
lines changed

5 files changed

+199
-112
lines changed

.github/workflows/ci.yml

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,20 +53,35 @@ jobs:
5353
pip install click
5454
./run-tests.sh --data-json
5555
56-
data-licenses:
56+
data-mapping:
5757
runs-on: ubuntu-24.04
5858
steps:
5959
- name: Checkout
6060
uses: actions/checkout@v4
61+
with:
62+
fetch-depth: 0
6163

6264
- name: Setup Python
6365
uses: actions/setup-python@v5
6466
with:
6567
python-version: 3.9
6668

67-
- name: Check data licenses
69+
- name: Fetch base branch
70+
run: |
71+
git fetch origin ${{ github.event.pull_request.base.ref }}
72+
73+
- name: Check mapping of all records
74+
run: |
75+
pip install click pyyaml
76+
./run-tests.sh --data-mapping
77+
78+
- name: Check mapping of changed records
79+
if: github.event_name == 'pull_request'
6880
run: |
69-
./run-tests.sh --data-licenses
81+
CHANGED_FILES=$(git diff --name-only origin/${{ github.event.pull_request.base.ref }}...HEAD data/records/)
82+
echo "Changed files: $CHANGED_FILES"
83+
pip install click pyyaml
84+
./run-tests.sh --data-mapping ${CHANGED_FILES}
7085
7186
data-recids:
7287
runs-on: ubuntu-24.04

run-tests.sh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,9 @@ data_json() {
3939
find . -name "*.json" -exec ./scripts/clean_json_file.py --check {} \+
4040
}
4141

42-
data_licenses() {
43-
scripts/check_licenses.py
42+
# Run the record mapping check, forwarding any remaining arguments
# (file paths or patterns) to scripts/check_mapping.py.
data_mapping() {
    # The option dispatcher calls this function with the script's argument
    # list (presumably starting with the selecting flag), so the first entry
    # is dropped. `all` calls it with no arguments at all, where an
    # unconditional `shift` fails, hence the guard.
    if [ "$#" -gt 0 ]; then
        shift
    fi
    scripts/check_mapping.py "$@"
}
4546

4647
data_recids() {
@@ -151,7 +152,7 @@ lint_yamllint() {
151152
all() {
152153
data_dois
153154
data_json
154-
data_licenses
155+
data_mapping
155156
data_recids
156157
data_slugs
157158
data_types
@@ -173,7 +174,7 @@ help() {
173174
echo " --all Perform all checks [default]"
174175
echo " --data-dois Check data DOIs"
175176
echo " --data-json Check data JSON"
176-
echo " --data-licenses Check data licenses"
177+
echo " --data-mapping Check data mapping"
177178
echo " --data-recids Check data record IDs "
178179
echo " --data-slugs Check data slugs"
179180
echo " --data-types Check data types"
@@ -201,7 +202,7 @@ case $arg in
201202
--help) help ;;
202203
--data-dois) data_dois ;;
203204
--data-json) data_json ;;
204-
--data-licenses) data_licenses ;;
205+
--data-mapping) data_mapping "$@" ;;
205206
--data-recids) data_recids ;;
206207
--data-slugs) data_slugs ;;
207208
--data-types) data_types ;;

scripts/check_licenses.py

Lines changed: 0 additions & 104 deletions
This file was deleted.

scripts/check_mapping.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/usr/bin/env python
2+
3+
"""Check that record fields match the allowed values defined in a mapping file."""
4+
5+
import asyncio
6+
import dataclasses
7+
import itertools
8+
import json
9+
import logging
10+
import os
11+
import time
12+
import typing
13+
from glob import glob
14+
from pathlib import Path
15+
16+
import click
17+
import yaml
18+
19+
# Event loop shared by `command` (which runs and closes it) and `main`.
# A loop is created explicitly: calling `asyncio.get_event_loop()` while no
# loop is running is deprecated and raises on recent Python versions.
LOOP = asyncio.new_event_loop()
# Default mapping file, resolved relative to the current working directory.
MAPPING = Path(os.getcwd()) / "scripts" / "record_mapping.yaml"
# Default glob pattern of the record files to validate.
FILES = Path(os.getcwd()) / "data" / "records" / "*"
22+
23+
24+
@dataclasses.dataclass
class InvalidRecord:
    """One validation failure reported for a record in a checked file."""

    # Value of the record's "recid" field, or None when it is not available.
    recid: typing.Union[str, None]
    # File the failing record was read from.
    path: Path
    # Human-readable description of what failed.
    msg: str
32+
33+
@click.command()
@click.option(
    "-m",
    "--mapping",
    default=MAPPING,
    # `exists=True` turns a missing mapping file into a click usage error
    # instead of a traceback from `open()` later on.
    type=click.Path(exists=True, readable=True, path_type=Path, dir_okay=False),
    help="Path to the YAML mapping file to check records against.",
)
@click.option(
    "-v", "--verbose", default=False, is_flag=True, help="Print verbose output."
)
# FILES are not required to exist literally because they may be glob patterns.
@click.argument("files", type=click.Path(readable=True, path_type=Path), nargs=-1)
def command(**kwargs):
    """Validate files at the supplied paths. Arguments support unix-like patterns."""
    # NOTE: the docstring above is what click prints as the command help text.
    try:
        LOOP.run_until_complete(main(**kwargs))
    finally:
        # Close the shared loop even when `main` exits via SystemExit.
        LOOP.close()
51+
52+
53+
async def main(mapping, verbose, files) -> None:
    """Validate record fields against a defined mapping.

    Args:
        mapping: Path of the YAML file describing the allowed record values.
        verbose: When true, log at DEBUG level instead of INFO.
        files: Paths or glob patterns of the files to check; when empty, the
            module-level ``FILES`` pattern is used.

    Raises:
        SystemExit: Always. The exit code is 0 when no validation errors were
            found and 1 otherwise.
    """
    start_time = time.perf_counter()
    files = files or (FILES,)

    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level, format="[%(levelname)s] %(message)s")

    def load_mapping(source) -> dict:
        """Parse the YAML mapping, closing the file handle afterwards."""
        with open(source, "r", encoding="utf-8") as handle:
            return yaml.safe_load(handle)

    logging.info("Loading mapping file...")
    # Use the loop that is actually running this coroutine rather than a
    # module-level global; the blocking file read is pushed to the executor.
    loop = asyncio.get_running_loop()
    mapping = await loop.run_in_executor(None, load_mapping, mapping)

    # Every argument is expanded as a glob pattern; patterns matching nothing
    # simply contribute no paths.
    matches = itertools.chain.from_iterable(glob(str(f)) for f in files)
    paths = [Path(match) for match in matches]
    logging.info("Found %d files. Validating...", len(paths))

    # gather() schedules each coroutine as a task on the running loop.
    results = await asyncio.gather(
        *(validate_single(path, mapping) for path in paths)
    )

    finish = f"within {time.perf_counter() - start_time:.2f} seconds"
    logging.info(
        "Validated %d files (%d records) %s.",
        len(paths),
        sum(count for count, _, _ in results),
        finish,
    )

    # Keep only the files that produced at least one error.
    errors = {p: e for _, e, p in results if e}
    if not errors:
        logging.info("All files validated successfully. No errors found.")
        # Raise SystemExit directly: the bare `exit()` helper is injected by
        # the `site` module and is not meant to be used in programs.
        raise SystemExit(0)

    logging.error(
        "Found %d errors in %d files.",
        sum(len(e) for e in errors.values()),
        len(errors),
    )

    for p, err in errors.items():
        logging.error("File %s has %d errors:", p.name, len(err))

        for e in err:
            logging.error("  - %s: %s", e.recid or "UNSET", e.msg)

    raise SystemExit(1)
99+
100+
101+
async def validate_single(
    path: Path, mapping: dict
) -> "tuple[int, list[InvalidRecord], Path]":
    """Validate a single file against the mapping schema.

    The file is read as JSON and is expected to hold a list of records. Each
    record is checked recursively against ``mapping``: a dict in the mapping
    describes keys to look up (a leading ``?`` marks a key as optional), and
    any other value is the allowed value, or list of allowed values, for the
    field.

    Args:
        path: File to load and validate.
        mapping: Validation schema loaded from the mapping file.

    Returns:
        A tuple of the number of records read, the list of validation errors
        and ``path`` itself.
    """
    errors = []

    def load_json(source: Path):
        """Read and parse the file, closing the handle deterministically."""
        with open(source, "rb") as handle:
            return json.loads(handle.read())

    try:
        # The blocking read and parse runs in the default executor.
        records = await asyncio.get_running_loop().run_in_executor(
            None, load_json, path
        )
    except (OSError, ValueError) as e:
        # ValueError covers JSON and unicode decoding errors.
        logging.error("Failed to load json file %s: %s", path.name, e)
        # Record the failure so the caller reports it and the run does not
        # finish "successfully" with an unreadable file.
        errors.append(
            InvalidRecord(recid=None, path=path, msg=f"Failed to load json file: {e}")
        )
        records = []
    else:
        if not isinstance(records, list):
            found = type(records).__name__
            errors.append(
                InvalidRecord(
                    recid=None,
                    path=path,
                    msg=f"Expected a JSON list of records, found {found}",
                )
            )
            records = []

    def rcheck(doc, validation, stack=None) -> typing.Generator[str, None, None]:
        """Recursively check ``doc`` against ``validation``, yielding messages."""
        stack = stack or []

        if isinstance(validation, dict):
            for v_key, v_value in validation.items():
                is_optional = v_key.startswith("?")
                v_key = v_key.removeprefix("?")

                # Only JSON objects can hold keys. Any other value counts as
                # "key absent" instead of raising TypeError on the lookup.
                if not (isinstance(doc, dict) and v_key in doc):
                    if not is_optional:
                        yield f"Missing required key [{']['.join(stack)}]->{v_key}"
                    continue

                # A list value is validated element by element.
                value = doc[v_key]
                sub_docs = value if isinstance(value, list) else [value]
                for sub_doc in sub_docs:
                    yield from rcheck(sub_doc, v_value, stack + [v_key])

        else:
            allowed = validation if isinstance(validation, list) else [validation]

            if doc not in allowed:
                yield f"Value of [{']['.join(stack)}]: `{doc}` does not match any valid patterns: {allowed}"

    for record in records:
        # Non-object records have no "recid" to report.
        recid = record.get("recid") if isinstance(record, dict) else None
        for msg in rcheck(record, mapping):
            errors.append(InvalidRecord(recid=recid, path=path, msg=msg))

    logging.debug("Validated file %s with %d records.", path.name, len(records))
    return len(records), errors, path
147+
148+
149+
if __name__ == "__main__":
150+
command()

scripts/record_mapping.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
?license:
2+
attribution:
3+
- Apache-2.0
4+
- BSD-3-Clause
5+
- CC0-1.0
6+
- GPL-3.0-only
7+
- MIT
8+
?collision_information:
9+
?type:
10+
- e+e-
11+
- pp
12+
- pPb
13+
- PbPb
14+
- Interfill
15+
16+
experiment:
17+
- ALICE
18+
- ATLAS
19+
- CMS
20+
- DELPHI
21+
- JADE
22+
- LHCb
23+
- OPERA
24+
- PHENIX
25+
- TOTEM

0 commit comments

Comments
 (0)