|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +"""Check if license fields are valid in all records.""" |
| 4 | + |
| 5 | +import asyncio |
| 6 | +import dataclasses |
| 7 | +import itertools |
| 8 | +import json |
| 9 | +import logging |
| 10 | +import os |
| 11 | +import time |
| 12 | +import typing |
| 13 | +from glob import glob |
| 14 | +from pathlib import Path |
| 15 | + |
| 16 | +import click |
| 17 | +import yaml |
| 18 | + |
# Dedicated event loop that ``command`` runs ``main`` on and then closes.
# It is created explicitly: calling ``asyncio.get_event_loop()`` while no loop
# is running is deprecated and fails on newer Python releases.
LOOP = asyncio.new_event_loop()
# Default mapping file and record glob, resolved against the working directory.
MAPPING = Path.cwd() / "scripts" / "record_mapping.yaml"
FILES = Path.cwd() / "data" / "records" / "*"
| 22 | + |
| 23 | + |
@dataclasses.dataclass
class InvalidRecord:
    """One validation failure reported for a record in a checked file."""

    # Value of the offending record's "recid" key; None when it has none.
    recid: typing.Optional[str]
    # File the offending record was read from.
    path: Path
    # Human-readable description of what failed.
    msg: str
| 31 | + |
| 32 | + |
@click.command()
@click.option(
    "-m",
    "--mapping",
    default=MAPPING,
    type=click.Path(readable=True, path_type=Path, dir_okay=False),
    help="Path to check records against.",
)
@click.option(
    "-v", "--verbose", default=False, is_flag=True, help="Print verbose output."
)
@click.argument("files", type=click.Path(readable=True, path_type=Path), nargs=-1)
def command(**kwargs):
    """Validate the files at the supplied paths. Arguments support unix-like patterns."""
    # NOTE: the docstring above doubles as the ``--help`` text shown by click.
    # ``kwargs`` carries the parsed ``mapping``, ``verbose`` and ``files``
    # values, which are forwarded unchanged to ``main``.
    try:
        LOOP.run_until_complete(main(**kwargs))
    finally:
        # Close the loop even when ``main`` terminates via ``SystemExit``.
        LOOP.close()
| 51 | + |
| 52 | + |
async def main(mapping, verbose, files) -> None:
    """Validate record fields against a defined mapping.

    Args:
        mapping: Path of the YAML file holding the validation mapping.
        verbose: When true, log at DEBUG level instead of INFO.
        files: Paths or glob patterns of the record files to validate. When
            empty, the module-level ``FILES`` pattern is used instead.

    Raises:
        SystemExit: Once validation has finished; the exit code is 0 when no
            errors were found and 1 otherwise.
    """
    start_time = time.perf_counter()
    files = files or (FILES,)

    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level, format="[%(levelname)s] %(message)s")

    def load_schema():
        # The context manager guarantees the handle is closed after parsing.
        with open(mapping, "r", encoding="utf-8") as stream:
            return yaml.safe_load(stream)

    logging.info("Loading mapping file...")
    # Use the loop that is actually running this coroutine rather than a
    # module-level global, and keep the blocking read off the event loop.
    loop = asyncio.get_running_loop()
    schema = await loop.run_in_executor(None, load_schema)

    # Expand every argument as a glob pattern and flatten the matches.
    matches = itertools.chain.from_iterable(glob(str(f)) for f in files)
    paths = [Path(match) for match in matches]
    logging.info("Found %d files. Validating...", len(paths))

    results = await asyncio.gather(
        *(validate_single(path, schema) for path in paths)
    )

    finish = f"within {time.perf_counter() - start_time:.2f} seconds"
    logging.info(
        "Validated %d files (%d records) %s.",
        len(paths),
        sum(count for count, _, _ in results),
        finish,
    )

    # Keep only the files that produced at least one error.
    errors = {path: found for _, found, path in results if found}
    if not errors:
        logging.info("All files validated successfully. No errors found.")
        # ``raise SystemExit`` is what ``exit()`` does, without relying on the
        # ``site`` module having injected the ``exit`` builtin.
        raise SystemExit(0)

    logging.error(
        "Found %d errors in %d files.",
        sum(len(found) for found in errors.values()),
        len(errors),
    )

    for path, found in errors.items():
        logging.error("File %s has %d errors:", path.name, len(found))

        for record_error in found:
            logging.error(
                " - %s: %s", record_error.recid or "UNSET", record_error.msg
            )

    raise SystemExit(1)
| 99 | + |
| 100 | + |
async def validate_single(
    path: Path, mapping: dict
) -> tuple[int, list[InvalidRecord], Path]:
    """Validate a single file against the mapping schema.

    Args:
        path: JSON file to read. Its top-level value is expected to be a list
            of records; a single top-level object is treated as one record.
        mapping: Validation schema. A dict maps keys (a leading ``?`` marks a
            key as optional) to nested schemas; any other value is an allowed
            value or a list of allowed values.

    Returns:
        A tuple ``(number of records, list of InvalidRecord, path)``. A file
        that cannot be loaded is logged and reported as zero records.
    """

    def load_records(p: Path):
        # The context manager guarantees the handle is closed after parsing.
        with open(p, "rb") as stream:
            return json.load(stream)

    try:
        records = await asyncio.get_running_loop().run_in_executor(
            None, load_records, path
        )
    except Exception as e:
        # Deliberate best effort: an unreadable file is logged and skipped
        # instead of aborting the validation of every other file.
        logging.error("Failed to load json file %s: %s", path.name, e)
        records = []

    if isinstance(records, dict):
        # A lone top-level object would otherwise be iterated key by key.
        records = [records]
    elif not isinstance(records, list):
        logging.error(
            "Failed to load json file %s: %s",
            path.name,
            "top-level value is not a list of records",
        )
        records = []

    def rcheck(doc, validation, stack=None) -> typing.Iterator[str]:
        """Recursively check ``doc`` against ``validation``, yielding messages."""
        stack = stack or []
        location = "][".join(stack)

        if not isinstance(validation, dict):
            # Leaf of the schema: the value must equal one of the allowed ones.
            allowed = validation if isinstance(validation, list) else [validation]
            if not any(doc == pattern for pattern in allowed):
                yield f"Value of [{location}]: `{doc}` does not match any valid patterns: {allowed}"
            return

        if not isinstance(doc, dict):
            # The schema expects nested keys here. Without this guard the
            # ``in`` test below raises TypeError for scalars/None and does a
            # substring match for strings.
            yield f"Value of [{location}]: `{doc}` is not a mapping"
            return

        for v_key, v_value in validation.items():
            is_optional = v_key.startswith("?")
            v_key = v_key.removeprefix("?")

            if v_key not in doc:
                if not is_optional:
                    yield f"Missing required key [{location}]->{v_key}"
                continue

            # A list value is validated element by element.
            value = doc[v_key]
            sub_docs = value if isinstance(value, list) else [value]
            for sub_doc in sub_docs:
                yield from rcheck(sub_doc, v_value, stack + [v_key])

    errors = []
    for record in records:
        # Non-dict records have no ``get``; report them without a record id.
        recid = record.get("recid") if isinstance(record, dict) else None
        errors.extend(
            InvalidRecord(recid=recid, path=path, msg=msg)
            for msg in rcheck(record, mapping)
        )

    logging.debug("Validated file %s with %d records.", path.name, len(records))
    return len(records), errors, path
| 147 | + |
| 148 | + |
# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    command()
0 commit comments