Zelalem | 917b43e | 2020-08-04 11:39:55 -0500 | [diff] [blame^] | 1 | #!/usr/bin/env python3 |
| 2 | # |
| 3 | # Copyright (c) 2019, Arm Limited. All rights reserved. |
| 4 | # |
| 5 | # SPDX-License-Identifier: BSD-3-Clause |
| 6 | # |
| 7 | |
| 8 | import argparse |
| 9 | import os |
| 10 | import subprocess |
| 11 | import sys |
| 12 | import logging |
| 13 | import tempfile |
| 14 | import yaml |
| 15 | |
| 16 | |
def case_infra_error(case):
    """Return False if a test case records an infrastructure error, else True.

    NOTE: despite the name, the result is True when the case is *not* an
    infrastructure error. `not_infra_error` depends on this polarity.

    A case is treated as an infrastructure error when its ``metadata`` has
    either ``error_type == "Infrastructure"`` or an ``error_msg`` string
    containing "timed out". The two fields are checked independently, so a
    missing ``error_type`` no longer hides a timeout message, and a missing
    ``id`` (used only for logging) no longer masks a detected error.

    Args:
        case: mapping for one test case, as loaded from the results file.

    Returns:
        False when an infrastructure error is detected, True otherwise
        (including when the case carries no usable metadata).
    """
    metadata = case.get("metadata")
    if not isinstance(metadata, dict):
        # No metadata recorded for this case: nothing to classify.
        return True

    # Only used in log messages; must never influence the verdict.
    case_id = case.get("id")

    if metadata.get("error_type") == "Infrastructure":
        logging.error("case %s: infra error is type Infrastructure", case_id)
        return False

    error_msg = metadata.get("error_msg")
    # Guard against a null/non-string message before the substring test.
    if isinstance(error_msg, str) and "timed out" in error_msg:
        logging.error("case %s: infra error: %s", case_id, error_msg)
        return False

    return True
| 31 | |
| 32 | |
def not_infra_error(path):
    """Return True if the results file at *path* shows no infra error.

    The file is parsed as YAML and every entry is passed to
    `case_infra_error`; the result is True only when all entries pass.

    Args:
        path: location of the YAML results file.

    Returns:
        True when no entry reports an infrastructure error, when the file
        is empty, or when the file does not exist (a warning is logged in
        the latter case); False otherwise.
    """
    try:
        with open(path) as file:
            results = yaml.safe_load(file)
    except FileNotFoundError:
        logging.warning("Could not open results file %s", path)
        return True
    # safe_load yields None for an empty document; treat that as "no cases"
    # rather than failing while iterating over None.
    return all(case_infra_error(tc) for tc in results or [])
| 42 | |
| 43 | |
def run_one_job(cmd):
    """Execute the job command once and classify the outcome.

    Returns True when the resulting "job_results.yaml" shows no
    infrastructure error, False otherwise.
    Raises a `subprocess.CalledProcessError` when the called script fails.
    """
    completed = subprocess.run(cmd)
    # Same effect as check=True: a non-zero exit status raises
    # CalledProcessError before the results file is inspected.
    completed.check_returncode()
    no_infra_error = not_infra_error("job_results.yaml")
    return no_infra_error
| 50 | |
| 51 | |
def retry_job(cmd, retries):
    """Repeat the job until one attempt is free of infra errors.

    At most `retries` attempts are made; the first attempt without an
    infrastructure error ends the loop. Returns True in that case and
    False when every attempt hit an infra error (or `retries` is < 1).
    Raises a `subprocess.CalledProcessError` when the called script fails.
    """
    logging.debug("trying job %s up to %d times", str(cmd), retries)
    for _attempt in range(retries):
        if run_one_job(cmd):
            return True
    return False
| 58 | |
| 59 | |
if __name__ == "__main__":
    # Command-line entry point: run the job script, retrying on
    # infrastructure errors, and exit with a status reflecting the outcome:
    #   0                - an attempt completed without an infra error
    #   -1 (i.e. 255)    - every attempt hit an infra error
    #   script's status  - the job script itself failed
    parser = argparse.ArgumentParser(
        description="Lava job runner with infrastructure error detection and retry."
    )
    parser.add_argument(
        "script",
        nargs="?",
        default=os.path.join(os.path.dirname(__file__), "run_lava_job.sh"),
        help="bash job script to run a lava job",
    )
    # The defaults for `job` and `--save` are resolved after parsing: reading
    # BIN_MODE or creating a temp directory here would crash (BIN_MODE unset)
    # or leak a directory even when the user supplies the value or asks for
    # --help.
    parser.add_argument(
        "job",
        nargs="?",
        default=None,
        help="the Lava job description file; "
        "defaults to artefacts/$BIN_MODE/juno.yaml",
    )
    parser.add_argument(
        "retries",
        type=int,
        nargs="?",
        default=3,
        help="Number of retries. defaults to 3",
    )
    parser.add_argument(
        "--save",
        default=None,
        help="directory to store the job_output.log; "
        "defaults to a new temporary directory",
    )
    parser.add_argument(
        "-v", action="count", default=0, help="Increase printing of debug output"
    )
    args = parser.parse_args()

    if args.job is None:
        bin_mode = os.environ.get("BIN_MODE")
        if bin_mode is None:
            parser.error("no job file given and BIN_MODE is not set")
        args.job = os.path.join("artefacts", bin_mode, "juno.yaml")
    if args.save is None:
        args.save = tempfile.mkdtemp(prefix="job-output")

    # -v selects INFO, -vv (or more) selects DEBUG.
    if args.v >= 2:
        logging.getLogger().setLevel(logging.DEBUG)
    elif args.v >= 1:
        logging.getLogger().setLevel(logging.INFO)
    logging.debug(args)

    try:
        if not retry_job([args.script, args.job, args.save], args.retries):
            logging.critical("All jobs failed with infra errors; retries exhausted")
            sys.exit(-1)
        else:
            sys.exit(0)
    except subprocess.CalledProcessError as e:
        # Propagate the job script's own exit status to the caller.
        logging.critical("Job script returned error code %d", e.returncode)
        sys.exit(e.returncode)