Zelalem | 917b43e | 2020-08-04 11:39:55 -0500 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | # |
Leonardo Sandoval | 579c737 | 2020-10-23 15:23:32 -0500 | [diff] [blame] | 3 | # Copyright (c) 2019-2020 Arm Limited. All rights reserved. |
Zelalem | 917b43e | 2020-08-04 11:39:55 -0500 | [diff] [blame] | 4 | # |
| 5 | # SPDX-License-Identifier: BSD-3-Clause |
| 6 | # |
| 7 | |
| 8 | import argparse |
| 9 | import os |
| 10 | import subprocess |
| 11 | import sys |
| 12 | import logging |
| 13 | import tempfile |
| 14 | import yaml |
| 15 | |
| 16 | |
def case_infra_error(case):
    """Return ``False`` when a test case reports an infrastructure error.

    Note the polarity: despite the name, the result is ``True`` when the
    case does *not* look like an infrastructure failure, so the function
    can be fed directly to ``all()``.

    A case is treated as an infrastructure failure when its ``metadata``
    mapping either has ``error_type == "Infrastructure"`` or has an
    ``error_msg`` string containing ``"timed out"``. The two checks are
    independent: a missing ``error_type`` or ``id`` key does not hide a
    failure reported through the other field.

    Args:
        case: mapping describing one test case; the ``metadata`` and
            ``id`` keys are optional.

    Returns:
        ``False`` if an infrastructure error was detected, else ``True``.
    """
    metadata = case.get("metadata")
    if not isinstance(metadata, dict):
        # No (or empty/null) metadata: nothing indicates an infra error.
        return True

    # The id is only used for logging; its absence must not change the verdict.
    case_id = case.get("id")

    if metadata.get("error_type") == "Infrastructure":
        logging.error("case %s: infra error is type Infrastructure", case_id)
        return False

    error_msg = metadata.get("error_msg")
    if isinstance(error_msg, str) and "timed out" in error_msg:
        logging.error("case %s: infra error: %s", case_id, error_msg)
        return False

    return True
| 31 | |
| 32 | |
def not_infra_error(path):
    """Return a boolean indicating that there was *not* an infra error.

    The YAML file at ``path`` is loaded and every test case in it is
    checked with ``case_infra_error``.

    Args:
        path: location of the YAML results file.

    Returns:
        ``True`` when no test case reports an infrastructure error. Also
        ``True`` (with a warning logged) when the file does not exist or
        contains no test cases, since there is then no evidence of an
        infrastructure failure. ``False`` otherwise.
    """
    try:
        with open(path) as file:
            results = yaml.safe_load(file)
    except FileNotFoundError:
        logging.warning("Could not open results file %s", path)
        return True

    if not results:
        # An empty YAML document loads as None, which is not iterable;
        # handle it like the missing-file case instead of crashing.
        logging.warning("Results file %s contains no test cases", path)
        return True

    return all(case_infra_error(tc) for tc in results)
| 42 | |
| 43 | |
def run_one_job(cmd):
    """Execute the job command once and report whether it was free of infra errors.

    The command is run to completion first; afterwards the file
    ``job_results.yaml`` (relative to the current working directory) is
    inspected through ``not_infra_error``.

    Raises a `subprocess.CalledProcessError` when the called script fails.
    """
    # check=True converts a non-zero exit status into CalledProcessError.
    subprocess.run(cmd, check=True)
    results_path = "job_results.yaml"
    no_infra_error = not_infra_error(results_path)
    return no_infra_error
| 50 | |
| 51 | |
def retry_job(cmd, retries):
    """Repeat a job until one run has no infra error or the attempts run out.

    At most ``retries`` runs are started; the first run that reports no
    infrastructure error stops the loop. Returns ``True`` in that case and
    ``False`` when every attempt hit an infra error (or ``retries`` is not
    positive, so nothing was run).

    Raises a `subprocess.CalledProcessError` when the called script fails.
    """
    logging.debug("trying job %s up to %d times", str(cmd), retries)
    for _attempt in range(retries):
        if run_one_job(cmd):
            return True
    return False
| 58 | |
| 59 | |
def parse_args(argv=None):
    """Parse the command line of the job runner.

    Environment- and filesystem-dependent defaults are resolved only after
    parsing, so that ``--help`` and fully specified command lines work
    without ``BIN_MODE`` being set and without creating a stray temporary
    directory.

    Args:
        argv: list of argument strings; ``None`` means ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace`` with attributes ``script``,
        ``job``, ``retries``, ``save``, ``username``, ``token`` and ``v``.
    """
    parser = argparse.ArgumentParser(
        description="Lava job runner with infrastructure error detection and retry."
    )
    parser.add_argument(
        "script",
        nargs="?",
        default=os.path.join(os.path.dirname(__file__), "run_lava_job.sh"),
        help="bash job script to run a lava job",
    )
    parser.add_argument(
        "job",
        nargs="?",
        default=None,
        help="the Lava job description file. "
        "defaults to artefacts/$BIN_MODE/juno.yaml",
    )
    parser.add_argument(
        "retries",
        type=int,
        nargs="?",
        default=3,
        help="Number of retries. defaults to 3",
    )
    parser.add_argument(
        "--save",
        default=None,
        help="directory to store the job_output.log",
    )
    parser.add_argument(
        "--username",
        required=True,
        help="the user name for lava server",
    )
    parser.add_argument(
        "--token",
        required=True,
        help="the token for lava server",
    )
    parser.add_argument(
        "-v", action="count", default=0, help="Increase printing of debug output"
    )
    args = parser.parse_args(argv)

    if args.job is None:
        # Only consult BIN_MODE when the default job path is actually needed.
        bin_mode = os.environ.get("BIN_MODE")
        if bin_mode is None:
            parser.error(
                "no job file given and BIN_MODE is not set in the environment"
            )
        args.job = os.path.join("artefacts", bin_mode, "juno.yaml")

    if args.save is None:
        # Create the temporary directory only when --save was not supplied.
        args.save = tempfile.mkdtemp(prefix="job-output")

    return args


def main(argv=None):
    """Run the Lava job with retries and return the process exit status.

    Returns:
        ``0`` when some attempt finished without an infrastructure error,
        ``-1`` when all attempts hit infrastructure errors, or the job
        script's own return code when the script itself failed.
    """
    args = parse_args(argv)

    # -v selects INFO, -vv (or more) selects DEBUG on the root logger.
    if args.v >= 2:
        logging.getLogger().setLevel(logging.DEBUG)
    elif args.v >= 1:
        logging.getLogger().setLevel(logging.INFO)

    # NOTE(review): args includes the server token, so -vv writes it to the log.
    logging.debug(args)

    command = [args.script, args.job, args.save, args.username, args.token]
    try:
        succeeded = retry_job(command, args.retries)
    except subprocess.CalledProcessError as e:
        logging.critical("Job script returned error code %d", e.returncode)
        return e.returncode

    if not succeeded:
        logging.critical("All jobs failed with infra errors; retries exhausted")
        return -1
    return 0


if __name__ == "__main__":
    sys.exit(main())