Zelalem | 917b43e | 2020-08-04 11:39:55 -0500 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | # |
Khasim Syed Mohammed | 430d594 | 2021-08-09 21:26:15 +0530 | [diff] [blame] | 3 | # Copyright (c) 2019-2021 Arm Limited. All rights reserved. |
Zelalem | 917b43e | 2020-08-04 11:39:55 -0500 | [diff] [blame] | 4 | # |
| 5 | # SPDX-License-Identifier: BSD-3-Clause |
| 6 | # |
| 7 | |
| 8 | import argparse |
| 9 | import os |
| 10 | import subprocess |
| 11 | import sys |
| 12 | import logging |
| 13 | import tempfile |
| 14 | import yaml |
| 15 | |
| 16 | |
def case_infra_error(case):
    """Return False when a test case reports an infrastructure error.

    Note the inverted sense relative to the name: the result is True when
    the case is *not* an infra error, so callers can feed it to ``all()``.

    A case counts as an infra error when its ``metadata`` either has
    ``error_type`` equal to "Infrastructure" or has an ``error_msg``
    containing "timed out". The two checks are independent: a missing
    ``error_type`` or ``id`` key no longer hides a timed-out message.

    Args:
        case: one entry of the parsed results file; expected to be a
            mapping that may carry "id" and "metadata" keys.

    Returns:
        False if an infra error was detected (it is also logged), True
        otherwise, including when the case has no usable metadata.
    """
    metadata = case.get("metadata") if isinstance(case, dict) else None
    if not isinstance(metadata, dict):
        # No metadata means no error information to inspect.
        return True

    # Only used for log messages; absence must not mask a detected error.
    case_id = case.get("id")

    if metadata.get("error_type") == "Infrastructure":
        logging.error("case %s: infra error is type Infrastructure", case_id)
        return False

    error_msg = metadata.get("error_msg")
    # Guard the substring test: error_msg may be missing or null.
    if isinstance(error_msg, str) and "timed out" in error_msg:
        logging.error("case %s: infra error: %s", case_id, error_msg)
        return False

    return True
| 31 | |
| 32 | |
def not_infra_error(path):
    """Return a boolean indicating that there was not an infra error.

    Loads the YAML results file at ``path`` and checks every test case in
    it with ``case_infra_error``.

    Args:
        path: location of the results file to inspect.

    Returns:
        True when no test case reports an infra error. Also True when the
        file is missing or contains no test cases; both situations are
        logged as warnings. False as soon as one case is an infra error.
    """
    try:
        with open(path) as file:
            results = yaml.safe_load(file)
    except FileNotFoundError:
        logging.warning("Could not open results file %s", path)
        return True

    if not results:
        # An empty YAML document loads as None, which cannot be iterated;
        # treat it like a missing file rather than crashing.
        logging.warning("Results file %s contains no test cases", path)
        return True

    return all(case_infra_error(tc) for tc in results)
| 42 | |
| 43 | |
def run_one_job(cmd):
    """Execute the job command once and report whether it was infra-clean.

    The command is run to completion first; afterwards the results file
    "job_results.yaml" in the current working directory is inspected.

    Returns True when no infra error was found, False otherwise.
    Raises a `subprocess.CalledProcessError` when the called script fails.
    """
    results_path = "job_results.yaml"
    # check=True turns a non-zero exit status into CalledProcessError.
    subprocess.run(cmd, check=True)
    no_infra_error = not_infra_error(results_path)
    return no_infra_error
| 50 | |
| 51 | |
def retry_job(cmd, retries):
    """Repeat a job until one run is free of infra errors.

    The job is attempted at most ``retries`` times and the loop stops at
    the first attempt that reports no infra error.

    Returns True if some attempt succeeded without an infra error, False
    if every attempt hit one (or if ``retries`` is not positive).
    Raises a `subprocess.CalledProcessError` when the called script fails.
    """
    logging.debug("trying job %s up to %d times", str(cmd), retries)
    for _attempt in range(retries):
        if run_one_job(cmd):
            return True
    return False
| 58 | |
| 59 | |
if __name__ == "__main__":

    # To deploy and boot the artefacts on a board in LAVA a platform specific
    # yaml file should be dispatched to LAVA. The below logic will identify
    # the name of the yaml file at run time for the platform defined in run_cfg.
    platform_list = ['n1sdp', 'juno']

    run_cfg = os.environ["RUN_CONFIG"]
    matching_platforms = [p for p in platform_list if p in run_cfg]
    if not matching_platforms:
        logging.critical(
            "Exiting: Platform not found for LAVA in run-config %s", run_cfg
        )
        sys.exit(-1)
    # Use the first match (in platform_list order). Concatenating every
    # match would yield a non-existent file name such as "n1sdpjuno.yaml"
    # if the run config happened to mention more than one platform.
    platform_yaml = matching_platforms[0] + '.yaml'

    parser = argparse.ArgumentParser(
        description="Lava job runner with infrastructure error detection and retry."
    )
    parser.add_argument(
        "script",
        nargs="?",
        default=os.path.join(os.path.dirname(__file__), "run_lava_job.sh"),
        help="bash job script to run a lava job",
    )
    parser.add_argument(
        "job",
        nargs="?",
        default=os.path.join("artefacts", os.environ["BIN_MODE"], platform_yaml),
        help="the Lava job description file",
    )
    parser.add_argument(
        "retries",
        type=int,
        nargs="?",
        default=3,
        help="Number of retries. defaults to 3",
    )
    parser.add_argument(
        "--save",
        # The temporary directory is created after parsing, and only when
        # --save was not given, so that --help, argument errors and runs
        # with an explicit --save do not leave a stray directory behind.
        default=None,
        help="directory to store the job_output.log",
    )
    parser.add_argument(
        "--username",
        required=True,
        help="the user name for lava server",
    )
    parser.add_argument(
        "--token",
        required=True,
        help="the token for lava server",
    )
    parser.add_argument(
        "-v", action="count", default=0, help="Increase printing of debug output"
    )
    args = parser.parse_args()
    if args.save is None:
        args.save = tempfile.mkdtemp(prefix="job-output")

    # -v selects INFO, -vv (or more) selects DEBUG.
    if args.v >= 2:
        logging.getLogger().setLevel(logging.DEBUG)
    elif args.v >= 1:
        logging.getLogger().setLevel(logging.INFO)
    # NOTE(review): at debug verbosity this prints every parsed argument,
    # including the LAVA token, and retry_job logs the full command line
    # (which also carries the token). Consider redacting it in both places.
    logging.debug(args)

    # Exit status: 0 when a run completed without an infra error, -1 when
    # all retries hit infra errors, or the job script's own return code
    # when the script itself failed.
    try:
        job_cmd = [args.script, args.job, args.save, args.username, args.token]
        if not retry_job(job_cmd, args.retries):
            logging.critical("All jobs failed with infra errors; retries exhausted")
            sys.exit(-1)
        else:
            sys.exit(0)
    except subprocess.CalledProcessError as e:
        logging.critical("Job script returned error code %d", e.returncode)
        sys.exit(e.returncode)