blob: 93d522c66e55a5799c04fd71561cfbec103ede8d [file] [log] [blame]
#!/usr/bin/env python3
#
# Copyright (c) 2019, Arm Limited. All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
7
8import argparse
9import os
10import subprocess
11import sys
12import logging
13import tempfile
14import yaml
15
16
def case_infra_error(case):
    """Return whether a single test case is free of infrastructure errors.

    Despite the name, the result is ``True`` when NO infrastructure error was
    found and ``False`` when one was, so callers can fold results with
    ``all()``.

    A case is an infrastructure error when its ``metadata`` mapping has
    ``error_type == "Infrastructure"`` or an ``error_msg`` containing
    ``"timed out"``. The two fields are checked independently, so a missing
    ``error_type`` no longer hides a timeout reported in ``error_msg``, and a
    missing ``id`` no longer turns a detected error into a pass.

    Args:
        case: mapping describing one test case; may carry ``id`` and
            ``metadata`` keys.

    Returns:
        ``False`` if the case shows an infrastructure error, ``True``
        otherwise (including when ``metadata`` is absent).
    """
    metadata = case.get("metadata")
    if not isinstance(metadata, dict):
        # No usable metadata: nothing indicates an infrastructure problem.
        return True

    # The id is only used for log output; never let its absence mask an error.
    case_id = case.get("id", "<unknown>")

    if metadata.get("error_type") == "Infrastructure":
        logging.error("case %s: infra error is type Infrastructure", case_id)
        return False

    # A missing or None message is treated as empty rather than raising.
    error_msg = metadata.get("error_msg") or ""
    if "timed out" in error_msg:
        logging.error("case %s: infra error: %s", case_id, error_msg)
        return False

    return True
31
32
def not_infra_error(path):
    """Return a boolean indicating if there was not an infra error.

    Loads the YAML results file at ``path`` and checks every test case in it
    with ``case_infra_error``.

    Args:
        path: location of the YAML results file.

    Returns:
        ``True`` when no test case shows an infrastructure error. Also
        ``True``, with a warning logged, when the file is missing or holds
        no document. ``False`` as soon as one case shows an infrastructure
        error.
    """
    # Keep the try body limited to the step that can raise FileNotFoundError.
    try:
        with open(path) as file:
            results = yaml.safe_load(file)
    except FileNotFoundError:
        logging.warning("Could not open results file %s", path)
        return True

    if results is None:
        # safe_load yields None for an empty document; iterating it would
        # raise TypeError, so treat it like a missing results file.
        logging.warning("Results file %s is empty", path)
        return True

    return all(case_infra_error(tc) for tc in results)
42
43
def run_one_job(cmd):
    """Execute the job command once and check its results for infra errors.

    The command is run to completion first; a non-zero exit status raises
    `subprocess.CalledProcessError`. Afterwards the results file
    ``job_results.yaml`` in the current working directory is inspected.

    Returns True when no infrastructure error was found, False otherwise.
    """
    subprocess.run(cmd, check=True)
    results_path = "job_results.yaml"
    job_is_clean = not_infra_error(results_path)
    return job_is_clean
50
51
def retry_job(cmd, retries):
    """Repeat a job until one attempt is free of infra errors.

    At most ``retries`` attempts are made and the loop stops at the first
    attempt that reports no infrastructure error. A failing job script
    propagates `subprocess.CalledProcessError` to the caller.

    Returns True if some attempt had no infra error, False if every attempt
    did (or if ``retries`` allows no attempt at all).
    """
    logging.debug("trying job %s up to %d times", str(cmd), retries)
    for _attempt in range(retries):
        if run_one_job(cmd):
            return True
    return False
58
59
if __name__ == "__main__":
    # Command-line entry point: parse arguments, set log verbosity, run the
    # job with retries and map the outcome to the process exit status.
    parser = argparse.ArgumentParser(
        description="Lava job runner with infrastructure error dectection and retry."
    )
    parser.add_argument(
        "script",
        nargs="?",
        default=os.path.join(os.path.dirname(__file__), "run_lava_job.sh"),
        help="bash job script to run a lava job",
    )
    # The job default depends on the BIN_MODE environment variable. It is
    # resolved after parsing so that an explicit job path (or --help) works
    # even when BIN_MODE is not set.
    parser.add_argument(
        "job",
        nargs="?",
        default=None,
        help="the Lava job description file",
    )
    parser.add_argument(
        "retries",
        type=int,
        nargs="?",
        default=3,
        help="Number of retries. defaluts to 3",
    )
    # The temporary directory is created lazily below; creating it here would
    # leave a stray directory behind whenever --save is given explicitly.
    parser.add_argument(
        "--save",
        default=None,
        help="directory to store the job_output.log",
    )
    parser.add_argument(
        "-v", action="count", default=0, help="Increase printing of debug ouptut"
    )
    args = parser.parse_args()

    if args.job is None:
        bin_mode = os.environ.get("BIN_MODE")
        if bin_mode is None:
            parser.error(
                "no job file given and the BIN_MODE environment variable is not set"
            )
        args.job = os.path.join("artefacts", bin_mode, "juno.yaml")
    if args.save is None:
        args.save = tempfile.mkdtemp(prefix="job-output")

    # -v selects INFO, -vv (or more) selects DEBUG on the root logger.
    if args.v >= 2:
        logging.getLogger().setLevel(logging.DEBUG)
    elif args.v >= 1:
        logging.getLogger().setLevel(logging.INFO)
    logging.debug(args)

    try:
        if not retry_job([args.script, args.job, args.save], args.retries):
            logging.critical("All jobs failed with infra errors; retries exhausted")
            sys.exit(-1)
        else:
            sys.exit(0)
    except subprocess.CalledProcessError as e:
        # The job script itself failed: propagate its exit status.
        logging.critical("Job script returned error code %d", e.returncode)
        sys.exit(e.returncode)
105 sys.exit(e.returncode)