blob: d0c2f8727586d1a468fcf6618c3e818c112d45cb [file] [log] [blame]
#!/usr/bin/env python3
#
# Copyright (c) 2019-2020 Arm Limited. All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
7
8import argparse
9import os
10import subprocess
11import sys
12import logging
13import tempfile
14import yaml
15
16
def case_infra_error(case):
    """Report whether a single test case is free of infrastructure errors.

    Note the inverted sense relative to the name: the result is ``False``
    when an infrastructure error is detected and ``True`` otherwise.

    Args:
        case: mapping describing one test case. Its ``metadata`` entry is
            inspected for ``error_type`` and ``error_msg``; ``id`` is only
            used in log messages.

    Returns:
        ``False`` if ``error_type`` is ``"Infrastructure"`` or ``error_msg``
        contains ``"timed out"``; ``True`` otherwise, including when the
        case has no ``metadata`` or no ``error_type`` entry.
    """
    # The id is only needed for logging, so look it up leniently: a missing
    # id must not turn a detected infra error into a "no error" result.
    case_id = case.get("id", "<unknown>")

    try:
        metadata = case["metadata"]
        error_type = metadata["error_type"]
    except KeyError:
        # No error information recorded for this case.
        return True

    if error_type == "Infrastructure":
        logging.error("case %s: infra error is type Infrastructure", case_id)
        return False

    # A missing or null message simply means there is no timeout to report.
    error_msg = metadata.get("error_msg")
    if error_msg is not None and "timed out" in error_msg:
        logging.error("case %s: infra error: %s", case_id, error_msg)
        return False

    return True
31
32
def not_infra_error(path):
    """Return a boolean indicating that no infra error was recorded.

    Args:
        path: path of the YAML results file to inspect.

    Returns:
        ``True`` when every test case in the file passes
        ``case_infra_error``, and also when the file is missing or holds no
        cases. ``False`` when at least one case reports an infra error.
    """
    try:
        with open(path) as file:
            # safe_load yields None for an empty document; treat that as a
            # results file with no cases rather than failing on iteration.
            results = yaml.safe_load(file) or []
    except FileNotFoundError:
        logging.warning("Could not open results file %s", path)
        return True
    return all(case_infra_error(tc) for tc in results)
42
43
def run_one_job(cmd):
    """Execute the job command once and inspect its results file.

    Returns True when "job_results.yaml" shows no infra error.
    Raises a `subprocess.CalledProcessError` when the called script exits
    with a non-zero status.
    """
    completed = subprocess.run(cmd, check=False)
    # Surface a failing script exactly as `check=True` would.
    completed.check_returncode()
    results_file = "job_results.yaml"
    return not_infra_error(results_file)
50
51
def retry_job(cmd, retries):
    """Re-run a job until one attempt has no infra error.

    Gives up after `retries` attempts and returns False in that case;
    returns True as soon as an attempt completes without an infra error.
    Raises a `subprocess.CalledProcessError` when the called script fails.
    """
    logging.debug("trying job %s up to %d times", str(cmd), retries)
    for _attempt in range(retries):
        if run_one_job(cmd):
            # Stop at the first attempt that is free of infra errors.
            return True
    return False
58
59
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, run the job script with
    # retries, and exit with a status reflecting the outcome.
    parser = argparse.ArgumentParser(
        description="Lava job runner with infrastructure error detection and retry."
    )
    parser.add_argument(
        "script",
        nargs="?",
        default=os.path.join(os.path.dirname(__file__), "run_lava_job.sh"),
        help="bash job script to run a lava job",
    )
    parser.add_argument(
        "job",
        nargs="?",
        # Resolved after parsing so BIN_MODE is only required when the job
        # file is not given on the command line.
        default=None,
        help="the Lava job description file. "
        "defaults to artefacts/$BIN_MODE/juno.yaml",
    )
    parser.add_argument(
        "retries",
        type=int,
        nargs="?",
        default=3,
        help="Number of retries. defaults to 3",
    )
    parser.add_argument(
        "--save",
        # Resolved after parsing so no temporary directory is created when
        # the caller supplies one.
        default=None,
        help="directory to store the job_output.log. "
        "defaults to a new temporary directory",
    )
    parser.add_argument(
        "--username",
        required=True,
        help="the user name for lava server",
    )
    parser.add_argument(
        "--token",
        required=True,
        help="the token for lava server",
    )
    parser.add_argument(
        "-v", action="count", default=0, help="Increase printing of debug output"
    )
    args = parser.parse_args()

    if args.job is None:
        bin_mode = os.environ.get("BIN_MODE")
        if bin_mode is None:
            parser.error(
                "no job file given and BIN_MODE is not set in the environment"
            )
        args.job = os.path.join("artefacts", bin_mode, "juno.yaml")
    if args.save is None:
        args.save = tempfile.mkdtemp(prefix="job-output")

    if args.v >= 2:
        logging.getLogger().setLevel(logging.DEBUG)
    elif args.v >= 1:
        logging.getLogger().setLevel(logging.INFO)
    # Dump the parsed arguments for debugging, leaving out the token value.
    logging.debug(dict(vars(args), token="<redacted>"))

    job_cmd = [args.script, args.job, args.save, args.username, args.token]
    try:
        if not retry_job(job_cmd, args.retries):
            logging.critical("All jobs failed with infra errors; retries exhausted")
            sys.exit(-1)
        else:
            sys.exit(0)
    except subprocess.CalledProcessError as e:
        # Propagate the job script's own exit status to the caller.
        logging.critical("Job script returned error code %d", e.returncode)
        sys.exit(e.returncode)