blob: a15a058f0e51a17fb1ff920309edfdd9b11fb2c5 [file] [log] [blame]
Darryl Green10d9ce32018-02-28 10:02:55 +00001#!/usr/bin/env python3
Gilles Peskine7dfcfce2019-07-04 19:31:02 +02002
Bence Szépkúti1e148272020-08-07 13:07:28 +02003# Copyright The Mbed TLS Contributors
Dave Rodgman16799db2023-11-02 19:47:20 +00004# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
Gilles Peskine7dfcfce2019-07-04 19:31:02 +02005
Darryl Green10d9ce32018-02-28 10:02:55 +00006"""
Darryl Green10d9ce32018-02-28 10:02:55 +00007This script checks the current state of the source code for minor issues,
8including incorrect file permissions, presence of tabs, non-Unix line endings,
Gilles Peskine55b49ee2019-07-04 19:31:33 +02009trailing whitespace, and presence of UTF-8 BOM.
Darryl Green10d9ce32018-02-28 10:02:55 +000010Note: requires python 3, must be run from Mbed TLS root.
11"""
12
Darryl Green10d9ce32018-02-28 10:02:55 +000013import argparse
Darryl Green10d9ce32018-02-28 10:02:55 +000014import codecs
Gilles Peskinef2fb9f62023-11-03 14:13:55 +010015import inspect
Gilles Peskine990030b2023-11-03 13:55:00 +010016import logging
17import os
Gilles Peskine0598db82020-05-10 16:57:16 +020018import re
Gilles Peskine3e2ee3c2020-05-10 17:18:06 +020019import subprocess
Darryl Green10d9ce32018-02-28 10:02:55 +000020import sys
Gilles Peskineac9e7c02020-08-11 15:11:50 +020021try:
22 from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
23except ImportError:
24 pass
Darryl Green10d9ce32018-02-28 10:02:55 +000025
Gilles Peskined9071e72022-09-18 21:17:09 +020026import scripts_path # pylint: disable=unused-import
27from mbedtls_dev import build_tree
28
Darryl Green10d9ce32018-02-28 10:02:55 +000029
class FileIssueTracker:
    """Base class for file-wide issue tracking.

    To implement a checker that processes a file as a whole, inherit from
    this class, implement ``check_file_for_issue`` and define ``heading``.

    ``suffix_exemptions``: files whose name ends with a string in this set
    will not be checked.

    ``path_exemptions``: files whose path (relative to the root of the source
    tree) matches this regular expression will not be checked. This can be
    ``None`` to match no path. Paths are normalized and converted to ``/``
    separators before matching.

    ``heading``: human-readable description of the issue
    """

    suffix_exemptions = frozenset() #type: FrozenSet[str]
    path_exemptions = None #type: Optional[Pattern[str]]
    # heading must be defined in derived classes.
    # pylint: disable=no-member

    def __init__(self):
        # Maps each offending file path to the list of line numbers where
        # the issue was found. Subclasses store None for whole-file issues.
        self.files_with_issues = {}

    @staticmethod
    def normalize_path(filepath):
        """Normalize ``filepath`` with / as the directory separator."""
        filepath = os.path.normpath(filepath)
        # On Windows, we may have backslashes to separate directories.
        # We need slashes to match exemption lists.
        # Replace each separator on its own: str.split() treats its argument
        # as a single multi-character delimiter, so splitting on
        # sep + altsep would never split anything.
        for separator in (os.path.sep, os.path.altsep):
            if separator is not None and separator != '/':
                filepath = filepath.replace(separator, '/')
        return filepath

    def should_check_file(self, filepath):
        """Whether the given file name should be checked.

        Files whose name ends with a string listed in ``self.suffix_exemptions``
        or whose path matches ``self.path_exemptions`` will not be checked.

        Return ``True`` if the file should be checked, ``False`` otherwise.
        """
        # str.endswith() accepts a tuple and returns False for an empty one.
        if filepath.endswith(tuple(self.suffix_exemptions)):
            return False
        if self.path_exemptions and \
           re.match(self.path_exemptions, self.normalize_path(filepath)):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check the specified file for the issue that this class is for.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def record_issue(self, filepath, line_number):
        """Record that an issue was found at the specified location."""
        self.files_with_issues.setdefault(filepath, []).append(line_number)

    def output_file_issues(self, logger):
        """Log all the locations where the issue was found.

        Nothing is logged if no issue was recorded. Otherwise log the
        heading, then one line per file in sorted order (with the line
        numbers if any were recorded), then an empty line.
        """
        if not self.files_with_issues:
            return
        logger.info(self.heading)
        for filename, lines in sorted(self.files_with_issues.items()):
            if lines:
                logger.info("{}: {}".format(
                    filename, ", ".join(str(x) for x in lines)
                ))
            else:
                # Whole-file issue: there is no line number to report.
                logger.info(filename)
        logger.info("")
105
# Regular expressions (matched from the start of the normalized path) for
# files that are treated as binary and skipped by the text-oriented checks.
# This must remain a list: other code concatenates it with further patterns.
BINARY_FILE_PATH_RE_LIST = [
    # PDF and PNG files anywhere under docs/.
    r'docs/.*\.pdf\Z',
    r'docs/.*\.png\Z',
    # Paths without any dot under the fuzzing corpuses.
    r'programs/fuzz/corpuses/[^.]+\Z',
    # Test data: paths without any dot, selected extensions, and a few
    # specific naming patterns.
    r'tests/data_files/[^.]+\Z',
    r'tests/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
    r'tests/data_files/.*\.req\.[^/]+\Z',
    r'tests/data_files/.*malformed[^/]+\Z',
    r'tests/data_files/format_pkcs12\.fmt\Z',
    r'tests/data_files/.*\.bin\Z',
]
# A single compiled alternation of all the patterns above.
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))
118
class LineIssueTracker(FileIssueTracker):
    """Base class for line-by-line issue tracking.

    To implement a checker that processes files line by line, inherit from
    this class and implement ``issue_with_line``.
    """

    # Exclude binary files.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Tell whether ``line`` has the issue that this class is for.

        ``line`` is the raw content of the line as bytes, including its
        line terminator if it has one. ``line_number`` starts at 1.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Record an issue at the given location if the line has one."""
        has_issue = self.issue_with_line(line, filepath, line_number)
        if has_issue:
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Check the lines of the specified file.

        The file is read in binary mode, one line at a time.
        Subclasses must implement the ``issue_with_line`` method.
        """
        with open(filepath, "rb") as f:
            for line_number, line in enumerate(f, start=1):
                self.check_file_line(filepath, line, line_number)
148
Gilles Peskine2c618732020-03-24 22:26:01 +0100149
def is_windows_file(filepath):
    """Tell whether ``filepath`` has one of the Windows-specific extensions.

    The comparison is case-sensitive and only looks at the extension
    as determined by ``os.path.splitext``.
    """
    extension = os.path.splitext(filepath)[1]
    return extension in {'.bat', '.dsp', '.dsw', '.sln', '.vcxproj'}
Gilles Peskine2c618732020-03-24 22:26:01 +0100153
154
class ShebangIssueTracker(FileIssueTracker):
    """Track files with a bad, missing or extraneous shebang line.

    Executable scripts must start with a valid shebang (#!) line.
    """

    heading = "Invalid shebang line:"

    # Allow either /bin/sh, /bin/bash, or /usr/bin/env.
    # Allow at most one argument (this is a Linux limitation).
    # For sh and bash, the argument if present must be options.
    # For env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')
    # Expected file name extension for each accepted interpreter.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    def is_valid_shebang(self, first_line, filepath):
        """Tell whether ``first_line`` is an acceptable shebang for ``filepath``.

        The line must match ``_shebang_re``, name a known interpreter, and
        the file name must end with the extension expected for it.
        """
        match = self._shebang_re.match(first_line)
        if match is None:
            return False
        # Exactly one of the two groups is set, depending on the branch.
        interpreter = match.group(1) or match.group(2)
        expected_extension = self._extensions.get(interpreter)
        if expected_extension is None:
            return False
        return filepath.endswith('.' + expected_extension)

    def check_file_for_issue(self, filepath):
        """Check that the shebang line and the executable bit agree.

        A mismatch between the two is recorded as a whole-file issue.
        An invalid shebang on an executable file is recorded on line 1.
        """
        is_executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as f:
            first_line = f.readline()
        has_shebang = first_line.startswith(b'#!')
        if has_shebang and is_executable:
            if not self.is_valid_shebang(first_line, filepath):
                self.files_with_issues[filepath] = [1]
        elif has_shebang or is_executable:
            # Either a shebang on a non-executable file,
            # or an executable without a shebang.
            self.files_with_issues[filepath] = None
200
201
class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files that end with an incomplete line
    (no newline character at the end of the last line)."""

    heading = "Missing newline at end of file:"

    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record the file if its last byte is not a newline.

        Empty files are not reported.
        """
        with open(filepath, "rb") as f:
            try:
                f.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. If we can't seek
                # 1 before the end, it means that this position is before
                # the beginning of the file, i.e. that the file is empty.
                return
            last_byte = f.read(1)
        if last_byte != b"\n":
            self.files_with_issues[filepath] = None
221
222
class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.
    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""

    heading = "UTF-8 BOM present:"

    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record the file if it starts with the UTF-8 byte order mark."""
        with open(filepath, "rb") as f:
            # Only the first few bytes matter: don't load the whole file.
            prefix = f.read(len(codecs.BOM_UTF8))
        if prefix == codecs.BOM_UTF8:
            self.files_with_issues[filepath] = None
236
237
class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only allow valid UTF-8, and only other explicitly allowed characters.
    # We deliberately exclude all characters that aren't a simple non-blank,
    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
    # line breaks, "basic" no-break space and soft hyphen). In particular,
    # non-ASCII control characters, combining characters, and Unicode state
    # changes (e.g. right-to-left text) are forbidden.
    # Note that we do allow some characters with a risk of visual confusion,
    # for example U+002D HYPHEN-MINUS vs U+00AD SOFT HYPHEN vs U+2010 HYPHEN,
    # or U+0041 LATIN CAPITAL LETTER A vs U+0391 GREEK CAPITAL LETTER ALPHA.
    GOOD_CHARACTERS = ''.join([
        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
        '\u2190-\u21FF', # Arrows
        '\u2200-\u22FF', # Mathematical Symbols
        '\u2500-\u257F' # Box Drawings characters used in markdown trees
    ])
    # Allow any of the characters and ranges above, and anything classified
    # as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Tell whether ``line`` is not valid UTF-8 or has a forbidden character."""
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        starts_with_bom = text.startswith('\uFEFF')
        if line_number == 1 and starts_with_bom:
            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
            # Which files are allowed to have a BOM is handled in
            # Utf8BomIssueTracker.
            text = text[1:]
        return self.GOOD_CHARACTERS_RE.match(text) is None
277
class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check the files that pass the inherited filter and that
        ``is_windows_file`` does not classify as Windows files."""
        return (super().should_check_file(filepath) and
                not is_windows_file(filepath))

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` contains a carriage return."""
        has_carriage_return = b"\r" in line
        return has_carriage_return
290
291
class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check the files that pass the inherited filter and that
        ``is_windows_file`` classifies as Windows files."""
        return (super().should_check_file(filepath) and
                is_windows_file(filepath))

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` fails to end with CRLF or has a stray CR."""
        if not line.endswith(b"\r\n"):
            return True
        # The terminator is fine: look for a CR in the rest of the line.
        return b"\r" in line[:-2]
Gilles Peskine545e13f2020-03-24 22:29:11 +0100304
305
class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"
    suffix_exemptions = frozenset([".dsp", ".md"])

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` has whitespace before its line terminator."""
        content = line.rstrip(b"\r\n")
        # Anything that a further whitespace strip removes is trailing
        # whitespace other than the line terminator.
        return content != content.rstrip()
314
315
class TabIssueTracker(LineIssueTracker):
    """Track lines with tabs."""

    heading = "Tabs present:"
    # File name endings for which tabs are accepted.
    suffix_exemptions = frozenset([
        ".make",
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/.gitmodules",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    ])

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` contains a tab character."""
        has_tab = b"\t" in line
        return has_tab
332
333
class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.
    These are leftovers from a ``git merge`` that wasn't fully edited."""

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` looks like a leftover git conflict marker."""
        # Start and end markers, plus the base marker that appears
        # with merge.conflictStyle=diff3.
        if line.startswith((b'<<<<<<< ', b'>>>>>>> ', b'||||||| ')):
            return True
        # A line of exactly seven equal signs separates the two sides,
        # but it is not reported in files whose name ends with .md.
        is_separator = line.rstrip(b'\r\n') == b'======='
        return is_separator and not _filepath.endswith('.md')
350
Darryl Green10d9ce32018-02-28 10:02:55 +0000351
def this_location():
    """Return the base name of this file and a line number inside this function."""
    frame = inspect.currentframe()
    assert frame is not None
    frame_info = inspect.getframeinfo(frame)
    base_name = os.path.basename(frame_info.filename)
    return base_name, frame_info.lineno
# Evaluated once at import time. The line number marks a position in this
# file that comes before the definition of LicenseIssueTracker.
THIS_FILE_BASE_NAME, LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = this_location()
358
class LicenseIssueTracker(LineIssueTracker):
    """Check copyright statements and license indications.

    This class only checks that statements are correct if present. It does
    not enforce the presence of statements in each file.
    """

    heading = "License issue:"

    LICENSE_EXEMPTION_RE_LIST = [
        # Third-party code, other than whitelisted third-party modules,
        # may be under a different license.
        r'3rdparty/(?!(p256-m)/.*)',
        # Documentation explaining the license may have accidental
        # false positives.
        r'(ChangeLog|LICENSE|[-0-9A-Z_a-z]+\.md)\Z',
        # Files imported from TF-M, and not used except in test builds,
        # may be under a different license.
        r'configs/ext/crypto_config_profile_medium\.h\Z',
        r'configs/ext/tfm_mbedcrypto_config_profile_medium\.h\Z',
        r'configs/ext/README\.md\Z',
        # Third-party file.
        r'dco\.txt\Z',
    ]
    # Skip binary files as well as the license-specific exemptions.
    path_exemptions = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST +
                                          LICENSE_EXEMPTION_RE_LIST))

    COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
    # Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
    COPYRIGHT_RE = re.compile(rb'.*\bcopyright\s+((?:\w|\s|[()]|[^ -~])*\w)', re.I)

    SPDX_HEADER_KEY = b'SPDX-License-Identifier'
    LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
    SPDX_RE = re.compile(br'.*?(' +
                         re.escape(SPDX_HEADER_KEY) +
                         br')(:\s*(.*?)\W*\Z|.*)', re.I)

    LICENSE_MENTION_RE = re.compile(rb'.*(?:' + rb'|'.join([
        rb'Apache License',
        rb'General Public License',
    ]) + rb')', re.I)

    def __init__(self):
        super().__init__()
        # Record what problem was caused. We can't easily report it due to
        # the structure of the script. To be fixed after
        # https://github.com/Mbed-TLS/mbedtls/pull/2506
        self.problem = None

    def issue_with_line(self, line, filepath, line_number):
        """Tell whether ``line`` has an incorrect copyright or license statement.

        When returning ``True``, a description of the problem is stored
        in ``self.problem``.
        """
        #pylint: disable=too-many-return-statements

        # Use endswith() rather than the more correct os.path.basename()
        # because experimentally, it makes a significant difference to
        # the running time.
        in_checker_code = (
            filepath.endswith(THIS_FILE_BASE_NAME) and
            line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER
        )
        if in_checker_code:
            # Avoid false positives from the code in this class.
            # Also skip the rest of this file, which is highly unlikely to
            # contain any problematic statements since we put those near the
            # top of files.
            return False

        copyright_match = self.COPYRIGHT_RE.match(line)
        if copyright_match is not None and \
           copyright_match.group(1) != self.COPYRIGHT_HOLDER:
            self.problem = 'Invalid copyright line'
            return True

        spdx_match = self.SPDX_RE.match(line)
        if spdx_match is not None:
            header_key = spdx_match.group(1)
            identifier = spdx_match.group(3)
            # The regular expression is case-insensitive, so a match whose
            # key differs from the expected spelling is a misspelling.
            if header_key != self.SPDX_HEADER_KEY:
                self.problem = 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
                return True
            if not identifier:
                self.problem = 'Improperly formatted SPDX license identifier'
                return True
            if identifier != self.LICENSE_IDENTIFIER:
                self.problem = 'Wrong SPDX license identifier'
                return True

        if self.LICENSE_MENTION_RE.match(line) is not None:
            self.problem = 'Suspicious license mention'
            return True

        return False
445
446
class IntegrityChecker:
    """Sanity-check files under the current directory."""

    def __init__(self, log_file):
        """Instantiate the sanity checker.
        Check files under the current directory.
        Write a report of issues to log_file."""
        build_tree.check_repo_path()
        self.logger = None
        self.setup_logger(log_file)
        # One tracker per kind of issue. Each accumulates its own findings.
        self.issues_to_check = [
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),
            TabIssueTracker(),
            MergeArtifactIssueTracker(),
            LicenseIssueTracker(),
        ]

    def setup_logger(self, log_file, level=logging.INFO):
        """Log to log_file if provided, or to stderr if None."""
        self.logger = logging.getLogger()
        self.logger.setLevel(level)
        if log_file:
            handler = logging.FileHandler(log_file)
            self.logger.addHandler(handler)
        else:
            console = logging.StreamHandler()
            self.logger.addHandler(console)

    @staticmethod
    def collect_files():
        """Return the list of files to check.

        These are the regular files committed into Git.

        Raise ``subprocess.CalledProcessError`` if ``git ls-files`` fails,
        and ``UnicodeDecodeError`` if a file name is not ASCII.
        """
        bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
        # The output is NUL-terminated, so drop the final empty element.
        bytes_filepaths = bytes_output.split(b'\0')[:-1]
        ascii_filepaths = [fp.decode('ascii') for fp in bytes_filepaths]
        # Filter out directories. Normally Git doesn't list directories
        # (it only knows about the files inside them), but there is
        # at least one case where 'git ls-files' includes a directory:
        # submodules. Just skip submodules (and any other directories).
        ascii_filepaths = [fp for fp in ascii_filepaths
                           if os.path.isfile(fp)]
        # Prepend './' to files in the top-level directory so that
        # something like `'/Makefile' in fp` matches in the top-level
        # directory as well as in subdirectories.
        return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
                for fp in ascii_filepaths]

    def check_files(self):
        """Check all files for all issues."""
        # List the files only once: this runs a Git subprocess and checks
        # each path on disk, and the result is the same for every tracker.
        filepaths = self.collect_files()
        for issue_to_check in self.issues_to_check:
            for filepath in filepaths:
                if issue_to_check.should_check_file(filepath):
                    issue_to_check.check_file_for_issue(filepath)

    def output_issues(self):
        """Log the issues found and their locations.

        Return 1 if there were issues, 0 otherwise.
        """
        integrity_return_code = 0
        for issue_to_check in self.issues_to_check:
            if issue_to_check.files_with_issues:
                integrity_return_code = 1
            issue_to_check.output_file_issues(self.logger)
        return integrity_return_code
520
521
def run_main():
    """Parse the command line, run all the checks and exit.

    The exit status is the value returned by
    ``IntegrityChecker.output_issues``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-l", "--log_file", type=str, help="path to optional output log",
    )
    options = parser.parse_args()
    checker = IntegrityChecker(options.log_file)
    checker.check_files()
    sys.exit(checker.output_issues())
532
533
# Run the checks only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_main()