Check copyright statements and SPDX license identifier

Enforce a specific copyright statement and a specific SPDX license
identifier where they are present.

Binary files, third-party modules and a few other exceptions are not
checked.

There is currently no check that copyright statements and license
identifiers are present.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
diff --git a/tests/scripts/check_files.py b/tests/scripts/check_files.py
index 31edf8b..554ed47 100755
--- a/tests/scripts/check_files.py
+++ b/tests/scripts/check_files.py
@@ -12,6 +12,7 @@
 
 import argparse
 import codecs
+import inspect
 import logging
 import os
 import re
@@ -345,6 +346,84 @@
         return False
 
 
+THIS_FILE_BASE_NAME = \
+    os.path.basename(inspect.getframeinfo(inspect.currentframe()).filename)
+LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = \
+    inspect.getframeinfo(inspect.currentframe()).lineno
+class LicenseIssueTracker(LineIssueTracker):
+    """Check copyright statements and license indications.
+
+    This class only checks that statements are correct if present. It does
+    not enforce the presence of statements in each file.
+    """
+
+    heading = "License issue:"
+
+    LICENSE_EXEMPTION_RE_LIST = [
+        # Third-party code, other than whitelisted third-party modules,
+        # may be under a different license.
+        r'3rdparty/(?!(p256-m)/.*)',
+        # Documentation explaining the license may have accidental
+        # false positives.
+        r'(ChangeLog|LICENSE|[-0-9A-Z_a-z]+\.md)\Z',
+        # Files imported from TF-M, and not used except in test builds,
+        # may be under a different license.
+        r'configs/crypto_config_profile_medium\.h\Z',
+        r'configs/tfm_mbedcrypto_config_profile_medium\.h\Z',
+        # Third-party file.
+        r'dco\.txt\Z',
+    ]
+    path_exemptions = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST +
+                                          LICENSE_EXEMPTION_RE_LIST))
+
+    COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
+    # Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
+    COPYRIGHT_RE = re.compile(rb'.*\bcopyright\s+((?:\w|\s|[()]|[^ -~])*\w)', re.I)
+
+    SPDX_HEADER_KEY = b'SPDX-License-Identifier'
+    LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
+    SPDX_RE = re.compile(br'.*?(' +
+                         re.escape(SPDX_HEADER_KEY) +
+                         br')(:\s*(.*?)\W*\Z|.*)', re.I)
+
+    def __init__(self):
+        super().__init__()
+        # Record what problem was caused. We can't easily report it due to
+        # the structure of the script. To be fixed after
+        # https://github.com/Mbed-TLS/mbedtls/pull/2506
+        self.problem = None
+
+    def issue_with_line(self, line, filepath, line_number):
+        # Use endswith() rather than the more correct os.path.basename()
+        # because experimentally, it makes a significant difference to
+        # the running time.
+        if filepath.endswith(THIS_FILE_BASE_NAME) and \
+           line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER:
+            # Avoid false positives from the code in this class.
+            # Also skip the rest of this file, which is highly unlikely to
+            # contain any problematic statements since we put those near the
+            # top of files.
+            return False
+
+        m = self.COPYRIGHT_RE.match(line)
+        if m and m.group(1) != self.COPYRIGHT_HOLDER:
+            self.problem = 'Invalid copyright line'
+            return True
+
+        m = self.SPDX_RE.match(line)
+        if m:
+            if m.group(1) != self.SPDX_HEADER_KEY:
+                self.problem = 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
+                return True
+            if not m.group(3):
+                self.problem = 'Improperly formatted SPDX license identifier'
+                return True
+            if m.group(3) != self.LICENSE_IDENTIFIER:
+                self.problem = 'Wrong SPDX license identifier'
+                return True
+        return False
+
+
 class IntegrityChecker:
     """Sanity-check files under the current directory."""
 
@@ -365,6 +444,7 @@
             TrailingWhitespaceIssueTracker(),
             TabIssueTracker(),
             MergeArtifactIssueTracker(),
+            LicenseIssueTracker(),
         ]
 
     def setup_logger(self, log_file, level=logging.INFO):