Support non-ASCII characters in headers

Filter out non-ASCII characters in automatically processed headers. Do this in a way that minimizes the code change: keep manipulating strings, but strip off non-ASCII characters when reading lines, which should only remove characters in comments that we don't parse anyway.

commit: 49af2d3a4f1f51ec0c842df41b293b348574ec3f [log] [tgz]
author: Gilles Peskine <Gilles.Peskine@arm.com> Fri Dec 06 19:20:13 2019 +0100
committer: Gilles Peskine <Gilles.Peskine@arm.com> Wed Dec 11 11:03:07 2019 +0100
tree: 85c790ceb7529031a84920178232a7138c4ca99a
parent: 81f7909497c12f637ab4f45d16bdab5cf91f2e43 [diff]
diff --git a/scripts/generate_psa_constants.py b/scripts/generate_psa_constants.py
index c2d2558..a9de148 100755
--- a/scripts/generate_psa_constants.py
+++ b/scripts/generate_psa_constants.py

@@ -270,11 +270,16 @@
             # Other macro without parameter
             return
 
+    _nonascii_re = re.compile(rb'[^\x00-\x7f]+')
+    _continued_line_re = re.compile(rb'\\\r?\n\Z')
     def read_file(self, header_file):
         for line in header_file:
-            while line.endswith('\\\n'):
+            m = re.search(self._continued_line_re, line)
+            while m:
                 cont = next(header_file)
-                line = line[:-2] + cont
+                line = line[:m.start(0)] + cont
+                m = re.search(self._continued_line_re, line)
+            line = re.sub(self._nonascii_re, rb'', line).decode('ascii')
             self.read_line(line)
 
     @staticmethod
@@ -380,7 +385,7 @@
 def generate_psa_constants(header_file_names, output_file_name):
     collector = MacroCollector()
     for header_file_name in header_file_names:
-        with open(header_file_name) as header_file:
+        with open(header_file_name, 'rb') as header_file:
             collector.read_file(header_file)
     temp_file_name = output_file_name + '.tmp'
     with open(temp_file_name, 'w') as output_file:

diff --git a/tests/scripts/test_psa_constant_names.py b/tests/scripts/test_psa_constant_names.py
index 7553394..4829321 100755
--- a/tests/scripts/test_psa_constant_names.py
+++ b/tests/scripts/test_psa_constant_names.py

@@ -43,12 +43,14 @@
     except that if process(line) raises an exception, then the read_file_lines
     snippet annotates the exception with the file name and line number.
     """
-    def __init__(self, filename):
+    def __init__(self, filename, binary=False):
         self.filename = filename
         self.line_number = 'entry'
         self.generator = None
+        self.binary = binary
     def __enter__(self):
-        self.generator = enumerate(open(self.filename, 'r'))
+        self.generator = enumerate(open(self.filename,
+                                        'rb' if self.binary else 'r'))
         return self
     def __iter__(self):
         for line_number, content in self.generator:
@@ -224,13 +226,15 @@
         if m.group(3):
             self.argspecs[name] = self._argument_split(m.group(3))
 
+    _nonascii_re = re.compile(rb'[^\x00-\x7f]+')
     def parse_header(self, filename):
         """Parse a C header file, looking for "#define PSA_xxx"."""
-        with read_file_lines(filename) as lines:
+        with read_file_lines(filename, binary=True) as lines:
             for line in lines:
+                line = re.sub(self._nonascii_re, rb'', line).decode('ascii')
                 self.parse_header_line(line)
 
-    _macro_identifier_re = r'[A-Z]\w+'
+    _macro_identifier_re = re.compile(r'[A-Z]\w+')
     def generate_undeclared_names(self, expr):
         for name in re.findall(self._macro_identifier_re, expr):
             if name not in self.all_declared:
commit	49af2d3a4f1f51ec0c842df41b293b348574ec3f	[log] [tgz]
author	Gilles Peskine <Gilles.Peskine@arm.com>	Fri Dec 06 19:20:13 2019 +0100
committer	Gilles Peskine <Gilles.Peskine@arm.com>	Wed Dec 11 11:03:07 2019 +0100
tree	85c790ceb7529031a84920178232a7138c4ca99a
parent	81f7909497c12f637ab4f45d16bdab5cf91f2e43 [diff]