"""Helper functions to parse C code in heavily constrained scenarios.

Currently supported functionality:

* read_function_declarations: read function declarations from a header file.
"""
| 7 | |
| 8 | # Copyright The Mbed TLS Contributors |
| 9 | # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later |
| 10 | |
### WARNING: the code in this file has not been extensively reviewed yet.
### We do not think it is harmful, but it may be below our normal standards
### for robustness and maintainability.
| 14 | |
Gilles Peskine | cedb112 | 2023-11-22 19:24:31 +0100 | [diff] [blame] | 15 | import re |
| 16 | from typing import Dict, Iterable, Iterator, List, Optional, Tuple |
| 17 | |
| 18 | |
class ArgumentInfo:
    """Information about an argument to an API function.

    The constructor parses one C parameter declaration and sets:

    * ``decl``: the declaration text, stripped of surrounding whitespace;
    * ``type``: the type part, with whitespace normalized;
    * ``name``: the parameter name, or ``None`` if the declaration has none;
    * ``suffix``: a trailing bracketed suffix such as ``[32]``, or ``''``.
    """
    #pylint: disable=too-few-public-methods

    # Words that belong to the type even when they are the last word of the
    # declaration (e.g. "unsigned int" has no parameter name).
    _KEYWORDS = [
        'const', 'register', 'restrict',
        'int', 'long', 'short', 'signed', 'unsigned',
    ]
    _DECLARATION_RE = re.compile(
        r'(?P<type>\w[\w\s*]*?)\s*' +
        # The \b after the alternation makes the lookahead reject whole
        # keywords only. Without it, a name that merely starts with a
        # keyword (e.g. "interval", "constant", "long_name") would be
        # refused as a name and end up as part of the type.
        r'(?!(?:' + r'|'.join(_KEYWORDS) + r')\b)(?P<name>\b\w+\b)?' +
        r'\s*(?P<suffix>\[[^][]*\])?\Z',
        re.A | re.S)

    @classmethod
    def normalize_type(cls, typ: str) -> str:
        """Normalize whitespace in a type.

        Each run of whitespace becomes a single space, and each asterisk
        ends up preceded by exactly one space.
        """
        typ = re.sub(r'\s+', r' ', typ)
        typ = re.sub(r'\s*\*', r' *', typ)
        return typ

    def __init__(self, decl: str) -> None:
        """Parse the parameter declaration ``decl``.

        Raise ``ValueError`` (with the stripped declaration as its argument)
        if the declaration does not have the supported shape.
        """
        self.decl = decl.strip()
        m = self._DECLARATION_RE.match(self.decl)
        if not m:
            raise ValueError(self.decl)
        self.type = self.normalize_type(m.group('type')) #type: str
        self.name = m.group('name') #type: Optional[str]
        self.suffix = m.group('suffix') if m.group('suffix') else '' #type: str
| 48 | |
| 49 | |
class FunctionInfo:
    """Record describing one declared API function.

    Instances keep the location of the declaration (``filename``,
    ``line_number``), its ``qualifiers`` as a frozenset, the ``return_type``
    text, the function ``name`` and the parsed ``arguments``.
    """
    #pylint: disable=too-few-public-methods

    # Regex matching text that ends with the word "void", optionally
    # followed by whitespace. Used to recognize a void return type.
    VOID_RE = re.compile(r'\s*\bvoid\s*\Z', re.A)

    def __init__(self, #pylint: disable=too-many-arguments
                 filename: str,
                 line_number: int,
                 qualifiers: Iterable[str],
                 return_type: str,
                 name: str,
                 arguments: List[str]) -> None:
        """Store the pieces of a declaration.

        Each string in ``arguments`` is one parameter declaration; it is
        parsed into an ``ArgumentInfo``.
        """
        # Where the declaration was found.
        self.filename = filename
        self.line_number = line_number
        # What was declared.
        self.name = name
        self.return_type = return_type
        self.qualifiers = frozenset(qualifiers)
        self.arguments = [ArgumentInfo(declaration)
                          for declaration in arguments]

    def returns_void(self) -> bool:
        """Tell whether the return type is void."""
        return self.VOID_RE.search(self.return_type) is not None
| 74 | |
| 75 | |
| 76 | # Match one C comment. |
| 77 | # Note that we match both comment types, so things like // in a /*...*/ |
| 78 | # comment are handled correctly. |
Gilles Peskine | f81f191 | 2024-01-08 21:05:42 +0100 | [diff] [blame] | 79 | _C_COMMENT_RE = re.compile(r'//(?:[^\n]|\\\n)*|/\*.*?\*/', re.S) |
Gilles Peskine | cedb112 | 2023-11-22 19:24:31 +0100 | [diff] [blame] | 80 | _NOT_NEWLINES_RE = re.compile(r'[^\n]+') |
| 81 | |
def read_logical_lines(filename: str) -> Iterator[Tuple[int, str]]:
    """Read logical lines from a file.

    A logical line is one or more physical lines, joined with newlines,
    extended until its parentheses are balanced. Comments are removed
    before the content is split into lines.

    Yield ``(line_number, text)`` pairs, where ``line_number`` is the
    1-based number of the first physical line of the logical line.

    If parentheses are still open at the end of the file, the remaining
    content is yielded as one final logical line.
    """
    with open(filename, encoding='utf-8') as inp:
        content = inp.read()
    # Strip comments, but keep newlines for line numbering
    content = _C_COMMENT_RE.sub(
        lambda m: _NOT_NEWLINES_RE.sub("", m.group(0)),
        content)
    lines = enumerate(content.splitlines(), 1)
    for line_number, line in lines:
        # Read a logical line, containing balanced parentheses.
        # We assume that parentheses are balanced (this should be ok
        # since comments have been stripped), otherwise there will be
        # a gigantic logical line at the end.
        paren_level = line.count('(') - line.count(')')
        while paren_level > 0:
            # Ask next() for a default instead of letting StopIteration
            # escape: inside a generator that would surface as a
            # RuntimeError and the unfinished logical line would be lost.
            _, more = next(lines, (0, None))
            if more is None:
                break
            paren_level += more.count('(') - more.count(')')
            line += '\n' + more
        yield line_number, line
| 105 | |
# Match a C function declaration (prototype ending with a semicolon).
# Named groups:
# * qualifiers: zero or more of the words extern, inline, static;
# * return_type: words, whitespace and asterisks preceding the function name;
# * name: the word immediately before the opening parenthesis;
# * arguments: the text between that opening parenthesis and the last
#   closing parenthesis that is followed by a semicolon (the match is greedy
#   and, with re.S, may span several physical lines).
_C_FUNCTION_DECLARATION_RE = re.compile(
    r'(?P<qualifiers>(?:(?:extern|inline|static)\b\s*)*)'
    r'(?P<return_type>\w[\w\s*]*?)\s*' +
    r'\b(?P<name>\w+)' +
    r'\s*\((?P<arguments>.*)\)\s*;',
    re.A | re.S)
| 112 | |
def read_function_declarations(functions: Dict[str, FunctionInfo],
                               filename: str) -> None:
    """Collect function declarations from a C header file.

    Each logical line of ``filename`` that starts with a function
    declaration is turned into a ``FunctionInfo`` and stored in
    ``functions`` under the function's name. Logical lines that do not
    match are skipped.

    A parameter list that is ``void`` or empty (``f(void)`` or ``f()``)
    results in an empty argument list.

    Raise ``ValueError`` if a parameter declaration cannot be parsed.
    """
    for line_number, line in read_logical_lines(filename):
        m = _C_FUNCTION_DECLARATION_RE.match(line)
        if not m:
            continue
        qualifiers = m.group('qualifiers').split()
        return_type = m.group('return_type')
        name = m.group('name')
        arguments_text = m.group('arguments')
        arguments = arguments_text.split(',')
        # "(void)" declares no parameters. Treat "()" the same way: there is
        # no parameter to describe, and passing the empty string on would
        # only fail with an uninformative ValueError('').
        if not arguments_text.strip() or \
           FunctionInfo.VOID_RE.match(arguments_text):
            arguments = []
        # Note: we replace any existing declaration for the same name.
        functions[name] = FunctionInfo(filename, line_number,
                                       qualifiers,
                                       return_type,
                                       name,
                                       arguments)