Blame - tests/scripts/check_names.py - mirror/mbed-tls

blob: f71a97805f718f677973fdf44264cd6d30501036 [file] [log] [blame]

Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	1	#!/usr/bin/env python3
				2	#
				3	# Copyright The Mbed TLS Contributors
				4	# SPDX-License-Identifier: Apache-2.0
				5	#
				6	# Licensed under the Apache License, Version 2.0 (the "License"); you may
				7	# not use this file except in compliance with the License.
				8	# You may obtain a copy of the License at
				9	#
				10	# http://www.apache.org/licenses/LICENSE-2.0
				11	#
				12	# Unless required by applicable law or agreed to in writing, software
				13	# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
				14	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				15	# See the License for the specific language governing permissions and
				16	# limitations under the License.
				17
				18	"""
				19	This script confirms that the naming of all symbols and identifiers in Mbed TLS
				20	are consistent with the house style and are also self-consistent. It only runs
				21	on Linux and macOS since it depends on nm.
				22
				23	It contains two major Python classes, CodeParser and NameChecker. They both have
				24	a comprehensive "run-all" function (comprehensive_parse() and perform_checks())
				25	but the individual functions can also be used for specific needs.
				26
				27	CodeParser makes heavy use of regular expressions to parse the code, and is
				28	dependent on the current code formatting. Many Python C parser libraries require
				29	preprocessed C code, which means no macro parsing. Compiler tools are also not
				30	very helpful when we want the exact location in the original source (which
				31	becomes impossible when e.g. comments are stripped).
				32
				33	NameChecker performs the following checks:
				34
				35	- All exported and available symbols in the library object files, are explicitly
				36	declared in the header files. This uses the nm command.
				37	- All macros, constants, and identifiers (function names, struct names, etc)
				38	follow the required regex pattern.
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	39	- Typo checking: All words that begin with MBED|PSA exist as macros or constants.
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	40
				41	The script returns 0 on success, 1 on test failure, and 2 if there is a script
				42	error. It must be run from Mbed TLS root.
				43	"""
				44
				45	import abc
				46	import argparse
Gilles Peskine	7bf5205	2021-09-27 19:20:17 +0200	[diff] [blame]	47	import fnmatch
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	48	import glob
				49	import textwrap
				50	import os
				51	import sys
				52	import traceback
				53	import re
				54	import enum
				55	import shutil
				56	import subprocess
				57	import logging
				58
Gilles Peskine	7ff4766	2022-09-18 21:17:09 +0200	[diff] [blame]	59	import scripts_path # pylint: disable=unused-import
				60	from mbedtls_dev import build_tree
				61
				62
# Naming patterns to check against. These are defined outside the NameCheck
# class for ease of modification.
#
# Macros and enum constants: an MBEDTLS_ or PSA_ prefix followed by upper-case
# letters, digits and underscores, and not ending with an underscore.
MACRO_PATTERN = r"^(MBEDTLS|PSA)_[0-9A-Z_]*[0-9A-Z]$"
CONSTANTS_PATTERN = MACRO_PATTERN
# Identifiers: the same shape, but with a lower-case prefix and body.
IDENTIFIER_PATTERN = r"^(mbedtls|psa)_[0-9a-z_]*[0-9a-z]$"
				68
				69	class Match(): # pylint: disable=too-few-public-methods
				70	"""
				71	A class representing a match, together with its found position.
				72
				73	Fields:
				74	* filename: the file that the match was in.
				75	* line: the full line containing the match.
				76	* line_no: the line number.
				77	* pos: a tuple of (start, end) positions on the line where the match is.
				78	* name: the match itself.
				79	"""
				80	def __init__(self, filename, line, line_no, pos, name):
				81	# pylint: disable=too-many-arguments
				82	self.filename = filename
				83	self.line = line
				84	self.line_no = line_no
				85	self.pos = pos
				86	self.name = name
				87
				88	def __str__(self):
				89	"""
				90	Return a formatted code listing representation of the erroneous line.
				91	"""
				92	gutter = format(self.line_no, "4d")
				93	underline = self.pos[0] * " " + (self.pos[1] - self.pos[0]) * "^"
				94
				95	return (
				96	" {0} \|\n".format(" " * len(gutter)) +
				97	" {0} \| {1}".format(gutter, self.line) +
				98	" {0} \| {1}\n".format(" " * len(gutter), underline)
				99	)
				100
				101	class Problem(abc.ABC): # pylint: disable=too-few-public-methods
				102	"""
				103	An abstract parent class representing a form of static analysis error.
				104	It extends an Abstract Base Class, which means it is not instantiable, and
				105	it also mandates certain abstract methods to be implemented in subclasses.
				106	"""
				107	# Class variable to control the quietness of all problems
				108	quiet = False
				109	def __init__(self):
				110	self.textwrapper = textwrap.TextWrapper()
				111	self.textwrapper.width = 80
				112	self.textwrapper.initial_indent = " > "
				113	self.textwrapper.subsequent_indent = " "
				114
				115	def __str__(self):
				116	"""
				117	Unified string representation method for all Problems.
				118	"""
				119	if self.__class__.quiet:
				120	return self.quiet_output()
				121	return self.verbose_output()
				122
				123	@abc.abstractmethod
				124	def quiet_output(self):
				125	"""
				126	The output when --quiet is enabled.
				127	"""
				128	pass
				129
				130	@abc.abstractmethod
				131	def verbose_output(self):
				132	"""
				133	The default output with explanation and code snippet if appropriate.
				134	"""
				135	pass
				136
				137	class SymbolNotInHeader(Problem): # pylint: disable=too-few-public-methods
				138	"""
				139	A problem that occurs when an exported/available symbol in the object file
				140	is not explicitly declared in header files. Created with
				141	NameCheck.check_symbols_declared_in_header()
				142
				143	Fields:
				144	* symbol_name: the name of the symbol.
				145	"""
				146	def __init__(self, symbol_name):
				147	self.symbol_name = symbol_name
				148	Problem.__init__(self)
				149
				150	def quiet_output(self):
				151	return "{0}".format(self.symbol_name)
				152
				153	def verbose_output(self):
				154	return self.textwrapper.fill(
				155	"'{0}' was found as an available symbol in the output of nm, "
				156	"however it was not declared in any header files."
				157	.format(self.symbol_name))
				158
				159	class PatternMismatch(Problem): # pylint: disable=too-few-public-methods
				160	"""
				161	A problem that occurs when something doesn't match the expected pattern.
				162	Created with NameCheck.check_match_pattern()
				163
				164	Fields:
				165	* pattern: the expected regex pattern
				166	* match: the Match object in question
				167	"""
				168	def __init__(self, pattern, match):
				169	self.pattern = pattern
				170	self.match = match
				171	Problem.__init__(self)
				172
				173
				174	def quiet_output(self):
				175	return (
				176	"{0}:{1}:{2}"
				177	.format(self.match.filename, self.match.line_no, self.match.name)
				178	)
				179
				180	def verbose_output(self):
				181	return self.textwrapper.fill(
				182	"{0}:{1}: '{2}' does not match the required pattern '{3}'."
				183	.format(
				184	self.match.filename,
				185	self.match.line_no,
				186	self.match.name,
				187	self.pattern
				188	)
				189	) + "\n" + str(self.match)
				190
				191	class Typo(Problem): # pylint: disable=too-few-public-methods
				192	"""
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	193	A problem that occurs when a word using MBED or PSA doesn't
				194	appear to be defined as constants nor enum values. Created with
				195	NameCheck.check_for_typos()
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	196
				197	Fields:
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	198	* match: the Match object of the MBED\|PSA name in question.
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	199	"""
				200	def __init__(self, match):
				201	self.match = match
				202	Problem.__init__(self)
				203
				204	def quiet_output(self):
				205	return (
				206	"{0}:{1}:{2}"
				207	.format(self.match.filename, self.match.line_no, self.match.name)
				208	)
				209
				210	def verbose_output(self):
				211	return self.textwrapper.fill(
				212	"{0}:{1}: '{2}' looks like a typo. It was not found in any "
				213	"macros or any enums. If this is not a typo, put "
				214	"//no-check-names after it."
				215	.format(self.match.filename, self.match.line_no, self.match.name)
				216	) + "\n" + str(self.match)
				217
				218	class CodeParser():
				219	"""
				220	Class for retrieving files and parsing the code. This can be used
				221	independently of the checks that NameChecker performs, for example for
				222	list_internal_identifiers.py.
				223	"""
				224	def __init__(self, log):
				225	self.log = log
Gilles Peskine	7ff4766	2022-09-18 21:17:09 +0200	[diff] [blame]	226	build_tree.check_repo_path()
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	227
				228	# Memo for storing "glob expression": set(filepaths)
				229	self.files = {}
				230
Gilles Peskine	7bf5205	2021-09-27 19:20:17 +0200	[diff] [blame]	231	# Globally excluded filenames.
				232	# Note that "*" can match directory separators in exclude lists.
Gilles Peskine	d47f636	2021-09-27 20:12:00 +0200	[diff] [blame]	233	self.excluded_files = ["/bn_mul", "/compat-1.3.h"]
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	234
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	235	def comprehensive_parse(self):
				236	"""
				237	Comprehensive ("default") function to call each parsing function and
				238	retrieve various elements of the code, together with the source location.
				239
				240	Returns a dict of parsed item key to the corresponding List of Matches.
				241	"""
				242	self.log.info("Parsing source code...")
				243	self.log.debug(
				244	"The following files are excluded from the search: {}"
				245	.format(str(self.excluded_files))
				246	)
				247
				248	all_macros = self.parse_macros([
				249	"include/mbedtls/*.h",
				250	"include/psa/*.h",
				251	"library/*.h",
				252	"tests/include/test/drivers/*.h",
				253	"3rdparty/everest/include/everest/everest.h",
				254	"3rdparty/everest/include/everest/x25519.h"
				255	])
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	256	private_macros = self.parse_macros([
				257	"library/*.c",
				258	])
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	259	enum_consts = self.parse_enum_consts([
				260	"include/mbedtls/*.h",
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	261	"include/psa/*.h",
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	262	"library/*.h",
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	263	"library/*.c",
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	264	"3rdparty/everest/include/everest/everest.h",
				265	"3rdparty/everest/include/everest/x25519.h"
				266	])
				267	identifiers = self.parse_identifiers([
				268	"include/mbedtls/*.h",
				269	"include/psa/*.h",
				270	"library/*.h",
				271	"3rdparty/everest/include/everest/everest.h",
				272	"3rdparty/everest/include/everest/x25519.h"
				273	])
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	274	mbed_psa_words = self.parse_mbed_psa_words([
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	275	"include/mbedtls/*.h",
				276	"include/psa/*.h",
				277	"library/*.h",
				278	"3rdparty/everest/include/everest/everest.h",
				279	"3rdparty/everest/include/everest/x25519.h",
				280	"library/*.c",
				281	"3rdparty/everest/library/everest.c",
				282	"3rdparty/everest/library/x25519.c"
				283	])
				284	symbols = self.parse_symbols()
				285
				286	# Remove identifier macros like mbedtls_printf or mbedtls_calloc
				287	identifiers_justname = [x.name for x in identifiers]
				288	actual_macros = []
				289	for macro in all_macros:
				290	if macro.name not in identifiers_justname:
				291	actual_macros.append(macro)
				292
				293	self.log.debug("Found:")
				294	# Aligns the counts on the assumption that none exceeds 4 digits
				295	self.log.debug(" {:4} Total Macros".format(len(all_macros)))
				296	self.log.debug(" {:4} Non-identifier Macros".format(len(actual_macros)))
				297	self.log.debug(" {:4} Enum Constants".format(len(enum_consts)))
				298	self.log.debug(" {:4} Identifiers".format(len(identifiers)))
				299	self.log.debug(" {:4} Exported Symbols".format(len(symbols)))
				300	return {
				301	"macros": actual_macros,
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	302	"private_macros": private_macros,
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	303	"enum_consts": enum_consts,
				304	"identifiers": identifiers,
				305	"symbols": symbols,
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	306	"mbed_psa_words": mbed_psa_words
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	307	}
				308
Gilles Peskine	7bf5205	2021-09-27 19:20:17 +0200	[diff] [blame]	309	def is_file_excluded(self, path, exclude_wildcards):
Gilles Peskine	1c39975	2021-09-28 10:12:49 +0200	[diff] [blame]	310	"""Whether the given file path is excluded."""
Gilles Peskine	7bf5205	2021-09-27 19:20:17 +0200	[diff] [blame]	311	# exclude_wildcards may be None. Also, consider the global exclusions.
				312	exclude_wildcards = (exclude_wildcards or []) + self.excluded_files
				313	for pattern in exclude_wildcards:
				314	if fnmatch.fnmatch(path, pattern):
				315	return True
				316	return False
				317
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	318	def get_files(self, include_wildcards, exclude_wildcards):
				319	"""
				320	Get all files that match any of the UNIX-style wildcards. While the
				321	check_names script is designed only for use on UNIX/macOS (due to nm),
				322	this function alone would work fine on Windows even with forward slashes
				323	in the wildcard.
				324
				325	Args:
				326	* include_wildcards: a List of shell-style wildcards to match filepaths.
				327	* exclude_wildcards: a List of shell-style wildcards to exclude.
				328
				329	Returns a List of relative filepaths.
				330	"""
				331	accumulator = set()
				332
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	333	for include_wildcard in include_wildcards:
Gilles Peskine	7bf5205	2021-09-27 19:20:17 +0200	[diff] [blame]	334	accumulator = accumulator.union(glob.iglob(include_wildcard))
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	335
Gilles Peskine	7bf5205	2021-09-27 19:20:17 +0200	[diff] [blame]	336	return list(path for path in accumulator
				337	if not self.is_file_excluded(path, exclude_wildcards))
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	338
				339	def parse_macros(self, include, exclude=None):
				340	"""
				341	Parse all macros defined by #define preprocessor directives.
				342
				343	Args:
				344	* include: A List of glob expressions to look for files through.
				345	* exclude: A List of glob expressions for excluding files.
				346
				347	Returns a List of Match objects for the found macros.
				348	"""
				349	macro_regex = re.compile(r"# *define +(?P<macro>\w+)")
				350	exclusions = (
				351	"asm", "inline", "EMIT", "_CRT_SECURE_NO_DEPRECATE", "MULADDC_"
				352	)
				353
				354	files = self.get_files(include, exclude)
				355	self.log.debug("Looking for macros in {} files".format(len(files)))
				356
				357	macros = []
				358	for header_file in files:
				359	with open(header_file, "r", encoding="utf-8") as header:
				360	for line_no, line in enumerate(header):
				361	for macro in macro_regex.finditer(line):
				362	if macro.group("macro").startswith(exclusions):
				363	continue
				364
				365	macros.append(Match(
				366	header_file,
				367	line,
				368	line_no,
				369	macro.span("macro"),
				370	macro.group("macro")))
				371
				372	return macros
				373
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	374	def parse_mbed_psa_words(self, include, exclude=None):
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	375	"""
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	376	Parse all words in the file that begin with MBED\|PSA, in and out of
				377	macros, comments, anything.
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	378
				379	Args:
				380	* include: A List of glob expressions to look for files through.
				381	* exclude: A List of glob expressions for excluding files.
				382
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	383	Returns a List of Match objects for words beginning with MBED\|PSA.
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	384	"""
				385	# Typos of TLS are common, hence the broader check below than MBEDTLS.
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	386	mbed_regex = re.compile(r"\b(MBED.+?\|PSA)_[A-Z0-9_]*")
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	387	exclusions = re.compile(r"// *no-check-names\|#error")
				388
				389	files = self.get_files(include, exclude)
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	390	self.log.debug(
				391	"Looking for MBED\|PSA words in {} files"
				392	.format(len(files))
				393	)
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	394
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	395	mbed_psa_words = []
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	396	for filename in files:
				397	with open(filename, "r", encoding="utf-8") as fp:
				398	for line_no, line in enumerate(fp):
				399	if exclusions.search(line):
				400	continue
				401
				402	for name in mbed_regex.finditer(line):
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	403	mbed_psa_words.append(Match(
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	404	filename,
				405	line,
				406	line_no,
				407	name.span(0),
				408	name.group(0)))
				409
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	410	return mbed_psa_words
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	411
				412	def parse_enum_consts(self, include, exclude=None):
				413	"""
				414	Parse all enum value constants that are declared.
				415
				416	Args:
				417	* include: A List of glob expressions to look for files through.
				418	* exclude: A List of glob expressions for excluding files.
				419
				420	Returns a List of Match objects for the findings.
				421	"""
				422	files = self.get_files(include, exclude)
				423	self.log.debug("Looking for enum consts in {} files".format(len(files)))
				424
				425	# Emulate a finite state machine to parse enum declarations.
				426	# OUTSIDE_KEYWORD = outside the enum keyword
				427	# IN_BRACES = inside enum opening braces
				428	# IN_BETWEEN = between enum keyword and opening braces
				429	states = enum.Enum("FSM", ["OUTSIDE_KEYWORD", "IN_BRACES", "IN_BETWEEN"])
				430	enum_consts = []
				431	for header_file in files:
				432	state = states.OUTSIDE_KEYWORD
				433	with open(header_file, "r", encoding="utf-8") as header:
				434	for line_no, line in enumerate(header):
				435	# Match typedefs and brackets only when they are at the
				436	# beginning of the line -- if they are indented, they might
				437	# be sub-structures within structs, etc.
David Horstmann	e1e776c	2022-12-16 13:39:04 +0000	[diff] [blame]	438	optional_c_identifier = r"([_a-zA-Z][_a-zA-Z0-9]*)?"
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	439	if (state == states.OUTSIDE_KEYWORD and
David Horstmann	e1e776c	2022-12-16 13:39:04 +0000	[diff] [blame]	440	re.search(r"^(typedef +)?enum " + \
				441	optional_c_identifier + \
				442	r" *{", line)):
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	443	state = states.IN_BRACES
				444	elif (state == states.OUTSIDE_KEYWORD and
				445	re.search(r"^(typedef +)?enum", line)):
				446	state = states.IN_BETWEEN
				447	elif (state == states.IN_BETWEEN and
				448	re.search(r"^{", line)):
				449	state = states.IN_BRACES
				450	elif (state == states.IN_BRACES and
				451	re.search(r"^}", line)):
				452	state = states.OUTSIDE_KEYWORD
				453	elif (state == states.IN_BRACES and
				454	not re.search(r"^ *#", line)):
				455	enum_const = re.search(r"^ *(?P<enum_const>\w+)", line)
				456	if not enum_const:
				457	continue
				458
				459	enum_consts.append(Match(
				460	header_file,
				461	line,
				462	line_no,
				463	enum_const.span("enum_const"),
				464	enum_const.group("enum_const")))
				465
				466	return enum_consts
				467
Gilles Peskine	4480162	2021-11-17 20:43:35 +0100	[diff] [blame]	468	IGNORED_CHUNK_REGEX = re.compile('\|'.join([
				469	r'/\.?\*/', # block comment entirely on one line
				470	r'//.*', # line comment
				471	r'(?P<string>")(?:[^\\\"]\|\\.)*"', # string literal
				472	]))
				473
Gilles Peskine	df30665	2021-11-17 20:32:31 +0100	[diff] [blame]	474	def strip_comments_and_literals(self, line, in_block_comment):
				475	"""Strip comments and string literals from line.
				476
				477	Continuation lines are not supported.
				478
				479	If in_block_comment is true, assume that the line starts inside a
				480	block comment.
				481
				482	Return updated values of (line, in_block_comment) where:
				483	* Comments in line have been replaced by a space (or nothing at the
				484	start or end of the line).
				485	* String contents have been removed.
				486	* in_block_comment indicates whether the line ends inside a block
				487	comment that continues on the next line.
				488	"""
Gilles Peskine	23b4096	2021-11-17 20:45:39 +0100	[diff] [blame]	489
				490	# Terminate current multiline comment?
Gilles Peskine	df30665	2021-11-17 20:32:31 +0100	[diff] [blame]	491	if in_block_comment:
Gilles Peskine	23b4096	2021-11-17 20:45:39 +0100	[diff] [blame]	492	m = re.search(r"\*/", line)
				493	if m:
				494	in_block_comment = False
				495	line = line[m.end(0):]
				496	else:
				497	return '', True
Gilles Peskine	4480162	2021-11-17 20:43:35 +0100	[diff] [blame]	498
				499	# Remove full comments and string literals.
				500	# Do it all together to handle cases like "/*" correctly.
				501	# Note that continuation lines are not supported.
				502	line = re.sub(self.IGNORED_CHUNK_REGEX,
				503	lambda s: '""' if s.group('string') else ' ',
Gilles Peskine	df30665	2021-11-17 20:32:31 +0100	[diff] [blame]	504	line)
Gilles Peskine	4480162	2021-11-17 20:43:35 +0100	[diff] [blame]	505
Gilles Peskine	df30665	2021-11-17 20:32:31 +0100	[diff] [blame]	506	# Start an unfinished comment?
Gilles Peskine	4480162	2021-11-17 20:43:35 +0100	[diff] [blame]	507	# (If `/*` was part of a complete comment, it's already been removed.)
Gilles Peskine	23b4096	2021-11-17 20:45:39 +0100	[diff] [blame]	508	m = re.search(r"/\*", line)
Gilles Peskine	df30665	2021-11-17 20:32:31 +0100	[diff] [blame]	509	if m:
				510	in_block_comment = True
Gilles Peskine	23b4096	2021-11-17 20:45:39 +0100	[diff] [blame]	511	line = line[:m.start(0)]
Gilles Peskine	4480162	2021-11-17 20:43:35 +0100	[diff] [blame]	512
Gilles Peskine	df30665	2021-11-17 20:32:31 +0100	[diff] [blame]	513	return line, in_block_comment
				514
Gilles Peskine	c8fc67f	2021-11-17 20:23:18 +0100	[diff] [blame]	515	IDENTIFIER_REGEX = re.compile('\|'.join([
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	516	# Match " something(a" or " *something(a". Functions.
				517	# Assumptions:
				518	# - function definition from return type to one of its arguments is
				519	# all on one line
				520	# - function definition line only contains alphanumeric, asterisk,
				521	# underscore, and open bracket
Gilles Peskine	c8fc67f	2021-11-17 20:23:18 +0100	[diff] [blame]	522	r".* \*(\w+) \( *\w",
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	523	# Match "(*something)(".
Gilles Peskine	c8fc67f	2021-11-17 20:23:18 +0100	[diff] [blame]	524	r".$ \* (\w+) $ *\(",
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	525	# Match names of named data structures.
Gilles Peskine	c8fc67f	2021-11-17 20:23:18 +0100	[diff] [blame]	526	r"(?:typedef +)?(?:struct\|union\|enum) +(\w+)(?: *{)?$",
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	527	# Match names of typedef instances, after closing bracket.
Gilles Peskine	c8fc67f	2021-11-17 20:23:18 +0100	[diff] [blame]	528	r"}? (\w+)[;[].",
				529	]))
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	530	# The regex below is indented for clarity.
Gilles Peskine	c8fc67f	2021-11-17 20:23:18 +0100	[diff] [blame]	531	EXCLUSION_LINES = re.compile("\|".join([
				532	r"extern +\"C\"",
				533	r"(typedef +)?(struct\|union\|enum)( *{)?$",
				534	r"} *;?$",
				535	r"$",
				536	r"//",
				537	r"#",
				538	]))
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	539
				540	def parse_identifiers_in_file(self, header_file, identifiers):
				541	"""
				542	Parse all lines of a header where a function/enum/struct/union/typedef
				543	identifier is declared, based on some regex and heuristics. Highly
				544	dependent on formatting style.
				545
				546	Append found matches to the list ``identifiers``.
				547	"""
				548
				549	with open(header_file, "r", encoding="utf-8") as header:
				550	in_block_comment = False
				551	# The previous line variable is used for concatenating lines
				552	# when identifiers are formatted and spread across multiple
				553	# lines.
				554	previous_line = ""
				555
				556	for line_no, line in enumerate(header):
Gilles Peskine	df30665	2021-11-17 20:32:31 +0100	[diff] [blame]	557	line, in_block_comment = \
				558	self.strip_comments_and_literals(line, in_block_comment)
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	559
Gilles Peskine	c8fc67f	2021-11-17 20:23:18 +0100	[diff] [blame]	560	if self.EXCLUSION_LINES.match(line):
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	561	previous_line = ""
				562	continue
				563
				564	# If the line contains only space-separated alphanumeric
Gilles Peskine	4f04d61	2021-11-17 20:39:56 +0100	[diff] [blame]	565	# characters (or underscore, asterisk, or open parenthesis),
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	566	# and nothing else, high chance it's a declaration that
				567	# continues on the next line
				568	if re.search(r"^([\w\*\(]+\s+)+$", line):
				569	previous_line += line
				570	continue
				571
				572	# If previous line seemed to start an unfinished declaration
				573	# (as above), concat and treat them as one.
				574	if previous_line:
				575	line = previous_line.strip() + " " + line.strip() + "\n"
				576	previous_line = ""
				577
				578	# Skip parsing if line has a space in front = heuristic to
				579	# skip function argument lines (highly subject to formatting
				580	# changes)
				581	if line[0] == " ":
				582	continue
				583
				584	identifier = self.IDENTIFIER_REGEX.search(line)
				585
				586	if not identifier:
				587	continue
				588
				589	# Find the group that matched, and append it
				590	for group in identifier.groups():
				591	if not group:
				592	continue
				593
				594	identifiers.append(Match(
				595	header_file,
				596	line,
				597	line_no,
				598	identifier.span(),
				599	group))
				600
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	601	def parse_identifiers(self, include, exclude=None):
				602	"""
				603	Parse all lines of a header where a function/enum/struct/union/typedef
				604	identifier is declared, based on some regex and heuristics. Highly
				605	dependent on formatting style.
				606
				607	Args:
				608	* include: A List of glob expressions to look for files through.
				609	* exclude: A List of glob expressions for excluding files.
				610
				611	Returns a List of Match objects with identifiers.
				612	"""
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	613
				614	files = self.get_files(include, exclude)
				615	self.log.debug("Looking for identifiers in {} files".format(len(files)))
				616
				617	identifiers = []
				618	for header_file in files:
Gilles Peskine	b3f4dd5	2021-11-16 20:56:47 +0100	[diff] [blame]	619	self.parse_identifiers_in_file(header_file, identifiers)
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	620
				621	return identifiers
				622
				623	def parse_symbols(self):
				624	"""
				625	Compile the Mbed TLS libraries, and parse the TLS, Crypto, and x509
				626	object files using nm to retrieve the list of referenced symbols.
				627	Exceptions thrown here are rethrown because they would be critical
				628	errors that void several tests, and thus needs to halt the program. This
				629	is explicitly done for clarity.
				630
				631	Returns a List of unique symbols defined and used in the libraries.
				632	"""
				633	self.log.info("Compiling...")
				634	symbols = []
				635
Tom Cosgrove	49f99bc	2022-12-04 16:44:21 +0000	[diff] [blame]	636	# Back up the config and atomically compile with the full configuration.
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	637	shutil.copy(
Gilles Peskine	d47f636	2021-09-27 20:12:00 +0200	[diff] [blame]	638	"include/mbedtls/config.h",
				639	"include/mbedtls/config.h.bak"
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	640	)
				641	try:
				642	# Use check=True in all subprocess calls so that failures are raised
				643	# as exceptions and logged.
				644	subprocess.run(
				645	["python3", "scripts/config.py", "full"],
				646	universal_newlines=True,
				647	check=True
				648	)
				649	my_environment = os.environ.copy()
				650	my_environment["CFLAGS"] = "-fno-asynchronous-unwind-tables"
				651	# Run make clean separately to lib to prevent unwanted behavior when
				652	# make is invoked with parallelism.
				653	subprocess.run(
				654	["make", "clean"],
				655	universal_newlines=True,
				656	check=True
				657	)
				658	subprocess.run(
				659	["make", "lib"],
				660	env=my_environment,
				661	universal_newlines=True,
				662	stdout=subprocess.PIPE,
				663	stderr=subprocess.STDOUT,
				664	check=True
				665	)
				666
				667	# Perform object file analysis using nm
				668	symbols = self.parse_symbols_from_nm([
				669	"library/libmbedcrypto.a",
				670	"library/libmbedtls.a",
				671	"library/libmbedx509.a"
				672	])
				673
				674	subprocess.run(
				675	["make", "clean"],
				676	universal_newlines=True,
				677	check=True
				678	)
				679	except subprocess.CalledProcessError as error:
				680	self.log.debug(error.output)
				681	raise error
				682	finally:
				683	# Put back the original config regardless of there being errors.
				684	# Works also for keyboard interrupts.
				685	shutil.move(
Gilles Peskine	d47f636	2021-09-27 20:12:00 +0200	[diff] [blame]	686	"include/mbedtls/config.h.bak",
				687	"include/mbedtls/config.h"
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	688	)
				689
				690	return symbols
				691
				692	def parse_symbols_from_nm(self, object_files):
				693	"""
				694	Run nm to retrieve the list of referenced symbols in each object file.
				695	Does not return the position data since it is of no use.
				696
				697	Args:
				698	* object_files: a List of compiled object filepaths to search through.
				699
				700	Returns a List of unique symbols defined and used in any of the object
				701	files.
				702	"""
				703	nm_undefined_regex = re.compile(r"^\S+: +U \|^$\|^\S+:$")
				704	nm_valid_regex = re.compile(r"^\S+( [0-9A-Fa-f]+)* . _*(?P<symbol>\w+)")
				705	exclusions = ("FStar", "Hacl")
				706
				707	symbols = []
				708
				709	# Gather all outputs of nm
				710	nm_output = ""
				711	for lib in object_files:
				712	nm_output += subprocess.run(
				713	["nm", "-og", lib],
				714	universal_newlines=True,
				715	stdout=subprocess.PIPE,
				716	stderr=subprocess.STDOUT,
				717	check=True
				718	).stdout
				719
				720	for line in nm_output.splitlines():
				721	if not nm_undefined_regex.search(line):
				722	symbol = nm_valid_regex.search(line)
				723	if (symbol and not symbol.group("symbol").startswith(exclusions)):
				724	symbols.append(symbol.group("symbol"))
				725	else:
				726	self.log.error(line)
				727
				728	return symbols
				729
				730	class NameChecker():
				731	"""
				732	Representation of the core name checking operation performed by this script.
				733	"""
				734	def __init__(self, parse_result, log):
				735	self.parse_result = parse_result
				736	self.log = log
				737
				738	def perform_checks(self, quiet=False):
				739	"""
				740	A comprehensive checker that performs each check in order, and outputs
				741	a final verdict.
				742
				743	Args:
				744	* quiet: whether to hide detailed problem explanation.
				745	"""
				746	self.log.info("=============")
				747	Problem.quiet = quiet
				748	problems = 0
				749	problems += self.check_symbols_declared_in_header()
				750
				751	pattern_checks = [
				752	("macros", MACRO_PATTERN),
				753	("enum_consts", CONSTANTS_PATTERN),
				754	("identifiers", IDENTIFIER_PATTERN)
				755	]
				756	for group, check_pattern in pattern_checks:
				757	problems += self.check_match_pattern(group, check_pattern)
				758
				759	problems += self.check_for_typos()
				760
				761	self.log.info("=============")
				762	if problems > 0:
				763	self.log.info("FAIL: {0} problem(s) to fix".format(str(problems)))
				764	if quiet:
				765	self.log.info("Remove --quiet to see explanations.")
				766	else:
				767	self.log.info("Use --quiet for minimal output.")
				768	return 1
				769	else:
				770	self.log.info("PASS")
				771	return 0
				772
				773	def check_symbols_declared_in_header(self):
				774	"""
				775	Perform a check that all detected symbols in the library object files
				776	are properly declared in headers.
				777	Assumes parse_names_in_source() was called before this.
				778
				779	Returns the number of problems that need fixing.
				780	"""
				781	problems = []
				782
				783	for symbol in self.parse_result["symbols"]:
				784	found_symbol_declared = False
				785	for identifier_match in self.parse_result["identifiers"]:
				786	if symbol == identifier_match.name:
				787	found_symbol_declared = True
				788	break
				789
				790	if not found_symbol_declared:
				791	problems.append(SymbolNotInHeader(symbol))
				792
				793	self.output_check_result("All symbols in header", problems)
				794	return len(problems)
				795
				796	def check_match_pattern(self, group_to_check, check_pattern):
				797	"""
				798	Perform a check that all items of a group conform to a regex pattern.
				799	Assumes parse_names_in_source() was called before this.
				800
				801	Args:
				802	* group_to_check: string key to index into self.parse_result.
				803	* check_pattern: the regex to check against.
				804
				805	Returns the number of problems that need fixing.
				806	"""
				807	problems = []
				808
				809	for item_match in self.parse_result[group_to_check]:
				810	if not re.search(check_pattern, item_match.name):
				811	problems.append(PatternMismatch(check_pattern, item_match))
				812	# Double underscore should not be used for names
				813	if re.search(r".__.", item_match.name):
				814	problems.append(
				815	PatternMismatch("no double underscore allowed", item_match))
				816
				817	self.output_check_result(
				818	"Naming patterns of {}".format(group_to_check),
				819	problems)
				820	return len(problems)
				821
				822	def check_for_typos(self):
				823	"""
Shaun Case	0e7791f	2021-12-20 21:14:10 -0800	[diff] [blame]	824	Perform a check that all words in the source code beginning with MBED are
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	825	either defined as macros, or as enum constants.
				826	Assumes parse_names_in_source() was called before this.
				827
				828	Returns the number of problems that need fixing.
				829	"""
				830	problems = []
				831
				832	# Set comprehension, equivalent to a list comprehension wrapped by set()
				833	all_caps_names = {
				834	match.name
				835	for match
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	836	in self.parse_result["macros"] +
				837	self.parse_result["private_macros"] +
				838	self.parse_result["enum_consts"]
				839	}
Ronald Cron	b814bda	2021-09-13 14:50:42 +0200	[diff] [blame]	840	typo_exclusion = re.compile(r"XXX\|__\|_$\|^MBEDTLS_.*CONFIG_FILE$\|"
Pengyu Lv	fda7f50	2022-11-08 16:56:51 +0800	[diff] [blame]	841	r"MBEDTLS_TEST_LIBTESTDRIVER*\|"
				842	r"PSA_CRYPTO_DRIVER_TEST")
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	843
Pengyu Lv	018b2f6	2022-11-08 15:55:00 +0800	[diff] [blame]	844	for name_match in self.parse_result["mbed_psa_words"]:
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	845	found = name_match.name in all_caps_names
				846
				847	# Since MBEDTLS_PSA_ACCEL_XXX defines are defined by the
				848	# PSA driver, they will not exist as macros. However, they
				849	# should still be checked for typos using the equivalent
				850	# BUILTINs that exist.
				851	if "MBEDTLS_PSA_ACCEL_" in name_match.name:
				852	found = name_match.name.replace(
				853	"MBEDTLS_PSA_ACCEL_",
				854	"MBEDTLS_PSA_BUILTIN_") in all_caps_names
				855
				856	if not found and not typo_exclusion.search(name_match.name):
				857	problems.append(Typo(name_match))
				858
				859	self.output_check_result("Likely typos", problems)
				860	return len(problems)
				861
				862	def output_check_result(self, name, problems):
				863	"""
				864	Write out the PASS/FAIL status of a performed check depending on whether
				865	there were problems.
				866
				867	Args:
				868	* name: the name of the test
				869	* problems: a List of encountered Problems
				870	"""
				871	if problems:
				872	self.log.info("{}: FAIL\n".format(name))
				873	for problem in problems:
				874	self.log.warning(str(problem))
				875	else:
				876	self.log.info("{}: PASS".format(name))
				877
				878	def main():
				879	"""
				880	Perform argument parsing, and create an instance of CodeParser and
				881	NameChecker to begin the core operation.
				882	"""
				883	parser = argparse.ArgumentParser(
				884	formatter_class=argparse.RawDescriptionHelpFormatter,
				885	description=(
				886	"This script confirms that the naming of all symbols and identifiers "
				887	"in Mbed TLS are consistent with the house style and are also "
				888	"self-consistent.\n\n"
				889	"Expected to be run from the MbedTLS root directory.")
				890	)
				891	parser.add_argument(
				892	"-v", "--verbose",
				893	action="store_true",
				894	help="show parse results"
				895	)
				896	parser.add_argument(
				897	"-q", "--quiet",
				898	action="store_true",
Tom Cosgrove	49f99bc	2022-12-04 16:44:21 +0000	[diff] [blame]	899	help="hide unnecessary text, explanations, and highlights"
Gilles Peskine	8266b5b	2021-09-27 19:53:31 +0200	[diff] [blame]	900	)
				901
				902	args = parser.parse_args()
				903
				904	# Configure the global logger, which is then passed to the classes below
				905	log = logging.getLogger()
				906	log.setLevel(logging.DEBUG if args.verbose else logging.INFO)
				907	log.addHandler(logging.StreamHandler())
				908
				909	try:
				910	code_parser = CodeParser(log)
				911	parse_result = code_parser.comprehensive_parse()
				912	except Exception: # pylint: disable=broad-except
				913	traceback.print_exc()
				914	sys.exit(2)
				915
				916	name_checker = NameChecker(parse_result, log)
				917	return_code = name_checker.perform_checks(quiet=args.quiet)
				918
				919	sys.exit(return_code)
				920
				921	if __name__ == "__main__":
				922	main()