forked from intel/llvm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilecheck_lint.py
251 lines (204 loc) · 8.2 KB
/
filecheck_lint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.
Consider a broken test foo.cpp:
// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension
We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.
Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""
import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple
# Maximum (inclusive) edit distance between a potential directive and a real
# one for the potential directive to be reported as a likely typo.
_distance_threshold = 3
# Directive prefixes that are always valid, even without -check-prefix flags.
_prefixes = {'CHECK'}
# Suffixes that may be appended to any valid prefix to form a directive.
_suffixes = {'-DAG', '-COUNT', '-EMPTY', '-LABEL', '-NEXT', '-NOT', '-SAME'}
# Directives interpreted by lit rather than FileCheck; they legitimately
# appear in test files and must never be flagged.
_lit_directives = {
    'RUN',
    'REQUIRES',
    'UNSUPPORTED',
    'XFAIL',
    'DEFINE',
    'REDEFINE',
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {'COM', 'RUN'}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_ignore = _lit_directives.union(_comment_prefixes).union({'NOTE', 'TODO'})
def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.
    """
    # Transforming to or from the empty string costs one operation per
    # character of the other string.
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)
    # Single-row dynamic programming: after processing i characters of s1,
    # previous_row[j] is the distance between s1[:i] and s2[:j].
    previous_row = list(range(len(s2) + 1))
    for i, char1 in enumerate(s1, start=1):
        current_row = [i]
        for j, char2 in enumerate(s2, start=1):
            substitute = previous_row[j - 1] + (char1 != char2)
            delete = previous_row[j] + 1
            insert = current_row[-1] + 1
            current_row.append(min(substitute, delete, insert))
        previous_row = current_row
    return previous_row[-1]
class FileRange:
    """Stores the coordinates of a span on a single line within a file.

    Attributes:
      line: the line number
      start_column: the (inclusive) column where the span starts
      end_column: the (inclusive) column where the span ends
    """
    line: int
    start_column: int
    end_column: int

    def __init__(self, content: str, start_byte: int, end_byte: int):  # pylint: disable=g-doc-args
        """Derives a span's coordinates based on a string and start/end bytes.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        prefix = content[:start_byte]
        # Lines are 1-based; each newline before the span bumps the count.
        self.line = 1 + prefix.count('\n')
        # rfind returns -1 when the span is on the first line, which makes
        # the 1-based column arithmetic below work out without a special case.
        last_newline = prefix.rfind('\n')
        self.start_column = start_byte - last_newline
        self.end_column = self.start_column + (end_byte - start_byte) - 1

    def __str__(self) -> str:
        return '{}:{}-{}'.format(self.line, self.start_column, self.end_column)
class Diagnostic:
    """Stores information about one typo and a suggested fix.

    Attributes:
      filepath: the path to the file in which the typo was found
      filerange: the position at which the typo was found in the file
      typo: the typo
      fix: a suggested fix
    """
    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        location = f'{self.filepath}:{self.filerange}'
        return f'{location}: {self.summary()}'

    def summary(self) -> str:
        return (f'Found potentially misspelled directive "{self.typo}".'
                f' Did you mean "{self.fix}"?')
def find_potential_directives(
        content: str,) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the
    side of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A candidate is a word-ish token followed by a colon, appearing at the
    # start of a line or after a common comment introducer (//, ;, #).
    pattern = re.compile(
        r'(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):', re.MULTILINE)
    for match in pattern.finditer(content):
        start_byte, end_byte = match.span(1)
        yield FileRange(content, start_byte, end_byte), match.group(1)
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(content: str) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'
    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # The flag argument may be single-quoted, double-quoted, or bare.
    quoted_or_bare = '|'.join((r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'))
    flag_pattern = (
        r'-check-prefix(?:es)?(?:\s+|=)({})'.format(quoted_or_bare))
    for match in re.finditer(flag_pattern, content):
        argument = match.group(1)
        # Strip one layer of surrounding quotes, if present.
        if argument.startswith('\'') or argument.startswith('"'):
            argument = argument[1:-1]
        yield from argument.split(',')
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    # Valid prefixes are the built-in ones plus any declared through
    # -check-prefix/-check-prefixes flags found in this file's content.
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    # Every prefix+suffix combination is a valid directive; the ignore-list
    # names and bare prefixes are included so they produce exact (score 0)
    # matches below and are therefore never reported.
    all_directives = ([
        f'{prefix}{suffix}'
        for prefix, suffix in itertools.product(all_prefixes, _suffixes)
    ] + list(_ignore) + list(all_prefixes))

    # Returns the (score, directive) pair with the smallest edit distance to
    # `typo`. The sentinel (threshold + 1, typo) wins when no directive is
    # close enough; the length pre-filter skips candidates whose edit
    # distance is provably above the threshold.
    def find_best_match(typo):
        return min(
            [(threshold + 1, typo)] + [(levenshtein(typo, d), d)
                                       for d in all_directives
                                       if abs(len(d) - len(typo)) <= threshold],
            key=lambda tup: tup[0],
        )

    potential_directives = find_potential_directives(content)

    for filerange, potential_directive in potential_directives:
        # TODO(bchetioui): match count directives more finely. We skip
        # directives starting with 'CHECK-COUNT-' for the moment as they
        # require more complex logic to be handled correctly.
        if any(
                potential_directive.startswith(f'{prefix}-COUNT-')
                for prefix in all_prefixes):
            continue

        # Ignoring potential typos that will not be matched later due to a too
        # low threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max(map(len, all_directives)) + threshold:
            continue

        score, best_match = find_best_match(potential_directive)
        if score == 0:  # This is an actual directive, ignore.
            continue
        elif score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)
def main(argv: Sequence[str]):
    """Lints every file named on the command line for directive typos.

    Args:
      argv: program arguments; argv[0] is the program name and each remaining
        element is a path to a test file to check.
    """
    if len(argv) < 2:
        print(f'Usage: {argv[0]} path/to/file/1 ... path/to/file/n')
        # Use sys.exit rather than the site-injected exit() builtin, which is
        # not guaranteed to exist (e.g. under `python -S`).
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info('Checking %s', filepath)
        # Read with an explicit encoding so results do not depend on the
        # platform's default locale encoding; LLVM test files are UTF-8.
        with open(filepath, 'rt', encoding='utf-8') as f:
            content = f.read()
        for diagnostic in find_directive_typos(
                content,
                pathlib.Path(filepath),
                threshold=_distance_threshold,
        ):
            print(diagnostic)


if __name__ == '__main__':
    main(sys.argv)