MAINT Automatic Cython linting #25861

Merged: 24 commits, Apr 19, 2023

Commits
15ba44c  wip (jeremiedbb, Mar 13, 2023)
95266a6  done all but templates (jeremiedbb, Mar 13, 2023)
9becad8  done this time (jeremiedbb, Mar 13, 2023)
273d513  last ones (jeremiedbb, Mar 14, 2023)
d002019  remove outdated ignored flake8 errors (jeremiedbb, Mar 15, 2023)
ac87cba  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Mar 15, 2023)
7293cd4  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Mar 15, 2023)
ba3979f  cython-lint in CI (jeremiedbb, Mar 15, 2023)
920df1b  args in pre-commit (jeremiedbb, Mar 15, 2023)
c59f028  fix circle ci (jeremiedbb, Mar 15, 2023)
b03e25e  Merge branch 'main' into cython-lint (jeremiedbb, Mar 16, 2023)
da90347  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Mar 21, 2023)
182b965  address review comments (jeremiedbb, Mar 21, 2023)
d14ed54  config pyproject.toml (jeremiedbb, Apr 3, 2023)
213b991  update (jeremiedbb, Apr 3, 2023)
2afba48  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Apr 3, 2023)
2f88426  exclude newly templated sgd_fast (jeremiedbb, Apr 3, 2023)
e25e24d  comment (jeremiedbb, Apr 3, 2023)
82e4ea3  Merge branch 'main' into cython-lint (jeremiedbb, Apr 3, 2023)
114960f  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Apr 7, 2023)
7a63d58  lint (jeremiedbb, Apr 7, 2023)
6f5e4cc  Merge branch 'cython-lint' of github.com:jeremiedbb/scikit-learn into… (jeremiedbb, Apr 7, 2023)
cb7d18e  Merge remote-tracking branch 'upstream/main' into pr/jeremiedbb/25861 (jeremiedbb, Apr 14, 2023)
b2fe3f7  Merge branch 'main' into cython-lint (jeremiedbb, Apr 19, 2023)
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -11,7 +11,7 @@ jobs:
command: |
source build_tools/shared.sh
# Include pytest compatibility with mypy
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min)
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) cython-lint

Member:
Ideally those dependencies should all be locked and maintained via our script in build_tools but we can do that later.


Member Author (jeremiedbb):
Yes, but this is not run within the same environment.
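
To illustrate the kind of single source of truth the comment above is pointing at: a minimal sketch in which the linter pins live in one mapping that every CI linting job could read. The file name, the helper, and all version numbers except cython-lint's (which matches the pre-commit rev added below) are hypothetical and are not part of this PR or of the existing build_tools scripts.

# build_tools/linter_requirements.py -- hypothetical, for illustration only.
# A single mapping that every CI linting job could read, so the pins are not
# duplicated across .circleci/config.yml and azure-pipelines.yml.
LINTER_PINS = {
    "flake8": "5.0.4",        # illustrative version
    "black": "23.1.0",        # illustrative version
    "mypy": "1.1.1",          # illustrative version
    "cython-lint": "0.15.0",  # same version as the pre-commit rev in this PR
}


def as_pip_args():
    """Return the pins as one 'pkg==version ...' string for pip install."""
    return " ".join(f"{name}=={version}" for name, version in LINTER_PINS.items())


if __name__ == "__main__":
    # Hypothetical CI usage: pip install $(python build_tools/linter_requirements.py)
    print(as_pip_args())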

- run:
name: linting
command: ./build_tools/linting.sh
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -20,3 +20,9 @@ repos:
- id: mypy
files: sklearn/
additional_dependencies: [pytest==6.2.4]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.15.0
hooks:
# TODO: add the double-quote-cython-strings hook when its usability has improved:
# possibility to pass a directory and use it as a check instead of auto-formatter.
- id: cython-lint
ogrisel marked this conversation as resolved.
2 changes: 1 addition & 1 deletion azure-pipelines.yml
@@ -35,7 +35,7 @@ jobs:
- bash: |
source build_tools/shared.sh
# Include pytest compatibility with mypy
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min)
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) cython-lint
displayName: Install linters
- bash: |
./build_tools/linting.sh
3 changes: 3 additions & 0 deletions build_tools/linting.sh
@@ -13,6 +13,9 @@ echo -e "No problem detected by flake8\n"
mypy sklearn/
echo -e "No problem detected by mypy\n"

cython-lint sklearn/
echo -e "No problem detected by cython-lint\n"

# For docstrings and warnings of deprecated attributes to be rendered
# properly, the property decorator must come before the deprecated decorator
# (else they are treated as functions)
46 changes: 46 additions & 0 deletions pyproject.toml
@@ -38,3 +38,49 @@ exclude = '''
| asv_benchmarks/env
)/
'''

[tool.cython-lint]
# Ignore the same error codes as flake8
# + E501 (line too long) because keeping it < 88 in cython
# often makes code less readable.
ignore = [
# check ignored by default in flake8. Meaning unclear.
'E24',
# space before : (needed for how black formats slicing)
'E203',
# line too long
'E501',
# do not assign a lambda expression, use a def
'E731',
# do not use variables named 'l', 'O', or 'I'
'E741',
# line break before binary operator
'W503',
# line break after binary operator
'W504',
]
# Excluded files are generated from tempita templates
exclude= '''
(
sklearn/_loss/_loss.pyx
| sklearn/linear_model/_sag_fast.pyx
| sklearn/linear_model/_sgd_fast.pyx
| sklearn/utils/_seq_dataset.pyx
| sklearn/utils/_seq_dataset.pxd
| sklearn/utils/_weight_vector.pyx
| sklearn/utils/_weight_vector.pxd
| sklearn/metrics/_dist_metrics.pyx
| sklearn/metrics/_dist_metrics.pxd
| sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd
| sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx
| sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx
| sklearn/metrics/_pairwise_distances_reduction/_base.pxd
| sklearn/metrics/_pairwise_distances_reduction/_base.pyx
| sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
| sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
| sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
| sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
| sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
| sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
)
'''

Member:
What about using git ls-files as suggested in #25861 (comment) instead of maintaining a materialized exclusion list?

I am fine with both approaches, but maybe the git ls-files trick would be lower maintenance?


Member Author (jeremiedbb):
git ls-files is fine in the CI, but I wanted contributors to be able to just run cython-lint . like they run black or flake8.


Member:
We now have three lists to maintain (this one, the one in setup.py, and the one in setup.cfg). Is there a way to have fewer lists to maintain?


Contributor:
pre-commit would solve this for you, as it automatically handles parallelisation + only running on tracked files ;) pandas uses it for everything linting/formatting-related, can recommend

otherwise, I guess you could make a local script run_cython_lint.py which runs something like

import subprocess

files = subprocess.run(['git', 'ls-files'], text=True, capture_output=True).stdout.strip().splitlines()
subprocess.run(['cython-lint', *files])

and then both contributors and CI run python scripts/run_cython_lint.py?

Or write another script to keep the three lists in sync and have it run in CI? This might be the smallest-effort, least disruptive solution?
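
A minimal sketch of that sync-check idea, under two assumptions: every file in the exclusion lists is generated from a Tempita template tracked as the same path plus a .tp suffix, and the exclude value under [tool.cython-lint] is a verbose regular expression as in the pyproject.toml diff above. The script name and the idea of wiring it into CI are hypothetical, not something this PR adds.

# check_cython_lint_excludes.py -- illustrative sketch, not part of this PR.
import re
import subprocess
import sys
import tomllib  # Python >= 3.11; older interpreters need the third-party tomli package

# Tracked Tempita templates, e.g. sklearn/_loss/_loss.pyx.tp
templates = subprocess.run(
    ["git", "ls-files", "*.pyx.tp", "*.pxd.tp"],
    text=True, capture_output=True, check=True,
).stdout.splitlines()
# Assumption: each generated file is the template path without its ".tp" suffix.
generated = [path[: -len(".tp")] for path in templates]

with open("pyproject.toml", "rb") as f:
    config = tomllib.load(f)
exclude = re.compile(config["tool"]["cython-lint"]["exclude"], re.VERBOSE)

missing = [path for path in generated if not exclude.search(path)]
if missing:
    print("Generated Cython files missing from the [tool.cython-lint] exclude list:")
    print("\n".join(missing))
    sys.exit(1)
print("cython-lint exclude list covers all Tempita-generated files.")

Run as an extra CI step, a check like this would fail whenever a newly templated file (like _sgd_fast in this PR) is forgotten in the exclude list.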


Member Author (jeremiedbb):
> pre-commit would solve this for you

I don't want to force our contributors to use pre-commit. I don't use it for instance :)

> then both contributors and CI run python scripts/run_cython_lint.py?

I like the idea of contributors being able to invoke the cython linter the same way they invoke the other linters. We have a list of excluded directories/files for black in pyproject.toml. I think it's acceptable to have a list for cython-lint as well.
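
For readers wondering how that black-style exclude block behaves, here is a tiny demonstration, assuming cython-lint interprets the multi-line value the same way black does, as a verbose regular expression matched against each file path; the two entries are copied from the exclude list in this diff.

import re

# Two entries copied from the [tool.cython-lint] exclude block above.
exclude = re.compile(
    r"""
    (
        sklearn/_loss/_loss.pyx
      | sklearn/linear_model/_sgd_fast.pyx
    )
    """,
    re.VERBOSE,
)

print(bool(exclude.search("sklearn/_loss/_loss.pyx")))              # True: generated file, skipped
print(bool(exclude.search("sklearn/cluster/_k_means_common.pyx")))  # False: hand-written, linted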

11 changes: 1 addition & 10 deletions setup.cfg
@@ -33,18 +33,8 @@ target-version = ['py37']
ignore=
# check ignored by default in flake8. Meaning unclear.
E24,
# continuation line under-indented
E121,
# closing bracket does not match indentation
E123,
# continuation line over-indented for hanging indent
E126,
# space before : (needed for how black formats slicing)
E203,
# missing whitespace around arithmetic operator
E226,
# multiple statements on one line (def)
E704,
Comment on lines -36 to -47

Member Author (jeremiedbb):
All these error codes seem outdated since we are black-compliant. I checked removing them and we indeed don't violate any of them. I think it's a good opportunity to do some clean-up.

# do not assign a lambda expression, use a def
E731,
# do not use variables named 'l', 'O', or 'I'
@@ -82,6 +72,7 @@ allow_redefinition = True
ignore =
sklearn/_loss/_loss.pyx
sklearn/linear_model/_sag_fast.pyx
sklearn/linear_model/_sgd_fast.pyx
sklearn/utils/_seq_dataset.pyx
sklearn/utils/_seq_dataset.pxd
sklearn/utils/_weight_vector.pyx
4 changes: 1 addition & 3 deletions sklearn/_isotonic.pyx
@@ -8,7 +8,6 @@ import numpy as np
from cython cimport floating



def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w):
cdef:
Py_ssize_t n = y.shape[0], i, k
@@ -85,7 +84,6 @@ def _make_unique(const floating[::1] X,
cdef floating current_x = X[0]
cdef floating current_y = 0
cdef floating current_weight = 0
cdef floating y_old = 0
cdef int i = 0
cdef int j
cdef floating x
@@ -114,4 +112,4 @@ def _make_unique(const floating[::1] X,
np.asarray(x_out[:i+1]),
np.asarray(y_out[:i+1]),
np.asarray(weights_out[:i+1]),
)
)
4 changes: 2 additions & 2 deletions sklearn/_loss/_loss.pxd
@@ -14,8 +14,8 @@ ctypedef fused G_DTYPE_C:

# Struct to return 2 doubles
ctypedef struct double_pair:
double val1
double val2
double val1
double val2


# C base class for loss functions
1 change: 1 addition & 0 deletions sklearn/cluster/_dbscan_inner.pyx
@@ -7,6 +7,7 @@ cimport numpy as cnp

cnp.import_array()


def dbscan_inner(const cnp.uint8_t[::1] is_core,
object[:] neighborhoods,
cnp.npy_intp[::1] labels):
2 changes: 1 addition & 1 deletion sklearn/cluster/_k_means_common.pxd
@@ -12,7 +12,7 @@ cdef floating _euclidean_sparse_dense(
const floating[::1],
const int[::1],
const floating[::1],
floating,
floating,
bint
) noexcept nogil

16 changes: 10 additions & 6 deletions sklearn/cluster/_k_means_common.pyx
@@ -35,11 +35,14 @@ cdef floating _euclidean_dense_dense(

# We manually unroll the loop for better cache optimization.
for i in range(n):
result += ((a[0] - b[0]) * (a[0] - b[0])
+(a[1] - b[1]) * (a[1] - b[1])
+(a[2] - b[2]) * (a[2] - b[2])
+(a[3] - b[3]) * (a[3] - b[3]))
a += 4; b += 4
result += (
(a[0] - b[0]) * (a[0] - b[0]) +
(a[1] - b[1]) * (a[1] - b[1]) +
(a[2] - b[2]) * (a[2] - b[2]) +
(a[3] - b[3]) * (a[3] - b[3])
)
a += 4
b += 4

for i in range(rem):
result += (a[i] - b[i]) * (a[i] - b[i])
@@ -77,7 +80,8 @@ cdef floating _euclidean_sparse_dense(

result += b_squared_norm

if result < 0: result = 0.0
if result < 0:
result = 0.0

return result if squared else sqrt(result)

29 changes: 18 additions & 11 deletions sklearn/cluster/_k_means_elkan.pyx
@@ -154,7 +154,6 @@ def init_bounds_sparse(
cdef:
int n_samples = X.shape[0]
int n_clusters = centers.shape[0]
int n_features = X.shape[1]

floating[::1] X_data = X.data
int[::1] X_indices = X.indices
@@ -269,7 +268,7 @@ def elkan_iter_chunked_dense(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int i, j, k
@@ -386,9 +385,11 @@ cdef void _update_chunk_dense(
# If this holds, then center_index is a good candidate for the
# sample to be relabelled, and we need to confirm this by
# recomputing the upper and lower bounds.
if (j != label
if (
j != label
and (upper_bound > lower_bounds[i, j])
and (upper_bound > center_half_distances[label, j])):
and (upper_bound > center_half_distances[label, j])
):

# Recompute upper bound by calculating the actual distance
# between the sample and its current assigned center.
@@ -401,8 +402,10 @@
# If the condition still holds, then compute the actual
# distance between the sample and center. If this is less
# than the previous distance, reassign label.
if (upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])):
if (
upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])
):

distance = _euclidean_dense_dense(
&X[i, 0], &centers_old[j, 0], n_features, False)
@@ -504,7 +507,7 @@ def elkan_iter_chunked_sparse(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int i, j, k
@@ -631,9 +634,11 @@ cdef void _update_chunk_sparse(
# If this holds, then center_index is a good candidate for the
# sample to be relabelled, and we need to confirm this by
# recomputing the upper and lower bounds.
if (j != label
if (
j != label
and (upper_bound > lower_bounds[i, j])
and (upper_bound > center_half_distances[label, j])):
and (upper_bound > center_half_distances[label, j])
):

# Recompute upper bound by calculating the actual distance
# between the sample and its current assigned center.
@@ -648,8 +653,10 @@
# If the condition still holds, then compute the actual
# distance between the sample and center. If this is less
# than the previous distance, reassign label.
if (upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])):
if (
upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])
):
distance = _euclidean_sparse_dense(
X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
9 changes: 5 additions & 4 deletions sklearn/cluster/_k_means_lloyd.pyx
@@ -87,7 +87,7 @@ def lloyd_iter_chunked_dense(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int j, k
@@ -153,8 +153,9 @@ def lloyd_iter_chunked_dense(

if update_centers:
omp_destroy_lock(&lock)
_relocate_empty_clusters_dense(X, sample_weight, centers_old,
centers_new, weight_in_clusters, labels)
_relocate_empty_clusters_dense(
X, sample_weight, centers_old, centers_new, weight_in_clusters, labels
)

_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
@@ -278,7 +279,7 @@ def lloyd_iter_chunked_sparse(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff = 0
int chunk_idx
int start = 0, end = 0

int j, k
17 changes: 8 additions & 9 deletions sklearn/datasets/_svmlight_format_fast.pyx
@@ -113,6 +113,7 @@ def _load_svmlight_file(f, dtype, bint multilabel, bint zero_based,

return (dtype, data, indices, indptr, labels, query)


# Two fused types are defined to be able to
# use all possible combinations of parameters.
ctypedef fused int_or_float:
@@ -128,8 +129,9 @@ ctypedef fused int_or_longlong:
cython.integral
signed long long


def get_dense_row_string(
int_or_float[:,:] X,
int_or_float[:, :] X,
Py_ssize_t[:] x_inds,
double_or_longlong[:] x_vals,
Py_ssize_t row,
@@ -143,7 +145,7 @@
int_or_float val

for k in range(row_length):
val = X[row,k]
val = X[row, k]
if val == 0:
continue
x_inds[x_nz_used] = k
@@ -157,6 +159,7 @@

return " ".join(reprs)


def get_sparse_row_string(
int_or_float[:] X_data,
int[:] X_indptr,
@@ -176,6 +179,7 @@

return " ".join(reprs)


def _dump_svmlight_file(
X,
y,
@@ -211,8 +215,6 @@ def _dump_svmlight_file(
Py_ssize_t j
Py_ssize_t col_start
Py_ssize_t col_end
bint first
Py_ssize_t x_nz_used
Py_ssize_t[:] x_inds = np.empty(row_length, dtype=np.intp)
signed long long[:] x_vals_int
double[:] x_vals_float
@@ -224,8 +226,6 @@
x_vals_float = np.zeros(row_length, dtype=np.float64)

for i in range(x_len):
x_nz_used = 0

if not X_is_sp:
if X_is_integral:
s = get_dense_row_string(X, x_inds, x_vals_int, i, value_pattern, one_based)
@@ -234,18 +234,17 @@
else:
s = get_sparse_row_string(X.data, X.indptr, X.indices, i, value_pattern, one_based)
if multilabel:
first = True
if y_is_sp:
col_start = y.indptr[i]
col_end = y.indptr[i+1]
labels_str = ','.join(tuple(label_pattern % y.indices[j] for j in range(col_start, col_end) if y.data[j] != 0))
else:
labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i,j] != 0)
labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i, j] != 0)
else:
if y_is_sp:
labels_str = label_pattern % y.data[i]
else:
labels_str = label_pattern % y[i,0]
labels_str = label_pattern % y[i, 0]

if query_id_is_not_empty:
feat = (labels_str, query_id[i], s)
2 changes: 1 addition & 1 deletion sklearn/decomposition/_cdnmf_fast.pyx
@@ -34,5 +34,5 @@ def _update_cdnmf_fast(floating[:, ::1] W, floating[:, :] HHt,

if hess != 0:
W[i, t] = max(W[i, t] - grad / hess, 0.)

return violation