MAINT Cython linting (#25861)

jeremiedbb authored Apr 19, 2023
1 parent 42c2731 commit 9cb1111

Showing 53 changed files with 477 additions and 423 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -11,7 +11,7 @@ jobs:
command: |
source build_tools/shared.sh
# Include pytest compatibility with mypy
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min)
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) cython-lint
- run:
name: linting
command: ./build_tools/linting.sh
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -20,3 +20,9 @@ repos:
- id: mypy
files: sklearn/
additional_dependencies: [pytest==6.2.4]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.15.0
hooks:
# TODO: add the double-quote-cython-strings hook when its usability has improved:
# possibility to pass a directory and use it as a check instead of auto-formatter.
- id: cython-lint
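
If the hook is installed locally, it can be exercised on the whole tree with "pre-commit run cython-lint --all-files" (standard pre-commit CLI usage; the command itself is not part of this diff). Pinning rev to the v0.15.0 tag keeps local runs consistent with CI.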
2 changes: 1 addition & 1 deletion azure-pipelines.yml
@@ -35,7 +35,7 @@ jobs:
- bash: |
source build_tools/shared.sh
# Include pytest compatibility with mypy
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min)
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) cython-lint
displayName: Install linters
- bash: |
./build_tools/linting.sh
3 changes: 3 additions & 0 deletions build_tools/linting.sh
@@ -13,6 +13,9 @@ echo -e "No problem detected by flake8\n"
mypy sklearn/
echo -e "No problem detected by mypy\n"

cython-lint sklearn/
echo -e "No problem detected by cython-lint\n"

# For docstrings and warnings of deprecated attributes to be rendered
# properly, the property decorator must come before the deprecated decorator
# (else they are treated as functions)
46 changes: 46 additions & 0 deletions pyproject.toml
@@ -38,3 +38,49 @@ exclude = '''
| asv_benchmarks/env
)/
'''

[tool.cython-lint]
# Ignore the same error codes as flake8
# + E501 (line too long) because keeping it < 88 in cython
# often makes code less readable.
ignore = [
# check ignored by default in flake8. Meaning unclear.
'E24',
# space before : (needed for how black formats slicing)
'E203',
# line too long
'E501',
# do not assign a lambda expression, use a def
'E731',
# do not use variables named 'l', 'O', or 'I'
'E741',
# line break before binary operator
'W503',
# line break after binary operator
'W504',
]
# Excluded files are generated from tempita templates
exclude= '''
(
sklearn/_loss/_loss.pyx
| sklearn/linear_model/_sag_fast.pyx
| sklearn/linear_model/_sgd_fast.pyx
| sklearn/utils/_seq_dataset.pyx
| sklearn/utils/_seq_dataset.pxd
| sklearn/utils/_weight_vector.pyx
| sklearn/utils/_weight_vector.pxd
| sklearn/metrics/_dist_metrics.pyx
| sklearn/metrics/_dist_metrics.pxd
| sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd
| sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx
| sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx
| sklearn/metrics/_pairwise_distances_reduction/_base.pxd
| sklearn/metrics/_pairwise_distances_reduction/_base.pyx
| sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
| sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
| sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
| sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
| sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
| sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
)
'''
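
For context, here is a small Python sketch of the patterns the ignored codes would otherwise flag (illustrative only, not taken from the diff; cython-lint reuses pycodestyle-style error codes):

# E731: assigning a lambda expression instead of using a def.
square = lambda x: x * x

# E741: an ambiguous single-character name ('l' reads like '1').
l = [1, 2, 3]

# E203: black inserts a space before ':' in slices with complex
# bounds, which pycodestyle flags as whitespace before ':'.
middle = l[len(l) // 2 : len(l) // 2 + 1]

# W503/W504: a line break before or after a binary operator; black
# breaks before operators, so both checks are disabled.
total = (square(2)
         + len(l))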
11 changes: 1 addition & 10 deletions setup.cfg
@@ -33,18 +33,8 @@ target-version = ['py37']
ignore=
# check ignored by default in flake8. Meaning unclear.
E24,
# continuation line under-indented
E121,
# closing bracket does not match indentation
E123,
# continuation line over-indented for hanging indent
E126,
# space before : (needed for how black formats slicing)
E203,
# missing whitespace around arithmetic operator
E226,
# multiple statements on one line (def)
E704,
# do not assign a lambda expression, use a def
E731,
# do not use variables named 'l', 'O', or 'I'
@@ -82,6 +72,7 @@ allow_redefinition = True
ignore =
sklearn/_loss/_loss.pyx
sklearn/linear_model/_sag_fast.pyx
sklearn/linear_model/_sgd_fast.pyx
sklearn/utils/_seq_dataset.pyx
sklearn/utils/_seq_dataset.pxd
sklearn/utils/_weight_vector.pyx
4 changes: 1 addition & 3 deletions sklearn/_isotonic.pyx
@@ -8,7 +8,6 @@ import numpy as np
from cython cimport floating



def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w):
cdef:
Py_ssize_t n = y.shape[0], i, k
@@ -85,7 +84,6 @@ def _make_unique(const floating[::1] X,
cdef floating current_x = X[0]
cdef floating current_y = 0
cdef floating current_weight = 0
cdef floating y_old = 0
cdef int i = 0
cdef int j
cdef floating x
@@ -114,4 +112,4 @@
np.asarray(x_out[:i+1]),
np.asarray(y_out[:i+1]),
np.asarray(weights_out[:i+1]),
)
)
4 changes: 2 additions & 2 deletions sklearn/_loss/_loss.pxd
@@ -14,8 +14,8 @@ ctypedef fused G_DTYPE_C:

# Struct to return 2 doubles
ctypedef struct double_pair:
double val1
double val2
double val1
double val2


# C base class for loss functions
1 change: 1 addition & 0 deletions sklearn/cluster/_dbscan_inner.pyx
@@ -7,6 +7,7 @@ cimport numpy as cnp

cnp.import_array()


def dbscan_inner(const cnp.uint8_t[::1] is_core,
object[:] neighborhoods,
cnp.npy_intp[::1] labels):
2 changes: 1 addition & 1 deletion sklearn/cluster/_k_means_common.pxd
@@ -12,7 +12,7 @@ cdef floating _euclidean_sparse_dense(
const floating[::1],
const int[::1],
const floating[::1],
floating,
floating,
bint
) noexcept nogil

16 changes: 10 additions & 6 deletions sklearn/cluster/_k_means_common.pyx
@@ -35,11 +35,14 @@ cdef floating _euclidean_dense_dense(

# We manually unroll the loop for better cache optimization.
for i in range(n):
result += ((a[0] - b[0]) * (a[0] - b[0])
+(a[1] - b[1]) * (a[1] - b[1])
+(a[2] - b[2]) * (a[2] - b[2])
+(a[3] - b[3]) * (a[3] - b[3]))
a += 4; b += 4
result += (
(a[0] - b[0]) * (a[0] - b[0]) +
(a[1] - b[1]) * (a[1] - b[1]) +
(a[2] - b[2]) * (a[2] - b[2]) +
(a[3] - b[3]) * (a[3] - b[3])
)
a += 4
b += 4

for i in range(rem):
result += (a[i] - b[i]) * (a[i] - b[i])
@@ -77,7 +80,8 @@

result += b_squared_norm

if result < 0: result = 0.0
if result < 0:
result = 0.0

return result if squared else sqrt(result)

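
As an aside, the reformatted loop above implements 4-way manual unrolling with a scalar remainder loop. A pure-Python sketch of the same access pattern (the standalone function and its name are illustrative, not part of the diff):

def squared_euclidean_unrolled(a, b):
    # Process four features per iteration for better cache behaviour,
    # mirroring the Cython loop above, then mop up the remainder.
    n, rem = divmod(len(a), 4)
    result = 0.0
    k = 0
    for _ in range(n):
        result += (
            (a[k] - b[k]) * (a[k] - b[k]) +
            (a[k + 1] - b[k + 1]) * (a[k + 1] - b[k + 1]) +
            (a[k + 2] - b[k + 2]) * (a[k + 2] - b[k + 2]) +
            (a[k + 3] - b[k + 3]) * (a[k + 3] - b[k + 3])
        )
        k += 4
    for j in range(rem):
        result += (a[k + j] - b[k + j]) * (a[k + j] - b[k + j])
    return result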
29 changes: 18 additions & 11 deletions sklearn/cluster/_k_means_elkan.pyx
@@ -154,7 +154,6 @@ def init_bounds_sparse(
cdef:
int n_samples = X.shape[0]
int n_clusters = centers.shape[0]
int n_features = X.shape[1]

floating[::1] X_data = X.data
int[::1] X_indices = X.indices
@@ -269,7 +268,7 @@ def elkan_iter_chunked_dense(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int i, j, k
@@ -386,9 +385,11 @@ cdef void _update_chunk_dense(
# If this holds, then center_index is a good candidate for the
# sample to be relabelled, and we need to confirm this by
# recomputing the upper and lower bounds.
if (j != label
if (
j != label
and (upper_bound > lower_bounds[i, j])
and (upper_bound > center_half_distances[label, j])):
and (upper_bound > center_half_distances[label, j])
):

# Recompute upper bound by calculating the actual distance
# between the sample and its current assigned center.
@@ -401,8 +402,10 @@
# If the condition still holds, then compute the actual
# distance between the sample and center. If this is less
# than the previous distance, reassign label.
if (upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])):
if (
upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])
):

distance = _euclidean_dense_dense(
&X[i, 0], &centers_old[j, 0], n_features, False)
@@ -504,7 +507,7 @@ def elkan_iter_chunked_sparse(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int i, j, k
@@ -631,9 +634,11 @@ cdef void _update_chunk_sparse(
# If this holds, then center_index is a good candidate for the
# sample to be relabelled, and we need to confirm this by
# recomputing the upper and lower bounds.
if (j != label
if (
j != label
and (upper_bound > lower_bounds[i, j])
and (upper_bound > center_half_distances[label, j])):
and (upper_bound > center_half_distances[label, j])
):

# Recompute upper bound by calculating the actual distance
# between the sample and its current assigned center.
@@ -648,8 +653,10 @@
# If the condition still holds, then compute the actual
# distance between the sample and center. If this is less
# than the previous distance, reassign label.
if (upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])):
if (
upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])
):
distance = _euclidean_sparse_dense(
X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
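
The multi-line conditions reformatted above implement Elkan's triangle-inequality screening. A schematic Python rendering of the test (array names follow the diff; wrapping it in a standalone function is my own framing):

def center_needs_check(j, label, i, upper_bound,
                       lower_bounds, center_half_distances):
    # Center j can only be closer than the currently assigned center if
    # the upper bound on d(x_i, center_label) exceeds both the lower
    # bound on d(x_i, center_j) and half of d(center_label, center_j).
    return (
        j != label
        and upper_bound > lower_bounds[i, j]
        and upper_bound > center_half_distances[label, j]
    )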
9 changes: 5 additions & 4 deletions sklearn/cluster/_k_means_lloyd.pyx
@@ -87,7 +87,7 @@ def lloyd_iter_chunked_dense(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int j, k
@@ -153,8 +153,9 @@

if update_centers:
omp_destroy_lock(&lock)
_relocate_empty_clusters_dense(X, sample_weight, centers_old,
centers_new, weight_in_clusters, labels)
_relocate_empty_clusters_dense(
X, sample_weight, centers_old, centers_new, weight_in_clusters, labels
)

_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
@@ -278,7 +279,7 @@ def lloyd_iter_chunked_sparse(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff = 0
int chunk_idx
int start = 0, end = 0

int j, k
17 changes: 8 additions & 9 deletions sklearn/datasets/_svmlight_format_fast.pyx
@@ -113,6 +113,7 @@ def _load_svmlight_file(f, dtype, bint multilabel, bint zero_based,

return (dtype, data, indices, indptr, labels, query)


# Two fused types are defined to be able to
# use all possible combinations of parameters.
ctypedef fused int_or_float:
@@ -128,8 +129,9 @@ ctypedef fused int_or_longlong:
cython.integral
signed long long


def get_dense_row_string(
int_or_float[:,:] X,
int_or_float[:, :] X,
Py_ssize_t[:] x_inds,
double_or_longlong[:] x_vals,
Py_ssize_t row,
@@ -143,7 +145,7 @@
int_or_float val

for k in range(row_length):
val = X[row,k]
val = X[row, k]
if val == 0:
continue
x_inds[x_nz_used] = k
@@ -157,6 +159,7 @@

return " ".join(reprs)


def get_sparse_row_string(
int_or_float[:] X_data,
int[:] X_indptr,
@@ -176,6 +179,7 @@

return " ".join(reprs)


def _dump_svmlight_file(
X,
y,
@@ -211,8 +215,6 @@ def _dump_svmlight_file(
Py_ssize_t j
Py_ssize_t col_start
Py_ssize_t col_end
bint first
Py_ssize_t x_nz_used
Py_ssize_t[:] x_inds = np.empty(row_length, dtype=np.intp)
signed long long[:] x_vals_int
double[:] x_vals_float
@@ -224,8 +226,6 @@
x_vals_float = np.zeros(row_length, dtype=np.float64)

for i in range(x_len):
x_nz_used = 0

if not X_is_sp:
if X_is_integral:
s = get_dense_row_string(X, x_inds, x_vals_int, i, value_pattern, one_based)
@@ -234,18 +234,17 @@
else:
s = get_sparse_row_string(X.data, X.indptr, X.indices, i, value_pattern, one_based)
if multilabel:
first = True
if y_is_sp:
col_start = y.indptr[i]
col_end = y.indptr[i+1]
labels_str = ','.join(tuple(label_pattern % y.indices[j] for j in range(col_start, col_end) if y.data[j] != 0))
else:
labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i,j] != 0)
labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i, j] != 0)
else:
if y_is_sp:
labels_str = label_pattern % y.data[i]
else:
labels_str = label_pattern % y[i,0]
labels_str = label_pattern % y[i, 0]

if query_id_is_not_empty:
feat = (labels_str, query_id[i], s)
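
For reference, the strings assembled here follow the svmlight/libsvm text format. A hand-written illustration of the lines _dump_svmlight_file emits (feature indices and values are invented):

# "<label>[,<label>...] [qid:<int>] <index>:<value> ..." per sample;
# multilabel targets are comma-joined, matching the ','.join calls above.
example_lines = [
    "1 qid:3 1:0.43 3:0.12 9:1.0",
    "0,2 5:1.0 12:-0.5",
]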
2 changes: 1 addition & 1 deletion sklearn/decomposition/_cdnmf_fast.pyx
@@ -34,5 +34,5 @@ def _update_cdnmf_fast(floating[:, ::1] W, floating[:, :] HHt,

if hess != 0:
W[i, t] = max(W[i, t] - grad / hess, 0.)

return violation
[Diff truncated: the remaining changed files are not shown here.]