MAINT Automatic Cython linting #25861

Merged: 24 commits, Apr 19, 2023

Commits
15ba44c  wip (jeremiedbb, Mar 13, 2023)
95266a6  done all but templates (jeremiedbb, Mar 13, 2023)
9becad8  done this time (jeremiedbb, Mar 13, 2023)
273d513  last ones (jeremiedbb, Mar 14, 2023)
d002019  remove outdated ignored flake8 errors (jeremiedbb, Mar 15, 2023)
ac87cba  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Mar 15, 2023)
7293cd4  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Mar 15, 2023)
ba3979f  cython-lint in CI (jeremiedbb, Mar 15, 2023)
920df1b  args in pre-commit (jeremiedbb, Mar 15, 2023)
c59f028  fix circle ci (jeremiedbb, Mar 15, 2023)
b03e25e  Merge branch 'main' into cython-lint (jeremiedbb, Mar 16, 2023)
da90347  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Mar 21, 2023)
182b965  address review comments (jeremiedbb, Mar 21, 2023)
d14ed54  config pyproject.toml (jeremiedbb, Apr 3, 2023)
213b991  update (jeremiedbb, Apr 3, 2023)
2afba48  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Apr 3, 2023)
2f88426  exclude newly templated sgd_fast (jeremiedbb, Apr 3, 2023)
e25e24d  comment (jeremiedbb, Apr 3, 2023)
82e4ea3  Merge branch 'main' into cython-lint (jeremiedbb, Apr 3, 2023)
114960f  Merge remote-tracking branch 'upstream/main' into cython-lint (jeremiedbb, Apr 7, 2023)
7a63d58  lint (jeremiedbb, Apr 7, 2023)
6f5e4cc  Merge branch 'cython-lint' of github.com:jeremiedbb/scikit-learn into… (jeremiedbb, Apr 7, 2023)
cb7d18e  Merge remote-tracking branch 'upstream/main' into pr/jeremiedbb/25861 (jeremiedbb, Apr 14, 2023)
b2fe3f7  Merge branch 'main' into cython-lint (jeremiedbb, Apr 19, 2023)
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -11,7 +11,7 @@ jobs:
command: |
source build_tools/shared.sh
# Include pytest compatibility with mypy
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min)
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) cython-lint

Member:
Ideally those dependencies should all be locked and maintained via our script in build_tools but we can do that later.


Member Author (jeremiedbb):
Yes, but this is not run within the same environment.
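
To illustrate the kind of single source of truth the comment above is pointing at: a minimal sketch in which the linter pins live in one mapping that every CI linting job could read. The file name, the helper, and all version numbers except cython-lint's (which matches the pre-commit rev added below) are hypothetical and are not part of this PR or of the existing build_tools scripts.

# build_tools/linter_requirements.py -- hypothetical, for illustration only.
# A single mapping that every CI linting job could read, so the pins are not
# duplicated across .circleci/config.yml and azure-pipelines.yml.
LINTER_PINS = {
    "flake8": "5.0.4",        # illustrative version
    "black": "23.1.0",        # illustrative version
    "mypy": "1.1.1",          # illustrative version
    "cython-lint": "0.15.0",  # same version as the pre-commit rev in this PR
}


def as_pip_args():
    """Return the pins as one 'pkg==version ...' string for pip install."""
    return " ".join(f"{name}=={version}" for name, version in LINTER_PINS.items())


if __name__ == "__main__":
    # Hypothetical CI usage: pip install $(python build_tools/linter_requirements.py)
    print(as_pip_args())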

- run:
name: linting
command: ./build_tools/linting.sh
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -20,3 +20,9 @@ repos:
- id: mypy
files: sklearn/
additional_dependencies: [pytest==6.2.4]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.15.0
hooks:
# TODO: add the double-quote-cython-strings hook when its usability has improved:
# possibility to pass a directory and use it as a check instead of auto-formatter.
- id: cython-lint
ogrisel marked this conversation as resolved.
2 changes: 1 addition & 1 deletion azure-pipelines.yml
@@ -35,7 +35,7 @@ jobs:
- bash: |
source build_tools/shared.sh
# Include pytest compatibility with mypy
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min)
pip install pytest flake8 $(get_dep mypy min) $(get_dep black min) cython-lint
displayName: Install linters
- bash: |
./build_tools/linting.sh
3 changes: 3 additions & 0 deletions build_tools/linting.sh
@@ -13,6 +13,9 @@ echo -e "No problem detected by flake8\n"
mypy sklearn/
echo -e "No problem detected by mypy\n"

cython-lint sklearn/
echo -e "No problem detected by cython-lint\n"

# For docstrings and warnings of deprecated attributes to be rendered
# properly, the property decorator must come before the deprecated decorator
# (else they are treated as functions)
46 changes: 46 additions & 0 deletions pyproject.toml
@@ -38,3 +38,49 @@ exclude = '''
| asv_benchmarks/env
)/
'''

[tool.cython-lint]
# Ignore the same error codes as flake8
# + E501 (line too long) because keeping it < 88 in cython
# often makes code less readable.
ignore = [
# check ignored by default in flake8. Meaning unclear.
'E24',
# space before : (needed for how black formats slicing)
'E203',
# line too long
'E501',
# do not assign a lambda expression, use a def
'E731',
# do not use variables named 'l', 'O', or 'I'
'E741',
# line break before binary operator
'W503',
# line break after binary operator
'W504',
]
# Excluded files are generated from tempita templates
exclude= '''
(
sklearn/_loss/_loss.pyx
| sklearn/linear_model/_sag_fast.pyx
| sklearn/linear_model/_sgd_fast.pyx
| sklearn/utils/_seq_dataset.pyx
| sklearn/utils/_seq_dataset.pxd
| sklearn/utils/_weight_vector.pyx
| sklearn/utils/_weight_vector.pxd
| sklearn/metrics/_dist_metrics.pyx
| sklearn/metrics/_dist_metrics.pxd
| sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd
| sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx
| sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx
| sklearn/metrics/_pairwise_distances_reduction/_base.pxd
| sklearn/metrics/_pairwise_distances_reduction/_base.pyx
| sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd
| sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx
| sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
| sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
| sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
| sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
)
'''

Member:
What about using git ls-files as suggested in #25861 (comment) instead of maintaining a materialized exclusion list?

I am fine with both approaches, but maybe the git ls-files trick would be lower maintenance?


Member Author (jeremiedbb):
git ls-files is fine in the CI, but I wanted contributors to be able to just run cython-lint . like they run black or flake8.


Member:
We now have three lists to maintain (this one, the one in setup.py, and the one in setup.cfg). Is there a way to have fewer lists to maintain?


Contributor:
pre-commit would solve this for you, as it automatically handles parallelisation + only running on tracked files ;) pandas uses it for everything linting/formatting-related, can recommend

otherwise, I guess you could make a local script run_cython_lint.py which runs something like

import subprocess

files = subprocess.run(['git', 'ls-files'], text=True, capture_output=True).stdout.strip().splitlines()
subprocess.run(['cython-lint', *files])

and then both contributors and CI run python scripts/run_cython_lint.py?

Or write another script to keep the three lists in sync and have it run in CI? This might be the smallest-effort, least disruptive solution?
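
A minimal sketch of that sync-check idea, under two assumptions: every file in the exclusion lists is generated from a Tempita template tracked as the same path plus a .tp suffix, and the exclude value under [tool.cython-lint] is a verbose regular expression as in the pyproject.toml diff above. The script name and the idea of wiring it into CI are hypothetical, not something this PR adds.

# check_cython_lint_excludes.py -- illustrative sketch, not part of this PR.
import re
import subprocess
import sys
import tomllib  # Python >= 3.11; older interpreters need the third-party tomli package

# Tracked Tempita templates, e.g. sklearn/_loss/_loss.pyx.tp
templates = subprocess.run(
    ["git", "ls-files", "*.pyx.tp", "*.pxd.tp"],
    text=True, capture_output=True, check=True,
).stdout.splitlines()
# Assumption: each generated file is the template path without its ".tp" suffix.
generated = [path[: -len(".tp")] for path in templates]

with open("pyproject.toml", "rb") as f:
    config = tomllib.load(f)
exclude = re.compile(config["tool"]["cython-lint"]["exclude"], re.VERBOSE)

missing = [path for path in generated if not exclude.search(path)]
if missing:
    print("Generated Cython files missing from the [tool.cython-lint] exclude list:")
    print("\n".join(missing))
    sys.exit(1)
print("cython-lint exclude list covers all Tempita-generated files.")

Run as an extra CI step, a check like this would fail whenever a newly templated file (like _sgd_fast in this PR) is forgotten in the exclude list.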


Member Author (jeremiedbb):
> pre-commit would solve this for you

I don't want to force our contributors to use pre-commit. I don't use it for instance :)

> then both contributors and CI run python scripts/run_cython_lint.py?

I like the idea of contributors being able to invoke the cython linter the same way they invoke the other linters. We have a list of excluded directories/files for black in pyproject.toml. I think it's acceptable to have a list for cython-lint as well.
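
For readers wondering how that black-style exclude block behaves, here is a tiny demonstration, assuming cython-lint interprets the multi-line value the same way black does, as a verbose regular expression matched against each file path; the two entries are copied from the exclude list in this diff.

import re

# Two entries copied from the [tool.cython-lint] exclude block above.
exclude = re.compile(
    r"""
    (
        sklearn/_loss/_loss.pyx
      | sklearn/linear_model/_sgd_fast.pyx
    )
    """,
    re.VERBOSE,
)

print(bool(exclude.search("sklearn/_loss/_loss.pyx")))              # True: generated file, skipped
print(bool(exclude.search("sklearn/cluster/_k_means_common.pyx")))  # False: hand-written, linted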

11 changes: 1 addition & 10 deletions setup.cfg
@@ -33,18 +33,8 @@ target-version = ['py37']
ignore=
# check ignored by default in flake8. Meaning unclear.
E24,
# continuation line under-indented
E121,
# closing bracket does not match indentation
E123,
# continuation line over-indented for hanging indent
E126,
# space before : (needed for how black formats slicing)
E203,
# missing whitespace around arithmetic operator
E226,
# multiple statements on one line (def)
E704,
Comment on lines -36 to -47

Member Author (jeremiedbb):
All these error codes seem outdated since we are black-compliant. I checked removing them and we indeed don't violate any of them. I think it's a good opportunity to do some clean-up.

# do not assign a lambda expression, use a def
E731,
# do not use variables named 'l', 'O', or 'I'
@@ -82,6 +72,7 @@ allow_redefinition = True
ignore =
sklearn/_loss/_loss.pyx
sklearn/linear_model/_sag_fast.pyx
sklearn/linear_model/_sgd_fast.pyx
sklearn/utils/_seq_dataset.pyx
sklearn/utils/_seq_dataset.pxd
sklearn/utils/_weight_vector.pyx
4 changes: 1 addition & 3 deletions sklearn/_isotonic.pyx
@@ -8,7 +8,6 @@ import numpy as np
from cython cimport floating



def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w):
cdef:
Py_ssize_t n = y.shape[0], i, k
@@ -85,7 +84,6 @@ def _make_unique(const floating[::1] X,
cdef floating current_x = X[0]
cdef floating current_y = 0
cdef floating current_weight = 0
cdef floating y_old = 0
cdef int i = 0
cdef int j
cdef floating x
@@ -114,4 +112,4 @@ def _make_unique(const floating[::1] X,
np.asarray(x_out[:i+1]),
np.asarray(y_out[:i+1]),
np.asarray(weights_out[:i+1]),
)
)
4 changes: 2 additions & 2 deletions sklearn/_loss/_loss.pxd
@@ -14,8 +14,8 @@ ctypedef fused G_DTYPE_C:

# Struct to return 2 doubles
ctypedef struct double_pair:
double val1
double val2
double val1
double val2


# C base class for loss functions
1 change: 1 addition & 0 deletions sklearn/cluster/_dbscan_inner.pyx
@@ -7,6 +7,7 @@ cimport numpy as cnp

cnp.import_array()


def dbscan_inner(const cnp.uint8_t[::1] is_core,
object[:] neighborhoods,
cnp.npy_intp[::1] labels):
2 changes: 1 addition & 1 deletion sklearn/cluster/_k_means_common.pxd
@@ -12,7 +12,7 @@ cdef floating _euclidean_sparse_dense(
const floating[::1],
const int[::1],
const floating[::1],
floating,
floating,
bint
) noexcept nogil

16 changes: 10 additions & 6 deletions sklearn/cluster/_k_means_common.pyx
@@ -35,11 +35,14 @@ cdef floating _euclidean_dense_dense(

# We manually unroll the loop for better cache optimization.
for i in range(n):
result += ((a[0] - b[0]) * (a[0] - b[0])
+(a[1] - b[1]) * (a[1] - b[1])
+(a[2] - b[2]) * (a[2] - b[2])
+(a[3] - b[3]) * (a[3] - b[3]))
a += 4; b += 4
result += (
(a[0] - b[0]) * (a[0] - b[0]) +
(a[1] - b[1]) * (a[1] - b[1]) +
(a[2] - b[2]) * (a[2] - b[2]) +
(a[3] - b[3]) * (a[3] - b[3])
)
a += 4
b += 4

for i in range(rem):
result += (a[i] - b[i]) * (a[i] - b[i])
@@ -77,7 +80,8 @@ cdef floating _euclidean_sparse_dense(

result += b_squared_norm

if result < 0: result = 0.0
if result < 0:
result = 0.0

return result if squared else sqrt(result)

29 changes: 18 additions & 11 deletions sklearn/cluster/_k_means_elkan.pyx
@@ -154,7 +154,6 @@ def init_bounds_sparse(
cdef:
int n_samples = X.shape[0]
int n_clusters = centers.shape[0]
int n_features = X.shape[1]

floating[::1] X_data = X.data
int[::1] X_indices = X.indices
@@ -269,7 +268,7 @@ def elkan_iter_chunked_dense(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int i, j, k
@@ -386,9 +385,11 @@ cdef void _update_chunk_dense(
# If this holds, then center_index is a good candidate for the
# sample to be relabelled, and we need to confirm this by
# recomputing the upper and lower bounds.
if (j != label
if (
j != label
and (upper_bound > lower_bounds[i, j])
and (upper_bound > center_half_distances[label, j])):
and (upper_bound > center_half_distances[label, j])
):

# Recompute upper bound by calculating the actual distance
# between the sample and its current assigned center.
@@ -401,8 +402,10 @@
# If the condition still holds, then compute the actual
# distance between the sample and center. If this is less
# than the previous distance, reassign label.
if (upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])):
if (
upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])
):

distance = _euclidean_dense_dense(
&X[i, 0], &centers_old[j, 0], n_features, False)
@@ -504,7 +507,7 @@ def elkan_iter_chunked_sparse(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int i, j, k
@@ -631,9 +634,11 @@ cdef void _update_chunk_sparse(
# If this holds, then center_index is a good candidate for the
# sample to be relabelled, and we need to confirm this by
# recomputing the upper and lower bounds.
if (j != label
if (
j != label
and (upper_bound > lower_bounds[i, j])
and (upper_bound > center_half_distances[label, j])):
and (upper_bound > center_half_distances[label, j])
):

# Recompute upper bound by calculating the actual distance
# between the sample and its current assigned center.
@@ -648,8 +653,10 @@
# If the condition still holds, then compute the actual
# distance between the sample and center. If this is less
# than the previous distance, reassign label.
if (upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])):
if (
upper_bound > lower_bounds[i, j]
or (upper_bound > center_half_distances[label, j])
):
distance = _euclidean_sparse_dense(
X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
9 changes: 5 additions & 4 deletions sklearn/cluster/_k_means_lloyd.pyx
@@ -87,7 +87,7 @@ def lloyd_iter_chunked_dense(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
int chunk_idx
int start, end

int j, k
@@ -153,8 +153,9 @@ def lloyd_iter_chunked_dense(

if update_centers:
omp_destroy_lock(&lock)
_relocate_empty_clusters_dense(X, sample_weight, centers_old,
centers_new, weight_in_clusters, labels)
_relocate_empty_clusters_dense(
X, sample_weight, centers_old, centers_new, weight_in_clusters, labels
)

_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
@@ -278,7 +279,7 @@ def lloyd_iter_chunked_sparse(
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff = 0
int chunk_idx
int start = 0, end = 0

int j, k
17 changes: 8 additions & 9 deletions sklearn/datasets/_svmlight_format_fast.pyx
@@ -113,6 +113,7 @@ def _load_svmlight_file(f, dtype, bint multilabel, bint zero_based,

return (dtype, data, indices, indptr, labels, query)


# Two fused types are defined to be able to
# use all possible combinations of parameters.
ctypedef fused int_or_float:
@@ -128,8 +129,9 @@ ctypedef fused int_or_longlong:
cython.integral
signed long long


def get_dense_row_string(
int_or_float[:,:] X,
int_or_float[:, :] X,
Py_ssize_t[:] x_inds,
double_or_longlong[:] x_vals,
Py_ssize_t row,
@@ -143,7 +145,7 @@
int_or_float val

for k in range(row_length):
val = X[row,k]
val = X[row, k]
if val == 0:
continue
x_inds[x_nz_used] = k
@@ -157,6 +159,7 @@

return " ".join(reprs)


def get_sparse_row_string(
int_or_float[:] X_data,
int[:] X_indptr,
@@ -176,6 +179,7 @@

return " ".join(reprs)


def _dump_svmlight_file(
X,
y,
@@ -211,8 +215,6 @@ def _dump_svmlight_file(
Py_ssize_t j
Py_ssize_t col_start
Py_ssize_t col_end
bint first
Py_ssize_t x_nz_used
Py_ssize_t[:] x_inds = np.empty(row_length, dtype=np.intp)
signed long long[:] x_vals_int
double[:] x_vals_float
@@ -224,8 +226,6 @@
x_vals_float = np.zeros(row_length, dtype=np.float64)

for i in range(x_len):
x_nz_used = 0

if not X_is_sp:
if X_is_integral:
s = get_dense_row_string(X, x_inds, x_vals_int, i, value_pattern, one_based)
@@ -234,18 +234,17 @@
else:
s = get_sparse_row_string(X.data, X.indptr, X.indices, i, value_pattern, one_based)
if multilabel:
first = True
if y_is_sp:
col_start = y.indptr[i]
col_end = y.indptr[i+1]
labels_str = ','.join(tuple(label_pattern % y.indices[j] for j in range(col_start, col_end) if y.data[j] != 0))
else:
labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i,j] != 0)
labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i, j] != 0)
else:
if y_is_sp:
labels_str = label_pattern % y.data[i]
else:
labels_str = label_pattern % y[i,0]
labels_str = label_pattern % y[i, 0]

if query_id_is_not_empty:
feat = (labels_str, query_id[i], s)
2 changes: 1 addition & 1 deletion sklearn/decomposition/_cdnmf_fast.pyx
@@ -34,5 +34,5 @@ def _update_cdnmf_fast(floating[:, ::1] W, floating[:, :] HHt,

if hess != 0:
W[i, t] = max(W[i, t] - grad / hess, 0.)

return violation