diff --git a/cleanlab/data_valuation.py b/cleanlab/data_valuation.py index 7a80babc82..fcc00b9276 100644 --- a/cleanlab/data_valuation.py +++ b/cleanlab/data_valuation.py @@ -19,14 +19,12 @@ """ -from typing import Callable, Optional, Union, cast +from typing import Callable, Optional, Union import numpy as np from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean -from sklearn.neighbors import NearestNeighbors -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted + +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index def _knn_shapley_score(knn_graph: csr_matrix, labels: np.ndarray, k: int) -> np.ndarray: @@ -45,29 +43,6 @@ def _knn_shapley_score(knn_graph: csr_matrix, labels: np.ndarray, k: int) -> np. return 0.5 * (np.mean(scores / k, axis=0) + 1) -def _process_knn_graph_from_features( - features: np.ndarray, metric: Optional[Union[str, Callable]], k: int = 10 -) -> csr_matrix: - """Calculate the knn graph from the features if it is not provided in the kwargs.""" - if k > len(features): # Ensure number of neighbors less than number of examples - raise ValueError( - f"Number of nearest neighbors k={k} cannot exceed the number of examples N={len(features)} passed into the estimator (knn)." - ) - if metric == None: - metric = ( - "cosine" - if features.shape[1] > 3 - else "euclidean" if features.shape[0] > 100 else euclidean - ) - knn = NearestNeighbors(n_neighbors=k, metric=metric).fit(features) - knn_graph = knn.kneighbors_graph(mode="distance") - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features) - return knn_graph - - def data_shapley_knn( labels: np.ndarray, *, @@ -135,6 +110,7 @@ def data_shapley_knn( if knn_graph is None and features is None: raise ValueError("Either knn_graph or features must be provided.") + # Use provided knn_graph or compute it from features if knn_graph is None: - knn_graph = _process_knn_graph_from_features(cast(np.ndarray, features), metric, k) + knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k, metric=metric) return _knn_shapley_score(knn_graph, labels, k) diff --git a/cleanlab/datalab/internal/issue_manager/data_valuation.py b/cleanlab/datalab/internal/issue_manager/data_valuation.py index 759c9bc400..347a443189 100644 --- a/cleanlab/datalab/internal/issue_manager/data_valuation.py +++ b/cleanlab/datalab/internal/issue_manager/data_valuation.py @@ -25,19 +25,15 @@ Optional, Union, ) -import warnings import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean -from sklearn.exceptions import NotFittedError -from sklearn.neighbors import NearestNeighbors -from sklearn.utils.validation import check_is_fitted from cleanlab.data_valuation import data_shapley_knn from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index if TYPE_CHECKING: # pragma: no cover import numpy.typing as npt @@ -139,33 +135,11 @@ def find_issues( ) raise TypeError(error_msg) if knn_graph is None or metric_changes: - if features is None: - raise ValueError( - "If a knn_graph is not provided, features must be provided to fit a new knn." 
- ) - if self.metric is None: - self.metric = ( - "cosine" - if features.shape[1] > 3 - else "euclidean" if features.shape[0] > 100 else euclidean - ) - knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric).fit(features) - - if self.metric and self.metric != knn.metric: - warnings.warn( - f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " - "Most likely an existing NearestNeighbors object was passed in, but a different " - "metric was specified." - ) + knn_graph, knn = create_knn_graph_and_index( + features, n_neighbors=self.k, metric=self.metric + ) self.metric = knn.metric - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features) - - knn_graph = knn.kneighbors_graph(mode="distance") - scores = data_shapley_knn(labels, knn_graph=knn_graph, k=self.k) self.issues = pd.DataFrame( diff --git a/cleanlab/datalab/internal/issue_manager/duplicate.py b/cleanlab/datalab/internal/issue_manager/duplicate.py index a446d111fc..fb1fd7f319 100644 --- a/cleanlab/datalab/internal/issue_manager/duplicate.py +++ b/cleanlab/datalab/internal/issue_manager/duplicate.py @@ -21,12 +21,10 @@ import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean -from sklearn.neighbors import NearestNeighbors -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted + from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index from cleanlab.internal.constants import EPSILON if TYPE_CHECKING: # pragma: no cover @@ -76,32 +74,10 @@ def find_issues( metric_changes = self.metric and self.metric != old_knn_metric if knn_graph is None or metric_changes: - if features is None: - raise ValueError( - "If a knn_graph is not provided, features must be provided to fit a new knn." - ) - if self.metric is None: - self.metric = ( - "cosine" - if features.shape[1] > 3 - else "euclidean" if features.shape[0] > 100 else euclidean - ) - knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric) - - if self.metric and self.metric != knn.metric: - warnings.warn( - f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " - "Most likely an existing NearestNeighbors object was passed in, but a different " - "metric was specified." 
- ) + knn_graph, knn = create_knn_graph_and_index( + features, n_neighbors=self.k, metric=self.metric + ) self.metric = knn.metric - - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features) - - knn_graph = knn.kneighbors_graph(mode="distance") N = knn_graph.shape[0] nn_distances = knn_graph.data.reshape(N, -1)[:, 0] median_nn_distance = max(np.median(nn_distances), EPSILON) # avoid threshold = 0 diff --git a/cleanlab/datalab/internal/issue_manager/noniid.py b/cleanlab/datalab/internal/issue_manager/noniid.py index 12d42859b4..b28f8c3868 100644 --- a/cleanlab/datalab/internal/issue_manager/noniid.py +++ b/cleanlab/datalab/internal/issue_manager/noniid.py @@ -1,19 +1,16 @@ from __future__ import annotations from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, Optional, Union, cast -import warnings import itertools from scipy.stats import gaussian_kde import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean from sklearn.neighbors import NearestNeighbors -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import construct_knn_graph_from_index, features_to_knn if TYPE_CHECKING: # pragma: no cover import numpy.typing as npt @@ -203,28 +200,8 @@ def _setup_knn( return None features_to_use = self._determine_features(features, pred_probs) - if self.metric is None: - self.metric = ( - "cosine" - if features_to_use.shape[1] > 3 - else "euclidean" if features_to_use.shape[0] > 100 else euclidean - ) - - knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric) - - if self.metric != knn.metric: - warnings.warn( - f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " - "Most likely an existing NearestNeighbors object was passed in, but a different " - "metric was specified." - ) - self.metric = knn.metric - - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features_to_use) - + knn = features_to_knn(features_to_use, n_neighbors=self.k, metric=self.metric) + self.metric = knn.metric # Update the metric to the one used in the KNN object. return knn def find_issues( @@ -305,7 +282,7 @@ def collect_info( } if knn_graph is None: assert knn is not None, "If knn_graph is None, knn must be provided." - knn_graph = knn.kneighbors_graph(mode="distance") # type: ignore[union-attr] + knn_graph = construct_knn_graph_from_index(knn) assert knn_graph is not None, "knn_graph must be provided or computed." 
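+        # The graph is either reused from caller-provided info or freshly built from the fitted index above.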
statistics_dict = self._build_statistics_dictionary(knn_graph=knn_graph) diff --git a/cleanlab/datalab/internal/issue_manager/outlier.py b/cleanlab/datalab/internal/issue_manager/outlier.py index 0ba01c0e00..93ca6b798e 100644 --- a/cleanlab/datalab/internal/issue_manager/outlier.py +++ b/cleanlab/datalab/internal/issue_manager/outlier.py @@ -23,6 +23,7 @@ import pandas as pd from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import construct_knn_graph_from_index from cleanlab.outlier import OutOfDistribution, transform_distances_to_scores if TYPE_CHECKING: # pragma: no cover @@ -219,7 +220,7 @@ def _process_knn_graph_from_features(self, kwargs: Dict) -> csr_matrix: # If the pre-existing knn graph has fewer neighbors than the knn object, # then we need to recompute the knn graph assert knn == self.ood.params["knn"] # type: ignore[union-attr] - knn_graph = knn.kneighbors_graph(mode="distance") # type: ignore[union-attr] + knn_graph = construct_knn_graph_from_index(knn) self._metric = knn.metric # type: ignore[union-attr] return knn_graph diff --git a/cleanlab/datalab/internal/issue_manager/underperforming_group.py b/cleanlab/datalab/internal/issue_manager/underperforming_group.py index 707d788fd1..6f52055af0 100644 --- a/cleanlab/datalab/internal/issue_manager/underperforming_group.py +++ b/cleanlab/datalab/internal/issue_manager/underperforming_group.py @@ -22,13 +22,10 @@ import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean -from sklearn.neighbors import NearestNeighbors -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted from sklearn.cluster import DBSCAN from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index from cleanlab.rank import get_self_confidence_for_each_label if TYPE_CHECKING: # pragma: no cover @@ -153,31 +150,10 @@ def set_knn_graph( metric_changes = self.metric and self.metric != old_knn_metric if knn_graph is None or metric_changes: - if features is None: - raise ValueError( - "If a knn_graph is not provided, features must be provided to fit a new knn." - ) - if self.metric is None: - self.metric = ( - "cosine" - if features.shape[1] > 3 - else "euclidean" if features.shape[0] > 100 else euclidean - ) - knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric) - - if self.metric and self.metric != knn.metric: - warnings.warn( - f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " - "Most likely an existing NearestNeighbors object was passed in, but a different " - "metric was specified." 
- ) + knn_graph, knn = create_knn_graph_and_index( + features, n_neighbors=self.k, metric=self.metric + ) self.metric = knn.metric - - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features) - knn_graph = knn.kneighbors_graph(mode="distance") return knn_graph def perform_clustering(self, knn_graph: csr_matrix) -> npt.NDArray[np.int_]: diff --git a/cleanlab/internal/neighbor/__init__.py b/cleanlab/internal/neighbor/__init__.py new file mode 100644 index 0000000000..2575a85545 --- /dev/null +++ b/cleanlab/internal/neighbor/__init__.py @@ -0,0 +1 @@ +from .knn_graph import features_to_knn diff --git a/cleanlab/internal/neighbor/knn_graph.py b/cleanlab/internal/neighbor/knn_graph.py new file mode 100644 index 0000000000..fbbe96318d --- /dev/null +++ b/cleanlab/internal/neighbor/knn_graph.py @@ -0,0 +1,180 @@ +from __future__ import annotations +from typing import Optional, TYPE_CHECKING, Tuple + +import numpy as np +from scipy.sparse import csr_matrix +from sklearn.neighbors import NearestNeighbors + +if TYPE_CHECKING: + from cleanlab.typing import FeatureArray, Metric + +from cleanlab.internal.neighbor.metric import decide_default_metric +from cleanlab.internal.neighbor.search import construct_knn + + +DEFAULT_K = 10 +"""Default number of neighbors to consider in the k-nearest neighbors search, +unless the size of the feature array is too small or the user specifies a different value. + +This should be the largest desired value of k for all desired issue types that require a KNN graph. + +E.g. if near duplicates wants k=1 but outliers wants 10, then DEFAULT_K should be 10. This way, all issue types can rely on the same KNN graph. +""" + + +def features_to_knn( + features: Optional[FeatureArray], + *, + n_neighbors: Optional[int] = None, + metric: Optional[Metric] = None, + **sklearn_knn_kwargs, +) -> NearestNeighbors: + """Build and fit a k-nearest neighbors search object from an array of numerical features. + + Parameters + ---------- + features : + The input feature array, with shape (N, M), where N is the number of samples and M is the number of features. + n_neighbors : + The number of nearest neighbors to consider. If None, a default value is determined based on the feature array size. + metric : + The distance metric to use for computing distances between points. If None, the metric is determined based on the feature array shape. + **sklearn_knn_kwargs : + Additional keyword arguments to be passed to the search index constructor. + + Returns + ------- + knn : + A k-nearest neighbors search object fitted to the input feature array. + + Examples + -------- + + >>> import numpy as np + >>> from cleanlab.internal.neighbor import features_to_knn + >>> features = np.random.rand(100, 10) + >>> knn = features_to_knn(features) + >>> knn + NearestNeighbors(metric='cosine', n_neighbors=10) + """ + if features is None: + raise ValueError("Both knn and features arguments cannot be None at the same time.") + # Use provided metric if available, otherwise decide based on the features. + metric = metric or decide_default_metric(features) + + # Decide the number of neighbors to use in the KNN search. + n_neighbors = _configure_num_neighbors(features, n_neighbors) + + knn = construct_knn(n_neighbors, metric, **sklearn_knn_kwargs) + return knn.fit(features) + + +def construct_knn_graph_from_index(knn: NearestNeighbors) -> csr_matrix: + """Construct a sparse distance matrix representation of KNN graph out of a fitted NearestNeighbors search object. 
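+
+    Row i of the returned matrix stores the distances from point i to its k nearest
+    neighbors, as found by the fitted search object; all other entries are implicit zeros.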
+ + Parameters + ---------- + knn : + A NearestNeighbors object that has been fitted to a feature array. + The knn graph is constructed based on the distances and indices of each feature row's nearest neighbors. + + Returns + ------- + knn_graph : + A sparse, weighted adjacency matrix representing the KNN graph of the feature array. + + Note + ---- + This is *not* intended to construct a KNN graph of test data. It is only used to construct a KNN graph of the data used to fit the NearestNeighbors object. + + Examples + -------- + >>> import numpy as np + >>> from cleanlab.internal.neighbor.knn_graph import features_to_knn, construct_knn_graph_from_index + >>> features = np.array([ + ... [0.701, 0.701], + ... [0.900, 0.436], + ... [0.000, 1.000], + ... ]) + >>> knn = features_to_knn(features, n_neighbors=1) + >>> knn_graph = construct_knn_graph_from_index(knn) + >>> knn_graph.toarray() # For demonstration purposes only. It is generally a bad idea to transform to dense matrix for large graphs. + array([[0. , 0.33140006, 0. ], + [0.33140006, 0. , 0. ], + [0.76210367, 0. , 0. ]]) + """ + + distances, indices = knn.kneighbors(return_distance=True) + + N, K = distances.shape + + # Pointers to the row elements distances[indptr[i]:indptr[i+1]], + # and their corresponding column indices indices[indptr[i]:indptr[i+1]]. + indptr = np.arange(0, N * K + 1, K) + + return csr_matrix((distances.reshape(-1), indices.reshape(-1), indptr), shape=(N, N)) + + +def create_knn_graph_and_index( + features: Optional[FeatureArray], + *, + n_neighbors: Optional[int] = None, + metric: Optional[Metric] = None, + **sklearn_knn_kwargs, +) -> Tuple[csr_matrix, NearestNeighbors]: + """Calculate the KNN graph from the features if it is not provided in the kwargs. + + Parameters + ---------- + features : + The input feature array, with shape (N, M), where N is the number of samples and M is the number of features. + n_neighbors : + The number of nearest neighbors to consider. If None, a default value is determined based on the feature array size. + metric : + The distance metric to use for computing distances between points. If None, the metric is determined based on the feature array shape. + **sklearn_knn_kwargs : + Additional keyword arguments to be passed to the search index constructor. + + Returns + ------- + knn_graph : + A sparse, weighted adjacency matrix representing the KNN graph of the feature array. + knn : + A k-nearest neighbors search object fitted to the input feature array. This object can be used to query the nearest neighbors of new data points. + + Examples + -------- + >>> import numpy as np + >>> from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index + >>> features = np.array([ + ... [0.701, 0.701], + ... [0.900, 0.436], + ... [0.000, 1.000], + ... ]) + >>> knn_graph, knn = create_knn_graph_and_index(features, n_neighbors=1) + >>> knn_graph.toarray() # For demonstration purposes only. It is generally a bad idea to transform to dense matrix for large graphs. + array([[0. , 0.33140006, 0. ], + [0.33140006, 0. , 0. ], + [0.76210367, 0. , 0. ]]) + >>> knn + NearestNeighbors(metric=, n_neighbors=1) # For demonstration purposes only. The actual metric may vary. 
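+
+    A sketch of reusing the returned search index to query the neighbors of a new point
+    (same toy `features` as above; the exact distances depend on the chosen metric):
+
+    >>> query = np.array([[0.8, 0.6]])
+    >>> distances, indices = knn.kneighbors(query, n_neighbors=1)
+    >>> indices.shape
+    (1, 1)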
+    """
+    # Construct NearestNeighbors object
+    knn = features_to_knn(features, n_neighbors=n_neighbors, metric=metric, **sklearn_knn_kwargs)
+    # Build graph from NearestNeighbors object
+    knn_graph = construct_knn_graph_from_index(knn)
+    return knn_graph, knn
+
+
+def _configure_num_neighbors(features: FeatureArray, k: Optional[int]) -> int:
+    # Error if the provided value is greater than or equal to the number of examples.
+    N = features.shape[0]
+    k_larger_than_dataset = k is not None and k >= N
+    if k_larger_than_dataset:
+        raise ValueError(
+            f"Number of nearest neighbors k={k} must be less than the number of examples N={N} passed into the estimator (knn)."
+        )
+
+    # Either use the provided value or select a default value based on the feature array size.
+    k = k or min(DEFAULT_K, N - 1)
+    return k
diff --git a/cleanlab/internal/neighbor/metric.py b/cleanlab/internal/neighbor/metric.py
new file mode 100644
index 0000000000..321be90ecb
--- /dev/null
+++ b/cleanlab/internal/neighbor/metric.py
@@ -0,0 +1,107 @@
+from scipy.spatial.distance import euclidean
+
+from cleanlab.typing import FeatureArray, Metric
+
+HIGH_DIMENSION_CUTOFF: int = 3
+"""
+If the number of columns (M) in the `features` array is greater than this cutoff value,
+then by default, K-nearest-neighbors will use the "cosine" metric.
+The cosine metric is more suitable for high-dimensional data.
+Otherwise the "euclidean" distance will be used.
+"""
+ROW_COUNT_CUTOFF: int = 100
+"""
+Only affects settings where Euclidean metrics would be used by default.
+If the number of rows (N) in the `features` array is greater than this cutoff value,
+then by default, Euclidean distances are computed via the "euclidean" metric
+(implemented in sklearn for efficiency reasons).
+Otherwise, Euclidean distances are by default computed via
+the ``euclidean`` metric from scipy (slower but numerically more precise/accurate).
+"""
+
+
+# Metric decision functions
+def _euclidean_large_dataset() -> str:
+    return "euclidean"
+
+
+def _euclidean_small_dataset() -> Metric:
+    return euclidean
+
+
+def _cosine_metric() -> str:
+    return "cosine"
+
+
+def decide_euclidean_metric(features: FeatureArray) -> Metric:
+    """
+    Decide the appropriate Euclidean metric implementation based on the size of the dataset.
+
+    Parameters
+    ----------
+    features :
+        The input features array.
+
+    Returns
+    -------
+    metric :
+        A string or a callable representing a specific implementation of computing the euclidean distance.
+
+    Note
+    ----
+    A choice is made between two implementations of the euclidean metric based on
+    the number of rows in the feature array.
+    If the number of rows (N) in the feature array is greater than a predefined
+    cutoff value (ROW_COUNT_CUTOFF), the ``"euclidean"`` metric string is used,
+    because the sklearn implementation performs better on larger datasets.
+    Otherwise, the ``euclidean`` metric function from scipy is returned,
+    which is slower but numerically more precise.
+
+    See Also
+    --------
+    ROW_COUNT_CUTOFF: The cutoff value for the number of rows in the feature array.
+    sklearn.metrics.pairwise.euclidean_distances: The euclidean metric function from scikit-learn.
+    scipy.spatial.distance.euclidean: The euclidean metric function from scipy.
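+
+    Examples
+    --------
+    A small sketch of the size-based switch (ROW_COUNT_CUTOFF is 100 by default):
+
+    >>> import numpy as np
+    >>> from cleanlab.internal.neighbor.metric import decide_euclidean_metric
+    >>> decide_euclidean_metric(np.random.rand(101, 2))
+    'euclidean'
+    >>> callable(decide_euclidean_metric(np.random.rand(50, 2)))  # scipy callable for small N
+    True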
+ """ + num_rows = features.shape[0] + if num_rows > ROW_COUNT_CUTOFF: + return _euclidean_large_dataset() + else: + return _euclidean_small_dataset() + + +# Main function to decide the metric +def decide_default_metric(features: FeatureArray) -> Metric: + """ + Decide the KNN metric to be used based on the shape of the feature array. + + Parameters + ---------- + features : + The input feature array, with shape (N, M), where N is the number of samples and M is the number of features. + + Returns + ------- + metric : + The distance metric to be used for neighbor search. It can be either a string + representing the metric name ("cosine" or "euclidean") or a callable + representing the metric function from scipy (euclidean). + + Note + ---- + The decision of which metric to use is based on the shape of the feature array. + If the number of columns (M) in the feature array is greater than a predefined + cutoff value (HIGH_DIMENSION_CUTOFF), the "cosine" metric is used. This is because the cosine + metric is more suitable for high-dimensional data. + + Otherwise, a euclidean metric is used. + That is handled by the :py:meth:`~cleanlab.internal.neighbor.metric.decide_euclidean_metric` function. + + See Also + -------- + HIGH_DIMENSION_CUTOFF: The cutoff value for the number of columns in the feature array. + sklearn.metrics.pairwise.cosine_distances: The cosine metric function from scikit-learn + """ + if features.shape[1] > HIGH_DIMENSION_CUTOFF: + return _cosine_metric() + return decide_euclidean_metric(features) diff --git a/cleanlab/internal/neighbor/search.py b/cleanlab/internal/neighbor/search.py new file mode 100644 index 0000000000..cf4bb0ab57 --- /dev/null +++ b/cleanlab/internal/neighbor/search.py @@ -0,0 +1,75 @@ +from __future__ import annotations +from typing import TYPE_CHECKING + +from sklearn.neighbors import NearestNeighbors + + +if TYPE_CHECKING: + + from cleanlab.typing import Metric + + +def construct_knn(n_neighbors: int, metric: Metric, **knn_kwargs) -> NearestNeighbors: + """ + Constructs a k-nearest neighbors search object. You can implement a similar method to run cleanlab with your own approximate-KNN library. + + Parameters + ---------- + n_neighbors : + The number of nearest neighbors to consider. + metric : + The distance metric to use for computing distances between points. + See :py:mod:`~cleanlab.internal.neighbor.metric` for more information. + **knn_kwargs: + Additional keyword arguments to be passed to the search index constructor. + See https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html for more details on the available options. + + Returns + ------- + knn : + A k-nearest neighbors search object compatible with the scikit-learn NearestNeighbors class interface. + + Implements: + + - `fit` method: Accepts a feature array `X` to fit the model. + This enables subsequent neighbor searches on the data. + - `kneighbors` method: Finds the K-neighbors of a point, returning distances and indices of the k-nearest neighbors. Handles two scenarios: + 1. When a query array `features: np.ndarray` is provided, it returns the distances and indices for each point in the query array. + 2. When no query array is provided (`features = None`), it returns neighbors for each indexed point without considering the query point as its own neighbor. + Optionally, allows re-specification of the number of neighbors for each query point, defaulting to the constructor's value if not specified. 
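+
+    A minimal duck-typed stand-in satisfying the `fit`/`kneighbors` contract above could
+    look like the following (hypothetical sketch, for illustration only; a complete
+    implementation would also expose the attributes listed next)::
+
+        class ApproxKNN:
+            def __init__(self, n_neighbors, metric, metric_params=None):
+                self.n_neighbors, self.metric, self.metric_params = n_neighbors, metric, metric_params
+
+            def fit(self, X):
+                self._fit_X = X  # build the (approximate) search index over X here
+                return self
+
+            def kneighbors(self, X=None, n_neighbors=None):
+                ...  # return (distances, indices); exclude self-matches when X is None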
+
+    Attributes:
+
+    - `n_neighbors`: Number of neighbors to consider.
+    - `metric`: Distance metric used to compute distances between points.
+    - `metric_params`: Additional parameters for the distance metric function.
+
+    Optional:
+
+    - `kneighbors_graph` method: Not required but can be implemented for convenience.
+      Responsibility for graph construction has shifted to :py:func:`~cleanlab.internal.neighbor.knn_graph.construct_knn_graph_from_index`.
+
+    Fitted Attributes:
+
+    - `n_features_in_`: Number of features observed during fit.
+    - `effective_metric_params_`: Metric parameters used in distance computation.
+    - `effective_metric_`: Metric used for computing distances to neighbors.
+    - `n_samples_fit_`: Number of samples in the fitted data.
+
+    Additional:
+
+    - `__sklearn_is_fitted__`: Method returning a boolean indicating if the object is fitted,
+      useful for conducting an is_fitted validation, which verifies the presence of fitted attributes (typically named with a trailing underscore).
+
+    These specifications ensure compatibility and give developers a clear directive for
+    integrating alternative k-nearest neighbors implementations.
+
+    Note
+    ----
+    The `metric` argument may be either a string naming a metric supported by the search index
+    or a callable that takes two points and returns the distance between them.
+    The additional keyword arguments (`**knn_kwargs`) are passed directly to the underlying k-nearest neighbors search algorithm.
+
+    """
+    sklearn_knn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, **knn_kwargs)
+
+    return sklearn_knn
diff --git a/cleanlab/outlier.py b/cleanlab/outlier.py
index b851f0d305..cb3432a0db 100644
--- a/cleanlab/outlier.py
+++ b/cleanlab/outlier.py
@@ -21,10 +21,9 @@
 """
 
 import warnings
-from typing import Callable, Dict, Optional, Tuple, Union
+from typing import Dict, Optional, Tuple, Union
 
 import numpy as np
-from scipy.spatial.distance import euclidean
 from sklearn.exceptions import NotFittedError
 from sklearn.neighbors import NearestNeighbors
 
@@ -33,6 +32,7 @@
     _subtract_confident_thresholds,
     get_normalized_entropy,
 )
+from cleanlab.internal.neighbor.knn_graph import features_to_knn
 from cleanlab.internal.numerics import softmax
 from cleanlab.internal.outlier import correct_precision_errors, transform_distances_to_scores
 from cleanlab.internal.validation import assert_valid_inputs, labels_to_array
@@ -424,30 +424,13 @@ def _get_ood_features_scores(
     distance_metric = None
     if knn is None:  # setup default KNN estimator
         # Make sure both knn and features are not None
-        if features is None:
-            raise ValueError(
-                "Both knn and features arguments cannot be None at the same time. Not enough information to compute outlier scores."
-            )
-        if k is None:
-            k = DEFAULT_K  # use default when knn and k are both None
-        N, M = features.shape
-        if k > N:  # Ensure number of neighbors less than number of examples
-            raise ValueError(
-                f"Number of nearest neighbors k={k} cannot exceed the number of examples N={len(features)} passed into the estimator (knn)."
- ) - - # strings are used for sklearn metrics, callables are scipy pairwise distance functions - metric: Union[str, Callable] - if M > 3: # use euclidean distance for lower dimensional spaces - metric = "cosine" - elif N > 100: # Use efficient implementation (numerically unstable in edge cases) - metric = "euclidean" - else: # Use scipy implementation for precise results - metric = euclidean - - knn = NearestNeighbors(n_neighbors=k, metric=metric).fit(features) + knn = features_to_knn(features, n_neighbors=k) features = None # features should be None in knn.kneighbors(features) to avoid counting duplicate data points - distance_metric = metric if isinstance(metric, str) else str(metric.__name__) + # Log knn metric as string to ensure compatibility for score correction + distance_metric = ( + metric if isinstance((metric := knn.metric), str) else str(metric.__name__) + ) + k = knn.n_neighbors elif k is None: k = knn.n_neighbors diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 4badef6a13..439ddbb02a 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -27,9 +27,9 @@ from typing import Dict, Callable, Optional, Union import numpy as np from numpy.typing import ArrayLike -from scipy.spatial.distance import euclidean -from sklearn.neighbors import NearestNeighbors +from cleanlab.internal.neighbor.metric import decide_euclidean_metric +from cleanlab.internal.neighbor.knn_graph import features_to_knn from cleanlab.outlier import OutOfDistribution from cleanlab.internal.regression_utils import assert_valid_prediction_inputs @@ -142,7 +142,7 @@ def _get_outre_score_for_each_label( *, residual_scale: float = 5, frac_neighbors: float = 0.5, - neighbor_metric: Optional[Union[str, Callable]] = "euclidean", + neighbor_metric: Optional[Union[str, Callable]] = None, ) -> np.ndarray: """Returns OUTRE based label-quality scores. @@ -181,12 +181,9 @@ def _get_outre_score_for_each_label( features = np.array([labels, residual]).T neighbors = int(np.ceil(frac_neighbors * labels.shape[0])) - if neighbor_metric is None: - if features.shape[0] > 100: - neighbor_metric = "euclidean" - else: - neighbor_metric = euclidean - knn = NearestNeighbors(n_neighbors=neighbors, metric=neighbor_metric).fit(features) + # Use provided metric or select a decent implementation of the euclidean metric for knn search + neighbor_metric = neighbor_metric or decide_euclidean_metric(features) + knn = features_to_knn(features, n_neighbors=neighbors, metric=neighbor_metric) ood = OutOfDistribution(params={"knn": knn}) label_quality_scores = ood.score(features=features) diff --git a/cleanlab/typing.py b/cleanlab/typing.py index 995d20bdec..48aabe1a04 100644 --- a/cleanlab/typing.py +++ b/cleanlab/typing.py @@ -1,4 +1,4 @@ -from typing import Any, Union +from typing import Any, Callable, Union import numpy as np import pandas as pd @@ -8,3 +8,20 @@ DatasetLike = Any """Type for objects that behave like datasets.""" + +########################################################### +# Types aliases used in cleanlab/internal/neighbor/ modules +########################################################### + +FeatureArray = np.ndarray +"""A type alias for a 2D numpy array representing numerical features.""" +Metric = Union[str, Callable] +"""A type alias for the distance metric to be used for neighbor search. It can be either a string +representing the metric name ("cosine" or "euclidean") or a callable representing the metric function from scipy (euclidean). 
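+
+A short sketch of both accepted forms (illustrative only):
+
+>>> from scipy.spatial.distance import euclidean
+>>> from cleanlab.typing import Metric
+>>> metric_by_name: Metric = "cosine"        # string understood by sklearn
+>>> metric_by_callable: Metric = euclidean   # scipy callable, used for small datasets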
+ +Valid values for metric are mentioned in the scikit-learn documentation for the sklearn.metrics.pairwise_distances function. + +See Also +-------- +sklearn.metrics.pairwise_distances: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html#sklearn-metrics-pairwise-distances +""" diff --git a/docs/source/cleanlab/internal/index.rst b/docs/source/cleanlab/internal/index.rst index 6642479281..df59cbcbb5 100644 --- a/docs/source/cleanlab/internal/index.rst +++ b/docs/source/cleanlab/internal/index.rst @@ -17,6 +17,7 @@ internal label_quality_utils multilabel_utils multilabel_scorer + neighbor/index outlier token_classification_utils validation diff --git a/docs/source/cleanlab/internal/neighbor/index.rst b/docs/source/cleanlab/internal/neighbor/index.rst new file mode 100644 index 0000000000..27439ece96 --- /dev/null +++ b/docs/source/cleanlab/internal/neighbor/index.rst @@ -0,0 +1,21 @@ +neighbor +======== + +The `neighbor` modules provide functionality for performing nearest neighbor search and pairwise distance calculations in those searches. + +This submodule consists of the following modules: + +- `neighbor.knn_graph`: Contains functions for setting up a nearest neighbor search index and constructing knn graphs. +- `neighbor.search`: Contains a helper function that wraps the default implementation of nearest neighbor searches. +- `neighbor.metric`: Contains functions for selecting distance metrics for nearest neighbor searches. + +.. automodule:: cleanlab.internal.neighbor + :autosummary: + :members: + :undoc-members: + :show-inheritance: + +.. toctree:: + knn_graph + metric + search diff --git a/docs/source/cleanlab/internal/neighbor/knn_graph.rst b/docs/source/cleanlab/internal/neighbor/knn_graph.rst new file mode 100644 index 0000000000..1486a76e16 --- /dev/null +++ b/docs/source/cleanlab/internal/neighbor/knn_graph.rst @@ -0,0 +1,8 @@ +knn_graph +========= + +.. automodule:: cleanlab.internal.neighbor.knn_graph + :autosummary: + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/cleanlab/internal/neighbor/metric.rst b/docs/source/cleanlab/internal/neighbor/metric.rst new file mode 100644 index 0000000000..f78f47cf50 --- /dev/null +++ b/docs/source/cleanlab/internal/neighbor/metric.rst @@ -0,0 +1,8 @@ +metric +====== + +.. automodule:: cleanlab.internal.neighbor.metric + :autosummary: + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/cleanlab/internal/neighbor/search.rst b/docs/source/cleanlab/internal/neighbor/search.rst new file mode 100644 index 0000000000..056bfbc0a2 --- /dev/null +++ b/docs/source/cleanlab/internal/neighbor/search.rst @@ -0,0 +1,8 @@ +search +====== + +.. 
automodule:: cleanlab.internal.neighbor.search
   :autosummary:
   :members:
   :undoc-members:
   :show-inheritance:
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 82b8ba2d76..fab96aa43f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,6 +3,7 @@ ignore =
 select = F401,F403
 per-file-ignores =
     cleanlab/__init__.py: F401
+    cleanlab/internal/neighbor/__init__.py: F401
     cleanlab/token_classification/__init__.py: F401
    cleanlab/benchmarking/__init__.py: F401
    cleanlab/regression/__init__.py: F401
diff --git a/tests/datalab/datalab/test_datalab.py b/tests/datalab/datalab/test_datalab.py
index f52115180c..9693c11544 100644
--- a/tests/datalab/datalab/test_datalab.py
+++ b/tests/datalab/datalab/test_datalab.py
@@ -938,6 +938,16 @@ def test_non_iid_issues_pred_probs_knn_graph_checks(self, lab, random_embeddings
         issues_2 = lab_2.get_issues("non_iid")
         pd.testing.assert_frame_equal(issues_1, issues_2)
 
+    def test_all_identical_dataset(self):
+        """Test that the non-IID issue finder correctly identifies an all-identical dataset."""
+        N, M = 200, 10
+        data = {"labels": [0] * N}
+        features = np.full((N, M), fill_value=np.random.rand(M))
+        lab = Datalab(data=data, label_name="labels")
+        lab.find_issues(features=features, issue_types={"non_iid": {}})
+        assert lab.get_issues()["is_non_iid_issue"].sum() == 1
+        assert lab.get_issue_summary()["score"].values[0] < 0.05
+
 
 class TestDatalabFindLabelIssues:
     @pytest.fixture
@@ -1036,6 +1046,17 @@ def test_incremental_search(self, pred_probs, random_embeddings):
         outlier_summary = lab.get_issue_summary("outlier")
         assert outlier_summary["num_issues"].values[0] > 0
 
+    def test_all_identical_dataset(self):
+        """Test that the outlier issue finder correctly handles an all-identical dataset."""
+        N, M = 200, 10
+        data = {"labels": [0] * N}
+        features = np.full((N, M), fill_value=np.random.rand(M))
+        lab = Datalab(data=data, label_name="labels")
+        lab.find_issues(features=features, issue_types={"outlier": {}})
+        outlier_issues = lab.get_issues("outlier")
+        assert outlier_issues["is_outlier_issue"].sum() == 0
+        np.testing.assert_allclose(outlier_issues["outlier_score"].to_numpy(), 1)
+
 
 class TestDatalabFindNearDuplicateIssues:
     @pytest.fixture
@@ -1148,6 +1169,17 @@ def test_fixed_embeddings_outputs(self, fixed_embeddings):
         unique_non_empty_sets = [tuple(s) for s in near_duplicate_sets if len(s) > 0]
         assert len(set(unique_non_empty_sets)) == 18
 
+    def test_all_identical_dataset(self):
+        """Test that the near duplicate issue finder correctly flags an all-identical dataset."""
+        N, M = 200, 10
+        data = {"labels": [0] * N}
+        features = np.full((N, M), fill_value=np.random.rand(M))
+        lab = Datalab(data=data, label_name="labels")
+        lab.find_issues(features=features, issue_types={"near_duplicate": {}})
+        near_duplicate_issues = lab.get_issues("near_duplicate")
+        assert near_duplicate_issues["is_near_duplicate_issue"].sum() == N
+        np.testing.assert_allclose(near_duplicate_issues["near_duplicate_score"].to_numpy(), 0)
+
 
 class TestDatalabWithoutLabels:
     num_examples = 100
@@ -1402,6 +1434,22 @@ def test_no_cluster_ids(self, data):
         )
         assert len(lab.issue_summary["issue_type"].values) == 0
 
+    def test_all_identical_dataset(self):
+        """Test that the underperforming group issue finder correctly handles an all-identical dataset."""
+        N, M = 200, 10
+        data = {"labels": [0] * N}
+        features = np.full((N, M), fill_value=np.random.rand(M))
+        pred_probs = np.full((N, 2), fill_value=[1, 0])
+        lab = Datalab(data=data, label_name="labels")
+        lab.find_issues(
+            features=features, pred_probs=pred_probs,
issue_types={"underperforming_group": {}} + ) + underperforming_issues = lab.get_issues("underperforming_group") + assert underperforming_issues["is_underperforming_group_issue"].sum() == 0 + np.testing.assert_allclose( + underperforming_issues["underperforming_group_score"].to_numpy(), 1 + ) + class TestDataLabNullIssues: K = 3 @@ -1610,6 +1658,23 @@ def test_find_issues_with_different_metrics(self, dataset, knn_graph): lab.get_info("statistics")["weighted_knn_graph"].toarray(), ) + def test_all_identical_dataset(self): + """Test that the data_valuation issue finder correctly handles an all-identical dataset.""" + N, M = 11, 10 + data = {"labels": [0] * N} + features = np.full((N, M), fill_value=np.random.rand(M)) + lab = Datalab(data=data, label_name="labels") + lab.find_issues(features=features, issue_types={"data_valuation": {}}) + data_valuation_issues = lab.get_issues("data_valuation") + assert data_valuation_issues["is_data_valuation_issue"].sum() == 0 + + # For a full knn-graph, all data points have the same value. Here, they all contribute the same value. + # The score of 54/99 is a value that works for 11 identical data points. + # TODO: Find a reasonable test for larger dataset, with k much smaller than N. Hard to guarantee a score of 0.5. + np.testing.assert_allclose( + data_valuation_issues["data_valuation_score"].to_numpy(), 54 / 99 + ) + class TestIssueManagersReuseKnnGraph: """ diff --git a/tests/internal/neighbor/test_metric.py b/tests/internal/neighbor/test_metric.py new file mode 100644 index 0000000000..ae65a242c0 --- /dev/null +++ b/tests/internal/neighbor/test_metric.py @@ -0,0 +1,31 @@ +import numpy as np +import pytest + +from cleanlab.internal.neighbor.metric import decide_default_metric + + +@pytest.mark.parametrize( + "N", + [2, 10, 50, 100, 101], +) +def test_decide_default_metric_for_2d_and_3d_features(N): + # 2D and 3D features should always use the euclidean metric, disregarding the different implementations. + for M in [2, 3]: + X = np.random.rand(N, M) + metric = decide_default_metric(X) + if hasattr(metric, "__name__"): + error_msg = "The metric should be the string 'euclidean' for N > 100." + assert N <= 100, error_msg + metric = getattr(metric, "__name__") + assert metric == "euclidean" + + +@pytest.mark.parametrize( + "M", + [4, 5, 10, 50, 100], +) +def test_decide_default_metric_for_high_dimensional_features(M): + # High-dimensional features should always use the cosine metric. + X = np.random.rand(100, M) + metric = decide_default_metric(X) + assert metric == "cosine" diff --git a/tests/internal/neighbor/test_neighbor.py b/tests/internal/neighbor/test_neighbor.py new file mode 100644 index 0000000000..b3f6180a2d --- /dev/null +++ b/tests/internal/neighbor/test_neighbor.py @@ -0,0 +1,105 @@ +from typing import cast +import pytest +import numpy as np +from sklearn.neighbors import NearestNeighbors + +from cleanlab.internal.neighbor import features_to_knn +from cleanlab.internal.neighbor.knn_graph import construct_knn_graph_from_index + + +@pytest.mark.parametrize( + "N", + [2, 10, 100, 101], + ids=lambda x: f"N={x}", +) +@pytest.mark.parametrize( + "M", + [2, 3, 4, 5, 10, 50, 100], + ids=lambda x: f"M={x}", +) +def test_features_to_knn(N, M): + + features = np.random.rand(N, M) + if N >= 100: + features[-10:] = features[-11] # Make the last 11 entries all identical, as an edge-case. 
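+        # The duplicates let the checks below verify that identical points get (near-)zero neighbor distances.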
+    knn = features_to_knn(features)
+
+    assert isinstance(knn, NearestNeighbors)
+    knn = cast(NearestNeighbors, knn)
+    assert knn.n_neighbors == min(10, N - 1)
+    if M > 3:
+        metric = knn.metric
+        assert metric == "cosine"
+    else:
+        metric = knn.metric
+        if N <= 100:
+            assert hasattr(metric, "__name__")
+            metric = metric.__name__
+        assert metric == "euclidean"
+
+    if N >= 100:
+        distances, indices = knn.kneighbors(n_neighbors=10)
+        # Assert that the last 10 rows are identical to the 11th last row.
+        assert np.allclose(features[-10:], features[-11])
+        np.testing.assert_allclose(distances[-11:], 0, atol=1e-15)
+        # All the indices belong to the same example, so the set of indices should be the same.
+        # No guarantees about the order of the indices, but each point is not considered its own neighbor.
+        np.testing.assert_allclose(np.unique(indices[-11:]), np.arange(start=N - 11, stop=N))
+
+    # The knn object should be fitted to the features.
+    # TODO: This is not a good test, but it's the best we can do without exposing the internal state of the knn object.
+    assert knn._fit_X is features
+
+
+def test_knn_kwargs():
+    """Check that features_to_knn passes additional keyword arguments to the NearestNeighbors constructor correctly."""
+    N, M = 100, 10
+    features = np.random.rand(N, M)
+    V = features.var(axis=0)
+    knn = features_to_knn(
+        features,
+        n_neighbors=6,
+        metric="seuclidean",
+        metric_params={"V": V},
+    )
+
+    assert knn.n_neighbors == 6
+    assert knn.radius == 1.0
+    assert (alg := knn.algorithm) == "auto"
+    assert knn.leaf_size == 30
+    assert knn.metric == "seuclidean"
+    assert knn.metric_params == {"V": V}
+    assert knn.p == 2
+    assert knn._fit_X is features  # Not a public attribute, bad idea to rely on this attribute.
+
+    # Attributes estimated from fitted data
+    assert knn.n_features_in_ == M
+    assert knn.effective_metric_params_ == {"V": V}
+    assert knn.effective_metric_ == "seuclidean"
+    assert knn.n_samples_fit_ == N
+    assert knn._fit_method == (
+        "ball_tree" if alg == "auto" else alg
+    )  # Should be one of ["kd_tree", "ball_tree", "brute"], set with "algorithm"
+
+
+@pytest.mark.parametrize("metric", ["cosine", "euclidean"])
+def test_construct_knn_graph_from_index(metric):
+    N, k = 100, 10
+    knn = NearestNeighbors(n_neighbors=k, metric=metric)
+    X = np.random.rand(N, 10)
+    knn.fit(X)
+    knn_graph = construct_knn_graph_from_index(knn)
+
+    assert knn_graph.shape == (N, N)
+    assert knn_graph.nnz == N * k
+    assert knn_graph.dtype == np.float64
+    assert np.all(knn_graph.data >= 0)
+    assert np.all(knn_graph.indices >= 0)
+    assert np.all(knn_graph.indices < N)
+
+    distances = knn_graph.data.reshape(N, k)
+    indices = knn_graph.indices.reshape(N, k)
+
+    # Assert all rows in distances are sorted
+    assert np.all(np.diff(distances, axis=1) >= 0)
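
Putting the new modules together, a minimal end-to-end sketch of the intended usage
(illustrative only; concrete numbers depend on the random data):

    import numpy as np
    from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index

    features = np.random.rand(500, 8)  # M=8 > HIGH_DIMENSION_CUTOFF, so "cosine" is chosen by default
    knn_graph, knn = create_knn_graph_and_index(features, n_neighbors=5)

    assert knn_graph.shape == (500, 500)  # sparse CSR graph with 5 stored distances per row
    assert knn_graph.nnz == 500 * 5
    distances, indices = knn.kneighbors(np.random.rand(2, 8))  # reuse the fitted index for new queries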