# _hierarchical_fast.pyx (forked from scikit-learn/scikit-learn)
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
import numpy as np
cimport numpy as np
cimport cython
ctypedef np.float64_t DOUBLE
ctypedef np.npy_intp INTP
ctypedef np.int8_t INT8
# NumPy must be initialized. When using NumPy from C or Cython you must
# _always_ do this, or you will get segfaults.
np.import_array()
from ..neighbors._dist_metrics cimport DistanceMetric
from ..utils._fast_dict cimport IntFloatDict
# C++
from cython.operator cimport dereference as deref, preincrement as inc
from libcpp.map cimport map as cpp_map
from libc.math cimport fmax
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t
ITYPE = np.intp
ctypedef np.intp_t ITYPE_t
from numpy.math cimport INFINITY
###############################################################################
# Utilities for computing the Ward distance from the cluster moments
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def compute_ward_dist(np.ndarray[DOUBLE, ndim=1, mode='c'] m_1,
np.ndarray[DOUBLE, ndim=2, mode='c'] m_2,
np.ndarray[INTP, ndim=1, mode='c'] coord_row,
np.ndarray[INTP, ndim=1, mode='c'] coord_col,
np.ndarray[DOUBLE, ndim=1, mode='c'] res):
cdef INTP size_max = coord_row.shape[0]
cdef INTP n_features = m_2.shape[1]
cdef INTP i, j, row, col
cdef DOUBLE pa, n
for i in range(size_max):
row = coord_row[i]
col = coord_col[i]
n = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col])
pa = 0.
for j in range(n_features):
pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2
res[i] = pa * n
return res
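
# Usage sketch for compute_ward_dist. The import path is an assumption (this
# file compiles into the private module sklearn.cluster._hierarchical_fast,
# which may move between releases) and the two-cluster moments are made up:
#
#   >>> import numpy as np
#   >>> from sklearn.cluster._hierarchical_fast import compute_ward_dist
#   >>> m_1 = np.array([2., 3.])               # cluster sizes (0th moments)
#   >>> m_2 = np.array([[2., 4.], [9., 3.]])   # per-cluster feature sums (1st moments)
#   >>> coord_row = np.array([0], dtype=np.intp)
#   >>> coord_col = np.array([1], dtype=np.intp)
#   >>> res = np.zeros(1)
#   >>> compute_ward_dist(m_1, m_2, coord_row, coord_col, res)
#   >>> # res[0] == n * ||m_2[0]/m_1[0] - m_2[1]/m_1[1]||**2
#   >>> # with n = m_1[0] * m_1[1] / (m_1[0] + m_1[1])
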
###############################################################################
# Utilities for cutting and exploring a hierarchical tree
def _hc_get_descendent(INTP node, children, INTP n_leaves):
"""
Return all the descendent leaves of a node in the tree.
Parameters
----------
node : integer
The node for which we want the descendents.
children : list of pairs, length n_nodes
The children of each non-leaf node. Values less than `n_leaves` refer
to leaves of the tree. A greater value `i` indicates a node with
children `children[i - n_leaves]`.
n_leaves : integer
Number of leaves.
Returns
-------
descendent : list of int
"""
ind = [node]
if node < n_leaves:
return ind
descendent = []
# It is actually faster to do the accounting of the number of
# elements in the list ourselves: len is a lengthy operation on a
# linked list
cdef INTP i, n_indices = 1
while n_indices:
i = ind.pop()
if i < n_leaves:
descendent.append(i)
n_indices -= 1
else:
ind.extend(children[i - n_leaves])
n_indices += 1
return descendent
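
# A tiny illustrative call (hypothetical data; the private import path is an
# assumption):
#
#   >>> from sklearn.cluster._hierarchical_fast import _hc_get_descendent
#   >>> children = [(0, 1), (2, 3)]  # node 4 merges (0, 1); node 5 merges (2, 3)
#   >>> _hc_get_descendent(5, children, 4)
#   [3, 2]
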
@cython.boundscheck(False)
@cython.wraparound(False)
def hc_get_heads(np.ndarray[INTP, ndim=1] parents, copy=True):
"""Returns the heads of the forest, as defined by parents.
Parameters
----------
parents : array of integers
The parent structure defining the forest (ensemble of trees)
copy : boolean
If copy is False, the input 'parents' array is modified inplace
Returns
-------
heads : array of integers of same shape as parents
The indices in the 'parents' of the tree heads
"""
cdef INTP parent, node0, node, size
if copy:
parents = np.copy(parents)
size = parents.size
# Start from the top of the tree and go down
for node0 in range(size - 1, -1, -1):
node = node0
parent = parents[node]
while parent != node:
parents[node0] = parent
node = parent
parent = parents[node]
return parents
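
# Example with a hypothetical parent array: 0 -> 1 -> 2 and node 2 is its own
# parent, so 2 is the head of the (single) tree; note that the returned array
# is also path-compressed:
#
#   >>> import numpy as np
#   >>> from sklearn.cluster._hierarchical_fast import hc_get_heads
#   >>> hc_get_heads(np.array([1, 2, 2], dtype=np.intp))
#   array([2, 2, 2])
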
@cython.boundscheck(False)
@cython.wraparound(False)
def _get_parents(nodes, heads, np.ndarray[INTP, ndim=1] parents,
np.ndarray[INT8, ndim=1, mode='c'] not_visited):
"""Returns the heads of the given nodes, as defined by parents.
Modifies 'heads' and 'not_visited' in-place.
Parameters
----------
nodes : list of integers
The nodes to start from
heads : list of integers
A list to hold the results (modified inplace)
parents : array of integers
The parent structure defining the tree
not_visited
The tree nodes to consider (modified inplace)
"""
cdef INTP parent, node
for node in nodes:
parent = parents[node]
while parent != node:
node = parent
parent = parents[node]
if not_visited[node]:
not_visited[node] = 0
heads.append(node)
return heads
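
# Small sketch (hypothetical data): both nodes 0 and 1 hang under node 2, so
# the head 2 is reported exactly once:
#
#   >>> import numpy as np
#   >>> from sklearn.cluster._hierarchical_fast import _get_parents
#   >>> parents = np.array([2, 2, 2], dtype=np.intp)
#   >>> not_visited = np.ones(3, dtype=np.int8)
#   >>> _get_parents([0, 1], [], parents, not_visited)
#   [2]
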
###############################################################################
# merge strategies implemented on IntFloatDicts
# These are used in the hierarchical clustering code, to implement
# merging between two clusters, each defined as a dict with node numbers
# as keys and edge weights as values.
@cython.boundscheck(False)
@cython.wraparound(False)
def max_merge(IntFloatDict a, IntFloatDict b,
np.ndarray[ITYPE_t, ndim=1] mask,
ITYPE_t n_a, ITYPE_t n_b):
"""Merge two IntFloatDicts with the max strategy: when the same key is
present in the two dicts, the max of the two values is used.
Parameters
==========
a, b : IntFloatDict object
The IntFloatDicts to merge
mask : ndarray array of dtype integer and of dimension 1
a mask for keys to ignore: if not mask[key] the corresponding key
is skipped in the output dictionary
n_a, n_b : int
n_a and n_b are weights for a and b for the merge strategy.
They are not used in the case of a max merge.
Returns
=======
out : IntFloatDict object
The IntFloatDict resulting from the merge
"""
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
cdef cpp_map[ITYPE_t, DTYPE_t].iterator a_it = a.my_map.begin()
cdef cpp_map[ITYPE_t, DTYPE_t].iterator a_end = a.my_map.end()
cdef ITYPE_t key
cdef DTYPE_t value
# First copy a into out
while a_it != a_end:
key = deref(a_it).first
if mask[key]:
out_obj.my_map[key] = deref(a_it).second
inc(a_it)
# Then merge b into out
cdef cpp_map[ITYPE_t, DTYPE_t].iterator out_it = out_obj.my_map.begin()
cdef cpp_map[ITYPE_t, DTYPE_t].iterator out_end = out_obj.my_map.end()
cdef cpp_map[ITYPE_t, DTYPE_t].iterator b_it = b.my_map.begin()
cdef cpp_map[ITYPE_t, DTYPE_t].iterator b_end = b.my_map.end()
while b_it != b_end:
key = deref(b_it).first
value = deref(b_it).second
if mask[key]:
out_it = out_obj.my_map.find(key)
if out_it == out_end:
# Key not found
out_obj.my_map[key] = value
else:
deref(out_it).second = fmax(deref(out_it).second, value)
inc(b_it)
return out_obj
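
# Usage sketch for max_merge (IntFloatDict comes from the private module
# sklearn.utils._fast_dict, matching the cimport above; the exact paths and
# the toy values are assumptions):
#
#   >>> import numpy as np
#   >>> from sklearn.utils._fast_dict import IntFloatDict
#   >>> from sklearn.cluster._hierarchical_fast import max_merge
#   >>> a = IntFloatDict(np.array([1, 2], dtype=np.intp), np.array([2., 5.]))
#   >>> b = IntFloatDict(np.array([2, 3], dtype=np.intp), np.array([7., 1.]))
#   >>> mask = np.ones(4, dtype=np.intp)   # keep every key
#   >>> out = max_merge(a, b, mask, 1, 1)  # n_a, n_b unused for max
#   >>> out[2]                             # key 2 collides: max(5., 7.)
#   7.0
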
@cython.boundscheck(False)
@cython.wraparound(False)
def average_merge(IntFloatDict a, IntFloatDict b,
np.ndarray[ITYPE_t, ndim=1] mask,
ITYPE_t n_a, ITYPE_t n_b):
"""Merge two IntFloatDicts with the average strategy: when the
same key is present in the two dicts, the weighted average of the two
values is used.
Parameters
==========
a, b : IntFloatDict object
The IntFloatDicts to merge
mask : ndarray array of dtype integer and of dimension 1
a mask for keys to ignore: if not mask[key] the corresponding key
is skipped in the output dictionary
n_a, n_b : int
n_a and n_b are weights for a and b for the merge strategy.
They are used for a weighted mean.
Returns
=======
out : IntFloatDict object
The IntFloatDict resulting from the merge
"""
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
cdef cpp_map[ITYPE_t, DTYPE_t].iterator a_it = a.my_map.begin()
cdef cpp_map[ITYPE_t, DTYPE_t].iterator a_end = a.my_map.end()
cdef ITYPE_t key
cdef DTYPE_t value
cdef DTYPE_t n_out = <DTYPE_t> (n_a + n_b)
# First copy a into out
while a_it != a_end:
key = deref(a_it).first
if mask[key]:
out_obj.my_map[key] = deref(a_it).second
inc(a_it)
# Then merge b into out
cdef cpp_map[ITYPE_t, DTYPE_t].iterator out_it = out_obj.my_map.begin()
cdef cpp_map[ITYPE_t, DTYPE_t].iterator out_end = out_obj.my_map.end()
cdef cpp_map[ITYPE_t, DTYPE_t].iterator b_it = b.my_map.begin()
cdef cpp_map[ITYPE_t, DTYPE_t].iterator b_end = b.my_map.end()
while b_it != b_end:
key = deref(b_it).first
value = deref(b_it).second
if mask[key]:
out_it = out_obj.my_map.find(key)
if out_it == out_end:
# Key not found
out_obj.my_map[key] = value
else:
deref(out_it).second = (n_a * deref(out_it).second
+ n_b * value) / n_out
inc(b_it)
return out_obj
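
# Same call shape as max_merge above, but the colliding key gets the weighted
# mean using n_a and n_b (reusing the hypothetical a, b and mask from the
# max_merge sketch):
#
#   >>> out = average_merge(a, b, mask, 1, 3)
#   >>> out[2]                  # (1 * 5. + 3 * 7.) / (1 + 3)
#   6.5
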
###############################################################################
# An edge object for fast comparisons
cdef class WeightedEdge:
cdef public ITYPE_t a
cdef public ITYPE_t b
cdef public DTYPE_t weight
def __init__(self, DTYPE_t weight, ITYPE_t a, ITYPE_t b):
self.weight = weight
self.a = a
self.b = b
@cython.nonecheck(False)
def __richcmp__(self, WeightedEdge other, int op):
"""Cython-specific comparison method.
op is the comparison code::
< 0
== 2
> 4
<= 1
!= 3
>= 5
"""
if op == 0:
return self.weight < other.weight
elif op == 1:
return self.weight <= other.weight
elif op == 2:
return self.weight == other.weight
elif op == 3:
return self.weight != other.weight
elif op == 4:
return self.weight > other.weight
elif op == 5:
return self.weight >= other.weight
def __repr__(self):
return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__,
self.weight,
self.a, self.b)
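
# WeightedEdge objects compare on weight only, so candidate merges can be kept
# in a heap or sorted container. A small sketch (hypothetical edges; the
# private import path is an assumption):
#
#   >>> from sklearn.cluster._hierarchical_fast import WeightedEdge
#   >>> light, heavy = WeightedEdge(0.5, 0, 1), WeightedEdge(1.5, 0, 2)
#   >>> light < heavy
#   True
#   >>> min(heavy, light).weight
#   0.5
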
################################################################################
# Efficient labelling/conversion of MSTs to single linkage hierarchies
cdef class UnionFind(object):
cdef ITYPE_t next_label
cdef ITYPE_t[:] parent
cdef ITYPE_t[:] size
def __init__(self, N):
self.parent = np.full(2 * N - 1, -1., dtype=ITYPE, order='C')
self.next_label = N
self.size = np.hstack((np.ones(N, dtype=ITYPE),
np.zeros(N - 1, dtype=ITYPE)))
@cython.boundscheck(False)
@cython.nonecheck(False)
cdef void union(self, ITYPE_t m, ITYPE_t n):
self.parent[m] = self.next_label
self.parent[n] = self.next_label
self.size[self.next_label] = self.size[m] + self.size[n]
self.next_label += 1
return
@cython.boundscheck(False)
@cython.nonecheck(False)
cdef ITYPE_t fast_find(self, ITYPE_t n):
cdef ITYPE_t p
p = n
# find the highest node in the linkage graph so far
while self.parent[n] != -1:
n = self.parent[n]
# provide a shortcut up to the highest node
while self.parent[p] != n:
p, self.parent[p] = self.parent[p], n
return n
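
# The cdef methods above are not callable from Python. The commented
# pure-Python sketch below only restates the same bookkeeping: a parent array
# of size 2N - 1, new labels handed out from N upwards, and path compression
# inside find:
#
#   def py_fast_find(parent, n):
#       p = n
#       while parent[n] != -1:        # climb to the current root
#           n = parent[n]
#       while parent[p] != n:         # point every visited node at the root
#           p, parent[p] = parent[p], n
#       return n
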
@cython.boundscheck(False)
@cython.nonecheck(False)
cpdef np.ndarray[DTYPE_t, ndim=2] _single_linkage_label(
np.ndarray[DTYPE_t, ndim=2] L):
"""
Convert a linkage array or MST to a tree by labelling clusters at merges.
This is done by using a Union find structure to keep track of merges
efficiently. This is the private version of the function that assumes that
``L`` has been properly validated. See ``single_linkage_label`` for the
user facing version of this function.
Parameters
----------
L: array of shape (n_samples - 1, 3)
The linkage array or MST where each row specifies two samples
to be merged and a distance or weight at which the merge occurs. This
array is assumed to be sorted by the distance/weight.
Returns
-------
A tree in the format used by scipy.cluster.hierarchy.
"""
cdef np.ndarray[DTYPE_t, ndim=2] result_arr
cdef DTYPE_t[:, ::1] result
cdef ITYPE_t left, left_cluster, right, right_cluster, index
cdef DTYPE_t delta
result_arr = np.zeros((L.shape[0], 4), dtype=DTYPE)
result = result_arr
U = UnionFind(L.shape[0] + 1)
for index in range(L.shape[0]):
left = <ITYPE_t> L[index, 0]
right = <ITYPE_t> L[index, 1]
delta = L[index, 2]
left_cluster = U.fast_find(left)
right_cluster = U.fast_find(right)
result[index][0] = left_cluster
result[index][1] = right_cluster
result[index][2] = delta
result[index][3] = U.size[left_cluster] + U.size[right_cluster]
U.union(left_cluster, right_cluster)
return result_arr
def single_linkage_label(L):
"""
Convert a linkage array or MST to a tree by labelling clusters at merges.
This is done by using a Union find structure to keep track of merges
efficiently.
Parameters
----------
L: array of shape (n_samples - 1, 3)
The linkage array or MST where each row specifies two samples
to be merged and a distance or weight at which the merge occurs. This
array is assumed to be sorted by the distance/weight.
Returns
-------
A tree in the format used by scipy.cluster.hierarchy.
"""
# Validate L
if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1:
raise ValueError("Input MST array is not a validly formatted MST array")
is_sorted = lambda x: np.all(x[:-1] <= x[1:])
if not is_sorted(L[:, 2]):
raise ValueError("Input MST array must be sorted by weight")
return _single_linkage_label(L)
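
# A tiny end-to-end call on a hand-built, weight-sorted MST over 3 samples
# (hypothetical data; the private import path is an assumption):
#
#   >>> import numpy as np
#   >>> from sklearn.cluster._hierarchical_fast import single_linkage_label
#   >>> L = np.array([[0., 1., 0.5],
#   ...               [1., 2., 2.0]])
#   >>> single_linkage_label(L)
#   >>> # Each row reads [left cluster, right cluster, distance, new size],
#   >>> # i.e. the (n_samples - 1, 4) layout used by scipy.cluster.hierarchy.
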
# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
@cython.boundscheck(False)
@cython.nonecheck(False)
def mst_linkage_core(
const DTYPE_t [:, ::1] raw_data,
DistanceMetric dist_metric):
"""
Compute the necessary elements of a minimum spanning
tree for computation of single linkage clustering. This
represents the MST-LINKAGE-CORE algorithm (Figure 6) from
*Modern hierarchical, agglomerative clustering algorithms*
by Daniel Mullner (https://arxiv.org/abs/1109.2378).
In contrast to the scipy implementation, this never computes
a full distance matrix, generating distances only as they
are needed and releasing them when no longer needed.
Parameters
----------
raw_data: array of shape (n_samples, n_features)
The array of feature data to be clustered. Must be C-contiguous
dist_metric: DistanceMetric
A DistanceMetric object conforming to the API from
``sklearn.neighbors._dist_metrics.pxd`` that will be
used to compute distances.
Returns
-------
mst_core_data: array of shape (n_samples, 3)
An array providing information from which one
can either compute an MST, or the linkage hierarchy
very efficiently. See https://arxiv.org/abs/1109.2378
algorithm MST-LINKAGE-CORE for more details.
"""
cdef:
ITYPE_t n_samples = raw_data.shape[0]
np.int8_t[:] in_tree = np.zeros(n_samples, dtype=np.int8)
DTYPE_t[:, ::1] result = np.zeros((n_samples - 1, 3))
np.ndarray label_filter
ITYPE_t current_node = 0
ITYPE_t new_node
ITYPE_t i
ITYPE_t j
ITYPE_t num_features = raw_data.shape[1]
DTYPE_t right_value
DTYPE_t left_value
DTYPE_t new_distance
DTYPE_t[:] current_distances = np.full(n_samples, INFINITY)
for i in range(n_samples - 1):
in_tree[current_node] = 1
new_distance = INFINITY
new_node = 0
for j in range(n_samples):
if in_tree[j]:
continue
right_value = current_distances[j]
left_value = dist_metric.dist(&raw_data[current_node, 0],
&raw_data[j, 0],
num_features)
if left_value < right_value:
current_distances[j] = left_value
if current_distances[j] < new_distance:
new_distance = current_distances[j]
new_node = j
result[i, 0] = current_node
result[i, 1] = new_node
result[i, 2] = new_distance
current_node = new_node
return np.array(result)
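
# Sketch of how the pieces above combine into a single linkage dendrogram.
# Assumptions: the private module path, and DistanceMetric being importable
# from sklearn.neighbors (matching the cimport at the top of this file; newer
# releases expose it elsewhere):
#
#   >>> import numpy as np
#   >>> from sklearn.neighbors import DistanceMetric
#   >>> from sklearn.cluster._hierarchical_fast import (
#   ...     mst_linkage_core, single_linkage_label)
#   >>> X = np.random.RandomState(0).rand(10, 3)   # C-contiguous float64
#   >>> core = mst_linkage_core(X, DistanceMetric.get_metric("euclidean"))
#   >>> # MST-LINKAGE-CORE edges are not sorted; order by weight before labelling
#   >>> dendrogram = single_linkage_label(core[np.argsort(core[:, 2])])
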