Skip to content
This repository has been archived by the owner on Jun 29, 2019. It is now read-only.

Commit

Permalink
first set of distances tested
Browse files Browse the repository at this point in the history
  • Loading branch information
lindahua committed Jan 28, 2013
1 parent 240d9a4 commit 2bd9293
Show file tree
Hide file tree
Showing 5 changed files with 468 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ Evaluation of distances between vectors.

## A List of supported distances

* Minkowski distance
* Euclidean distance
* Squared euclidean distance
* Cityblock distance
* Chebyshev distance
* Minkowski distance
* Hamming distance
* Cosine distance
* Spearman distance
Expand Down
223 changes: 212 additions & 11 deletions src/Distances.jl
Original file line number Diff line number Diff line change
@@ -1,30 +1,82 @@
module Distances

export
# generic types/functions
GeneralizedDistance,
Distance,
cdist,
pdist,

# distance classes
EuclideanDistance,
SquaredEuclideanDistance,
eucdist,
sqeucdist
CityblockDistance,
ChebyshevDistance,
MinkowskiDistance,
HammingDistance,
CosineDistance,
KullbackLeiblerDivergence,
JensenShannonDivergence,

# convenient functions
euclidean_dist,
sqeuclidean_dist,
cityblock_dist,
chebyshev_dist,
minkowski_dist,
hamming_dist,
cosine_dist,
kl_div,
js_div


##########################################################################
#
# The type hierarchy for distances
#
##########################################################################

# any thing that people typically think of as a kind of distance
# that takes two vectors and outputs a real scalar
# e.g. K-L divergence
abstract GeneralizedDistance

# distance that satisfies d(x, x) == 0 and d(x, y) == d(y, x)
# pairwise computation can take advantage of these properties to save computation
abstract Distance <: GeneralizedDistance

# specific distance types

type EuclideanDistance <: Distance
end

type SquaredEuclideanDistance <: Distance
end

type CityblockDistance <: Distance
end

type ChebyshevDistance <: Distance
end

type MinkowskiDistance <: Distance
p::Number
end

type HammingDistance <: Distance
end

type CosineDistance <: Distance
end

type KullbackLeiblerDivergence <: GeneralizedDistance
end

type JensenShannonDivergence <: Distance
end



##########################################################################
#
# General cdist implementation
Expand Down Expand Up @@ -68,7 +120,7 @@ function cdist(dist::GeneralizedDistance, a::AbstractMatrix, b::AbstractVector)
# calculate the rest

for i = 2 : n
r[i] = cdist(dist, sub(a, 1:m, i), b)
r[i] = cdist(dist, a[:,i], b)
end

return r
Expand Down Expand Up @@ -119,49 +171,198 @@ function pdist(dist::GeneralizedDistance, a::Matrix, b::Matrix)

for j = 2 : n
bj = b[:,j]
r[i, j] = cdist(dist, a[:,i], bj)
for i = 1 : m
r[i, j] = cdist(dist, a[:,i], bj)
end
end

return r
end

pdist(dist::GeneralizedDistance, a::Matrix) = pdist(dist, a, a)

function pdist(dist::Distance, a::Matrix)
n = size(a, 2)

# calculate the first distance (to determine distance value type)

d1 = cdist(dist, a[:,1], b[:,1])
r = Array(typeof(d1), (n, n))
r[1, 1] = 0

# calculate the rest

for i = 2 : n
b1 = b[:,1]
r[i, 1] = cdist(dist, a[:,i], b1)
end

for j = 2 : n
bj = b[:,j]

for i = 1 : j-1
r[i, j] = r[j, i]
end

r[j, j] = 0

for i = j+1 : n
r[i, j] = cdist(dist, a[:,i], bj)
end
end

return r
end




##########################################################################
#
# Specific distances
# Implementation of distance functions
#
# Most implementations use de-vectorized codes for the sake of
# run-time performance.
#
##########################################################################

# euclidean

function cdist{T<:Real}(dist::EuclideanDistance, a::AbstractVector{T}, b::AbstractVector{T})
s::T = 0
n = length(a)
for i = 1 : n # devectorize to make it faster
for i = 1 : n
v = a[i] - b[i]
s += v * v
end
return sqrt(s)
end

# squared euclidean

function cdist{T<:Real}(dist::SquaredEuclideanDistance, a::AbstractVector{T}, b::AbstractVector{T})
s::T = 0
n = length(a)
for i = 1 : n # devectorize to make it faster
for i = 1 : n
v = a[i] - b[i]
s += v * v
end
return s
end

# cityblock

function cdist{T<:Real}(dist::CityblockDistance, a::AbstractVector{T}, b::AbstractVector{T})
s::T = 0
n = length(a)
for i = 1 : n
s += abs(a[i] - b[i])
end
return s
end

# chebyshev

function cdist{T<:Real}(dist::ChebyshevDistance, a::AbstractVector{T}, b::AbstractVector{T})
s::T = 0
n = length(a)
for i = 1 : n
s = max(s, abs(a[i] - b[i]))
end
return s
end

# minkowski

function cdist{T<:Real}(dist::MinkowskiDistance, a::AbstractVector{T}, b::AbstractVector{T})
s::T = 0
n = length(a)
for i = 1 : n
s += abs(a[i] - b[i]) ^ dist.p
end
return s ^ (1 / dist.p)
end

# hamming

function cdist{T<:Real}(dist::HammingDistance, a::AbstractVector{T}, b::AbstractVector{T})
s::Int = 0
n = length(a)
for i = 1 : n
if (a[i] != b[i])
s += 1
end
end
return s
end

# cosine

function cdist{T<:Real}(dist::CosineDistance, a::AbstractVector{T}, b::AbstractVector{T})
xx::T = 0
xy::T = 0
yy::T = 0
n = length(a)
for i = 1 : n
ai = a[i]
bi = b[i]
xx += ai * ai
xy += ai * bi
yy += bi * bi
end
return xy / (sqrt(xx) * sqrt(yy))
end

# K-L divergence

xlogx(x::Real) = x > 0 ? x * log(x) : zero(x)

function cdist{T<:Real}(dist::KullbackLeiblerDivergence, a::AbstractVector{T}, b::AbstractVector{T})
s::T = 0
n = length(a)
for i = 1 : n
ai = a[i]
if ai > 0
s += ai * (log(ai) - log(b[i]))
end
end
return s
end

# J-S divergence

function cdist{T<:Real}(dist::JensenShannonDivergence, a::AbstractVector{T}, b::AbstractVector{T})
s::T = 0
n = length(a)
for i = 1 : n
ai = a[i]
bi = b[i]
vi = (ai + bi) / 2
s += ( (xlogx(ai) + xlogx(bi)) / 2 - xlogx(vi) )
end
return s
end


##########################################################################
#
# Convenient functions
#
##########################################################################

eucdist(a::AbstractArray, b::AbstractArray) = cdist(EuclideanDistance(), a, b)
sqeucdist(a::AbstractArray, b::AbstractArray) = cdist(SquaredEuclideanDistance(), a, b)


euclidean_dist(a::AbstractArray, b::AbstractArray) = cdist(EuclideanDistance(), a, b)
sqeuclidean_dist(a::AbstractArray, b::AbstractArray) = cdist(SquaredEuclideanDistance(), a, b)
cityblock_dist(a::AbstractArray, b::AbstractArray) = cdist(CityblockDistance(), a, b)
chebyshev_dist(a::AbstractArray, b::AbstractArray) = cdist(ChebyshevDistance(), a, b)

minkowski_dist(a::AbstractArray, b::AbstractArray, p::Number) = cdist(MinkowskiDistance(p), a, b)

hamming_dist(a::AbstractArray, b::AbstractArray) = cdist(HammingDistance(), a, b)
cosine_dist(a::AbstractArray, b::AbstractArray) = cdist(CosineDistance(), a, b)

kl_div(a::AbstractArray, b::AbstractArray) = cdist(KullbackLeiblerDivergence(), a, b)
js_div(a::AbstractArray, b::AbstractArray) = cdist(JensenShannonDivergence(), a, b)

end # module end


81 changes: 81 additions & 0 deletions test/bench_dists.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@

# This scripts compares the performance of different ways to implement column-wise
# euclidean distances


function sqeuc_raw_forloop{T<:Real}(a::AbstractMatrix{T}, b::AbstractMatrix{T})
# completely de-vectorized for-loop
m, n = size(a)
r = Array(T, n)
for j = 1 : n
s::T = 0
for i = 1 : m
v = a[i, j] - b[i, j]
s += v * v
end
r[j] = sqrt(s)
end
return r
end

function sqeuc_sumsqr_percol{T<:Real}(a::AbstractMatrix{T}, b::AbstractMatrix{T})
# take the some of vectorized square per column
n = size(a, 2)
r = Array(T, n)
for i = 1 : n
r[i] = sqrt(sum( (a[:,i] - b[:,i]) .^ 2 ))
end
return r
end

function sqeuc_norm_percol{T<:Real}(a::AbstractMatrix{T}, b::AbstractMatrix{T})
# take norm of vectorized difference per column
n = size(a, 2)
r = Array(T, n)
for i = 1 : n
r[i] = norm(a[:,i] - b[:,i])
end
return r
end

function sqeuc_norm_percol_s{T<:Real}(a::AbstractMatrix{T}, b::AbstractMatrix{T})
# take norm of vectorized difference per column (using sub)
m, n = size(a)
r = Array(T, n)
for i = 1 : n
r[i] = norm(sub(a, 1:m, i) - sub(b, 1:m, i))
end
return r
end

function sqeuc_map_norm{T<:Real}(a::AbstractMatrix{T}, b::AbstractMatrix{T})
# map a norm function to each column
r = map( i -> norm(a[:,i] - b[:,i]), 1:size(a,2) )
end


macro my_bench(f)
quote
# warming up
$f(x, y)
# timeing
println("bench: ", $string($f))
@time for i = 1 : 10
$f(x, y)
end
println(" ")
end
end


m = 200
n = 100000

x = rand(m, n)
y = rand(m, n)

@my_bench sqeuc_raw_forloop
@my_bench sqeuc_sumsqr_percol
@my_bench sqeuc_norm_percol
@my_bench sqeuc_norm_percol_s
@my_bench sqeuc_map_norm
Loading

0 comments on commit 2bd9293

Please sign in to comment.