Skip to content

Commit

Permalink
BTW: python program finished. next is to work out the purely function…
Browse files Browse the repository at this point in the history
…al solution.
  • Loading branch information
liuxinyu95 committed Jun 21, 2011
1 parent 29c968b commit 91b974b
Showing 1 changed file with 62 additions and 8 deletions.
70 changes: 62 additions & 8 deletions transform/BWT/src/bwt.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#! /usr/bin/python

EOF = "\0"

# Algorithm 1, Naive version from [1]

# using \0 as the special EOF char
def bwt(s):
s += "\0"
s += EOF
table = sorted(s[i:] + s[:i] for i in range(len(s)))
last_colum = [row[-1] for row in table]
return "".join(last_colum)
Expand All @@ -13,14 +14,14 @@ def ibwt(r):
table = [""] * len(r)
for i in range(len(r)):
table = sorted(r[i] + table[i] for i in range(len(r)))
s = [row for row in table if row.endswith("\0")][0]
return s.rstrip("\0")
s = [row for row in table if row.endswith(EOF)][0]
return s.rstrip(EOF)

# Algorithm 2,
# Instead of sorting on rotation, we can sort on suffixes by
# adding a special EOF char. [2].

# example: s = apple, EOF = $
# example: s = apple, EOF = $ (== -inf)
#
# suffixes rotations
# apple$ apple$
Expand All @@ -32,23 +33,72 @@ def ibwt(r):
#
# sorting
#
# let i = len(s) - len(suffixes)
# let i = len(s) - len(suffix)
# i s[i-1]
# $ 5 e $apple
# apple$ 0 $ apple$
# e$ 4 l e$appl
# le$ 3 p le$app
# ple$ 2 p ple$ap
# pple$ 1 a pple$a
# $ 5 e $apple
#
# result: e$lppa

def bwt1(s):
s += "\0" # using \0 as the special EOF char
s += EOF
l = len(s)
table = sorted(s[i:] for i in range(l))
last_colum = [s[l-len(row)-1] for row in table]
return "".join(last_colum)

# Algorithm 3,
# Improvement 1: Instead of using explicit EOF, we can provide
# a customized compare funciton for sorting. [3]
# Improvement 2: Instead of using instance of suffix strings,
# using suffix indecs. to save spaces.

# fact:
# let
# s' = s + EOF
# table = sort(suffixes(s'))
#
# we have: for every row in table,
# index of row[0] in s' == len(s) - len(row)

def bwt2(s):
ids = sorted(range(len(s)+1), lambda x, y: scmp(s, x, y))
last_colum = [s[i-1] for i in ids]
ifst = ids.index(1) # the position of s[0] in last_colum
ieof = ids.index(0) # EOF, which is s[-1] virtually
return ("".join(last_colum), ifst, ieof)

# suffixes comparison function, which can handle virtual EOF
# and we treat EOF as -inf
def scmp(s, i, j):
x = s[i:]
y = s[j:]
if x == []: # x is EOF
return -1
if y == []: # y is EOF
return 1
return cmp(x, y)

def ibwt2(t):
(r, ifst, ieof) = t
fst_colum = [r[ieof]]+sorted(r[:ieof]+r[ieof+1:]) # first element is EOF
ids = [False]+[True]*(len(r)-1)
trans=[0]*len(r) # transform vector
for i in range(len(r)):
if i != ieof:
x = [j for j in range(len(r)) if ids[j] and fst_colum[j] == r[i]][0]
trans[x] = i
ids[x] = False
s = ""
i = ifst
for j in range(len(r)-1):
s = s+r[i]
i = trans[i]
return s

def test():
s = "this is the demo program of bwt transform in the real programming code"
Expand All @@ -58,10 +108,14 @@ def test():
print "ibwt(r) ==>", ibwt(r)
print "bwt1(s) ==>", bwt1(s)
print "ibwt(bwt1(s)) ==>", ibwt(bwt1(s))
t = bwt2(s)
print "bwt2(s) ==>", t
print "ibwt2(r) ==>", ibwt2(t)

if __name__ == "__main__":
test()

# reference
# [1], WIKI. http://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform
# [2], M. Burrows and D.J. Wheeler `A Block-sorting Loseless Data Compression Algorithm'. DEC Research center, SRC research report, May 10, 1994
# [3], Mark Nelson. `Data Compression with the Burrows-Wheeler Transform'. http://marknelson.us/1996/09/01/bwt/

0 comments on commit 91b974b

Please sign in to comment.