Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
fumieval committed Dec 6, 2018
0 parents commit f952d60
Showing 7 changed files with 283 additions and 0 deletions.
22 changes: 22 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
dist
dist-*
cabal-dev
*.o
*.hi
*.chi
*.chs.h
*.dyn_o
*.dyn_hi
.hpc
.hsenv
.cabal-sandbox/
cabal.sandbox.config
*.prof
*.aux
*.hp
*.eventlog
.stack-work/
cabal.project.local
cabal.project.local~
.HTF/
.ghc.environment.*
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Revision history for elias-fano

## 0 -- YYYY-mm-dd

* First version. Released on an unsuspecting world.
57 changes: 57 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
Copyright (c) 2018, Fumiaki Kinoshita

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.

* Neither the name of Fumiaki Kinoshita nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Copyright 2013 Edward Kmett

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
2 changes: 2 additions & 0 deletions Setup.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
import Distribution.Simple
main = defaultMain
27 changes: 27 additions & 0 deletions elias-fano.cabal
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-- Initial elias-fano.cabal generated by cabal init. For further
-- documentation, see http://haskell.org/cabal/users-guide/

name: elias-fano
version: 0
synopsis: Elias-fano encoder
-- description:
homepage: https://github.com/fumieval/elias-fano
license: BSD3
license-file: LICENSE
author: Fumiaki Kinoshita
maintainer: fumiexcel@gmail.com
-- copyright:
category: Codec
build-type: Simple
extra-source-files: CHANGELOG.md
cabal-version: >=1.10

library
exposed-modules: Codec.EliasFano
, Codec.EliasFano.Internal
-- other-modules:
-- other-extensions:
build-depends: base >=4.8 && <4.14, vector, QuickCheck, split
hs-source-dirs: src
ghc-options: -O2 -Wall
default-language: Haskell2010
56 changes: 56 additions & 0 deletions src/Codec/EliasFano.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
module Codec.EliasFano (EliasFano(..), unsafeEncode, access, prop_access) where

import Control.Monad.ST
import Data.Bits
import qualified Data.Vector.Storable as V
import qualified Data.Vector.Storable.Mutable as MV
import Data.Word

import Codec.EliasFano.Internal

import qualified Test.QuickCheck as QC

unsafeEncodeMax :: Int -> V.Vector Int -> EliasFano
unsafeEncodeMax maxValue vec = runST $ do
let counterSize = 1 `unsafeShiftL` ceiling (logBase 2 $ fromIntegral len :: Double)
mcounter <- MV.replicate counterSize 0
V.forM_ vec $ \v -> MV.unsafeModify mcounter (+1) (v `unsafeShiftR` width)
counter <- V.unsafeFreeze mcounter
let upd (Left i)
| i == len = Skip (Right 0)
| otherwise = Yield width (fromIntegral $ V.unsafeIndex vec i) (Left $! i + 1)
upd (Right i)
| i == counterSize = Done
| otherwise = let n = V.unsafeIndex counter i in Yield (n + 1) (mask n) (Right $! i + 1)
return EliasFano
{ efWidth = width
, efLength = len
, efContent = V.fromList $ build $ BitStream (Left 0) upd
}
where
len = V.length vec
width = max 1 $ ceiling $ logBase 2 (fromIntegral maxValue / fromIntegral len :: Double)

unsafeEncode :: V.Vector Int -> EliasFano
unsafeEncode vec
| V.null vec = unsafeEncodeMax 1 vec
| otherwise = unsafeEncodeMax (V.last vec + 1) vec

data EliasFano = EliasFano
{ efWidth :: !Int
, efLength :: !Int
, efContent :: !(V.Vector Word64)
}
deriving Show

access :: EliasFano -> Int -> Int
access (EliasFano width len vec) i = unsafeShiftL (selectFrom (len * width) vec i - i) width
.|. fromIntegral (readBits vec width (i * width))

prop_access :: [QC.NonNegative Int] -> QC.NonNegative Int -> QC.Property
prop_access xs i_ = QC.counterexample (show (base, ef, i))
$ access ef i == base !! i
where
i = QC.getNonNegative i_ `mod` length base
base = scanl (+) 0 $ map QC.getNonNegative xs
ef = unsafeEncode $ V.fromList base
114 changes: 114 additions & 0 deletions src/Codec/EliasFano/Internal.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ExistentialQuantification #-}
module Codec.EliasFano.Internal (Step(..)
, BitStream(..)
, mask
, build
, readBits
, selectFrom
) where

import Control.Exception (assert)
import Data.Bits
import Data.String
import Data.Word (Word64)
import Data.List.Split (chunksOf)
import qualified Data.Vector.Storable as V

data Step s = Done
| Skip s
| Yield !Int !Word64 s

data BitStream = forall s. BitStream s (s -> Step s)

instance Show BitStream where
show (BitStream s0 f) = show (unwords $ chunksOf 8 $ go s0) where
go s = case f s of
Done -> ""
Skip s' -> go s'
Yield width w s' -> [if testBit w i then '1' else '0' | i <- [0..width - 1]] ++ go s'

instance IsString BitStream where
fromString str = BitStream str go where
go [] = Done
go ('0' : xs) = Yield 1 0 xs
go (_ : xs) = Yield 1 1 xs

mask :: Int -> Word64
mask n = unsafeShiftL 1 n - 1
{-# INLINE mask #-}

build :: BitStream -> [Word64]
build (BitStream s0 upd) = go s0 zeroBits 0 where
go s !acc !len = case upd s of
Done -> [acc | len > 0]
Skip s' -> go s' acc len
Yield width w s' -> case mask width .&. w of
w' | width + len >= 64 -> (acc .|. unsafeShiftL w' len)
: go s' (unsafeShiftR w' (64 - len)) (len + width - 64)
| otherwise -> go s' (acc .|. unsafeShiftL w' len) (len + width)

readBits :: V.Vector Word64 -> Int -> Int -> Word64
readBits vec width pos
| b + width > 64 = unsafeShiftL extra (64 - b) .|. base
| otherwise = base
where
i = unsafeShiftR pos 6
b = pos .&. 63
base = (V.unsafeIndex vec i `unsafeShiftR` b) .&. mask width
extra = V.unsafeIndex vec (i + 1) .&. mask (width + b - 64)

selectFrom :: Int -> V.Vector Word64 -> Int -> Int
selectFrom offset vec = go offset 0 where
go ofs i q
| ofs >= 64 = go (ofs - 64) (i + 1) q
| ofs == 0, popCount v < q = 64 + go ofs (i + 1) (q - popCount v)
| ofs == 0 = selectWord64 v q
| i >= V.length vec - 1 = selectWord64 (v `unsafeShiftR` ofs) q
| popCount v' <= q = 64 + go ofs (i + 1) (q - popCount v')
| otherwise = selectWord64 v' q
where
v = V.unsafeIndex vec i
v' = v `unsafeShiftR` ofs .|. V.unsafeIndex vec (i + 1) `unsafeShiftL` (64 - ofs)

-- | Convert a word of various bits into a word where each byte contains the count of bits in the corresponding original byte
--
-- @'popCount' = 'byteSum' . 'byteCounts'@
byteCounts :: Word64 -> Word64
byteCounts a = d .&. lsns where
threes = 0x3333333333333333
as = 0xAAAAAAAAAAAAAAAA
lsbs = 0x0101010101010101
lsns = 0x0f * lsbs
b = a - shiftR (a .&. as) 1
c = (b .&. threes) + (shiftR b 2 .&. threes)
d = c + shiftR c 4

-- | signed compare byte by byte, returning whether or not the result is less than or equal to
-- the corresponding byte in the other word as the least significant bit of each byte
leq8 :: Word64 -> Word64 -> Word64
leq8 x y = shiftR (w .&. msbs) 7 where
msbs = 0x8080808080808080
z = (y .|. msbs) - (x .&. complement msbs)
w = x `xor` y `xor` z

-- https://github.com/ekmett/succinct/blob/7e884138c2e943f5ca08f56b58b409d08b870ab9/src/Succinct/Internal/Broadword.hs
nonzero8 :: Word64 -> Word64
nonzero8 x = shiftR ((x .|. ((x .|. msbs) - lsbs)) .&. msbs) 7 where
msbs = 0x8080808080808080
lsbs = 0x0101010101010101

-- https://github.com/ekmett/succinct/blob/7e884138c2e943f5ca08f56b58b409d08b870ab9/src/Succinct/Internal/Broadword.hs
selectWord64 :: Word64 -> Int -> Int
selectWord64 x k = assert (k < popCount x) (place + offset) where
wk = fromIntegral k
lsbs = 0x0101010101010101
hi = 0xFFFFFFFFFFFFFFF8
inc = 0x8040201008040201
sums = byteCounts x * lsbs
steps = wk * lsbs
place = fromIntegral $ shiftR (leq8 sums steps * lsbs) 53 .&. hi
br = wk - (shiftR (shiftL sums 8) place .&. 0xFF)
spread = (shiftR x place .&. 0xFF) * lsbs
bitSums = nonzero8 (spread .&. inc) * lsbs
offset = fromIntegral $ shiftR (leq8 bitSums (br * lsbs) * lsbs) 56

0 comments on commit f952d60

Please sign in to comment.