From f952d603903b637b17fc5666cc206babe87f03fa Mon Sep 17 00:00:00 2001 From: Fumiaki Kinoshita Date: Wed, 5 Dec 2018 19:19:36 +0900 Subject: [PATCH] initial commit --- .gitignore | 22 ++++++ CHANGELOG.md | 5 ++ LICENSE | 57 ++++++++++++++++ Setup.hs | 2 + elias-fano.cabal | 27 ++++++++ src/Codec/EliasFano.hs | 56 ++++++++++++++++ src/Codec/EliasFano/Internal.hs | 114 ++++++++++++++++++++++++++++++++ 7 files changed, 283 insertions(+) create mode 100644 .gitignore create mode 100644 CHANGELOG.md create mode 100644 LICENSE create mode 100644 Setup.hs create mode 100644 elias-fano.cabal create mode 100644 src/Codec/EliasFano.hs create mode 100644 src/Codec/EliasFano/Internal.hs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..82f3a88 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +dist +dist-* +cabal-dev +*.o +*.hi +*.chi +*.chs.h +*.dyn_o +*.dyn_hi +.hpc +.hsenv +.cabal-sandbox/ +cabal.sandbox.config +*.prof +*.aux +*.hp +*.eventlog +.stack-work/ +cabal.project.local +cabal.project.local~ +.HTF/ +.ghc.environment.* diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..c193fec --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +# Revision history for elias-fano + +## 0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cfb2a15 --- /dev/null +++ b/LICENSE @@ -0,0 +1,57 @@ +Copyright (c) 2018, Fumiaki Kinoshita + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Fumiaki Kinoshita nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright 2013 Edward Kmett + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/Setup.hs b/Setup.hs new file mode 100644 index 0000000..9a994af --- /dev/null +++ b/Setup.hs @@ -0,0 +1,2 @@ +import Distribution.Simple +main = defaultMain diff --git a/elias-fano.cabal b/elias-fano.cabal new file mode 100644 index 0000000..e56b41f --- /dev/null +++ b/elias-fano.cabal @@ -0,0 +1,27 @@ +-- Initial elias-fano.cabal generated by cabal init. For further +-- documentation, see http://haskell.org/cabal/users-guide/ + +name: elias-fano +version: 0 +synopsis: Elias-fano encoder +-- description: +homepage: https://github.com/fumieval/elias-fano +license: BSD3 +license-file: LICENSE +author: Fumiaki Kinoshita +maintainer: fumiexcel@gmail.com +-- copyright: +category: Codec +build-type: Simple +extra-source-files: CHANGELOG.md +cabal-version: >=1.10 + +library + exposed-modules: Codec.EliasFano + , Codec.EliasFano.Internal + -- other-modules: + -- other-extensions: + build-depends: base >=4.8 && <4.14, vector, QuickCheck, split + hs-source-dirs: src + ghc-options: -O2 -Wall + default-language: Haskell2010 diff --git a/src/Codec/EliasFano.hs b/src/Codec/EliasFano.hs new file mode 100644 index 0000000..eaf924e --- /dev/null +++ b/src/Codec/EliasFano.hs @@ -0,0 +1,56 @@ +module Codec.EliasFano (EliasFano(..), unsafeEncode, access, prop_access) where + +import Control.Monad.ST +import Data.Bits +import qualified Data.Vector.Storable as V +import qualified Data.Vector.Storable.Mutable as MV +import Data.Word + +import Codec.EliasFano.Internal + +import qualified Test.QuickCheck as QC + +unsafeEncodeMax :: Int -> V.Vector Int -> EliasFano +unsafeEncodeMax maxValue vec = runST $ do + let counterSize = 1 `unsafeShiftL` ceiling (logBase 2 $ fromIntegral len :: Double) + mcounter <- MV.replicate counterSize 0 + V.forM_ vec $ \v -> MV.unsafeModify mcounter (+1) (v `unsafeShiftR` width) + counter <- V.unsafeFreeze mcounter + let upd (Left i) + | i == len = Skip (Right 0) + | otherwise = Yield width (fromIntegral $ V.unsafeIndex vec i) (Left $! i + 1) + upd (Right i) + | i == counterSize = Done + | otherwise = let n = V.unsafeIndex counter i in Yield (n + 1) (mask n) (Right $! i + 1) + return EliasFano + { efWidth = width + , efLength = len + , efContent = V.fromList $ build $ BitStream (Left 0) upd + } + where + len = V.length vec + width = max 1 $ ceiling $ logBase 2 (fromIntegral maxValue / fromIntegral len :: Double) + +unsafeEncode :: V.Vector Int -> EliasFano +unsafeEncode vec + | V.null vec = unsafeEncodeMax 1 vec + | otherwise = unsafeEncodeMax (V.last vec + 1) vec + +data EliasFano = EliasFano + { efWidth :: !Int + , efLength :: !Int + , efContent :: !(V.Vector Word64) + } + deriving Show + +access :: EliasFano -> Int -> Int +access (EliasFano width len vec) i = unsafeShiftL (selectFrom (len * width) vec i - i) width + .|. fromIntegral (readBits vec width (i * width)) + +prop_access :: [QC.NonNegative Int] -> QC.NonNegative Int -> QC.Property +prop_access xs i_ = QC.counterexample (show (base, ef, i)) + $ access ef i == base !! i + where + i = QC.getNonNegative i_ `mod` length base + base = scanl (+) 0 $ map QC.getNonNegative xs + ef = unsafeEncode $ V.fromList base diff --git a/src/Codec/EliasFano/Internal.hs b/src/Codec/EliasFano/Internal.hs new file mode 100644 index 0000000..3165371 --- /dev/null +++ b/src/Codec/EliasFano/Internal.hs @@ -0,0 +1,114 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ExistentialQuantification #-} +module Codec.EliasFano.Internal (Step(..) + , BitStream(..) + , mask + , build + , readBits + , selectFrom + ) where + +import Control.Exception (assert) +import Data.Bits +import Data.String +import Data.Word (Word64) +import Data.List.Split (chunksOf) +import qualified Data.Vector.Storable as V + +data Step s = Done + | Skip s + | Yield !Int !Word64 s + +data BitStream = forall s. BitStream s (s -> Step s) + +instance Show BitStream where + show (BitStream s0 f) = show (unwords $ chunksOf 8 $ go s0) where + go s = case f s of + Done -> "" + Skip s' -> go s' + Yield width w s' -> [if testBit w i then '1' else '0' | i <- [0..width - 1]] ++ go s' + +instance IsString BitStream where + fromString str = BitStream str go where + go [] = Done + go ('0' : xs) = Yield 1 0 xs + go (_ : xs) = Yield 1 1 xs + +mask :: Int -> Word64 +mask n = unsafeShiftL 1 n - 1 +{-# INLINE mask #-} + +build :: BitStream -> [Word64] +build (BitStream s0 upd) = go s0 zeroBits 0 where + go s !acc !len = case upd s of + Done -> [acc | len > 0] + Skip s' -> go s' acc len + Yield width w s' -> case mask width .&. w of + w' | width + len >= 64 -> (acc .|. unsafeShiftL w' len) + : go s' (unsafeShiftR w' (64 - len)) (len + width - 64) + | otherwise -> go s' (acc .|. unsafeShiftL w' len) (len + width) + +readBits :: V.Vector Word64 -> Int -> Int -> Word64 +readBits vec width pos + | b + width > 64 = unsafeShiftL extra (64 - b) .|. base + | otherwise = base + where + i = unsafeShiftR pos 6 + b = pos .&. 63 + base = (V.unsafeIndex vec i `unsafeShiftR` b) .&. mask width + extra = V.unsafeIndex vec (i + 1) .&. mask (width + b - 64) + +selectFrom :: Int -> V.Vector Word64 -> Int -> Int +selectFrom offset vec = go offset 0 where + go ofs i q + | ofs >= 64 = go (ofs - 64) (i + 1) q + | ofs == 0, popCount v < q = 64 + go ofs (i + 1) (q - popCount v) + | ofs == 0 = selectWord64 v q + | i >= V.length vec - 1 = selectWord64 (v `unsafeShiftR` ofs) q + | popCount v' <= q = 64 + go ofs (i + 1) (q - popCount v') + | otherwise = selectWord64 v' q + where + v = V.unsafeIndex vec i + v' = v `unsafeShiftR` ofs .|. V.unsafeIndex vec (i + 1) `unsafeShiftL` (64 - ofs) + +-- | Convert a word of various bits into a word where each byte contains the count of bits in the corresponding original byte +-- +-- @'popCount' = 'byteSum' . 'byteCounts'@ +byteCounts :: Word64 -> Word64 +byteCounts a = d .&. lsns where + threes = 0x3333333333333333 + as = 0xAAAAAAAAAAAAAAAA + lsbs = 0x0101010101010101 + lsns = 0x0f * lsbs + b = a - shiftR (a .&. as) 1 + c = (b .&. threes) + (shiftR b 2 .&. threes) + d = c + shiftR c 4 + +-- | signed compare byte by byte, returning whether or not the result is less than or equal to +-- the corresponding byte in the other word as the least significant bit of each byte +leq8 :: Word64 -> Word64 -> Word64 +leq8 x y = shiftR (w .&. msbs) 7 where + msbs = 0x8080808080808080 + z = (y .|. msbs) - (x .&. complement msbs) + w = x `xor` y `xor` z + +-- https://github.com/ekmett/succinct/blob/7e884138c2e943f5ca08f56b58b409d08b870ab9/src/Succinct/Internal/Broadword.hs +nonzero8 :: Word64 -> Word64 +nonzero8 x = shiftR ((x .|. ((x .|. msbs) - lsbs)) .&. msbs) 7 where + msbs = 0x8080808080808080 + lsbs = 0x0101010101010101 + +-- https://github.com/ekmett/succinct/blob/7e884138c2e943f5ca08f56b58b409d08b870ab9/src/Succinct/Internal/Broadword.hs +selectWord64 :: Word64 -> Int -> Int +selectWord64 x k = assert (k < popCount x) (place + offset) where + wk = fromIntegral k + lsbs = 0x0101010101010101 + hi = 0xFFFFFFFFFFFFFFF8 + inc = 0x8040201008040201 + sums = byteCounts x * lsbs + steps = wk * lsbs + place = fromIntegral $ shiftR (leq8 sums steps * lsbs) 53 .&. hi + br = wk - (shiftR (shiftL sums 8) place .&. 0xFF) + spread = (shiftR x place .&. 0xFF) * lsbs + bitSums = nonzero8 (spread .&. inc) * lsbs + offset = fromIntegral $ shiftR (leq8 bitSums (br * lsbs) * lsbs) 56