Move generation logic to its own crate
sfackler committed Feb 22, 2015
1 parent 40dbc32 commit cfeee87
Showing 5 changed files with 160 additions and 132 deletions.
15 changes: 15 additions & 0 deletions phf_generator/Cargo.toml
@@ -0,0 +1,15 @@
[package]
name = "phf_generator"
authors = ["Steven Fackler <sfackler@gmail.com>"]
version = "0.6.9"
license = "MIT"
description = "PHF generation logic"
repository = "https://github.com/sfackler/rust-phf"
documentation = "https://sfackler.github.io/rust-phf/doc/phf_generator"

[dependencies]
rand = "0.1"

[dependencies.phf_shared]
path = "../phf_shared"
version = "=0.6.9"
115 changes: 115 additions & 0 deletions phf_generator/src/lib.rs
@@ -0,0 +1,115 @@
extern crate phf_shared;
extern crate rand;

use phf_shared::PhfHash;
use rand::{SeedableRng, XorShiftRng, Rng};

const DEFAULT_LAMBDA: usize = 5;

const FIXED_SEED: [u32; 4] = [3141592653, 589793238, 462643383, 2795028841];

pub struct HashState {
    pub key: u64,
    pub disps: Vec<(u32, u32)>,
    pub map: Vec<usize>,
}

pub fn generate_hash<H: PhfHash>(entries: &[H]) -> HashState {
    let mut rng: XorShiftRng = SeedableRng::from_seed(FIXED_SEED);
    loop {
        if let Some(s) = try_generate_hash(entries, &mut rng) {
            return s;
        }
    }
}

fn try_generate_hash<H: PhfHash>(entries: &[H], rng: &mut XorShiftRng) -> Option<HashState> {
    struct Bucket {
        idx: usize,
        keys: Vec<usize>,
    }

    struct Hashes {
        g: u32,
        f1: u32,
        f2: u32,
    }

    let key = rng.gen();

    let hashes: Vec<_> = entries.iter().map(|entry| {
        let (g, f1, f2) = entry.phf_hash(key);
        Hashes {
            g: g,
            f1: f1,
            f2: f2
        }
    }).collect();

    let buckets_len = (entries.len() + DEFAULT_LAMBDA - 1) / DEFAULT_LAMBDA;
    let mut buckets = (0..buckets_len)
        .map(|i| Bucket { idx: i, keys: vec![] })
        .collect::<Vec<_>>();

    for (i, hash) in hashes.iter().enumerate() {
        buckets[(hash.g % (buckets_len as u32)) as usize].keys.push(i);
    }

    // Sort descending
    buckets.sort_by(|a, b| a.keys.len().cmp(&b.keys.len()).reverse());

    let table_len = entries.len();
    let mut map = vec![None; table_len];
    let mut disps = vec![(0u32, 0u32); buckets_len];

    // store whether an element from the bucket being placed is
    // located at a certain position, to allow for efficient overlap
    // checks. It works by storing the generation in each cell and
    // each new placement-attempt is a new generation, so you can tell
    // if this is legitimately full by checking that the generations
    // are equal. (A u64 is far too large to overflow in a reasonable
    // time for current hardware.)
    let mut try_map = vec![0u64; table_len];
    let mut generation = 0u64;

    // the actual values corresponding to the markers above, as
    // (index, key) pairs, for adding to the main map once we've
    // chosen the right disps.
    let mut values_to_add = vec![];

    'buckets: for bucket in buckets.iter() {
        for d1 in 0..(table_len as u32) {
            'disps: for d2 in 0..(table_len as u32) {
                values_to_add.clear();
                generation += 1;

                for &key in bucket.keys.iter() {
                    let idx = (phf_shared::displace(hashes[key].f1, hashes[key].f2, d1, d2)
                               % (table_len as u32)) as usize;
                    if map[idx].is_some() || try_map[idx] == generation {
                        continue 'disps;
                    }
                    try_map[idx] = generation;
                    values_to_add.push((idx, key));
                }

                // We've picked a good set of disps
                disps[bucket.idx] = (d1, d2);
                for &(idx, key) in values_to_add.iter() {
                    map[idx] = Some(key);
                }
                continue 'buckets;
            }
        }

        // Unable to find displacements for a bucket
        return None;
    }

    Some(HashState {
        key: key,
        disps: disps,
        map: map.into_iter().map(|i| i.unwrap()).collect(),
    })
}
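The new crate's public surface is just `generate_hash` and the `HashState` it returns. Here is a minimal sketch of how an out-of-tree caller might drive it: the `Key` wrapper, its SipHash-based toy `phf_hash` impl, and the `main` harness are illustrative assumptions rather than part of this commit (the real macro relies on the `PhfHash` impls shipped in `phf_shared`), and `phf_shared::displace` is called with the same signature the generator uses above.

// Hypothetical consumer of the new crate -- not part of this commit.
extern crate phf_generator;
extern crate phf_shared;

use std::hash::{Hash, Hasher, SipHasher};

use phf_shared::PhfHash;

// Toy key type standing in for the macro's parsed map keys.
struct Key(&'static str);

impl PhfHash for Key {
    // Illustrative only: derive the (g, f1, f2) triple from two seeded
    // SipHash runs over the string.
    fn phf_hash(&self, key: u64) -> (u32, u32, u32) {
        let mut a = SipHasher::new_with_keys(key, 0);
        self.0.hash(&mut a);
        let mut b = SipHasher::new_with_keys(key, 1);
        self.0.hash(&mut b);
        let (a, b) = (a.finish(), b.finish());
        (a as u32, (a >> 32) as u32, b as u32)
    }
}

fn main() {
    let keys = [Key("loop"), Key("match"), Key("struct"), Key("enum")];
    let state = phf_generator::generate_hash(&keys);

    // `key` seeds the hashes, `disps` holds one (d1, d2) pair per bucket,
    // and `map` sends each table slot back to an input index.
    println!("key = {}, disps = {:?}, map = {:?}", state.key, state.disps, state.map);

    // Check the invariant established by try_generate_hash: every entry's
    // displaced hash picks a distinct slot, and that slot maps back to it.
    for (i, k) in keys.iter().enumerate() {
        let (g, f1, f2) = k.phf_hash(state.key);
        let bucket = (g % (state.disps.len() as u32)) as usize;
        let (d1, d2) = state.disps[bucket];
        let slot = (phf_shared::displace(f1, f2, d1, d2) % (state.map.len() as u32)) as usize;
        assert_eq!(state.map[slot], i);
    }
}

phf_macros now goes through exactly this path, with `Entry` standing in for `Key` via the `impl PhfHash for Entry` added to util.rs below.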

5 changes: 3 additions & 2 deletions phf_macros/Cargo.toml
@@ -15,8 +15,9 @@ test = false
[features]
stats = ["time"]

[dependencies]
rand = "0.1"
[dependencies.phf_generator]
path = "../phf_generator"
version = "=0.6.9"

[dependencies.phf_shared]
path = "../phf_shared"
25 changes: 22 additions & 3 deletions phf_macros/src/lib.rs
@@ -2,14 +2,14 @@
//!
//! See the documentation for the `phf` crate for more details.
#![doc(html_root_url="http://sfackler.github.io/rust-phf/doc")]
#![feature(plugin_registrar, quote, rustc_private, core, env, std_misc)]
#![feature(plugin_registrar, quote, rustc_private, env, std_misc)]

extern crate rand;
extern crate syntax;
#[cfg(feature = "stats")]
extern crate time;
extern crate rustc;
extern crate phf_shared;
extern crate phf_generator;

use std::collections::HashMap;
use std::collections::hash_map::Entry::{Occupied, Vacant};
@@ -23,9 +23,11 @@ use syntax::parse;
use syntax::parse::token::{InternedString, Comma, Eof, FatArrow};
use syntax::print::pprust;
use rustc::plugin::Registry;
use phf_generator::HashState;
use std::env;

use util::{Entry, Key};
use util::{generate_hash, create_map, create_set, create_ordered_map, create_ordered_set};
use util::{create_map, create_set, create_ordered_map, create_ordered_set};

pub mod util;

Expand All @@ -38,6 +40,23 @@ pub fn macro_registrar(reg: &mut Registry) {
reg.register_macro("phf_ordered_set", expand_phf_ordered_set);
}

fn generate_hash(cx: &mut ExtCtxt, sp: Span, entries: &[Entry]) -> HashState {
    #[cfg(feature = "stats")]
    use time::precise_time_s;
    #[cfg(not(feature = "stats"))]
    fn precise_time_s() -> f64 { 0. }

    let start = precise_time_s();
    let state = phf_generator::generate_hash(entries);
    let time = precise_time_s() - start;

    if cfg!(feature = "stats") && env::var_os("PHF_STATS").is_some() {
        cx.span_note(sp, &format!("PHF generation took {} seconds", time));
    }

    state
}

fn expand_phf_map(cx: &mut ExtCtxt, sp: Span, tts: &[TokenTree]) -> Box<MacResult+'static> {
    let entries = match parse_map(cx, tts) {
        Some(entries) => entries,
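With phf_macros built with its `stats` feature and the `PHF_STATS` environment variable set, the retained `generate_hash` wrapper above reports how long `phf_generator::generate_hash` took as a note on the macro invocation; without the feature, the `precise_time_s` stub keeps the timing a no-op.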
132 changes: 5 additions & 127 deletions phf_macros/src/util.rs
@@ -1,21 +1,15 @@
use std::env;
use std::rc::Rc;
use std::hash::{Hash, Hasher};
use std::iter::repeat;

use syntax::ast::Expr;
use syntax::codemap::Span;
use syntax::ext::base::{ExtCtxt, MacResult, MacExpr};
use syntax::ext::build::AstBuilder;
use syntax::parse::token::InternedString;
use syntax::ptr::P;
use rand::{Rng, SeedableRng, XorShiftRng};

use phf_shared::{self, PhfHash};

const DEFAULT_LAMBDA: usize = 5;

const FIXED_SEED: [u32; 4] = [3141592653, 589793238, 462643383, 2795028841];
use phf_shared::PhfHash;
use phf_generator::HashState;

#[derive(PartialEq, Eq, Clone)]
pub enum Key {
@@ -77,126 +71,10 @@ pub struct Entry {
    pub value: P<Expr>
}

pub struct HashState {
    key: u64,
    disps: Vec<(u32, u32)>,
    map: Vec<usize>,
}

pub fn generate_hash(cx: &mut ExtCtxt, sp: Span, entries: &[Entry]) -> HashState {
    #[cfg(feature = "stats")]
    use time::precise_time_s;
    #[cfg(not(feature = "stats"))]
    fn precise_time_s() -> f64 { 0.0 }

    let mut rng: XorShiftRng = SeedableRng::from_seed(FIXED_SEED);
    let start = precise_time_s();
    let state;
    loop {
        match try_generate_hash(entries, &mut rng) {
            Some(s) => {
                state = s;
                break;
            }
            None => {}
        }
    }
    let time = precise_time_s() - start;
    if cfg!(feature = "stats") && env::var_os("PHF_STATS").is_some() {
        cx.span_note(sp, &*format!("PHF generation took {} seconds", time));
    }

    state
}

pub fn try_generate_hash(entries: &[Entry], rng: &mut XorShiftRng) -> Option<HashState> {
    struct Bucket {
        idx: usize,
        keys: Vec<usize>,
    }

    struct Hashes {
        g: u32,
        f1: u32,
        f2: u32,
    }

    let key = rng.gen();

    let hashes: Vec<_> = entries.iter().map(|entry| {
        let (g, f1, f2) = entry.key_contents.phf_hash(key);
        Hashes {
            g: g,
            f1: f1,
            f2: f2
        }
    }).collect();

    let buckets_len = (entries.len() + DEFAULT_LAMBDA - 1) / DEFAULT_LAMBDA;
    let mut buckets = range(0, buckets_len)
        .map(|i| Bucket { idx: i, keys: vec![] })
        .collect::<Vec<_>>();

    for (i, hash) in hashes.iter().enumerate() {
        buckets[(hash.g % (buckets_len as u32)) as usize].keys.push(i);
    }

    // Sort descending
    buckets.sort_by(|a, b| a.keys.len().cmp(&b.keys.len()).reverse());

    let table_len = entries.len();
    let mut map = repeat(None).take(table_len).collect::<Vec<_>>();
    let mut disps = repeat((0u32, 0u32)).take(buckets_len).collect::<Vec<_>>();

    // store whether an element from the bucket being placed is
    // located at a certain position, to allow for efficient overlap
    // checks. It works by storing the generation in each cell and
    // each new placement-attempt is a new generation, so you can tell
    // if this is legitimately full by checking that the generations
    // are equal. (A u64 is far too large to overflow in a reasonable
    // time for current hardware.)
    let mut try_map = repeat(0u64).take(table_len).collect::<Vec<_>>();
    let mut generation = 0u64;

    // the actual values corresponding to the markers above, as
    // (index, key) pairs, for adding to the main map once we've
    // chosen the right disps.
    let mut values_to_add = vec![];

    'buckets: for bucket in buckets.iter() {
        for d1 in range(0, table_len as u32) {
            'disps: for d2 in range(0, table_len as u32) {
                values_to_add.clear();
                generation += 1;

                for &key in bucket.keys.iter() {
                    let idx = (phf_shared::displace(hashes[key].f1, hashes[key].f2, d1, d2)
                               % (table_len as u32)) as usize;
                    if map[idx].is_some() || try_map[idx] == generation {
                        continue 'disps;
                    }
                    try_map[idx] = generation;
                    values_to_add.push((idx, key));
                }

                // We've picked a good set of disps
                disps[bucket.idx] = (d1, d2);
                for &(idx, key) in values_to_add.iter() {
                    map[idx] = Some(key);
                }
                continue 'buckets;
            }
        }

        // Unable to find displacements for a bucket
        return None;
impl PhfHash for Entry {
    fn phf_hash(&self, key: u64) -> (u32, u32, u32) {
        self.key_contents.phf_hash(key)
    }

    Some(HashState {
        key: key,
        disps: disps,
        map: map.into_iter().map(|i| i.unwrap()).collect(),
    })
}

pub fn create_map(cx: &mut ExtCtxt, sp: Span, entries: Vec<Entry>, state: HashState)
