Skip to content

Commit

Permalink
Large hash buckets (in the tens-of-thousands range) are costly to walk
Browse files Browse the repository at this point in the history
in find_all_dups. Instead, we catch those buckets during hash insert
time and add filerec pointers (for each unique filerec) to the
dupe_blocks_list structure. Later, when find_all_dups() encounters
these it walks the filerecs directly instead of the blocks list.

This patch halves the time (from 16s to 8s) to calculate extents
from a sample hash file (created from my build partition).

The value chosen at the moment (30000) is arbitrary. We might want to
make it user-selectable. There is also the question of whether it
makes sense to _always_ do this but the downside is we then lose the
block_ever_seen() shortcut in find_all_dups.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
  • Loading branch information
Mark Fasheh committed Sep 25, 2014
1 parent 73cf50e commit dad9711
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 13 deletions.
1 change: 1 addition & 0 deletions debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ declare_alloc_tracking_header(dupe_extents);
declare_alloc_tracking_header(extent);
declare_alloc_tracking_header(filerec);
declare_alloc_tracking_header(files_compared);
declare_alloc_tracking_header(filerec_token);
/* Can be called anywhere we want to dump the above statistics */
void print_mem_stats(void);

Expand Down
63 changes: 61 additions & 2 deletions duperemove.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ static void print_dupes_table(struct results_tree *res)
printf("\n");
}

printf("Start\t\tLength\t\tFilename\n");
printf("Start\t\tLength\t\tFilename (%u extents)\n",
dext->de_num_dupes);
list_for_each_entry(extent, &dext->de_extents, e_list) {
printf("%s\t%s\t\"%s\"\n",
pretty_size(extent->e_loff),
Expand Down Expand Up @@ -885,6 +886,50 @@ static int compare_files(struct results_tree *res, struct filerec *file1, struct
return mark_filerecs_compared(file1, file2);
}

/*
 * This dupe list is too large for the extent search algorithm to
 * handle efficiently. Instead of walking the block list, we walk the
 * list of files referenced and compare them to each other directly.
 *
 * A future improvement might be to always do this, at the cost of
 * extra memory usage.
 */
static int walk_large_dups(struct hash_tree *tree,
			   struct results_tree *res,
			   struct dupe_blocks_list *dups)
{
	struct rb_node *outer, *inner;
	struct filerec *file1, *file2;
	int ret;

	for (outer = rb_first(&dups->dl_files_root); outer;
	     outer = rb_next(outer)) {
		file1 = rb_entry(outer, struct filerec_token, t_node)->t_file;

		for (inner = rb_next(outer); inner; inner = rb_next(inner)) {
			file2 = rb_entry(inner, struct filerec_token,
					 t_node)->t_file;

			/* filerec token tree does not allow duplicates */
			abort_on(file1 == file2);

			if (filerecs_compared(file1, file2))
				continue;

			ret = compare_files(res, file1, file2);
			if (ret)
				return ret;
			/*
			 * Pair file1 with only the first uncompared
			 * partner, then move to the next file — mirrors
			 * the n-pairs behavior of find_all_dups().
			 */
			break;
		}
	}

	return 0;
}

/*
* The following doesn't actually find all dupes. In the case of a
* n-way dupe when n > 2 it only finds n dupes. But this shouldn't be
Expand All @@ -899,14 +944,22 @@ static int find_all_dups(struct hash_tree *tree, struct results_tree *res)
struct dupe_blocks_list *dups;
struct file_block *block1, *block2;
struct filerec *file1, *file2;
LIST_HEAD(large_dupes);

while (1) {
if (node == NULL)
break;

dups = rb_entry(node, struct dupe_blocks_list, dl_node);

if (dups->dl_num_elem > 1) {
if (dups->dl_num_files) {
list_add_tail(&dups->dl_large_list, &large_dupes);

printf("Hash (\"");
debug_print_digest(stdout, dups->dl_hash);
printf("\") has %u items (across %u files), processing later.\n",
dups->dl_num_elem, dups->dl_num_files);
} else if (dups->dl_num_elem > 1) {
list_for_each_entry(block1, &dups->dl_list, b_list) {
file1 = block1->b_file;
block2 = block1;
Expand Down Expand Up @@ -935,6 +988,12 @@ static int find_all_dups(struct hash_tree *tree, struct results_tree *res)
node = rb_next(node);
}

list_for_each_entry(dups, &large_dupes, dl_large_list) {
ret = walk_large_dups(tree, res, dups);
if (ret)
return ret;
}

return 0;
}

Expand Down
104 changes: 104 additions & 0 deletions hash-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,97 @@

declare_alloc_tracking(file_block);
declare_alloc_tracking(dupe_blocks_list);
declare_alloc_tracking(filerec_token);

/*
 * Look up the token referencing 'val' in this dupe list's file tree.
 * Tokens are ordered by raw filerec pointer value. Returns NULL when
 * 'val' has no token in 'dups'.
 */
struct filerec_token *find_filerec_token_rb(struct dupe_blocks_list *dups,
					    struct filerec *val)
{
	struct rb_node *node;

	for (node = dups->dl_files_root.rb_node; node; ) {
		struct filerec_token *token;

		token = rb_entry(node, struct filerec_token, t_node);

		if (val < token->t_file)
			node = node->rb_left;
		else if (val > token->t_file)
			node = node->rb_right;
		else
			return token;
	}

	return NULL;
}

/*
 * Link 'token' into the dupe list's file tree, keyed by filerec
 * pointer. The caller must guarantee the filerec is not already
 * present (see find_filerec_token_rb); a duplicate key aborts.
 */
static void insert_filerec_token_rb(struct dupe_blocks_list *dups,
				    struct filerec_token *token)
{
	struct rb_node **link = &dups->dl_files_root.rb_node;
	struct rb_node *parent = NULL;

	while (*link) {
		struct filerec_token *entry;

		parent = *link;
		entry = rb_entry(parent, struct filerec_token, t_node);

		if (token->t_file < entry->t_file)
			link = &parent->rb_left;
		else if (token->t_file > entry->t_file)
			link = &parent->rb_right;
		else
			abort_lineno(); /* We should never find a duplicate */
	}

	rb_link_node(&token->t_node, parent, link);
	rb_insert_color(&token->t_node, &dups->dl_files_root);
}

/*
 * Ensure 'file' is represented by a token in the dupe list's file
 * tree, bumping dl_num_files on first insertion. Returns 0 on success
 * (including the already-present case) or ENOMEM.
 */
static int add_one_filerec_token(struct dupe_blocks_list *dups,
				 struct filerec *file)
{
	struct filerec_token *token;

	/* The token tree forbids duplicates — nothing to do if the
	 * file already has one. */
	if (find_filerec_token_rb(dups, file))
		return 0;

	token = malloc_filerec_token();
	if (!token)
		return ENOMEM;

	rb_init_node(&token->t_node);
	token->t_file = file;

	insert_filerec_token_rb(dups, token);
	dups->dl_num_files++;

	return 0;
}

/*
 * Populate dups->dl_files_root with one token per unique filerec
 * referenced by the blocks on dups->dl_list. Returns 0 on success,
 * nonzero (currently only ENOMEM) on failure.
 */
static int add_filerec_tokens(struct dupe_blocks_list *dups)
{
	struct file_block *block;
	int ret;

	list_for_each_entry(block, &dups->dl_list, b_list) {
		/*
		 * Propagate the callee's actual error instead of
		 * hard-coding ENOMEM; add_one_filerec_token() only
		 * returns 0 or ENOMEM today, so behavior is unchanged,
		 * but new error codes won't be silently rewritten.
		 */
		ret = add_one_filerec_token(dups, block->b_file);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Tear down the dupe list's file token tree, freeing every token and
 * dropping dl_num_files back to zero.
 */
static void free_filerec_tokens(struct dupe_blocks_list *dups)
{
	struct rb_node *node;

	/* Pop the leftmost token until the tree is empty. */
	while ((node = rb_first(&dups->dl_files_root)) != NULL) {
		struct filerec_token *token;

		token = rb_entry(node, struct filerec_token, t_node);

		rb_erase(&token->t_node, &dups->dl_files_root);
		dups->dl_num_files--;
		free_filerec_token(token);
	}
}

static void insert_block_list(struct hash_tree *tree,
struct dupe_blocks_list *list)
Expand Down Expand Up @@ -103,17 +194,29 @@ int insert_hashed_block(struct hash_tree *tree, unsigned char *digest,
rb_init_node(&d->dl_node);
rb_init_node(&d->dl_by_size);
INIT_LIST_HEAD(&d->dl_list);
INIT_LIST_HEAD(&d->dl_large_list);
d->dl_files_root = RB_ROOT;

insert_block_list(tree, d);
}

if (d->dl_num_elem >= DUPLIST_CONVERT_LIMIT && d->dl_num_files == 0) {
if (add_filerec_tokens(d))
return ENOMEM;
}

e->b_file = file;
e->b_seen = 0;
e->b_loff = loff;
list_add_tail(&e->b_file_next, &file->block_list);
file->num_blocks++;
e->b_parent = d;

if (d->dl_num_files) {
if (add_one_filerec_token(d, file))
return ENOMEM;
}

d->dl_num_elem++;
list_add_tail(&e->b_list, &d->dl_list);

Expand Down Expand Up @@ -141,6 +244,7 @@ static void remove_hashed_block(struct hash_tree *tree,
rb_erase(&blocklist->dl_node, &tree->root);
tree->num_hashes--;

free_filerec_tokens(blocklist);
free_dupe_blocks_list(blocklist);
}

Expand Down
18 changes: 18 additions & 0 deletions hash-tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ struct dupe_blocks_list {
unsigned int dl_num_elem;
struct list_head dl_list;

/*
* num_files and files_root are used when the total number of
* blocks in the list exceeds DUPLIST_CONVERT_LIMIT (defined
* below)
*/
unsigned int dl_num_files;
struct rb_root dl_files_root;
struct list_head dl_large_list; /* Temporary list for
* use by extent finding code */

unsigned char dl_hash[DIGEST_LEN_MAX];
};

Expand All @@ -29,6 +39,14 @@ struct file_block {
struct list_head b_file_next; /* filerec->block_list */
};

/* Max number of blocks before we'll add filerec tokens */
#define DUPLIST_CONVERT_LIMIT 30000

/*
 * One node in a dupe_blocks_list's dl_files_root tree: records that a
 * filerec owns at least one block on that dupe list. Keyed by the
 * filerec pointer; the tree never holds duplicates.
 */
struct filerec_token {
	struct filerec *t_file;	/* tree key: the referenced file */
	struct rb_node t_node;	/* linkage into dl_files_root */
};

int insert_hashed_block(struct hash_tree *tree, unsigned char *digest,
struct filerec *file, uint64_t loff);
void remove_hashed_blocks(struct hash_tree *tree, struct filerec *file);
Expand Down
36 changes: 25 additions & 11 deletions results-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,30 @@ static void insert_extent_list_free(struct dupe_extents *dext,
}
}

/*
 * Allocate a dupe_extents record for the given hash/length pair,
 * initialize its lists and tree linkage, and insert it into the
 * results tree. The initial score equals the extent length. Returns
 * NULL on allocation failure.
 */
static struct dupe_extents *dupe_extents_new(struct results_tree *res,
					     unsigned char *digest,
					     uint64_t len)
{
	struct dupe_extents *new_dext = calloc_dupe_extents(1);

	if (new_dext == NULL)
		return NULL;

	memcpy(new_dext->de_hash, digest, digest_len);
	new_dext->de_len = len;
	INIT_LIST_HEAD(&new_dext->de_extents);
	new_dext->de_extents_root = RB_ROOT;
	rb_init_node(&new_dext->de_node);

	insert_dupe_extents(res, new_dext);

	new_dext->de_score = len;

	return new_dext;
}

int insert_result(struct results_tree *res, unsigned char *digest,
struct filerec *recs[2], uint64_t startoff[2],
uint64_t endoff[2])
Expand All @@ -187,19 +211,9 @@ int insert_result(struct results_tree *res, unsigned char *digest,

dext = find_dupe_extents(res, digest, len);
if (!dext) {
dext = calloc_dupe_extents(1);
dext = dupe_extents_new(res, digest, len);
if (!dext)
return ENOMEM;

memcpy(dext->de_hash, digest, digest_len);
dext->de_len = len;
INIT_LIST_HEAD(&dext->de_extents);
rb_init_node(&dext->de_node);
dext->de_extents_root = RB_ROOT;

insert_dupe_extents(res, dext);

dext->de_score = len;
add_score = 0;
}

Expand Down
1 change: 1 addition & 0 deletions util.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,5 @@ void print_mem_stats(void)
show_allocs_extent();
show_allocs_filerec();
show_allocs_files_compared();
show_allocs_filerec_token();
}

0 comments on commit dad9711

Please sign in to comment.