Skip to content

Commit

Permalink
Large hash buckets (in the tens-of-thousands range) are costly to walk
Browse files Browse the repository at this point in the history
in find_all_dups. Instead, we catch those buckets during hash insert
time and add filerec pointers (for each unique filerec) to the
dupe_blocks_list structure. Later, when find_all_dups() encounters
these it walks the filerecs directly instead of the blocks list.

This patch halves the time (from 16s to 8s) to calculate extents
from a sample hash file (created from my build partition).

The value chosen at the moment (30000) is arbitrary. We might want to
make it user-selectable. There is also the question of whether it
makes sense to _always_ do this but the downside is we then lose the
block_ever_seen() shortcut in find_all_dups.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
  • Loading branch information
Mark Fasheh committed Sep 25, 2014
1 parent 73cf50e commit dad9711
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 13 deletions.
1 change: 1 addition & 0 deletions debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ declare_alloc_tracking_header(dupe_extents);
declare_alloc_tracking_header(extent);
declare_alloc_tracking_header(filerec);
declare_alloc_tracking_header(files_compared);
declare_alloc_tracking_header(filerec_token);
/* Can be called anywhere we want to dump the above statistics */
void print_mem_stats(void);

Expand Down
63 changes: 61 additions & 2 deletions duperemove.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ static void print_dupes_table(struct results_tree *res)
printf("\n");
}

printf("Start\t\tLength\t\tFilename\n");
printf("Start\t\tLength\t\tFilename (%u extents)\n",
dext->de_num_dupes);
list_for_each_entry(extent, &dext->de_extents, e_list) {
printf("%s\t%s\t\"%s\"\n",
pretty_size(extent->e_loff),
Expand Down Expand Up @@ -885,6 +886,50 @@ static int compare_files(struct results_tree *res, struct filerec *file1, struct
return mark_filerecs_compared(file1, file2);
}

/*
 * This dupe list is too large for the extent search algorithm to
 * handle efficiently. Instead of walking the block list, we walk the
 * list of files referenced and compare them to each other directly.
 *
 * A future improvement might be to always do this, at the cost of
 * extra memory usage.
 */
static int walk_large_dups(struct hash_tree *tree,
			   struct results_tree *res,
			   struct dupe_blocks_list *dups)
{
	struct rb_node *outer, *inner;
	struct filerec *file1, *file2;
	int ret;

	for (outer = rb_first(&dups->dl_files_root); outer;
	     outer = rb_next(outer)) {
		file1 = rb_entry(outer, struct filerec_token, t_node)->t_file;

		for (inner = rb_next(outer); inner; inner = rb_next(inner)) {
			file2 = rb_entry(inner, struct filerec_token,
					 t_node)->t_file;

			/* filerec token tree does not allow duplicates */
			abort_on(file1 == file2);

			if (filerecs_compared(file1, file2))
				continue;

			ret = compare_files(res, file1, file2);
			if (ret)
				return ret;
			/*
			 * Pair file1 with only the first uncompared
			 * partner, then move to the next file — mirrors
			 * the n-pairs behavior of find_all_dups().
			 */
			break;
		}
	}

	return 0;
}

/*
* The following doesn't actually find all dupes. In the case of a
* n-way dupe when n > 2 it only finds n dupes. But this shouldn't be
Expand All @@ -899,14 +944,22 @@ static int find_all_dups(struct hash_tree *tree, struct results_tree *res)
struct dupe_blocks_list *dups;
struct file_block *block1, *block2;
struct filerec *file1, *file2;
LIST_HEAD(large_dupes);

while (1) {
if (node == NULL)
break;

dups = rb_entry(node, struct dupe_blocks_list, dl_node);

if (dups->dl_num_elem > 1) {
if (dups->dl_num_files) {
list_add_tail(&dups->dl_large_list, &large_dupes);

printf("Hash (\"");
debug_print_digest(stdout, dups->dl_hash);
printf("\") has %u items (across %u files), processing later.\n",
dups->dl_num_elem, dups->dl_num_files);
} else if (dups->dl_num_elem > 1) {
list_for_each_entry(block1, &dups->dl_list, b_list) {
file1 = block1->b_file;
block2 = block1;
Expand Down Expand Up @@ -935,6 +988,12 @@ static int find_all_dups(struct hash_tree *tree, struct results_tree *res)
node = rb_next(node);
}

list_for_each_entry(dups, &large_dupes, dl_large_list) {
ret = walk_large_dups(tree, res, dups);
if (ret)
return ret;
}

return 0;
}

Expand Down
104 changes: 104 additions & 0 deletions hash-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,97 @@

declare_alloc_tracking(file_block);
declare_alloc_tracking(dupe_blocks_list);
declare_alloc_tracking(filerec_token);

/*
 * Look up the token referencing 'val' in this dupe list's file tree.
 * Tokens are ordered by raw filerec pointer value. Returns NULL when
 * 'val' has no token in 'dups'.
 */
struct filerec_token *find_filerec_token_rb(struct dupe_blocks_list *dups,
					    struct filerec *val)
{
	struct rb_node *node;

	for (node = dups->dl_files_root.rb_node; node; ) {
		struct filerec_token *token;

		token = rb_entry(node, struct filerec_token, t_node);

		if (val < token->t_file)
			node = node->rb_left;
		else if (val > token->t_file)
			node = node->rb_right;
		else
			return token;
	}

	return NULL;
}

/*
 * Link 'token' into the dupe list's file tree, keyed by filerec
 * pointer. The caller must guarantee the filerec is not already
 * present (see find_filerec_token_rb); a duplicate key aborts.
 */
static void insert_filerec_token_rb(struct dupe_blocks_list *dups,
				    struct filerec_token *token)
{
	struct rb_node **link = &dups->dl_files_root.rb_node;
	struct rb_node *parent = NULL;

	while (*link) {
		struct filerec_token *entry;

		parent = *link;
		entry = rb_entry(parent, struct filerec_token, t_node);

		if (token->t_file < entry->t_file)
			link = &parent->rb_left;
		else if (token->t_file > entry->t_file)
			link = &parent->rb_right;
		else
			abort_lineno(); /* We should never find a duplicate */
	}

	rb_link_node(&token->t_node, parent, link);
	rb_insert_color(&token->t_node, &dups->dl_files_root);
}

/*
 * Ensure 'file' is represented by a token in the dupe list's file
 * tree, bumping dl_num_files on first insertion. Returns 0 on success
 * (including the already-present case) or ENOMEM.
 */
static int add_one_filerec_token(struct dupe_blocks_list *dups,
				 struct filerec *file)
{
	struct filerec_token *token;

	/* The token tree forbids duplicates — nothing to do if the
	 * file already has one. */
	if (find_filerec_token_rb(dups, file))
		return 0;

	token = malloc_filerec_token();
	if (!token)
		return ENOMEM;

	rb_init_node(&token->t_node);
	token->t_file = file;

	insert_filerec_token_rb(dups, token);
	dups->dl_num_files++;

	return 0;
}

/*
 * Populate dups->dl_files_root with one token per unique filerec
 * referenced by the blocks on dups->dl_list. Returns 0 on success,
 * nonzero (currently only ENOMEM) on failure.
 */
static int add_filerec_tokens(struct dupe_blocks_list *dups)
{
	struct file_block *block;
	int ret;

	list_for_each_entry(block, &dups->dl_list, b_list) {
		/*
		 * Propagate the callee's actual error instead of
		 * hard-coding ENOMEM; add_one_filerec_token() only
		 * returns 0 or ENOMEM today, so behavior is unchanged,
		 * but new error codes won't be silently rewritten.
		 */
		ret = add_one_filerec_token(dups, block->b_file);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Tear down the dupe list's file token tree, freeing every token and
 * dropping dl_num_files back to zero.
 */
static void free_filerec_tokens(struct dupe_blocks_list *dups)
{
	struct rb_node *node;

	/* Pop the leftmost token until the tree is empty. */
	while ((node = rb_first(&dups->dl_files_root)) != NULL) {
		struct filerec_token *token;

		token = rb_entry(node, struct filerec_token, t_node);

		rb_erase(&token->t_node, &dups->dl_files_root);
		dups->dl_num_files--;
		free_filerec_token(token);
	}
}

static void insert_block_list(struct hash_tree *tree,
struct dupe_blocks_list *list)
Expand Down Expand Up @@ -103,17 +194,29 @@ int insert_hashed_block(struct hash_tree *tree, unsigned char *digest,
rb_init_node(&d->dl_node);
rb_init_node(&d->dl_by_size);
INIT_LIST_HEAD(&d->dl_list);
INIT_LIST_HEAD(&d->dl_large_list);
d->dl_files_root = RB_ROOT;

insert_block_list(tree, d);
}

if (d->dl_num_elem >= DUPLIST_CONVERT_LIMIT && d->dl_num_files == 0) {
if (add_filerec_tokens(d))
return ENOMEM;
}

e->b_file = file;
e->b_seen = 0;
e->b_loff = loff;
list_add_tail(&e->b_file_next, &file->block_list);
file->num_blocks++;
e->b_parent = d;

if (d->dl_num_files) {
if (add_one_filerec_token(d, file))
return ENOMEM;
}

d->dl_num_elem++;
list_add_tail(&e->b_list, &d->dl_list);

Expand Down Expand Up @@ -141,6 +244,7 @@ static void remove_hashed_block(struct hash_tree *tree,
rb_erase(&blocklist->dl_node, &tree->root);
tree->num_hashes--;

free_filerec_tokens(blocklist);
free_dupe_blocks_list(blocklist);
}

Expand Down
18 changes: 18 additions & 0 deletions hash-tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ struct dupe_blocks_list {
unsigned int dl_num_elem;
struct list_head dl_list;

/*
* num_files and files_root are used when the total number of
* blocks in the list exceeds DUPLIST_CONVERT_LIMIT (defined
* below)
*/
unsigned int dl_num_files;
struct rb_root dl_files_root;
struct list_head dl_large_list; /* Temporary list for
* use by extent finding code */

unsigned char dl_hash[DIGEST_LEN_MAX];
};

Expand All @@ -29,6 +39,14 @@ struct file_block {
struct list_head b_file_next; /* filerec->block_list */
};

/* Max number of blocks before we'll add filerec tokens */
#define DUPLIST_CONVERT_LIMIT 30000

/*
 * One node in a dupe_blocks_list's dl_files_root tree: records that a
 * filerec owns at least one block on that dupe list. Keyed by the
 * filerec pointer; the tree never holds duplicates.
 */
struct filerec_token {
	struct filerec *t_file;	/* tree key: the referenced file */
	struct rb_node t_node;	/* linkage into dl_files_root */
};

int insert_hashed_block(struct hash_tree *tree, unsigned char *digest,
struct filerec *file, uint64_t loff);
void remove_hashed_blocks(struct hash_tree *tree, struct filerec *file);
Expand Down
36 changes: 25 additions & 11 deletions results-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,30 @@ static void insert_extent_list_free(struct dupe_extents *dext,
}
}

/*
 * Allocate a dupe_extents record for the given hash/length pair,
 * initialize its lists and tree linkage, and insert it into the
 * results tree. The initial score equals the extent length. Returns
 * NULL on allocation failure.
 */
static struct dupe_extents *dupe_extents_new(struct results_tree *res,
					     unsigned char *digest,
					     uint64_t len)
{
	struct dupe_extents *new_dext = calloc_dupe_extents(1);

	if (new_dext == NULL)
		return NULL;

	memcpy(new_dext->de_hash, digest, digest_len);
	new_dext->de_len = len;
	INIT_LIST_HEAD(&new_dext->de_extents);
	new_dext->de_extents_root = RB_ROOT;
	rb_init_node(&new_dext->de_node);

	insert_dupe_extents(res, new_dext);

	new_dext->de_score = len;

	return new_dext;
}

int insert_result(struct results_tree *res, unsigned char *digest,
struct filerec *recs[2], uint64_t startoff[2],
uint64_t endoff[2])
Expand All @@ -187,19 +211,9 @@ int insert_result(struct results_tree *res, unsigned char *digest,

dext = find_dupe_extents(res, digest, len);
if (!dext) {
dext = calloc_dupe_extents(1);
dext = dupe_extents_new(res, digest, len);
if (!dext)
return ENOMEM;

memcpy(dext->de_hash, digest, digest_len);
dext->de_len = len;
INIT_LIST_HEAD(&dext->de_extents);
rb_init_node(&dext->de_node);
dext->de_extents_root = RB_ROOT;

insert_dupe_extents(res, dext);

dext->de_score = len;
add_score = 0;
}

Expand Down
1 change: 1 addition & 0 deletions util.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,5 @@ void print_mem_stats(void)
show_allocs_extent();
show_allocs_filerec();
show_allocs_files_compared();
show_allocs_filerec_token();
}

0 comments on commit dad9711

Please sign in to comment.