Commit 3eaa2885 authored by Chris Mason
Browse files

Btrfs: Fix the defragmentation code and the block relocation code for data=ordered



Before setting an extent to delalloc, the code needs to wait for
pending ordered extents.

Also, the relocation code needs to wait for ordered IO before scanning
the block group again.  This is because the extents are not removed
until the IO for the new extents is finished.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 64f26f74
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -538,6 +538,13 @@ struct btrfs_fs_info {
	struct list_head dead_roots;
	atomic_t nr_async_submits;

	/*
	 * this is used by the balancing code to wait for all the pending
	 * ordered extents
	 */
	spinlock_t ordered_extent_lock;
	struct list_head ordered_extents;

	/*
	 * there is a pool of worker threads for checksumming during writes
	 * and a pool for checksumming after reads.  This is because readers
+3 −0
Original line number Diff line number Diff line
@@ -1252,6 +1252,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
	fs_info->btree_inode->i_nlink = 1;
	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);

	INIT_LIST_HEAD(&fs_info->ordered_extents);
	spin_lock_init(&fs_info->ordered_extent_lock);

	sb->s_blocksize = 4096;
	sb->s_blocksize_bits = blksize_bits(4096);

+28 −11
Original line number Diff line number Diff line
@@ -2640,6 +2640,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
	struct file_ra_state *ra;
	unsigned long total_read = 0;
	unsigned long ra_pages;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_trans_handle *trans;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
@@ -2658,9 +2659,9 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
				       calc_ra(i, last_index, ra_pages));
		}
		total_read++;
		if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
again:
		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
			goto truncate_racing;

		page = grab_cache_page(inode->i_mapping, i);
		if (!page) {
			goto out_unlock;
@@ -2674,18 +2675,24 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
				goto out_unlock;
			}
		}
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
		ClearPageDirty(page);
#else
		cancel_dirty_page(page, PAGE_CACHE_SIZE);
#endif
		wait_on_page_writeback(page);
		set_page_extent_mapped(page);

		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
		page_end = page_start + PAGE_CACHE_SIZE - 1;

		lock_extent(io_tree, page_start, page_end, GFP_NOFS);

		ordered = btrfs_lookup_ordered_extent(inode, page_start);
		if (ordered) {
			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
			unlock_page(page);
			page_cache_release(page);
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
			goto again;
		}
		set_page_extent_mapped(page);


		set_extent_delalloc(io_tree, page_start,
				    page_end, GFP_NOFS);
		set_page_dirty(page);
@@ -2694,10 +2701,18 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
		unlock_page(page);
		page_cache_release(page);
	}
	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
					   total_read);

out_unlock:
	/* we have to start the IO in order to get the ordered extents
	 * instantiated.  This allows the relocation code to wait
	 * for all the ordered extents to hit the disk.
	 *
	 * Otherwise, it would constantly loop over the same extents
	 * because the old ones don't get deleted until the IO is
	 * started
	 */
	btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
			       WB_SYNC_NONE);
	kfree(ra);
	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
	if (trans) {
@@ -3238,6 +3253,8 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)

		btrfs_clean_old_snapshots(tree_root);

		btrfs_wait_ordered_extents(tree_root);

		trans = btrfs_start_transaction(tree_root, 1);
		btrfs_commit_transaction(trans, tree_root);
		mutex_lock(&root->fs_info->alloc_mutex);
+14 −7
Original line number Diff line number Diff line
@@ -213,6 +213,7 @@ int btrfs_defrag_file(struct file *file)
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct page *page;
	unsigned long last_index;
	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
@@ -234,6 +235,7 @@ int btrfs_defrag_file(struct file *file)
				       min(last_index, i + ra_pages - 1));
		}
		total_read++;
again:
		page = grab_cache_page(inode->i_mapping, i);
		if (!page)
			goto out_unlock;
@@ -247,18 +249,23 @@ int btrfs_defrag_file(struct file *file)
			}
		}

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
		ClearPageDirty(page);
#else
		cancel_dirty_page(page, PAGE_CACHE_SIZE);
#endif
		wait_on_page_writeback(page);
		set_page_extent_mapped(page);

		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
		page_end = page_start + PAGE_CACHE_SIZE - 1;

		lock_extent(io_tree, page_start, page_end, GFP_NOFS);

		ordered = btrfs_lookup_ordered_extent(inode, page_start);
		if (ordered) {
			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
			unlock_page(page);
			page_cache_release(page);
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
			goto again;
		}
		set_page_extent_mapped(page);

		set_extent_delalloc(io_tree, page_start,
				    page_end, GFP_NOFS);

+54 −2
Original line number Diff line number Diff line
@@ -167,20 +167,28 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
	entry->inode = inode;

	/* one ref for the tree */
	atomic_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->root_extent_list);

	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		atomic_inc(&entry->refs);
		printk("warning dup entry from add_ordered_extent\n");
		BUG();
	}
	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
			   entry_end(entry) - 1, GFP_NOFS);

	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);

	mutex_unlock(&tree->mutex);
	BUG_ON(node);
	return 0;
@@ -285,11 +293,55 @@ int btrfs_remove_ordered_extent(struct inode *inode,
	rb_erase(node, &tree->tree);
	tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);

	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);

	mutex_unlock(&tree->mutex);
	wake_up(&entry->wait);
	return 0;
}

int btrfs_wait_ordered_extents(struct btrfs_root *root)
{
	struct list_head splice;
	struct list_head *cur;
	struct btrfs_ordered_extent *ordered;
	struct inode *inode;

	INIT_LIST_HEAD(&splice);

	spin_lock(&root->fs_info->ordered_extent_lock);
	list_splice_init(&root->fs_info->ordered_extents, &splice);
	while(!list_empty(&splice)) {
		cur = splice.next;
		ordered = list_entry(cur, struct btrfs_ordered_extent,
				     root_extent_list);
		list_del_init(&ordered->root_extent_list);
		atomic_inc(&ordered->refs);
		inode = ordered->inode;

		/*
		 * the inode can't go away until all the pages are gone
		 * and the pages won't go away while there is still
		 * an ordered extent and the ordered extent won't go
		 * away until it is off this list.  So, we can safely
		 * increment i_count here and call iput later
		 */
		atomic_inc(&inode->i_count);
		spin_unlock(&root->fs_info->ordered_extent_lock);

		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		iput(inode);

		spin_lock(&root->fs_info->ordered_extent_lock);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);
	return 0;
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
Loading