Diffstat (limited to 'drivers/md')
82 files changed, 63982 insertions, 0 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig new file mode 100644 index 00000000..10f122a3 --- /dev/null +++ b/drivers/md/Kconfig @@ -0,0 +1,393 @@ +# +# Block device driver configuration +# + +menuconfig MD + bool "Multiple devices driver support (RAID and LVM)" + depends on BLOCK + help + Support multiple physical spindles through a single logical device. + Required for RAID and logical volume management. + +if MD + +config BLK_DEV_MD + tristate "RAID support" + ---help--- + This driver lets you combine several hard disk partitions into one + logical block device. This can be used to simply append one + partition to another one or to combine several redundant hard disks + into a RAID1/4/5 device so as to provide protection against hard + disk failures. This is called "Software RAID" since the combining of + the partitions is done by the kernel. "Hardware RAID" means that the + combining is done by a dedicated controller; if you have such a + controller, you do not need to say Y here. + + More information about Software RAID on Linux is contained in the + Software RAID mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. There you will also learn + where to get the supporting user space utilities raidtools. + + If unsure, say N. + +config MD_AUTODETECT + bool "Autodetect RAID arrays during kernel boot" + depends on BLK_DEV_MD=y + default y + ---help--- + If you say Y here, then the kernel will try to autodetect raid + arrays as part of its boot process. + + If you don't use raid and say Y, this autodetection can cause + a several-second delay in the boot time due to various + synchronisation steps that are part of this step. + + If unsure, say Y. + +config MD_LINEAR + tristate "Linear (append) mode" + depends on BLK_DEV_MD + ---help--- + If you say Y here, then your multiple devices driver will be able to + use the so-called linear mode, i.e. it will combine the hard disk + partitions by simply appending one to the other. + + To compile this as a module, choose M here: the module + will be called linear. + + If unsure, say Y. + +config MD_RAID0 + tristate "RAID-0 (striping) mode" + depends on BLK_DEV_MD + ---help--- + If you say Y here, then your multiple devices driver will be able to + use the so-called raid0 mode, i.e. it will combine the hard disk + partitions into one logical device in such a fashion as to fill them + up evenly, one chunk here and one chunk there. This will increase + the throughput rate if the partitions reside on distinct disks. + + Information about Software RAID on Linux is contained in the + Software-RAID mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. There you will also + learn where to get the supporting user space utilities raidtools. + + To compile this as a module, choose M here: the module + will be called raid0. + + If unsure, say Y. + +config MD_RAID1 + tristate "RAID-1 (mirroring) mode" + depends on BLK_DEV_MD + ---help--- + A RAID-1 set consists of several disk drives which are exact copies + of each other. In the event of a mirror failure, the RAID driver + will continue to use the operational mirrors in the set, providing + an error free MD (multiple device) to the higher levels of the + kernel. In a set with N drives, the available space is the capacity + of a single drive, and the set protects against a failure of (N - 1) + drives. + + Information about Software RAID on Linux is contained in the + Software-RAID mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. 
There you will also + learn where to get the supporting user space utilities raidtools. + + If you want to use such a RAID-1 set, say Y. To compile this code + as a module, choose M here: the module will be called raid1. + + If unsure, say Y. + +config MD_RAID10 + tristate "RAID-10 (mirrored striping) mode" + depends on BLK_DEV_MD + ---help--- + RAID-10 provides a combination of striping (RAID-0) and + mirroring (RAID-1) with easier configuration and more flexible + layout. + Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to + be the same size (or at least, only as much as the smallest device + will be used). + RAID-10 provides a variety of layouts that provide different levels + of redundancy and performance. + + RAID-10 requires mdadm-1.7.0 or later, available at: + + ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ + + If unsure, say Y. + +config MD_RAID456 + tristate "RAID-4/RAID-5/RAID-6 mode" + depends on BLK_DEV_MD + select RAID6_PQ + select ASYNC_MEMCPY + select ASYNC_XOR + select ASYNC_PQ + select ASYNC_RAID6_RECOV + ---help--- + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. + For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + + Information about Software RAID on Linux is contained in the + Software-RAID mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. There you will also + learn where to get the supporting user space utilities raidtools. + + If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To + compile this code as a module, choose M here: the module + will be called raid456. + + If unsure, say Y. + +config MULTICORE_RAID456 + bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" + depends on MD_RAID456 + depends on SMP + depends on EXPERIMENTAL + ---help--- + Enable the raid456 module to dispatch per-stripe raid operations to a + thread pool. + + If unsure, say N. + +config MD_MULTIPATH + tristate "Multipath I/O support" + depends on BLK_DEV_MD + help + MD_MULTIPATH provides a simple multi-path personality for use + the MD framework. It is not under active development. New + projects should consider using DM_MULTIPATH which has more + features and more testing. + + If unsure, say N. + +config MD_FAULTY + tristate "Faulty test module for MD" + depends on BLK_DEV_MD + help + The "faulty" module allows for a block device that occasionally returns + read or write errors. It is useful for testing. + + In unsure, say N. + +config BLK_DEV_DM + tristate "Device mapper support" + ---help--- + Device-mapper is a low level volume manager. It works by allowing + people to specify mappings for ranges of logical sectors. Various + mapping types are available, in addition people may write their own + modules containing custom mappings if they wish. 
+ + Higher level volume managers such as LVM2 use this driver. + + To compile this as a module, choose M here: the module will be + called dm-mod. + + If unsure, say N. + +config DM_DEBUG + boolean "Device mapper debugging support" + depends on BLK_DEV_DM + ---help--- + Enable this for messages that may help debug device-mapper problems. + + If unsure, say N. + +config DM_BUFIO + tristate + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + This interface allows you to do buffered I/O on a device and acts + as a cache, holding recently-read blocks in memory and performing + delayed writes. + +source "drivers/md/persistent-data/Kconfig" + +config DM_CRYPT + tristate "Crypt target support" + depends on BLK_DEV_DM + select CRYPTO + select CRYPTO_CBC + ---help--- + This device-mapper target allows you to create a device that + transparently encrypts the data on it. You'll need to activate + the ciphers you're going to use in the cryptoapi configuration. + + Information on how to use dm-crypt can be found on + + <http://www.saout.de/misc/dm-crypt/> + + To compile this code as a module, choose M here: the module will + be called dm-crypt. + + If unsure, say N. + +config DM_SNAPSHOT + tristate "Snapshot target" + depends on BLK_DEV_DM + ---help--- + Allow volume managers to take writable snapshots of a device. + +config DM_THIN_PROVISIONING + tristate "Thin provisioning target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + select DM_PERSISTENT_DATA + ---help--- + Provides thin provisioning and snapshots that share a data store. + +config DM_DEBUG_BLOCK_STACK_TRACING + boolean "Keep stack trace of thin provisioning block lock holders" + depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING + select STACKTRACE + ---help--- + Enable this for messages that may help debug problems with the + block manager locking used by thin provisioning. + + If unsure, say N. + +config DM_DEBUG_SPACE_MAPS + boolean "Extra validation for thin provisioning space maps" + depends on DM_THIN_PROVISIONING + ---help--- + Enable this for messages that may help debug problems with the + space maps used by thin provisioning. + + If unsure, say N. + +config DM_MIRROR + tristate "Mirror target" + depends on BLK_DEV_DM + ---help--- + Allow volume managers to mirror logical volumes, also + needed for live data migration tools such as 'pvmove'. + +config DM_RAID + tristate "RAID 1/4/5/6 target" + depends on BLK_DEV_DM + select MD_RAID1 + select MD_RAID456 + select BLK_DEV_MD + ---help--- + A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings + + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. + For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. 
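As a quick worked example of the capacity arithmetic repeated in the MD_RAID456 and DM_RAID help texts above, here is a standalone sketch. It is illustrative only and not part of the files in this diff; the helper name and drive sizes are invented for the example.

/* Illustrative only: usable capacity of a RAID-4/5/6 set as described above. */
#include <stdio.h>

static unsigned long long usable_mb(unsigned long long per_drive_mb,
				    unsigned int n_drives,
				    unsigned int parity_drives)
{
	/* RAID-4/5 give up one drive's worth of space to parity, RAID-6 two. */
	return per_drive_mb * (n_drives - parity_drives);
}

int main(void)
{
	/* Four drives of 2000000 MB (~2 TB) each. */
	printf("RAID-5 usable: %llu MB\n", usable_mb(2000000ULL, 4, 1)); /* 6000000 */
	printf("RAID-6 usable: %llu MB\n", usable_mb(2000000ULL, 4, 2)); /* 4000000 */
	return 0;
}

The same numbers fall out of the formulas in the help text, C * (N - 1) and C * (N - 2), with C = 2000000 and N = 4.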
+ +config DM_LOG_USERSPACE + tristate "Mirror userspace logging (EXPERIMENTAL)" + depends on DM_MIRROR && EXPERIMENTAL && NET + select CONNECTOR + ---help--- + The userspace logging module provides a mechanism for + relaying the dm-dirty-log API to userspace. Log designs + which are more suited to userspace implementation (e.g. + shared storage logs) or experimental logs can be implemented + by leveraging this framework. + +config DM_ZERO + tristate "Zero target" + depends on BLK_DEV_DM + ---help--- + A target that discards writes, and returns all zeroes for + reads. Useful in some recovery situations. + +config DM_MULTIPATH + tristate "Multipath target" + depends on BLK_DEV_DM + # nasty syntax but means make DM_MULTIPATH independent + # of SCSI_DH if the latter isn't defined but if + # it is, DM_MULTIPATH must depend on it. We get a build + # error if SCSI_DH=m and DM_MULTIPATH=y + depends on SCSI_DH || !SCSI_DH + ---help--- + Allow volume managers to support multipath hardware. + +config DM_MULTIPATH_QL + tristate "I/O Path Selector based on the number of in-flight I/Os" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + the path with the least number of in-flight I/Os. + + If unsure, say N. + +config DM_MULTIPATH_ST + tristate "I/O Path Selector based on the service time" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time. + + If unsure, say N. + +config DM_DELAY + tristate "I/O delaying target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + A target that delays reads and/or writes and can send + them to different devices. Useful for testing. + + If unsure, say N. + +config DM_UEVENT + bool "DM uevents" + depends on BLK_DEV_DM + ---help--- + Generate udev events for DM events. + +config DM_FLAKEY + tristate "Flakey target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + A target that intermittently fails I/O for debugging purposes. + +config DM_VERITY + tristate "Verity target support (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + select CRYPTO + select CRYPTO_HASH + select DM_BUFIO + ---help--- + This device-mapper target creates a read-only device that + transparently validates the data on one underlying device against + a pre-generated tree of cryptographic checksums stored on a second + device. + + You'll need to activate the digests you're going to use in the + cryptoapi configuration. + + To compile this code as a module, choose M here: the module will + be called dm-verity. + + If unsure, say N. + +endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile new file mode 100644 index 00000000..8b2e0dff --- /dev/null +++ b/drivers/md/Makefile @@ -0,0 +1,49 @@ +# +# Makefile for the kernel software RAID and LVM drivers. +# + +dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ + dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o +dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ + dm-snap-persistent.o +dm-mirror-y += dm-raid1.o +dm-log-userspace-y \ + += dm-log-userspace-base.o dm-log-userspace-transfer.o +dm-thin-pool-y += dm-thin.o dm-thin-metadata.o +md-mod-y += md.o bitmap.o +raid456-y += raid5.o + +# Note: link order is important. 
All raid personalities +# and must come before md.o, as they each initialise +# themselves, and md.o may use the personalities when it +# auto-initialised. + +obj-$(CONFIG_MD_LINEAR) += linear.o +obj-$(CONFIG_MD_RAID0) += raid0.o +obj-$(CONFIG_MD_RAID1) += raid1.o +obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID456) += raid456.o +obj-$(CONFIG_MD_MULTIPATH) += multipath.o +obj-$(CONFIG_MD_FAULTY) += faulty.o +obj-$(CONFIG_BLK_DEV_MD) += md-mod.o +obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_DM_BUFIO) += dm-bufio.o +obj-$(CONFIG_DM_CRYPT) += dm-crypt.o +obj-$(CONFIG_DM_DELAY) += dm-delay.o +obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o +obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o +obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o +obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o +obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o +obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o +obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o +obj-$(CONFIG_DM_ZERO) += dm-zero.o +obj-$(CONFIG_DM_RAID) += dm-raid.o +obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o +obj-$(CONFIG_DM_VERITY) += dm-verity.o + +ifeq ($(CONFIG_DM_UEVENT),y) +dm-mod-objs += dm-uevent.o +endif diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c new file mode 100644 index 00000000..17e2b472 --- /dev/null +++ b/drivers/md/bitmap.c @@ -0,0 +1,2113 @@ +/* + * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * bitmap_create - sets up the bitmap structure + * bitmap_destroy - destroys the bitmap structure + * + * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.: + * - added disk storage for bitmap + * - changes to allow various bitmap chunk sizes + */ + +/* + * Still to do: + * + * flush after percent set rather than just time based. (maybe both). + */ + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/timer.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/buffer_head.h> +#include <linux/seq_file.h> +#include "md.h" +#include "bitmap.h" + +static inline char *bmname(struct bitmap *bitmap) +{ + return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; +} + +/* + * check a page and, if necessary, allocate it (or hijack it if the alloc fails) + * + * 1) check to see if this page is allocated, if it's not then try to alloc + * 2) if the alloc fails, set the page's hijacked flag so we'll use the + * page pointer directly as a counter + * + * if we find our page, we increment the page's refcount so that it stays + * allocated while we're using it + */ +static int bitmap_checkpage(struct bitmap *bitmap, + unsigned long page, int create) +__releases(bitmap->lock) +__acquires(bitmap->lock) +{ + unsigned char *mappage; + + if (page >= bitmap->pages) { + /* This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page. + * It is harmless. 
+ */ + return -EINVAL; + } + + if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ + return 0; + + if (bitmap->bp[page].map) /* page is already allocated, just return */ + return 0; + + if (!create) + return -ENOENT; + + /* this page has not been allocated yet */ + + spin_unlock_irq(&bitmap->lock); + mappage = kzalloc(PAGE_SIZE, GFP_NOIO); + spin_lock_irq(&bitmap->lock); + + if (mappage == NULL) { + pr_debug("%s: bitmap map page allocation failed, hijacking\n", + bmname(bitmap)); + /* failed - set the hijacked flag so that we can use the + * pointer as a counter */ + if (!bitmap->bp[page].map) + bitmap->bp[page].hijacked = 1; + } else if (bitmap->bp[page].map || + bitmap->bp[page].hijacked) { + /* somebody beat us to getting the page */ + kfree(mappage); + return 0; + } else { + + /* no page was in place and we have one, so install it */ + + bitmap->bp[page].map = mappage; + bitmap->missing_pages--; + } + return 0; +} + +/* if page is completely empty, put it back on the free list, or dealloc it */ +/* if page was hijacked, unmark the flag so it might get alloced next time */ +/* Note: lock should be held when calling this */ +static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) +{ + char *ptr; + + if (bitmap->bp[page].count) /* page is still busy */ + return; + + /* page is no longer in use, it can be released */ + + if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ + bitmap->bp[page].hijacked = 0; + bitmap->bp[page].map = NULL; + } else { + /* normal case, free the page */ + ptr = bitmap->bp[page].map; + bitmap->bp[page].map = NULL; + bitmap->missing_pages++; + kfree(ptr); + } +} + +/* + * bitmap file handling - read and write the bitmap file and its superblock + */ + +/* + * basic page I/O operations + */ + +/* IO operations when bitmap is stored near all superblocks */ +static struct page *read_sb_page(struct mddev *mddev, loff_t offset, + struct page *page, + unsigned long index, int size) +{ + /* choose a good rdev and read the page from there */ + + struct md_rdev *rdev; + sector_t target; + int did_alloc = 0; + + if (!page) { + page = alloc_page(GFP_KERNEL); + if (!page) + return ERR_PTR(-ENOMEM); + did_alloc = 1; + } + + rdev_for_each(rdev, mddev) { + if (! test_bit(In_sync, &rdev->flags) + || test_bit(Faulty, &rdev->flags)) + continue; + + target = offset + index * (PAGE_SIZE/512); + + if (sync_page_io(rdev, target, + roundup(size, bdev_logical_block_size(rdev->bdev)), + page, READ, true)) { + page->index = index; + attach_page_buffers(page, NULL); /* so that free_buffer will + * quietly no-op */ + return page; + } + } + if (did_alloc) + put_page(page); + return ERR_PTR(-EIO); + +} + +static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + /* Iterate the disks of an mddev, using rcu to protect access to the + * linked list, and raising the refcount of devices we return to ensure + * they don't disappear while in use. + * As devices are only added or removed when raid_disk is < 0 and + * nr_pending is 0 and In_sync is clear, the entries we return will + * still be in the same position on the list when we re-enter + * list_for_each_continue_rcu. + */ + struct list_head *pos; + rcu_read_lock(); + if (rdev == NULL) + /* start at the beginning */ + pos = &mddev->disks; + else { + /* release the previous rdev and start from there. 
*/ + rdev_dec_pending(rdev, mddev); + pos = &rdev->same_set; + } + list_for_each_continue_rcu(pos, &mddev->disks) { + rdev = list_entry(pos, struct md_rdev, same_set); + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + /* this is a usable devices */ + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + return rdev; + } + } + rcu_read_unlock(); + return NULL; +} + +static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct md_rdev *rdev = NULL; + struct block_device *bdev; + struct mddev *mddev = bitmap->mddev; + + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { + int size = PAGE_SIZE; + loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + + if (page->index == bitmap->file_pages-1) + size = roundup(bitmap->last_page_size, + bdev_logical_block_size(bdev)); + /* Just make sure we aren't corrupting data or + * metadata + */ + if (mddev->external) { + /* Bitmap could be anywhere. */ + if (rdev->sb_start + offset + (page->index + * (PAGE_SIZE/512)) + > rdev->data_offset + && + rdev->sb_start + offset + < (rdev->data_offset + mddev->dev_sectors + + (PAGE_SIZE/512))) + goto bad_alignment; + } else if (offset < 0) { + /* DATA BITMAP METADATA */ + if (offset + + (long)(page->index * (PAGE_SIZE/512)) + + size/512 > 0) + /* bitmap runs in to metadata */ + goto bad_alignment; + if (rdev->data_offset + mddev->dev_sectors + > rdev->sb_start + offset) + /* data runs in to bitmap */ + goto bad_alignment; + } else if (rdev->sb_start < rdev->data_offset) { + /* METADATA BITMAP DATA */ + if (rdev->sb_start + + offset + + page->index*(PAGE_SIZE/512) + size/512 + > rdev->data_offset) + /* bitmap runs in to data */ + goto bad_alignment; + } else { + /* DATA METADATA BITMAP - no problems */ + } + md_super_write(mddev, rdev, + rdev->sb_start + offset + + page->index * (PAGE_SIZE/512), + size, + page); + } + + if (wait) + md_super_wait(mddev); + return 0; + + bad_alignment: + return -EINVAL; +} + +static void bitmap_file_kick(struct bitmap *bitmap); +/* + * write out a page to a file + */ +static void write_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct buffer_head *bh; + + if (bitmap->file == NULL) { + switch (write_sb_page(bitmap, page, wait)) { + case -EINVAL: + bitmap->flags |= BITMAP_WRITE_ERROR; + } + } else { + + bh = page_buffers(page); + + while (bh && bh->b_blocknr) { + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(WRITE | REQ_SYNC, bh); + bh = bh->b_this_page; + } + + if (wait) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + } + if (bitmap->flags & BITMAP_WRITE_ERROR) + bitmap_file_kick(bitmap); +} + +static void end_bitmap_write(struct buffer_head *bh, int uptodate) +{ + struct bitmap *bitmap = bh->b_private; + unsigned long flags; + + if (!uptodate) { + spin_lock_irqsave(&bitmap->lock, flags); + bitmap->flags |= BITMAP_WRITE_ERROR; + spin_unlock_irqrestore(&bitmap->lock, flags); + } + if (atomic_dec_and_test(&bitmap->pending_writes)) + wake_up(&bitmap->write_wait); +} + +/* copied from buffer.c */ +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +} +static void free_buffers(struct page *page) +{ + struct buffer_head *bh = page_buffers(page); + + while (bh) { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } + __clear_page_buffers(page); + put_page(page); +} + +/* 
read a page from a file. + * We both read the page, and attach buffers to the page to record the + * address of each block (using bmap). These addresses will be used + * to write the block later, completely bypassing the filesystem. + * This usage is similar to how swap files are handled, and allows us + * to write to a file with no concerns of memory allocation failing. + */ +static struct page *read_page(struct file *file, unsigned long index, + struct bitmap *bitmap, + unsigned long count) +{ + struct page *page = NULL; + struct inode *inode = file->f_path.dentry->d_inode; + struct buffer_head *bh; + sector_t block; + + pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT); + + page = alloc_page(GFP_KERNEL); + if (!page) + page = ERR_PTR(-ENOMEM); + if (IS_ERR(page)) + goto out; + + bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); + if (!bh) { + put_page(page); + page = ERR_PTR(-ENOMEM); + goto out; + } + attach_page_buffers(page, bh); + block = index << (PAGE_SHIFT - inode->i_blkbits); + while (bh) { + if (count == 0) + bh->b_blocknr = 0; + else { + bh->b_blocknr = bmap(inode, block); + if (bh->b_blocknr == 0) { + /* Cannot use this file! */ + free_buffers(page); + page = ERR_PTR(-EINVAL); + goto out; + } + bh->b_bdev = inode->i_sb->s_bdev; + if (count < (1<<inode->i_blkbits)) + count = 0; + else + count -= (1<<inode->i_blkbits); + + bh->b_end_io = end_bitmap_write; + bh->b_private = bitmap; + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(READ, bh); + } + block++; + bh = bh->b_this_page; + } + page->index = index; + + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + if (bitmap->flags & BITMAP_WRITE_ERROR) { + free_buffers(page); + page = ERR_PTR(-EIO); + } +out: + if (IS_ERR(page)) + printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", + (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT, + PTR_ERR(page)); + return page; +} + +/* + * bitmap file superblock operations + */ + +/* update the event counter and sync the superblock to disk */ +void bitmap_update_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + + if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ + return; + if (bitmap->mddev->bitmap_info.external) + return; + if (!bitmap->sb_page) /* no superblock */ + return; + sb = kmap_atomic(bitmap->sb_page); + sb->events = cpu_to_le64(bitmap->mddev->events); + if (bitmap->mddev->events < bitmap->events_cleared) + /* rocking back to read-only */ + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->events_cleared); + sb->state = cpu_to_le32(bitmap->flags); + /* Just in case these have been changed via sysfs: */ + sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); + sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); + kunmap_atomic(sb); + write_page(bitmap, bitmap->sb_page, 1); +} + +/* print out the bitmap file superblock */ +void bitmap_print_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + + if (!bitmap || !bitmap->sb_page) + return; + sb = kmap_atomic(bitmap->sb_page); + printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); + printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); + printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); + printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n", + *(__u32 *)(sb->uuid+0), + *(__u32 *)(sb->uuid+4), + *(__u32 *)(sb->uuid+8), + *(__u32 *)(sb->uuid+12)); + 
printk(KERN_DEBUG " events: %llu\n", + (unsigned long long) le64_to_cpu(sb->events)); + printk(KERN_DEBUG "events cleared: %llu\n", + (unsigned long long) le64_to_cpu(sb->events_cleared)); + printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state)); + printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize)); + printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); + printk(KERN_DEBUG " sync size: %llu KB\n", + (unsigned long long)le64_to_cpu(sb->sync_size)/2); + printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); + kunmap_atomic(sb); +} + +/* + * bitmap_new_disk_sb + * @bitmap + * + * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb + * reads and verifies the on-disk bitmap superblock and populates bitmap_info. + * This function verifies 'bitmap_info' and populates the on-disk bitmap + * structure, which is to be written to disk. + * + * Returns: 0 on success, -Exxx on error + */ +static int bitmap_new_disk_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + unsigned long chunksize, daemon_sleep, write_behind; + int err = -EINVAL; + + bitmap->sb_page = alloc_page(GFP_KERNEL); + if (IS_ERR(bitmap->sb_page)) { + err = PTR_ERR(bitmap->sb_page); + bitmap->sb_page = NULL; + return err; + } + bitmap->sb_page->index = 0; + + sb = kmap_atomic(bitmap->sb_page); + + sb->magic = cpu_to_le32(BITMAP_MAGIC); + sb->version = cpu_to_le32(BITMAP_MAJOR_HI); + + chunksize = bitmap->mddev->bitmap_info.chunksize; + BUG_ON(!chunksize); + if (!is_power_of_2(chunksize)) { + kunmap_atomic(sb); + printk(KERN_ERR "bitmap chunksize not a power of 2\n"); + return -EINVAL; + } + sb->chunksize = cpu_to_le32(chunksize); + + daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; + if (!daemon_sleep || + (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { + printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); + daemon_sleep = 5 * HZ; + } + sb->daemon_sleep = cpu_to_le32(daemon_sleep); + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + + /* + * FIXME: write_behind for RAID1. If not specified, what + * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. + */ + write_behind = bitmap->mddev->bitmap_info.max_write_behind; + if (write_behind > COUNTER_MAX) + write_behind = COUNTER_MAX / 2; + sb->write_behind = cpu_to_le32(write_behind); + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + + /* keep the array size field of the bitmap superblock up to date */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + + memcpy(sb->uuid, bitmap->mddev->uuid, 16); + + bitmap->flags |= BITMAP_STALE; + sb->state |= cpu_to_le32(BITMAP_STALE); + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->mddev->events); + + kunmap_atomic(sb); + + return 0; +} + +/* read the superblock from the bitmap file and initialize some bitmap fields */ +static int bitmap_read_sb(struct bitmap *bitmap) +{ + char *reason = NULL; + bitmap_super_t *sb; + unsigned long chunksize, daemon_sleep, write_behind; + unsigned long long events; + int err = -EINVAL; + + /* page 0 is the superblock, read it... */ + if (bitmap->file) { + loff_t isize = i_size_read(bitmap->file->f_mapping->host); + int bytes = isize > PAGE_SIZE ? 
PAGE_SIZE : isize; + + bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); + } else { + bitmap->sb_page = read_sb_page(bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + NULL, + 0, sizeof(bitmap_super_t)); + } + if (IS_ERR(bitmap->sb_page)) { + err = PTR_ERR(bitmap->sb_page); + bitmap->sb_page = NULL; + return err; + } + + sb = kmap_atomic(bitmap->sb_page); + + chunksize = le32_to_cpu(sb->chunksize); + daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; + write_behind = le32_to_cpu(sb->write_behind); + + /* verify that the bitmap-specific fields are valid */ + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) + reason = "bad magic"; + else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || + le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) + reason = "unrecognized superblock version"; + else if (chunksize < 512) + reason = "bitmap chunksize too small"; + else if (!is_power_of_2(chunksize)) + reason = "bitmap chunksize not a power of 2"; + else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) + reason = "daemon sleep period out of range"; + else if (write_behind > COUNTER_MAX) + reason = "write-behind limit out of range (0 - 16383)"; + if (reason) { + printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", + bmname(bitmap), reason); + goto out; + } + + /* keep the array size field of the bitmap superblock up to date */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + + if (bitmap->mddev->persistent) { + /* + * We have a persistent array superblock, so compare the + * bitmap's UUID and event counter to the mddev's + */ + if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { + printk(KERN_INFO + "%s: bitmap superblock UUID mismatch\n", + bmname(bitmap)); + goto out; + } + events = le64_to_cpu(sb->events); + if (events < bitmap->mddev->events) { + printk(KERN_INFO + "%s: bitmap file is out of date (%llu < %llu) " + "-- forcing full recovery\n", + bmname(bitmap), events, + (unsigned long long) bitmap->mddev->events); + sb->state |= cpu_to_le32(BITMAP_STALE); + } + } + + /* assign fields using values from superblock */ + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + bitmap->flags |= le32_to_cpu(sb->state); + if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) + bitmap->flags |= BITMAP_HOSTENDIAN; + bitmap->events_cleared = le64_to_cpu(sb->events_cleared); + if (bitmap->flags & BITMAP_STALE) + bitmap->events_cleared = bitmap->mddev->events; + err = 0; +out: + kunmap_atomic(sb); + if (err) + bitmap_print_sb(bitmap); + return err; +} + +enum bitmap_mask_op { + MASK_SET, + MASK_UNSET +}; + +/* record the state of the bitmap in the superblock. Return the old value */ +static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, + enum bitmap_mask_op op) +{ + bitmap_super_t *sb; + int old; + + if (!bitmap->sb_page) /* can't set the state */ + return 0; + sb = kmap_atomic(bitmap->sb_page); + old = le32_to_cpu(sb->state) & bits; + switch (op) { + case MASK_SET: + sb->state |= cpu_to_le32(bits); + bitmap->flags |= bits; + break; + case MASK_UNSET: + sb->state &= cpu_to_le32(~bits); + bitmap->flags &= ~bits; + break; + default: + BUG(); + } + kunmap_atomic(sb); + return old; +} + +/* + * general bitmap file operations + */ + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. 
+ */ +/* calculate the index of the page that contains this bit */ +static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) +{ + if (!bitmap->mddev->bitmap_info.external) + chunk += sizeof(bitmap_super_t) << 3; + return chunk >> PAGE_BIT_SHIFT; +} + +/* calculate the (bit) offset of this bit within a page */ +static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) +{ + if (!bitmap->mddev->bitmap_info.external) + chunk += sizeof(bitmap_super_t) << 3; + return chunk & (PAGE_BITS - 1); +} + +/* + * return a pointer to the page in the filemap that contains the given bit + * + * this lookup is complicated by the fact that the bitmap sb might be exactly + * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page + * 0 or page 1 + */ +static inline struct page *filemap_get_page(struct bitmap *bitmap, + unsigned long chunk) +{ + if (file_page_index(bitmap, chunk) >= bitmap->file_pages) + return NULL; + return bitmap->filemap[file_page_index(bitmap, chunk) + - file_page_index(bitmap, 0)]; +} + +static void bitmap_file_unmap(struct bitmap *bitmap) +{ + struct page **map, *sb_page; + unsigned long *attr; + int pages; + unsigned long flags; + + spin_lock_irqsave(&bitmap->lock, flags); + map = bitmap->filemap; + bitmap->filemap = NULL; + attr = bitmap->filemap_attr; + bitmap->filemap_attr = NULL; + pages = bitmap->file_pages; + bitmap->file_pages = 0; + sb_page = bitmap->sb_page; + bitmap->sb_page = NULL; + spin_unlock_irqrestore(&bitmap->lock, flags); + + while (pages--) + if (map[pages] != sb_page) /* 0 is sb_page, release it below */ + free_buffers(map[pages]); + kfree(map); + kfree(attr); + + if (sb_page) + free_buffers(sb_page); +} + +static void bitmap_file_put(struct bitmap *bitmap) +{ + struct file *file; + unsigned long flags; + + spin_lock_irqsave(&bitmap->lock, flags); + file = bitmap->file; + bitmap->file = NULL; + spin_unlock_irqrestore(&bitmap->lock, flags); + + if (file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + bitmap_file_unmap(bitmap); + + if (file) { + struct inode *inode = file->f_path.dentry->d_inode; + invalidate_mapping_pages(inode->i_mapping, 0, -1); + fput(file); + } +} + +/* + * bitmap_file_kick - if an error occurs while manipulating the bitmap file + * then it is no longer reliable, so we stop using it and we mark the file + * as failed in the superblock + */ +static void bitmap_file_kick(struct bitmap *bitmap) +{ + char *path, *ptr = NULL; + + if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { + bitmap_update_sb(bitmap); + + if (bitmap->file) { + path = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (path) + ptr = d_path(&bitmap->file->f_path, path, + PAGE_SIZE); + + printk(KERN_ALERT + "%s: kicking failed bitmap file %s from array!\n", + bmname(bitmap), IS_ERR(ptr) ? "" : ptr); + + kfree(path); + } else + printk(KERN_ALERT + "%s: disabling internal bitmap due to errors\n", + bmname(bitmap)); + } + + bitmap_file_put(bitmap); + + return; +} + +enum bitmap_page_attr { + BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ + BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned. + * i.e. counter is 1 or 2. 
*/ + BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ +}; + +static inline void set_page_attr(struct bitmap *bitmap, struct page *page, + enum bitmap_page_attr attr) +{ + __set_bit((page->index<<2) + attr, bitmap->filemap_attr); +} + +static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, + enum bitmap_page_attr attr) +{ + __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); +} + +static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, + enum bitmap_page_attr attr) +{ + return test_bit((page->index<<2) + attr, bitmap->filemap_attr); +} + +/* + * bitmap_file_set_bit -- called before performing a write to the md device + * to set (and eventually sync) a particular bit in the bitmap file + * + * we set the bit immediately, then we record the page number so that + * when an unplug occurs, we can flush the dirty pages out to disk + */ +static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *kaddr; + unsigned long chunk = block >> bitmap->chunkshift; + + if (!bitmap->filemap) + return; + + page = filemap_get_page(bitmap, chunk); + if (!page) + return; + bit = file_page_offset(bitmap, chunk); + + /* set the bit */ + kaddr = kmap_atomic(page); + if (bitmap->flags & BITMAP_HOSTENDIAN) + set_bit(bit, kaddr); + else + __set_bit_le(bit, kaddr); + kunmap_atomic(kaddr); + pr_debug("set file bit %lu page %lu\n", bit, page->index); + /* record page number so it gets flushed to disk when unplug occurs */ + set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); +} + +/* this gets called when the md device is ready to unplug its underlying + * (slave) device queues -- before we let any writes go down, we need to + * sync the dirty pages of the bitmap file to disk */ +void bitmap_unplug(struct bitmap *bitmap) +{ + unsigned long i, flags; + int dirty, need_write; + struct page *page; + int wait = 0; + + if (!bitmap) + return; + + /* look at each page to see if there are any set bits that need to be + * flushed out to disk */ + for (i = 0; i < bitmap->file_pages; i++) { + spin_lock_irqsave(&bitmap->lock, flags); + if (!bitmap->filemap) { + spin_unlock_irqrestore(&bitmap->lock, flags); + return; + } + page = bitmap->filemap[i]; + dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); + need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); + clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); + clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); + if (dirty) + wait = 1; + spin_unlock_irqrestore(&bitmap->lock, flags); + + if (dirty || need_write) + write_page(bitmap, page, 0); + } + if (wait) { /* if any writes were performed, we need to wait on them */ + if (bitmap->file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + else + md_super_wait(bitmap->mddev); + } + if (bitmap->flags & BITMAP_WRITE_ERROR) + bitmap_file_kick(bitmap); +} +EXPORT_SYMBOL(bitmap_unplug); + +static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); +/* * bitmap_init_from_disk -- called at bitmap_create time to initialize + * the in-memory bitmap from the on-disk bitmap -- also, sets up the + * memory mapping of the bitmap file + * Special cases: + * if there's no bitmap file, or if the bitmap file had been + * previously kicked from the array, we mark all the bits as + * 1's in order to cause a full resync. + * + * We ignore all bits for sectors that end earlier than 'start'. 
+ * This is used when reading an out-of-date bitmap... + */ +static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) +{ + unsigned long i, chunks, index, oldindex, bit; + struct page *page = NULL, *oldpage = NULL; + unsigned long num_pages, bit_cnt = 0; + struct file *file; + unsigned long bytes, offset; + int outofdate; + int ret = -ENOSPC; + void *paddr; + + chunks = bitmap->chunks; + file = bitmap->file; + + BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); + + outofdate = bitmap->flags & BITMAP_STALE; + if (outofdate) + printk(KERN_INFO "%s: bitmap file is out of date, doing full " + "recovery\n", bmname(bitmap)); + + bytes = DIV_ROUND_UP(bitmap->chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + + num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); + + if (file && i_size_read(file->f_mapping->host) < bytes) { + printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", + bmname(bitmap), + (unsigned long) i_size_read(file->f_mapping->host), + bytes); + goto err; + } + + ret = -ENOMEM; + + bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + if (!bitmap->filemap) + goto err; + + /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ + bitmap->filemap_attr = kzalloc( + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), + GFP_KERNEL); + if (!bitmap->filemap_attr) + goto err; + + oldindex = ~0L; + + for (i = 0; i < chunks; i++) { + int b; + index = file_page_index(bitmap, i); + bit = file_page_offset(bitmap, i); + if (index != oldindex) { /* this is a new page, read it in */ + int count; + /* unmap the old page, we're done with it */ + if (index == num_pages-1) + count = bytes - index * PAGE_SIZE; + else + count = PAGE_SIZE; + if (index == 0 && bitmap->sb_page) { + /* + * if we're here then the superblock page + * contains some bits (PAGE_SIZE != sizeof sb) + * we've already read it in, so just use it + */ + page = bitmap->sb_page; + offset = sizeof(bitmap_super_t); + if (!file) + page = read_sb_page( + bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + page, + index, count); + } else if (file) { + page = read_page(file, index, bitmap, count); + offset = 0; + } else { + page = read_sb_page(bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + NULL, + index, count); + offset = 0; + } + if (IS_ERR(page)) { /* read error */ + ret = PTR_ERR(page); + goto err; + } + + oldindex = index; + oldpage = page; + + bitmap->filemap[bitmap->file_pages++] = page; + bitmap->last_page_size = count; + + if (outofdate) { + /* + * if bitmap is out of date, dirty the + * whole page and write it out + */ + paddr = kmap_atomic(page); + memset(paddr + offset, 0xff, + PAGE_SIZE - offset); + kunmap_atomic(paddr); + write_page(bitmap, page, 1); + + ret = -EIO; + if (bitmap->flags & BITMAP_WRITE_ERROR) + goto err; + } + } + paddr = kmap_atomic(page); + if (bitmap->flags & BITMAP_HOSTENDIAN) + b = test_bit(bit, paddr); + else + b = test_bit_le(bit, paddr); + kunmap_atomic(paddr); + if (b) { + /* if the disk bit is set, set the memory bit */ + int needed = ((sector_t)(i+1) << bitmap->chunkshift + >= start); + bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->chunkshift, + needed); + bit_cnt++; + } + } + + /* everything went OK */ + ret = 0; + bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); + + if (bit_cnt) { /* Kick recovery if any bits were set */ + set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); + md_wakeup_thread(bitmap->mddev->thread); + } + + printk(KERN_INFO "%s: bitmap initialized 
from disk: " + "read %lu/%lu pages, set %lu of %lu bits\n", + bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); + + return 0; + + err: + printk(KERN_INFO "%s: bitmap initialisation failed: %d\n", + bmname(bitmap), ret); + return ret; +} + +void bitmap_write_all(struct bitmap *bitmap) +{ + /* We don't actually write all bitmap blocks here, + * just flag them as needing to be written + */ + int i; + + spin_lock_irq(&bitmap->lock); + for (i = 0; i < bitmap->file_pages; i++) + set_page_attr(bitmap, bitmap->filemap[i], + BITMAP_PAGE_NEEDWRITE); + bitmap->allclean = 0; + spin_unlock_irq(&bitmap->lock); +} + +static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) +{ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + bitmap->bp[page].count += inc; + bitmap_checkfree(bitmap, page); +} +static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, + sector_t offset, sector_t *blocks, + int create); + +/* + * bitmap daemon -- periodically wakes up to clean bits and flush pages + * out to disk + */ + +void bitmap_daemon_work(struct mddev *mddev) +{ + struct bitmap *bitmap; + unsigned long j; + unsigned long flags; + struct page *page = NULL, *lastpage = NULL; + sector_t blocks; + void *paddr; + + /* Use a mutex to guard daemon_work against + * bitmap_destroy. + */ + mutex_lock(&mddev->bitmap_info.mutex); + bitmap = mddev->bitmap; + if (bitmap == NULL) { + mutex_unlock(&mddev->bitmap_info.mutex); + return; + } + if (time_before(jiffies, bitmap->daemon_lastrun + + mddev->bitmap_info.daemon_sleep)) + goto done; + + bitmap->daemon_lastrun = jiffies; + if (bitmap->allclean) { + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + goto done; + } + bitmap->allclean = 1; + + spin_lock_irqsave(&bitmap->lock, flags); + for (j = 0; j < bitmap->chunks; j++) { + bitmap_counter_t *bmc; + if (!bitmap->filemap) + /* error or shutdown */ + break; + + page = filemap_get_page(bitmap, j); + + if (page != lastpage) { + /* skip this page unless it's marked as needing cleaning */ + if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) { + int need_write = test_page_attr(bitmap, page, + BITMAP_PAGE_NEEDWRITE); + if (need_write) + clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); + + spin_unlock_irqrestore(&bitmap->lock, flags); + if (need_write) + write_page(bitmap, page, 0); + spin_lock_irqsave(&bitmap->lock, flags); + j |= (PAGE_BITS - 1); + continue; + } + + /* grab the new page, sync and release the old */ + if (lastpage != NULL) { + if (test_page_attr(bitmap, lastpage, + BITMAP_PAGE_NEEDWRITE)) { + clear_page_attr(bitmap, lastpage, + BITMAP_PAGE_NEEDWRITE); + spin_unlock_irqrestore(&bitmap->lock, flags); + write_page(bitmap, lastpage, 0); + } else { + set_page_attr(bitmap, lastpage, + BITMAP_PAGE_NEEDWRITE); + bitmap->allclean = 0; + spin_unlock_irqrestore(&bitmap->lock, flags); + } + } else + spin_unlock_irqrestore(&bitmap->lock, flags); + lastpage = page; + + /* We are possibly going to clear some bits, so make + * sure that events_cleared is up-to-date. 
+ */ + if (bitmap->need_sync && + mddev->bitmap_info.external == 0) { + bitmap_super_t *sb; + bitmap->need_sync = 0; + sb = kmap_atomic(bitmap->sb_page); + sb->events_cleared = + cpu_to_le64(bitmap->events_cleared); + kunmap_atomic(sb); + write_page(bitmap, bitmap->sb_page, 1); + } + spin_lock_irqsave(&bitmap->lock, flags); + if (!bitmap->need_sync) + clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING); + else + bitmap->allclean = 0; + } + bmc = bitmap_get_counter(bitmap, + (sector_t)j << bitmap->chunkshift, + &blocks, 0); + if (!bmc) + j |= PAGE_COUNTER_MASK; + else if (*bmc) { + if (*bmc == 1 && !bitmap->need_sync) { + /* we can clear the bit */ + *bmc = 0; + bitmap_count_page(bitmap, + (sector_t)j << bitmap->chunkshift, + -1); + + /* clear the bit */ + paddr = kmap_atomic(page); + if (bitmap->flags & BITMAP_HOSTENDIAN) + clear_bit(file_page_offset(bitmap, j), + paddr); + else + __clear_bit_le( + file_page_offset(bitmap, + j), + paddr); + kunmap_atomic(paddr); + } else if (*bmc <= 2) { + *bmc = 1; /* maybe clear the bit next time */ + set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); + bitmap->allclean = 0; + } + } + } + spin_unlock_irqrestore(&bitmap->lock, flags); + + /* now sync the final page */ + if (lastpage != NULL) { + spin_lock_irqsave(&bitmap->lock, flags); + if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { + clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + spin_unlock_irqrestore(&bitmap->lock, flags); + write_page(bitmap, lastpage, 0); + } else { + set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + bitmap->allclean = 0; + spin_unlock_irqrestore(&bitmap->lock, flags); + } + } + + done: + if (bitmap->allclean == 0) + mddev->thread->timeout = + mddev->bitmap_info.daemon_sleep; + mutex_unlock(&mddev->bitmap_info.mutex); +} + +static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, + sector_t offset, sector_t *blocks, + int create) +__releases(bitmap->lock) +__acquires(bitmap->lock) +{ + /* If 'create', we might release the lock and reclaim it. + * The lock must have been taken with interrupts enabled. + * If !create, we don't release the lock. + */ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; + sector_t csize; + int err; + + err = bitmap_checkpage(bitmap, page, create); + + if (bitmap->bp[page].hijacked || + bitmap->bp[page].map == NULL) + csize = ((sector_t)1) << (bitmap->chunkshift + + PAGE_COUNTER_SHIFT - 1); + else + csize = ((sector_t)1) << bitmap->chunkshift; + *blocks = csize - (offset & (csize - 1)); + + if (err < 0) + return NULL; + + /* now locked ... */ + + if (bitmap->bp[page].hijacked) { /* hijacked pointer */ + /* should we use the first or second counter field + * of the hijacked pointer? 
*/ + int hi = (pageoff > PAGE_COUNTER_MASK); + return &((bitmap_counter_t *) + &bitmap->bp[page].map)[hi]; + } else /* page is allocated */ + return (bitmap_counter_t *) + &(bitmap->bp[page].map[pageoff]); +} + +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) +{ + if (!bitmap) + return 0; + + if (behind) { + int bw; + atomic_inc(&bitmap->behind_writes); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + + pr_debug("inc write-behind count %d/%lu\n", + bw, bitmap->mddev->bitmap_info.max_write_behind); + } + + while (sectors) { + sector_t blocks; + bitmap_counter_t *bmc; + + spin_lock_irq(&bitmap->lock); + bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); + if (!bmc) { + spin_unlock_irq(&bitmap->lock); + return 0; + } + + if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { + DEFINE_WAIT(__wait); + /* note that it is safe to do the prepare_to_wait + * after the test as long as we do it before dropping + * the spinlock. + */ + prepare_to_wait(&bitmap->overflow_wait, &__wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&bitmap->lock); + io_schedule(); + finish_wait(&bitmap->overflow_wait, &__wait); + continue; + } + + switch (*bmc) { + case 0: + bitmap_file_set_bit(bitmap, offset); + bitmap_count_page(bitmap, offset, 1); + /* fall through */ + case 1: + *bmc = 2; + } + + (*bmc)++; + + spin_unlock_irq(&bitmap->lock); + + offset += blocks; + if (sectors > blocks) + sectors -= blocks; + else + sectors = 0; + } + return 0; +} +EXPORT_SYMBOL(bitmap_startwrite); + +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, + int success, int behind) +{ + if (!bitmap) + return; + if (behind) { + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); + pr_debug("dec write-behind count %d/%lu\n", + atomic_read(&bitmap->behind_writes), + bitmap->mddev->bitmap_info.max_write_behind); + } + + while (sectors) { + sector_t blocks; + unsigned long flags; + bitmap_counter_t *bmc; + + spin_lock_irqsave(&bitmap->lock, flags); + bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); + if (!bmc) { + spin_unlock_irqrestore(&bitmap->lock, flags); + return; + } + + if (success && !bitmap->mddev->degraded && + bitmap->events_cleared < bitmap->mddev->events) { + bitmap->events_cleared = bitmap->mddev->events; + bitmap->need_sync = 1; + sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); + } + + if (!success && !NEEDED(*bmc)) + *bmc |= NEEDED_MASK; + + if (COUNTER(*bmc) == COUNTER_MAX) + wake_up(&bitmap->overflow_wait); + + (*bmc)--; + if (*bmc <= 2) { + set_page_attr(bitmap, + filemap_get_page( + bitmap, + offset >> bitmap->chunkshift), + BITMAP_PAGE_PENDING); + bitmap->allclean = 0; + } + spin_unlock_irqrestore(&bitmap->lock, flags); + offset += blocks; + if (sectors > blocks) + sectors -= blocks; + else + sectors = 0; + } +} +EXPORT_SYMBOL(bitmap_endwrite); + +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) +{ + bitmap_counter_t *bmc; + int rv; + if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ + *blocks = 1024; + return 1; /* always resync if no bitmap */ + } + spin_lock_irq(&bitmap->lock); + bmc = bitmap_get_counter(bitmap, offset, blocks, 0); + rv = 0; + if (bmc) { + /* locked */ + if (RESYNC(*bmc)) + rv = 1; + else if (NEEDED(*bmc)) { + rv = 1; + if (!degraded) { /* don't set/clear bits if degraded */ + *bmc |= RESYNC_MASK; + *bmc &= ~NEEDED_MASK; + } + } + } + 
spin_unlock_irq(&bitmap->lock); + return rv; +} + +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) +{ + /* bitmap_start_sync must always report on multiples of whole + * pages, otherwise resync (which is very PAGE_SIZE based) will + * get confused. + * So call __bitmap_start_sync repeatedly (if needed) until + * At least PAGE_SIZE>>9 blocks are covered. + * Return the 'or' of the result. + */ + int rv = 0; + sector_t blocks1; + + *blocks = 0; + while (*blocks < (PAGE_SIZE>>9)) { + rv |= __bitmap_start_sync(bitmap, offset, + &blocks1, degraded); + offset += blocks1; + *blocks += blocks1; + } + return rv; +} +EXPORT_SYMBOL(bitmap_start_sync); + +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) +{ + bitmap_counter_t *bmc; + unsigned long flags; + + if (bitmap == NULL) { + *blocks = 1024; + return; + } + spin_lock_irqsave(&bitmap->lock, flags); + bmc = bitmap_get_counter(bitmap, offset, blocks, 0); + if (bmc == NULL) + goto unlock; + /* locked */ + if (RESYNC(*bmc)) { + *bmc &= ~RESYNC_MASK; + + if (!NEEDED(*bmc) && aborted) + *bmc |= NEEDED_MASK; + else { + if (*bmc <= 2) { + set_page_attr(bitmap, + filemap_get_page(bitmap, offset >> bitmap->chunkshift), + BITMAP_PAGE_PENDING); + bitmap->allclean = 0; + } + } + } + unlock: + spin_unlock_irqrestore(&bitmap->lock, flags); +} +EXPORT_SYMBOL(bitmap_end_sync); + +void bitmap_close_sync(struct bitmap *bitmap) +{ + /* Sync has finished, and any bitmap chunks that weren't synced + * properly have been aborted. It remains to us to clear the + * RESYNC bit wherever it is still on + */ + sector_t sector = 0; + sector_t blocks; + if (!bitmap) + return; + while (sector < bitmap->mddev->resync_max_sectors) { + bitmap_end_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } +} +EXPORT_SYMBOL(bitmap_close_sync); + +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) +{ + sector_t s = 0; + sector_t blocks; + + if (!bitmap) + return; + if (sector == 0) { + bitmap->last_end_sync = jiffies; + return; + } + if (time_before(jiffies, (bitmap->last_end_sync + + bitmap->mddev->bitmap_info.daemon_sleep))) + return; + wait_event(bitmap->mddev->recovery_wait, + atomic_read(&bitmap->mddev->recovery_active) == 0); + + bitmap->mddev->curr_resync_completed = sector; + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); + sector &= ~((1ULL << bitmap->chunkshift) - 1); + s = 0; + while (s < sector && s < bitmap->mddev->resync_max_sectors) { + bitmap_end_sync(bitmap, s, &blocks, 0); + s += blocks; + } + bitmap->last_end_sync = jiffies; + sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); +} +EXPORT_SYMBOL(bitmap_cond_end_sync); + +static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) +{ + /* For each chunk covered by any of these sectors, set the + * counter to 1 and set resync_needed. They should all + * be 0 at this point + */ + + sector_t secs; + bitmap_counter_t *bmc; + spin_lock_irq(&bitmap->lock); + bmc = bitmap_get_counter(bitmap, offset, &secs, 1); + if (!bmc) { + spin_unlock_irq(&bitmap->lock); + return; + } + if (!*bmc) { + struct page *page; + *bmc = 2 | (needed ? 
NEEDED_MASK : 0); + bitmap_count_page(bitmap, offset, 1); + page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); + set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); + bitmap->allclean = 0; + } + spin_unlock_irq(&bitmap->lock); +} + +/* dirty the memory and file bits for bitmap chunks "s" to "e" */ +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +{ + unsigned long chunk; + + for (chunk = s; chunk <= e; chunk++) { + sector_t sec = (sector_t)chunk << bitmap->chunkshift; + bitmap_set_memory_bits(bitmap, sec, 1); + spin_lock_irq(&bitmap->lock); + bitmap_file_set_bit(bitmap, sec); + spin_unlock_irq(&bitmap->lock); + if (sec < bitmap->mddev->recovery_cp) + /* We are asserting that the array is dirty, + * so move the recovery_cp address back so + * that it is obvious that it is dirty + */ + bitmap->mddev->recovery_cp = sec; + } +} + +/* + * flush out any pending updates + */ +void bitmap_flush(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + long sleep; + + if (!bitmap) /* there was no bitmap */ + return; + + /* run the daemon_work three time to ensure everything is flushed + * that can be + */ + sleep = mddev->bitmap_info.daemon_sleep * 2; + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap_update_sb(bitmap); +} + +/* + * free memory that was allocated + */ +static void bitmap_free(struct bitmap *bitmap) +{ + unsigned long k, pages; + struct bitmap_page *bp; + + if (!bitmap) /* there was no bitmap */ + return; + + /* release the bitmap file and kill the daemon */ + bitmap_file_put(bitmap); + + bp = bitmap->bp; + pages = bitmap->pages; + + /* free all allocated memory */ + + if (bp) /* deallocate the page memory */ + for (k = 0; k < pages; k++) + if (bp[k].map && !bp[k].hijacked) + kfree(bp[k].map); + kfree(bp); + kfree(bitmap); +} + +void bitmap_destroy(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) /* there was no bitmap */ + return; + + mutex_lock(&mddev->bitmap_info.mutex); + mddev->bitmap = NULL; /* disconnect from the md device */ + mutex_unlock(&mddev->bitmap_info.mutex); + if (mddev->thread) + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + + if (bitmap->sysfs_can_clear) + sysfs_put(bitmap->sysfs_can_clear); + + bitmap_free(bitmap); +} + +/* + * initialize the bitmap structure + * if this returns an error, bitmap_destroy must be called to do clean up + */ +int bitmap_create(struct mddev *mddev) +{ + struct bitmap *bitmap; + sector_t blocks = mddev->resync_max_sectors; + unsigned long chunks; + unsigned long pages; + struct file *file = mddev->bitmap_info.file; + int err; + struct sysfs_dirent *bm = NULL; + + BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); + + if (!file + && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ + return 0; + + BUG_ON(file && mddev->bitmap_info.offset); + + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + spin_lock_init(&bitmap->lock); + atomic_set(&bitmap->pending_writes, 0); + init_waitqueue_head(&bitmap->write_wait); + init_waitqueue_head(&bitmap->overflow_wait); + init_waitqueue_head(&bitmap->behind_wait); + + bitmap->mddev = mddev; + + if (mddev->kobj.sd) + bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); + if (bm) { + bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear"); + sysfs_put(bm); + } else + bitmap->sysfs_can_clear = NULL; + + bitmap->file = 
file; + if (file) { + get_file(file); + /* As future accesses to this file will use bmap, + * and bypass the page cache, we must sync the file + * first. + */ + vfs_fsync(file, 1); + } + /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ + if (!mddev->bitmap_info.external) { + /* + * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is + * instructing us to create a new on-disk bitmap instance. + */ + if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) + err = bitmap_new_disk_sb(bitmap); + else + err = bitmap_read_sb(bitmap); + } else { + err = 0; + if (mddev->bitmap_info.chunksize == 0 || + mddev->bitmap_info.daemon_sleep == 0) + /* chunksize and time_base need to be + * set first. */ + err = -EINVAL; + } + if (err) + goto error; + + bitmap->daemon_lastrun = jiffies; + bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) + - BITMAP_BLOCK_SHIFT); + + chunks = (blocks + (1 << bitmap->chunkshift) - 1) >> + bitmap->chunkshift; + pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; + + BUG_ON(!pages); + + bitmap->chunks = chunks; + bitmap->pages = pages; + bitmap->missing_pages = pages; + + bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); + + err = -ENOMEM; + if (!bitmap->bp) + goto error; + + printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", + pages, bmname(bitmap)); + + mddev->bitmap = bitmap; + + + return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; + + error: + bitmap_free(bitmap); + return err; +} + +int bitmap_load(struct mddev *mddev) +{ + int err = 0; + sector_t start = 0; + sector_t sector = 0; + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + goto out; + + /* Clear out old bitmap info first: Either there is none, or we + * are resuming after someone else has possibly changed things, + * so we should forget old cached info. + * All chunks should be clean, but some might need_sync. + */ + while (sector < mddev->resync_max_sectors) { + sector_t blocks; + bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + bitmap_close_sync(bitmap); + + if (mddev->degraded == 0 + || bitmap->events_cleared == mddev->events) + /* no need to keep dirty bits to optimise a + * re-add of a missing device */ + start = mddev->recovery_cp; + + mutex_lock(&mddev->bitmap_info.mutex); + err = bitmap_init_from_disk(bitmap, start); + mutex_unlock(&mddev->bitmap_info.mutex); + + if (err) + goto out; + + mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + md_wakeup_thread(mddev->thread); + + bitmap_update_sb(bitmap); + + if (bitmap->flags & BITMAP_WRITE_ERROR) + err = -EIO; +out: + return err; +} +EXPORT_SYMBOL_GPL(bitmap_load); + +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +{ + unsigned long chunk_kb; + unsigned long flags; + + if (!bitmap) + return; + + spin_lock_irqsave(&bitmap->lock, flags); + chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " + "%lu%s chunk", + bitmap->pages - bitmap->missing_pages, + bitmap->pages, + (bitmap->pages - bitmap->missing_pages) + << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, + chunk_kb ? 
"KB" : "B"); + if (bitmap->file) { + seq_printf(seq, ", file: "); + seq_path(seq, &bitmap->file->f_path, " \t\n"); + } + + seq_printf(seq, "\n"); + spin_unlock_irqrestore(&bitmap->lock, flags); +} + +static ssize_t +location_show(struct mddev *mddev, char *page) +{ + ssize_t len; + if (mddev->bitmap_info.file) + len = sprintf(page, "file"); + else if (mddev->bitmap_info.offset) + len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); + else + len = sprintf(page, "none"); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +location_store(struct mddev *mddev, const char *buf, size_t len) +{ + + if (mddev->pers) { + if (!mddev->pers->quiesce) + return -EBUSY; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + } + + if (mddev->bitmap || mddev->bitmap_info.file || + mddev->bitmap_info.offset) { + /* bitmap already configured. Only option is to clear it */ + if (strncmp(buf, "none", 4) != 0) + return -EBUSY; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } + mddev->bitmap_info.offset = 0; + if (mddev->bitmap_info.file) { + struct file *f = mddev->bitmap_info.file; + mddev->bitmap_info.file = NULL; + restore_bitmap_write_access(f); + fput(f); + } + } else { + /* No bitmap, OK to set a location */ + long long offset; + if (strncmp(buf, "none", 4) == 0) + /* nothing to be done */; + else if (strncmp(buf, "file:", 5) == 0) { + /* Not supported yet */ + return -EINVAL; + } else { + int rv; + if (buf[0] == '+') + rv = strict_strtoll(buf+1, 10, &offset); + else + rv = strict_strtoll(buf, 10, &offset); + if (rv) + return rv; + if (offset == 0) + return -EINVAL; + if (mddev->bitmap_info.external == 0 && + mddev->major_version == 0 && + offset != mddev->bitmap_info.default_offset) + return -EINVAL; + mddev->bitmap_info.offset = offset; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); + if (rv) { + bitmap_destroy(mddev); + mddev->bitmap_info.offset = 0; + } + mddev->pers->quiesce(mddev, 0); + if (rv) + return rv; + } + } + } + if (!mddev->external) { + /* Ensure new bitmap info is stored in + * metadata promptly. + */ + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } + return len; +} + +static struct md_sysfs_entry bitmap_location = +__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); + +static ssize_t +timeout_show(struct mddev *mddev, char *page) +{ + ssize_t len; + unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; + unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; + + len = sprintf(page, "%lu", secs); + if (jifs) + len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +timeout_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* timeout can be set at any time */ + unsigned long timeout; + int rv = strict_strtoul_scaled(buf, &timeout, 4); + if (rv) + return rv; + + /* just to make sure we don't overflow... 
*/ + if (timeout >= LONG_MAX / HZ) + return -EINVAL; + + timeout = timeout * HZ / 10000; + + if (timeout >= MAX_SCHEDULE_TIMEOUT) + timeout = MAX_SCHEDULE_TIMEOUT-1; + if (timeout < 1) + timeout = 1; + mddev->bitmap_info.daemon_sleep = timeout; + if (mddev->thread) { + /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then + * the bitmap is all clean and we don't need to + * adjust the timeout right now + */ + if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { + mddev->thread->timeout = timeout; + md_wakeup_thread(mddev->thread); + } + } + return len; +} + +static struct md_sysfs_entry bitmap_timeout = +__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); + +static ssize_t +backlog_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); +} + +static ssize_t +backlog_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long backlog; + int rv = strict_strtoul(buf, 10, &backlog); + if (rv) + return rv; + if (backlog > COUNTER_MAX) + return -EINVAL; + mddev->bitmap_info.max_write_behind = backlog; + return len; +} + +static struct md_sysfs_entry bitmap_backlog = +__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); + +static ssize_t +chunksize_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); +} + +static ssize_t +chunksize_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* Can only be changed when no bitmap is active */ + int rv; + unsigned long csize; + if (mddev->bitmap) + return -EBUSY; + rv = strict_strtoul(buf, 10, &csize); + if (rv) + return rv; + if (csize < 512 || + !is_power_of_2(csize)) + return -EINVAL; + mddev->bitmap_info.chunksize = csize; + return len; +} + +static struct md_sysfs_entry bitmap_chunksize = +__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", (mddev->bitmap_info.external + ? "external" : "internal")); +} + +static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap || + mddev->bitmap_info.file || + mddev->bitmap_info.offset) + return -EBUSY; + if (strncmp(buf, "external", 8) == 0) + mddev->bitmap_info.external = 1; + else if (strncmp(buf, "internal", 8) == 0) + mddev->bitmap_info.external = 0; + else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_metadata = +__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static ssize_t can_clear_show(struct mddev *mddev, char *page) +{ + int len; + if (mddev->bitmap) + len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? 
+ "false" : "true")); + else + len = sprintf(page, "\n"); + return len; +} + +static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap == NULL) + return -ENOENT; + if (strncmp(buf, "false", 5) == 0) + mddev->bitmap->need_sync = 1; + else if (strncmp(buf, "true", 4) == 0) { + if (mddev->degraded) + return -EBUSY; + mddev->bitmap->need_sync = 0; + } else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_can_clear = +__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); + +static ssize_t +behind_writes_used_show(struct mddev *mddev, char *page) +{ + if (mddev->bitmap == NULL) + return sprintf(page, "0\n"); + return sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); +} + +static ssize_t +behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap) + mddev->bitmap->behind_writes_used = 0; + return len; +} + +static struct md_sysfs_entry max_backlog_used = +__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, + behind_writes_used_show, behind_writes_used_reset); + +static struct attribute *md_bitmap_attrs[] = { + &bitmap_location.attr, + &bitmap_timeout.attr, + &bitmap_backlog.attr, + &bitmap_chunksize.attr, + &bitmap_metadata.attr, + &bitmap_can_clear.attr, + &max_backlog_used.attr, + NULL +}; +struct attribute_group md_bitmap_group = { + .name = "bitmap", + .attrs = md_bitmap_attrs, +}; + diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 00000000..b44b0aba --- /dev/null +++ b/drivers/md/bitmap.h @@ -0,0 +1,247 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. 
+ * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SHIFT 9 + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ + BITMAP_HOSTENDIAN = 0x8000, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + + __u8 pad[256 - 64]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. 
+ * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + struct mddev *mddev; /* the md device that the bitmap is for */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + __u64 events_cleared; + int need_sync; + + /* bitmap spinlock */ + spinlock_t lock; + + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + int last_page_size; /* bytes in the last page */ + + unsigned long flags; + + int allclean; + + atomic_t behind_writes; + unsigned long behind_writes_used; /* highest actual value at runtime */ + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; + + struct sysfs_dirent *sysfs_can_clear; +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(struct mddev *mddev); +int bitmap_load(struct mddev *mddev); +void bitmap_flush(struct mddev *mddev); +void bitmap_destroy(struct mddev *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); + +void bitmap_unplug(struct bitmap 
*bitmap); +void bitmap_daemon_work(struct mddev *mddev); +#endif + +#endif diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h new file mode 100644 index 00000000..3a8cfa26 --- /dev/null +++ b/drivers/md/dm-bio-record.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_BIO_RECORD_H +#define DM_BIO_RECORD_H + +#include <linux/bio.h> + +/* + * There are lots of mutable fields in the bio struct that get + * changed by the lower levels of the block layer. Some targets, + * such as multipath, may wish to resubmit a bio on error. The + * functions in this file help the target record and restore the + * original bio state. + */ + +struct dm_bio_vec_details { +#if PAGE_SIZE < 65536 + __u16 bv_len; + __u16 bv_offset; +#else + unsigned bv_len; + unsigned bv_offset; +#endif +}; + +struct dm_bio_details { + sector_t bi_sector; + struct block_device *bi_bdev; + unsigned int bi_size; + unsigned short bi_idx; + unsigned long bi_flags; + struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; +}; + +static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) +{ + unsigned i; + + bd->bi_sector = bio->bi_sector; + bd->bi_bdev = bio->bi_bdev; + bd->bi_size = bio->bi_size; + bd->bi_idx = bio->bi_idx; + bd->bi_flags = bio->bi_flags; + + for (i = 0; i < bio->bi_vcnt; i++) { + bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len; + bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset; + } +} + +static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) +{ + unsigned i; + + bio->bi_sector = bd->bi_sector; + bio->bi_bdev = bd->bi_bdev; + bio->bi_size = bd->bi_size; + bio->bi_idx = bd->bi_idx; + bio->bi_flags = bd->bi_flags; + + for (i = 0; i < bio->bi_vcnt; i++) { + bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len; + bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset; + } +} + +#endif diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c new file mode 100644 index 00000000..cc06a1e5 --- /dev/null +++ b/drivers/md/dm-bufio.c @@ -0,0 +1,1755 @@ +/* + * Copyright (C) 2009-2011 Red Hat, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * This file is released under the GPL. + */ + +#include "dm-bufio.h" + +#include <linux/device-mapper.h> +#include <linux/dm-io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/shrinker.h> +#include <linux/module.h> + +#define DM_MSG_PREFIX "bufio" + +/* + * Memory management policy: + * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory + * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower). + * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers. + * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT + * dirty buffers. + */ +#define DM_BUFIO_MIN_BUFFERS 8 + +#define DM_BUFIO_MEMORY_PERCENT 2 +#define DM_BUFIO_VMALLOC_PERCENT 25 +#define DM_BUFIO_WRITEBACK_PERCENT 75 + +/* + * Check buffer ages in this interval (seconds) + */ +#define DM_BUFIO_WORK_TIMER_SECS 10 + +/* + * Free buffers when they are older than this (seconds) + */ +#define DM_BUFIO_DEFAULT_AGE_SECS 60 + +/* + * The number of bvec entries that are embedded directly in the buffer. + * If the chunk size is larger, dm-io is used to do the io. 
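 *
 * For illustration (editorial note): with 4KB pages and
 * DM_BUFIO_INLINE_VECS == 16, submit_io() below takes the inline bio
 * path for block sizes up to 64KB, as long as the buffer data was not
 * vmalloc'ed; larger or vmalloc'ed buffers are sent through dm-io.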
+ */ +#define DM_BUFIO_INLINE_VECS 16 + +/* + * Buffer hash + */ +#define DM_BUFIO_HASH_BITS 20 +#define DM_BUFIO_HASH(block) \ + ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \ + ((1 << DM_BUFIO_HASH_BITS) - 1)) + +/* + * Don't try to use kmem_cache_alloc for blocks larger than this. + * For explanation, see alloc_buffer_data below. + */ +#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1) +#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1)) + +/* + * dm_buffer->list_mode + */ +#define LIST_CLEAN 0 +#define LIST_DIRTY 1 +#define LIST_SIZE 2 + +/* + * Linking of buffers: + * All buffers are linked to cache_hash with their hash_list field. + * + * Clean buffers that are not being written (B_WRITING not set) + * are linked to lru[LIST_CLEAN] with their lru_list field. + * + * Dirty and clean buffers that are being written are linked to + * lru[LIST_DIRTY] with their lru_list field. When the write + * finishes, the buffer cannot be relinked immediately (because we + * are in an interrupt context and relinking requires process + * context), so some clean-not-writing buffers can be held on + * dirty_lru too. They are later added to lru in the process + * context. + */ +struct dm_bufio_client { + struct mutex lock; + + struct list_head lru[LIST_SIZE]; + unsigned long n_buffers[LIST_SIZE]; + + struct block_device *bdev; + unsigned block_size; + unsigned char sectors_per_block_bits; + unsigned char pages_per_block_bits; + unsigned char blocks_per_page_bits; + unsigned aux_size; + void (*alloc_callback)(struct dm_buffer *); + void (*write_callback)(struct dm_buffer *); + + struct dm_io_client *dm_io; + + struct list_head reserved_buffers; + unsigned need_reserved_buffers; + + struct hlist_head *cache_hash; + wait_queue_head_t free_buffer_wait; + + int async_write_error; + + struct list_head client_list; + struct shrinker shrinker; +}; + +/* + * Buffer state bits. + */ +#define B_READING 0 +#define B_WRITING 1 +#define B_DIRTY 2 + +/* + * Describes how the block was allocated: + * kmem_cache_alloc(), __get_free_pages() or vmalloc(). + * See the comment at alloc_buffer_data. 
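 *
 * Rough mapping, as an editorial illustration mirroring
 * alloc_buffer_data() below: a block of at most half a page (e.g. 512
 * bytes) comes from a kmem_cache (DATA_MODE_SLAB); a larger block
 * requested with __GFP_NORETRY may come from __get_free_pages()
 * (DATA_MODE_GET_FREE_PAGES); otherwise (for example large blocks in
 * the initial reserve, which is allocated with GFP_KERNEL and so
 * without __GFP_NORETRY) the data is __vmalloc()ed (DATA_MODE_VMALLOC).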
+ */ +enum data_mode { + DATA_MODE_SLAB = 0, + DATA_MODE_GET_FREE_PAGES = 1, + DATA_MODE_VMALLOC = 2, + DATA_MODE_LIMIT = 3 +}; + +struct dm_buffer { + struct hlist_node hash_list; + struct list_head lru_list; + sector_t block; + void *data; + enum data_mode data_mode; + unsigned char list_mode; /* LIST_* */ + unsigned hold_count; + int read_error; + int write_error; + unsigned long state; + unsigned long last_accessed; + struct dm_bufio_client *c; + struct bio bio; + struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; +}; + +/*----------------------------------------------------------------*/ + +static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT]; +static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT]; + +static inline int dm_bufio_cache_index(struct dm_bufio_client *c) +{ + unsigned ret = c->blocks_per_page_bits - 1; + + BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches)); + + return ret; +} + +#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)]) +#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)]) + +#define dm_bufio_in_request() (!!current->bio_list) + +static void dm_bufio_lock(struct dm_bufio_client *c) +{ + mutex_lock_nested(&c->lock, dm_bufio_in_request()); +} + +static int dm_bufio_trylock(struct dm_bufio_client *c) +{ + return mutex_trylock(&c->lock); +} + +static void dm_bufio_unlock(struct dm_bufio_client *c) +{ + mutex_unlock(&c->lock); +} + +/* + * FIXME Move to sched.h? + */ +#ifdef CONFIG_PREEMPT_VOLUNTARY +# define dm_bufio_cond_resched() \ +do { \ + if (unlikely(need_resched())) \ + _cond_resched(); \ +} while (0) +#else +# define dm_bufio_cond_resched() do { } while (0) +#endif + +/*----------------------------------------------------------------*/ + +/* + * Default cache size: available memory divided by the ratio. + */ +static unsigned long dm_bufio_default_cache_size; + +/* + * Total cache size set by the user. + */ +static unsigned long dm_bufio_cache_size; + +/* + * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change + * at any time. If it disagrees, the user has changed cache size. + */ +static unsigned long dm_bufio_cache_size_latch; + +static DEFINE_SPINLOCK(param_spinlock); + +/* + * Buffers are freed after this timeout + */ +static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; + +static unsigned long dm_bufio_peak_allocated; +static unsigned long dm_bufio_allocated_kmem_cache; +static unsigned long dm_bufio_allocated_get_free_pages; +static unsigned long dm_bufio_allocated_vmalloc; +static unsigned long dm_bufio_current_allocated; + +/*----------------------------------------------------------------*/ + +/* + * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count + */ +static unsigned long dm_bufio_cache_size_per_client; + +/* + * The current number of clients. + */ +static int dm_bufio_client_count; + +/* + * The list of all clients. 
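 *
 * (Editorial note: entries are added under dm_bufio_clients_lock,
 * declared just below, which also guards dm_bufio_client_count and the
 * per-client limit; see dm_bufio_client_create().)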
+ */ +static LIST_HEAD(dm_bufio_all_clients); + +/* + * This mutex protects dm_bufio_cache_size_latch, + * dm_bufio_cache_size_per_client and dm_bufio_client_count + */ +static DEFINE_MUTEX(dm_bufio_clients_lock); + +/*----------------------------------------------------------------*/ + +static void adjust_total_allocated(enum data_mode data_mode, long diff) +{ + static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { + &dm_bufio_allocated_kmem_cache, + &dm_bufio_allocated_get_free_pages, + &dm_bufio_allocated_vmalloc, + }; + + spin_lock(¶m_spinlock); + + *class_ptr[data_mode] += diff; + + dm_bufio_current_allocated += diff; + + if (dm_bufio_current_allocated > dm_bufio_peak_allocated) + dm_bufio_peak_allocated = dm_bufio_current_allocated; + + spin_unlock(¶m_spinlock); +} + +/* + * Change the number of clients and recalculate per-client limit. + */ +static void __cache_size_refresh(void) +{ + BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); + BUG_ON(dm_bufio_client_count < 0); + + dm_bufio_cache_size_latch = dm_bufio_cache_size; + + barrier(); + + /* + * Use default if set to 0 and report the actual cache size used. + */ + if (!dm_bufio_cache_size_latch) { + (void)cmpxchg(&dm_bufio_cache_size, 0, + dm_bufio_default_cache_size); + dm_bufio_cache_size_latch = dm_bufio_default_cache_size; + } + + dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch / + (dm_bufio_client_count ? : 1); +} + +/* + * Allocating buffer data. + * + * Small buffers are allocated with kmem_cache, to use space optimally. + * + * For large buffers, we choose between get_free_pages and vmalloc. + * Each has advantages and disadvantages. + * + * __get_free_pages can randomly fail if the memory is fragmented. + * __vmalloc won't randomly fail, but vmalloc space is limited (it may be + * as low as 128M) so using it for caching is not appropriate. + * + * If the allocation may fail we use __get_free_pages. Memory fragmentation + * won't have a fatal effect here, but it just causes flushes of some other + * buffers and more I/O will be performed. Don't use __get_free_pages if it + * always fails (i.e. order >= MAX_ORDER). + * + * If the allocation shouldn't fail we use __vmalloc. This is only for the + * initial reserve allocation, so there's no risk of wasting all vmalloc + * space. + */ +static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, + enum data_mode *data_mode) +{ + if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { + *data_mode = DATA_MODE_SLAB; + return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); + } + + if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT && + gfp_mask & __GFP_NORETRY) { + *data_mode = DATA_MODE_GET_FREE_PAGES; + return (void *)__get_free_pages(gfp_mask, + c->pages_per_block_bits); + } + + *data_mode = DATA_MODE_VMALLOC; + return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); +} + +/* + * Free buffer's data. + */ +static void free_buffer_data(struct dm_bufio_client *c, + void *data, enum data_mode data_mode) +{ + switch (data_mode) { + case DATA_MODE_SLAB: + kmem_cache_free(DM_BUFIO_CACHE(c), data); + break; + + case DATA_MODE_GET_FREE_PAGES: + free_pages((unsigned long)data, c->pages_per_block_bits); + break; + + case DATA_MODE_VMALLOC: + vfree(data); + break; + + default: + DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d", + data_mode); + BUG(); + } +} + +/* + * Allocate buffer and its data. 
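 *
 * (Editorial note: the struct dm_buffer and the caller's aux_size bytes
 * are carved out of a single kmalloc() below, which is why
 * dm_bufio_get_aux_data() can simply return b + 1.)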
+ */ +static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) +{ + struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size, + gfp_mask); + + if (!b) + return NULL; + + b->c = c; + + b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode); + if (!b->data) { + kfree(b); + return NULL; + } + + adjust_total_allocated(b->data_mode, (long)c->block_size); + + return b; +} + +/* + * Free buffer and its data. + */ +static void free_buffer(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + adjust_total_allocated(b->data_mode, -(long)c->block_size); + + free_buffer_data(c, b->data, b->data_mode); + kfree(b); +} + +/* + * Link buffer to the hash list and clean or dirty queue. + */ +static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) +{ + struct dm_bufio_client *c = b->c; + + c->n_buffers[dirty]++; + b->block = block; + b->list_mode = dirty; + list_add(&b->lru_list, &c->lru[dirty]); + hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); + b->last_accessed = jiffies; +} + +/* + * Unlink buffer from the hash list and dirty or clean queue. + */ +static void __unlink_buffer(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + BUG_ON(!c->n_buffers[b->list_mode]); + + c->n_buffers[b->list_mode]--; + hlist_del(&b->hash_list); + list_del(&b->lru_list); +} + +/* + * Place the buffer to the head of dirty or clean LRU queue. + */ +static void __relink_lru(struct dm_buffer *b, int dirty) +{ + struct dm_bufio_client *c = b->c; + + BUG_ON(!c->n_buffers[b->list_mode]); + + c->n_buffers[b->list_mode]--; + c->n_buffers[dirty]++; + b->list_mode = dirty; + list_del(&b->lru_list); + list_add(&b->lru_list, &c->lru[dirty]); +} + +/*---------------------------------------------------------------- + * Submit I/O on the buffer. + * + * Bio interface is faster but it has some problems: + * the vector list is limited (increasing this limit increases + * memory-consumption per buffer, so it is not viable); + * + * the memory must be direct-mapped, not vmalloced; + * + * the I/O driver can reject requests spuriously if it thinks that + * the requests are too big for the device or if they cross a + * controller-defined memory boundary. + * + * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and + * it is not vmalloced, try using the bio interface. + * + * If the buffer is big, if it is vmalloced or if the underlying device + * rejects the bio because it is too large, use dm-io layer to do the I/O. + * The dm-io layer splits the I/O into multiple requests, avoiding the above + * shortcomings. + *--------------------------------------------------------------*/ + +/* + * dm-io completion routine. It just calls b->bio.bi_end_io, pretending + * that the request was handled directly with bio interface. + */ +static void dmio_complete(unsigned long error, void *context) +{ + struct dm_buffer *b = context; + + b->bio.bi_end_io(&b->bio, error ? 
-EIO : 0); +} + +static void use_dmio(struct dm_buffer *b, int rw, sector_t block, + bio_end_io_t *end_io) +{ + int r; + struct dm_io_request io_req = { + .bi_rw = rw, + .notify.fn = dmio_complete, + .notify.context = b, + .client = b->c->dm_io, + }; + struct dm_io_region region = { + .bdev = b->c->bdev, + .sector = block << b->c->sectors_per_block_bits, + .count = b->c->block_size >> SECTOR_SHIFT, + }; + + if (b->data_mode != DATA_MODE_VMALLOC) { + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = b->data; + } else { + io_req.mem.type = DM_IO_VMA; + io_req.mem.ptr.vma = b->data; + } + + b->bio.bi_end_io = end_io; + + r = dm_io(&io_req, 1, ®ion, NULL); + if (r) + end_io(&b->bio, r); +} + +static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, + bio_end_io_t *end_io) +{ + char *ptr; + int len; + + bio_init(&b->bio); + b->bio.bi_io_vec = b->bio_vec; + b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; + b->bio.bi_sector = block << b->c->sectors_per_block_bits; + b->bio.bi_bdev = b->c->bdev; + b->bio.bi_end_io = end_io; + + /* + * We assume that if len >= PAGE_SIZE ptr is page-aligned. + * If len < PAGE_SIZE the buffer doesn't cross page boundary. + */ + ptr = b->data; + len = b->c->block_size; + + if (len >= PAGE_SIZE) + BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); + else + BUG_ON((unsigned long)ptr & (len - 1)); + + do { + if (!bio_add_page(&b->bio, virt_to_page(ptr), + len < PAGE_SIZE ? len : PAGE_SIZE, + virt_to_phys(ptr) & (PAGE_SIZE - 1))) { + BUG_ON(b->c->block_size <= PAGE_SIZE); + use_dmio(b, rw, block, end_io); + return; + } + + len -= PAGE_SIZE; + ptr += PAGE_SIZE; + } while (len > 0); + + submit_bio(rw, &b->bio); +} + +static void submit_io(struct dm_buffer *b, int rw, sector_t block, + bio_end_io_t *end_io) +{ + if (rw == WRITE && b->c->write_callback) + b->c->write_callback(b); + + if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && + b->data_mode != DATA_MODE_VMALLOC) + use_inline_bio(b, rw, block, end_io); + else + use_dmio(b, rw, block, end_io); +} + +/*---------------------------------------------------------------- + * Writing dirty buffers + *--------------------------------------------------------------*/ + +/* + * The endio routine for write. + * + * Set the error, clear B_WRITING bit and wake anyone who was waiting on + * it. + */ +static void write_endio(struct bio *bio, int error) +{ + struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); + + b->write_error = error; + if (unlikely(error)) { + struct dm_bufio_client *c = b->c; + (void)cmpxchg(&c->async_write_error, 0, error); + } + + BUG_ON(!test_bit(B_WRITING, &b->state)); + + smp_mb__before_clear_bit(); + clear_bit(B_WRITING, &b->state); + smp_mb__after_clear_bit(); + + wake_up_bit(&b->state, B_WRITING); +} + +/* + * This function is called when wait_on_bit is actually waiting. + */ +static int do_io_schedule(void *word) +{ + io_schedule(); + + return 0; +} + +/* + * Initiate a write on a dirty buffer, but don't wait for it. + * + * - If the buffer is not dirty, exit. + * - If there some previous write going on, wait for it to finish (we can't + * have two writes on the same buffer simultaneously). + * - Submit our write and don't wait on it. We set B_WRITING indicating + * that there is a write in progress. 
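 *
 * In terms of the state bits (editorial sketch): B_DIRTY is cleared
 * here, B_WRITING is taken with wait_on_bit_lock() before submit_io(),
 * and write_endio() clears B_WRITING and wakes any waiter once the
 * write completes.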
+ */ +static void __write_dirty_buffer(struct dm_buffer *b) +{ + if (!test_bit(B_DIRTY, &b->state)) + return; + + clear_bit(B_DIRTY, &b->state); + wait_on_bit_lock(&b->state, B_WRITING, + do_io_schedule, TASK_UNINTERRUPTIBLE); + + submit_io(b, WRITE, b->block, write_endio); +} + +/* + * Wait until any activity on the buffer finishes. Possibly write the + * buffer if it is dirty. When this function finishes, there is no I/O + * running on the buffer and the buffer is not dirty. + */ +static void __make_buffer_clean(struct dm_buffer *b) +{ + BUG_ON(b->hold_count); + + if (!b->state) /* fast case */ + return; + + wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); + __write_dirty_buffer(b); + wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); +} + +/* + * Find some buffer that is not held by anybody, clean it, unlink it and + * return it. + */ +static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) +{ + struct dm_buffer *b; + + list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) { + BUG_ON(test_bit(B_WRITING, &b->state)); + BUG_ON(test_bit(B_DIRTY, &b->state)); + + if (!b->hold_count) { + __make_buffer_clean(b); + __unlink_buffer(b); + return b; + } + dm_bufio_cond_resched(); + } + + list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { + BUG_ON(test_bit(B_READING, &b->state)); + + if (!b->hold_count) { + __make_buffer_clean(b); + __unlink_buffer(b); + return b; + } + dm_bufio_cond_resched(); + } + + return NULL; +} + +/* + * Wait until some other threads free some buffer or release hold count on + * some buffer. + * + * This function is entered with c->lock held, drops it and regains it + * before exiting. + */ +static void __wait_for_free_buffer(struct dm_bufio_client *c) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&c->free_buffer_wait, &wait); + set_task_state(current, TASK_UNINTERRUPTIBLE); + dm_bufio_unlock(c); + + io_schedule(); + + set_task_state(current, TASK_RUNNING); + remove_wait_queue(&c->free_buffer_wait, &wait); + + dm_bufio_lock(c); +} + +enum new_flag { + NF_FRESH = 0, + NF_READ = 1, + NF_GET = 2, + NF_PREFETCH = 3 +}; + +/* + * Allocate a new buffer. If the allocation is not possible, wait until + * some other thread frees a buffer. + * + * May drop the lock and regain it. + */ +static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) +{ + struct dm_buffer *b; + + /* + * dm-bufio is resistant to allocation failures (it just keeps + * one buffer reserved in cases all the allocations fail). + * So set flags to not try too hard: + * GFP_NOIO: don't recurse into the I/O layer + * __GFP_NORETRY: don't retry and rather return failure + * __GFP_NOMEMALLOC: don't use emergency reserves + * __GFP_NOWARN: don't print a warning in case of failure + * + * For debugging, if we set the cache size to 1, no new buffers will + * be allocated. 
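 *
 * (For illustration: the attempt below therefore uses
 * GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN and, on
 * failure, falls back to the reserved_buffers list, to reclaiming an
 * unclaimed buffer, or to waiting in __wait_for_free_buffer() and
 * retrying.)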
+ */ + while (1) { + if (dm_bufio_cache_size_latch != 1) { + b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (b) + return b; + } + + if (nf == NF_PREFETCH) + return NULL; + + if (!list_empty(&c->reserved_buffers)) { + b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + c->need_reserved_buffers++; + + return b; + } + + b = __get_unclaimed_buffer(c); + if (b) + return b; + + __wait_for_free_buffer(c); + } +} + +static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) +{ + struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); + + if (!b) + return NULL; + + if (c->alloc_callback) + c->alloc_callback(b); + + return b; +} + +/* + * Free a buffer and wake other threads waiting for free buffers. + */ +static void __free_buffer_wake(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + if (!c->need_reserved_buffers) + free_buffer(b); + else { + list_add(&b->lru_list, &c->reserved_buffers); + c->need_reserved_buffers--; + } + + wake_up(&c->free_buffer_wait); +} + +static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait) +{ + struct dm_buffer *b, *tmp; + + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { + BUG_ON(test_bit(B_READING, &b->state)); + + if (!test_bit(B_DIRTY, &b->state) && + !test_bit(B_WRITING, &b->state)) { + __relink_lru(b, LIST_CLEAN); + continue; + } + + if (no_wait && test_bit(B_WRITING, &b->state)) + return; + + __write_dirty_buffer(b); + dm_bufio_cond_resched(); + } +} + +/* + * Get writeback threshold and buffer limit for a given client. + */ +static void __get_memory_limit(struct dm_bufio_client *c, + unsigned long *threshold_buffers, + unsigned long *limit_buffers) +{ + unsigned long buffers; + + if (dm_bufio_cache_size != dm_bufio_cache_size_latch) { + mutex_lock(&dm_bufio_clients_lock); + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + } + + buffers = dm_bufio_cache_size_per_client >> + (c->sectors_per_block_bits + SECTOR_SHIFT); + + if (buffers < DM_BUFIO_MIN_BUFFERS) + buffers = DM_BUFIO_MIN_BUFFERS; + + *limit_buffers = buffers; + *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; +} + +/* + * Check if we're over watermark. + * If we are over threshold_buffers, start freeing buffers. + * If we're over "limit_buffers", block until we get under the limit. + */ +static void __check_watermark(struct dm_bufio_client *c) +{ + unsigned long threshold_buffers, limit_buffers; + + __get_memory_limit(c, &threshold_buffers, &limit_buffers); + + while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] > + limit_buffers) { + + struct dm_buffer *b = __get_unclaimed_buffer(c); + + if (!b) + return; + + __free_buffer_wake(b); + dm_bufio_cond_resched(); + } + + if (c->n_buffers[LIST_DIRTY] > threshold_buffers) + __write_dirty_buffers_async(c, 1); +} + +/* + * Find a buffer in the hash. 
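 *
 * (Editorial illustration: DM_BUFIO_HASH() folds the block number into
 * a 2^20-entry table, e.g. block 0x12345678 maps to
 * ((0x12345678 >> 20) ^ 0x12345678) & 0xfffff; the table itself is
 * vmalloc'ed in dm_bufio_client_create().)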
+ */ +static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) +{ + struct dm_buffer *b; + struct hlist_node *hn; + + hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)], + hash_list) { + dm_bufio_cond_resched(); + if (b->block == block) + return b; + } + + return NULL; +} + +/*---------------------------------------------------------------- + * Getting a buffer + *--------------------------------------------------------------*/ + +static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, + enum new_flag nf, int *need_submit) +{ + struct dm_buffer *b, *new_b = NULL; + + *need_submit = 0; + + b = __find(c, block); + if (b) + goto found_buffer; + + if (nf == NF_GET) + return NULL; + + new_b = __alloc_buffer_wait(c, nf); + if (!new_b) + return NULL; + + /* + * We've had a period where the mutex was unlocked, so need to + * recheck the hash table. + */ + b = __find(c, block); + if (b) { + __free_buffer_wake(new_b); + goto found_buffer; + } + + __check_watermark(c); + + b = new_b; + b->hold_count = 1; + b->read_error = 0; + b->write_error = 0; + __link_buffer(b, block, LIST_CLEAN); + + if (nf == NF_FRESH) { + b->state = 0; + return b; + } + + b->state = 1 << B_READING; + *need_submit = 1; + + return b; + +found_buffer: + if (nf == NF_PREFETCH) + return NULL; + /* + * Note: it is essential that we don't wait for the buffer to be + * read if dm_bufio_get function is used. Both dm_bufio_get and + * dm_bufio_prefetch can be used in the driver request routine. + * If the user called both dm_bufio_prefetch and dm_bufio_get on + * the same buffer, it would deadlock if we waited. + */ + if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state))) + return NULL; + + b->hold_count++; + __relink_lru(b, test_bit(B_DIRTY, &b->state) || + test_bit(B_WRITING, &b->state)); + return b; +} + +/* + * The endio routine for reading: set the error, clear the bit and wake up + * anyone waiting on the buffer. + */ +static void read_endio(struct bio *bio, int error) +{ + struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); + + b->read_error = error; + + BUG_ON(!test_bit(B_READING, &b->state)); + + smp_mb__before_clear_bit(); + clear_bit(B_READING, &b->state); + smp_mb__after_clear_bit(); + + wake_up_bit(&b->state, B_READING); +} + +/* + * A common routine for dm_bufio_new and dm_bufio_read. Operation of these + * functions is similar except that dm_bufio_new doesn't read the + * buffer from the disk (assuming that the caller overwrites all the data + * and uses dm_bufio_mark_buffer_dirty to write new data back). 
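 *
 * A typical call sequence from a target, as an editorial sketch ("c" is
 * a client from dm_bufio_client_create(), "block" a placeholder block
 * number; error handling trimmed):
 *
 *	struct dm_buffer *bp;
 *	void *data = dm_bufio_read(c, block, &bp);
 *
 *	if (!IS_ERR(data)) {
 *		... read or modify the block_size bytes at "data" ...
 *		dm_bufio_mark_buffer_dirty(bp);		(only if modified)
 *		dm_bufio_release(bp);
 *	}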
+ */ +static void *new_read(struct dm_bufio_client *c, sector_t block, + enum new_flag nf, struct dm_buffer **bp) +{ + int need_submit; + struct dm_buffer *b; + + dm_bufio_lock(c); + b = __bufio_new(c, block, nf, &need_submit); + dm_bufio_unlock(c); + + if (!b) + return b; + + if (need_submit) + submit_io(b, READ, b->block, read_endio); + + wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); + + if (b->read_error) { + int error = b->read_error; + + dm_bufio_release(b); + + return ERR_PTR(error); + } + + *bp = b; + + return b->data; +} + +void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + return new_read(c, block, NF_GET, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_get); + +void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + BUG_ON(dm_bufio_in_request()); + + return new_read(c, block, NF_READ, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_read); + +void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + BUG_ON(dm_bufio_in_request()); + + return new_read(c, block, NF_FRESH, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_new); + +void dm_bufio_prefetch(struct dm_bufio_client *c, + sector_t block, unsigned n_blocks) +{ + struct blk_plug plug; + + blk_start_plug(&plug); + dm_bufio_lock(c); + + for (; n_blocks--; block++) { + int need_submit; + struct dm_buffer *b; + b = __bufio_new(c, block, NF_PREFETCH, &need_submit); + if (unlikely(b != NULL)) { + dm_bufio_unlock(c); + + if (need_submit) + submit_io(b, READ, b->block, read_endio); + dm_bufio_release(b); + + dm_bufio_cond_resched(); + + if (!n_blocks) + goto flush_plug; + dm_bufio_lock(c); + } + + } + + dm_bufio_unlock(c); + +flush_plug: + blk_finish_plug(&plug); +} +EXPORT_SYMBOL_GPL(dm_bufio_prefetch); + +void dm_bufio_release(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + dm_bufio_lock(c); + + BUG_ON(!b->hold_count); + + b->hold_count--; + if (!b->hold_count) { + wake_up(&c->free_buffer_wait); + + /* + * If there were errors on the buffer, and the buffer is not + * to be written, free the buffer. There is no point in caching + * invalid buffer. + */ + if ((b->read_error || b->write_error) && + !test_bit(B_READING, &b->state) && + !test_bit(B_WRITING, &b->state) && + !test_bit(B_DIRTY, &b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } + } + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_release); + +void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + dm_bufio_lock(c); + + BUG_ON(test_bit(B_READING, &b->state)); + + if (!test_and_set_bit(B_DIRTY, &b->state)) + __relink_lru(b, LIST_DIRTY); + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); + +void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) +{ + BUG_ON(dm_bufio_in_request()); + + dm_bufio_lock(c); + __write_dirty_buffers_async(c, 0); + dm_bufio_unlock(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); + +/* + * For performance, it is essential that the buffers are written asynchronously + * and simultaneously (so that the block layer can merge the writes) and then + * waited upon. + * + * Finally, we flush hardware disk cache. 
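 *
 * (Editorial note: the function below returns a previously recorded
 * asynchronous write error in preference to the result of the final
 * dm_bufio_issue_flush(), so callers see at most one error code.)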
+ */ +int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) +{ + int a, f; + unsigned long buffers_processed = 0; + struct dm_buffer *b, *tmp; + + dm_bufio_lock(c); + __write_dirty_buffers_async(c, 0); + +again: + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { + int dropped_lock = 0; + + if (buffers_processed < c->n_buffers[LIST_DIRTY]) + buffers_processed++; + + BUG_ON(test_bit(B_READING, &b->state)); + + if (test_bit(B_WRITING, &b->state)) { + if (buffers_processed < c->n_buffers[LIST_DIRTY]) { + dropped_lock = 1; + b->hold_count++; + dm_bufio_unlock(c); + wait_on_bit(&b->state, B_WRITING, + do_io_schedule, + TASK_UNINTERRUPTIBLE); + dm_bufio_lock(c); + b->hold_count--; + } else + wait_on_bit(&b->state, B_WRITING, + do_io_schedule, + TASK_UNINTERRUPTIBLE); + } + + if (!test_bit(B_DIRTY, &b->state) && + !test_bit(B_WRITING, &b->state)) + __relink_lru(b, LIST_CLEAN); + + dm_bufio_cond_resched(); + + /* + * If we dropped the lock, the list is no longer consistent, + * so we must restart the search. + * + * In the most common case, the buffer just processed is + * relinked to the clean list, so we won't loop scanning the + * same buffer again and again. + * + * This may livelock if there is another thread simultaneously + * dirtying buffers, so we count the number of buffers walked + * and if it exceeds the total number of buffers, it means that + * someone is doing some writes simultaneously with us. In + * this case, stop, dropping the lock. + */ + if (dropped_lock) + goto again; + } + wake_up(&c->free_buffer_wait); + dm_bufio_unlock(c); + + a = xchg(&c->async_write_error, 0); + f = dm_bufio_issue_flush(c); + if (a) + return a; + + return f; +} +EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers); + +/* + * Use dm-io to send and empty barrier flush the device. + */ +int dm_bufio_issue_flush(struct dm_bufio_client *c) +{ + struct dm_io_request io_req = { + .bi_rw = REQ_FLUSH, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = c->dm_io, + }; + struct dm_io_region io_reg = { + .bdev = c->bdev, + .sector = 0, + .count = 0, + }; + + BUG_ON(dm_bufio_in_request()); + + return dm_io(&io_req, 1, &io_reg, NULL); +} +EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); + +/* + * We first delete any other buffer that may be at that new location. + * + * Then, we write the buffer to the original location if it was dirty. + * + * Then, if we are the only one who is holding the buffer, relink the buffer + * in the hash queue for the new location. + * + * If there was someone else holding the buffer, we write it to the new + * location but not relink it, because that other user needs to have the buffer + * at the same place. + */ +void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block) +{ + struct dm_bufio_client *c = b->c; + struct dm_buffer *new; + + BUG_ON(dm_bufio_in_request()); + + dm_bufio_lock(c); + +retry: + new = __find(c, new_block); + if (new) { + if (new->hold_count) { + __wait_for_free_buffer(c); + goto retry; + } + + /* + * FIXME: Is there any point waiting for a write that's going + * to be overwritten in a bit? 
+ */ + __make_buffer_clean(new); + __unlink_buffer(new); + __free_buffer_wake(new); + } + + BUG_ON(!b->hold_count); + BUG_ON(test_bit(B_READING, &b->state)); + + __write_dirty_buffer(b); + if (b->hold_count == 1) { + wait_on_bit(&b->state, B_WRITING, + do_io_schedule, TASK_UNINTERRUPTIBLE); + set_bit(B_DIRTY, &b->state); + __unlink_buffer(b); + __link_buffer(b, new_block, LIST_DIRTY); + } else { + sector_t old_block; + wait_on_bit_lock(&b->state, B_WRITING, + do_io_schedule, TASK_UNINTERRUPTIBLE); + /* + * Relink buffer to "new_block" so that write_callback + * sees "new_block" as a block number. + * After the write, link the buffer back to old_block. + * All this must be done in bufio lock, so that block number + * change isn't visible to other threads. + */ + old_block = b->block; + __unlink_buffer(b); + __link_buffer(b, new_block, b->list_mode); + submit_io(b, WRITE, new_block, write_endio); + wait_on_bit(&b->state, B_WRITING, + do_io_schedule, TASK_UNINTERRUPTIBLE); + __unlink_buffer(b); + __link_buffer(b, old_block, b->list_mode); + } + + dm_bufio_unlock(c); + dm_bufio_release(b); +} +EXPORT_SYMBOL_GPL(dm_bufio_release_move); + +unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) +{ + return c->block_size; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); + +sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) +{ + return i_size_read(c->bdev->bd_inode) >> + (SECTOR_SHIFT + c->sectors_per_block_bits); +} +EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); + +sector_t dm_bufio_get_block_number(struct dm_buffer *b) +{ + return b->block; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_number); + +void *dm_bufio_get_block_data(struct dm_buffer *b) +{ + return b->data; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_data); + +void *dm_bufio_get_aux_data(struct dm_buffer *b) +{ + return b + 1; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data); + +struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b) +{ + return b->c; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_client); + +static void drop_buffers(struct dm_bufio_client *c) +{ + struct dm_buffer *b; + int i; + + BUG_ON(dm_bufio_in_request()); + + /* + * An optimization so that the buffers are not written one-by-one. + */ + dm_bufio_write_dirty_buffers_async(c); + + dm_bufio_lock(c); + + while ((b = __get_unclaimed_buffer(c))) + __free_buffer_wake(b); + + for (i = 0; i < LIST_SIZE; i++) + list_for_each_entry(b, &c->lru[i], lru_list) + DMERR("leaked buffer %llx, hold count %u, list %d", + (unsigned long long)b->block, b->hold_count, i); + + for (i = 0; i < LIST_SIZE; i++) + BUG_ON(!list_empty(&c->lru[i])); + + dm_bufio_unlock(c); +} + +/* + * Test if the buffer is unused and too old, and commit it. + * At if noio is set, we must not do any I/O because we hold + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to + * different bufio client. 
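 *
 * (Editorial note: the return value is 1 when the buffer must be kept
 * and 0 when it has been freed.  __scan() below passes max_jiffies == 0,
 * so the shrinker ignores buffer age and only skips buffers that are
 * held or, when __GFP_IO is not set, buffers that are reading, writing
 * or dirty.)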
+ */ +static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, + unsigned long max_jiffies) +{ + if (jiffies - b->last_accessed < max_jiffies) + return 1; + + if (!(gfp & __GFP_IO)) { + if (test_bit(B_READING, &b->state) || + test_bit(B_WRITING, &b->state) || + test_bit(B_DIRTY, &b->state)) + return 1; + } + + if (b->hold_count) + return 1; + + __make_buffer_clean(b); + __unlink_buffer(b); + __free_buffer_wake(b); + + return 0; +} + +static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, + struct shrink_control *sc) +{ + int l; + struct dm_buffer *b, *tmp; + + for (l = 0; l < LIST_SIZE; l++) { + list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) + if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) && + !--nr_to_scan) + return; + dm_bufio_cond_resched(); + } +} + +static int shrink(struct shrinker *shrinker, struct shrink_control *sc) +{ + struct dm_bufio_client *c = + container_of(shrinker, struct dm_bufio_client, shrinker); + unsigned long r; + unsigned long nr_to_scan = sc->nr_to_scan; + + if (sc->gfp_mask & __GFP_IO) + dm_bufio_lock(c); + else if (!dm_bufio_trylock(c)) + return !nr_to_scan ? 0 : -1; + + if (nr_to_scan) + __scan(c, nr_to_scan, sc); + + r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; + if (r > INT_MAX) + r = INT_MAX; + + dm_bufio_unlock(c); + + return r; +} + +/* + * Create the buffering interface + */ +struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size, + unsigned reserved_buffers, unsigned aux_size, + void (*alloc_callback)(struct dm_buffer *), + void (*write_callback)(struct dm_buffer *)) +{ + int r; + struct dm_bufio_client *c; + unsigned i; + + BUG_ON(block_size < 1 << SECTOR_SHIFT || + (block_size & (block_size - 1))); + + c = kmalloc(sizeof(*c), GFP_KERNEL); + if (!c) { + r = -ENOMEM; + goto bad_client; + } + c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS); + if (!c->cache_hash) { + r = -ENOMEM; + goto bad_hash; + } + + c->bdev = bdev; + c->block_size = block_size; + c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT; + c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ? + ffs(block_size) - 1 - PAGE_SHIFT : 0; + c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ? 
+ PAGE_SHIFT - (ffs(block_size) - 1) : 0); + + c->aux_size = aux_size; + c->alloc_callback = alloc_callback; + c->write_callback = write_callback; + + for (i = 0; i < LIST_SIZE; i++) { + INIT_LIST_HEAD(&c->lru[i]); + c->n_buffers[i] = 0; + } + + for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) + INIT_HLIST_HEAD(&c->cache_hash[i]); + + mutex_init(&c->lock); + INIT_LIST_HEAD(&c->reserved_buffers); + c->need_reserved_buffers = reserved_buffers; + + init_waitqueue_head(&c->free_buffer_wait); + c->async_write_error = 0; + + c->dm_io = dm_io_client_create(); + if (IS_ERR(c->dm_io)) { + r = PTR_ERR(c->dm_io); + goto bad_dm_io; + } + + mutex_lock(&dm_bufio_clients_lock); + if (c->blocks_per_page_bits) { + if (!DM_BUFIO_CACHE_NAME(c)) { + DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size); + if (!DM_BUFIO_CACHE_NAME(c)) { + r = -ENOMEM; + mutex_unlock(&dm_bufio_clients_lock); + goto bad_cache; + } + } + + if (!DM_BUFIO_CACHE(c)) { + DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c), + c->block_size, + c->block_size, 0, NULL); + if (!DM_BUFIO_CACHE(c)) { + r = -ENOMEM; + mutex_unlock(&dm_bufio_clients_lock); + goto bad_cache; + } + } + } + mutex_unlock(&dm_bufio_clients_lock); + + while (c->need_reserved_buffers) { + struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); + + if (!b) { + r = -ENOMEM; + goto bad_buffer; + } + __free_buffer_wake(b); + } + + mutex_lock(&dm_bufio_clients_lock); + dm_bufio_client_count++; + list_add(&c->client_list, &dm_bufio_all_clients); + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + + c->shrinker.shrink = shrink; + c->shrinker.seeks = 1; + c->shrinker.batch = 0; + register_shrinker(&c->shrinker); + + return c; + +bad_buffer: +bad_cache: + while (!list_empty(&c->reserved_buffers)) { + struct dm_buffer *b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + free_buffer(b); + } + dm_io_client_destroy(c->dm_io); +bad_dm_io: + vfree(c->cache_hash); +bad_hash: + kfree(c); +bad_client: + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_bufio_client_create); + +/* + * Free the buffering interface. + * It is required that there are no references on any buffers. 
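+ *
+ * (Typical teardown, as assumed here: release every dm_buffer, call
+ * dm_bufio_write_dirty_buffers(), then destroy the client; anything still
+ * held is reported as a "leaked buffer" by drop_buffers().)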
+ */ +void dm_bufio_client_destroy(struct dm_bufio_client *c) +{ + unsigned i; + + drop_buffers(c); + + unregister_shrinker(&c->shrinker); + + mutex_lock(&dm_bufio_clients_lock); + + list_del(&c->client_list); + dm_bufio_client_count--; + __cache_size_refresh(); + + mutex_unlock(&dm_bufio_clients_lock); + + for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) + BUG_ON(!hlist_empty(&c->cache_hash[i])); + + BUG_ON(c->need_reserved_buffers); + + while (!list_empty(&c->reserved_buffers)) { + struct dm_buffer *b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + free_buffer(b); + } + + for (i = 0; i < LIST_SIZE; i++) + if (c->n_buffers[i]) + DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]); + + for (i = 0; i < LIST_SIZE; i++) + BUG_ON(c->n_buffers[i]); + + dm_io_client_destroy(c->dm_io); + vfree(c->cache_hash); + kfree(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); + +static void cleanup_old_buffers(void) +{ + unsigned long max_age = dm_bufio_max_age; + struct dm_bufio_client *c; + + barrier(); + + if (max_age > ULONG_MAX / HZ) + max_age = ULONG_MAX / HZ; + + mutex_lock(&dm_bufio_clients_lock); + list_for_each_entry(c, &dm_bufio_all_clients, client_list) { + if (!dm_bufio_trylock(c)) + continue; + + while (!list_empty(&c->lru[LIST_CLEAN])) { + struct dm_buffer *b; + b = list_entry(c->lru[LIST_CLEAN].prev, + struct dm_buffer, lru_list); + if (__cleanup_old_buffer(b, 0, max_age * HZ)) + break; + dm_bufio_cond_resched(); + } + + dm_bufio_unlock(c); + dm_bufio_cond_resched(); + } + mutex_unlock(&dm_bufio_clients_lock); +} + +static struct workqueue_struct *dm_bufio_wq; +static struct delayed_work dm_bufio_work; + +static void work_fn(struct work_struct *w) +{ + cleanup_old_buffers(); + + queue_delayed_work(dm_bufio_wq, &dm_bufio_work, + DM_BUFIO_WORK_TIMER_SECS * HZ); +} + +/*---------------------------------------------------------------- + * Module setup + *--------------------------------------------------------------*/ + +/* + * This is called only once for the whole dm_bufio module. + * It initializes memory limit. + */ +static int __init dm_bufio_init(void) +{ + __u64 mem; + + memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); + memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); + + mem = (__u64)((totalram_pages - totalhigh_pages) * + DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT; + + if (mem > ULONG_MAX) + mem = ULONG_MAX; + +#ifdef CONFIG_MMU + /* + * Get the size of vmalloc space the same way as VMALLOC_TOTAL + * in fs/proc/internal.h + */ + if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100) + mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100; +#endif + + dm_bufio_default_cache_size = mem; + + mutex_lock(&dm_bufio_clients_lock); + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + + dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache"); + if (!dm_bufio_wq) + return -ENOMEM; + + INIT_DELAYED_WORK(&dm_bufio_work, work_fn); + queue_delayed_work(dm_bufio_wq, &dm_bufio_work, + DM_BUFIO_WORK_TIMER_SECS * HZ); + + return 0; +} + +/* + * This is called once when unloading the dm_bufio module. 
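+ * It cancels the periodic cleanup work, destroys the per-block-size slab
+ * caches and complains (and BUGs) if any clients or allocation counters leaked.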
+ */ +static void __exit dm_bufio_exit(void) +{ + int bug = 0; + int i; + + cancel_delayed_work_sync(&dm_bufio_work); + destroy_workqueue(dm_bufio_wq); + + for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) { + struct kmem_cache *kc = dm_bufio_caches[i]; + + if (kc) + kmem_cache_destroy(kc); + } + + for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++) + kfree(dm_bufio_cache_names[i]); + + if (dm_bufio_client_count) { + DMCRIT("%s: dm_bufio_client_count leaked: %d", + __func__, dm_bufio_client_count); + bug = 1; + } + + if (dm_bufio_current_allocated) { + DMCRIT("%s: dm_bufio_current_allocated leaked: %lu", + __func__, dm_bufio_current_allocated); + bug = 1; + } + + if (dm_bufio_allocated_get_free_pages) { + DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu", + __func__, dm_bufio_allocated_get_free_pages); + bug = 1; + } + + if (dm_bufio_allocated_vmalloc) { + DMCRIT("%s: dm_bufio_vmalloc leaked: %lu", + __func__, dm_bufio_allocated_vmalloc); + bug = 1; + } + + if (bug) + BUG(); +} + +module_init(dm_bufio_init) +module_exit(dm_bufio_exit) + +module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); + +module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); + +module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); + +module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc"); + +module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages"); + +module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc"); + +module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO); +MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache"); + +MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); +MODULE_DESCRIPTION(DM_NAME " buffered I/O library"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h new file mode 100644 index 00000000..b142946a --- /dev/null +++ b/drivers/md/dm-bufio.h @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2009-2011 Red Hat, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * This file is released under the GPL. + */ + +#ifndef DM_BUFIO_H +#define DM_BUFIO_H + +#include <linux/blkdev.h> +#include <linux/types.h> + +/*----------------------------------------------------------------*/ + +struct dm_bufio_client; +struct dm_buffer; + +/* + * Create a buffered IO cache on a given device + */ +struct dm_bufio_client * +dm_bufio_client_create(struct block_device *bdev, unsigned block_size, + unsigned reserved_buffers, unsigned aux_size, + void (*alloc_callback)(struct dm_buffer *), + void (*write_callback)(struct dm_buffer *)); + +/* + * Release a buffered IO cache. + */ +void dm_bufio_client_destroy(struct dm_bufio_client *c); + +/* + * WARNING: to avoid deadlocks, these conditions are observed: + * + * - At most one thread can hold at most "reserved_buffers" simultaneously. + * - Each other threads can hold at most one buffer. 
+ * - Threads which call only dm_bufio_get can hold unlimited number of + * buffers. + */ + +/* + * Read a given block from disk. Returns pointer to data. Returns a + * pointer to dm_buffer that can be used to release the buffer or to make + * it dirty. + */ +void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp); + +/* + * Like dm_bufio_read, but return buffer from cache, don't read + * it. If the buffer is not in the cache, return NULL. + */ +void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp); + +/* + * Like dm_bufio_read, but don't read anything from the disk. It is + * expected that the caller initializes the buffer and marks it dirty. + */ +void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp); + +/* + * Prefetch the specified blocks to the cache. + * The function starts to read the blocks and returns without waiting for + * I/O to finish. + */ +void dm_bufio_prefetch(struct dm_bufio_client *c, + sector_t block, unsigned n_blocks); + +/* + * Release a reference obtained with dm_bufio_{read,get,new}. The data + * pointer and dm_buffer pointer is no longer valid after this call. + */ +void dm_bufio_release(struct dm_buffer *b); + +/* + * Mark a buffer dirty. It should be called after the buffer is modified. + * + * In case of memory pressure, the buffer may be written after + * dm_bufio_mark_buffer_dirty, but before dm_bufio_write_dirty_buffers. So + * dm_bufio_write_dirty_buffers guarantees that the buffer is on-disk but + * the actual writing may occur earlier. + */ +void dm_bufio_mark_buffer_dirty(struct dm_buffer *b); + +/* + * Initiate writing of dirty buffers, without waiting for completion. + */ +void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c); + +/* + * Write all dirty buffers. Guarantees that all dirty buffers created prior + * to this call are on disk when this call exits. + */ +int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c); + +/* + * Send an empty write barrier to the device to flush hardware disk cache. + */ +int dm_bufio_issue_flush(struct dm_bufio_client *c); + +/* + * Like dm_bufio_release but also move the buffer to the new + * block. dm_bufio_write_dirty_buffers is needed to commit the new block. + */ +void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); + +unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); +sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); +sector_t dm_bufio_get_block_number(struct dm_buffer *b); +void *dm_bufio_get_block_data(struct dm_buffer *b); +void *dm_bufio_get_aux_data(struct dm_buffer *b); +struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c new file mode 100644 index 00000000..3f06df59 --- /dev/null +++ b/drivers/md/dm-crypt.c @@ -0,0 +1,1914 @@ +/* + * Copyright (C) 2003 Christophe Saout <christophe@saout.de> + * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> + * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. 
+ */ + +#include <linux/completion.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mempool.h> +#include <linux/slab.h> +#include <linux/crypto.h> +#include <linux/workqueue.h> +#include <linux/backing-dev.h> +#include <linux/percpu.h> +#include <linux/atomic.h> +#include <linux/scatterlist.h> +#include <asm/page.h> +#include <asm/unaligned.h> +#include <crypto/hash.h> +#include <crypto/md5.h> +#include <crypto/algapi.h> + +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "crypt" + +/* + * context holding the current state of a multi-part conversion + */ +struct convert_context { + struct completion restart; + struct bio *bio_in; + struct bio *bio_out; + unsigned int offset_in; + unsigned int offset_out; + unsigned int idx_in; + unsigned int idx_out; + sector_t sector; + atomic_t pending; +}; + +/* + * per bio private data + */ +struct dm_crypt_io { + struct dm_target *target; + struct bio *base_bio; + struct work_struct work; + + struct convert_context ctx; + + atomic_t pending; + int error; + sector_t sector; + struct dm_crypt_io *base_io; +}; + +struct dm_crypt_request { + struct convert_context *ctx; + struct scatterlist sg_in; + struct scatterlist sg_out; + sector_t iv_sector; +}; + +struct crypt_config; + +struct crypt_iv_operations { + int (*ctr)(struct crypt_config *cc, struct dm_target *ti, + const char *opts); + void (*dtr)(struct crypt_config *cc); + int (*init)(struct crypt_config *cc); + int (*wipe)(struct crypt_config *cc); + int (*generator)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); + int (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); +}; + +struct iv_essiv_private { + struct crypto_hash *hash_tfm; + u8 *salt; +}; + +struct iv_benbi_private { + int shift; +}; + +#define LMK_SEED_SIZE 64 /* hash + 0 */ +struct iv_lmk_private { + struct crypto_shash *hash_tfm; + u8 *seed; +}; + +/* + * Crypt: maps a linear range of a block device + * and encrypts / decrypts at the same time. + */ +enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; + +/* + * Duplicated per-CPU state for cipher. + */ +struct crypt_cpu { + struct ablkcipher_request *req; + /* ESSIV: struct crypto_cipher *essiv_tfm */ + void *iv_private; + struct crypto_ablkcipher *tfms[0]; +}; + +/* + * The fields in here must be read only after initialization, + * changing state should be in crypt_cpu. + */ +struct crypt_config { + struct dm_dev *dev; + sector_t start; + + /* + * pool for per bio private data, crypto requests and + * encryption requeusts/buffer pages + */ + mempool_t *io_pool; + mempool_t *req_pool; + mempool_t *page_pool; + struct bio_set *bs; + + struct workqueue_struct *io_queue; + struct workqueue_struct *crypt_queue; + + char *cipher; + char *cipher_string; + + struct crypt_iv_operations *iv_gen_ops; + union { + struct iv_essiv_private essiv; + struct iv_benbi_private benbi; + struct iv_lmk_private lmk; + } iv_gen_private; + sector_t iv_offset; + unsigned int iv_size; + + /* + * Duplicated per cpu state. Access through + * per_cpu_ptr() only. + */ + struct crypt_cpu __percpu *cpu; + unsigned tfms_count; + + /* + * Layout of each crypto request: + * + * struct ablkcipher_request + * context + * padding + * struct dm_crypt_request + * padding + * IV + * + * The padding is added so that dm_crypt_request and the IV are + * correctly aligned. 
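+ *
+ * (For example, dmreq_start below is the byte offset of dm_crypt_request
+ * inside each mempool element, and iv_of_dmreq() locates the IV by rounding
+ * the address just past the dm_crypt_request up to the cipher's alignment;
+ * see crypt_ctr() and iv_of_dmreq() for the exact arithmetic.)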
+ */ + unsigned int dmreq_start; + + unsigned long flags; + unsigned int key_size; + unsigned int key_parts; + u8 key[0]; +}; + +#define MIN_IOS 16 +#define MIN_POOL_PAGES 32 + +static struct kmem_cache *_crypt_io_pool; + +static void clone_init(struct dm_crypt_io *, struct bio *); +static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); + +static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) +{ + return this_cpu_ptr(cc->cpu); +} + +/* + * Use this to access cipher attributes that are the same for each CPU. + */ +static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +{ + return __this_cpu_ptr(cc->cpu)->tfms[0]; +} + +/* + * Different IV generation algorithms: + * + * plain: the initial vector is the 32-bit little-endian version of the sector + * number, padded with zeros if necessary. + * + * plain64: the initial vector is the 64-bit little-endian version of the sector + * number, padded with zeros if necessary. + * + * essiv: "encrypted sector|salt initial vector", the sector number is + * encrypted with the bulk cipher using a salt as key. The salt + * should be derived from the bulk cipher's key via hashing. + * + * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1 + * (needed for LRW-32-AES and possible other narrow block modes) + * + * null: the initial vector is always zero. Provides compatibility with + * obsolete loop_fish2 devices. Do not use for new devices. + * + * lmk: Compatible implementation of the block chaining mode used + * by the Loop-AES block device encryption system + * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from the sector number, the data and + * optionally extra IV seed. + * This means that after decryption the first block + * of sector must be tweaked according to decrypted data. 
+ * Loop-AES can use three encryption schemes: + * version 1: is plain aes-cbc mode + * version 2: uses 64 multikey scheme with lmk IV generator + * version 3: the same as version 2 with additional IV seed + * (it uses 65 keys, last key is used as IV seed) + * + * plumb: unimplemented, see: + * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 + */ + +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + memset(iv, 0, cc->iv_size); + *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); + + return 0; +} + +static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + memset(iv, 0, cc->iv_size); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); + + return 0; +} + +/* Initialise ESSIV - compute salt but no local memory allocations */ +static int crypt_iv_essiv_init(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + struct hash_desc desc; + struct scatterlist sg; + struct crypto_cipher *essiv_tfm; + int err, cpu; + + sg_init_one(&sg, cc->key, cc->key_size); + desc.tfm = essiv->hash_tfm; + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); + if (err) + return err; + + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, + + err = crypto_cipher_setkey(essiv_tfm, essiv->salt, + crypto_hash_digestsize(essiv->hash_tfm)); + if (err) + return err; + } + + return 0; +} + +/* Wipe salt and reset key derived from volume key */ +static int crypt_iv_essiv_wipe(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + struct crypto_cipher *essiv_tfm; + int cpu, r, err = 0; + + memset(essiv->salt, 0, salt_size); + + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; + r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); + if (r) + err = r; + } + + return err; +} + +/* Set up per cpu cipher state */ +static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, + struct dm_target *ti, + u8 *salt, unsigned saltsize) +{ + struct crypto_cipher *essiv_tfm; + int err; + + /* Setup the essiv_tfm with the given salt */ + essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(essiv_tfm)) { + ti->error = "Error allocating crypto tfm for ESSIV"; + return essiv_tfm; + } + + if (crypto_cipher_blocksize(essiv_tfm) != + crypto_ablkcipher_ivsize(any_tfm(cc))) { + ti->error = "Block size of ESSIV cipher does " + "not match IV size of block cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(-EINVAL); + } + + err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); + if (err) { + ti->error = "Failed to set key for ESSIV cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(err); + } + + return essiv_tfm; +} + +static void crypt_iv_essiv_dtr(struct crypt_config *cc) +{ + int cpu; + struct crypt_cpu *cpu_cc; + struct crypto_cipher *essiv_tfm; + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + + crypto_free_hash(essiv->hash_tfm); + essiv->hash_tfm = NULL; + + kzfree(essiv->salt); + essiv->salt = NULL; + + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + essiv_tfm = cpu_cc->iv_private; + + if (essiv_tfm) + crypto_free_cipher(essiv_tfm); + + cpu_cc->iv_private = NULL; + } +} + +static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + 
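+ /*
+ * opts is the digest name from the table line, e.g. the "sha256" in an
+ * "aes-cbc-essiv:sha256" mapping (example spec only). Allocate that hash
+ * and a digest-sized salt, then set up one ESSIV cipher per CPU; the salt
+ * itself is derived from the volume key later, in crypt_iv_essiv_init().
+ */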
struct crypto_cipher *essiv_tfm = NULL; + struct crypto_hash *hash_tfm = NULL; + u8 *salt = NULL; + int err, cpu; + + if (!opts) { + ti->error = "Digest algorithm missing for ESSIV mode"; + return -EINVAL; + } + + /* Allocate hash algorithm */ + hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hash_tfm)) { + ti->error = "Error initializing ESSIV hash"; + err = PTR_ERR(hash_tfm); + goto bad; + } + + salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); + if (!salt) { + ti->error = "Error kmallocing salt storage in ESSIV"; + err = -ENOMEM; + goto bad; + } + + cc->iv_gen_private.essiv.salt = salt; + cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + + for_each_possible_cpu(cpu) { + essiv_tfm = setup_essiv_cpu(cc, ti, salt, + crypto_hash_digestsize(hash_tfm)); + if (IS_ERR(essiv_tfm)) { + crypt_iv_essiv_dtr(cc); + return PTR_ERR(essiv_tfm); + } + per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; + } + + return 0; + +bad: + if (hash_tfm && !IS_ERR(hash_tfm)) + crypto_free_hash(hash_tfm); + kfree(salt); + return err; +} + +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; + + memset(iv, 0, cc->iv_size); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); + crypto_cipher_encrypt_one(essiv_tfm, iv, iv); + + return 0; +} + +static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); + int log = ilog2(bs); + + /* we need to calculate how far we must shift the sector count + * to get the cipher block count, we use this shift in _gen */ + + if (1 << log != bs) { + ti->error = "cypher blocksize is not a power of 2"; + return -EINVAL; + } + + if (log > 9) { + ti->error = "cypher blocksize is > 512"; + return -EINVAL; + } + + cc->iv_gen_private.benbi.shift = 9 - log; + + return 0; +} + +static void crypt_iv_benbi_dtr(struct crypt_config *cc) +{ +} + +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + __be64 val; + + memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ + + val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); + put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); + + return 0; +} + +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + memset(iv, 0, cc->iv_size); + + return 0; +} + +static void crypt_iv_lmk_dtr(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) + crypto_free_shash(lmk->hash_tfm); + lmk->hash_tfm = NULL; + + kzfree(lmk->seed); + lmk->seed = NULL; +} + +static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(lmk->hash_tfm)) { + ti->error = "Error initializing LMK hash"; + return PTR_ERR(lmk->hash_tfm); + } + + /* No seed in LMK version 2 */ + if (cc->key_parts == cc->tfms_count) { + lmk->seed = NULL; + return 0; + } + + lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); + if (!lmk->seed) { + crypt_iv_lmk_dtr(cc); + ti->error = "Error kmallocing seed storage in LMK"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_lmk_init(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + int subkey_size = 
cc->key_size / cc->key_parts; + + /* LMK seed is on the position of LMK_KEYS + 1 key */ + if (lmk->seed) + memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), + crypto_shash_digestsize(lmk->hash_tfm)); + + return 0; +} + +static int crypt_iv_lmk_wipe(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->seed) + memset(lmk->seed, 0, LMK_SEED_SIZE); + + return 0; +} + +static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(lmk->hash_tfm)]; + } sdesc; + struct md5_state md5state; + u32 buf[4]; + int i, r; + + sdesc.desc.tfm = lmk->hash_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(&sdesc.desc); + if (r) + return r; + + if (lmk->seed) { + r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); + if (r) + return r; + } + + /* Sector is always 512B, block size 16, add data of blocks 1-31 */ + r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); + if (r) + return r; + + /* Sector is cropped to 56 bits here */ + buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); + buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); + buf[2] = cpu_to_le32(4024); + buf[3] = 0; + r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); + if (r) + return r; + + /* No MD5 padding here */ + r = crypto_shash_export(&sdesc.desc, &md5state); + if (r) + return r; + + for (i = 0; i < MD5_HASH_WORDS; i++) + __cpu_to_le32s(&md5state.hash[i]); + memcpy(iv, &md5state.hash, cc->iv_size); + + return 0; +} + +static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *src; + int r = 0; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in)); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src); + } else + memset(iv, 0, cc->iv_size); + + return r; +} + +static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + return 0; + + dst = kmap_atomic(sg_page(&dmreq->sg_out)); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + + /* Tweak the first block of plaintext sector */ + if (!r) + crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + + kunmap_atomic(dst); + return r; +} + +static struct crypt_iv_operations crypt_iv_plain_ops = { + .generator = crypt_iv_plain_gen +}; + +static struct crypt_iv_operations crypt_iv_plain64_ops = { + .generator = crypt_iv_plain64_gen +}; + +static struct crypt_iv_operations crypt_iv_essiv_ops = { + .ctr = crypt_iv_essiv_ctr, + .dtr = crypt_iv_essiv_dtr, + .init = crypt_iv_essiv_init, + .wipe = crypt_iv_essiv_wipe, + .generator = crypt_iv_essiv_gen +}; + +static struct crypt_iv_operations crypt_iv_benbi_ops = { + .ctr = crypt_iv_benbi_ctr, + .dtr = crypt_iv_benbi_dtr, + .generator = crypt_iv_benbi_gen +}; + +static struct crypt_iv_operations crypt_iv_null_ops = { + .generator = crypt_iv_null_gen +}; + +static struct crypt_iv_operations crypt_iv_lmk_ops = { + .ctr = crypt_iv_lmk_ctr, + .dtr = crypt_iv_lmk_dtr, + .init = crypt_iv_lmk_init, + .wipe = crypt_iv_lmk_wipe, + .generator = crypt_iv_lmk_gen, + .post = crypt_iv_lmk_post +}; + +static void crypt_convert_init(struct crypt_config *cc, + struct convert_context *ctx, + struct bio *bio_out, struct 
bio *bio_in, + sector_t sector) +{ + ctx->bio_in = bio_in; + ctx->bio_out = bio_out; + ctx->offset_in = 0; + ctx->offset_out = 0; + ctx->idx_in = bio_in ? bio_in->bi_idx : 0; + ctx->idx_out = bio_out ? bio_out->bi_idx : 0; + ctx->sector = sector + cc->iv_offset; + init_completion(&ctx->restart); +} + +static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, + struct ablkcipher_request *req) +{ + return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); +} + +static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); +} + +static u8 *iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); +} + +static int crypt_convert_block(struct crypt_config *cc, + struct convert_context *ctx, + struct ablkcipher_request *req) +{ + struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); + struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); + struct dm_crypt_request *dmreq; + u8 *iv; + int r = 0; + + dmreq = dmreq_of_req(cc, req); + iv = iv_of_dmreq(cc, dmreq); + + dmreq->iv_sector = ctx->sector; + dmreq->ctx = ctx; + sg_init_table(&dmreq->sg_in, 1); + sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, + bv_in->bv_offset + ctx->offset_in); + + sg_init_table(&dmreq->sg_out, 1); + sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, + bv_out->bv_offset + ctx->offset_out); + + ctx->offset_in += 1 << SECTOR_SHIFT; + if (ctx->offset_in >= bv_in->bv_len) { + ctx->offset_in = 0; + ctx->idx_in++; + } + + ctx->offset_out += 1 << SECTOR_SHIFT; + if (ctx->offset_out >= bv_out->bv_len) { + ctx->offset_out = 0; + ctx->idx_out++; + } + + if (cc->iv_gen_ops) { + r = cc->iv_gen_ops->generator(cc, iv, dmreq); + if (r < 0) + return r; + } + + ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, + 1 << SECTOR_SHIFT, iv); + + if (bio_data_dir(ctx->bio_in) == WRITE) + r = crypto_ablkcipher_encrypt(req); + else + r = crypto_ablkcipher_decrypt(req); + + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, iv, dmreq); + + return r; +} + +static void kcryptd_async_done(struct crypto_async_request *async_req, + int error); + +static void crypt_alloc_req(struct crypt_config *cc, + struct convert_context *ctx) +{ + struct crypt_cpu *this_cc = this_crypt_config(cc); + unsigned key_index = ctx->sector & (cc->tfms_count - 1); + + if (!this_cc->req) + this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + + ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); + ablkcipher_request_set_callback(this_cc->req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); +} + +/* + * Encrypt / decrypt data from one bio to another one (can be the same one) + */ +static int crypt_convert(struct crypt_config *cc, + struct convert_context *ctx) +{ + struct crypt_cpu *this_cc = this_crypt_config(cc); + int r; + + atomic_set(&ctx->pending, 1); + + while(ctx->idx_in < ctx->bio_in->bi_vcnt && + ctx->idx_out < ctx->bio_out->bi_vcnt) { + + crypt_alloc_req(cc, ctx); + + atomic_inc(&ctx->pending); + + r = crypt_convert_block(cc, ctx, this_cc->req); + + switch (r) { + /* async */ + case -EBUSY: + wait_for_completion(&ctx->restart); + INIT_COMPLETION(ctx->restart); + /* fall through*/ + case -EINPROGRESS: + this_cc->req = NULL; + ctx->sector++; + 
continue; + + /* sync */ + case 0: + atomic_dec(&ctx->pending); + ctx->sector++; + cond_resched(); + continue; + + /* error */ + default: + atomic_dec(&ctx->pending); + return r; + } + } + + return 0; +} + +static void dm_crypt_bio_destructor(struct bio *bio) +{ + struct dm_crypt_io *io = bio->bi_private; + struct crypt_config *cc = io->target->private; + + bio_free(bio, cc->bs); +} + +/* + * Generate a new unfragmented bio with the given size + * This should never violate the device limitations + * May return a smaller bio when running out of pages, indicated by + * *out_of_pages set to 1. + */ +static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, + unsigned *out_of_pages) +{ + struct crypt_config *cc = io->target->private; + struct bio *clone; + unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; + unsigned i, len; + struct page *page; + + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); + if (!clone) + return NULL; + + clone_init(io, clone); + *out_of_pages = 0; + + for (i = 0; i < nr_iovecs; i++) { + page = mempool_alloc(cc->page_pool, gfp_mask); + if (!page) { + *out_of_pages = 1; + break; + } + + /* + * If additional pages cannot be allocated without waiting, + * return a partially-allocated bio. The caller will then try + * to allocate more bios while submitting this partial bio. + */ + gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; + + len = (size > PAGE_SIZE) ? PAGE_SIZE : size; + + if (!bio_add_page(clone, page, len, 0)) { + mempool_free(page, cc->page_pool); + break; + } + + size -= len; + } + + if (!clone->bi_size) { + bio_put(clone); + return NULL; + } + + return clone; +} + +static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) +{ + unsigned int i; + struct bio_vec *bv; + + for (i = 0; i < clone->bi_vcnt; i++) { + bv = bio_iovec_idx(clone, i); + BUG_ON(!bv->bv_page); + mempool_free(bv->bv_page, cc->page_pool); + bv->bv_page = NULL; + } +} + +static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti, + struct bio *bio, sector_t sector) +{ + struct crypt_config *cc = ti->private; + struct dm_crypt_io *io; + + io = mempool_alloc(cc->io_pool, GFP_NOIO); + io->target = ti; + io->base_bio = bio; + io->sector = sector; + io->error = 0; + io->base_io = NULL; + atomic_set(&io->pending, 0); + + return io; +} + +static void crypt_inc_pending(struct dm_crypt_io *io) +{ + atomic_inc(&io->pending); +} + +/* + * One of the bios was finished. Check for completion of + * the whole request and correctly clean up the buffer. + * If base_io is set, wait for the last fragment to complete. + */ +static void crypt_dec_pending(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct dm_crypt_io *base_io = io->base_io; + int error = io->error; + + if (!atomic_dec_and_test(&io->pending)) + return; + + mempool_free(io, cc->io_pool); + + if (likely(!base_io)) + bio_endio(base_bio, error); + else { + if (error && !base_io->error) + base_io->error = error; + crypt_dec_pending(base_io); + } +} + +/* + * kcryptd/kcryptd_io: + * + * Needed because it would be very unwise to do decryption in an + * interrupt context. + * + * kcryptd performs the actual encryption or decryption. + * + * kcryptd_io performs the IO submission. + * + * They must be separated as otherwise the final stages could be + * starved by new requests which can block in the first stages due + * to memory allocation. 
+ * + * The work is done per CPU global for all dm-crypt instances. + * They should not depend on each other and do not block. + */ +static void crypt_endio(struct bio *clone, int error) +{ + struct dm_crypt_io *io = clone->bi_private; + struct crypt_config *cc = io->target->private; + unsigned rw = bio_data_dir(clone); + + if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) + error = -EIO; + + /* + * free the processed pages + */ + if (rw == WRITE) + crypt_free_buffer_pages(cc, clone); + + bio_put(clone); + + if (rw == READ && !error) { + kcryptd_queue_crypt(io); + return; + } + + if (unlikely(error)) + io->error = error; + + crypt_dec_pending(io); +} + +static void clone_init(struct dm_crypt_io *io, struct bio *clone) +{ + struct crypt_config *cc = io->target->private; + + clone->bi_private = io; + clone->bi_end_io = crypt_endio; + clone->bi_bdev = cc->dev->bdev; + clone->bi_rw = io->base_bio->bi_rw; + clone->bi_destructor = dm_crypt_bio_destructor; +} + +static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) +{ + struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct bio *clone; + + /* + * The block layer might modify the bvec array, so always + * copy the required bvecs because we need the original + * one in order to decrypt the whole bio data *afterwards*. + */ + clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); + if (!clone) + return 1; + + crypt_inc_pending(io); + + clone_init(io, clone); + clone->bi_idx = 0; + clone->bi_vcnt = bio_segments(base_bio); + clone->bi_size = base_bio->bi_size; + clone->bi_sector = cc->start + io->sector; + memcpy(clone->bi_io_vec, bio_iovec(base_bio), + sizeof(struct bio_vec) * clone->bi_vcnt); + + generic_make_request(clone); + return 0; +} + +static void kcryptd_io_write(struct dm_crypt_io *io) +{ + struct bio *clone = io->ctx.bio_out; + generic_make_request(clone); +} + +static void kcryptd_io(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + + if (bio_data_dir(io->base_bio) == READ) { + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; + crypt_dec_pending(io); + } else + kcryptd_io_write(io); +} + +static void kcryptd_queue_io(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + + INIT_WORK(&io->work, kcryptd_io); + queue_work(cc->io_queue, &io->work); +} + +static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) +{ + struct bio *clone = io->ctx.bio_out; + struct crypt_config *cc = io->target->private; + + if (unlikely(io->error < 0)) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + crypt_dec_pending(io); + return; + } + + /* crypt_convert should have filled the clone bio */ + BUG_ON(io->ctx.idx_out < clone->bi_vcnt); + + clone->bi_sector = cc->start + io->sector; + + if (async) + kcryptd_queue_io(io); + else + generic_make_request(clone); +} + +static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *clone; + struct dm_crypt_io *new_io; + int crypt_finished; + unsigned out_of_pages = 0; + unsigned remaining = io->base_bio->bi_size; + sector_t sector = io->sector; + int r; + + /* + * Prevent io from disappearing until this function completes. + */ + crypt_inc_pending(io); + crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector); + + /* + * The allocated buffers can be smaller than the whole bio, + * so repeat the whole process until all the data can be handled. 
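+ *
+ * Each pass encrypts as much as crypt_alloc_buffer() managed to allocate;
+ * if it ran out of pages, the loop throttles in congestion_wait() before
+ * carrying on with the remainder.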
+ */ + while (remaining) { + clone = crypt_alloc_buffer(io, remaining, &out_of_pages); + if (unlikely(!clone)) { + io->error = -ENOMEM; + break; + } + + io->ctx.bio_out = clone; + io->ctx.idx_out = 0; + + remaining -= clone->bi_size; + sector += bio_sectors(clone); + + crypt_inc_pending(io); + + r = crypt_convert(cc, &io->ctx); + if (r < 0) + io->error = -EIO; + + crypt_finished = atomic_dec_and_test(&io->ctx.pending); + + /* Encryption was already finished, submit io now */ + if (crypt_finished) { + kcryptd_crypt_write_io_submit(io, 0); + + /* + * If there was an error, do not try next fragments. + * For async, error is processed in async handler. + */ + if (unlikely(r < 0)) + break; + + io->sector = sector; + } + + /* + * Out of memory -> run queues + * But don't wait if split was due to the io size restriction + */ + if (unlikely(out_of_pages)) + congestion_wait(BLK_RW_ASYNC, HZ/100); + + /* + * With async crypto it is unsafe to share the crypto context + * between fragments, so switch to a new dm_crypt_io structure. + */ + if (unlikely(!crypt_finished && remaining)) { + new_io = crypt_io_alloc(io->target, io->base_bio, + sector); + crypt_inc_pending(new_io); + crypt_convert_init(cc, &new_io->ctx, NULL, + io->base_bio, sector); + new_io->ctx.idx_in = io->ctx.idx_in; + new_io->ctx.offset_in = io->ctx.offset_in; + + /* + * Fragments after the first use the base_io + * pending count. + */ + if (!io->base_io) + new_io->base_io = io; + else { + new_io->base_io = io->base_io; + crypt_inc_pending(io->base_io); + crypt_dec_pending(io); + } + + io = new_io; + } + } + + crypt_dec_pending(io); +} + +static void kcryptd_crypt_read_done(struct dm_crypt_io *io) +{ + crypt_dec_pending(io); +} + +static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + int r = 0; + + crypt_inc_pending(io); + + crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, + io->sector); + + r = crypt_convert(cc, &io->ctx); + if (r < 0) + io->error = -EIO; + + if (atomic_dec_and_test(&io->ctx.pending)) + kcryptd_crypt_read_done(io); + + crypt_dec_pending(io); +} + +static void kcryptd_async_done(struct crypto_async_request *async_req, + int error) +{ + struct dm_crypt_request *dmreq = async_req->data; + struct convert_context *ctx = dmreq->ctx; + struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); + struct crypt_config *cc = io->target->private; + + if (error == -EINPROGRESS) { + complete(&ctx->restart); + return; + } + + if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) + error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + + if (error < 0) + io->error = -EIO; + + mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); + + if (!atomic_dec_and_test(&ctx->pending)) + return; + + if (bio_data_dir(io->base_bio) == READ) + kcryptd_crypt_read_done(io); + else + kcryptd_crypt_write_io_submit(io, 1); +} + +static void kcryptd_crypt(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + + if (bio_data_dir(io->base_bio) == READ) + kcryptd_crypt_read_convert(io); + else + kcryptd_crypt_write_convert(io); +} + +static void kcryptd_queue_crypt(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + + INIT_WORK(&io->work, kcryptd_crypt); + queue_work(cc->crypt_queue, &io->work); +} + +/* + * Decode key from its hex representation + */ +static int crypt_decode_key(u8 *key, char *hex, unsigned int size) +{ + char buffer[3]; + char *endp; + unsigned int i; + + buffer[2] = '\0'; 
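+ /* Two hex digits make one key byte: e.g. "2a00" decodes to 0x2a, 0x00.
+ * A non-hex character stops simple_strtoul() early, which the endp check
+ * in the loop below turns into -EINVAL. */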
+ + for (i = 0; i < size; i++) { + buffer[0] = *hex++; + buffer[1] = *hex++; + + key[i] = (u8)simple_strtoul(buffer, &endp, 16); + + if (endp != &buffer[2]) + return -EINVAL; + } + + if (*hex != '\0') + return -EINVAL; + + return 0; +} + +/* + * Encode key into its hex representation + */ +static void crypt_encode_key(char *hex, u8 *key, unsigned int size) +{ + unsigned int i; + + for (i = 0; i < size; i++) { + sprintf(hex, "%02x", *key); + hex += 2; + key++; + } +} + +static void crypt_free_tfms(struct crypt_config *cc, int cpu) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + + for (i = 0; i < cc->tfms_count; i++) + if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { + crypto_free_ablkcipher(cpu_cc->tfms[i]); + cpu_cc->tfms[i] = NULL; + } +} + +static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + int err; + + for (i = 0; i < cc->tfms_count; i++) { + cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + if (IS_ERR(cpu_cc->tfms[i])) { + err = PTR_ERR(cpu_cc->tfms[i]); + crypt_free_tfms(cc, cpu); + return err; + } + } + + return 0; +} + +static int crypt_setkey_allcpus(struct crypt_config *cc) +{ + unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); + int cpu, err = 0, i, r; + + for_each_possible_cpu(cpu) { + for (i = 0; i < cc->tfms_count; i++) { + r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], + cc->key + (i * subkey_size), subkey_size); + if (r) + err = r; + } + } + + return err; +} + +static int crypt_set_key(struct crypt_config *cc, char *key) +{ + int r = -EINVAL; + int key_string_len = strlen(key); + + /* The key size may not be changed. */ + if (cc->key_size != (key_string_len >> 1)) + goto out; + + /* Hyphen (which gives a key_size of zero) means there is no key. */ + if (!cc->key_size && strcmp(key, "-")) + goto out; + + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) + goto out; + + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + + r = crypt_setkey_allcpus(cc); + +out: + /* Hex key string not needed after here, so wipe it. 
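+ * (overwritten with '0' characters so the plaintext hex does not linger
+ * in the table argument buffer)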
*/ + memset(key, '0', key_string_len); + + return r; +} + +static int crypt_wipe_key(struct crypt_config *cc) +{ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + memset(&cc->key, 0, cc->key_size * sizeof(u8)); + + return crypt_setkey_allcpus(cc); +} + +static void crypt_dtr(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + struct crypt_cpu *cpu_cc; + int cpu; + + ti->private = NULL; + + if (!cc) + return; + + if (cc->io_queue) + destroy_workqueue(cc->io_queue); + if (cc->crypt_queue) + destroy_workqueue(cc->crypt_queue); + + if (cc->cpu) + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + if (cpu_cc->req) + mempool_free(cpu_cc->req, cc->req_pool); + crypt_free_tfms(cc, cpu); + } + + if (cc->bs) + bioset_free(cc->bs); + + if (cc->page_pool) + mempool_destroy(cc->page_pool); + if (cc->req_pool) + mempool_destroy(cc->req_pool); + if (cc->io_pool) + mempool_destroy(cc->io_pool); + + if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) + cc->iv_gen_ops->dtr(cc); + + if (cc->dev) + dm_put_device(ti, cc->dev); + + if (cc->cpu) + free_percpu(cc->cpu); + + kzfree(cc->cipher); + kzfree(cc->cipher_string); + + /* Must zero key material before freeing */ + kzfree(cc); +} + +static int crypt_ctr_cipher(struct dm_target *ti, + char *cipher_in, char *key) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; + char *cipher_api = NULL; + int cpu, ret = -EINVAL; + char dummy; + + /* Convert to crypto api definition? */ + if (strchr(cipher_in, '(')) { + ti->error = "Bad cipher specification"; + return -EINVAL; + } + + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) + goto bad_mem; + + /* + * Legacy dm-crypt cipher specification + * cipher[:keycount]-mode-iv:ivopts + */ + tmp = cipher_in; + keycount = strsep(&tmp, "-"); + cipher = strsep(&keycount, ":"); + + if (!keycount) + cc->tfms_count = 1; + else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || + !is_power_of_2(cc->tfms_count)) { + ti->error = "Bad cipher key count specification"; + return -EINVAL; + } + cc->key_parts = cc->tfms_count; + + cc->cipher = kstrdup(cipher, GFP_KERNEL); + if (!cc->cipher) + goto bad_mem; + + chainmode = strsep(&tmp, "-"); + ivopts = strsep(&tmp, "-"); + ivmode = strsep(&ivopts, ":"); + + if (tmp) + DMWARN("Ignoring unexpected additional cipher options"); + + cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + + cc->tfms_count * sizeof(*(cc->cpu->tfms)), + __alignof__(struct crypt_cpu)); + if (!cc->cpu) { + ti->error = "Cannot allocate per cpu state"; + goto bad_mem; + } + + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. 
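+ * A bare "aes", for instance, is treated as "aes-cbc-plain" and becomes
+ * the crypto API spec "cbc(aes)" with the plain IV generator.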
+ */ + if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { + chainmode = "cbc"; + ivmode = "plain"; + } + + if (strcmp(chainmode, "ecb") && !ivmode) { + ti->error = "IV mechanism required"; + return -EINVAL; + } + + cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); + if (!cipher_api) + goto bad_mem; + + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "%s(%s)", chainmode, cipher); + if (ret < 0) { + kfree(cipher_api); + goto bad_mem; + } + + /* Allocate cipher */ + for_each_possible_cpu(cpu) { + ret = crypt_alloc_tfms(cc, cpu, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + goto bad; + } + } + + /* Initialize and set key */ + ret = crypt_set_key(cc, key); + if (ret < 0) { + ti->error = "Error decoding and setting key"; + goto bad; + } + + /* Initialize IV */ + cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); + if (cc->iv_size) + /* at least a 64 bit sector number should fit in our buffer */ + cc->iv_size = max(cc->iv_size, + (unsigned int)(sizeof(u64) / sizeof(u8))); + else if (ivmode) { + DMWARN("Selected cipher does not support IVs"); + ivmode = NULL; + } + + /* Choose ivmode, see comments at iv code. */ + if (ivmode == NULL) + cc->iv_gen_ops = NULL; + else if (strcmp(ivmode, "plain") == 0) + cc->iv_gen_ops = &crypt_iv_plain_ops; + else if (strcmp(ivmode, "plain64") == 0) + cc->iv_gen_ops = &crypt_iv_plain64_ops; + else if (strcmp(ivmode, "essiv") == 0) + cc->iv_gen_ops = &crypt_iv_essiv_ops; + else if (strcmp(ivmode, "benbi") == 0) + cc->iv_gen_ops = &crypt_iv_benbi_ops; + else if (strcmp(ivmode, "null") == 0) + cc->iv_gen_ops = &crypt_iv_null_ops; + else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. 
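+ * For example, a 64-key "aes:64-cbc-lmk" string is version 2, while the
+ * same mapping with a 65th trailing key (the IV seed) is version 3; the
+ * check below accounts for that extra key part.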
+ */ + if (cc->key_size % cc->key_parts) + cc->key_parts++; + } else { + ret = -EINVAL; + ti->error = "Invalid IV mode"; + goto bad; + } + + /* Allocate IV */ + if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { + ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); + if (ret < 0) { + ti->error = "Error creating IV"; + goto bad; + } + } + + /* Initialize IV (set keys for ESSIV etc) */ + if (cc->iv_gen_ops && cc->iv_gen_ops->init) { + ret = cc->iv_gen_ops->init(cc); + if (ret < 0) { + ti->error = "Error initialising IV"; + goto bad; + } + } + + ret = 0; +bad: + kfree(cipher_api); + return ret; + +bad_mem: + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; +} + +/* + * Construct an encryption mapping: + * <cipher> <key> <iv_offset> <dev_path> <start> + */ +static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc; + unsigned int key_size, opt_params; + unsigned long long tmpll; + int ret; + struct dm_arg_set as; + const char *opt_string; + char dummy; + + static struct dm_arg _args[] = { + {0, 1, "Invalid number of feature args"}, + }; + + if (argc < 5) { + ti->error = "Not enough arguments"; + return -EINVAL; + } + + key_size = strlen(argv[1]) >> 1; + + cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + if (!cc) { + ti->error = "Cannot allocate encryption context"; + return -ENOMEM; + } + cc->key_size = key_size; + + ti->private = cc; + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; + + ret = -ENOMEM; + cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); + if (!cc->io_pool) { + ti->error = "Cannot allocate crypt io mempool"; + goto bad; + } + + cc->dmreq_start = sizeof(struct ablkcipher_request); + cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); + cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); + cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & + ~(crypto_tfm_ctx_alignment() - 1); + + cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + + sizeof(struct dm_crypt_request) + cc->iv_size); + if (!cc->req_pool) { + ti->error = "Cannot allocate crypt request mempool"; + goto bad; + } + + cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); + if (!cc->page_pool) { + ti->error = "Cannot allocate page mempool"; + goto bad; + } + + cc->bs = bioset_create(MIN_IOS, 0); + if (!cc->bs) { + ti->error = "Cannot allocate crypt bioset"; + goto bad; + } + + ret = -EINVAL; + if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { + ti->error = "Invalid iv_offset sector"; + goto bad; + } + cc->iv_offset = tmpll; + + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { + ti->error = "Device lookup failed"; + goto bad; + } + + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { + ti->error = "Invalid device sector"; + goto bad; + } + cc->start = tmpll; + + argv += 5; + argc -= 5; + + /* Optional parameters */ + if (argc) { + as.argc = argc; + as.argv = argv; + + ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (ret) + goto bad; + + opt_string = dm_shift_arg(&as); + + if (opt_params == 1 && opt_string && + !strcasecmp(opt_string, "allow_discards")) + ti->num_discard_requests = 1; + else if (opt_params) { + ret = -EINVAL; + ti->error = "Invalid feature arguments"; + goto bad; + } + } + + ret = -ENOMEM; + cc->io_queue = alloc_workqueue("kcryptd_io", + WQ_NON_REENTRANT| + WQ_MEM_RECLAIM, + 1); + if (!cc->io_queue) { + ti->error = "Couldn't create kcryptd io queue"; + goto bad; + } + + cc->crypt_queue = 
alloc_workqueue("kcryptd", + WQ_NON_REENTRANT| + WQ_CPU_INTENSIVE| + WQ_MEM_RECLAIM, + 1); + if (!cc->crypt_queue) { + ti->error = "Couldn't create kcryptd queue"; + goto bad; + } + + ti->num_flush_requests = 1; + ti->discard_zeroes_data_unsupported = 1; + + return 0; + +bad: + crypt_dtr(ti); + return ret; +} + +static int crypt_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dm_crypt_io *io; + struct crypt_config *cc; + + /* + * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. + * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight + * - for REQ_DISCARD caller must use flush if IO ordering matters + */ + if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { + cc = ti->private; + bio->bi_bdev = cc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); + return DM_MAPIO_REMAPPED; + } + + io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); + + if (bio_data_dir(io->base_bio) == READ) { + if (kcryptd_io_read(io, GFP_NOWAIT)) + kcryptd_queue_io(io); + } else + kcryptd_queue_crypt(io); + + return DM_MAPIO_SUBMITTED; +} + +static int crypt_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct crypt_config *cc = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s ", cc->cipher_string); + + if (cc->key_size > 0) { + if ((maxlen - sz) < ((cc->key_size << 1) + 1)) + return -ENOMEM; + + crypt_encode_key(result + sz, cc->key, cc->key_size); + sz += cc->key_size << 1; + } else { + if (sz >= maxlen) + return -ENOMEM; + result[sz++] = '-'; + } + + DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, + cc->dev->name, (unsigned long long)cc->start); + + if (ti->num_discard_requests) + DMEMIT(" 1 allow_discards"); + + break; + } + return 0; +} + +static void crypt_postsuspend(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + set_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +static int crypt_preresume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + if (!test_bit(DM_CRYPT_KEY_VALID, &cc->flags)) { + DMERR("aborting resume - crypt key is not set."); + return -EAGAIN; + } + + return 0; +} + +static void crypt_resume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + clear_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +/* Message interface + * key set <key> + * key wipe + */ +static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct crypt_config *cc = ti->private; + int ret = -EINVAL; + + if (argc < 2) + goto error; + + if (!strcasecmp(argv[0], "key")) { + if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { + DMWARN("not suspended during key manipulation."); + return -EINVAL; + } + if (argc == 3 && !strcasecmp(argv[1], "set")) { + ret = crypt_set_key(cc, argv[2]); + if (ret) + return ret; + if (cc->iv_gen_ops && cc->iv_gen_ops->init) + ret = cc->iv_gen_ops->init(cc); + return ret; + } + if (argc == 2 && !strcasecmp(argv[1], "wipe")) { + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { + ret = cc->iv_gen_ops->wipe(cc); + if (ret) + return ret; + } + return crypt_wipe_key(cc); + } + } + +error: + DMWARN("unrecognised message received."); + return -EINVAL; +} + +static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct crypt_config *cc = ti->private; + struct request_queue *q = 
bdev_get_queue(cc->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = cc->dev->bdev; + bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int crypt_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct crypt_config *cc = ti->private; + + return fn(ti, cc->dev, cc->start, ti->len, data); +} + +static struct target_type crypt_target = { + .name = "crypt", + .version = {1, 11, 0}, + .module = THIS_MODULE, + .ctr = crypt_ctr, + .dtr = crypt_dtr, + .map = crypt_map, + .status = crypt_status, + .postsuspend = crypt_postsuspend, + .preresume = crypt_preresume, + .resume = crypt_resume, + .message = crypt_message, + .merge = crypt_merge, + .iterate_devices = crypt_iterate_devices, +}; + +static int __init dm_crypt_init(void) +{ + int r; + + _crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0); + if (!_crypt_io_pool) + return -ENOMEM; + + r = dm_register_target(&crypt_target); + if (r < 0) { + DMERR("register failed %d", r); + kmem_cache_destroy(_crypt_io_pool); + } + + return r; +} + +static void __exit dm_crypt_exit(void) +{ + dm_unregister_target(&crypt_target); + kmem_cache_destroy(_crypt_io_pool); +} + +module_init(dm_crypt_init); +module_exit(dm_crypt_exit); + +MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); +MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c new file mode 100644 index 00000000..2dc22ddd --- /dev/null +++ b/drivers/md/dm-delay.c @@ -0,0 +1,397 @@ +/* + * Copyright (C) 2005-2007 Red Hat GmbH + * + * A target that delays reads and/or writes and can send + * them to different devices. + * + * This file is released under the GPL. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> + +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "delay" + +struct delay_c { + struct timer_list delay_timer; + struct mutex timer_lock; + struct work_struct flush_expired_bios; + struct list_head delayed_bios; + atomic_t may_delay; + mempool_t *delayed_pool; + + struct dm_dev *dev_read; + sector_t start_read; + unsigned read_delay; + unsigned reads; + + struct dm_dev *dev_write; + sector_t start_write; + unsigned write_delay; + unsigned writes; +}; + +struct dm_delay_info { + struct delay_c *context; + struct list_head list; + struct bio *bio; + unsigned long expires; +}; + +static DEFINE_MUTEX(delayed_bios_lock); + +static struct workqueue_struct *kdelayd_wq; +static struct kmem_cache *delayed_cache; + +static void handle_delayed_timer(unsigned long data) +{ + struct delay_c *dc = (struct delay_c *)data; + + queue_work(kdelayd_wq, &dc->flush_expired_bios); +} + +static void queue_timeout(struct delay_c *dc, unsigned long expires) +{ + mutex_lock(&dc->timer_lock); + + if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires) + mod_timer(&dc->delay_timer, expires); + + mutex_unlock(&dc->timer_lock); +} + +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = n; + } +} + +static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) +{ + struct dm_delay_info *delayed, *next; + unsigned long next_expires = 0; + int start_timer = 0; + struct bio_list flush_bios = { }; + + mutex_lock(&delayed_bios_lock); + list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { + if (flush_all || time_after_eq(jiffies, delayed->expires)) { + list_del(&delayed->list); + bio_list_add(&flush_bios, delayed->bio); + if ((bio_data_dir(delayed->bio) == WRITE)) + delayed->context->writes--; + else + delayed->context->reads--; + mempool_free(delayed, dc->delayed_pool); + continue; + } + + if (!start_timer) { + start_timer = 1; + next_expires = delayed->expires; + } else + next_expires = min(next_expires, delayed->expires); + } + + mutex_unlock(&delayed_bios_lock); + + if (start_timer) + queue_timeout(dc, next_expires); + + return bio_list_get(&flush_bios); +} + +static void flush_expired_bios(struct work_struct *work) +{ + struct delay_c *dc; + + dc = container_of(work, struct delay_c, flush_expired_bios); + flush_bios(flush_delayed_bios(dc, 0)); +} + +/* + * Mapping parameters: + * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>] + * + * With separate write parameters, the first set is only used for reads. + * Delays are specified in milliseconds. 
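+ *
+ * Illustrative dmsetup table line (a sketch only; the device paths and
+ * size below are placeholders, not taken from this code): delay reads
+ * from /dev/sdX by 100 ms and writes to /dev/sdY by 500 ms across a
+ * 1048576-sector mapping:
+ *
+ *   0 1048576 delay /dev/sdX 0 100 /dev/sdY 0 500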
+ */ +static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct delay_c *dc; + unsigned long long tmpll; + char dummy; + + if (argc != 3 && argc != 6) { + ti->error = "requires exactly 3 or 6 arguments"; + return -EINVAL; + } + + dc = kmalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + + dc->reads = dc->writes = 0; + + if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { + ti->error = "Invalid device sector"; + goto bad; + } + dc->start_read = tmpll; + + if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { + ti->error = "Invalid delay"; + goto bad; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &dc->dev_read)) { + ti->error = "Device lookup failed"; + goto bad; + } + + dc->dev_write = NULL; + if (argc == 3) + goto out; + + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { + ti->error = "Invalid write device sector"; + goto bad_dev_read; + } + dc->start_write = tmpll; + + if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { + ti->error = "Invalid write delay"; + goto bad_dev_read; + } + + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), + &dc->dev_write)) { + ti->error = "Write device lookup failed"; + goto bad_dev_read; + } + +out: + dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache); + if (!dc->delayed_pool) { + DMERR("Couldn't create delayed bio pool."); + goto bad_dev_write; + } + + setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); + + INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); + INIT_LIST_HEAD(&dc->delayed_bios); + mutex_init(&dc->timer_lock); + atomic_set(&dc->may_delay, 1); + + ti->num_flush_requests = 1; + ti->num_discard_requests = 1; + ti->private = dc; + return 0; + +bad_dev_write: + if (dc->dev_write) + dm_put_device(ti, dc->dev_write); +bad_dev_read: + dm_put_device(ti, dc->dev_read); +bad: + kfree(dc); + return -EINVAL; +} + +static void delay_dtr(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + flush_workqueue(kdelayd_wq); + + dm_put_device(ti, dc->dev_read); + + if (dc->dev_write) + dm_put_device(ti, dc->dev_write); + + mempool_destroy(dc->delayed_pool); + kfree(dc); +} + +static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) +{ + struct dm_delay_info *delayed; + unsigned long expires = 0; + + if (!delay || !atomic_read(&dc->may_delay)) + return 1; + + delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); + + delayed->context = dc; + delayed->bio = bio; + delayed->expires = expires = jiffies + (delay * HZ / 1000); + + mutex_lock(&delayed_bios_lock); + + if (bio_data_dir(bio) == WRITE) + dc->writes++; + else + dc->reads++; + + list_add_tail(&delayed->list, &dc->delayed_bios); + + mutex_unlock(&delayed_bios_lock); + + queue_timeout(dc, expires); + + return 0; +} + +static void delay_presuspend(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + atomic_set(&dc->may_delay, 0); + del_timer_sync(&dc->delay_timer); + flush_bios(flush_delayed_bios(dc, 1)); +} + +static void delay_resume(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + atomic_set(&dc->may_delay, 1); +} + +static int delay_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct delay_c *dc = ti->private; + + if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { + bio->bi_bdev = dc->dev_write->bdev; + if (bio_sectors(bio)) + bio->bi_sector = dc->start_write + + dm_target_offset(ti, bio->bi_sector); + + return delay_bio(dc, dc->write_delay, 
bio); + } + + bio->bi_bdev = dc->dev_read->bdev; + bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); + + return delay_bio(dc, dc->read_delay, bio); +} + +static int delay_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct delay_c *dc = ti->private; + int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%u %u", dc->reads, dc->writes); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u", dc->dev_read->name, + (unsigned long long) dc->start_read, + dc->read_delay); + if (dc->dev_write) + DMEMIT(" %s %llu %u", dc->dev_write->name, + (unsigned long long) dc->start_write, + dc->write_delay); + break; + } + + return 0; +} + +static int delay_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct delay_c *dc = ti->private; + int ret = 0; + + ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data); + if (ret) + goto out; + + if (dc->dev_write) + ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data); + +out: + return ret; +} + +static struct target_type delay_target = { + .name = "delay", + .version = {1, 1, 0}, + .module = THIS_MODULE, + .ctr = delay_ctr, + .dtr = delay_dtr, + .map = delay_map, + .presuspend = delay_presuspend, + .resume = delay_resume, + .status = delay_status, + .iterate_devices = delay_iterate_devices, +}; + +static int __init dm_delay_init(void) +{ + int r = -ENOMEM; + + kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!kdelayd_wq) { + DMERR("Couldn't start kdelayd"); + goto bad_queue; + } + + delayed_cache = KMEM_CACHE(dm_delay_info, 0); + if (!delayed_cache) { + DMERR("Couldn't create delayed bio cache."); + goto bad_memcache; + } + + r = dm_register_target(&delay_target); + if (r < 0) { + DMERR("register failed %d", r); + goto bad_register; + } + + return 0; + +bad_register: + kmem_cache_destroy(delayed_cache); +bad_memcache: + destroy_workqueue(kdelayd_wq); +bad_queue: + return r; +} + +static void __exit dm_delay_exit(void) +{ + dm_unregister_target(&delay_target); + kmem_cache_destroy(delayed_cache); + destroy_workqueue(kdelayd_wq); +} + +/* Module hooks */ +module_init(dm_delay_init); +module_exit(dm_delay_exit); + +MODULE_DESCRIPTION(DM_NAME " delay target"); +MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c new file mode 100644 index 00000000..aa70f7d4 --- /dev/null +++ b/drivers/md/dm-exception-store.c @@ -0,0 +1,295 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006-2008 Red Hat GmbH + * + * This file is released under the GPL. 
+ */ + +#include "dm-exception-store.h" + +#include <linux/ctype.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/slab.h> + +#define DM_MSG_PREFIX "snapshot exception stores" + +static LIST_HEAD(_exception_store_types); +static DEFINE_SPINLOCK(_lock); + +static struct dm_exception_store_type *__find_exception_store_type(const char *name) +{ + struct dm_exception_store_type *type; + + list_for_each_entry(type, &_exception_store_types, list) + if (!strcmp(name, type->name)) + return type; + + return NULL; +} + +static struct dm_exception_store_type *_get_exception_store_type(const char *name) +{ + struct dm_exception_store_type *type; + + spin_lock(&_lock); + + type = __find_exception_store_type(name); + + if (type && !try_module_get(type->module)) + type = NULL; + + spin_unlock(&_lock); + + return type; +} + +/* + * get_type + * @type_name + * + * Attempt to retrieve the dm_exception_store_type by name. If not already + * available, attempt to load the appropriate module. + * + * Exstore modules are named "dm-exstore-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-exstore-<type_name>", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-shared", it would search + * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'. + * + * 'dm-exception-store-<type_name>' is too long of a name in my + * opinion, which is why I've chosen to have the files + * containing exception store implementations be 'dm-exstore-<type_name>'. + * If you want your module to be autoloaded, you will follow this + * naming convention. + * + * Returns: dm_exception_store_type* on success, NULL on failure + */ +static struct dm_exception_store_type *get_type(const char *type_name) +{ + char *p, *type_name_dup; + struct dm_exception_store_type *type; + + type = _get_exception_store_type(type_name); + if (type) + return type; + + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMERR("No memory left to attempt load for \"%s\"", type_name); + return NULL; + } + + while (request_module("dm-exstore-%s", type_name_dup) || + !(type = _get_exception_store_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; + } + + if (!type) + DMWARN("Module for exstore type \"%s\" not found.", type_name); + + kfree(type_name_dup); + + return type; +} + +static void put_type(struct dm_exception_store_type *type) +{ + spin_lock(&_lock); + module_put(type->module); + spin_unlock(&_lock); +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type) +{ + int r = 0; + + spin_lock(&_lock); + if (!__find_exception_store_type(type->name)) + list_add(&type->list, &_exception_store_types); + else + r = -EEXIST; + spin_unlock(&_lock); + + return r; +} +EXPORT_SYMBOL(dm_exception_store_type_register); + +int dm_exception_store_type_unregister(struct dm_exception_store_type *type) +{ + spin_lock(&_lock); + + if (!__find_exception_store_type(type->name)) { + spin_unlock(&_lock); + return -EINVAL; + } + + list_del(&type->list); + + spin_unlock(&_lock); + + return 0; +} +EXPORT_SYMBOL(dm_exception_store_type_unregister); + +static int set_chunk_size(struct dm_exception_store *store, + const char *chunk_size_arg, char **error) +{ + unsigned long chunk_size_ulong; + char *value; + + chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); + if (*chunk_size_arg == '\0' || 
*value != '\0' || + chunk_size_ulong > UINT_MAX) { + *error = "Invalid chunk size"; + return -EINVAL; + } + + if (!chunk_size_ulong) { + store->chunk_size = store->chunk_mask = store->chunk_shift = 0; + return 0; + } + + return dm_exception_store_set_chunk_size(store, + (unsigned) chunk_size_ulong, + error); +} + +int dm_exception_store_set_chunk_size(struct dm_exception_store *store, + unsigned chunk_size, + char **error) +{ + /* Check chunk_size is a power of 2 */ + if (!is_power_of_2(chunk_size)) { + *error = "Chunk size is not a power of 2"; + return -EINVAL; + } + + /* Validate the chunk size against the device block size */ + if (chunk_size % + (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) || + chunk_size % + (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) { + *error = "Chunk size is not a multiple of device blocksize"; + return -EINVAL; + } + + if (chunk_size > INT_MAX >> SECTOR_SHIFT) { + *error = "Chunk size is too high"; + return -EINVAL; + } + + store->chunk_size = chunk_size; + store->chunk_mask = chunk_size - 1; + store->chunk_shift = ffs(chunk_size) - 1; + + return 0; +} + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, + unsigned *args_used, + struct dm_exception_store **store) +{ + int r = 0; + struct dm_exception_store_type *type = NULL; + struct dm_exception_store *tmp_store; + char persistent; + + if (argc < 2) { + ti->error = "Insufficient exception store arguments"; + return -EINVAL; + } + + tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL); + if (!tmp_store) { + ti->error = "Exception store allocation failed"; + return -ENOMEM; + } + + persistent = toupper(*argv[0]); + if (persistent == 'P') + type = get_type("P"); + else if (persistent == 'N') + type = get_type("N"); + else { + ti->error = "Persistent flag is not P or N"; + r = -EINVAL; + goto bad_type; + } + + if (!type) { + ti->error = "Exception store type not recognised"; + r = -EINVAL; + goto bad_type; + } + + tmp_store->type = type; + tmp_store->snap = snap; + + r = set_chunk_size(tmp_store, argv[1], &ti->error); + if (r) + goto bad; + + r = type->ctr(tmp_store, 0, NULL); + if (r) { + ti->error = "Exception store type constructor failed"; + goto bad; + } + + *args_used = 2; + *store = tmp_store; + return 0; + +bad: + put_type(type); +bad_type: + kfree(tmp_store); + return r; +} +EXPORT_SYMBOL(dm_exception_store_create); + +void dm_exception_store_destroy(struct dm_exception_store *store) +{ + store->type->dtr(store); + put_type(store->type); + kfree(store); +} +EXPORT_SYMBOL(dm_exception_store_destroy); + +int dm_exception_store_init(void) +{ + int r; + + r = dm_transient_snapshot_init(); + if (r) { + DMERR("Unable to register transient exception store type."); + goto transient_fail; + } + + r = dm_persistent_snapshot_init(); + if (r) { + DMERR("Unable to register persistent exception store type"); + goto persistent_fail; + } + + return 0; + +persistent_fail: + dm_transient_snapshot_exit(); +transient_fail: + return r; +} + +void dm_exception_store_exit(void) +{ + dm_persistent_snapshot_exit(); + dm_transient_snapshot_exit(); +} diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h new file mode 100644 index 00000000..0b253624 --- /dev/null +++ b/drivers/md/dm-exception-store.h @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * + * Device-mapper snapshot exception store. 
+ * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_EXCEPTION_STORE +#define _LINUX_DM_EXCEPTION_STORE + +#include <linux/blkdev.h> +#include <linux/device-mapper.h> + +/* + * The snapshot code deals with largish chunks of the disk at a + * time. Typically 32k - 512k. + */ +typedef sector_t chunk_t; + +/* + * An exception is used where an old chunk of data has been + * replaced by a new one. + * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number + * of chunks that follow contiguously. Remaining bits hold the number of the + * chunk within the device. + */ +struct dm_exception { + struct list_head hash_list; + + chunk_t old_chunk; + chunk_t new_chunk; +}; + +/* + * Abstraction to handle the meta/layout of exception stores (the + * COW device). + */ +struct dm_exception_store; +struct dm_exception_store_type { + const char *name; + struct module *module; + + int (*ctr) (struct dm_exception_store *store, + unsigned argc, char **argv); + + /* + * Destroys this object when you've finished with it. + */ + void (*dtr) (struct dm_exception_store *store); + + /* + * The target shouldn't read the COW device until this is + * called. As exceptions are read from the COW, they are + * reported back via the callback. + */ + int (*read_metadata) (struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context); + + /* + * Find somewhere to store the next exception. + */ + int (*prepare_exception) (struct dm_exception_store *store, + struct dm_exception *e); + + /* + * Update the metadata with this exception. + */ + void (*commit_exception) (struct dm_exception_store *store, + struct dm_exception *e, + void (*callback) (void *, int success), + void *callback_context); + + /* + * Returns 0 if the exception store is empty. + * + * If there are exceptions still to be merged, sets + * *last_old_chunk and *last_new_chunk to the most recent + * still-to-be-merged chunk and returns the number of + * consecutive previous ones. + */ + int (*prepare_merge) (struct dm_exception_store *store, + chunk_t *last_old_chunk, chunk_t *last_new_chunk); + + /* + * Clear the last n exceptions. + * nr_merged must be <= the value returned by prepare_merge. + */ + int (*commit_merge) (struct dm_exception_store *store, int nr_merged); + + /* + * The snapshot is invalid, note this in the metadata. + */ + void (*drop_snapshot) (struct dm_exception_store *store); + + unsigned (*status) (struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen); + + /* + * Return how full the snapshot is. + */ + void (*usage) (struct dm_exception_store *store, + sector_t *total_sectors, sector_t *sectors_allocated, + sector_t *metadata_sectors); + + /* For internal device-mapper use only. */ + struct list_head list; +}; + +struct dm_snapshot; + +struct dm_exception_store { + struct dm_exception_store_type *type; + struct dm_snapshot *snap; + + /* Size of data blocks saved - must be a power of 2 */ + unsigned chunk_size; + unsigned chunk_mask; + unsigned chunk_shift; + + void *context; +}; + +/* + * Obtain the origin or cow device used by a given snapshot. 
+ */ +struct dm_dev *dm_snap_origin(struct dm_snapshot *snap); +struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); + +/* + * Funtions to manipulate consecutive chunks + */ +# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) +# define DM_CHUNK_CONSECUTIVE_BITS 8 +# define DM_CHUNK_NUMBER_BITS 56 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); +} + +static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) +{ + return e->new_chunk >> DM_CHUNK_NUMBER_BITS; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) +{ + e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); + + BUG_ON(!dm_consecutive_chunk_count(e)); +} + +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) +{ + BUG_ON(!dm_consecutive_chunk_count(e)); + + e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); +} + +# else +# define DM_CHUNK_CONSECUTIVE_BITS 0 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk; +} + +static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) +{ + return 0; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) +{ +} + +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) +{ +} + +# endif + +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(struct block_device *bdev) +{ + return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; +} + +static inline chunk_t sector_to_chunk(struct dm_exception_store *store, + sector_t sector) +{ + return sector >> store->chunk_shift; +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type); +int dm_exception_store_type_unregister(struct dm_exception_store_type *type); + +int dm_exception_store_set_chunk_size(struct dm_exception_store *store, + unsigned chunk_size, + char **error); + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, + unsigned *args_used, + struct dm_exception_store **store); +void dm_exception_store_destroy(struct dm_exception_store *store); + +int dm_exception_store_init(void); +void dm_exception_store_exit(void); + +/* + * Two exception store implementations. + */ +int dm_persistent_snapshot_init(void); +void dm_persistent_snapshot_exit(void); + +int dm_transient_snapshot_init(void); +void dm_transient_snapshot_exit(void); + +#endif /* _LINUX_DM_EXCEPTION_STORE */ diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c new file mode 100644 index 00000000..ac49c01f --- /dev/null +++ b/drivers/md/dm-flakey.c @@ -0,0 +1,442 @@ +/* + * Copyright (C) 2003 Sistina Software (UK) Limited. + * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include <linux/device-mapper.h> + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> + +#define DM_MSG_PREFIX "flakey" + +#define all_corrupt_bio_flags_match(bio, fc) \ + (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) + +/* + * Flakey: Used for testing only, simulates intermittent, + * catastrophic device failure. 
+ */ +struct flakey_c { + struct dm_dev *dev; + unsigned long start_time; + sector_t start; + unsigned up_interval; + unsigned down_interval; + unsigned long flags; + unsigned corrupt_bio_byte; + unsigned corrupt_bio_rw; + unsigned corrupt_bio_value; + unsigned corrupt_bio_flags; +}; + +enum feature_flag_bits { + DROP_WRITES +}; + +static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, + struct dm_target *ti) +{ + int r; + unsigned argc; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, 6, "Invalid number of feature args"}, + {1, UINT_MAX, "Invalid corrupt bio byte"}, + {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, + {0, UINT_MAX, "Invalid corrupt bio flags mask"}, + }; + + /* No feature arguments supplied. */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return r; + + while (argc) { + arg_name = dm_shift_arg(as); + argc--; + + /* + * drop_writes + */ + if (!strcasecmp(arg_name, "drop_writes")) { + if (test_and_set_bit(DROP_WRITES, &fc->flags)) { + ti->error = "Feature drop_writes duplicated"; + return -EINVAL; + } + + continue; + } + + /* + * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> + */ + if (!strcasecmp(arg_name, "corrupt_bio_byte")) { + if (!argc) { + ti->error = "Feature corrupt_bio_byte requires parameters"; + return -EINVAL; + } + + r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error); + if (r) + return r; + argc--; + + /* + * Direction r or w? + */ + arg_name = dm_shift_arg(as); + if (!strcasecmp(arg_name, "w")) + fc->corrupt_bio_rw = WRITE; + else if (!strcasecmp(arg_name, "r")) + fc->corrupt_bio_rw = READ; + else { + ti->error = "Invalid corrupt bio direction (r or w)"; + return -EINVAL; + } + argc--; + + /* + * Value of byte (0-255) to write in place of correct one. + */ + r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error); + if (r) + return r; + argc--; + + /* + * Only corrupt bios with these flags set. + */ + r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error); + if (r) + return r; + argc--; + + continue; + } + + ti->error = "Unrecognised flakey feature requested"; + return -EINVAL; + } + + if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { + ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } + + return 0; +} + +/* + * Construct a flakey mapping: + * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*] + * + * Feature args: + * [drop_writes] + * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>] + * + * Nth_byte starts from 1 for the first byte. + * Direction is r for READ or w for WRITE. + * bio_flags is ignored if 0. 
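+ *
+ * Illustrative table lines (a sketch only; device path and size are
+ * placeholders, not taken from this code):
+ *
+ *   0 409600 flakey /dev/sdX 0 180 20
+ *     behaves normally for 180s, then misbehaves for 20s, repeating;
+ *     with no feature args, writes fail with -EIO during the down
+ *     interval while reads still pass through in this version.
+ *
+ *   0 409600 flakey /dev/sdX 0 180 20 1 drop_writes
+ *     as above, but during the down interval writes are acknowledged
+ *     and silently discarded instead of erroring.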
+ */ +static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + static struct dm_arg _args[] = { + {0, UINT_MAX, "Invalid up interval"}, + {0, UINT_MAX, "Invalid down interval"}, + }; + + int r; + struct flakey_c *fc; + unsigned long long tmpll; + struct dm_arg_set as; + const char *devname; + char dummy; + + as.argc = argc; + as.argv = argv; + + if (argc < 4) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + fc = kzalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) { + ti->error = "Cannot allocate linear context"; + return -ENOMEM; + } + fc->start_time = jiffies; + + devname = dm_shift_arg(&as); + + if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { + ti->error = "Invalid device sector"; + goto bad; + } + fc->start = tmpll; + + r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error); + if (r) + goto bad; + + r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + if (r) + goto bad; + + if (!(fc->up_interval + fc->down_interval)) { + ti->error = "Total (up + down) interval is zero"; + goto bad; + } + + if (fc->up_interval + fc->down_interval < fc->up_interval) { + ti->error = "Interval overflow"; + goto bad; + } + + r = parse_features(&as, fc, ti); + if (r) + goto bad; + + if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) { + ti->error = "Device lookup failed"; + goto bad; + } + + ti->num_flush_requests = 1; + ti->num_discard_requests = 1; + ti->private = fc; + return 0; + +bad: + kfree(fc); + return -EINVAL; +} + +static void flakey_dtr(struct dm_target *ti) +{ + struct flakey_c *fc = ti->private; + + dm_put_device(ti, fc->dev); + kfree(fc); +} + +static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) +{ + struct flakey_c *fc = ti->private; + + return fc->start + dm_target_offset(ti, bi_sector); +} + +static void flakey_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct flakey_c *fc = ti->private; + + bio->bi_bdev = fc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); +} + +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +{ + unsigned bio_bytes = bio_cur_bytes(bio); + char *data = bio_data(bio); + + /* + * Overwrite the Nth byte of the data returned. + */ + if (data && bio_bytes >= fc->corrupt_bio_byte) { + data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; + + DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " + "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", + bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', + bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); + } +} + +static int flakey_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct flakey_c *fc = ti->private; + unsigned elapsed; + + /* Are we alive ? */ + elapsed = (jiffies - fc->start_time) / HZ; + if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { + /* + * Flag this bio as submitted while down. + */ + map_context->ll = 1; + + /* + * Map reads as normal. + */ + if (bio_data_dir(bio) == READ) + goto map_bio; + + /* + * Drop writes? + */ + if (test_bit(DROP_WRITES, &fc->flags)) { + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } + + /* + * Corrupt matching writes. + */ + if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) { + if (all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + goto map_bio; + } + + /* + * By default, error all I/O. 
+ */ + return -EIO; + } + +map_bio: + flakey_map_bio(ti, bio); + + return DM_MAPIO_REMAPPED; +} + +static int flakey_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct flakey_c *fc = ti->private; + unsigned bio_submitted_while_down = map_context->ll; + + /* + * Corrupt successful READs while in down state. + * If flags were specified, only corrupt those that match. + */ + if (fc->corrupt_bio_byte && !error && bio_submitted_while_down && + (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && + all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + + return error; +} + +static int flakey_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + unsigned sz = 0; + struct flakey_c *fc = ti->private; + unsigned drop_writes; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u %u ", fc->dev->name, + (unsigned long long)fc->start, fc->up_interval, + fc->down_interval); + + drop_writes = test_bit(DROP_WRITES, &fc->flags); + DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5); + + if (drop_writes) + DMEMIT("drop_writes "); + + if (fc->corrupt_bio_byte) + DMEMIT("corrupt_bio_byte %u %c %u %u ", + fc->corrupt_bio_byte, + (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', + fc->corrupt_bio_value, fc->corrupt_bio_flags); + + break; + } + return 0; +} + +static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) +{ + struct flakey_c *fc = ti->private; + struct dm_dev *dev = fc->dev; + int r = 0; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (fc->start || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? 
: __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); +} + +static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct flakey_c *fc = ti->private; + struct request_queue *q = bdev_get_queue(fc->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = fc->dev->bdev; + bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct flakey_c *fc = ti->private; + + return fn(ti, fc->dev, fc->start, ti->len, data); +} + +static struct target_type flakey_target = { + .name = "flakey", + .version = {1, 2, 0}, + .module = THIS_MODULE, + .ctr = flakey_ctr, + .dtr = flakey_dtr, + .map = flakey_map, + .end_io = flakey_end_io, + .status = flakey_status, + .ioctl = flakey_ioctl, + .merge = flakey_merge, + .iterate_devices = flakey_iterate_devices, +}; + +static int __init dm_flakey_init(void) +{ + int r = dm_register_target(&flakey_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_flakey_exit(void) +{ + dm_unregister_target(&flakey_target); +} + +/* Module hooks */ +module_init(dm_flakey_init); +module_exit(dm_flakey_exit); + +MODULE_DESCRIPTION(DM_NAME " flakey target"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c new file mode 100644 index 00000000..ea5dd289 --- /dev/null +++ b/drivers/md/dm-io.c @@ -0,0 +1,523 @@ +/* + * Copyright (C) 2003 Sistina Software + * Copyright (C) 2006 Red Hat GmbH + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/device-mapper.h> + +#include <linux/bio.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/dm-io.h> + +#define DM_MSG_PREFIX "io" + +#define DM_IO_MAX_REGIONS BITS_PER_LONG +#define MIN_IOS 16 +#define MIN_BIOS 16 + +struct dm_io_client { + mempool_t *pool; + struct bio_set *bios; +}; + +/* + * Aligning 'struct io' reduces the number of bits required to store + * its address. Refer to store_io_and_region_in_bio() below. + */ +struct io { + unsigned long error_bits; + atomic_t count; + struct task_struct *sleeper; + struct dm_io_client *client; + io_notify_fn callback; + void *context; + void *vma_invalidate_address; + unsigned long vma_invalidate_size; +} __attribute__((aligned(DM_IO_MAX_REGIONS))); + +static struct kmem_cache *_dm_io_cache; + +/* + * Create a client with mempool and bioset. + */ +struct dm_io_client *dm_io_client_create(void) +{ + struct dm_io_client *client; + + client = kmalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return ERR_PTR(-ENOMEM); + + client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache); + if (!client->pool) + goto bad; + + client->bios = bioset_create(MIN_BIOS, 0); + if (!client->bios) + goto bad; + + return client; + + bad: + if (client->pool) + mempool_destroy(client->pool); + kfree(client); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(dm_io_client_create); + +void dm_io_client_destroy(struct dm_io_client *client) +{ + mempool_destroy(client->pool); + bioset_free(client->bios); + kfree(client); +} +EXPORT_SYMBOL(dm_io_client_destroy); + +/*----------------------------------------------------------------- + * We need to keep track of which region a bio is doing io for. 
+ * To avoid a memory allocation to store just 5 or 6 bits, we + * ensure the 'struct io' pointer is aligned so enough low bits are + * always zero and then combine it with the region number directly in + * bi_private. + *---------------------------------------------------------------*/ +static void store_io_and_region_in_bio(struct bio *bio, struct io *io, + unsigned region) +{ + if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { + DMCRIT("Unaligned struct io pointer %p", io); + BUG(); + } + + bio->bi_private = (void *)((unsigned long)io | region); +} + +static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, + unsigned *region) +{ + unsigned long val = (unsigned long)bio->bi_private; + + *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); + *region = val & (DM_IO_MAX_REGIONS - 1); +} + +/*----------------------------------------------------------------- + * We need an io object to keep track of the number of bios that + * have been dispatched for a particular io. + *---------------------------------------------------------------*/ +static void dec_count(struct io *io, unsigned int region, int error) +{ + if (error) + set_bit(region, &io->error_bits); + + if (atomic_dec_and_test(&io->count)) { + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); + + if (io->sleeper) + wake_up_process(io->sleeper); + + else { + unsigned long r = io->error_bits; + io_notify_fn fn = io->callback; + void *context = io->context; + + mempool_free(io, io->client->pool); + fn(r, context); + } + } +} + +static void endio(struct bio *bio, int error) +{ + struct io *io; + unsigned region; + + if (error && bio_data_dir(bio) == READ) + zero_fill_bio(bio); + + /* + * The bio destructor in bio_put() may use the io object. + */ + retrieve_io_and_region_from_bio(bio, &io, ®ion); + + bio_put(bio); + + dec_count(io, region, error); +} + +/*----------------------------------------------------------------- + * These little objects provide an abstraction for getting a new + * destination page for io. + *---------------------------------------------------------------*/ +struct dpages { + void (*get_page)(struct dpages *dp, + struct page **p, unsigned long *len, unsigned *offset); + void (*next_page)(struct dpages *dp); + + unsigned context_u; + void *context_ptr; + + void *vma_invalidate_address; + unsigned long vma_invalidate_size; +}; + +/* + * Functions for getting the pages from a list. + */ +static void list_get_page(struct dpages *dp, + struct page **p, unsigned long *len, unsigned *offset) +{ + unsigned o = dp->context_u; + struct page_list *pl = (struct page_list *) dp->context_ptr; + + *p = pl->page; + *len = PAGE_SIZE - o; + *offset = o; +} + +static void list_next_page(struct dpages *dp) +{ + struct page_list *pl = (struct page_list *) dp->context_ptr; + dp->context_ptr = pl->next; + dp->context_u = 0; +} + +static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset) +{ + dp->get_page = list_get_page; + dp->next_page = list_next_page; + dp->context_u = offset; + dp->context_ptr = pl; +} + +/* + * Functions for getting the pages from a bvec. 
+ */ +static void bvec_get_page(struct dpages *dp, + struct page **p, unsigned long *len, unsigned *offset) +{ + struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; + *p = bvec->bv_page; + *len = bvec->bv_len; + *offset = bvec->bv_offset; +} + +static void bvec_next_page(struct dpages *dp) +{ + struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; + dp->context_ptr = bvec + 1; +} + +static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) +{ + dp->get_page = bvec_get_page; + dp->next_page = bvec_next_page; + dp->context_ptr = bvec; +} + +/* + * Functions for getting the pages from a VMA. + */ +static void vm_get_page(struct dpages *dp, + struct page **p, unsigned long *len, unsigned *offset) +{ + *p = vmalloc_to_page(dp->context_ptr); + *offset = dp->context_u; + *len = PAGE_SIZE - dp->context_u; +} + +static void vm_next_page(struct dpages *dp) +{ + dp->context_ptr += PAGE_SIZE - dp->context_u; + dp->context_u = 0; +} + +static void vm_dp_init(struct dpages *dp, void *data) +{ + dp->get_page = vm_get_page; + dp->next_page = vm_next_page; + dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); + dp->context_ptr = data; +} + +static void dm_bio_destructor(struct bio *bio) +{ + unsigned region; + struct io *io; + + retrieve_io_and_region_from_bio(bio, &io, ®ion); + + bio_free(bio, io->client->bios); +} + +/* + * Functions for getting the pages from kernel memory. + */ +static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len, + unsigned *offset) +{ + *p = virt_to_page(dp->context_ptr); + *offset = dp->context_u; + *len = PAGE_SIZE - dp->context_u; +} + +static void km_next_page(struct dpages *dp) +{ + dp->context_ptr += PAGE_SIZE - dp->context_u; + dp->context_u = 0; +} + +static void km_dp_init(struct dpages *dp, void *data) +{ + dp->get_page = km_get_page; + dp->next_page = km_next_page; + dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); + dp->context_ptr = data; +} + +/*----------------------------------------------------------------- + * IO routines that accept a list of pages. + *---------------------------------------------------------------*/ +static void do_region(int rw, unsigned region, struct dm_io_region *where, + struct dpages *dp, struct io *io) +{ + struct bio *bio; + struct page *page; + unsigned long len; + unsigned offset; + unsigned num_bvecs; + sector_t remaining = where->count; + struct request_queue *q = bdev_get_queue(where->bdev); + sector_t discard_sectors; + + /* + * where->count may be zero if rw holds a flush and we need to + * send a zero-sized flush. + */ + do { + /* + * Allocate a suitably sized-bio. + */ + if (rw & REQ_DISCARD) + num_bvecs = 1; + else + num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), + dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); + + bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); + bio->bi_sector = where->sector + (where->count - remaining); + bio->bi_bdev = where->bdev; + bio->bi_end_io = endio; + bio->bi_destructor = dm_bio_destructor; + store_io_and_region_in_bio(bio, io, region); + + if (rw & REQ_DISCARD) { + discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); + bio->bi_size = discard_sectors << SECTOR_SHIFT; + remaining -= discard_sectors; + } else while (remaining) { + /* + * Try and add as many pages as possible. 
+ */ + dp->get_page(dp, &page, &len, &offset); + len = min(len, to_bytes(remaining)); + if (!bio_add_page(bio, page, len, offset)) + break; + + offset = 0; + remaining -= to_sector(len); + dp->next_page(dp); + } + + atomic_inc(&io->count); + submit_bio(rw, bio); + } while (remaining); +} + +static void dispatch_io(int rw, unsigned int num_regions, + struct dm_io_region *where, struct dpages *dp, + struct io *io, int sync) +{ + int i; + struct dpages old_pages = *dp; + + BUG_ON(num_regions > DM_IO_MAX_REGIONS); + + if (sync) + rw |= REQ_SYNC; + + /* + * For multiple regions we need to be careful to rewind + * the dp object for each call to do_region. + */ + for (i = 0; i < num_regions; i++) { + *dp = old_pages; + if (where[i].count || (rw & REQ_FLUSH)) + do_region(rw, i, where + i, dp, io); + } + + /* + * Drop the extra reference that we were holding to avoid + * the io being completed too early. + */ + dec_count(io, 0, 0); +} + +static int sync_io(struct dm_io_client *client, unsigned int num_regions, + struct dm_io_region *where, int rw, struct dpages *dp, + unsigned long *error_bits) +{ + /* + * gcc <= 4.3 can't do the alignment for stack variables, so we must + * align it on our own. + * volatile prevents the optimizer from removing or reusing + * "io_" field from the stack frame (allowed in ANSI C). + */ + volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; + struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); + + if (num_regions > 1 && (rw & RW_MASK) != WRITE) { + WARN_ON(1); + return -EIO; + } + + io->error_bits = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->sleeper = current; + io->client = client; + + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + + dispatch_io(rw, num_regions, where, dp, io, 1); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!atomic_read(&io->count)) + break; + + io_schedule(); + } + set_current_state(TASK_RUNNING); + + if (error_bits) + *error_bits = io->error_bits; + + return io->error_bits ? 
-EIO : 0; +} + +static int async_io(struct dm_io_client *client, unsigned int num_regions, + struct dm_io_region *where, int rw, struct dpages *dp, + io_notify_fn fn, void *context) +{ + struct io *io; + + if (num_regions > 1 && (rw & RW_MASK) != WRITE) { + WARN_ON(1); + fn(1, context); + return -EIO; + } + + io = mempool_alloc(client->pool, GFP_NOIO); + io->error_bits = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->sleeper = NULL; + io->client = client; + io->callback = fn; + io->context = context; + + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + + dispatch_io(rw, num_regions, where, dp, io, 0); + return 0; +} + +static int dp_init(struct dm_io_request *io_req, struct dpages *dp, + unsigned long size) +{ + /* Set up dpages based on memory type */ + + dp->vma_invalidate_address = NULL; + dp->vma_invalidate_size = 0; + + switch (io_req->mem.type) { + case DM_IO_PAGE_LIST: + list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); + break; + + case DM_IO_BVEC: + bvec_dp_init(dp, io_req->mem.ptr.bvec); + break; + + case DM_IO_VMA: + flush_kernel_vmap_range(io_req->mem.ptr.vma, size); + if ((io_req->bi_rw & RW_MASK) == READ) { + dp->vma_invalidate_address = io_req->mem.ptr.vma; + dp->vma_invalidate_size = size; + } + vm_dp_init(dp, io_req->mem.ptr.vma); + break; + + case DM_IO_KMEM: + km_dp_init(dp, io_req->mem.ptr.addr); + break; + + default: + return -EINVAL; + } + + return 0; +} + +/* + * New collapsed (a)synchronous interface. + * + * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug + * the queue with blk_unplug() some time later or set REQ_SYNC in +io_req->bi_rw. If you fail to do one of these, the IO will be submitted to + * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. + */ +int dm_io(struct dm_io_request *io_req, unsigned num_regions, + struct dm_io_region *where, unsigned long *sync_error_bits) +{ + int r; + struct dpages dp; + + r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); + if (r) + return r; + + if (!io_req->notify.fn) + return sync_io(io_req->client, num_regions, where, + io_req->bi_rw, &dp, sync_error_bits); + + return async_io(io_req->client, num_regions, where, io_req->bi_rw, + &dp, io_req->notify.fn, io_req->notify.context); +} +EXPORT_SYMBOL(dm_io); + +int __init dm_io_init(void) +{ + _dm_io_cache = KMEM_CACHE(io, 0); + if (!_dm_io_cache) + return -ENOMEM; + + return 0; +} + +void dm_io_exit(void) +{ + kmem_cache_destroy(_dm_io_cache); + _dm_io_cache = NULL; +} diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c new file mode 100644 index 00000000..a1a3e6df --- /dev/null +++ b/drivers/md/dm-ioctl.c @@ -0,0 +1,1782 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * Copyright (C) 2004 - 2006 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/slab.h> +#include <linux/dm-ioctl.h> +#include <linux/hdreg.h> +#include <linux/compat.h> + +#include <asm/uaccess.h> + +#define DM_MSG_PREFIX "ioctl" +#define DM_DRIVER_EMAIL "dm-devel@redhat.com" + +/*----------------------------------------------------------------- + * The ioctl interface needs to be able to look up devices by + * name or uuid. 
+ *---------------------------------------------------------------*/ +struct hash_cell { + struct list_head name_list; + struct list_head uuid_list; + + char *name; + char *uuid; + struct mapped_device *md; + struct dm_table *new_map; +}; + +struct vers_iter { + size_t param_size; + struct dm_target_versions *vers, *old_vers; + char *end; + uint32_t flags; +}; + + +#define NUM_BUCKETS 64 +#define MASK_BUCKETS (NUM_BUCKETS - 1) +static struct list_head _name_buckets[NUM_BUCKETS]; +static struct list_head _uuid_buckets[NUM_BUCKETS]; + +static void dm_hash_remove_all(int keep_open_devices); + +/* + * Guards access to both hash tables. + */ +static DECLARE_RWSEM(_hash_lock); + +/* + * Protects use of mdptr to obtain hash cell name and uuid from mapped device. + */ +static DEFINE_MUTEX(dm_hash_cells_mutex); + +static void init_buckets(struct list_head *buckets) +{ + unsigned int i; + + for (i = 0; i < NUM_BUCKETS; i++) + INIT_LIST_HEAD(buckets + i); +} + +static int dm_hash_init(void) +{ + init_buckets(_name_buckets); + init_buckets(_uuid_buckets); + return 0; +} + +static void dm_hash_exit(void) +{ + dm_hash_remove_all(0); +} + +/*----------------------------------------------------------------- + * Hash function: + * We're not really concerned with the str hash function being + * fast since it's only used by the ioctl interface. + *---------------------------------------------------------------*/ +static unsigned int hash_str(const char *str) +{ + const unsigned int hash_mult = 2654435387U; + unsigned int h = 0; + + while (*str) + h = (h + (unsigned int) *str++) * hash_mult; + + return h & MASK_BUCKETS; +} + +/*----------------------------------------------------------------- + * Code for looking up a device by name + *---------------------------------------------------------------*/ +static struct hash_cell *__get_name_cell(const char *str) +{ + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each_entry (hc, _name_buckets + h, name_list) + if (!strcmp(hc->name, str)) { + dm_get(hc->md); + return hc; + } + + return NULL; +} + +static struct hash_cell *__get_uuid_cell(const char *str) +{ + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each_entry (hc, _uuid_buckets + h, uuid_list) + if (!strcmp(hc->uuid, str)) { + dm_get(hc->md); + return hc; + } + + return NULL; +} + +static struct hash_cell *__get_dev_cell(uint64_t dev) +{ + struct mapped_device *md; + struct hash_cell *hc; + + md = dm_get_md(huge_decode_dev(dev)); + if (!md) + return NULL; + + hc = dm_get_mdptr(md); + if (!hc) { + dm_put(md); + return NULL; + } + + return hc; +} + +/*----------------------------------------------------------------- + * Inserting, removing and renaming a device. 
+ *---------------------------------------------------------------*/ +static struct hash_cell *alloc_cell(const char *name, const char *uuid, + struct mapped_device *md) +{ + struct hash_cell *hc; + + hc = kmalloc(sizeof(*hc), GFP_KERNEL); + if (!hc) + return NULL; + + hc->name = kstrdup(name, GFP_KERNEL); + if (!hc->name) { + kfree(hc); + return NULL; + } + + if (!uuid) + hc->uuid = NULL; + + else { + hc->uuid = kstrdup(uuid, GFP_KERNEL); + if (!hc->uuid) { + kfree(hc->name); + kfree(hc); + return NULL; + } + } + + INIT_LIST_HEAD(&hc->name_list); + INIT_LIST_HEAD(&hc->uuid_list); + hc->md = md; + hc->new_map = NULL; + return hc; +} + +static void free_cell(struct hash_cell *hc) +{ + if (hc) { + kfree(hc->name); + kfree(hc->uuid); + kfree(hc); + } +} + +/* + * The kdev_t and uuid of a device can never change once it is + * initially inserted. + */ +static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) +{ + struct hash_cell *cell, *hc; + + /* + * Allocate the new cells. + */ + cell = alloc_cell(name, uuid, md); + if (!cell) + return -ENOMEM; + + /* + * Insert the cell into both hash tables. + */ + down_write(&_hash_lock); + hc = __get_name_cell(name); + if (hc) { + dm_put(hc->md); + goto bad; + } + + list_add(&cell->name_list, _name_buckets + hash_str(name)); + + if (uuid) { + hc = __get_uuid_cell(uuid); + if (hc) { + list_del(&cell->name_list); + dm_put(hc->md); + goto bad; + } + list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); + } + dm_get(md); + mutex_lock(&dm_hash_cells_mutex); + dm_set_mdptr(md, cell); + mutex_unlock(&dm_hash_cells_mutex); + up_write(&_hash_lock); + + return 0; + + bad: + up_write(&_hash_lock); + free_cell(cell); + return -EBUSY; +} + +static void __hash_remove(struct hash_cell *hc) +{ + struct dm_table *table; + + /* remove from the dev hash */ + list_del(&hc->uuid_list); + list_del(&hc->name_list); + mutex_lock(&dm_hash_cells_mutex); + dm_set_mdptr(hc->md, NULL); + mutex_unlock(&dm_hash_cells_mutex); + + table = dm_get_live_table(hc->md); + if (table) { + dm_table_event(table); + dm_table_put(table); + } + + if (hc->new_map) + dm_table_destroy(hc->new_map); + dm_put(hc->md); + free_cell(hc); +} + +static void dm_hash_remove_all(int keep_open_devices) +{ + int i, dev_skipped; + struct hash_cell *hc; + struct mapped_device *md; + +retry: + dev_skipped = 0; + + down_write(&_hash_lock); + + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry(hc, _name_buckets + i, name_list) { + md = hc->md; + dm_get(md); + + if (keep_open_devices && dm_lock_for_deletion(md)) { + dm_put(md); + dev_skipped++; + continue; + } + + __hash_remove(hc); + + up_write(&_hash_lock); + + dm_put(md); + if (likely(keep_open_devices)) + dm_destroy(md); + else + dm_destroy_immediate(md); + + /* + * Some mapped devices may be using other mapped + * devices, so repeat until we make no further + * progress. If a new mapped device is created + * here it will also get removed. + */ + goto retry; + } + } + + up_write(&_hash_lock); + + if (dev_skipped) + DMWARN("remove_all left %d open device(s)", dev_skipped); +} + +/* + * Set the uuid of a hash_cell that isn't already set. + */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. 
+ */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + + /* + * Rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + return old_name; +} + +static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, + const char *new) +{ + char *new_data, *old_name = NULL; + struct hash_cell *hc; + struct dm_table *table; + struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; + + /* + * duplicate new. + */ + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) + return ERR_PTR(-ENOMEM); + + down_write(&_hash_lock); + + /* + * Is new free ? + */ + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + + if (hc) { + DMWARN("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", + param->name, new); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EBUSY); + } + + /* + * Is there such a device as 'old' ? + */ + hc = __get_name_cell(param->name); + if (!hc) { + DMWARN("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-ENXIO); + } + + /* + * Does this device already have a uuid? + */ + if (change_uuid && hc->uuid) { + DMWARN("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); + + /* + * Wake up any dm event waiters. + */ + table = dm_get_live_table(hc->md); + if (table) { + dm_table_event(table); + dm_table_put(table); + } + + if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + + md = hc->md; + up_write(&_hash_lock); + kfree(old_name); + + return md; +} + +/*----------------------------------------------------------------- + * Implementation of the ioctl commands + *---------------------------------------------------------------*/ +/* + * All the ioctl commands get dispatched to functions with this + * prototype. + */ +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); + +static int remove_all(struct dm_ioctl *param, size_t param_size) +{ + dm_hash_remove_all(1); + param->data_size = 0; + return 0; +} + +/* + * Round up the ptr to an 8-byte boundary. + */ +#define ALIGN_MASK 7 +static inline void *align_ptr(void *ptr) +{ + return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); +} + +/* + * Retrieves the data payload buffer from an already allocated + * struct dm_ioctl. 
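+ *
+ * The payload begins at the first 8-byte-aligned offset after the
+ * fixed-size struct dm_ioctl header; data_start records that offset
+ * and *len is whatever remains of the caller-supplied buffer.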
+ */ +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, + size_t *len) +{ + param->data_start = align_ptr(param + 1) - (void *) param; + + if (param->data_start < param_size) + *len = param_size - param->data_start; + else + *len = 0; + + return ((void *) param) + param->data_start; +} + +static int list_devices(struct dm_ioctl *param, size_t param_size) +{ + unsigned int i; + struct hash_cell *hc; + size_t len, needed = 0; + struct gendisk *disk; + struct dm_name_list *nl, *old_nl = NULL; + + down_write(&_hash_lock); + + /* + * Loop through all the devices working out how much + * space we need. + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { + needed += sizeof(struct dm_name_list); + needed += strlen(hc->name) + 1; + needed += ALIGN_MASK; + } + } + + /* + * Grab our output buffer. + */ + nl = get_result_buffer(param, param_size, &len); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + goto out; + } + param->data_size = param->data_start + needed; + + nl->dev = 0; /* Flags no data */ + + /* + * Now loop through filling out the names. + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { + if (old_nl) + old_nl->next = (uint32_t) ((void *) nl - + (void *) old_nl); + disk = dm_disk(hc->md); + nl->dev = huge_encode_dev(disk_devt(disk)); + nl->next = 0; + strcpy(nl->name, hc->name); + + old_nl = nl; + nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1); + } + } + + out: + up_write(&_hash_lock); + return 0; +} + +static void list_version_get_needed(struct target_type *tt, void *needed_param) +{ + size_t *needed = needed_param; + + *needed += sizeof(struct dm_target_versions); + *needed += strlen(tt->name); + *needed += ALIGN_MASK; +} + +static void list_version_get_info(struct target_type *tt, void *param) +{ + struct vers_iter *info = param; + + /* Check space - it might have changed since the first iteration */ + if ((char *)info->vers + sizeof(tt->version) + strlen(tt->name) + 1 > + info->end) { + + info->flags = DM_BUFFER_FULL_FLAG; + return; + } + + if (info->old_vers) + info->old_vers->next = (uint32_t) ((void *)info->vers - + (void *)info->old_vers); + info->vers->version[0] = tt->version[0]; + info->vers->version[1] = tt->version[1]; + info->vers->version[2] = tt->version[2]; + info->vers->next = 0; + strcpy(info->vers->name, tt->name); + + info->old_vers = info->vers; + info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); +} + +static int list_versions(struct dm_ioctl *param, size_t param_size) +{ + size_t len, needed = 0; + struct dm_target_versions *vers; + struct vers_iter iter_info; + + /* + * Loop through all the devices working out how much + * space we need. + */ + dm_target_iterate(list_version_get_needed, &needed); + + /* + * Grab our output buffer. + */ + vers = get_result_buffer(param, param_size, &len); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + goto out; + } + param->data_size = param->data_start + needed; + + iter_info.param_size = param_size; + iter_info.old_vers = NULL; + iter_info.vers = vers; + iter_info.flags = 0; + iter_info.end = (char *)vers+len; + + /* + * Now loop through filling out the names & versions. 
+ */ + dm_target_iterate(list_version_get_info, &iter_info); + param->flags |= iter_info.flags; + + out: + return 0; +} + +static int check_name(const char *name) +{ + if (strchr(name, '/')) { + DMWARN("invalid device name"); + return -EINVAL; + } + + return 0; +} + +/* + * On successful return, the caller must not attempt to acquire + * _hash_lock without first calling dm_table_put, because dm_table_destroy + * waits for this dm_table_put and could be called under this lock. + */ +static struct dm_table *dm_get_inactive_table(struct mapped_device *md) +{ + struct hash_cell *hc; + struct dm_table *table = NULL; + + down_read(&_hash_lock); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + DMWARN("device has been removed from the dev hash table."); + goto out; + } + + table = hc->new_map; + if (table) + dm_table_get(table); + +out: + up_read(&_hash_lock); + + return table; +} + +static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, + struct dm_ioctl *param) +{ + return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? + dm_get_inactive_table(md) : dm_get_live_table(md); +} + +/* + * Fills in a dm_ioctl structure, ready for sending back to + * userland. + */ +static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) +{ + struct gendisk *disk = dm_disk(md); + struct dm_table *table; + + param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | + DM_ACTIVE_PRESENT_FLAG); + + if (dm_suspended_md(md)) + param->flags |= DM_SUSPEND_FLAG; + + param->dev = huge_encode_dev(disk_devt(disk)); + + /* + * Yes, this will be out of date by the time it gets back + * to userland, but it is still very useful for + * debugging. + */ + param->open_count = dm_open_count(md); + + param->event_nr = dm_get_event_nr(md); + param->target_count = 0; + + table = dm_get_live_table(md); + if (table) { + if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { + if (get_disk_ro(disk)) + param->flags |= DM_READONLY_FLAG; + param->target_count = dm_table_get_num_targets(table); + } + dm_table_put(table); + + param->flags |= DM_ACTIVE_PRESENT_FLAG; + } + + if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { + table = dm_get_inactive_table(md); + if (table) { + if (!(dm_table_get_mode(table) & FMODE_WRITE)) + param->flags |= DM_READONLY_FLAG; + param->target_count = dm_table_get_num_targets(table); + dm_table_put(table); + } + } +} + +static int dev_create(struct dm_ioctl *param, size_t param_size) +{ + int r, m = DM_ANY_MINOR; + struct mapped_device *md; + + r = check_name(param->name); + if (r) + return r; + + if (param->flags & DM_PERSISTENT_DEV_FLAG) + m = MINOR(huge_decode_dev(param->dev)); + + r = dm_create(m, &md); + if (r) + return r; + + r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); + if (r) { + dm_put(md); + dm_destroy(md); + return r; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + __dev_status(md, param); + + dm_put(md); + + return 0; +} + +/* + * Always use UUID for lookups if it's present, otherwise use name or dev. 
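+ *
+ * Exactly one of uuid, name and dev may be supplied; the lookup fails
+ * if more than one is set.  On success the cell's canonical name and
+ * uuid are copied back into the ioctl parameters and
+ * DM_INACTIVE_PRESENT_FLAG is updated to reflect any staged table.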
+ */ +static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) +{ + struct hash_cell *hc = NULL; + + if (*param->uuid) { + if (*param->name || param->dev) + return NULL; + + hc = __get_uuid_cell(param->uuid); + if (!hc) + return NULL; + } else if (*param->name) { + if (param->dev) + return NULL; + + hc = __get_name_cell(param->name); + if (!hc) + return NULL; + } else if (param->dev) { + hc = __get_dev_cell(param->dev); + if (!hc) + return NULL; + } else + return NULL; + + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strlcpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); + else + param->uuid[0] = '\0'; + + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + return hc; +} + +static struct mapped_device *find_device(struct dm_ioctl *param) +{ + struct hash_cell *hc; + struct mapped_device *md = NULL; + + down_read(&_hash_lock); + hc = __find_device_hash_cell(param); + if (hc) + md = hc->md; + up_read(&_hash_lock); + + return md; +} + +static int dev_remove(struct dm_ioctl *param, size_t param_size) +{ + struct hash_cell *hc; + struct mapped_device *md; + int r; + + down_write(&_hash_lock); + hc = __find_device_hash_cell(param); + + if (!hc) { + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + md = hc->md; + + /* + * Ensure the device is not open and nothing further can open it. + */ + r = dm_lock_for_deletion(md); + if (r) { + DMDEBUG_LIMIT("unable to remove open device %s", hc->name); + up_write(&_hash_lock); + dm_put(md); + return r; + } + + __hash_remove(hc); + up_write(&_hash_lock); + + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + + dm_put(md); + dm_destroy(md); + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. + */ +static int invalid_str(char *str, void *end) +{ + while ((void *) str < end) + if (!*str++) + return 0; + + return -EINVAL; +} + +static int dev_rename(struct dm_ioctl *param, size_t param_size) +{ + int r; + char *new_data = (char *) param + param->data_start; + struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; + + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || + strlen(new_data) > (change_uuid ? 
DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMWARN("Invalid new mapped device name or uuid string supplied."); + return -EINVAL; + } + + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } + + md = dm_hash_rename(param, new_data); + if (IS_ERR(md)) + return PTR_ERR(md); + + __dev_status(md, param); + dm_put(md); + + return 0; +} + +static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) +{ + int r = -EINVAL, x; + struct mapped_device *md; + struct hd_geometry geometry; + unsigned long indata[4]; + char *geostr = (char *) param + param->data_start; + char dummy; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (geostr < param->data || + invalid_str(geostr, (void *) param + param_size)) { + DMWARN("Invalid geometry supplied."); + goto out; + } + + x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, + indata + 1, indata + 2, indata + 3, &dummy); + + if (x != 4) { + DMWARN("Unable to interpret geometry settings."); + goto out; + } + + if (indata[0] > 65535 || indata[1] > 255 || + indata[2] > 255 || indata[3] > ULONG_MAX) { + DMWARN("Geometry exceeds range limits."); + goto out; + } + + geometry.cylinders = indata[0]; + geometry.heads = indata[1]; + geometry.sectors = indata[2]; + geometry.start = indata[3]; + + r = dm_set_geometry(md, &geometry); + + param->data_size = 0; + +out: + dm_put(md); + return r; +} + +static int do_suspend(struct dm_ioctl *param) +{ + int r = 0; + unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (param->flags & DM_SKIP_LOCKFS_FLAG) + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; + + if (!dm_suspended_md(md)) { + r = dm_suspend(md, suspend_flags); + if (r) + goto out; + } + + __dev_status(md, param); + +out: + dm_put(md); + + return r; +} + +static int do_resume(struct dm_ioctl *param) +{ + int r = 0; + unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; + struct hash_cell *hc; + struct mapped_device *md; + struct dm_table *new_map, *old_map = NULL; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + md = hc->md; + + new_map = hc->new_map; + hc->new_map = NULL; + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + up_write(&_hash_lock); + + /* Do we need to load a new map ? */ + if (new_map) { + /* Suspend if it isn't already suspended */ + if (param->flags & DM_SKIP_LOCKFS_FLAG) + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; + if (!dm_suspended_md(md)) + dm_suspend(md, suspend_flags); + + old_map = dm_swap_table(md, new_map); + if (IS_ERR(old_map)) { + dm_table_destroy(new_map); + dm_put(md); + return PTR_ERR(old_map); + } + + if (dm_table_get_mode(new_map) & FMODE_WRITE) + set_disk_ro(dm_disk(md), 0); + else + set_disk_ro(dm_disk(md), 1); + } + + if (dm_suspended_md(md)) { + r = dm_resume(md); + if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + } + + if (old_map) + dm_table_destroy(old_map); + + if (!r) + __dev_status(md, param); + + dm_put(md); + return r; +} + +/* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. 
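+ *
+ * DM_SUSPEND_FLAG selects do_suspend(); otherwise do_resume() is
+ * called, which also swaps in any staged inactive table before
+ * resuming the device.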
+ */ +static int dev_suspend(struct dm_ioctl *param, size_t param_size) +{ + if (param->flags & DM_SUSPEND_FLAG) + return do_suspend(param); + + return do_resume(param); +} + +/* + * Copies device info back to user space, used by + * the create and info ioctls. + */ +static int dev_status(struct dm_ioctl *param, size_t param_size) +{ + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + __dev_status(md, param); + dm_put(md); + + return 0; +} + +/* + * Build up the status struct for each target + */ +static void retrieve_status(struct dm_table *table, + struct dm_ioctl *param, size_t param_size) +{ + unsigned int i, num_targets; + struct dm_target_spec *spec; + char *outbuf, *outptr; + status_type_t type; + size_t remaining, len, used = 0; + + outptr = outbuf = get_result_buffer(param, param_size, &len); + + if (param->flags & DM_STATUS_TABLE_FLAG) + type = STATUSTYPE_TABLE; + else + type = STATUSTYPE_INFO; + + /* Get all the target info */ + num_targets = dm_table_get_num_targets(table); + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + + remaining = len - (outptr - outbuf); + if (remaining <= sizeof(struct dm_target_spec)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + spec = (struct dm_target_spec *) outptr; + + spec->status = 0; + spec->sector_start = ti->begin; + spec->length = ti->len; + strncpy(spec->target_type, ti->type->name, + sizeof(spec->target_type)); + + outptr += sizeof(struct dm_target_spec); + remaining = len - (outptr - outbuf); + if (remaining <= 0) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + /* Get the status/table string from the target driver */ + if (ti->type->status) { + if (ti->type->status(ti, type, outptr, remaining)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + } else + outptr[0] = '\0'; + + outptr += strlen(outptr) + 1; + used = param->data_start + (outptr - outbuf); + + outptr = align_ptr(outptr); + spec->next = outptr - outbuf; + } + + if (used) + param->data_size = used; + + param->target_count = num_targets; +} + +/* + * Wait for a device to report an event + */ +static int dev_wait(struct dm_ioctl *param, size_t param_size) +{ + int r = 0; + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + /* + * Wait for a notification event + */ + if (dm_wait_event(md, param->event_nr)) { + r = -ERESTARTSYS; + goto out; + } + + /* + * The userland program is going to want to know what + * changed to trigger the event, so we may as well tell + * him and save an ioctl. 
+ */ + __dev_status(md, param); + + table = dm_get_live_or_inactive_table(md, param); + if (table) { + retrieve_status(table, param, param_size); + dm_table_put(table); + } + +out: + dm_put(md); + + return r; +} + +static inline fmode_t get_mode(struct dm_ioctl *param) +{ + fmode_t mode = FMODE_READ | FMODE_WRITE; + + if (param->flags & DM_READONLY_FLAG) + mode = FMODE_READ; + + return mode; +} + +static int next_target(struct dm_target_spec *last, uint32_t next, void *end, + struct dm_target_spec **spec, char **target_params) +{ + *spec = (struct dm_target_spec *) ((unsigned char *) last + next); + *target_params = (char *) (*spec + 1); + + if (*spec < (last + 1)) + return -EINVAL; + + return invalid_str(*target_params, end); +} + +static int populate_table(struct dm_table *table, + struct dm_ioctl *param, size_t param_size) +{ + int r; + unsigned int i = 0; + struct dm_target_spec *spec = (struct dm_target_spec *) param; + uint32_t next = param->data_start; + void *end = (void *) param + param_size; + char *target_params; + + if (!param->target_count) { + DMWARN("populate_table: no targets specified"); + return -EINVAL; + } + + for (i = 0; i < param->target_count; i++) { + + r = next_target(spec, next, end, &spec, &target_params); + if (r) { + DMWARN("unable to find target"); + return r; + } + + r = dm_table_add_target(table, spec->target_type, + (sector_t) spec->sector_start, + (sector_t) spec->length, + target_params); + if (r) { + DMWARN("error adding target to table"); + return r; + } + + next = spec->next; + } + + return dm_table_complete(table); +} + +static int table_load(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct hash_cell *hc; + struct dm_table *t; + struct mapped_device *md; + struct target_type *immutable_target_type; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = dm_table_create(&t, get_mode(param), param->target_count, md); + if (r) + goto out; + + r = populate_table(t, param, param_size); + if (r) { + dm_table_destroy(t); + goto out; + } + + immutable_target_type = dm_get_immutable_target_type(md); + if (immutable_target_type && + (immutable_target_type != dm_table_get_immutable_target_type(t))) { + DMWARN("can't replace immutable target type %s", + immutable_target_type->name); + dm_table_destroy(t); + r = -EINVAL; + goto out; + } + + /* Protect md->type and md->queue against concurrent table loads. */ + dm_lock_md_type(md); + if (dm_get_md_type(md) == DM_TYPE_NONE) + /* Initial table load: acquire type of table. 
*/ + dm_set_md_type(md, dm_table_get_type(t)); + else if (dm_get_md_type(md) != dm_table_get_type(t)) { + DMWARN("can't change device type after initial table load."); + dm_table_destroy(t); + dm_unlock_md_type(md); + r = -EINVAL; + goto out; + } + + /* setup md->queue to reflect md's type (may block) */ + r = dm_setup_md_queue(md); + if (r) { + DMWARN("unable to set up device queue for new table."); + dm_table_destroy(t); + dm_unlock_md_type(md); + goto out; + } + dm_unlock_md_type(md); + + /* stage inactive table */ + down_write(&_hash_lock); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + DMWARN("device has been removed from the dev hash table."); + dm_table_destroy(t); + up_write(&_hash_lock); + r = -ENXIO; + goto out; + } + + if (hc->new_map) + dm_table_destroy(hc->new_map); + hc->new_map = t; + up_write(&_hash_lock); + + param->flags |= DM_INACTIVE_PRESENT_FLAG; + __dev_status(md, param); + +out: + dm_put(md); + + return r; +} + +static int table_clear(struct dm_ioctl *param, size_t param_size) +{ + struct hash_cell *hc; + struct mapped_device *md; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + if (hc->new_map) { + dm_table_destroy(hc->new_map); + hc->new_map = NULL; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + __dev_status(hc->md, param); + md = hc->md; + up_write(&_hash_lock); + dm_put(md); + + return 0; +} + +/* + * Retrieves a list of devices used by a particular dm device. + */ +static void retrieve_deps(struct dm_table *table, + struct dm_ioctl *param, size_t param_size) +{ + unsigned int count = 0; + struct list_head *tmp; + size_t len, needed; + struct dm_dev_internal *dd; + struct dm_target_deps *deps; + + deps = get_result_buffer(param, param_size, &len); + + /* + * Count the devices. + */ + list_for_each (tmp, dm_table_get_devices(table)) + count++; + + /* + * Check we have enough space. + */ + needed = sizeof(*deps) + (sizeof(*deps->dev) * count); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + return; + } + + /* + * Fill in the devices. + */ + deps->count = count; + count = 0; + list_for_each_entry (dd, dm_table_get_devices(table), list) + deps->dev[count++] = huge_encode_dev(dd->dm_dev.bdev->bd_dev); + + param->data_size = param->data_start + needed; +} + +static int table_deps(struct dm_ioctl *param, size_t param_size) +{ + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + __dev_status(md, param); + + table = dm_get_live_or_inactive_table(md, param); + if (table) { + retrieve_deps(table, param, param_size); + dm_table_put(table); + } + + dm_put(md); + + return 0; +} + +/* + * Return the status of a device as a text string for each + * target. + */ +static int table_status(struct dm_ioctl *param, size_t param_size) +{ + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + __dev_status(md, param); + + table = dm_get_live_or_inactive_table(md, param); + if (table) { + retrieve_status(table, param, param_size); + dm_table_put(table); + } + + dm_put(md); + + return 0; +} + +/* + * Pass a message to the target that's at the supplied device offset. 
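+ *
+ * The payload is a struct dm_target_msg at param->data_start:
+ * tmsg->sector selects the target via dm_table_find_target() and
+ * tmsg->message is split into an argc/argv vector with
+ * dm_split_args() before being passed to the target's message method.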
+ */ +static int target_message(struct dm_ioctl *param, size_t param_size) +{ + int r, argc; + char **argv; + struct mapped_device *md; + struct dm_table *table; + struct dm_target *ti; + struct dm_target_msg *tmsg = (void *) param + param->data_start; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (tmsg < (struct dm_target_msg *) param->data || + invalid_str(tmsg->message, (void *) param + param_size)) { + DMWARN("Invalid target message parameters."); + r = -EINVAL; + goto out; + } + + r = dm_split_args(&argc, &argv, tmsg->message); + if (r) { + DMWARN("Failed to split target message parameters"); + goto out; + } + + if (!argc) { + DMWARN("Empty message received."); + goto out_argv; + } + + table = dm_get_live_table(md); + if (!table) + goto out_argv; + + if (dm_deleting_md(md)) { + r = -ENXIO; + goto out_table; + } + + ti = dm_table_find_target(table, tmsg->sector); + if (!dm_target_is_valid(ti)) { + DMWARN("Target message sector outside device."); + r = -EINVAL; + } else if (ti->type->message) + r = ti->type->message(ti, argc, argv); + else { + DMWARN("Target type does not support messages"); + r = -EINVAL; + } + + out_table: + dm_table_put(table); + out_argv: + kfree(argv); + out: + param->data_size = 0; + dm_put(md); + return r; +} + +/*----------------------------------------------------------------- + * Implementation of open/close/ioctl on the special char + * device. + *---------------------------------------------------------------*/ +static ioctl_fn lookup_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ + {DM_REMOVE_ALL_CMD, remove_all}, + {DM_LIST_DEVICES_CMD, list_devices}, + + {DM_DEV_CREATE_CMD, dev_create}, + {DM_DEV_REMOVE_CMD, dev_remove}, + {DM_DEV_RENAME_CMD, dev_rename}, + {DM_DEV_SUSPEND_CMD, dev_suspend}, + {DM_DEV_STATUS_CMD, dev_status}, + {DM_DEV_WAIT_CMD, dev_wait}, + + {DM_TABLE_LOAD_CMD, table_load}, + {DM_TABLE_CLEAR_CMD, table_clear}, + {DM_TABLE_DEPS_CMD, table_deps}, + {DM_TABLE_STATUS_CMD, table_status}, + + {DM_LIST_VERSIONS_CMD, list_versions}, + + {DM_TARGET_MSG_CMD, target_message}, + {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry} + }; + + return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; +} + +/* + * As well as checking the version compatibility this always + * copies the kernel interface version out. + */ +static int check_version(unsigned int cmd, struct dm_ioctl __user *user) +{ + uint32_t version[3]; + int r = 0; + + if (copy_from_user(version, user->version, sizeof(version))) + return -EFAULT; + + if ((DM_VERSION_MAJOR != version[0]) || + (DM_VERSION_MINOR < version[1])) { + DMWARN("ioctl interface mismatch: " + "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", + DM_VERSION_MAJOR, DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL, + version[0], version[1], version[2], cmd); + r = -EINVAL; + } + + /* + * Fill in the kernel version. 
+ */ + version[0] = DM_VERSION_MAJOR; + version[1] = DM_VERSION_MINOR; + version[2] = DM_VERSION_PATCHLEVEL; + if (copy_to_user(user->version, version, sizeof(version))) + return -EFAULT; + + return r; +} + +static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) +{ + struct dm_ioctl tmp, *dmi; + int secure_data; + + if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) + return -EFAULT; + + if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) + return -EINVAL; + + secure_data = tmp.flags & DM_SECURE_DATA_FLAG; + + dmi = vmalloc(tmp.data_size); + if (!dmi) { + if (secure_data && clear_user(user, tmp.data_size)) + return -EFAULT; + return -ENOMEM; + } + + if (copy_from_user(dmi, user, tmp.data_size)) + goto bad; + + /* Wipe the user buffer so we do not return it to userspace */ + if (secure_data && clear_user(user, tmp.data_size)) + goto bad; + + *param = dmi; + return 0; + +bad: + if (secure_data) + memset(dmi, 0, tmp.data_size); + vfree(dmi); + return -EFAULT; +} + +static int validate_params(uint cmd, struct dm_ioctl *param) +{ + /* Always clear this flag */ + param->flags &= ~DM_BUFFER_FULL_FLAG; + param->flags &= ~DM_UEVENT_GENERATED_FLAG; + param->flags &= ~DM_SECURE_DATA_FLAG; + + /* Ignores parameters */ + if (cmd == DM_REMOVE_ALL_CMD || + cmd == DM_LIST_DEVICES_CMD || + cmd == DM_LIST_VERSIONS_CMD) + return 0; + + if ((cmd == DM_DEV_CREATE_CMD)) { + if (!*param->name) { + DMWARN("name not supplied when creating device"); + return -EINVAL; + } + } else if ((*param->uuid && *param->name)) { + DMWARN("only supply one of name or uuid, cmd(%u)", cmd); + return -EINVAL; + } + + /* Ensure strings are terminated */ + param->name[DM_NAME_LEN - 1] = '\0'; + param->uuid[DM_UUID_LEN - 1] = '\0'; + + return 0; +} + +static int ctl_ioctl(uint command, struct dm_ioctl __user *user) +{ + int r = 0; + int wipe_buffer; + unsigned int cmd; + struct dm_ioctl *uninitialized_var(param); + ioctl_fn fn = NULL; + size_t input_param_size; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(command) != DM_IOCTL) + return -ENOTTY; + + cmd = _IOC_NR(command); + + /* + * Check the interface version passed in. This also + * writes out the kernel's interface version. + */ + r = check_version(cmd, user); + if (r) + return r; + + /* + * Nothing more to do for the version command. + */ + if (cmd == DM_VERSION_CMD) + return 0; + + fn = lookup_ioctl(cmd); + if (!fn) { + DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); + return -ENOTTY; + } + + /* + * Trying to avoid low memory issues when a device is + * suspended. + */ + current->flags |= PF_MEMALLOC; + + /* + * Copy the parameters into kernel space. + */ + r = copy_params(user, ¶m); + + current->flags &= ~PF_MEMALLOC; + + if (r) + return r; + + input_param_size = param->data_size; + wipe_buffer = param->flags & DM_SECURE_DATA_FLAG; + + r = validate_params(cmd, param); + if (r) + goto out; + + param->data_size = sizeof(*param); + r = fn(param, input_param_size); + + /* + * Copy the results back to userland. 
+ */ + if (!r && copy_to_user(user, param, param->data_size)) + r = -EFAULT; + +out: + if (wipe_buffer) + memset(param, 0, input_param_size); + + vfree(param); + return r; +} + +static long dm_ctl_ioctl(struct file *file, uint command, ulong u) +{ + return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u); +} + +#ifdef CONFIG_COMPAT +static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) +{ + return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define dm_compat_ctl_ioctl NULL +#endif + +static const struct file_operations _ctl_fops = { + .open = nonseekable_open, + .unlocked_ioctl = dm_ctl_ioctl, + .compat_ioctl = dm_compat_ctl_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice _dm_misc = { + .minor = MAPPER_CTRL_MINOR, + .name = DM_NAME, + .nodename = DM_DIR "/" DM_CONTROL_NODE, + .fops = &_ctl_fops +}; + +MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR); +MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE); + +/* + * Create misc character device and link to DM_DIR/control. + */ +int __init dm_interface_init(void) +{ + int r; + + r = dm_hash_init(); + if (r) + return r; + + r = misc_register(&_dm_misc); + if (r) { + DMERR("misc_register failed for control device"); + dm_hash_exit(); + return r; + } + + DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, + DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, + DM_DRIVER_EMAIL); + return 0; +} + +void dm_interface_exit(void) +{ + if (misc_deregister(&_dm_misc) < 0) + DMERR("misc_deregister failed for control device"); + + dm_hash_exit(); +} + +/** + * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers + * @md: Pointer to mapped_device + * @name: Buffer (size DM_NAME_LEN) for name + * @uuid: Buffer (size DM_UUID_LEN) for uuid or empty string if uuid not defined + */ +int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) +{ + int r = 0; + struct hash_cell *hc; + + if (!md) + return -ENXIO; + + mutex_lock(&dm_hash_cells_mutex); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + r = -ENXIO; + goto out; + } + + if (name) + strcpy(name, hc->name); + if (uuid) + strcpy(uuid, hc->uuid ? : ""); + +out: + mutex_unlock(&dm_hash_cells_mutex); + + return r; +} diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c new file mode 100644 index 00000000..bed444c9 --- /dev/null +++ b/drivers/md/dm-kcopyd.c @@ -0,0 +1,756 @@ +/* + * Copyright (C) 2002 Sistina Software (UK) Limited. + * Copyright (C) 2006 Red Hat GmbH + * + * This file is released under the GPL. + * + * Kcopyd provides a simple interface for copying an area of one + * block-device to one or more other block-devices, with an asynchronous + * completion notification. + */ + +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/blkdev.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> +#include <linux/mutex.h> +#include <linux/device-mapper.h> +#include <linux/dm-kcopyd.h> + +#include "dm.h" + +#define SUB_JOB_SIZE 128 +#define SPLIT_COUNT 8 +#define MIN_JOBS 8 +#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE)) + +/*----------------------------------------------------------------- + * Each kcopyd client has its own little pool of preallocated + * pages for kcopyd io. 
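+ *
+ * A client reserves RESERVE_PAGES up front (enough for one
+ * SUB_JOB_SIZE transfer).  kcopyd_get_pages() first tries a fresh
+ * __GFP_NOWARN | __GFP_NORETRY allocation and only dips into this
+ * reserve when that fails.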
+ *---------------------------------------------------------------*/ +struct dm_kcopyd_client { + struct page_list *pages; + unsigned nr_reserved_pages; + unsigned nr_free_pages; + + struct dm_io_client *io_client; + + wait_queue_head_t destroyq; + atomic_t nr_jobs; + + mempool_t *job_pool; + + struct workqueue_struct *kcopyd_wq; + struct work_struct kcopyd_work; + +/* + * We maintain three lists of jobs: + * + * i) jobs waiting for pages + * ii) jobs that have pages, and are waiting for the io to be issued. + * iii) jobs that have completed. + * + * All three of these are protected by job_lock. + */ + spinlock_t job_lock; + struct list_head complete_jobs; + struct list_head io_jobs; + struct list_head pages_jobs; +}; + +static struct page_list zero_page_list; + +static void wake(struct dm_kcopyd_client *kc) +{ + queue_work(kc->kcopyd_wq, &kc->kcopyd_work); +} + +/* + * Obtain one page for the use of kcopyd. + */ +static struct page_list *alloc_pl(gfp_t gfp) +{ + struct page_list *pl; + + pl = kmalloc(sizeof(*pl), gfp); + if (!pl) + return NULL; + + pl->page = alloc_page(gfp); + if (!pl->page) { + kfree(pl); + return NULL; + } + + return pl; +} + +static void free_pl(struct page_list *pl) +{ + __free_page(pl->page); + kfree(pl); +} + +/* + * Add the provided pages to a client's free page list, releasing + * back to the system any beyond the reserved_pages limit. + */ +static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) +{ + struct page_list *next; + + do { + next = pl->next; + + if (kc->nr_free_pages >= kc->nr_reserved_pages) + free_pl(pl); + else { + pl->next = kc->pages; + kc->pages = pl; + kc->nr_free_pages++; + } + + pl = next; + } while (pl); +} + +static int kcopyd_get_pages(struct dm_kcopyd_client *kc, + unsigned int nr, struct page_list **pages) +{ + struct page_list *pl; + + *pages = NULL; + + do { + pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); + if (unlikely(!pl)) { + /* Use reserved pages */ + pl = kc->pages; + if (unlikely(!pl)) + goto out_of_memory; + kc->pages = pl->next; + kc->nr_free_pages--; + } + pl->next = *pages; + *pages = pl; + } while (--nr); + + return 0; + +out_of_memory: + if (*pages) + kcopyd_put_pages(kc, *pages); + return -ENOMEM; +} + +/* + * These three functions resize the page pool. + */ +static void drop_pages(struct page_list *pl) +{ + struct page_list *next; + + while (pl) { + next = pl->next; + free_pl(pl); + pl = next; + } +} + +/* + * Allocate and reserve nr_pages for the use of a specific client. + */ +static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages) +{ + unsigned i; + struct page_list *pl = NULL, *next; + + for (i = 0; i < nr_pages; i++) { + next = alloc_pl(GFP_KERNEL); + if (!next) { + if (pl) + drop_pages(pl); + return -ENOMEM; + } + next->next = pl; + pl = next; + } + + kc->nr_reserved_pages += nr_pages; + kcopyd_put_pages(kc, pl); + + return 0; +} + +static void client_free_pages(struct dm_kcopyd_client *kc) +{ + BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); + drop_pages(kc->pages); + kc->pages = NULL; + kc->nr_free_pages = kc->nr_reserved_pages = 0; +} + +/*----------------------------------------------------------------- + * kcopyd_jobs need to be allocated by the *clients* of kcopyd, + * for this reason we use a mempool to prevent the client from + * ever having to do io (which could cause a deadlock). 
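+ *
+ * The pool holds MIN_JOBS entries of _job_cache; each entry is sized
+ * for one master job followed by SPLIT_COUNT sub jobs, so splitting a
+ * large copy never requires a further allocation.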
+ *---------------------------------------------------------------*/ +struct kcopyd_job { + struct dm_kcopyd_client *kc; + struct list_head list; + unsigned long flags; + + /* + * Error state of the job. + */ + int read_err; + unsigned long write_err; + + /* + * Either READ or WRITE + */ + int rw; + struct dm_io_region source; + + /* + * The destinations for the transfer. + */ + unsigned int num_dests; + struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; + + struct page_list *pages; + + /* + * Set this to ensure you are notified when the job has + * completed. 'context' is for callback to use. + */ + dm_kcopyd_notify_fn fn; + void *context; + + /* + * These fields are only used if the job has been split + * into more manageable parts. + */ + struct mutex lock; + atomic_t sub_jobs; + sector_t progress; + + struct kcopyd_job *master_job; +}; + +static struct kmem_cache *_job_cache; + +int __init dm_kcopyd_init(void) +{ + _job_cache = kmem_cache_create("kcopyd_job", + sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), + __alignof__(struct kcopyd_job), 0, NULL); + if (!_job_cache) + return -ENOMEM; + + zero_page_list.next = &zero_page_list; + zero_page_list.page = ZERO_PAGE(0); + + return 0; +} + +void dm_kcopyd_exit(void) +{ + kmem_cache_destroy(_job_cache); + _job_cache = NULL; +} + +/* + * Functions to push and pop a job onto the head of a given job + * list. + */ +static struct kcopyd_job *pop(struct list_head *jobs, + struct dm_kcopyd_client *kc) +{ + struct kcopyd_job *job = NULL; + unsigned long flags; + + spin_lock_irqsave(&kc->job_lock, flags); + + if (!list_empty(jobs)) { + job = list_entry(jobs->next, struct kcopyd_job, list); + list_del(&job->list); + } + spin_unlock_irqrestore(&kc->job_lock, flags); + + return job; +} + +static void push(struct list_head *jobs, struct kcopyd_job *job) +{ + unsigned long flags; + struct dm_kcopyd_client *kc = job->kc; + + spin_lock_irqsave(&kc->job_lock, flags); + list_add_tail(&job->list, jobs); + spin_unlock_irqrestore(&kc->job_lock, flags); +} + + +static void push_head(struct list_head *jobs, struct kcopyd_job *job) +{ + unsigned long flags; + struct dm_kcopyd_client *kc = job->kc; + + spin_lock_irqsave(&kc->job_lock, flags); + list_add(&job->list, jobs); + spin_unlock_irqrestore(&kc->job_lock, flags); +} + +/* + * These three functions process 1 item from the corresponding + * job list. + * + * They return: + * < 0: error + * 0: success + * > 0: can't process yet. + */ +static int run_complete_job(struct kcopyd_job *job) +{ + void *context = job->context; + int read_err = job->read_err; + unsigned long write_err = job->write_err; + dm_kcopyd_notify_fn fn = job->fn; + struct dm_kcopyd_client *kc = job->kc; + + if (job->pages && job->pages != &zero_page_list) + kcopyd_put_pages(kc, job->pages); + /* + * If this is the master job, the sub jobs have already + * completed so we can free everything. 
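+ *
+ * Sub jobs are elements of the array allocated together with the
+ * master job, so only the master is returned to the mempool.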
+ */ + if (job->master_job == job) + mempool_free(job, kc->job_pool); + fn(read_err, write_err, context); + + if (atomic_dec_and_test(&kc->nr_jobs)) + wake_up(&kc->destroyq); + + return 0; +} + +static void complete_io(unsigned long error, void *context) +{ + struct kcopyd_job *job = (struct kcopyd_job *) context; + struct dm_kcopyd_client *kc = job->kc; + + if (error) { + if (job->rw == WRITE) + job->write_err |= error; + else + job->read_err = 1; + + if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { + push(&kc->complete_jobs, job); + wake(kc); + return; + } + } + + if (job->rw == WRITE) + push(&kc->complete_jobs, job); + + else { + job->rw = WRITE; + push(&kc->io_jobs, job); + } + + wake(kc); +} + +/* + * Request io on as many buffer heads as we can currently get for + * a particular job. + */ +static int run_io_job(struct kcopyd_job *job) +{ + int r; + struct dm_io_request io_req = { + .bi_rw = job->rw, + .mem.type = DM_IO_PAGE_LIST, + .mem.ptr.pl = job->pages, + .mem.offset = 0, + .notify.fn = complete_io, + .notify.context = job, + .client = job->kc->io_client, + }; + + if (job->rw == READ) + r = dm_io(&io_req, 1, &job->source, NULL); + else + r = dm_io(&io_req, job->num_dests, job->dests, NULL); + + return r; +} + +static int run_pages_job(struct kcopyd_job *job) +{ + int r; + unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); + + r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); + if (!r) { + /* this job is ready for io */ + push(&job->kc->io_jobs, job); + return 0; + } + + if (r == -ENOMEM) + /* can't complete now */ + return 1; + + return r; +} + +/* + * Run through a list for as long as possible. Returns the count + * of successful jobs. + */ +static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, + int (*fn) (struct kcopyd_job *)) +{ + struct kcopyd_job *job; + int r, count = 0; + + while ((job = pop(jobs, kc))) { + + r = fn(job); + + if (r < 0) { + /* error this rogue job */ + if (job->rw == WRITE) + job->write_err = (unsigned long) -1L; + else + job->read_err = 1; + push(&kc->complete_jobs, job); + break; + } + + if (r > 0) { + /* + * We couldn't service this job ATM, so + * push this job back onto the list. + */ + push_head(jobs, job); + break; + } + + count++; + } + + return count; +} + +/* + * kcopyd does this every time it's woken up. + */ +static void do_work(struct work_struct *work) +{ + struct dm_kcopyd_client *kc = container_of(work, + struct dm_kcopyd_client, kcopyd_work); + struct blk_plug plug; + + /* + * The order that these are called is *very* important. + * complete jobs can free some pages for pages jobs. + * Pages jobs when successful will jump onto the io jobs + * list. io jobs call wake when they complete and it all + * starts again. + */ + blk_start_plug(&plug); + process_jobs(&kc->complete_jobs, kc, run_complete_job); + process_jobs(&kc->pages_jobs, kc, run_pages_job); + process_jobs(&kc->io_jobs, kc, run_io_job); + blk_finish_plug(&plug); +} + +/* + * If we are copying a small region we just dispatch a single job + * to do the copy, otherwise the io has to be split up into many + * jobs. 
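+ *
+ * The split threshold is SUB_JOB_SIZE sectors: dm_kcopyd_copy()
+ * dispatches jobs up to that size directly and uses split_job() for
+ * anything larger, fanning the work out across SPLIT_COUNT sub jobs
+ * that each walk the source in SUB_JOB_SIZE chunks via
+ * segment_complete().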
+ */ +static void dispatch_job(struct kcopyd_job *job) +{ + struct dm_kcopyd_client *kc = job->kc; + atomic_inc(&kc->nr_jobs); + if (unlikely(!job->source.count)) + push(&kc->complete_jobs, job); + else if (job->pages == &zero_page_list) + push(&kc->io_jobs, job); + else + push(&kc->pages_jobs, job); + wake(kc); +} + +static void segment_complete(int read_err, unsigned long write_err, + void *context) +{ + /* FIXME: tidy this function */ + sector_t progress = 0; + sector_t count = 0; + struct kcopyd_job *sub_job = (struct kcopyd_job *) context; + struct kcopyd_job *job = sub_job->master_job; + struct dm_kcopyd_client *kc = job->kc; + + mutex_lock(&job->lock); + + /* update the error */ + if (read_err) + job->read_err = 1; + + if (write_err) + job->write_err |= write_err; + + /* + * Only dispatch more work if there hasn't been an error. + */ + if ((!job->read_err && !job->write_err) || + test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { + /* get the next chunk of work */ + progress = job->progress; + count = job->source.count - progress; + if (count) { + if (count > SUB_JOB_SIZE) + count = SUB_JOB_SIZE; + + job->progress += count; + } + } + mutex_unlock(&job->lock); + + if (count) { + int i; + + *sub_job = *job; + sub_job->source.sector += progress; + sub_job->source.count = count; + + for (i = 0; i < job->num_dests; i++) { + sub_job->dests[i].sector += progress; + sub_job->dests[i].count = count; + } + + sub_job->fn = segment_complete; + sub_job->context = sub_job; + dispatch_job(sub_job); + + } else if (atomic_dec_and_test(&job->sub_jobs)) { + + /* + * Queue the completion callback to the kcopyd thread. + * + * Some callers assume that all the completions are called + * from a single thread and don't race with each other. + * + * We must not call the callback directly here because this + * code may not be executing in the thread. + */ + push(&kc->complete_jobs, job); + wake(kc); + } +} + +/* + * Create some sub jobs to share the work between them. + */ +static void split_job(struct kcopyd_job *master_job) +{ + int i; + + atomic_inc(&master_job->kc->nr_jobs); + + atomic_set(&master_job->sub_jobs, SPLIT_COUNT); + for (i = 0; i < SPLIT_COUNT; i++) { + master_job[i + 1].master_job = master_job; + segment_complete(0, 0u, &master_job[i + 1]); + } +} + +int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, + unsigned int num_dests, struct dm_io_region *dests, + unsigned int flags, dm_kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + /* + * Allocate an array of jobs consisting of one master job + * followed by SPLIT_COUNT sub jobs. + */ + job = mempool_alloc(kc->job_pool, GFP_NOIO); + + /* + * set up for the read. 
+ */ + job->kc = kc; + job->flags = flags; + job->read_err = 0; + job->write_err = 0; + + job->num_dests = num_dests; + memcpy(&job->dests, dests, sizeof(*dests) * num_dests); + + if (from) { + job->source = *from; + job->pages = NULL; + job->rw = READ; + } else { + memset(&job->source, 0, sizeof job->source); + job->source.count = job->dests[0].count; + job->pages = &zero_page_list; + job->rw = WRITE; + } + + job->fn = fn; + job->context = context; + job->master_job = job; + + if (job->source.count <= SUB_JOB_SIZE) + dispatch_job(job); + else { + mutex_init(&job->lock); + job->progress = 0; + split_job(job); + } + + return 0; +} +EXPORT_SYMBOL(dm_kcopyd_copy); + +int dm_kcopyd_zero(struct dm_kcopyd_client *kc, + unsigned num_dests, struct dm_io_region *dests, + unsigned flags, dm_kcopyd_notify_fn fn, void *context) +{ + return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); +} +EXPORT_SYMBOL(dm_kcopyd_zero); + +void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, + dm_kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + job = mempool_alloc(kc->job_pool, GFP_NOIO); + + memset(job, 0, sizeof(struct kcopyd_job)); + job->kc = kc; + job->fn = fn; + job->context = context; + job->master_job = job; + + atomic_inc(&kc->nr_jobs); + + return job; +} +EXPORT_SYMBOL(dm_kcopyd_prepare_callback); + +void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) +{ + struct kcopyd_job *job = j; + struct dm_kcopyd_client *kc = job->kc; + + job->read_err = read_err; + job->write_err = write_err; + + push(&kc->complete_jobs, job); + wake(kc); +} +EXPORT_SYMBOL(dm_kcopyd_do_callback); + +/* + * Cancels a kcopyd job, eg. someone might be deactivating a + * mirror. + */ +#if 0 +int kcopyd_cancel(struct kcopyd_job *job, int block) +{ + /* FIXME: finish */ + return -1; +} +#endif /* 0 */ + +/*----------------------------------------------------------------- + * Client setup + *---------------------------------------------------------------*/ +struct dm_kcopyd_client *dm_kcopyd_client_create(void) +{ + int r = -ENOMEM; + struct dm_kcopyd_client *kc; + + kc = kmalloc(sizeof(*kc), GFP_KERNEL); + if (!kc) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&kc->job_lock); + INIT_LIST_HEAD(&kc->complete_jobs); + INIT_LIST_HEAD(&kc->io_jobs); + INIT_LIST_HEAD(&kc->pages_jobs); + + kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); + if (!kc->job_pool) + goto bad_slab; + + INIT_WORK(&kc->kcopyd_work, do_work); + kc->kcopyd_wq = alloc_workqueue("kcopyd", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); + if (!kc->kcopyd_wq) + goto bad_workqueue; + + kc->pages = NULL; + kc->nr_reserved_pages = kc->nr_free_pages = 0; + r = client_reserve_pages(kc, RESERVE_PAGES); + if (r) + goto bad_client_pages; + + kc->io_client = dm_io_client_create(); + if (IS_ERR(kc->io_client)) { + r = PTR_ERR(kc->io_client); + goto bad_io_client; + } + + init_waitqueue_head(&kc->destroyq); + atomic_set(&kc->nr_jobs, 0); + + return kc; + +bad_io_client: + client_free_pages(kc); +bad_client_pages: + destroy_workqueue(kc->kcopyd_wq); +bad_workqueue: + mempool_destroy(kc->job_pool); +bad_slab: + kfree(kc); + + return ERR_PTR(r); +} +EXPORT_SYMBOL(dm_kcopyd_client_create); + +void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) +{ + /* Wait for completion of all jobs submitted by this client. 
*/ + wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); + + BUG_ON(!list_empty(&kc->complete_jobs)); + BUG_ON(!list_empty(&kc->io_jobs)); + BUG_ON(!list_empty(&kc->pages_jobs)); + destroy_workqueue(kc->kcopyd_wq); + dm_io_client_destroy(kc->io_client); + client_free_pages(kc); + mempool_destroy(kc->job_pool); + kfree(kc); +} +EXPORT_SYMBOL(dm_kcopyd_client_destroy); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c new file mode 100644 index 00000000..3639eeab --- /dev/null +++ b/drivers/md/dm-linear.c @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2001-2003 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "linear" + +/* + * Linear: maps a linear range of a device. + */ +struct linear_c { + struct dm_dev *dev; + sector_t start; +}; + +/* + * Construct a linear mapping: <dev_path> <offset> + */ +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct linear_c *lc; + unsigned long long tmp; + char dummy; + + if (argc != 2) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (lc == NULL) { + ti->error = "dm-linear: Cannot allocate linear context"; + return -ENOMEM; + } + + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { + ti->error = "dm-linear: Invalid device sector"; + goto bad; + } + lc->start = tmp; + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) { + ti->error = "dm-linear: Device lookup failed"; + goto bad; + } + + ti->num_flush_requests = 1; + ti->num_discard_requests = 1; + ti->private = lc; + return 0; + + bad: + kfree(lc); + return -EINVAL; +} + +static void linear_dtr(struct dm_target *ti) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + dm_put_device(ti, lc->dev); + kfree(lc); +} + +static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) +{ + struct linear_c *lc = ti->private; + + return lc->start + dm_target_offset(ti, bi_sector); +} + +static void linear_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct linear_c *lc = ti->private; + + bio->bi_bdev = lc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_sector = linear_map_sector(ti, bio->bi_sector); +} + +static int linear_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + linear_map_bio(ti, bio); + + return DM_MAPIO_REMAPPED; +} + +static int linear_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s %llu", lc->dev->name, + (unsigned long long)lc->start); + break; + } + return 0; +} + +static int linear_ioctl(struct dm_target *ti, unsigned int cmd, + unsigned long arg) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + struct dm_dev *dev = lc->dev; + int r = 0; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (lc->start || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? 
: __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); +} + +static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct linear_c *lc = ti->private; + struct request_queue *q = bdev_get_queue(lc->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = lc->dev->bdev; + bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct linear_c *lc = ti->private; + + return fn(ti, lc->dev, lc->start, ti->len, data); +} + +static struct target_type linear_target = { + .name = "linear", + .version = {1, 1, 0}, + .module = THIS_MODULE, + .ctr = linear_ctr, + .dtr = linear_dtr, + .map = linear_map, + .status = linear_status, + .ioctl = linear_ioctl, + .merge = linear_merge, + .iterate_devices = linear_iterate_devices, +}; + +int __init dm_linear_init(void) +{ + int r = dm_register_target(&linear_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +void dm_linear_exit(void) +{ + dm_unregister_target(&linear_target); +} diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 00000000..9429159d --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c @@ -0,0 +1,818 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/dm-dirty-log.h> +#include <linux/device-mapper.h> +#include <linux/dm-log-userspace.h> +#include <linux/module.h> + +#include "dm-log-userspace-transfer.h" + +#define DM_LOG_USERSPACE_VSN "1.1.0" + +struct flush_entry { + int type; + region_t region; + struct list_head list; +}; + +/* + * This limit on the number of mark and clear request is, to a degree, + * arbitrary. However, there is some basis for the choice in the limits + * imposed on the size of data payload by dm-log-userspace-transfer.c: + * dm_consult_userspace(). + */ +#define MAX_FLUSH_GROUP_COUNT 32 + +struct log_c { + struct dm_target *ti; + struct dm_dev *log_dev; + uint32_t region_size; + region_t region_count; + uint64_t luid; + char uuid[DM_UUID_LEN]; + + char *usr_argv_str; + uint32_t usr_argc; + + /* + * in_sync_hint gets set when doing is_remote_recovering. It + * represents the first region that needs recovery. IOW, the + * first zero bit of sync_bits. This can be useful for to limit + * traffic for calls like is_remote_recovering and get_resync_work, + * but be take care in its use for anything else. + */ + uint64_t in_sync_hint; + + /* + * Mark and clear requests are held until a flush is issued + * so that we can group, and thereby limit, the amount of + * network traffic between kernel and userspace. The 'flush_lock' + * is used to protect these lists. 
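+ *
+ * Requests are queued as struct flush_entry items on mark_list and
+ * clear_list; userspace_flush() drains both lists and flush_by_group()
+ * sends the regions to the server in batches of up to
+ * MAX_FLUSH_GROUP_COUNT.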
+ */ + spinlock_t flush_lock; + struct list_head mark_list; + struct list_head clear_list; +}; + +static mempool_t *flush_entry_pool; + +static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct flush_entry), gfp_mask); +} + +static void flush_entry_free(void *element, void *pool_data) +{ + kfree(element); +} + +static int userspace_do_request(struct log_c *lc, const char *uuid, + int request_type, char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r; + + /* + * If the server isn't there, -ESRCH is returned, + * and we must keep trying until the server is + * restored. + */ +retry: + r = dm_consult_userspace(uuid, lc->luid, request_type, data, + data_size, rdata, rdata_size); + + if (r != -ESRCH) + return r; + + DMERR(" Userspace log server not found."); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2*HZ); + DMWARN("Attempting to contact userspace log server..."); + r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, + lc->usr_argv_str, + strlen(lc->usr_argv_str) + 1, + NULL, NULL); + if (!r) + break; + } + DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); + r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, + 0, NULL, NULL); + if (!r) + goto retry; + + DMERR("Error trying to resume userspace log: %d", r); + + return -ESRCH; +} + +static int build_constructor_string(struct dm_target *ti, + unsigned argc, char **argv, + char **ctr_str) +{ + int i, str_size; + char *str = NULL; + + *ctr_str = NULL; + + for (i = 0, str_size = 0; i < argc; i++) + str_size += strlen(argv[i]) + 1; /* +1 for space between args */ + + str_size += 20; /* Max number of chars in a printed u64 number */ + + str = kzalloc(str_size, GFP_KERNEL); + if (!str) { + DMWARN("Unable to allocate memory for constructor string"); + return -ENOMEM; + } + + str_size = sprintf(str, "%llu", (unsigned long long)ti->len); + for (i = 0; i < argc; i++) + str_size += sprintf(str + str_size, " %s", argv[i]); + + *ctr_str = str; + return str_size; +} + +/* + * userspace_ctr + * + * argv contains: + * <UUID> <other args> + * Where 'other args' is the userspace implementation specific log + * arguments. An example might be: + * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] + * + * So, this module will strip off the <UUID> for identification purposes + * when communicating with userspace about a log; but will pass on everything + * else. 
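+ *
+ * The string actually handed to the server with DM_ULOG_CTR is
+ * "<ti->len> <other args>", as assembled by build_constructor_string().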
+ */ +static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned argc, char **argv) +{ + int r = 0; + int str_size; + char *ctr_str = NULL; + struct log_c *lc = NULL; + uint64_t rdata; + size_t rdata_size = sizeof(rdata); + char *devices_rdata = NULL; + size_t devices_rdata_size = DM_NAME_LEN; + + if (argc < 3) { + DMWARN("Too few arguments to userspace dirty log"); + return -EINVAL; + } + + lc = kzalloc(sizeof(*lc), GFP_KERNEL); + if (!lc) { + DMWARN("Unable to allocate userspace log context."); + return -ENOMEM; + } + + /* The ptr value is sufficient for local unique id */ + lc->luid = (unsigned long)lc; + + lc->ti = ti; + + if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { + DMWARN("UUID argument too long."); + kfree(lc); + return -EINVAL; + } + + strncpy(lc->uuid, argv[0], DM_UUID_LEN); + spin_lock_init(&lc->flush_lock); + INIT_LIST_HEAD(&lc->mark_list); + INIT_LIST_HEAD(&lc->clear_list); + + str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); + if (str_size < 0) { + kfree(lc); + return str_size; + } + + devices_rdata = kzalloc(devices_rdata_size, GFP_KERNEL); + if (!devices_rdata) { + DMERR("Failed to allocate memory for device information"); + r = -ENOMEM; + goto out; + } + + /* + * Send table string and get back any opened device. + */ + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, + ctr_str, str_size, + devices_rdata, &devices_rdata_size); + + if (r < 0) { + if (r == -ESRCH) + DMERR("Userspace log server not found"); + else + DMERR("Userspace log server failed to create log"); + goto out; + } + + /* Since the region size does not change, get it now */ + rdata_size = sizeof(rdata); + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, + NULL, 0, (char *)&rdata, &rdata_size); + + if (r) { + DMERR("Failed to get region size of dirty log"); + goto out; + } + + lc->region_size = (uint32_t)rdata; + lc->region_count = dm_sector_div_up(ti->len, lc->region_size); + + if (devices_rdata_size) { + if (devices_rdata[devices_rdata_size - 1] != '\0') { + DMERR("DM_ULOG_CTR device return string not properly terminated"); + r = -EINVAL; + goto out; + } + r = dm_get_device(ti, devices_rdata, + dm_table_get_mode(ti->table), &lc->log_dev); + if (r) + DMERR("Failed to register %s with device-mapper", + devices_rdata); + } +out: + kfree(devices_rdata); + if (r) { + kfree(lc); + kfree(ctr_str); + } else { + lc->usr_argv_str = ctr_str; + lc->usr_argc = argc; + log->context = lc; + } + + return r; +} + +static void userspace_dtr(struct dm_dirty_log *log) +{ + struct log_c *lc = log->context; + + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, + NULL, 0, + NULL, NULL); + + if (lc->log_dev) + dm_put_device(lc->ti, lc->log_dev); + + kfree(lc->usr_argv_str); + kfree(lc); + + return; +} + +static int userspace_presuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, + NULL, 0, + NULL, NULL); + + return r; +} + +static int userspace_postsuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, + NULL, 0, + NULL, NULL); + + return r; +} + +static int userspace_resume(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + lc->in_sync_hint = 0; + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, + NULL, 0, + NULL, NULL); + + return r; +} + +static uint32_t userspace_get_region_size(struct dm_dirty_log *log) +{ + 
struct log_c *lc = log->context; + + return lc->region_size; +} + +/* + * userspace_is_clean + * + * Check whether a region is clean. If there is any sort of + * failure when consulting the server, we return not clean. + * + * Returns: 1 if clean, 0 otherwise + */ +static int userspace_is_clean(struct dm_dirty_log *log, region_t region) +{ + int r; + uint64_t region64 = (uint64_t)region; + int64_t is_clean; + size_t rdata_size; + struct log_c *lc = log->context; + + rdata_size = sizeof(is_clean); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, + (char *)®ion64, sizeof(region64), + (char *)&is_clean, &rdata_size); + + return (r) ? 0 : (int)is_clean; +} + +/* + * userspace_in_sync + * + * Check if the region is in-sync. If there is any sort + * of failure when consulting the server, we assume that + * the region is not in sync. + * + * If 'can_block' is set, return immediately + * + * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK + */ +static int userspace_in_sync(struct dm_dirty_log *log, region_t region, + int can_block) +{ + int r; + uint64_t region64 = region; + int64_t in_sync; + size_t rdata_size; + struct log_c *lc = log->context; + + /* + * We can never respond directly - even if in_sync_hint is + * set. This is because another machine could see a device + * failure and mark the region out-of-sync. If we don't go + * to userspace to ask, we might think the region is in-sync + * and allow a read to pick up data that is stale. (This is + * very unlikely if a device actually fails; but it is very + * likely if a connection to one device from one machine fails.) + * + * There still might be a problem if the mirror caches the region + * state as in-sync... but then this call would not be made. So, + * that is a mirror problem. + */ + if (!can_block) + return -EWOULDBLOCK; + + rdata_size = sizeof(in_sync); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, + (char *)®ion64, sizeof(region64), + (char *)&in_sync, &rdata_size); + return (r) ? 0 : (int)in_sync; +} + +static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + struct flush_entry *fe; + + list_for_each_entry(fe, flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + break; + } + + return r; +} + +static int flush_by_group(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + int count; + uint32_t type = 0; + struct flush_entry *fe, *tmp_fe; + LIST_HEAD(tmp_list); + uint64_t group[MAX_FLUSH_GROUP_COUNT]; + + /* + * Group process the requests + */ + while (!list_empty(flush_list)) { + count = 0; + + list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { + group[count] = fe->region; + count++; + + list_move(&fe->list, &tmp_list); + + type = fe->type; + if (count >= MAX_FLUSH_GROUP_COUNT) + break; + } + + r = userspace_do_request(lc, lc->uuid, type, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + if (r) { + /* Group send failed. Attempt one-by-one. */ + list_splice_init(&tmp_list, flush_list); + r = flush_one_by_one(lc, flush_list); + break; + } + } + + /* + * Must collect flush_entrys that were successfully processed + * as a group so that they will be free'd by the caller. + */ + list_splice_init(&tmp_list, flush_list); + + return r; +} + +/* + * userspace_flush + * + * This function is ok to block. + * The flush happens in two stages. First, it sends all + * clear/mark requests that are on the list. Then it + * tells the server to commit them. 
This gives the + * server a chance to optimise the commit, instead of + * doing it for every request. + * + * Additionally, we could implement another thread that + * sends the requests up to the server - reducing the + * load on flush. Then the flush would have less in + * the list and be responsible for the finishing commit. + * + * Returns: 0 on success, < 0 on failure + */ +static int userspace_flush(struct dm_dirty_log *log) +{ + int r = 0; + unsigned long flags; + struct log_c *lc = log->context; + LIST_HEAD(mark_list); + LIST_HEAD(clear_list); + struct flush_entry *fe, *tmp_fe; + + spin_lock_irqsave(&lc->flush_lock, flags); + list_splice_init(&lc->mark_list, &mark_list); + list_splice_init(&lc->clear_list, &clear_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + if (list_empty(&mark_list) && list_empty(&clear_list)) + return 0; + + r = flush_by_group(lc, &mark_list); + if (r) + goto fail; + + r = flush_by_group(lc, &clear_list); + if (r) + goto fail; + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, + NULL, 0, NULL, NULL); + +fail: + /* + * We can safely remove these entries, even if failure. + * Calling code will receive an error and will know that + * the log facility has failed. + */ + list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + + if (r) + dm_table_event(lc->ti->table); + + return r; +} + +/* + * userspace_mark_region + * + * This function should avoid blocking unless absolutely required. + * (Memory allocation is valid for blocking.) + */ +static void userspace_mark_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct flush_entry *fe; + + /* Wait for an allocation, but _never_ fail */ + fe = mempool_alloc(flush_entry_pool, GFP_NOIO); + BUG_ON(!fe); + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_MARK_REGION; + fe->region = region; + list_add(&fe->list, &lc->mark_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_clear_region + * + * This function must not block. + * So, the alloc can't block. In the worst case, it is ok to + * fail. It would simply mean we can't clear the region. + * Does nothing to current sync context, but does mean + * the region will be re-sync'ed on a reload of the mirror + * even though it is in-sync. + */ +static void userspace_clear_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct flush_entry *fe; + + /* + * If we fail to allocate, we skip the clearing of + * the region. This doesn't hurt us in any way, except + * to cause the region to be resync'ed when the + * device is activated next time. + */ + fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); + if (!fe) { + DMERR("Failed to allocate memory to clear region."); + return; + } + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_CLEAR_REGION; + fe->region = region; + list_add(&fe->list, &lc->clear_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_get_resync_work + * + * Get a region that needs recovery. It is valid to return + * an error for this function. 
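+ * The chosen region is handed back through 'region'.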
+ * + * Returns: 1 if region filled, 0 if no work, <0 on error + */ +static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) +{ + int r; + size_t rdata_size; + struct log_c *lc = log->context; + struct { + int64_t i; /* 64-bit for mix arch compatibility */ + region_t r; + } pkg; + + if (lc->in_sync_hint >= lc->region_count) + return 0; + + rdata_size = sizeof(pkg); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, + NULL, 0, + (char *)&pkg, &rdata_size); + + *region = pkg.r; + return (r) ? r : (int)pkg.i; +} + +/* + * userspace_set_region_sync + * + * Set the sync status of a given region. This function + * must not fail. + */ +static void userspace_set_region_sync(struct dm_dirty_log *log, + region_t region, int in_sync) +{ + int r; + struct log_c *lc = log->context; + struct { + region_t r; + int64_t i; + } pkg; + + pkg.r = region; + pkg.i = (int64_t)in_sync; + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, + (char *)&pkg, sizeof(pkg), + NULL, NULL); + + /* + * It would be nice to be able to report failures. + * However, it is easy emough to detect and resolve. + */ + return; +} + +/* + * userspace_get_sync_count + * + * If there is any sort of failure when consulting the server, + * we assume that the sync count is zero. + * + * Returns: sync count on success, 0 on failure + */ +static region_t userspace_get_sync_count(struct dm_dirty_log *log) +{ + int r; + size_t rdata_size; + uint64_t sync_count; + struct log_c *lc = log->context; + + rdata_size = sizeof(sync_count); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, + NULL, 0, + (char *)&sync_count, &rdata_size); + + if (r) + return 0; + + if (sync_count >= lc->region_count) + lc->in_sync_hint = lc->region_count; + + return (region_t)sync_count; +} + +/* + * userspace_status + * + * Returns: amount of space consumed + */ +static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, + char *result, unsigned maxlen) +{ + int r = 0; + char *table_args; + size_t sz = (size_t)maxlen; + struct log_c *lc = log->context; + + switch (status_type) { + case STATUSTYPE_INFO: + r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, + NULL, 0, + result, &sz); + + if (r) { + sz = 0; + DMEMIT("%s 1 COM_FAILURE", log->type->name); + } + break; + case STATUSTYPE_TABLE: + sz = 0; + table_args = strchr(lc->usr_argv_str, ' '); + BUG_ON(!table_args); /* There will always be a ' ' */ + table_args++; + + DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, + lc->uuid, table_args); + break; + } + return (r) ? 0 : (int)sz; +} + +/* + * userspace_is_remote_recovering + * + * Returns: 1 if region recovering, 0 otherwise + */ +static int userspace_is_remote_recovering(struct dm_dirty_log *log, + region_t region) +{ + int r; + uint64_t region64 = region; + struct log_c *lc = log->context; + static unsigned long long limit; + struct { + int64_t is_recovering; + uint64_t in_sync_hint; + } pkg; + size_t rdata_size = sizeof(pkg); + + /* + * Once the mirror has been reported to be in-sync, + * it will never again ask for recovery work. So, + * we can safely say there is not a remote machine + * recovering if the device is in-sync. (in_sync_hint + * must be reset at resume time.) 
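+ *
+ * Round trips to the server are also rate limited below: if it was
+ * consulted less than a quarter of a second ago, keep reporting the
+ * region as recovering instead of asking again.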
+ */ + if (region < lc->in_sync_hint) + return 0; + else if (jiffies < limit) + return 1; + + limit = jiffies + (HZ / 4); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, + (char *)®ion64, sizeof(region64), + (char *)&pkg, &rdata_size); + if (r) + return 1; + + lc->in_sync_hint = pkg.in_sync_hint; + + return (int)pkg.is_recovering; +} + +static struct dm_dirty_log_type _userspace_type = { + .name = "userspace", + .module = THIS_MODULE, + .ctr = userspace_ctr, + .dtr = userspace_dtr, + .presuspend = userspace_presuspend, + .postsuspend = userspace_postsuspend, + .resume = userspace_resume, + .get_region_size = userspace_get_region_size, + .is_clean = userspace_is_clean, + .in_sync = userspace_in_sync, + .flush = userspace_flush, + .mark_region = userspace_mark_region, + .clear_region = userspace_clear_region, + .get_resync_work = userspace_get_resync_work, + .set_region_sync = userspace_set_region_sync, + .get_sync_count = userspace_get_sync_count, + .status = userspace_status, + .is_remote_recovering = userspace_is_remote_recovering, +}; + +static int __init userspace_dirty_log_init(void) +{ + int r = 0; + + flush_entry_pool = mempool_create(100, flush_entry_alloc, + flush_entry_free, NULL); + + if (!flush_entry_pool) { + DMWARN("Unable to create flush_entry_pool: No memory."); + return -ENOMEM; + } + + r = dm_ulog_tfr_init(); + if (r) { + DMWARN("Unable to initialize userspace log communications"); + mempool_destroy(flush_entry_pool); + return r; + } + + r = dm_dirty_log_type_register(&_userspace_type); + if (r) { + DMWARN("Couldn't register userspace dirty log type"); + dm_ulog_tfr_exit(); + mempool_destroy(flush_entry_pool); + return r; + } + + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); + return 0; +} + +static void __exit userspace_dirty_log_exit(void) +{ + dm_dirty_log_type_unregister(&_userspace_type); + dm_ulog_tfr_exit(); + mempool_destroy(flush_entry_pool); + + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); + return; +} + +module_init(userspace_dirty_log_init); +module_exit(userspace_dirty_log_exit); + +MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); +MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 00000000..08d9a207 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c @@ -0,0 +1,286 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <net/sock.h> +#include <linux/workqueue.h> +#include <linux/connector.h> +#include <linux/device-mapper.h> +#include <linux/dm-log-userspace.h> + +#include "dm-log-userspace-transfer.h" + +static uint32_t dm_ulog_seq; + +/* + * Netlink/Connector is an unreliable protocol. How long should + * we wait for a response before assuming it was lost and retrying? + * (If we do receive a response after this time, it will be discarded + * and the response to the resent request will be waited for. 
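+ * A resent request carries a fresh sequence number.)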
+ */ +#define DM_ULOG_RETRY_TIMEOUT (15 * HZ) + +/* + * Pre-allocated space for speed + */ +#define DM_ULOG_PREALLOCED_SIZE 512 +static struct cn_msg *prealloced_cn_msg; +static struct dm_ulog_request *prealloced_ulog_tfr; + +static struct cb_id ulog_cn_id = { + .idx = CN_IDX_DM, + .val = CN_VAL_DM_USERSPACE_LOG +}; + +static DEFINE_MUTEX(dm_ulog_lock); + +struct receiving_pkg { + struct list_head list; + struct completion complete; + + uint32_t seq; + + int error; + size_t *data_size; + char *data; +}; + +static DEFINE_SPINLOCK(receiving_list_lock); +static struct list_head receiving_list; + +static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) +{ + int r; + struct cn_msg *msg = prealloced_cn_msg; + + memset(msg, 0, sizeof(struct cn_msg)); + + msg->id.idx = ulog_cn_id.idx; + msg->id.val = ulog_cn_id.val; + msg->ack = 0; + msg->seq = tfr->seq; + msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; + + r = cn_netlink_send(msg, 0, gfp_any()); + + return r; +} + +/* + * Parameters for this function can be either msg or tfr, but not + * both. This function fills in the reply for a waiting request. + * If just msg is given, then the reply is simply an ACK from userspace + * that the request was received. + * + * Returns: 0 on success, -ENOENT on failure + */ +static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) +{ + uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; + struct receiving_pkg *pkg; + + /* + * The 'receiving_pkg' entries in this list are statically + * allocated on the stack in 'dm_consult_userspace'. + * Each process that is waiting for a reply from the user + * space server will have an entry in this list. + * + * We are safe to do it this way because the stack space + * is unique to each process, but still addressable by + * other processes. + */ + list_for_each_entry(pkg, &receiving_list, list) { + if (rtn_seq != pkg->seq) + continue; + + if (msg) { + pkg->error = -msg->ack; + /* + * If we are trying again, we will need to know our + * storage capacity. Otherwise, along with the + * error code, we make explicit that we have no data. + */ + if (pkg->error != -EAGAIN) + *(pkg->data_size) = 0; + } else if (tfr->data_size > *(pkg->data_size)) { + DMERR("Insufficient space to receive package [%u] " + "(%u vs %zu)", tfr->request_type, + tfr->data_size, *(pkg->data_size)); + + *(pkg->data_size) = 0; + pkg->error = -ENOSPC; + } else { + pkg->error = tfr->error; + memcpy(pkg->data, tfr->data, tfr->data_size); + *(pkg->data_size) = tfr->data_size; + } + complete(&pkg->complete); + return 0; + } + + return -ENOENT; +} + +/* + * This is the connector callback that delivers data + * that was sent from userspace. 
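+ * Replies are matched to their waiting requests by fill_pkg(),
+ * called under receiving_list_lock.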
+ */ +static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ + struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); + + if (!capable(CAP_SYS_ADMIN)) + return; + + spin_lock(&receiving_list_lock); + if (msg->len == 0) + fill_pkg(msg, NULL); + else if (msg->len < sizeof(*tfr)) + DMERR("Incomplete message received (expected %u, got %u): [%u]", + (unsigned)sizeof(*tfr), msg->len, msg->seq); + else + fill_pkg(NULL, tfr); + spin_unlock(&receiving_list_lock); +} + +/** + * dm_consult_userspace + * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size) + * @luid: log's local unique identifier + * @request_type: found in include/linux/dm-log-userspace.h + * @data: data to tx to the server + * @data_size: size of data in bytes + * @rdata: place to put return data from server + * @rdata_size: value-result (amount of space given/amount of space used) + * + * rdata_size is undefined on failure. + * + * Memory used to communicate with userspace is zero'ed + * before populating to ensure that no unwanted bits leak + * from kernel space to user-space. All userspace log communications + * between kernel and user space go through this function. + * + * Returns: 0 on success, -EXXX on failure + **/ +int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r = 0; + size_t dummy = 0; + int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); + struct dm_ulog_request *tfr = prealloced_ulog_tfr; + struct receiving_pkg pkg; + + /* + * Given the space needed to hold the 'struct cn_msg' and + * 'struct dm_ulog_request' - do we have enough payload + * space remaining? + */ + if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { + DMINFO("Size of tfr exceeds preallocated size"); + return -EINVAL; + } + + if (!rdata_size) + rdata_size = &dummy; +resend: + /* + * We serialize the sending of requests so we can + * use the preallocated space. + */ + mutex_lock(&dm_ulog_lock); + + memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); + memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; + tfr->luid = luid; + tfr->seq = dm_ulog_seq++; + + /* + * Must be valid request type (all other bits set to + * zero). This reserves other bits for possible future + * use. + */ + tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; + + tfr->data_size = data_size; + if (data && data_size) + memcpy(tfr->data, data, data_size); + + memset(&pkg, 0, sizeof(pkg)); + init_completion(&pkg.complete); + pkg.seq = tfr->seq; + pkg.data_size = rdata_size; + pkg.data = rdata; + spin_lock(&receiving_list_lock); + list_add(&(pkg.list), &receiving_list); + spin_unlock(&receiving_list_lock); + + r = dm_ulog_sendto_server(tfr); + + mutex_unlock(&dm_ulog_lock); + + if (r) { + DMERR("Unable to send log request [%u] to userspace: %d", + request_type, r); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + + goto out; + } + + r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + if (!r) { + DMWARN("[%s] Request timed out: [%u/%u] - retrying", + (strlen(uuid) > 8) ? 
+ (uuid + (strlen(uuid) - 8)) : (uuid), + request_type, pkg.seq); + goto resend; + } + + r = pkg.error; + if (r == -EAGAIN) + goto resend; + +out: + return r; +} + +int dm_ulog_tfr_init(void) +{ + int r; + void *prealloced; + + INIT_LIST_HEAD(&receiving_list); + + prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); + if (!prealloced) + return -ENOMEM; + + prealloced_cn_msg = prealloced; + prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); + + r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); + if (r) { + cn_del_callback(&ulog_cn_id); + return r; + } + + return 0; +} + +void dm_ulog_tfr_exit(void) +{ + cn_del_callback(&ulog_cn_id); + kfree(prealloced_cn_msg); +} diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 00000000..04ee874f --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#ifndef __DM_LOG_USERSPACE_TRANSFER_H__ +#define __DM_LOG_USERSPACE_TRANSFER_H__ + +#define DM_MSG_PREFIX "dm-log-userspace" + +int dm_ulog_tfr_init(void); +void dm_ulog_tfr_exit(void); +int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size); + +#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c new file mode 100644 index 00000000..65ebaebf --- /dev/null +++ b/drivers/md/dm-log.c @@ -0,0 +1,897 @@ +/* + * Copyright (C) 2003 Sistina Software + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/dm-io.h> +#include <linux/dm-dirty-log.h> + +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "dirty region log" + +static LIST_HEAD(_log_types); +static DEFINE_SPINLOCK(_lock); + +static struct dm_dirty_log_type *__find_dirty_log_type(const char *name) +{ + struct dm_dirty_log_type *log_type; + + list_for_each_entry(log_type, &_log_types, list) + if (!strcmp(name, log_type->name)) + return log_type; + + return NULL; +} + +static struct dm_dirty_log_type *_get_dirty_log_type(const char *name) +{ + struct dm_dirty_log_type *log_type; + + spin_lock(&_lock); + + log_type = __find_dirty_log_type(name); + if (log_type && !try_module_get(log_type->module)) + log_type = NULL; + + spin_unlock(&_lock); + + return log_type; +} + +/* + * get_type + * @type_name + * + * Attempt to retrieve the dm_dirty_log_type by name. If not already + * available, attempt to load the appropriate module. + * + * Log modules are named "dm-log-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-log-<type_name>", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-disk", it would search + * 'dm-log-clustered-disk' then 'dm-log-clustered'. 
+ * + * Returns: dirty_log_type* on success, NULL on failure + */ +static struct dm_dirty_log_type *get_type(const char *type_name) +{ + char *p, *type_name_dup; + struct dm_dirty_log_type *log_type; + + if (!type_name) + return NULL; + + log_type = _get_dirty_log_type(type_name); + if (log_type) + return log_type; + + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMWARN("No memory left to attempt log module load for \"%s\"", + type_name); + return NULL; + } + + while (request_module("dm-log-%s", type_name_dup) || + !(log_type = _get_dirty_log_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; + } + + if (!log_type) + DMWARN("Module for logging type \"%s\" not found.", type_name); + + kfree(type_name_dup); + + return log_type; +} + +static void put_type(struct dm_dirty_log_type *type) +{ + if (!type) + return; + + spin_lock(&_lock); + if (!__find_dirty_log_type(type->name)) + goto out; + + module_put(type->module); + +out: + spin_unlock(&_lock); +} + +int dm_dirty_log_type_register(struct dm_dirty_log_type *type) +{ + int r = 0; + + spin_lock(&_lock); + if (!__find_dirty_log_type(type->name)) + list_add(&type->list, &_log_types); + else + r = -EEXIST; + spin_unlock(&_lock); + + return r; +} +EXPORT_SYMBOL(dm_dirty_log_type_register); + +int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) +{ + spin_lock(&_lock); + + if (!__find_dirty_log_type(type->name)) { + spin_unlock(&_lock); + return -EINVAL; + } + + list_del(&type->list); + + spin_unlock(&_lock); + + return 0; +} +EXPORT_SYMBOL(dm_dirty_log_type_unregister); + +struct dm_dirty_log *dm_dirty_log_create(const char *type_name, + struct dm_target *ti, + int (*flush_callback_fn)(struct dm_target *ti), + unsigned int argc, char **argv) +{ + struct dm_dirty_log_type *type; + struct dm_dirty_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return NULL; + + type = get_type(type_name); + if (!type) { + kfree(log); + return NULL; + } + + log->flush_callback_fn = flush_callback_fn; + log->type = type; + if (type->ctr(log, ti, argc, argv)) { + kfree(log); + put_type(type); + return NULL; + } + + return log; +} +EXPORT_SYMBOL(dm_dirty_log_create); + +void dm_dirty_log_destroy(struct dm_dirty_log *log) +{ + log->type->dtr(log); + put_type(log->type); + kfree(log); +} +EXPORT_SYMBOL(dm_dirty_log_destroy); + +/*----------------------------------------------------------------- + * Persistent and core logs share a lot of their implementation. + * FIXME: need a reload method to be called from a resume + *---------------------------------------------------------------*/ +/* + * Magic for persistent mirrors: "MiRr" + */ +#define MIRROR_MAGIC 0x4D695272 + +/* + * The on-disk version of the metadata. + */ +#define MIRROR_DISK_VERSION 2 +#define LOG_OFFSET 2 + +struct log_header_disk { + __le32 magic; + + /* + * Simple, incrementing version. no backward + * compatibility. 
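+ * read_header() below refuses to use a log whose version does
+ * not match MIRROR_DISK_VERSION.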
+ */ + __le32 version; + __le64 nr_regions; +} __packed; + +struct log_header_core { + uint32_t magic; + uint32_t version; + uint64_t nr_regions; +}; + +struct log_c { + struct dm_target *ti; + int touched_dirtied; + int touched_cleaned; + int flush_failed; + uint32_t region_size; + unsigned int region_count; + region_t sync_count; + + unsigned bitset_uint32_count; + uint32_t *clean_bits; + uint32_t *sync_bits; + uint32_t *recovering_bits; /* FIXME: this seems excessive */ + + int sync_search; + + /* Resync flag */ + enum sync { + DEFAULTSYNC, /* Synchronize if necessary */ + NOSYNC, /* Devices known to be already in sync */ + FORCESYNC, /* Force a sync to happen */ + } sync; + + struct dm_io_request io_req; + + /* + * Disk log fields + */ + int log_dev_failed; + int log_dev_flush_failed; + struct dm_dev *log_dev; + struct log_header_core header; + + struct dm_io_region header_location; + struct log_header_disk *disk_header; +}; + +/* + * The touched member needs to be updated every time we access + * one of the bitsets. + */ +static inline int log_test_bit(uint32_t *bs, unsigned bit) +{ + return test_bit_le(bit, bs) ? 1 : 0; +} + +static inline void log_set_bit(struct log_c *l, + uint32_t *bs, unsigned bit) +{ + __set_bit_le(bit, bs); + l->touched_cleaned = 1; +} + +static inline void log_clear_bit(struct log_c *l, + uint32_t *bs, unsigned bit) +{ + __clear_bit_le(bit, bs); + l->touched_dirtied = 1; +} + +/*---------------------------------------------------------------- + * Header IO + *--------------------------------------------------------------*/ +static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk) +{ + disk->magic = cpu_to_le32(core->magic); + disk->version = cpu_to_le32(core->version); + disk->nr_regions = cpu_to_le64(core->nr_regions); +} + +static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk) +{ + core->magic = le32_to_cpu(disk->magic); + core->version = le32_to_cpu(disk->version); + core->nr_regions = le64_to_cpu(disk->nr_regions); +} + +static int rw_header(struct log_c *lc, int rw) +{ + lc->io_req.bi_rw = rw; + + return dm_io(&lc->io_req, 1, &lc->header_location, NULL); +} + +static int flush_header(struct log_c *lc) +{ + struct dm_io_region null_location = { + .bdev = lc->header_location.bdev, + .sector = 0, + .count = 0, + }; + + lc->io_req.bi_rw = WRITE_FLUSH; + + return dm_io(&lc->io_req, 1, &null_location, NULL); +} + +static int read_header(struct log_c *log) +{ + int r; + + r = rw_header(log, READ); + if (r) + return r; + + header_from_disk(&log->header, log->disk_header); + + /* New log required? 
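+ * Start fresh if the magic does not match or if an explicit
+ * sync/nosync resync was requested.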
*/ + if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { + log->header.magic = MIRROR_MAGIC; + log->header.version = MIRROR_DISK_VERSION; + log->header.nr_regions = 0; + } + +#ifdef __LITTLE_ENDIAN + if (log->header.version == 1) + log->header.version = 2; +#endif + + if (log->header.version != MIRROR_DISK_VERSION) { + DMWARN("incompatible disk log version"); + return -EINVAL; + } + + return 0; +} + +static int _check_region_size(struct dm_target *ti, uint32_t region_size) +{ + if (region_size < 2 || region_size > ti->len) + return 0; + + if (!is_power_of_2(region_size)) + return 0; + + return 1; +} + +/*---------------------------------------------------------------- + * core log constructor/destructor + * + * argv contains region_size followed optionally by [no]sync + *--------------------------------------------------------------*/ +#define BYTE_SHIFT 3 +static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv, + struct dm_dev *dev) +{ + enum sync sync = DEFAULTSYNC; + + struct log_c *lc; + uint32_t region_size; + unsigned int region_count; + size_t bitset_size, buf_size; + int r; + char dummy; + + if (argc < 1 || argc > 2) { + DMWARN("wrong number of arguments to dirty region log"); + return -EINVAL; + } + + if (argc > 1) { + if (!strcmp(argv[1], "sync")) + sync = FORCESYNC; + else if (!strcmp(argv[1], "nosync")) + sync = NOSYNC; + else { + DMWARN("unrecognised sync argument to " + "dirty region log: %s", argv[1]); + return -EINVAL; + } + } + + if (sscanf(argv[0], "%u%c", ®ion_size, &dummy) != 1 || + !_check_region_size(ti, region_size)) { + DMWARN("invalid region size %s", argv[0]); + return -EINVAL; + } + + region_count = dm_sector_div_up(ti->len, region_size); + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (!lc) { + DMWARN("couldn't allocate core log"); + return -ENOMEM; + } + + lc->ti = ti; + lc->touched_dirtied = 0; + lc->touched_cleaned = 0; + lc->flush_failed = 0; + lc->region_size = region_size; + lc->region_count = region_count; + lc->sync = sync; + + /* + * Work out how many "unsigned long"s we need to hold the bitset. + */ + bitset_size = dm_round_up(region_count, + sizeof(*lc->clean_bits) << BYTE_SHIFT); + bitset_size >>= BYTE_SHIFT; + + lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits); + + /* + * Disk log? + */ + if (!dev) { + lc->clean_bits = vmalloc(bitset_size); + if (!lc->clean_bits) { + DMWARN("couldn't allocate clean bitset"); + kfree(lc); + return -ENOMEM; + } + lc->disk_header = NULL; + } else { + lc->log_dev = dev; + lc->log_dev_failed = 0; + lc->log_dev_flush_failed = 0; + lc->header_location.bdev = lc->log_dev->bdev; + lc->header_location.sector = 0; + + /* + * Buffer holds both header and bitset. + */ + buf_size = + dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, + bdev_logical_block_size(lc->header_location. 
+ bdev)); + + if (buf_size > i_size_read(dev->bdev->bd_inode)) { + DMWARN("log device %s too small: need %llu bytes", + dev->name, (unsigned long long)buf_size); + kfree(lc); + return -EINVAL; + } + + lc->header_location.count = buf_size >> SECTOR_SHIFT; + + lc->io_req.mem.type = DM_IO_VMA; + lc->io_req.notify.fn = NULL; + lc->io_req.client = dm_io_client_create(); + if (IS_ERR(lc->io_req.client)) { + r = PTR_ERR(lc->io_req.client); + DMWARN("couldn't allocate disk io client"); + kfree(lc); + return r; + } + + lc->disk_header = vmalloc(buf_size); + if (!lc->disk_header) { + DMWARN("couldn't allocate disk log buffer"); + dm_io_client_destroy(lc->io_req.client); + kfree(lc); + return -ENOMEM; + } + + lc->io_req.mem.ptr.vma = lc->disk_header; + lc->clean_bits = (void *)lc->disk_header + + (LOG_OFFSET << SECTOR_SHIFT); + } + + memset(lc->clean_bits, -1, bitset_size); + + lc->sync_bits = vmalloc(bitset_size); + if (!lc->sync_bits) { + DMWARN("couldn't allocate sync bitset"); + if (!dev) + vfree(lc->clean_bits); + else + dm_io_client_destroy(lc->io_req.client); + vfree(lc->disk_header); + kfree(lc); + return -ENOMEM; + } + memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); + lc->sync_count = (sync == NOSYNC) ? region_count : 0; + + lc->recovering_bits = vzalloc(bitset_size); + if (!lc->recovering_bits) { + DMWARN("couldn't allocate sync bitset"); + vfree(lc->sync_bits); + if (!dev) + vfree(lc->clean_bits); + else + dm_io_client_destroy(lc->io_req.client); + vfree(lc->disk_header); + kfree(lc); + return -ENOMEM; + } + lc->sync_search = 0; + log->context = lc; + + return 0; +} + +static int core_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv) +{ + return create_log_context(log, ti, argc, argv, NULL); +} + +static void destroy_log_context(struct log_c *lc) +{ + vfree(lc->sync_bits); + vfree(lc->recovering_bits); + kfree(lc); +} + +static void core_dtr(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + vfree(lc->clean_bits); + destroy_log_context(lc); +} + +/*---------------------------------------------------------------- + * disk log constructor/destructor + * + * argv contains log_device region_size followed optionally by [no]sync + *--------------------------------------------------------------*/ +static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv) +{ + int r; + struct dm_dev *dev; + + if (argc < 2 || argc > 3) { + DMWARN("wrong number of arguments to disk dirty region log"); + return -EINVAL; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); + if (r) + return r; + + r = create_log_context(log, ti, argc - 1, argv + 1, dev); + if (r) { + dm_put_device(ti, dev); + return r; + } + + return 0; +} + +static void disk_dtr(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + dm_put_device(lc->ti, lc->log_dev); + vfree(lc->disk_header); + dm_io_client_destroy(lc->io_req.client); + destroy_log_context(lc); +} + +static int count_bits32(uint32_t *addr, unsigned size) +{ + int count = 0, i; + + for (i = 0; i < size; i++) { + count += hweight32(*(addr+i)); + } + return count; +} + +static void fail_log_device(struct log_c *lc) +{ + if (lc->log_dev_failed) + return; + + lc->log_dev_failed = 1; + dm_table_event(lc->ti->table); +} + +static int disk_resume(struct dm_dirty_log *log) +{ + int r; + unsigned i; + struct log_c *lc = (struct log_c *) log->context; + size_t size = lc->bitset_uint32_count * sizeof(uint32_t); + + 
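+ /*
+ * Re-read the on-disk header, adjust the clean bitmap for any
+ * regions added or removed since the log was last written, rebuild
+ * the sync bitmap and count from it, and write the refreshed
+ * header back out.
+ */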
/* read the disk header */ + r = read_header(lc); + if (r) { + DMWARN("%s: Failed to read header on dirty region log device", + lc->log_dev->name); + fail_log_device(lc); + /* + * If the log device cannot be read, we must assume + * all regions are out-of-sync. If we simply return + * here, the state will be uninitialized and could + * lead us to return 'in-sync' status for regions + * that are actually 'out-of-sync'. + */ + lc->header.nr_regions = 0; + } + + /* set or clear any new bits -- device has grown */ + if (lc->sync == NOSYNC) + for (i = lc->header.nr_regions; i < lc->region_count; i++) + /* FIXME: amazingly inefficient */ + log_set_bit(lc, lc->clean_bits, i); + else + for (i = lc->header.nr_regions; i < lc->region_count; i++) + /* FIXME: amazingly inefficient */ + log_clear_bit(lc, lc->clean_bits, i); + + /* clear any old bits -- device has shrunk */ + for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++) + log_clear_bit(lc, lc->clean_bits, i); + + /* copy clean across to sync */ + memcpy(lc->sync_bits, lc->clean_bits, size); + lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); + lc->sync_search = 0; + + /* set the correct number of regions in the header */ + lc->header.nr_regions = lc->region_count; + + header_to_disk(&lc->header, lc->disk_header); + + /* write the new header */ + r = rw_header(lc, WRITE); + if (!r) { + r = flush_header(lc); + if (r) + lc->log_dev_flush_failed = 1; + } + if (r) { + DMWARN("%s: Failed to write header on dirty region log device", + lc->log_dev->name); + fail_log_device(lc); + } + + return r; +} + +static uint32_t core_get_region_size(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + return lc->region_size; +} + +static int core_resume(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + lc->sync_search = 0; + return 0; +} + +static int core_is_clean(struct dm_dirty_log *log, region_t region) +{ + struct log_c *lc = (struct log_c *) log->context; + return log_test_bit(lc->clean_bits, region); +} + +static int core_in_sync(struct dm_dirty_log *log, region_t region, int block) +{ + struct log_c *lc = (struct log_c *) log->context; + return log_test_bit(lc->sync_bits, region); +} + +static int core_flush(struct dm_dirty_log *log) +{ + /* no op */ + return 0; +} + +static int disk_flush(struct dm_dirty_log *log) +{ + int r, i; + struct log_c *lc = log->context; + + /* only write if the log has changed */ + if (!lc->touched_cleaned && !lc->touched_dirtied) + return 0; + + if (lc->touched_cleaned && log->flush_callback_fn && + log->flush_callback_fn(lc->ti)) { + /* + * At this point it is impossible to determine which + * regions are clean and which are dirty (without + * re-reading the log off disk). So mark all of them + * dirty. 
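+ * They will then be resynchronised the next time the log is resumed.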
+ */ + lc->flush_failed = 1; + for (i = 0; i < lc->region_count; i++) + log_clear_bit(lc, lc->clean_bits, i); + } + + r = rw_header(lc, WRITE); + if (r) + fail_log_device(lc); + else { + if (lc->touched_dirtied) { + r = flush_header(lc); + if (r) { + lc->log_dev_flush_failed = 1; + fail_log_device(lc); + } else + lc->touched_dirtied = 0; + } + lc->touched_cleaned = 0; + } + + return r; +} + +static void core_mark_region(struct dm_dirty_log *log, region_t region) +{ + struct log_c *lc = (struct log_c *) log->context; + log_clear_bit(lc, lc->clean_bits, region); +} + +static void core_clear_region(struct dm_dirty_log *log, region_t region) +{ + struct log_c *lc = (struct log_c *) log->context; + if (likely(!lc->flush_failed)) + log_set_bit(lc, lc->clean_bits, region); +} + +static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) +{ + struct log_c *lc = (struct log_c *) log->context; + + if (lc->sync_search >= lc->region_count) + return 0; + + do { + *region = find_next_zero_bit_le(lc->sync_bits, + lc->region_count, + lc->sync_search); + lc->sync_search = *region + 1; + + if (*region >= lc->region_count) + return 0; + + } while (log_test_bit(lc->recovering_bits, *region)); + + log_set_bit(lc, lc->recovering_bits, *region); + return 1; +} + +static void core_set_region_sync(struct dm_dirty_log *log, region_t region, + int in_sync) +{ + struct log_c *lc = (struct log_c *) log->context; + + log_clear_bit(lc, lc->recovering_bits, region); + if (in_sync) { + log_set_bit(lc, lc->sync_bits, region); + lc->sync_count++; + } else if (log_test_bit(lc->sync_bits, region)) { + lc->sync_count--; + log_clear_bit(lc, lc->sync_bits, region); + } +} + +static region_t core_get_sync_count(struct dm_dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + return lc->sync_count; +} + +#define DMEMIT_SYNC \ + if (lc->sync != DEFAULTSYNC) \ + DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") + +static int core_status(struct dm_dirty_log *log, status_type_t status, + char *result, unsigned int maxlen) +{ + int sz = 0; + struct log_c *lc = log->context; + + switch(status) { + case STATUSTYPE_INFO: + DMEMIT("1 %s", log->type->name); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %u %u ", log->type->name, + lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); + DMEMIT_SYNC; + } + + return sz; +} + +static int disk_status(struct dm_dirty_log *log, status_type_t status, + char *result, unsigned int maxlen) +{ + int sz = 0; + struct log_c *lc = log->context; + + switch(status) { + case STATUSTYPE_INFO: + DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, + lc->log_dev_flush_failed ? 'F' : + lc->log_dev_failed ? 'D' : + 'A'); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %u %s %u ", log->type->name, + lc->sync == DEFAULTSYNC ? 
2 : 3, lc->log_dev->name, + lc->region_size); + DMEMIT_SYNC; + } + + return sz; +} + +static struct dm_dirty_log_type _core_type = { + .name = "core", + .module = THIS_MODULE, + .ctr = core_ctr, + .dtr = core_dtr, + .resume = core_resume, + .get_region_size = core_get_region_size, + .is_clean = core_is_clean, + .in_sync = core_in_sync, + .flush = core_flush, + .mark_region = core_mark_region, + .clear_region = core_clear_region, + .get_resync_work = core_get_resync_work, + .set_region_sync = core_set_region_sync, + .get_sync_count = core_get_sync_count, + .status = core_status, +}; + +static struct dm_dirty_log_type _disk_type = { + .name = "disk", + .module = THIS_MODULE, + .ctr = disk_ctr, + .dtr = disk_dtr, + .postsuspend = disk_flush, + .resume = disk_resume, + .get_region_size = core_get_region_size, + .is_clean = core_is_clean, + .in_sync = core_in_sync, + .flush = disk_flush, + .mark_region = core_mark_region, + .clear_region = core_clear_region, + .get_resync_work = core_get_resync_work, + .set_region_sync = core_set_region_sync, + .get_sync_count = core_get_sync_count, + .status = disk_status, +}; + +static int __init dm_dirty_log_init(void) +{ + int r; + + r = dm_dirty_log_type_register(&_core_type); + if (r) + DMWARN("couldn't register core log"); + + r = dm_dirty_log_type_register(&_disk_type); + if (r) { + DMWARN("couldn't register disk type"); + dm_dirty_log_type_unregister(&_core_type); + } + + return r; +} + +static void __exit dm_dirty_log_exit(void) +{ + dm_dirty_log_type_unregister(&_disk_type); + dm_dirty_log_type_unregister(&_core_type); +} + +module_init(dm_dirty_log_init); +module_exit(dm_dirty_log_exit); + +MODULE_DESCRIPTION(DM_NAME " dirty region log"); +MODULE_AUTHOR("Joe Thornber, Heinz Mauelshagen <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c new file mode 100644 index 00000000..754f38f8 --- /dev/null +++ b/drivers/md/dm-mpath.c @@ -0,0 +1,1723 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include <linux/device-mapper.h> + +#include "dm-path-selector.h" +#include "dm-uevent.h" + +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/workqueue.h> +#include <scsi/scsi_dh.h> +#include <linux/atomic.h> + +#define DM_MSG_PREFIX "multipath" +#define DM_PG_INIT_DELAY_MSECS 2000 +#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) + +/* Path properties */ +struct pgpath { + struct list_head list; + + struct priority_group *pg; /* Owning PG */ + unsigned is_active; /* Path status */ + unsigned fail_count; /* Cumulative failure count */ + + struct dm_path path; + struct delayed_work activate_path; +}; + +#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) + +/* + * Paths are grouped into Priority Groups and numbered from 1 upwards. + * Each has a path selector which controls which path gets used. + */ +struct priority_group { + struct list_head list; + + struct multipath *m; /* Owning multipath instance */ + struct path_selector ps; + + unsigned pg_num; /* Reference number */ + unsigned bypassed; /* Temporarily bypass this PG? 
*/ + + unsigned nr_pgpaths; /* Number of paths in PG */ + struct list_head pgpaths; +}; + +/* Multipath context */ +struct multipath { + struct list_head list; + struct dm_target *ti; + + spinlock_t lock; + + const char *hw_handler_name; + char *hw_handler_params; + + unsigned nr_priority_groups; + struct list_head priority_groups; + + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + + unsigned pg_init_required; /* pg_init needs calling? */ + unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ + unsigned pg_init_delay_retry; /* Delay pg_init retry? */ + + unsigned nr_valid_paths; /* Total number of usable paths */ + struct pgpath *current_pgpath; + struct priority_group *current_pg; + struct priority_group *next_pg; /* Switch to this PG if set */ + unsigned repeat_count; /* I/Os left before calling PS again */ + + unsigned queue_io; /* Must we queue all I/O? */ + unsigned queue_if_no_path; /* Queue I/O if last path fails? */ + unsigned saved_queue_if_no_path;/* Saved state during suspension */ + unsigned pg_init_retries; /* Number of times to retry pg_init */ + unsigned pg_init_count; /* Number of times pg_init called */ + unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ + + struct work_struct process_queued_ios; + struct list_head queued_ios; + unsigned queue_size; + + struct work_struct trigger_event; + + /* + * We must use a mempool of dm_mpath_io structs so that we + * can resubmit bios on error. + */ + mempool_t *mpio_pool; + + struct mutex work_mutex; +}; + +/* + * Context information attached to each bio we process. + */ +struct dm_mpath_io { + struct pgpath *pgpath; + size_t nr_bytes; +}; + +typedef int (*action_fn) (struct pgpath *pgpath); + +#define MIN_IOS 256 /* Mempool size */ + +static struct kmem_cache *_mpio_cache; + +static struct workqueue_struct *kmultipathd, *kmpath_handlerd; +static void process_queued_ios(struct work_struct *work); +static void trigger_event(struct work_struct *work); +static void activate_path(struct work_struct *work); + + +/*----------------------------------------------- + * Allocation routines + *-----------------------------------------------*/ + +static struct pgpath *alloc_pgpath(void) +{ + struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); + + if (pgpath) { + pgpath->is_active = 1; + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); + } + + return pgpath; +} + +static void free_pgpath(struct pgpath *pgpath) +{ + kfree(pgpath); +} + +static struct priority_group *alloc_priority_group(void) +{ + struct priority_group *pg; + + pg = kzalloc(sizeof(*pg), GFP_KERNEL); + + if (pg) + INIT_LIST_HEAD(&pg->pgpaths); + + return pg; +} + +static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) +{ + struct pgpath *pgpath, *tmp; + struct multipath *m = ti->private; + + list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { + list_del(&pgpath->list); + if (m->hw_handler_name) + scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); + dm_put_device(ti, pgpath->path.dev); + free_pgpath(pgpath); + } +} + +static void free_priority_group(struct priority_group *pg, + struct dm_target *ti) +{ + struct path_selector *ps = &pg->ps; + + if (ps->type) { + ps->type->destroy(ps); + dm_put_path_selector(ps->type); + } + + free_pgpaths(&pg->pgpaths, ti); + kfree(pg); +} + +static struct multipath *alloc_multipath(struct dm_target *ti) +{ + struct multipath *m; + + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (m) { + INIT_LIST_HEAD(&m->priority_groups); + 
INIT_LIST_HEAD(&m->queued_ios); + spin_lock_init(&m->lock); + m->queue_io = 1; + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; + INIT_WORK(&m->process_queued_ios, process_queued_ios); + INIT_WORK(&m->trigger_event, trigger_event); + init_waitqueue_head(&m->pg_init_wait); + mutex_init(&m->work_mutex); + m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); + if (!m->mpio_pool) { + kfree(m); + return NULL; + } + m->ti = ti; + ti->private = m; + } + + return m; +} + +static void free_multipath(struct multipath *m) +{ + struct priority_group *pg, *tmp; + + list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { + list_del(&pg->list); + free_priority_group(pg, m->ti); + } + + kfree(m->hw_handler_name); + kfree(m->hw_handler_params); + mempool_destroy(m->mpio_pool); + kfree(m); +} + +static int set_mapinfo(struct multipath *m, union map_info *info) +{ + struct dm_mpath_io *mpio; + + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); + if (!mpio) + return -ENOMEM; + + memset(mpio, 0, sizeof(*mpio)); + info->ptr = mpio; + + return 0; +} + +static void clear_mapinfo(struct multipath *m, union map_info *info) +{ + struct dm_mpath_io *mpio = info->ptr; + + info->ptr = NULL; + mempool_free(mpio, m->mpio_pool); +} + +/*----------------------------------------------- + * Path selection + *-----------------------------------------------*/ + +static void __pg_init_all_paths(struct multipath *m) +{ + struct pgpath *pgpath; + unsigned long pg_init_delay = 0; + + m->pg_init_count++; + m->pg_init_required = 0; + if (m->pg_init_delay_retry) + pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? + m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); + list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { + /* Skip failed paths */ + if (!pgpath->is_active) + continue; + if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, + pg_init_delay)) + m->pg_init_in_progress++; + } +} + +static void __switch_pg(struct multipath *m, struct pgpath *pgpath) +{ + m->current_pg = pgpath->pg; + + /* Must we initialise the PG first, and queue I/O till it's ready? */ + if (m->hw_handler_name) { + m->pg_init_required = 1; + m->queue_io = 1; + } else { + m->pg_init_required = 0; + m->queue_io = 0; + } + + m->pg_init_count = 0; +} + +static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, + size_t nr_bytes) +{ + struct dm_path *path; + + path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); + if (!path) + return -ENXIO; + + m->current_pgpath = path_to_pgpath(path); + + if (m->current_pg != pg) + __switch_pg(m, m->current_pgpath); + + return 0; +} + +static void __choose_pgpath(struct multipath *m, size_t nr_bytes) +{ + struct priority_group *pg; + unsigned bypassed = 1; + + if (!m->nr_valid_paths) + goto failed; + + /* Were we instructed to switch PG? */ + if (m->next_pg) { + pg = m->next_pg; + m->next_pg = NULL; + if (!__choose_path_in_pg(m, pg, nr_bytes)) + return; + } + + /* Don't change PG until it has no remaining paths */ + if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) + return; + + /* + * Loop through priority groups until we find a valid path. + * First time we skip PGs marked 'bypassed'. + * Second time we only try the ones we skipped. 
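+ * ('bypassed' counts down from 1, so groups with pg->bypassed set
+ * are passed over on the first iteration and tried on the second.)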
+ */ + do { + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed == bypassed) + continue; + if (!__choose_path_in_pg(m, pg, nr_bytes)) + return; + } + } while (bypassed--); + +failed: + m->current_pgpath = NULL; + m->current_pg = NULL; +} + +/* + * Check whether bios must be queued in the device-mapper core rather + * than here in the target. + * + * m->lock must be held on entry. + * + * If m->queue_if_no_path and m->saved_queue_if_no_path hold the + * same value then we are not between multipath_presuspend() + * and multipath_resume() calls and we have no need to check + * for the DMF_NOFLUSH_SUSPENDING flag. + */ +static int __must_push_back(struct multipath *m) +{ + return (m->queue_if_no_path != m->saved_queue_if_no_path && + dm_noflush_suspending(m->ti)); +} + +static int map_io(struct multipath *m, struct request *clone, + union map_info *map_context, unsigned was_queued) +{ + int r = DM_MAPIO_REMAPPED; + size_t nr_bytes = blk_rq_bytes(clone); + unsigned long flags; + struct pgpath *pgpath; + struct block_device *bdev; + struct dm_mpath_io *mpio = map_context->ptr; + + spin_lock_irqsave(&m->lock, flags); + + /* Do we need to select a new pgpath? */ + if (!m->current_pgpath || + (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) + __choose_pgpath(m, nr_bytes); + + pgpath = m->current_pgpath; + + if (was_queued) + m->queue_size--; + + if ((pgpath && m->queue_io) || + (!pgpath && m->queue_if_no_path)) { + /* Queue for the daemon to resubmit */ + list_add_tail(&clone->queuelist, &m->queued_ios); + m->queue_size++; + if ((m->pg_init_required && !m->pg_init_in_progress) || + !m->queue_io) + queue_work(kmultipathd, &m->process_queued_ios); + pgpath = NULL; + r = DM_MAPIO_SUBMITTED; + } else if (pgpath) { + bdev = pgpath->path.dev->bdev; + clone->q = bdev_get_queue(bdev); + clone->rq_disk = bdev->bd_disk; + } else if (__must_push_back(m)) + r = DM_MAPIO_REQUEUE; + else + r = -EIO; /* Failed */ + + mpio->pgpath = pgpath; + mpio->nr_bytes = nr_bytes; + + if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, + nr_bytes); + + spin_unlock_irqrestore(&m->lock, flags); + + return r; +} + +/* + * If we run out of usable paths, should we queue I/O or error it? + */ +static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, + unsigned save_old_value) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + if (save_old_value) + m->saved_queue_if_no_path = m->queue_if_no_path; + else + m->saved_queue_if_no_path = queue_if_no_path; + m->queue_if_no_path = queue_if_no_path; + if (!m->queue_if_no_path && m->queue_size) + queue_work(kmultipathd, &m->process_queued_ios); + + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +/*----------------------------------------------------------------- + * The multipath daemon is responsible for resubmitting queued ios. 
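+ * process_queued_ios() runs off kmultipathd; once a usable path is
+ * available (or queueing is no longer required) it hands the held
+ * requests back out via dispatch_queued_ios().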
+ *---------------------------------------------------------------*/ + +static void dispatch_queued_ios(struct multipath *m) +{ + int r; + unsigned long flags; + union map_info *info; + struct request *clone, *n; + LIST_HEAD(cl); + + spin_lock_irqsave(&m->lock, flags); + list_splice_init(&m->queued_ios, &cl); + spin_unlock_irqrestore(&m->lock, flags); + + list_for_each_entry_safe(clone, n, &cl, queuelist) { + list_del_init(&clone->queuelist); + + info = dm_get_rq_mapinfo(clone); + + r = map_io(m, clone, info, 1); + if (r < 0) { + clear_mapinfo(m, info); + dm_kill_unmapped_request(clone, r); + } else if (r == DM_MAPIO_REMAPPED) + dm_dispatch_request(clone); + else if (r == DM_MAPIO_REQUEUE) { + clear_mapinfo(m, info); + dm_requeue_unmapped_request(clone); + } + } +} + +static void process_queued_ios(struct work_struct *work) +{ + struct multipath *m = + container_of(work, struct multipath, process_queued_ios); + struct pgpath *pgpath = NULL; + unsigned must_queue = 1; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + if (!m->queue_size) + goto out; + + if (!m->current_pgpath) + __choose_pgpath(m, 0); + + pgpath = m->current_pgpath; + + if ((pgpath && !m->queue_io) || + (!pgpath && !m->queue_if_no_path)) + must_queue = 0; + + if (m->pg_init_required && !m->pg_init_in_progress && pgpath) + __pg_init_all_paths(m); + +out: + spin_unlock_irqrestore(&m->lock, flags); + if (!must_queue) + dispatch_queued_ios(m); +} + +/* + * An event is triggered whenever a path is taken out of use. + * Includes path failure and PG bypass. + */ +static void trigger_event(struct work_struct *work) +{ + struct multipath *m = + container_of(work, struct multipath, trigger_event); + + dm_table_event(m->ti->table); +} + +/*----------------------------------------------------------------- + * Constructor/argument parsing: + * <#multipath feature args> [<arg>]* + * <#hw_handler args> [hw_handler [<arg>]*] + * <#priority groups> + * <initial priority group> + * [<selector> <#selector args> [<arg>]* + * <#paths> <#per-path selector args> + * [<path> [<arg>]* ]+ ]+ + *---------------------------------------------------------------*/ +static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, + struct dm_target *ti) +{ + int r; + struct path_selector_type *pst; + unsigned ps_argc; + + static struct dm_arg _args[] = { + {0, 1024, "invalid number of path selector args"}, + }; + + pst = dm_get_path_selector(dm_shift_arg(as)); + if (!pst) { + ti->error = "unknown path selector type"; + return -EINVAL; + } + + r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); + if (r) { + dm_put_path_selector(pst); + return -EINVAL; + } + + r = pst->create(&pg->ps, ps_argc, as->argv); + if (r) { + dm_put_path_selector(pst); + ti->error = "path selector constructor failed"; + return r; + } + + pg->ps.type = pst; + dm_consume_args(as, ps_argc); + + return 0; +} + +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, + struct dm_target *ti) +{ + int r; + struct pgpath *p; + struct multipath *m = ti->private; + + /* we need at least a path arg */ + if (as->argc < 1) { + ti->error = "no device given"; + return ERR_PTR(-EINVAL); + } + + p = alloc_pgpath(); + if (!p) + return ERR_PTR(-ENOMEM); + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &p->path.dev); + if (r) { + ti->error = "error getting device"; + goto bad; + } + + if (m->hw_handler_name) { + struct request_queue *q = bdev_get_queue(p->path.dev->bdev); + + r = scsi_dh_attach(q, 
m->hw_handler_name); + if (r == -EBUSY) { + /* + * Already attached to different hw_handler, + * try to reattach with correct one. + */ + scsi_dh_detach(q); + r = scsi_dh_attach(q, m->hw_handler_name); + } + + if (r < 0) { + ti->error = "error attaching hardware handler"; + dm_put_device(ti, p->path.dev); + goto bad; + } + + if (m->hw_handler_params) { + r = scsi_dh_set_params(q, m->hw_handler_params); + if (r < 0) { + ti->error = "unable to set hardware " + "handler parameters"; + scsi_dh_detach(q); + dm_put_device(ti, p->path.dev); + goto bad; + } + } + } + + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); + if (r) { + dm_put_device(ti, p->path.dev); + goto bad; + } + + return p; + + bad: + free_pgpath(p); + return ERR_PTR(r); +} + +static struct priority_group *parse_priority_group(struct dm_arg_set *as, + struct multipath *m) +{ + static struct dm_arg _args[] = { + {1, 1024, "invalid number of paths"}, + {0, 1024, "invalid number of selector args"} + }; + + int r; + unsigned i, nr_selector_args, nr_args; + struct priority_group *pg; + struct dm_target *ti = m->ti; + + if (as->argc < 2) { + as->argc = 0; + ti->error = "not enough priority group arguments"; + return ERR_PTR(-EINVAL); + } + + pg = alloc_priority_group(); + if (!pg) { + ti->error = "couldn't allocate priority group"; + return ERR_PTR(-ENOMEM); + } + pg->m = m; + + r = parse_path_selector(as, pg, ti); + if (r) + goto bad; + + /* + * read the paths + */ + r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); + if (r) + goto bad; + + r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); + if (r) + goto bad; + + nr_args = 1 + nr_selector_args; + for (i = 0; i < pg->nr_pgpaths; i++) { + struct pgpath *pgpath; + struct dm_arg_set path_args; + + if (as->argc < nr_args) { + ti->error = "not enough path parameters"; + r = -EINVAL; + goto bad; + } + + path_args.argc = nr_args; + path_args.argv = as->argv; + + pgpath = parse_path(&path_args, &pg->ps, ti); + if (IS_ERR(pgpath)) { + r = PTR_ERR(pgpath); + goto bad; + } + + pgpath->pg = pg; + list_add_tail(&pgpath->list, &pg->pgpaths); + dm_consume_args(as, nr_args); + } + + return pg; + + bad: + free_priority_group(pg, ti); + return ERR_PTR(r); +} + +static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) +{ + unsigned hw_argc; + int ret; + struct dm_target *ti = m->ti; + + static struct dm_arg _args[] = { + {0, 1024, "invalid number of hardware handler args"}, + }; + + if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) + return -EINVAL; + + if (!hw_argc) + return 0; + + m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); + if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name), + "scsi_dh_%s", m->hw_handler_name)) { + ti->error = "unknown hardware handler type"; + ret = -EINVAL; + goto fail; + } + + if (hw_argc > 1) { + char *p; + int i, j, len = 4; + + for (i = 0; i <= hw_argc - 2; i++) + len += strlen(as->argv[i]) + 1; + p = m->hw_handler_params = kzalloc(len, GFP_KERNEL); + if (!p) { + ti->error = "memory allocation failed"; + ret = -ENOMEM; + goto fail; + } + j = sprintf(p, "%d", hw_argc - 1); + for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) + j = sprintf(p, "%s", as->argv[i]); + } + dm_consume_args(as, hw_argc - 1); + + return 0; +fail: + kfree(m->hw_handler_name); + m->hw_handler_name = NULL; + return ret; +} + +static int parse_features(struct dm_arg_set *as, struct multipath *m) +{ + int r; + unsigned argc; + struct dm_target *ti = m->ti; + const char *arg_name; + + static struct dm_arg _args[] 
= { + {0, 5, "invalid number of feature args"}, + {1, 50, "pg_init_retries must be between 1 and 50"}, + {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, + }; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + do { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "queue_if_no_path")) { + r = queue_if_no_path(m, 1, 0); + continue; + } + + if (!strcasecmp(arg_name, "pg_init_retries") && + (argc >= 1)) { + r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); + argc--; + continue; + } + + if (!strcasecmp(arg_name, "pg_init_delay_msecs") && + (argc >= 1)) { + r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); + argc--; + continue; + } + + ti->error = "Unrecognised multipath feature request"; + r = -EINVAL; + } while (argc && !r); + + return r; +} + +static int multipath_ctr(struct dm_target *ti, unsigned int argc, + char **argv) +{ + /* target arguments */ + static struct dm_arg _args[] = { + {0, 1024, "invalid number of priority groups"}, + {0, 1024, "invalid initial priority group number"}, + }; + + int r; + struct multipath *m; + struct dm_arg_set as; + unsigned pg_count = 0; + unsigned next_pg_num; + + as.argc = argc; + as.argv = argv; + + m = alloc_multipath(ti); + if (!m) { + ti->error = "can't allocate multipath"; + return -EINVAL; + } + + r = parse_features(&as, m); + if (r) + goto bad; + + r = parse_hw_handler(&as, m); + if (r) + goto bad; + + r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); + if (r) + goto bad; + + r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); + if (r) + goto bad; + + if ((!m->nr_priority_groups && next_pg_num) || + (m->nr_priority_groups && !next_pg_num)) { + ti->error = "invalid initial priority group"; + r = -EINVAL; + goto bad; + } + + /* parse the priority groups */ + while (as.argc) { + struct priority_group *pg; + + pg = parse_priority_group(&as, m); + if (IS_ERR(pg)) { + r = PTR_ERR(pg); + goto bad; + } + + m->nr_valid_paths += pg->nr_pgpaths; + list_add_tail(&pg->list, &m->priority_groups); + pg_count++; + pg->pg_num = pg_count; + if (!--next_pg_num) + m->next_pg = pg; + } + + if (pg_count != m->nr_priority_groups) { + ti->error = "priority group count mismatch"; + r = -EINVAL; + goto bad; + } + + ti->num_flush_requests = 1; + ti->num_discard_requests = 1; + + return 0; + + bad: + free_multipath(m); + return r; +} + +static void multipath_wait_for_pg_init_completion(struct multipath *m) +{ + DECLARE_WAITQUEUE(wait, current); + unsigned long flags; + + add_wait_queue(&m->pg_init_wait, &wait); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock_irqsave(&m->lock, flags); + if (!m->pg_init_in_progress) { + spin_unlock_irqrestore(&m->lock, flags); + break; + } + spin_unlock_irqrestore(&m->lock, flags); + + io_schedule(); + } + set_current_state(TASK_RUNNING); + + remove_wait_queue(&m->pg_init_wait, &wait); +} + +static void flush_multipath_work(struct multipath *m) +{ + flush_workqueue(kmpath_handlerd); + multipath_wait_for_pg_init_completion(m); + flush_workqueue(kmultipathd); + flush_work_sync(&m->trigger_event); +} + +static void multipath_dtr(struct dm_target *ti) +{ + struct multipath *m = ti->private; + + flush_multipath_work(m); + free_multipath(m); +} + +/* + * Map cloned requests + */ +static int multipath_map(struct dm_target *ti, struct request *clone, + union map_info *map_context) +{ + int r; + struct multipath *m = (struct multipath *) ti->private; + + if (set_mapinfo(m, 
map_context) < 0) + /* ENOMEM, requeue */ + return DM_MAPIO_REQUEUE; + + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + r = map_io(m, clone, map_context, 0); + if (r < 0 || r == DM_MAPIO_REQUEUE) + clear_mapinfo(m, map_context); + + return r; +} + +/* + * Take a path out of use. + */ +static int fail_path(struct pgpath *pgpath) +{ + unsigned long flags; + struct multipath *m = pgpath->pg->m; + + spin_lock_irqsave(&m->lock, flags); + + if (!pgpath->is_active) + goto out; + + DMWARN("Failing path %s.", pgpath->path.dev->name); + + pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); + pgpath->is_active = 0; + pgpath->fail_count++; + + m->nr_valid_paths--; + + if (pgpath == m->current_pgpath) + m->current_pgpath = NULL; + + dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti, + pgpath->path.dev->name, m->nr_valid_paths); + + schedule_work(&m->trigger_event); + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +/* + * Reinstate a previously-failed path + */ +static int reinstate_path(struct pgpath *pgpath) +{ + int r = 0; + unsigned long flags; + struct multipath *m = pgpath->pg->m; + + spin_lock_irqsave(&m->lock, flags); + + if (pgpath->is_active) + goto out; + + if (!pgpath->pg->ps.type->reinstate_path) { + DMWARN("Reinstate path not supported by path selector %s", + pgpath->pg->ps.type->name); + r = -EINVAL; + goto out; + } + + r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); + if (r) + goto out; + + pgpath->is_active = 1; + + if (!m->nr_valid_paths++ && m->queue_size) { + m->current_pgpath = NULL; + queue_work(kmultipathd, &m->process_queued_ios); + } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { + if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) + m->pg_init_in_progress++; + } + + dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, + pgpath->path.dev->name, m->nr_valid_paths); + + schedule_work(&m->trigger_event); + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return r; +} + +/* + * Fail or reinstate all paths that match the provided struct dm_dev. 
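+ *
+ * These actions are normally driven from multipath_message() below; for
+ * example (illustrative map name and device numbers only), something like
+ *	dmsetup message mpath0 0 fail_path 8:32
+ * resolves "8:32" via dm_get_device() and then runs action_dev() with
+ * fail_path as the action; "reinstate_path" is handled the same way.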
+ */ +static int action_dev(struct multipath *m, struct dm_dev *dev, + action_fn action) +{ + int r = -EINVAL; + struct pgpath *pgpath; + struct priority_group *pg; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pgpath->path.dev == dev) + r = action(pgpath); + } + } + + return r; +} + +/* + * Temporarily try to avoid having to use the specified PG + */ +static void bypass_pg(struct multipath *m, struct priority_group *pg, + int bypassed) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + pg->bypassed = bypassed; + m->current_pgpath = NULL; + m->current_pg = NULL; + + spin_unlock_irqrestore(&m->lock, flags); + + schedule_work(&m->trigger_event); +} + +/* + * Switch to using the specified PG from the next I/O that gets mapped + */ +static int switch_pg_num(struct multipath *m, const char *pgstr) +{ + struct priority_group *pg; + unsigned pgnum; + unsigned long flags; + char dummy; + + if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || + (pgnum > m->nr_priority_groups)) { + DMWARN("invalid PG number supplied to switch_pg_num"); + return -EINVAL; + } + + spin_lock_irqsave(&m->lock, flags); + list_for_each_entry(pg, &m->priority_groups, list) { + pg->bypassed = 0; + if (--pgnum) + continue; + + m->current_pgpath = NULL; + m->current_pg = NULL; + m->next_pg = pg; + } + spin_unlock_irqrestore(&m->lock, flags); + + schedule_work(&m->trigger_event); + return 0; +} + +/* + * Set/clear bypassed status of a PG. + * PGs are numbered upwards from 1 in the order they were declared. + */ +static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) +{ + struct priority_group *pg; + unsigned pgnum; + char dummy; + + if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || + (pgnum > m->nr_priority_groups)) { + DMWARN("invalid PG number supplied to bypass_pg"); + return -EINVAL; + } + + list_for_each_entry(pg, &m->priority_groups, list) { + if (!--pgnum) + break; + } + + bypass_pg(m, pg, bypassed); + return 0; +} + +/* + * Should we retry pg_init immediately? + */ +static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) +{ + unsigned long flags; + int limit_reached = 0; + + spin_lock_irqsave(&m->lock, flags); + + if (m->pg_init_count <= m->pg_init_retries) + m->pg_init_required = 1; + else + limit_reached = 1; + + spin_unlock_irqrestore(&m->lock, flags); + + return limit_reached; +} + +static void pg_init_done(void *data, int errors) +{ + struct pgpath *pgpath = data; + struct priority_group *pg = pgpath->pg; + struct multipath *m = pg->m; + unsigned long flags; + unsigned delay_retry = 0; + + /* device or driver problems */ + switch (errors) { + case SCSI_DH_OK: + break; + case SCSI_DH_NOSYS: + if (!m->hw_handler_name) { + errors = 0; + break; + } + DMERR("Could not failover the device: Handler scsi_dh_%s " + "Error %d.", m->hw_handler_name, errors); + /* + * Fail path for now, so we do not ping pong + */ + fail_path(pgpath); + break; + case SCSI_DH_DEV_TEMP_BUSY: + /* + * Probably doing something like FW upgrade on the + * controller so try the other pg. + */ + bypass_pg(m, pg, 1); + break; + case SCSI_DH_RETRY: + /* Wait before retrying. */ + delay_retry = 1; + case SCSI_DH_IMM_RETRY: + case SCSI_DH_RES_TEMP_UNAVAIL: + if (pg_init_limit_reached(m, pgpath)) + fail_path(pgpath); + errors = 0; + break; + default: + /* + * We probably do not want to fail the path for a device + * error, but this is what the old dm did. 
In future + * patches we can do more advanced handling. + */ + fail_path(pgpath); + } + + spin_lock_irqsave(&m->lock, flags); + if (errors) { + if (pgpath == m->current_pgpath) { + DMERR("Could not failover device. Error %d.", errors); + m->current_pgpath = NULL; + m->current_pg = NULL; + } + } else if (!m->pg_init_required) + pg->bypassed = 0; + + if (--m->pg_init_in_progress) + /* Activations of other paths are still on going */ + goto out; + + if (!m->pg_init_required) + m->queue_io = 0; + + m->pg_init_delay_retry = delay_retry; + queue_work(kmultipathd, &m->process_queued_ios); + + /* + * Wake up any thread waiting to suspend. + */ + wake_up(&m->pg_init_wait); + +out: + spin_unlock_irqrestore(&m->lock, flags); +} + +static void activate_path(struct work_struct *work) +{ + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path.work); + + scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), + pg_init_done, pgpath); +} + +/* + * end_io handling + */ +static int do_end_io(struct multipath *m, struct request *clone, + int error, struct dm_mpath_io *mpio) +{ + /* + * We don't queue any clone request inside the multipath target + * during end I/O handling, since those clone requests don't have + * bio clones. If we queue them inside the multipath target, + * we need to make bio clones, that requires memory allocation. + * (See drivers/md/dm.c:end_clone_bio() about why the clone requests + * don't have bio clones.) + * Instead of queueing the clone request here, we queue the original + * request into dm core, which will remake a clone request and + * clone bios for it and resubmit it later. + */ + int r = DM_ENDIO_REQUEUE; + unsigned long flags; + + if (!error && !clone->errors) + return 0; /* I/O complete */ + + if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ) + return error; + + if (mpio->pgpath) + fail_path(mpio->pgpath); + + spin_lock_irqsave(&m->lock, flags); + if (!m->nr_valid_paths) { + if (!m->queue_if_no_path) { + if (!__must_push_back(m)) + r = -EIO; + } else { + if (error == -EBADE) + r = error; + } + } + spin_unlock_irqrestore(&m->lock, flags); + + return r; +} + +static int multipath_end_io(struct dm_target *ti, struct request *clone, + int error, union map_info *map_context) +{ + struct multipath *m = ti->private; + struct dm_mpath_io *mpio = map_context->ptr; + struct pgpath *pgpath = mpio->pgpath; + struct path_selector *ps; + int r; + + BUG_ON(!mpio); + + r = do_end_io(m, clone, error, mpio); + if (pgpath) { + ps = &pgpath->pg->ps; + if (ps->type->end_io) + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); + } + clear_mapinfo(m, map_context); + + return r; +} + +/* + * Suspend can't complete until all the I/O is processed so if + * the last path fails we must error any remaining I/O. + * Note that if the freeze_bdev fails while suspending, the + * queue_if_no_path state is lost - userspace should reset it. + */ +static void multipath_presuspend(struct dm_target *ti) +{ + struct multipath *m = (struct multipath *) ti->private; + + queue_if_no_path(m, 0, 1); +} + +static void multipath_postsuspend(struct dm_target *ti) +{ + struct multipath *m = ti->private; + + mutex_lock(&m->work_mutex); + flush_multipath_work(m); + mutex_unlock(&m->work_mutex); +} + +/* + * Restore the queue_if_no_path setting. 
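+ *
+ * The saved value is presumably the one stashed when multipath_presuspend()
+ * calls queue_if_no_path(m, 0, 1) with its save flag set.  If that state is
+ * lost (see the presuspend comment above), userspace can turn queueing back
+ * on explicitly, e.g. with a "queue_if_no_path" target message.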
+ */ +static void multipath_resume(struct dm_target *ti) +{ + struct multipath *m = (struct multipath *) ti->private; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + m->queue_if_no_path = m->saved_queue_if_no_path; + spin_unlock_irqrestore(&m->lock, flags); +} + +/* + * Info output has the following format: + * num_multipath_feature_args [multipath_feature_args]* + * num_handler_status_args [handler_status_args]* + * num_groups init_group_number + * [A|D|E num_ps_status_args [ps_status_args]* + * num_paths num_selector_args + * [path_dev A|F fail_count [selector_args]* ]+ ]+ + * + * Table output has the following format (identical to the constructor string): + * num_feature_args [features_args]* + * num_handler_args hw_handler [hw_handler_args]* + * num_groups init_group_number + * [priority selector-name num_ps_args [ps_args]* + * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ + */ +static int multipath_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + int sz = 0; + unsigned long flags; + struct multipath *m = (struct multipath *) ti->private; + struct priority_group *pg; + struct pgpath *p; + unsigned pg_num; + char state; + + spin_lock_irqsave(&m->lock, flags); + + /* Features */ + if (type == STATUSTYPE_INFO) + DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); + else { + DMEMIT("%u ", m->queue_if_no_path + + (m->pg_init_retries > 0) * 2 + + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); + if (m->queue_if_no_path) + DMEMIT("queue_if_no_path "); + if (m->pg_init_retries) + DMEMIT("pg_init_retries %u ", m->pg_init_retries); + if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) + DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); + } + + if (!m->hw_handler_name || type == STATUSTYPE_INFO) + DMEMIT("0 "); + else + DMEMIT("1 %s ", m->hw_handler_name); + + DMEMIT("%u ", m->nr_priority_groups); + + if (m->next_pg) + pg_num = m->next_pg->pg_num; + else if (m->current_pg) + pg_num = m->current_pg->pg_num; + else + pg_num = (m->nr_priority_groups ? 1 : 0); + + DMEMIT("%u ", pg_num); + + switch (type) { + case STATUSTYPE_INFO: + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed) + state = 'D'; /* Disabled */ + else if (pg == m->current_pg) + state = 'A'; /* Currently Active */ + else + state = 'E'; /* Enabled */ + + DMEMIT("%c ", state); + + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, NULL, type, + result + sz, + maxlen - sz); + else + DMEMIT("0 "); + + DMEMIT("%u %u ", pg->nr_pgpaths, + pg->ps.type->info_args); + + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT("%s %s %u ", p->path.dev->name, + p->is_active ? 
"A" : "F", + p->fail_count); + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, + &p->path, type, result + sz, + maxlen - sz); + } + } + break; + + case STATUSTYPE_TABLE: + list_for_each_entry(pg, &m->priority_groups, list) { + DMEMIT("%s ", pg->ps.type->name); + + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, NULL, type, + result + sz, + maxlen - sz); + else + DMEMIT("0 "); + + DMEMIT("%u %u ", pg->nr_pgpaths, + pg->ps.type->table_args); + + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT("%s ", p->path.dev->name); + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, + &p->path, type, result + sz, + maxlen - sz); + } + } + break; + } + + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) +{ + int r = -EINVAL; + struct dm_dev *dev; + struct multipath *m = (struct multipath *) ti->private; + action_fn action; + + mutex_lock(&m->work_mutex); + + if (dm_suspended(ti)) { + r = -EBUSY; + goto out; + } + + if (argc == 1) { + if (!strcasecmp(argv[0], "queue_if_no_path")) { + r = queue_if_no_path(m, 1, 0); + goto out; + } else if (!strcasecmp(argv[0], "fail_if_no_path")) { + r = queue_if_no_path(m, 0, 0); + goto out; + } + } + + if (argc != 2) { + DMWARN("Unrecognised multipath message received."); + goto out; + } + + if (!strcasecmp(argv[0], "disable_group")) { + r = bypass_pg_num(m, argv[1], 1); + goto out; + } else if (!strcasecmp(argv[0], "enable_group")) { + r = bypass_pg_num(m, argv[1], 0); + goto out; + } else if (!strcasecmp(argv[0], "switch_group")) { + r = switch_pg_num(m, argv[1]); + goto out; + } else if (!strcasecmp(argv[0], "reinstate_path")) + action = reinstate_path; + else if (!strcasecmp(argv[0], "fail_path")) + action = fail_path; + else { + DMWARN("Unrecognised multipath message received."); + goto out; + } + + r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); + if (r) { + DMWARN("message: error getting device %s", + argv[1]); + goto out; + } + + r = action_dev(m, dev, action); + + dm_put_device(ti, dev); + +out: + mutex_unlock(&m->work_mutex); + return r; +} + +static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, + unsigned long arg) +{ + struct multipath *m = (struct multipath *) ti->private; + struct block_device *bdev = NULL; + fmode_t mode = 0; + unsigned long flags; + int r = 0; + + spin_lock_irqsave(&m->lock, flags); + + if (!m->current_pgpath) + __choose_pgpath(m, 0); + + if (m->current_pgpath) { + bdev = m->current_pgpath->path.dev->bdev; + mode = m->current_pgpath->path.dev->mode; + } + + if (m->queue_io) + r = -EAGAIN; + else if (!bdev) + r = -EIO; + + spin_unlock_irqrestore(&m->lock, flags); + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? 
: __blkdev_driver_ioctl(bdev, mode, cmd, arg); +} + +static int multipath_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *p; + int ret = 0; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(p, &pg->pgpaths, list) { + ret = fn(ti, p->path.dev, ti->begin, ti->len, data); + if (ret) + goto out; + } + } + +out: + return ret; +} + +static int __pgpath_busy(struct pgpath *pgpath) +{ + struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); + + return dm_underlying_device_busy(q); +} + +/* + * We return "busy", only when we can map I/Os but underlying devices + * are busy (so even if we map I/Os now, the I/Os will wait on + * the underlying queue). + * In other words, if we want to kill I/Os or queue them inside us + * due to map unavailability, we don't return "busy". Otherwise, + * dm core won't give us the I/Os and we can't do what we want. + */ +static int multipath_busy(struct dm_target *ti) +{ + int busy = 0, has_active = 0; + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *pgpath; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + /* Guess which priority_group will be used at next mapping time */ + if (unlikely(!m->current_pgpath && m->next_pg)) + pg = m->next_pg; + else if (likely(m->current_pg)) + pg = m->current_pg; + else + /* + * We don't know which pg will be used at next mapping time. + * We don't call __choose_pgpath() here to avoid to trigger + * pg_init just by busy checking. + * So we don't know whether underlying devices we will be using + * at next mapping time are busy or not. Just try mapping. + */ + goto out; + + /* + * If there is one non-busy active path at least, the path selector + * will be able to select it. So we consider such a pg as not busy. + */ + busy = 1; + list_for_each_entry(pgpath, &pg->pgpaths, list) + if (pgpath->is_active) { + has_active = 1; + + if (!__pgpath_busy(pgpath)) { + busy = 0; + break; + } + } + + if (!has_active) + /* + * No active path in this pg, so this pg won't be used and + * the current_pg will be changed at next mapping time. + * We need to try mapping to determine it. 
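+ *
+ * In short, the three possible outcomes are: priority group not predictable
+ * -> not busy (just try mapping); predicted group has an active, non-busy
+ * path -> not busy; predicted group's active paths are all busy -> busy.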
+ */ + busy = 0; + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return busy; +} + +/*----------------------------------------------------------------- + * Module setup + *---------------------------------------------------------------*/ +static struct target_type multipath_target = { + .name = "multipath", + .version = {1, 3, 0}, + .module = THIS_MODULE, + .ctr = multipath_ctr, + .dtr = multipath_dtr, + .map_rq = multipath_map, + .rq_end_io = multipath_end_io, + .presuspend = multipath_presuspend, + .postsuspend = multipath_postsuspend, + .resume = multipath_resume, + .status = multipath_status, + .message = multipath_message, + .ioctl = multipath_ioctl, + .iterate_devices = multipath_iterate_devices, + .busy = multipath_busy, +}; + +static int __init dm_multipath_init(void) +{ + int r; + + /* allocate a slab for the dm_ios */ + _mpio_cache = KMEM_CACHE(dm_mpath_io, 0); + if (!_mpio_cache) + return -ENOMEM; + + r = dm_register_target(&multipath_target); + if (r < 0) { + DMERR("register failed %d", r); + kmem_cache_destroy(_mpio_cache); + return -EINVAL; + } + + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); + if (!kmultipathd) { + DMERR("failed to create workqueue kmpathd"); + dm_unregister_target(&multipath_target); + kmem_cache_destroy(_mpio_cache); + return -ENOMEM; + } + + /* + * A separate workqueue is used to handle the device handlers + * to avoid overloading existing workqueue. Overloading the + * old workqueue would also create a bottleneck in the + * path of the storage hardware device activation. + */ + kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", + WQ_MEM_RECLAIM); + if (!kmpath_handlerd) { + DMERR("failed to create workqueue kmpath_handlerd"); + destroy_workqueue(kmultipathd); + dm_unregister_target(&multipath_target); + kmem_cache_destroy(_mpio_cache); + return -ENOMEM; + } + + DMINFO("version %u.%u.%u loaded", + multipath_target.version[0], multipath_target.version[1], + multipath_target.version[2]); + + return r; +} + +static void __exit dm_multipath_exit(void) +{ + destroy_workqueue(kmpath_handlerd); + destroy_workqueue(kmultipathd); + + dm_unregister_target(&multipath_target); + kmem_cache_destroy(_mpio_cache); +} + +module_init(dm_multipath_init); +module_exit(dm_multipath_exit); + +MODULE_DESCRIPTION(DM_NAME " multipath target"); +MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h new file mode 100644 index 00000000..e230f719 --- /dev/null +++ b/drivers/md/dm-mpath.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath. + */ + +#ifndef DM_MPATH_H +#define DM_MPATH_H + +struct dm_dev; + +struct dm_path { + struct dm_dev *dev; /* Read-only */ + void *pscontext; /* For path-selector use */ +}; + +/* Callback for hwh_pg_init_fn to use when complete */ +void dm_pg_init_complete(struct dm_path *path, unsigned err_flags); + +#endif diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c new file mode 100644 index 00000000..fa0ccc58 --- /dev/null +++ b/drivers/md/dm-path-selector.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path selector registration. 
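+ *
+ * Selector types are looked up by name, and dm_get_path_selector() below
+ * falls back to request_module("dm-<name>"), so a table that references a
+ * selector such as "queue-length" can pull its module in on demand
+ * (assuming the usual dm-<name> module naming).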
+ */ + +#include <linux/device-mapper.h> +#include <linux/module.h> + +#include "dm-path-selector.h" + +#include <linux/slab.h> + +struct ps_internal { + struct path_selector_type pst; + struct list_head list; +}; + +#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) + +static LIST_HEAD(_path_selectors); +static DECLARE_RWSEM(_ps_lock); + +static struct ps_internal *__find_path_selector_type(const char *name) +{ + struct ps_internal *psi; + + list_for_each_entry(psi, &_path_selectors, list) { + if (!strcmp(name, psi->pst.name)) + return psi; + } + + return NULL; +} + +static struct ps_internal *get_path_selector(const char *name) +{ + struct ps_internal *psi; + + down_read(&_ps_lock); + psi = __find_path_selector_type(name); + if (psi && !try_module_get(psi->pst.module)) + psi = NULL; + up_read(&_ps_lock); + + return psi; +} + +struct path_selector_type *dm_get_path_selector(const char *name) +{ + struct ps_internal *psi; + + if (!name) + return NULL; + + psi = get_path_selector(name); + if (!psi) { + request_module("dm-%s", name); + psi = get_path_selector(name); + } + + return psi ? &psi->pst : NULL; +} + +void dm_put_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi; + + if (!pst) + return; + + down_read(&_ps_lock); + psi = __find_path_selector_type(pst->name); + if (!psi) + goto out; + + module_put(psi->pst.module); +out: + up_read(&_ps_lock); +} + +static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi = kzalloc(sizeof(*psi), GFP_KERNEL); + + if (psi) + psi->pst = *pst; + + return psi; +} + +int dm_register_path_selector(struct path_selector_type *pst) +{ + int r = 0; + struct ps_internal *psi = _alloc_path_selector(pst); + + if (!psi) + return -ENOMEM; + + down_write(&_ps_lock); + + if (__find_path_selector_type(pst->name)) { + kfree(psi); + r = -EEXIST; + } else + list_add(&psi->list, &_path_selectors); + + up_write(&_ps_lock); + + return r; +} + +int dm_unregister_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi; + + down_write(&_ps_lock); + + psi = __find_path_selector_type(pst->name); + if (!psi) { + up_write(&_ps_lock); + return -EINVAL; + } + + list_del(&psi->list); + + up_write(&_ps_lock); + + kfree(psi); + + return 0; +} + +EXPORT_SYMBOL_GPL(dm_register_path_selector); +EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h new file mode 100644 index 00000000..e7d1fa8b --- /dev/null +++ b/drivers/md/dm-path-selector.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path-Selector registration. + */ + +#ifndef DM_PATH_SELECTOR_H +#define DM_PATH_SELECTOR_H + +#include <linux/device-mapper.h> + +#include "dm-mpath.h" + +/* + * We provide an abstraction for the code that chooses which path + * to send some io down. 
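+ *
+ * A selector module fills in a path_selector_type and registers it from its
+ * module_init(), roughly like the queue-length selector further down does
+ * (sketch only, with made-up names):
+ *
+ *	static struct path_selector_type example_ps = {
+ *		.name		= "example",
+ *		.module		= THIS_MODULE,
+ *		.table_args	= 1,
+ *		.info_args	= 1,
+ *		.create		= example_create,
+ *		.destroy	= example_destroy,
+ *		.add_path	= example_add_path,
+ *		.select_path	= example_select_path,
+ *		.fail_path	= example_fail_path,
+ *		.reinstate_path	= example_reinstate_path,
+ *	};
+ *
+ * followed by dm_register_path_selector(&example_ps), with the matching
+ * dm_unregister_path_selector() call in module_exit().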
+ */ +struct path_selector_type; +struct path_selector { + struct path_selector_type *type; + void *context; +}; + +/* Information about a path selector type */ +struct path_selector_type { + char *name; + struct module *module; + + unsigned int table_args; + unsigned int info_args; + + /* + * Constructs a path selector object, takes custom arguments + */ + int (*create) (struct path_selector *ps, unsigned argc, char **argv); + void (*destroy) (struct path_selector *ps); + + /* + * Add an opaque path object, along with some selector specific + * path args (eg, path priority). + */ + int (*add_path) (struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error); + + /* + * Chooses a path for this io, if no paths are available then + * NULL will be returned. + * + * repeat_count is the number of times to use the path before + * calling the function again. 0 means don't call it again unless + * the path fails. + */ + struct dm_path *(*select_path) (struct path_selector *ps, + unsigned *repeat_count, + size_t nr_bytes); + + /* + * Notify the selector that a path has failed. + */ + void (*fail_path) (struct path_selector *ps, struct dm_path *p); + + /* + * Ask selector to reinstate a path. + */ + int (*reinstate_path) (struct path_selector *ps, struct dm_path *p); + + /* + * Table content based on parameters added in ps_add_path_fn + * or path selector status + */ + int (*status) (struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen); + + int (*start_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); + int (*end_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); +}; + +/* Register a path selector */ +int dm_register_path_selector(struct path_selector_type *type); + +/* Unregister a path selector */ +int dm_unregister_path_selector(struct path_selector_type *type); + +/* Returns a registered path selector type */ +struct path_selector_type *dm_get_path_selector(const char *name); + +/* Releases a path selector */ +void dm_put_path_selector(struct path_selector_type *pst); + +#endif diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 00000000..3941fae0 --- /dev/null +++ b/drivers/md/dm-queue-length.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. + * Copyright (C) 2006-2009 NEC Corporation. + * + * dm-queue-length.c + * + * Module Author: Stefan Bader, IBM + * Modified by: Kiyoshi Ueda, NEC + * + * This file is released under the GPL. + * + * queue-length path selector - choose a path with the least number of + * in-flight I/Os. 
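+ *
+ * An illustrative multipath table using this selector (device numbers and
+ * length are made up) could look like:
+ *
+ *	0 2097152 multipath 0 0 1 1 queue-length 0 2 1 8:16 128 8:32 128
+ *
+ * i.e. no features, no hardware handler, a single priority group of two
+ * paths, each path given an explicit repeat_count of 128.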
+ */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/atomic.h> + +#define DM_MSG_PREFIX "multipath queue-length" +#define QL_MIN_IO 128 +#define QL_VERSION "0.1.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + atomic_t qlen; /* the number of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int ql_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void ql_free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void ql_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + ql_free_paths(&s->valid_paths); + ql_free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int ql_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + /* When called with NULL path, return selector status/args. */ + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&pi->qlen)); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +static int ql_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = QL_MIN_IO; + char dummy; + + /* + * Arguments: [<repeat_count>] + * <repeat_count>: The number of I/Os before switching path. + * If not given, default (QL_MIN_IO) is used. 
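+ *
+ * The multipath core has already consumed the device argument by the time
+ * this is called, so (illustrative numbers) a path entry of "8:16 10" hands
+ * this function argv = { "10" } and sets repeat_count to 10, while "8:16"
+ * on its own leaves the default of 128 in place.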
+ */ + if (argc > 1) { + *error = "queue-length ps: incorrect number of arguments"; + return -EINVAL; + } + + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "queue-length ps: invalid repeat count"; + return -EINVAL; + } + + /* Allocate the path information structure */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "queue-length ps: Error allocating path information"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + atomic_set(&pi->qlen, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void ql_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +/* + * Select a path having the minimum number of in-flight I/Os + */ +static struct dm_path *ql_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || + (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) + best = pi; + + if (!atomic_read(&best->qlen)) + break; + } + + if (!best) + return NULL; + + *repeat_count = best->repeat_count; + + return best->path; +} + +static int ql_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_inc(&pi->qlen); + + return 0; +} + +static int ql_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_dec(&pi->qlen); + + return 0; +} + +static struct path_selector_type ql_ps = { + .name = "queue-length", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ql_create, + .destroy = ql_destroy, + .status = ql_status, + .add_path = ql_add_path, + .fail_path = ql_fail_path, + .reinstate_path = ql_reinstate_path, + .select_path = ql_select_path, + .start_io = ql_start_io, + .end_io = ql_end_io, +}; + +static int __init dm_ql_init(void) +{ + int r = dm_register_path_selector(&ql_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " QL_VERSION " loaded"); + + return r; +} + +static void __exit dm_ql_exit(void) +{ + int r = dm_unregister_path_selector(&ql_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_ql_init); +module_exit(dm_ql_exit); + +MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>"); +MODULE_DESCRIPTION( + "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" + DM_NAME " path selector to balance the number of in-flight I/Os" +); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 00000000..68965e66 --- /dev/null +++ b/drivers/md/dm-raid.c @@ -0,0 +1,1297 @@ +/* + * Copyright (C) 2010-2011 Neil Brown + * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. 
+ */ + +#include <linux/slab.h> +#include <linux/module.h> + +#include "md.h" +#include "raid1.h" +#include "raid5.h" +#include "bitmap.h" + +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "raid" + +/* + * The following flags are used by dm-raid.c to set up the array state. + * They must be cleared before md_run is called. + */ +#define FirstUse 10 /* rdev flag */ + +struct raid_dev { + /* + * Two DM devices, one to hold metadata and one to hold the + * actual data/parity. The reason for this is to not confuse + * ti->len and give more flexibility in altering size and + * characteristics. + * + * While it is possible for this device to be associated + * with a different physical device than the data_dev, it + * is intended for it to be the same. + * |--------- Physical Device ---------| + * |- meta_dev -|------ data_dev ------| + */ + struct dm_dev *meta_dev; + struct dm_dev *data_dev; + struct md_rdev rdev; +}; + +/* + * Flags for rs->print_flags field. + */ +#define DMPF_SYNC 0x1 +#define DMPF_NOSYNC 0x2 +#define DMPF_REBUILD 0x4 +#define DMPF_DAEMON_SLEEP 0x8 +#define DMPF_MIN_RECOVERY_RATE 0x10 +#define DMPF_MAX_RECOVERY_RATE 0x20 +#define DMPF_MAX_WRITE_BEHIND 0x40 +#define DMPF_STRIPE_CACHE 0x80 +#define DMPF_REGION_SIZE 0X100 +struct raid_set { + struct dm_target *ti; + + uint32_t bitmap_loaded; + uint32_t print_flags; + + struct mddev md; + struct raid_type *raid_type; + struct dm_target_callbacks callbacks; + + struct raid_dev dev[0]; +}; + +/* Supported raid types and properties. */ +static struct raid_type { + const char *name; /* RAID algorithm. */ + const char *descr; /* Descriptor text for logging. */ + const unsigned parity_devs; /* # of parity devices. */ + const unsigned minimal_devs; /* minimal # of devices in set. */ + const unsigned level; /* RAID level. */ + const unsigned algorithm; /* RAID algorithm. 
*/ +} raid_types[] = { + {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, + {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, + {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, + {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, + {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, + {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, + {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, + {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, + {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} +}; + +static struct raid_type *get_raid_type(char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raid_types); i++) + if (!strcmp(raid_types[i].name, name)) + return &raid_types[i]; + + return NULL; +} + +static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) +{ + unsigned i; + struct raid_set *rs; + sector_t sectors_per_dev; + + if (raid_devs <= raid_type->parity_devs) { + ti->error = "Insufficient number of devices"; + return ERR_PTR(-EINVAL); + } + + sectors_per_dev = ti->len; + if ((raid_type->level > 1) && + sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + ti->error = "Target length not divisible by number of data devices"; + return ERR_PTR(-EINVAL); + } + + rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); + if (!rs) { + ti->error = "Cannot allocate raid context"; + return ERR_PTR(-ENOMEM); + } + + mddev_init(&rs->md); + + rs->ti = ti; + rs->raid_type = raid_type; + rs->md.raid_disks = raid_devs; + rs->md.level = raid_type->level; + rs->md.new_level = rs->md.level; + rs->md.dev_sectors = sectors_per_dev; + rs->md.layout = raid_type->algorithm; + rs->md.new_layout = rs->md.layout; + rs->md.delta_disks = 0; + rs->md.recovery_cp = 0; + + for (i = 0; i < raid_devs; i++) + md_rdev_init(&rs->dev[i].rdev); + + /* + * Remaining items to be initialized by further RAID params: + * rs->md.persistent + * rs->md.external + * rs->md.chunk_sectors + * rs->md.new_chunk_sectors + */ + + return rs; +} + +static void context_free(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) { + if (rs->dev[i].meta_dev) + dm_put_device(rs->ti, rs->dev[i].meta_dev); + if (rs->dev[i].rdev.sb_page) + put_page(rs->dev[i].rdev.sb_page); + rs->dev[i].rdev.sb_page = NULL; + rs->dev[i].rdev.sb_loaded = 0; + if (rs->dev[i].data_dev) + dm_put_device(rs->ti, rs->dev[i].data_dev); + } + + kfree(rs); +} + +/* + * For every device we have two words + * <meta_dev>: meta device name or '-' if missing + * <data_dev>: data device name or '-' if missing + * + * The following are permitted: + * - - + * - <data_dev> + * <meta_dev> <data_dev> + * + * The following is not allowed: + * <meta_dev> - + * + * This code parses those words. If there is a failure, + * the caller must use context_free to unwind the operations. + */ +static int dev_parms(struct raid_set *rs, char **argv) +{ + int i; + int rebuild = 0; + int metadata_available = 0; + int ret = 0; + + for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { + rs->dev[i].rdev.raid_disk = i; + + rs->dev[i].meta_dev = NULL; + rs->dev[i].data_dev = NULL; + + /* + * There are no offsets, since there is a separate device + * for data and metadata. 
+ */ + rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.mddev = &rs->md; + + if (strcmp(argv[0], "-")) { + ret = dm_get_device(rs->ti, argv[0], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].meta_dev); + rs->ti->error = "RAID metadata device lookup failure"; + if (ret) + return ret; + + rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); + if (!rs->dev[i].rdev.sb_page) + return -ENOMEM; + } + + if (!strcmp(argv[1], "-")) { + if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && + (!rs->dev[i].rdev.recovery_offset)) { + rs->ti->error = "Drive designated for rebuild not specified"; + return -EINVAL; + } + + rs->ti->error = "No data device supplied with metadata device"; + if (rs->dev[i].meta_dev) + return -EINVAL; + + continue; + } + + ret = dm_get_device(rs->ti, argv[1], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].data_dev); + if (ret) { + rs->ti->error = "RAID device lookup failure"; + return ret; + } + + if (rs->dev[i].meta_dev) { + metadata_available = 1; + rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; + } + rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; + list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild++; + } + + if (metadata_available) { + rs->md.external = 0; + rs->md.persistent = 1; + rs->md.major_version = 2; + } else if (rebuild && !rs->md.recovery_cp) { + /* + * Without metadata, we will not be able to tell if the array + * is in-sync or not - we must assume it is not. Therefore, + * it is impossible to rebuild a drive. + * + * Even if there is metadata, the on-disk information may + * indicate that the array is not in-sync and it will then + * fail at that time. + * + * User could specify 'nosync' option if desperate. + */ + DMERR("Unable to rebuild drive while array is not in-sync"); + rs->ti->error = "RAID device lookup failure"; + return -EINVAL; + } + + return 0; +} + +/* + * validate_region_size + * @rs + * @region_size: region size in sectors. If 0, pick a size (4MiB default). + * + * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). + * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_region_size(struct raid_set *rs, unsigned long region_size) +{ + unsigned long min_region_size = rs->ti->len / (1 << 21); + + if (!region_size) { + /* + * Choose a reasonable default. All figures in sectors. + */ + if (min_region_size > (1 << 13)) { + DMINFO("Choosing default region size of %lu sectors", + region_size); + region_size = min_region_size; + } else { + DMINFO("Choosing default region size of 4MiB"); + region_size = 1 << 13; /* sectors */ + } + } else { + /* + * Validate user-supplied value. + */ + if (region_size > rs->ti->len) { + rs->ti->error = "Supplied region size is too large"; + return -EINVAL; + } + + if (region_size < min_region_size) { + DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", + region_size, min_region_size); + rs->ti->error = "Supplied region size is too small"; + return -EINVAL; + } + + if (!is_power_of_2(region_size)) { + rs->ti->error = "Region size is not a power of 2"; + return -EINVAL; + } + + if (region_size < rs->md.chunk_sectors) { + rs->ti->error = "Region size is smaller than the chunk size"; + return -EINVAL; + } + } + + /* + * Convert sectors to bytes. + */ + rs->md.bitmap_info.chunksize = (region_size << 9); + + return 0; +} + +/* + * Possible arguments are... 
+ * <chunk_size> [optional_args] + * + * Argument definitions + * <chunk_size> The number of sectors per disk that + * will form the "stripe" + * [[no]sync] Force or prevent recovery of the + * entire array + * [rebuild <idx>] Rebuild the drive indicated by the index + * [daemon_sleep <ms>] Time between bitmap daemon work to + * clear bits + * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [write_mostly <idx>] Indicate a write mostly drive via index + * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) + * [stripe_cache <sectors>] Stripe cache size for higher RAIDs + * [region_size <sectors>] Defines granularity of bitmap + */ +static int parse_raid_params(struct raid_set *rs, char **argv, + unsigned num_raid_params) +{ + unsigned i, rebuild_cnt = 0; + unsigned long value, region_size = 0; + char *key; + + /* + * First, parse the in-order required arguments + * "chunk_size" is the only argument of this type. + */ + if ((strict_strtoul(argv[0], 10, &value) < 0)) { + rs->ti->error = "Bad chunk size"; + return -EINVAL; + } else if (rs->raid_type->level == 1) { + if (value) + DMERR("Ignoring chunk size parameter for RAID 1"); + value = 0; + } else if (!is_power_of_2(value)) { + rs->ti->error = "Chunk size must be a power of 2"; + return -EINVAL; + } else if (value < 8) { + rs->ti->error = "Chunk size value is too small"; + return -EINVAL; + } + + rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; + argv++; + num_raid_params--; + + /* + * We set each individual device as In_sync with a completed + * 'recovery_offset'. If there has been a device failure or + * replacement then one of the following cases applies: + * + * 1) User specifies 'rebuild'. + * - Device is reset when param is read. + * 2) A new device is supplied. + * - No matching superblock found, resets device. + * 3) Device failure was transient and returns on reload. + * - Failure noticed, resets device for bitmap replay. + * 4) Device hadn't completed recovery after previous failure. + * - Superblock is read and overrides recovery_offset. + * + * What is found in the superblocks of the devices is always + * authoritative, unless 'rebuild' or '[no]sync' was specified. 
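+ *
+ * (For instance, passing "rebuild 1" among the raid params clears In_sync
+ * on rs->dev[1] again below and forces that leg to be reconstructed.)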
+ */ + for (i = 0; i < rs->md.raid_disks; i++) { + set_bit(In_sync, &rs->dev[i].rdev.flags); + rs->dev[i].rdev.recovery_offset = MaxSector; + } + + /* + * Second, parse the unordered optional arguments + */ + for (i = 0; i < num_raid_params; i++) { + if (!strcasecmp(argv[i], "nosync")) { + rs->md.recovery_cp = MaxSector; + rs->print_flags |= DMPF_NOSYNC; + continue; + } + if (!strcasecmp(argv[i], "sync")) { + rs->md.recovery_cp = 0; + rs->print_flags |= DMPF_SYNC; + continue; + } + + /* The rest of the optional arguments come in key/value pairs */ + if ((i + 1) >= num_raid_params) { + rs->ti->error = "Wrong number of raid parameters given"; + return -EINVAL; + } + + key = argv[i++]; + if (strict_strtoul(argv[i], 10, &value) < 0) { + rs->ti->error = "Bad numerical argument given in raid params"; + return -EINVAL; + } + + if (!strcasecmp(key, "rebuild")) { + rebuild_cnt++; + if (((rs->raid_type->level != 1) && + (rebuild_cnt > rs->raid_type->parity_devs)) || + ((rs->raid_type->level == 1) && + (rebuild_cnt > (rs->md.raid_disks - 1)))) { + rs->ti->error = "Too many rebuild devices specified for given RAID type"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid rebuild index given"; + return -EINVAL; + } + clear_bit(In_sync, &rs->dev[value].rdev.flags); + rs->dev[value].rdev.recovery_offset = 0; + rs->print_flags |= DMPF_REBUILD; + } else if (!strcasecmp(key, "write_mostly")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "write_mostly option is only valid for RAID1"; + return -EINVAL; + } + if (value >= rs->md.raid_disks) { + rs->ti->error = "Invalid write_mostly drive index given"; + return -EINVAL; + } + set_bit(WriteMostly, &rs->dev[value].rdev.flags); + } else if (!strcasecmp(key, "max_write_behind")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "max_write_behind option is only valid for RAID1"; + return -EINVAL; + } + rs->print_flags |= DMPF_MAX_WRITE_BEHIND; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + if (value > COUNTER_MAX) { + rs->ti->error = "Max write-behind limit out of range"; + return -EINVAL; + } + rs->md.bitmap_info.max_write_behind = value; + } else if (!strcasecmp(key, "daemon_sleep")) { + rs->print_flags |= DMPF_DAEMON_SLEEP; + if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { + rs->ti->error = "daemon sleep period out of range"; + return -EINVAL; + } + rs->md.bitmap_info.daemon_sleep = value; + } else if (!strcasecmp(key, "stripe_cache")) { + rs->print_flags |= DMPF_STRIPE_CACHE; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + + if (rs->raid_type->level < 5) { + rs->ti->error = "Inappropriate argument: stripe_cache"; + return -EINVAL; + } + if (raid5_set_cache_size(&rs->md, (int)value)) { + rs->ti->error = "Bad stripe_cache size"; + return -EINVAL; + } + } else if (!strcasecmp(key, "min_recovery_rate")) { + rs->print_flags |= DMPF_MIN_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "min_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_min = (int)value; + } else if (!strcasecmp(key, "max_recovery_rate")) { + rs->print_flags |= DMPF_MAX_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "max_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_max = (int)value; + } else if (!strcasecmp(key, "region_size")) { + rs->print_flags |= DMPF_REGION_SIZE; + region_size = value; + } else { + DMERR("Unable to parse RAID parameter: %s", key); 
+ rs->ti->error = "Unable to parse RAID parameters"; + return -EINVAL; + } + } + + if (validate_region_size(rs, region_size)) + return -EINVAL; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + + /* Assume there are no metadata devices until the drives are parsed */ + rs->md.persistent = 0; + rs->md.external = 1; + + return 0; +} + +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); + + dm_table_event(rs->ti->table); +} + +static int raid_is_congested(struct dm_target_callbacks *cb, int bits) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + if (rs->raid_type->level == 1) + return md_raid1_congested(&rs->md, bits); + + return md_raid5_congested(&rs->md, bits); +} + +/* + * This structure is never routinely used by userspace, unlike md superblocks. + * Devices with this superblock should only ever be accessed via device-mapper. + */ +#define DM_RAID_MAGIC 0x64526D44 +struct dm_raid_superblock { + __le32 magic; /* "DmRd" */ + __le32 features; /* Used to indicate possible future changes */ + + __le32 num_devices; /* Number of devices in this array. (Max 64) */ + __le32 array_position; /* The position of this drive in the array */ + + __le64 events; /* Incremented by md when superblock updated */ + __le64 failed_devices; /* Bit field of devices to indicate failures */ + + /* + * This offset tracks the progress of the repair or replacement of + * an individual drive. + */ + __le64 disk_recovery_offset; + + /* + * This offset tracks the progress of the initial array + * synchronisation/parity calculation. + */ + __le64 array_resync_offset; + + /* + * RAID characteristics + */ + __le32 level; + __le32 layout; + __le32 stripe_sectors; + + __u8 pad[452]; /* Round struct to 512 bytes. */ + /* Always set to 0 when writing. 
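+ *
+ * Size check: the four leading __le32 fields are 16 bytes, events and
+ * failed_devices add 16, the two recovery offsets another 16, and
+ * level/layout/stripe_sectors 12 more - 60 bytes in total, so the 452-byte
+ * pad rounds the structure to exactly one 512-byte sector.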
*/ +} __packed; + +static int read_disk_sb(struct md_rdev *rdev, int size) +{ + BUG_ON(!rdev->sb_page); + + if (rdev->sb_loaded) + return 0; + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { + DMERR("Failed to read superblock of device at position %d", + rdev->raid_disk); + set_bit(Faulty, &rdev->flags); + return -EINVAL; + } + + rdev->sb_loaded = 1; + + return 0; +} + +static void super_sync(struct mddev *mddev, struct md_rdev *rdev) +{ + struct md_rdev *r; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + + sb = page_address(rdev->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + + rdev_for_each(r, mddev) + if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) + failed_devices |= (1ULL << r->raid_disk); + + memset(sb, 0, sizeof(*sb)); + + sb->magic = cpu_to_le32(DM_RAID_MAGIC); + sb->features = cpu_to_le32(0); /* No features yet */ + + sb->num_devices = cpu_to_le32(mddev->raid_disks); + sb->array_position = cpu_to_le32(rdev->raid_disk); + + sb->events = cpu_to_le64(mddev->events); + sb->failed_devices = cpu_to_le64(failed_devices); + + sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); + sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); +} + +/* + * super_load + * + * This function creates a superblock if one is not found on the device + * and will decide which superblock to use if there's a choice. + * + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise + */ +static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) +{ + int ret; + struct dm_raid_superblock *sb; + struct dm_raid_superblock *refsb; + uint64_t events_sb, events_refsb; + + rdev->sb_start = 0; + rdev->sb_size = sizeof(*sb); + + ret = read_disk_sb(rdev, rdev->sb_size); + if (ret) + return ret; + + sb = page_address(rdev->sb_page); + + /* + * Two cases that we want to write new superblocks and rebuild: + * 1) New device (no matching magic number) + * 2) Device specified for rebuild (!In_sync w/ offset == 0) + */ + if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) || + (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) { + super_sync(rdev->mddev, rdev); + + set_bit(FirstUse, &rdev->flags); + + /* Force writing of superblocks to disk */ + set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); + + /* Any superblock is better than none, choose that if given */ + return refdev ? 0 : 1; + } + + if (!refdev) + return 1; + + events_sb = le64_to_cpu(sb->events); + + refsb = page_address(refdev->sb_page); + events_refsb = le64_to_cpu(refsb->events); + + return (events_sb > events_refsb) ? 1 : 0; +} + +static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) +{ + int role; + struct raid_set *rs = container_of(mddev, struct raid_set, md); + uint64_t events_sb; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + uint32_t new_devs = 0; + uint32_t rebuilds = 0; + struct md_rdev *r; + struct dm_raid_superblock *sb2; + + sb = page_address(rdev->sb_page); + events_sb = le64_to_cpu(sb->events); + failed_devices = le64_to_cpu(sb->failed_devices); + + /* + * Initialise to 1 if this is a new superblock. + */ + mddev->events = events_sb ? 
: 1; + + /* + * Reshaping is not currently allowed + */ + if ((le32_to_cpu(sb->level) != mddev->level) || + (le32_to_cpu(sb->layout) != mddev->layout) || + (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + /* We can only change the number of devices in RAID1 right now */ + if ((rs->raid_type->level != 1) && + (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) + mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + + /* + * During load, we set FirstUse if a new superblock was written. + * There are two reasons we might not have a superblock: + * 1) The array is brand new - in which case, all of the + * devices must have their In_sync bit set. Also, + * recovery_cp must be 0, unless forced. + * 2) This is a new device being added to an old array + * and the new device needs to be rebuilt - in which + * case the In_sync bit will /not/ be set and + * recovery_cp must be MaxSector. + */ + rdev_for_each(r, mddev) { + if (!test_bit(In_sync, &r->flags)) { + DMINFO("Device %d specified for rebuild: " + "Clearing superblock", r->raid_disk); + rebuilds++; + } else if (test_bit(FirstUse, &r->flags)) + new_devs++; + } + + if (!rebuilds) { + if (new_devs == mddev->raid_disks) { + DMINFO("Superblocks created for new array"); + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + } else if (new_devs) { + DMERR("New device injected " + "into existing array without 'rebuild' " + "parameter specified"); + return -EINVAL; + } + } else if (new_devs) { + DMERR("'rebuild' devices cannot be " + "injected into an array with other first-time devices"); + return -EINVAL; + } else if (mddev->recovery_cp != MaxSector) { + DMERR("'rebuild' specified while array is not in-sync"); + return -EINVAL; + } + + /* + * Now we set the Faulty bit for those devices that are + * recorded in the superblock as failed. + */ + rdev_for_each(r, mddev) { + if (!r->sb_page) + continue; + sb2 = page_address(r->sb_page); + sb2->failed_devices = 0; + + /* + * Check for any device re-ordering. + */ + if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { + role = le32_to_cpu(sb2->array_position); + if (role != r->raid_disk) { + if (rs->raid_type->level != 1) { + rs->ti->error = "Cannot change device " + "positions in RAID array"; + return -EINVAL; + } + DMINFO("RAID1 device #%d now at position #%d", + role, r->raid_disk); + } + + /* + * Partial recovery is performed on + * returning failed devices. + */ + if (failed_devices & (1 << role)) + set_bit(Faulty, &r->flags); + } + } + + return 0; +} + +static int super_validate(struct mddev *mddev, struct md_rdev *rdev) +{ + struct dm_raid_superblock *sb = page_address(rdev->sb_page); + + /* + * If mddev->events is not set, we know we have not yet initialized + * the array. + */ + if (!mddev->events && super_init_validation(mddev, rdev)) + return -EINVAL; + + mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ + rdev->mddev->bitmap_info.default_offset = 4096 >> 9; + if (!test_bit(FirstUse, &rdev->flags)) { + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); + if (rdev->recovery_offset != MaxSector) + clear_bit(In_sync, &rdev->flags); + } + + /* + * If a device comes back, set it as not In_sync and no longer faulty. 
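+ *
+ * recovery_offset is zeroed so the returning device is resynchronised from
+ * the start, and saved_raid_disk records the slot it previously occupied,
+ * presumably so MD can treat it as a returning member rather than a brand
+ * new spare.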
+ */ + if (test_bit(Faulty, &rdev->flags)) { + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->recovery_offset = 0; + } + + clear_bit(FirstUse, &rdev->flags); + + return 0; +} + +/* + * Analyse superblocks and select the freshest. + */ +static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) +{ + int ret; + unsigned redundancy = 0; + struct raid_dev *dev; + struct md_rdev *rdev, *tmp, *freshest; + struct mddev *mddev = &rs->md; + + switch (rs->raid_type->level) { + case 1: + redundancy = rs->md.raid_disks - 1; + break; + case 4: + case 5: + case 6: + redundancy = rs->raid_type->parity_devs; + break; + default: + ti->error = "Unknown RAID type"; + return -EINVAL; + } + + freshest = NULL; + rdev_for_each_safe(rdev, tmp, mddev) { + if (!rdev->meta_bdev) + continue; + + ret = super_load(rdev, freshest); + + switch (ret) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + dev = container_of(rdev, struct raid_dev, rdev); + if (redundancy--) { + if (dev->meta_dev) + dm_put_device(ti, dev->meta_dev); + + dev->meta_dev = NULL; + rdev->meta_bdev = NULL; + + if (rdev->sb_page) + put_page(rdev->sb_page); + + rdev->sb_page = NULL; + + rdev->sb_loaded = 0; + + /* + * We might be able to salvage the data device + * even though the meta device has failed. For + * now, we behave as though '- -' had been + * set for this device in the table. + */ + if (dev->data_dev) + dm_put_device(ti, dev->data_dev); + + dev->data_dev = NULL; + rdev->bdev = NULL; + + list_del(&rdev->same_set); + + continue; + } + ti->error = "Failed to load superblock"; + return ret; + } + } + + if (!freshest) + return 0; + + /* + * Validation of the freshest device provides the source of + * validation for the remaining devices. + */ + ti->error = "Unable to assemble array: Invalid superblocks"; + if (super_validate(mddev, freshest)) + return -EINVAL; + + rdev_for_each(rdev, mddev) + if ((rdev != freshest) && super_validate(mddev, rdev)) + return -EINVAL; + + return 0; +} + +/* + * Construct a RAID4/5/6 mapping: + * Args: + * <raid_type> <#raid_params> <raid_params> \ + * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } + * + * <raid_params> varies by <raid_type>. See 'parse_raid_params' for + * details on possible <raid_params>. 
+ */ +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int ret; + struct raid_type *rt; + unsigned long num_raid_params, num_raid_devs; + struct raid_set *rs = NULL; + + /* Must have at least <raid_type> <#raid_params> */ + if (argc < 2) { + ti->error = "Too few arguments"; + return -EINVAL; + } + + /* raid type */ + rt = get_raid_type(argv[0]); + if (!rt) { + ti->error = "Unrecognised raid_type"; + return -EINVAL; + } + argc--; + argv++; + + /* number of RAID parameters */ + if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { + ti->error = "Cannot understand number of RAID parameters"; + return -EINVAL; + } + argc--; + argv++; + + /* Skip over RAID params for now and find out # of devices */ + if (num_raid_params + 1 > argc) { + ti->error = "Arguments do not agree with counts given"; + return -EINVAL; + } + + if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + (num_raid_devs >= INT_MAX)) { + ti->error = "Cannot understand number of raid devices"; + return -EINVAL; + } + + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); + if (ret) + goto bad; + + ret = -EINVAL; + + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + argv += num_raid_params + 1; + + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices does not match the count given"; + goto bad; + } + + ret = dev_parms(rs, argv); + if (ret) + goto bad; + + rs->md.sync_super = super_sync; + ret = analyse_superblocks(ti, rs); + if (ret) + goto bad; + + INIT_WORK(&rs->md.event_work, do_table_event); + ti->private = rs; + ti->num_flush_requests = 1; + + mutex_lock(&rs->md.reconfig_mutex); + ret = md_run(&rs->md); + rs->md.in_sync = 0; /* Assume already marked dirty */ + mutex_unlock(&rs->md.reconfig_mutex); + + if (ret) { + ti->error = "Fail to run raid array"; + goto bad; + } + + rs->callbacks.congested_fn = raid_is_congested; + dm_table_add_target_callbacks(ti->table, &rs->callbacks); + + mddev_suspend(&rs->md); + return 0; + +bad: + context_free(rs); + + return ret; +} + +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + list_del_init(&rs->callbacks.list); + md_stop(&rs->md); + context_free(rs); +} + +static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + mddev->pers->make_request(mddev, bio); + + return DM_MAPIO_SUBMITTED; +} + +static int raid_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct raid_set *rs = ti->private; + unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ + unsigned sz = 0; + int i, array_in_sync = 0; + sector_t sync; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); + + if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) + sync = rs->md.curr_resync_completed; + else + sync = rs->md.recovery_cp; + + if (sync >= rs->md.resync_max_sectors) { + array_in_sync = 1; + sync = rs->md.resync_max_sectors; + } else { + /* + * The array may be doing an initial sync, or it may + * be rebuilding individual components. If all the + * devices are In_sync, then it is the array that is + * being initialized. 
+ */ + for (i = 0; i < rs->md.raid_disks; i++) + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + array_in_sync = 1; + } + /* + * Status characters: + * 'D' = Dead/Failed device + * 'a' = Alive but not in-sync + * 'A' = Alive and in-sync + */ + for (i = 0; i < rs->md.raid_disks; i++) { + if (test_bit(Faulty, &rs->dev[i].rdev.flags)) + DMEMIT("D"); + else if (!array_in_sync || + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT("a"); + else + DMEMIT("A"); + } + + /* + * In-sync ratio: + * The in-sync ratio shows the progress of: + * - Initializing the array + * - Rebuilding a subset of devices of the array + * The user can distinguish between the two by referring + * to the status characters. + */ + DMEMIT(" %llu/%llu", + (unsigned long long) sync, + (unsigned long long) rs->md.resync_max_sectors); + + break; + case STATUSTYPE_TABLE: + /* The string you would use to construct this array */ + for (i = 0; i < rs->md.raid_disks; i++) { + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + raid_param_cnt += 2; /* for rebuilds */ + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + raid_param_cnt += 2; + } + + raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2); + if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) + raid_param_cnt--; + + DMEMIT("%s %u %u", rs->raid_type->name, + raid_param_cnt, rs->md.chunk_sectors); + + if ((rs->print_flags & DMPF_SYNC) && + (rs->md.recovery_cp == MaxSector)) + DMEMIT(" sync"); + if (rs->print_flags & DMPF_NOSYNC) + DMEMIT(" nosync"); + + for (i = 0; i < rs->md.raid_disks; i++) + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT(" rebuild %u", i); + + if (rs->print_flags & DMPF_DAEMON_SLEEP) + DMEMIT(" daemon_sleep %lu", + rs->md.bitmap_info.daemon_sleep); + + if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) + DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); + + if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) + DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + DMEMIT(" write_mostly %u", i); + + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) + DMEMIT(" max_write_behind %lu", + rs->md.bitmap_info.max_write_behind); + + if (rs->print_flags & DMPF_STRIPE_CACHE) { + struct r5conf *conf = rs->md.private; + + /* convert from kiB to sectors */ + DMEMIT(" stripe_cache %d", + conf ? 
conf->max_nr_stripes * 2 : 0); + } + + if (rs->print_flags & DMPF_REGION_SIZE) + DMEMIT(" region_size %lu", + rs->md.bitmap_info.chunksize >> 9); + + DMEMIT(" %d", rs->md.raid_disks); + for (i = 0; i < rs->md.raid_disks; i++) { + if (rs->dev[i].meta_dev) + DMEMIT(" %s", rs->dev[i].meta_dev->name); + else + DMEMIT(" -"); + + if (rs->dev[i].data_dev) + DMEMIT(" %s", rs->dev[i].data_dev->name); + else + DMEMIT(" -"); + } + } + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct raid_set *rs = ti->private; + unsigned i; + int ret = 0; + + for (i = 0; !ret && i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + ret = fn(ti, + rs->dev[i].data_dev, + 0, /* No offset on data devs */ + rs->md.dev_sectors, + data); + + return ret; +} + +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct raid_set *rs = ti->private; + unsigned chunk_size = rs->md.chunk_sectors << 9; + struct r5conf *conf = rs->md.private; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); +} + +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + md_stop_writes(&rs->md); +} + +static void raid_postsuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_suspend(&rs->md); +} + +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + if (!rs->bitmap_loaded) { + bitmap_load(&rs->md); + rs->bitmap_loaded = 1; + } else + md_wakeup_thread(rs->md.thread); + + mddev_resume(&rs->md); +} + +static struct target_type raid_target = { + .name = "raid", + .version = {1, 2, 0}, + .module = THIS_MODULE, + .ctr = raid_ctr, + .dtr = raid_dtr, + .map = raid_map, + .status = raid_status, + .iterate_devices = raid_iterate_devices, + .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .postsuspend = raid_postsuspend, + .resume = raid_resume, +}; + +static int __init dm_raid_init(void) +{ + return dm_register_target(&raid_target); +} + +static void __exit dm_raid_exit(void) +{ + dm_unregister_target(&raid_target); +} + +module_init(dm_raid_init); +module_exit(dm_raid_exit); + +MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); +MODULE_ALIAS("dm-raid4"); +MODULE_ALIAS("dm-raid5"); +MODULE_ALIAS("dm-raid6"); +MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c new file mode 100644 index 00000000..d039de83 --- /dev/null +++ b/drivers/md/dm-raid1.c @@ -0,0 +1,1470 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-bio-record.h" + +#include <linux/init.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/device-mapper.h> +#include <linux/dm-io.h> +#include <linux/dm-dirty-log.h> +#include <linux/dm-kcopyd.h> +#include <linux/dm-region-hash.h> + +#define DM_MSG_PREFIX "raid1" + +#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ + +#define DM_RAID1_HANDLE_ERRORS 0x01 +#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) + +static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); + +/*----------------------------------------------------------------- + * Mirror set structures. 
+ *---------------------------------------------------------------*/ +enum dm_raid1_error { + DM_RAID1_WRITE_ERROR, + DM_RAID1_FLUSH_ERROR, + DM_RAID1_SYNC_ERROR, + DM_RAID1_READ_ERROR +}; + +struct mirror { + struct mirror_set *ms; + atomic_t error_count; + unsigned long error_type; + struct dm_dev *dev; + sector_t offset; +}; + +struct mirror_set { + struct dm_target *ti; + struct list_head list; + + uint64_t features; + + spinlock_t lock; /* protects the lists */ + struct bio_list reads; + struct bio_list writes; + struct bio_list failures; + struct bio_list holds; /* bios are waiting until suspend */ + + struct dm_region_hash *rh; + struct dm_kcopyd_client *kcopyd_client; + struct dm_io_client *io_client; + mempool_t *read_record_pool; + + /* recovery */ + region_t nr_regions; + int in_sync; + int log_failure; + int leg_failure; + atomic_t suspend; + + atomic_t default_mirror; /* Default mirror */ + + struct workqueue_struct *kmirrord_wq; + struct work_struct kmirrord_work; + struct timer_list timer; + unsigned long timer_pending; + + struct work_struct trigger_event; + + unsigned nr_mirrors; + struct mirror mirror[0]; +}; + +static void wakeup_mirrord(void *context) +{ + struct mirror_set *ms = context; + + queue_work(ms->kmirrord_wq, &ms->kmirrord_work); +} + +static void delayed_wake_fn(unsigned long data) +{ + struct mirror_set *ms = (struct mirror_set *) data; + + clear_bit(0, &ms->timer_pending); + wakeup_mirrord(ms); +} + +static void delayed_wake(struct mirror_set *ms) +{ + if (test_and_set_bit(0, &ms->timer_pending)) + return; + + ms->timer.expires = jiffies + HZ / 5; + ms->timer.data = (unsigned long) ms; + ms->timer.function = delayed_wake_fn; + add_timer(&ms->timer); +} + +static void wakeup_all_recovery_waiters(void *context) +{ + wake_up_all(&_kmirrord_recovery_stopped); +} + +static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) +{ + unsigned long flags; + int should_wake = 0; + struct bio_list *bl; + + bl = (rw == WRITE) ? &ms->writes : &ms->reads; + spin_lock_irqsave(&ms->lock, flags); + should_wake = !(bl->head); + bio_list_add(bl, bio); + spin_unlock_irqrestore(&ms->lock, flags); + + if (should_wake) + wakeup_mirrord(ms); +} + +static void dispatch_bios(void *context, struct bio_list *bio_list) +{ + struct mirror_set *ms = context; + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + queue_bio(ms, bio, WRITE); +} + +#define MIN_READ_RECORDS 20 +struct dm_raid1_read_record { + struct mirror *m; + struct dm_bio_details details; +}; + +static struct kmem_cache *_dm_raid1_read_record_cache; + +/* + * Every mirror should look like this one. + */ +#define DEFAULT_MIRROR 0 + +/* + * This is yucky. We squirrel the mirror struct away inside + * bi_next for read/write buffers. This is safe since the bh + * doesn't get submitted to the lower levels of block layer. 
+ */ +static struct mirror *bio_get_m(struct bio *bio) +{ + return (struct mirror *) bio->bi_next; +} + +static void bio_set_m(struct bio *bio, struct mirror *m) +{ + bio->bi_next = (struct bio *) m; +} + +static struct mirror *get_default_mirror(struct mirror_set *ms) +{ + return &ms->mirror[atomic_read(&ms->default_mirror)]; +} + +static void set_default_mirror(struct mirror *m) +{ + struct mirror_set *ms = m->ms; + struct mirror *m0 = &(ms->mirror[0]); + + atomic_set(&ms->default_mirror, m - m0); +} + +static struct mirror *get_valid_mirror(struct mirror_set *ms) +{ + struct mirror *m; + + for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) + if (!atomic_read(&m->error_count)) + return m; + + return NULL; +} + +/* fail_mirror + * @m: mirror device to fail + * @error_type: one of the enum's, DM_RAID1_*_ERROR + * + * If errors are being handled, record the type of + * error encountered for this device. If this type + * of error has already been recorded, we can return; + * otherwise, we must signal userspace by triggering + * an event. Additionally, if the device is the + * primary device, we must choose a new primary, but + * only if the mirror is in-sync. + * + * This function must not block. + */ +static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) +{ + struct mirror_set *ms = m->ms; + struct mirror *new; + + ms->leg_failure = 1; + + /* + * error_count is used for nothing more than a + * simple way to tell if a device has encountered + * errors. + */ + atomic_inc(&m->error_count); + + if (test_and_set_bit(error_type, &m->error_type)) + return; + + if (!errors_handled(ms)) + return; + + if (m != get_default_mirror(ms)) + goto out; + + if (!ms->in_sync) { + /* + * Better to issue requests to same failing device + * than to risk returning corrupt data. + */ + DMERR("Primary mirror (%s) failed while out-of-sync: " + "Reads may fail.", m->dev->name); + goto out; + } + + new = get_valid_mirror(ms); + if (new) + set_default_mirror(new); + else + DMWARN("All sides of mirror have failed."); + +out: + schedule_work(&ms->trigger_event); +} + +static int mirror_flush(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + unsigned long error_bits; + + unsigned int i; + struct dm_io_region io[ms->nr_mirrors]; + struct mirror *m; + struct dm_io_request io_req = { + .bi_rw = WRITE_FLUSH, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = ms->io_client, + }; + + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) { + io[i].bdev = m->dev->bdev; + io[i].sector = 0; + io[i].count = 0; + } + + error_bits = -1; + dm_io(&io_req, ms->nr_mirrors, io, &error_bits); + if (unlikely(error_bits != 0)) { + for (i = 0; i < ms->nr_mirrors; i++) + if (test_bit(i, &error_bits)) + fail_mirror(ms->mirror + i, + DM_RAID1_FLUSH_ERROR); + return -EIO; + } + + return 0; +} + +/*----------------------------------------------------------------- + * Recovery. + * + * When a mirror is first activated we may find that some regions + * are in the no-sync state. We have to recover these by + * recopying from the default mirror to all the others. + *---------------------------------------------------------------*/ +static void recovery_complete(int read_err, unsigned long write_err, + void *context) +{ + struct dm_region *reg = context; + struct mirror_set *ms = dm_rh_region_context(reg); + int m, bit = 0; + + if (read_err) { + /* Read error means the failure of default mirror. 
*/ + DMERR_LIMIT("Unable to read primary mirror during recovery"); + fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); + } + + if (write_err) { + DMERR_LIMIT("Write error during recovery (error = 0x%lx)", + write_err); + /* + * Bits correspond to devices (excluding default mirror). + * The default mirror cannot change during recovery. + */ + for (m = 0; m < ms->nr_mirrors; m++) { + if (&ms->mirror[m] == get_default_mirror(ms)) + continue; + if (test_bit(bit, &write_err)) + fail_mirror(ms->mirror + m, + DM_RAID1_SYNC_ERROR); + bit++; + } + } + + dm_rh_recovery_end(reg, !(read_err || write_err)); +} + +static int recover(struct mirror_set *ms, struct dm_region *reg) +{ + int r; + unsigned i; + struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; + struct mirror *m; + unsigned long flags = 0; + region_t key = dm_rh_get_region_key(reg); + sector_t region_size = dm_rh_get_region_size(ms->rh); + + /* fill in the source */ + m = get_default_mirror(ms); + from.bdev = m->dev->bdev; + from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); + if (key == (ms->nr_regions - 1)) { + /* + * The final region may be smaller than + * region_size. + */ + from.count = ms->ti->len & (region_size - 1); + if (!from.count) + from.count = region_size; + } else + from.count = region_size; + + /* fill in the destinations */ + for (i = 0, dest = to; i < ms->nr_mirrors; i++) { + if (&ms->mirror[i] == get_default_mirror(ms)) + continue; + + m = ms->mirror + i; + dest->bdev = m->dev->bdev; + dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); + dest->count = from.count; + dest++; + } + + /* hand to kcopyd */ + if (!errors_handled(ms)) + set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); + + r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, + flags, recovery_complete, reg); + + return r; +} + +static void do_recovery(struct mirror_set *ms) +{ + struct dm_region *reg; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + int r; + + /* + * Start quiescing some regions. + */ + dm_rh_recovery_prepare(ms->rh); + + /* + * Copy any already quiesced regions. + */ + while ((reg = dm_rh_recovery_start(ms->rh))) { + r = recover(ms, reg); + if (r) + dm_rh_recovery_end(reg, 0); + } + + /* + * Update the in sync flag. + */ + if (!ms->in_sync && + (log->type->get_sync_count(log) == ms->nr_regions)) { + /* the sync is complete */ + dm_table_event(ms->ti->table); + ms->in_sync = 1; + } +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) +{ + struct mirror *m = get_default_mirror(ms); + + do { + if (likely(!atomic_read(&m->error_count))) + return m; + + if (m-- == ms->mirror) + m += ms->nr_mirrors; + } while (m != get_default_mirror(ms)); + + return NULL; +} + +static int default_ok(struct mirror *m) +{ + struct mirror *default_mirror = get_default_mirror(m->ms); + + return !atomic_read(&default_mirror->error_count); +} + +static int mirror_available(struct mirror_set *ms, struct bio *bio) +{ + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->in_sync(log, region, 0)) + return choose_mirror(ms, bio->bi_sector) ? 1 : 0; + + return 0; +} + +/* + * remap a buffer to a particular mirror. 
+ */ +static sector_t map_sector(struct mirror *m, struct bio *bio) +{ + if (unlikely(!bio->bi_size)) + return 0; + return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); +} + +static void map_bio(struct mirror *m, struct bio *bio) +{ + bio->bi_bdev = m->dev->bdev; + bio->bi_sector = map_sector(m, bio); +} + +static void map_region(struct dm_io_region *io, struct mirror *m, + struct bio *bio) +{ + io->bdev = m->dev->bdev; + io->sector = map_sector(m, bio); + io->count = bio->bi_size >> 9; +} + +static void hold_bio(struct mirror_set *ms, struct bio *bio) +{ + /* + * Lock is required to avoid race condition during suspend + * process. + */ + spin_lock_irq(&ms->lock); + + if (atomic_read(&ms->suspend)) { + spin_unlock_irq(&ms->lock); + + /* + * If device is suspended, complete the bio. + */ + if (dm_noflush_suspending(ms->ti)) + bio_endio(bio, DM_ENDIO_REQUEUE); + else + bio_endio(bio, -EIO); + return; + } + + /* + * Hold bio until the suspend is complete. + */ + bio_list_add(&ms->holds, bio); + spin_unlock_irq(&ms->lock); +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static void read_callback(unsigned long error, void *context) +{ + struct bio *bio = context; + struct mirror *m; + + m = bio_get_m(bio); + bio_set_m(bio, NULL); + + if (likely(!error)) { + bio_endio(bio, 0); + return; + } + + fail_mirror(m, DM_RAID1_READ_ERROR); + + if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { + DMWARN_LIMIT("Read failure on mirror device %s. " + "Trying alternative device.", + m->dev->name); + queue_bio(m->ms, bio, bio_rw(bio)); + return; + } + + DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", + m->dev->name); + bio_endio(bio, -EIO); +} + +/* Asynchronous read. */ +static void read_async_bio(struct mirror *m, struct bio *bio) +{ + struct dm_io_region io; + struct dm_io_request io_req = { + .bi_rw = READ, + .mem.type = DM_IO_BVEC, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .notify.fn = read_callback, + .notify.context = bio, + .client = m->ms->io_client, + }; + + map_region(&io, m, bio); + bio_set_m(bio, m); + BUG_ON(dm_io(&io_req, 1, &io, NULL)); +} + +static inline int region_in_sync(struct mirror_set *ms, region_t region, + int may_block) +{ + int state = dm_rh_get_state(ms->rh, region, may_block); + return state == DM_RH_CLEAN || state == DM_RH_DIRTY; +} + +static void do_reads(struct mirror_set *ms, struct bio_list *reads) +{ + region_t region; + struct bio *bio; + struct mirror *m; + + while ((bio = bio_list_pop(reads))) { + region = dm_rh_bio_to_region(ms->rh, bio); + m = get_default_mirror(ms); + + /* + * We can only read balance if the region is in sync. + */ + if (likely(region_in_sync(ms, region, 1))) + m = choose_mirror(ms, bio->bi_sector); + else if (m && atomic_read(&m->error_count)) + m = NULL; + + if (likely(m)) + read_async_bio(m, bio); + else + bio_endio(bio, -EIO); + } +} + +/*----------------------------------------------------------------- + * Writes. 
+ * + * We do different things with the write io depending on the + * state of the region that it's in: + * + * SYNC: increment pending, use kcopyd to write to *all* mirrors + * RECOVERING: delay the io until recovery completes + * NOSYNC: increment pending, just write to the default mirror + *---------------------------------------------------------------*/ + + +static void write_callback(unsigned long error, void *context) +{ + unsigned i, ret = 0; + struct bio *bio = (struct bio *) context; + struct mirror_set *ms; + int should_wake = 0; + unsigned long flags; + + ms = bio_get_m(bio)->ms; + bio_set_m(bio, NULL); + + /* + * NOTE: We don't decrement the pending count here, + * instead it is done by the targets endio function. + * This way we handle both writes to SYNC and NOSYNC + * regions with the same code. + */ + if (likely(!error)) { + bio_endio(bio, ret); + return; + } + + for (i = 0; i < ms->nr_mirrors; i++) + if (test_bit(i, &error)) + fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); + + /* + * Need to raise event. Since raising + * events can block, we need to do it in + * the main thread. + */ + spin_lock_irqsave(&ms->lock, flags); + if (!ms->failures.head) + should_wake = 1; + bio_list_add(&ms->failures, bio); + spin_unlock_irqrestore(&ms->lock, flags); + if (should_wake) + wakeup_mirrord(ms); +} + +static void do_write(struct mirror_set *ms, struct bio *bio) +{ + unsigned int i; + struct dm_io_region io[ms->nr_mirrors], *dest = io; + struct mirror *m; + struct dm_io_request io_req = { + .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), + .mem.type = DM_IO_BVEC, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .notify.fn = write_callback, + .notify.context = bio, + .client = ms->io_client, + }; + + if (bio->bi_rw & REQ_DISCARD) { + io_req.bi_rw |= REQ_DISCARD; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = NULL; + } + + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) + map_region(dest++, m, bio); + + /* + * Use default mirror because we only need it to retrieve the reference + * to the mirror set in write_callback(). + */ + bio_set_m(bio, get_default_mirror(ms)); + + BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); +} + +static void do_writes(struct mirror_set *ms, struct bio_list *writes) +{ + int state; + struct bio *bio; + struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region; + + if (!writes->head) + return; + + /* + * Classify each write. 
+ */ + bio_list_init(&sync); + bio_list_init(&nosync); + bio_list_init(&recover); + bio_list_init(&requeue); + + while ((bio = bio_list_pop(writes))) { + if ((bio->bi_rw & REQ_FLUSH) || + (bio->bi_rw & REQ_DISCARD)) { + bio_list_add(&sync, bio); + continue; + } + + region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->is_remote_recovering && + log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = dm_rh_get_state(ms->rh, region, 1); + switch (state) { + case DM_RH_CLEAN: + case DM_RH_DIRTY: + this_list = &sync; + break; + + case DM_RH_NOSYNC: + this_list = &nosync; + break; + + case DM_RH_RECOVERING: + this_list = &recover; + break; + } + + bio_list_add(this_list, bio); + } + + /* + * Add bios that are delayed due to remote recovery + * back on to the write queue + */ + if (unlikely(requeue.head)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->writes, &requeue); + spin_unlock_irq(&ms->lock); + delayed_wake(ms); + } + + /* + * Increment the pending counts for any regions that will + * be written to (writes to recover regions are going to + * be delayed). + */ + dm_rh_inc_pending(ms->rh, &sync); + dm_rh_inc_pending(ms->rh, &nosync); + + /* + * If the flush fails on a previous call and succeeds here, + * we must not reset the log_failure variable. We need + * userspace interaction to do that. + */ + ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure; + + /* + * Dispatch io. + */ + if (unlikely(ms->log_failure) && errors_handled(ms)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->failures, &sync); + spin_unlock_irq(&ms->lock); + wakeup_mirrord(ms); + } else + while ((bio = bio_list_pop(&sync))) + do_write(ms, bio); + + while ((bio = bio_list_pop(&recover))) + dm_rh_delay(ms->rh, bio); + + while ((bio = bio_list_pop(&nosync))) { + if (unlikely(ms->leg_failure) && errors_handled(ms)) { + spin_lock_irq(&ms->lock); + bio_list_add(&ms->failures, bio); + spin_unlock_irq(&ms->lock); + wakeup_mirrord(ms); + } else { + map_bio(get_default_mirror(ms), bio); + generic_make_request(bio); + } + } +} + +static void do_failures(struct mirror_set *ms, struct bio_list *failures) +{ + struct bio *bio; + + if (likely(!failures->head)) + return; + + /* + * If the log has failed, unattempted writes are being + * put on the holds list. We can't issue those writes + * until a log has been marked, so we must store them. + * + * If a 'noflush' suspend is in progress, we can requeue + * the I/O's to the core. This give userspace a chance + * to reconfigure the mirror, at which point the core + * will reissue the writes. If the 'noflush' flag is + * not set, we have no choice but to return errors. + * + * Some writes on the failures list may have been + * submitted before the log failure and represent a + * failure to write to one of the devices. It is ok + * for us to treat them the same and requeue them + * as well. + */ + while ((bio = bio_list_pop(failures))) { + if (!ms->log_failure) { + ms->in_sync = 0; + dm_rh_mark_nosync(ms->rh, bio); + } + + /* + * If all the legs are dead, fail the I/O. + * If we have been told to handle errors, hold the bio + * and wait for userspace to deal with the problem. + * Otherwise pretend that the I/O succeeded. (This would + * be wrong if the failed leg returned after reboot and + * got replicated back to the good legs.) 
+ */ + if (!get_valid_mirror(ms)) + bio_endio(bio, -EIO); + else if (errors_handled(ms)) + hold_bio(ms, bio); + else + bio_endio(bio, 0); + } +} + +static void trigger_event(struct work_struct *work) +{ + struct mirror_set *ms = + container_of(work, struct mirror_set, trigger_event); + + dm_table_event(ms->ti->table); +} + +/*----------------------------------------------------------------- + * kmirrord + *---------------------------------------------------------------*/ +static void do_mirror(struct work_struct *work) +{ + struct mirror_set *ms = container_of(work, struct mirror_set, + kmirrord_work); + struct bio_list reads, writes, failures; + unsigned long flags; + + spin_lock_irqsave(&ms->lock, flags); + reads = ms->reads; + writes = ms->writes; + failures = ms->failures; + bio_list_init(&ms->reads); + bio_list_init(&ms->writes); + bio_list_init(&ms->failures); + spin_unlock_irqrestore(&ms->lock, flags); + + dm_rh_update_states(ms->rh, errors_handled(ms)); + do_recovery(ms); + do_reads(ms, &reads); + do_writes(ms, &writes); + do_failures(ms, &failures); +} + +/*----------------------------------------------------------------- + * Target functions + *---------------------------------------------------------------*/ +static struct mirror_set *alloc_context(unsigned int nr_mirrors, + uint32_t region_size, + struct dm_target *ti, + struct dm_dirty_log *dl) +{ + size_t len; + struct mirror_set *ms = NULL; + + len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); + + ms = kzalloc(len, GFP_KERNEL); + if (!ms) { + ti->error = "Cannot allocate mirror context"; + return NULL; + } + + spin_lock_init(&ms->lock); + bio_list_init(&ms->reads); + bio_list_init(&ms->writes); + bio_list_init(&ms->failures); + bio_list_init(&ms->holds); + + ms->ti = ti; + ms->nr_mirrors = nr_mirrors; + ms->nr_regions = dm_sector_div_up(ti->len, region_size); + ms->in_sync = 0; + ms->log_failure = 0; + ms->leg_failure = 0; + atomic_set(&ms->suspend, 0); + atomic_set(&ms->default_mirror, DEFAULT_MIRROR); + + ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS, + _dm_raid1_read_record_cache); + + if (!ms->read_record_pool) { + ti->error = "Error creating mirror read_record_pool"; + kfree(ms); + return NULL; + } + + ms->io_client = dm_io_client_create(); + if (IS_ERR(ms->io_client)) { + ti->error = "Error creating dm_io client"; + mempool_destroy(ms->read_record_pool); + kfree(ms); + return NULL; + } + + ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, + wakeup_all_recovery_waiters, + ms->ti->begin, MAX_RECOVERY, + dl, region_size, ms->nr_regions); + if (IS_ERR(ms->rh)) { + ti->error = "Error creating dirty region hash"; + dm_io_client_destroy(ms->io_client); + mempool_destroy(ms->read_record_pool); + kfree(ms); + return NULL; + } + + return ms; +} + +static void free_context(struct mirror_set *ms, struct dm_target *ti, + unsigned int m) +{ + while (m--) + dm_put_device(ti, ms->mirror[m].dev); + + dm_io_client_destroy(ms->io_client); + dm_region_hash_destroy(ms->rh); + mempool_destroy(ms->read_record_pool); + kfree(ms); +} + +static int get_mirror(struct mirror_set *ms, struct dm_target *ti, + unsigned int mirror, char **argv) +{ + unsigned long long offset; + char dummy; + + if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { + ti->error = "Invalid offset"; + return -EINVAL; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &ms->mirror[mirror].dev)) { + ti->error = "Device lookup failure"; + return -ENXIO; + } + + ms->mirror[mirror].ms = ms; + 
atomic_set(&(ms->mirror[mirror].error_count), 0);
+	ms->mirror[mirror].error_type = 0;
+	ms->mirror[mirror].offset = offset;
+
+	return 0;
+}
+
+/*
+ * Create dirty log: log_type #log_params <log_params>
+ */
+static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
+					     unsigned argc, char **argv,
+					     unsigned *args_used)
+{
+	unsigned param_count;
+	struct dm_dirty_log *dl;
+	char dummy;
+
+	if (argc < 2) {
+		ti->error = "Insufficient mirror log arguments";
+		return NULL;
+	}
+
+	if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) {
+		ti->error = "Invalid mirror log argument count";
+		return NULL;
+	}
+
+	*args_used = 2 + param_count;
+
+	if (argc < *args_used) {
+		ti->error = "Insufficient mirror log arguments";
+		return NULL;
+	}
+
+	dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
+				 argv + 2);
+	if (!dl) {
+		ti->error = "Error creating mirror dirty log";
+		return NULL;
+	}
+
+	return dl;
+}
+
+static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
+			  unsigned *args_used)
+{
+	unsigned num_features;
+	struct dm_target *ti = ms->ti;
+	char dummy;
+
+	*args_used = 0;
+
+	if (!argc)
+		return 0;
+
+	if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) {
+		ti->error = "Invalid number of features";
+		return -EINVAL;
+	}
+
+	argc--;
+	argv++;
+	(*args_used)++;
+
+	if (num_features > argc) {
+		ti->error = "Not enough arguments to support feature count";
+		return -EINVAL;
+	}
+
+	if (!strcmp("handle_errors", argv[0]))
+		ms->features |= DM_RAID1_HANDLE_ERRORS;
+	else {
+		ti->error = "Unrecognised feature requested";
+		return -EINVAL;
+	}
+
+	(*args_used)++;
+
+	return 0;
+}
+
+/*
+ * Construct a mirror mapping:
+ *
+ * log_type #log_params <log_params>
+ * #mirrors [mirror_path offset]{2,}
+ * [#features <features>]
+ *
+ * log_type is "core" or "disk"
+ * #log_params is between 1 and 3
+ *
+ * If present, features must be "handle_errors".
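+ *
+ * For illustration only (the device paths are placeholders): a two-way
+ * mirror using a "core" log with a 1024-sector region size and error
+ * handling enabled would take the target parameters
+ *
+ *   core 1 1024 2 /dev/sda1 0 /dev/sdb1 0 1 handle_errors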
+ */ +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned int nr_mirrors, m, args_used; + struct mirror_set *ms; + struct dm_dirty_log *dl; + char dummy; + + dl = create_dirty_log(ti, argc, argv, &args_used); + if (!dl) + return -EINVAL; + + argv += args_used; + argc -= args_used; + + if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || + nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { + ti->error = "Invalid number of mirrors"; + dm_dirty_log_destroy(dl); + return -EINVAL; + } + + argv++, argc--; + + if (argc < nr_mirrors * 2) { + ti->error = "Too few mirror arguments"; + dm_dirty_log_destroy(dl); + return -EINVAL; + } + + ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); + if (!ms) { + dm_dirty_log_destroy(dl); + return -ENOMEM; + } + + /* Get the mirror parameter sets */ + for (m = 0; m < nr_mirrors; m++) { + r = get_mirror(ms, ti, m, argv); + if (r) { + free_context(ms, ti, m); + return r; + } + argv += 2; + argc -= 2; + } + + ti->private = ms; + ti->split_io = dm_rh_get_region_size(ms->rh); + ti->num_flush_requests = 1; + ti->num_discard_requests = 1; + + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); + if (!ms->kmirrord_wq) { + DMERR("couldn't start kmirrord"); + r = -ENOMEM; + goto err_free_context; + } + INIT_WORK(&ms->kmirrord_work, do_mirror); + init_timer(&ms->timer); + ms->timer_pending = 0; + INIT_WORK(&ms->trigger_event, trigger_event); + + r = parse_features(ms, argc, argv, &args_used); + if (r) + goto err_destroy_wq; + + argv += args_used; + argc -= args_used; + + /* + * Any read-balancing addition depends on the + * DM_RAID1_HANDLE_ERRORS flag being present. + * This is because the decision to balance depends + * on the sync state of a region. If the above + * flag is not present, we ignore errors; and + * the sync state may be inaccurate. + */ + + if (argc) { + ti->error = "Too many mirror arguments"; + r = -EINVAL; + goto err_destroy_wq; + } + + ms->kcopyd_client = dm_kcopyd_client_create(); + if (IS_ERR(ms->kcopyd_client)) { + r = PTR_ERR(ms->kcopyd_client); + goto err_destroy_wq; + } + + wakeup_mirrord(ms); + return 0; + +err_destroy_wq: + destroy_workqueue(ms->kmirrord_wq); +err_free_context: + free_context(ms, ti, ms->nr_mirrors); + return r; +} + +static void mirror_dtr(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + + del_timer_sync(&ms->timer); + flush_workqueue(ms->kmirrord_wq); + flush_work_sync(&ms->trigger_event); + dm_kcopyd_client_destroy(ms->kcopyd_client); + destroy_workqueue(ms->kmirrord_wq); + free_context(ms, ti, ms->nr_mirrors); +} + +/* + * Mirror mapping function + */ +static int mirror_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + int r, rw = bio_rw(bio); + struct mirror *m; + struct mirror_set *ms = ti->private; + struct dm_raid1_read_record *read_record = NULL; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + if (rw == WRITE) { + /* Save region for mirror_end_io() handler */ + map_context->ll = dm_rh_bio_to_region(ms->rh, bio); + queue_bio(ms, bio, rw); + return DM_MAPIO_SUBMITTED; + } + + r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); + if (r < 0 && r != -EWOULDBLOCK) + return r; + + /* + * If region is not in-sync queue the bio. 
+ */ + if (!r || (r == -EWOULDBLOCK)) { + if (rw == READA) + return -EWOULDBLOCK; + + queue_bio(ms, bio, rw); + return DM_MAPIO_SUBMITTED; + } + + /* + * The region is in-sync and we can perform reads directly. + * Store enough information so we can retry if it fails. + */ + m = choose_mirror(ms, bio->bi_sector); + if (unlikely(!m)) + return -EIO; + + read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); + if (likely(read_record)) { + dm_bio_record(&read_record->details, bio); + map_context->ptr = read_record; + read_record->m = m; + } + + map_bio(m, bio); + + return DM_MAPIO_REMAPPED; +} + +static int mirror_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + int rw = bio_rw(bio); + struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror *m = NULL; + struct dm_bio_details *bd = NULL; + struct dm_raid1_read_record *read_record = map_context->ptr; + + /* + * We need to dec pending if this was a write. + */ + if (rw == WRITE) { + if (!(bio->bi_rw & REQ_FLUSH)) + dm_rh_dec(ms->rh, map_context->ll); + return error; + } + + if (error == -EOPNOTSUPP) + goto out; + + if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) + goto out; + + if (unlikely(error)) { + if (!read_record) { + /* + * There wasn't enough memory to record necessary + * information for a retry or there was no other + * mirror in-sync. + */ + DMERR_LIMIT("Mirror read failed."); + return -EIO; + } + + m = read_record->m; + + DMERR("Mirror read failed from %s. Trying alternative device.", + m->dev->name); + + fail_mirror(m, DM_RAID1_READ_ERROR); + + /* + * A failed read is requeued for another attempt using an intact + * mirror. + */ + if (default_ok(m) || mirror_available(ms, bio)) { + bd = &read_record->details; + + dm_bio_restore(bd, bio); + mempool_free(read_record, ms->read_record_pool); + map_context->ptr = NULL; + queue_bio(ms, bio, rw); + return 1; + } + DMERR("All replicated volumes dead, failing I/O"); + } + +out: + if (read_record) { + mempool_free(read_record, ms->read_record_pool); + map_context->ptr = NULL; + } + + return error; +} + +static void mirror_presuspend(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + struct bio_list holds; + struct bio *bio; + + atomic_set(&ms->suspend, 1); + + /* + * Process bios in the hold list to start recovery waiting + * for bios in the hold list. After the process, no bio has + * a chance to be added in the hold list because ms->suspend + * is set. + */ + spin_lock_irq(&ms->lock); + holds = ms->holds; + bio_list_init(&ms->holds); + spin_unlock_irq(&ms->lock); + + while ((bio = bio_list_pop(&holds))) + hold_bio(ms, bio); + + /* + * We must finish up all the work that we've + * generated (i.e. recovery work). + */ + dm_rh_stop_recovery(ms->rh); + + wait_event(_kmirrord_recovery_stopped, + !dm_rh_recovery_in_flight(ms->rh)); + + if (log->type->presuspend && log->type->presuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log presuspend failed"); + + /* + * Now that recovery is complete/stopped and the + * delayed bios are queued, we need to wait for + * the worker thread to complete. This way, + * we know that all of our I/O has been pushed. 
+ */ + flush_workqueue(ms->kmirrord_wq); +} + +static void mirror_postsuspend(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + if (log->type->postsuspend && log->type->postsuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log postsuspend failed"); +} + +static void mirror_resume(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + atomic_set(&ms->suspend, 0); + if (log->type->resume && log->type->resume(log)) + /* FIXME: need better error handling */ + DMWARN("log resume failed"); + dm_rh_start_recovery(ms->rh); +} + +/* + * device_status_char + * @m: mirror device/leg we want the status of + * + * We return one character representing the most severe error + * we have encountered. + * A => Alive - No failures + * D => Dead - A write failure occurred leaving mirror out-of-sync + * S => Sync - A sychronization failure occurred, mirror out-of-sync + * R => Read - A read failure occurred, mirror data unaffected + * + * Returns: <char> + */ +static char device_status_char(struct mirror *m) +{ + if (!atomic_read(&(m->error_count))) + return 'A'; + + return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' : + (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : + (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : + (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; +} + + +static int mirror_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + unsigned int m, sz = 0; + struct mirror_set *ms = (struct mirror_set *) ti->private; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + char buffer[ms->nr_mirrors + 1]; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", ms->nr_mirrors); + for (m = 0; m < ms->nr_mirrors; m++) { + DMEMIT("%s ", ms->mirror[m].dev->name); + buffer[m] = device_status_char(&(ms->mirror[m])); + } + buffer[m] = '\0'; + + DMEMIT("%llu/%llu 1 %s ", + (unsigned long long)log->type->get_sync_count(log), + (unsigned long long)ms->nr_regions, buffer); + + sz += log->type->status(log, type, result+sz, maxlen-sz); + + break; + + case STATUSTYPE_TABLE: + sz = log->type->status(log, type, result, maxlen); + + DMEMIT("%d", ms->nr_mirrors); + for (m = 0; m < ms->nr_mirrors; m++) + DMEMIT(" %s %llu", ms->mirror[m].dev->name, + (unsigned long long)ms->mirror[m].offset); + + if (ms->features & DM_RAID1_HANDLE_ERRORS) + DMEMIT(" 1 handle_errors"); + } + + return 0; +} + +static int mirror_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct mirror_set *ms = ti->private; + int ret = 0; + unsigned i; + + for (i = 0; !ret && i < ms->nr_mirrors; i++) + ret = fn(ti, ms->mirror[i].dev, + ms->mirror[i].offset, ti->len, data); + + return ret; +} + +static struct target_type mirror_target = { + .name = "mirror", + .version = {1, 12, 1}, + .module = THIS_MODULE, + .ctr = mirror_ctr, + .dtr = mirror_dtr, + .map = mirror_map, + .end_io = mirror_end_io, + .presuspend = mirror_presuspend, + .postsuspend = mirror_postsuspend, + .resume = mirror_resume, + .status = mirror_status, + .iterate_devices = mirror_iterate_devices, +}; + +static int __init dm_mirror_init(void) +{ + int r; + + _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0); + if (!_dm_raid1_read_record_cache) { + DMERR("Can't allocate dm_raid1_read_record cache"); + r = -ENOMEM; + goto bad_cache; + } + + r = dm_register_target(&mirror_target); + if (r < 0) { 
+ DMERR("Failed to register mirror target"); + goto bad_target; + } + + return 0; + +bad_target: + kmem_cache_destroy(_dm_raid1_read_record_cache); +bad_cache: + return r; +} + +static void __exit dm_mirror_exit(void) +{ + dm_unregister_target(&mirror_target); + kmem_cache_destroy(_dm_raid1_read_record_cache); +} + +/* Module hooks */ +module_init(dm_mirror_init); +module_exit(dm_mirror_exit); + +MODULE_DESCRIPTION(DM_NAME " mirror target"); +MODULE_AUTHOR("Joe Thornber"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c new file mode 100644 index 00000000..7771ed21 --- /dev/null +++ b/drivers/md/dm-region-hash.c @@ -0,0 +1,720 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include <linux/dm-dirty-log.h> +#include <linux/dm-region-hash.h> + +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include "dm.h" + +#define DM_MSG_PREFIX "region hash" + +/*----------------------------------------------------------------- + * Region hash + * + * The mirror splits itself up into discrete regions. Each + * region can be in one of three states: clean, dirty, + * nosync. There is no need to put clean regions in the hash. + * + * In addition to being present in the hash table a region _may_ + * be present on one of three lists. + * + * clean_regions: Regions on this list have no io pending to + * them, they are in sync, we are no longer interested in them, + * they are dull. dm_rh_update_states() will remove them from the + * hash table. + * + * quiesced_regions: These regions have been spun down, ready + * for recovery. rh_recovery_start() will remove regions from + * this list and hand them to kmirrord, which will schedule the + * recovery io with kcopyd. + * + * recovered_regions: Regions that kcopyd has successfully + * recovered. dm_rh_update_states() will now schedule any delayed + * io, up the recovery_count, and remove the region from the + * hash. + * + * There are 2 locks: + * A rw spin lock 'hash_lock' protects just the hash table, + * this is never held in write mode from interrupt context, + * which I believe means that we only have to disable irqs when + * doing a write lock. + * + * An ordinary spin lock 'region_lock' that protects the three + * lists in the region_hash, with the 'state', 'list' and + * 'delayed_bios' fields of the regions. This is used from irq + * context, so all other uses will have to suspend local irqs. + *---------------------------------------------------------------*/ +struct dm_region_hash { + uint32_t region_size; + unsigned region_shift; + + /* holds persistent region state */ + struct dm_dirty_log *log; + + /* hash table */ + rwlock_t hash_lock; + mempool_t *region_pool; + unsigned mask; + unsigned nr_buckets; + unsigned prime; + unsigned shift; + struct list_head *buckets; + + unsigned max_recovery; /* Max # of regions to recover in parallel */ + + spinlock_t region_lock; + atomic_t recovery_in_flight; + struct semaphore recovery_count; + struct list_head clean_regions; + struct list_head quiesced_regions; + struct list_head recovered_regions; + struct list_head failed_recovered_regions; + + /* + * If there was a flush failure no regions can be marked clean. 
+ */ + int flush_failure; + + void *context; + sector_t target_begin; + + /* Callback function to schedule bios writes */ + void (*dispatch_bios)(void *context, struct bio_list *bios); + + /* Callback function to wakeup callers worker thread. */ + void (*wakeup_workers)(void *context); + + /* Callback function to wakeup callers recovery waiters. */ + void (*wakeup_all_recovery_waiters)(void *context); +}; + +struct dm_region { + struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ + region_t key; + int state; + + struct list_head hash_list; + struct list_head list; + + atomic_t pending; + struct bio_list delayed_bios; +}; + +/* + * Conversion fns + */ +static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) +{ + return sector >> rh->region_shift; +} + +sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) +{ + return region << rh->region_shift; +} +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); + +region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) +{ + return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); +} +EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); + +void *dm_rh_region_context(struct dm_region *reg) +{ + return reg->rh->context; +} +EXPORT_SYMBOL_GPL(dm_rh_region_context); + +region_t dm_rh_get_region_key(struct dm_region *reg) +{ + return reg->key; +} +EXPORT_SYMBOL_GPL(dm_rh_get_region_key); + +sector_t dm_rh_get_region_size(struct dm_region_hash *rh) +{ + return rh->region_size; +} +EXPORT_SYMBOL_GPL(dm_rh_get_region_size); + +/* + * FIXME: shall we pass in a structure instead of all these args to + * dm_region_hash_create()???? + */ +#define RH_HASH_MULT 2654435387U +#define RH_HASH_SHIFT 12 + +#define MIN_REGIONS 64 +struct dm_region_hash *dm_region_hash_create( + void *context, void (*dispatch_bios)(void *context, + struct bio_list *bios), + void (*wakeup_workers)(void *context), + void (*wakeup_all_recovery_waiters)(void *context), + sector_t target_begin, unsigned max_recovery, + struct dm_dirty_log *log, uint32_t region_size, + region_t nr_regions) +{ + struct dm_region_hash *rh; + unsigned nr_buckets, max_buckets; + size_t i; + + /* + * Calculate a suitable number of buckets for our hash + * table. 
+ */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + + rh = kmalloc(sizeof(*rh), GFP_KERNEL); + if (!rh) { + DMERR("unable to allocate region hash memory"); + return ERR_PTR(-ENOMEM); + } + + rh->context = context; + rh->dispatch_bios = dispatch_bios; + rh->wakeup_workers = wakeup_workers; + rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; + rh->target_begin = target_begin; + rh->max_recovery = max_recovery; + rh->log = log; + rh->region_size = region_size; + rh->region_shift = ffs(region_size) - 1; + rwlock_init(&rh->hash_lock); + rh->mask = nr_buckets - 1; + rh->nr_buckets = nr_buckets; + + rh->shift = RH_HASH_SHIFT; + rh->prime = RH_HASH_MULT; + + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); + if (!rh->buckets) { + DMERR("unable to allocate region hash bucket memory"); + kfree(rh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(rh->buckets + i); + + spin_lock_init(&rh->region_lock); + sema_init(&rh->recovery_count, 0); + atomic_set(&rh->recovery_in_flight, 0); + INIT_LIST_HEAD(&rh->clean_regions); + INIT_LIST_HEAD(&rh->quiesced_regions); + INIT_LIST_HEAD(&rh->recovered_regions); + INIT_LIST_HEAD(&rh->failed_recovered_regions); + rh->flush_failure = 0; + + rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, + sizeof(struct dm_region)); + if (!rh->region_pool) { + vfree(rh->buckets); + kfree(rh); + rh = ERR_PTR(-ENOMEM); + } + + return rh; +} +EXPORT_SYMBOL_GPL(dm_region_hash_create); + +void dm_region_hash_destroy(struct dm_region_hash *rh) +{ + unsigned h; + struct dm_region *reg, *nreg; + + BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { + list_for_each_entry_safe(reg, nreg, rh->buckets + h, + hash_list) { + BUG_ON(atomic_read(®->pending)); + mempool_free(reg, rh->region_pool); + } + } + + if (rh->log) + dm_dirty_log_destroy(rh->log); + + if (rh->region_pool) + mempool_destroy(rh->region_pool); + + vfree(rh->buckets); + kfree(rh); +} +EXPORT_SYMBOL_GPL(dm_region_hash_destroy); + +struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) +{ + return rh->log; +} +EXPORT_SYMBOL_GPL(dm_rh_dirty_log); + +static unsigned rh_hash(struct dm_region_hash *rh, region_t region) +{ + return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; +} + +static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + struct list_head *bucket = rh->buckets + rh_hash(rh, region); + + list_for_each_entry(reg, bucket, hash_list) + if (reg->key == region) + return reg; + + return NULL; +} + +static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) +{ + list_add(®->hash_list, rh->buckets + rh_hash(rh, reg->key)); +} + +static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg, *nreg; + + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); + if (unlikely(!nreg)) + nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); + + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? + DM_RH_CLEAN : DM_RH_NOSYNC; + nreg->rh = rh; + nreg->key = region; + INIT_LIST_HEAD(&nreg->list); + atomic_set(&nreg->pending, 0); + bio_list_init(&nreg->delayed_bios); + + write_lock_irq(&rh->hash_lock); + reg = __rh_lookup(rh, region); + if (reg) + /* We lost the race. 
*/ + mempool_free(nreg, rh->region_pool); + else { + __rh_insert(rh, nreg); + if (nreg->state == DM_RH_CLEAN) { + spin_lock(&rh->region_lock); + list_add(&nreg->list, &rh->clean_regions); + spin_unlock(&rh->region_lock); + } + + reg = nreg; + } + write_unlock_irq(&rh->hash_lock); + + return reg; +} + +static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + + reg = __rh_lookup(rh, region); + if (!reg) { + read_unlock(&rh->hash_lock); + reg = __rh_alloc(rh, region); + read_lock(&rh->hash_lock); + } + + return reg; +} + +int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) +{ + int r; + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (reg) + return reg->state; + + /* + * The region wasn't in the hash, so we fall back to the + * dirty log. + */ + r = rh->log->type->in_sync(rh->log, region, may_block); + + /* + * Any error from the dirty log (eg. -EWOULDBLOCK) gets + * taken as a DM_RH_NOSYNC + */ + return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; +} +EXPORT_SYMBOL_GPL(dm_rh_get_state); + +static void complete_resync_work(struct dm_region *reg, int success) +{ + struct dm_region_hash *rh = reg->rh; + + rh->log->type->set_region_sync(rh->log, reg->key, success); + + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. + */ + rh->dispatch_bios(rh->context, ®->delayed_bios); + if (atomic_dec_and_test(&rh->recovery_in_flight)) + rh->wakeup_all_recovery_waiters(rh->context); + up(&rh->recovery_count); +} + +/* dm_rh_mark_nosync + * @ms + * @bio + * + * The bio was written on some mirror(s) but failed on other mirror(s). + * We can successfully endio the bio but should avoid the region being + * marked clean by setting the state DM_RH_NOSYNC. + * + * This function is _not_ safe in interrupt context! + */ +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) +{ + unsigned long flags; + struct dm_dirty_log *log = rh->log; + struct dm_region *reg; + region_t region = dm_rh_bio_to_region(rh, bio); + int recovering = 0; + + if (bio->bi_rw & REQ_FLUSH) { + rh->flush_failure = 1; + return; + } + + /* We must inform the log that the sync count has changed. */ + log->type->set_region_sync(log, region, 0); + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + /* region hash entry should exist because write was in-flight */ + BUG_ON(!reg); + BUG_ON(!list_empty(®->list)); + + spin_lock_irqsave(&rh->region_lock, flags); + /* + * Possible cases: + * 1) DM_RH_DIRTY + * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed + * 3) DM_RH_RECOVERING: flushing pending writes + * Either case, the region should have not been connected to list. + */ + recovering = (reg->state == DM_RH_RECOVERING); + reg->state = DM_RH_NOSYNC; + BUG_ON(!list_empty(®->list)); + spin_unlock_irqrestore(&rh->region_lock, flags); + + if (recovering) + complete_resync_work(reg, 0); +} +EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); + +void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) +{ + struct dm_region *reg, *next; + + LIST_HEAD(clean); + LIST_HEAD(recovered); + LIST_HEAD(failed_recovered); + + /* + * Quickly grab the lists. 
+ */ + write_lock_irq(&rh->hash_lock); + spin_lock(&rh->region_lock); + if (!list_empty(&rh->clean_regions)) { + list_splice_init(&rh->clean_regions, &clean); + + list_for_each_entry(reg, &clean, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->recovered_regions)) { + list_splice_init(&rh->recovered_regions, &recovered); + + list_for_each_entry(reg, &recovered, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->failed_recovered_regions)) { + list_splice_init(&rh->failed_recovered_regions, + &failed_recovered); + + list_for_each_entry(reg, &failed_recovered, list) + list_del(®->hash_list); + } + + spin_unlock(&rh->region_lock); + write_unlock_irq(&rh->hash_lock); + + /* + * All the regions on the recovered and clean lists have + * now been pulled out of the system, so no need to do + * any more locking. + */ + list_for_each_entry_safe(reg, next, &recovered, list) { + rh->log->type->clear_region(rh->log, reg->key); + complete_resync_work(reg, 1); + mempool_free(reg, rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &failed_recovered, list) { + complete_resync_work(reg, errors_handled ? 0 : 1); + mempool_free(reg, rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &clean, list) { + rh->log->type->clear_region(rh->log, reg->key); + mempool_free(reg, rh->region_pool); + } + + rh->log->type->flush(rh->log); +} +EXPORT_SYMBOL_GPL(dm_rh_update_states); + +static void rh_inc(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + + spin_lock_irq(&rh->region_lock); + atomic_inc(®->pending); + + if (reg->state == DM_RH_CLEAN) { + reg->state = DM_RH_DIRTY; + list_del_init(®->list); /* take off the clean list */ + spin_unlock_irq(&rh->region_lock); + + rh->log->type->mark_region(rh->log, reg->key); + } else + spin_unlock_irq(&rh->region_lock); + + + read_unlock(&rh->hash_lock); +} + +void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) +{ + struct bio *bio; + + for (bio = bios->head; bio; bio = bio->bi_next) { + if (bio->bi_rw & REQ_FLUSH) + continue; + rh_inc(rh, dm_rh_bio_to_region(rh, bio)); + } +} +EXPORT_SYMBOL_GPL(dm_rh_inc_pending); + +void dm_rh_dec(struct dm_region_hash *rh, region_t region) +{ + unsigned long flags; + struct dm_region *reg; + int should_wake = 0; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irqsave(&rh->region_lock, flags); + if (atomic_dec_and_test(®->pending)) { + /* + * There is no pending I/O for this region. + * We can move the region to corresponding list for next action. + * At this point, the region is not yet connected to any list. + * + * If the state is DM_RH_NOSYNC, the region should be kept off + * from clean list. + * The hash entry for DM_RH_NOSYNC will remain in memory + * until the region is recovered or the map is reloaded. + */ + + /* do nothing for DM_RH_NOSYNC */ + if (unlikely(rh->flush_failure)) { + /* + * If a write flush failed some time ago, we + * don't know whether or not this write made it + * to the disk, so we must resync the device. 
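+ * Setting DM_RH_NOSYNC keeps the region off the clean list, so it + * stays marked out-of-sync until it is recovered or the map is + * reloaded. 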
+ */ + reg->state = DM_RH_NOSYNC; + } else if (reg->state == DM_RH_RECOVERING) { + list_add_tail(®->list, &rh->quiesced_regions); + } else if (reg->state == DM_RH_DIRTY) { + reg->state = DM_RH_CLEAN; + list_add(®->list, &rh->clean_regions); + } + should_wake = 1; + } + spin_unlock_irqrestore(&rh->region_lock, flags); + + if (should_wake) + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_dec); + +/* + * Starts quiescing a region in preparation for recovery. + */ +static int __rh_recovery_prepare(struct dm_region_hash *rh) +{ + int r; + region_t region; + struct dm_region *reg; + + /* + * Ask the dirty log what's next. + */ + r = rh->log->type->get_resync_work(rh->log, ®ion); + if (r <= 0) + return r; + + /* + * Get this region, and start it quiescing by setting the + * recovering flag. + */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_RECOVERING; + + /* Already quiesced ? */ + if (atomic_read(®->pending)) + list_del_init(®->list); + else + list_move(®->list, &rh->quiesced_regions); + + spin_unlock_irq(&rh->region_lock); + + return 1; +} + +void dm_rh_recovery_prepare(struct dm_region_hash *rh) +{ + /* Extra reference to avoid race with dm_rh_stop_recovery */ + atomic_inc(&rh->recovery_in_flight); + + while (!down_trylock(&rh->recovery_count)) { + atomic_inc(&rh->recovery_in_flight); + if (__rh_recovery_prepare(rh) <= 0) { + atomic_dec(&rh->recovery_in_flight); + up(&rh->recovery_count); + break; + } + } + + /* Drop the extra reference */ + if (atomic_dec_and_test(&rh->recovery_in_flight)) + rh->wakeup_all_recovery_waiters(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); + +/* + * Returns any quiesced regions. + */ +struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) +{ + struct dm_region *reg = NULL; + + spin_lock_irq(&rh->region_lock); + if (!list_empty(&rh->quiesced_regions)) { + reg = list_entry(rh->quiesced_regions.next, + struct dm_region, list); + list_del_init(®->list); /* remove from the quiesced list */ + } + spin_unlock_irq(&rh->region_lock); + + return reg; +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_start); + +void dm_rh_recovery_end(struct dm_region *reg, int success) +{ + struct dm_region_hash *rh = reg->rh; + + spin_lock_irq(&rh->region_lock); + if (success) + list_add(®->list, ®->rh->recovered_regions); + else + list_add(®->list, ®->rh->failed_recovered_regions); + + spin_unlock_irq(&rh->region_lock); + + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_end); + +/* Return recovery in flight count. 
*/ +int dm_rh_recovery_in_flight(struct dm_region_hash *rh) +{ + return atomic_read(&rh->recovery_in_flight); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); + +int dm_rh_flush(struct dm_region_hash *rh) +{ + return rh->log->type->flush(rh->log); +} +EXPORT_SYMBOL_GPL(dm_rh_flush); + +void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +{ + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); + bio_list_add(®->delayed_bios, bio); + read_unlock(&rh->hash_lock); +} +EXPORT_SYMBOL_GPL(dm_rh_delay); + +void dm_rh_stop_recovery(struct dm_region_hash *rh) +{ + int i; + + /* wait for any recovering regions */ + for (i = 0; i < rh->max_recovery; i++) + down(&rh->recovery_count); +} +EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); + +void dm_rh_start_recovery(struct dm_region_hash *rh) +{ + int i; + + for (i = 0; i < rh->max_recovery; i++) + up(&rh->recovery_count); + + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_start_recovery); + +MODULE_DESCRIPTION(DM_NAME " region hash"); +MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c new file mode 100644 index 00000000..6ab1192c --- /dev/null +++ b/drivers/md/dm-round-robin.c @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Round-robin path selector. + */ + +#include <linux/device-mapper.h> + +#include "dm-path-selector.h" + +#include <linux/slab.h> +#include <linux/module.h> + +#define DM_MSG_PREFIX "multipath round-robin" + +/*----------------------------------------------------------------- + * Path-handling code, paths are held in lists + *---------------------------------------------------------------*/ +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; +}; + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +/*----------------------------------------------------------------- + * Round-robin selector + *---------------------------------------------------------------*/ + +#define RR_MIN_IO 1000 + +struct selector { + struct list_head valid_paths; + struct list_head invalid_paths; +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + } + + return s; +} + +static int rr_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s; + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void rr_destroy(struct path_selector *ps) +{ + struct selector *s = (struct selector *) ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->invalid_paths); + kfree(s); + ps->context = NULL; +} + +static int rr_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + struct path_info *pi; + int sz = 0; + + if (!path) + DMEMIT("0 "); + else { + switch(type) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + pi = path->pscontext; + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +/* + * Called during initialisation to register each path with 
an + * optional repeat_count. + */ +static int rr_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi; + unsigned repeat_count = RR_MIN_IO; + char dummy; + + if (argc > 1) { + *error = "round-robin ps: incorrect number of arguments"; + return -EINVAL; + } + + /* First path argument is number of I/Os before switching path */ + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "round-robin ps: invalid repeat count"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "round-robin ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void rr_fail_path(struct path_selector *ps, struct dm_path *p) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = p->pscontext; + + list_move(&pi->list, &s->invalid_paths); +} + +static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = p->pscontext; + + list_move(&pi->list, &s->valid_paths); + + return 0; +} + +static struct dm_path *rr_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = NULL; + + if (!list_empty(&s->valid_paths)) { + pi = list_entry(s->valid_paths.next, struct path_info, list); + list_move_tail(&pi->list, &s->valid_paths); + *repeat_count = pi->repeat_count; + } + + return pi ? pi->path : NULL; +} + +static struct path_selector_type rr_ps = { + .name = "round-robin", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 0, + .create = rr_create, + .destroy = rr_destroy, + .status = rr_status, + .add_path = rr_add_path, + .fail_path = rr_fail_path, + .reinstate_path = rr_reinstate_path, + .select_path = rr_select_path, +}; + +static int __init dm_rr_init(void) +{ + int r = dm_register_path_selector(&rr_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version 1.0.0 loaded"); + + return r; +} + +static void __exit dm_rr_exit(void) +{ + int r = dm_unregister_path_selector(&rr_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_rr_init); +module_exit(dm_rr_exit); + +MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); +MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 00000000..9df8f6bd --- /dev/null +++ b/drivers/md/dm-service-time.c @@ -0,0 +1,343 @@ +/* + * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. + * + * Module Author: Kiyoshi Ueda + * + * This file is released under the GPL. + * + * Throughput oriented path selector. 
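+ * + * Each path carries a relative_throughput weight and a count of its + * in-flight bytes; st_select_path() picks the path with the smallest + * estimated service time, i.e. in-flight size divided by relative + * throughput. 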
+ */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/slab.h> +#include <linux/module.h> + +#define DM_MSG_PREFIX "multipath service-time" +#define ST_MIN_IO 1 +#define ST_MAX_RELATIVE_THROUGHPUT 100 +#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 +#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) +#define ST_VERSION "0.2.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + unsigned relative_throughput; + atomic_t in_flight_size; /* Total size of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int st_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void st_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int st_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), + pi->relative_throughput); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %u ", pi->repeat_count, + pi->relative_throughput); + break; + } + } + + return sz; +} + +static int st_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = ST_MIN_IO; + unsigned relative_throughput = 1; + char dummy; + + /* + * Arguments: [<repeat_count> [<relative_throughput>]] + * <repeat_count>: The number of I/Os before switching path. + * If not given, default (ST_MIN_IO) is used. + * <relative_throughput>: The relative throughput value of + * the path among all paths in the path-group. + * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> + * If not given, minimum value '1' is used. + * If '0' is given, the path isn't selected while + * other paths having a positive value are + * available. 
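+ * For example (illustrative values only): a path added with the + * arguments "1 50" is treated as having half the throughput of a + * path added with "1 100". 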
+ */ + if (argc > 2) { + *error = "service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "service-time ps: invalid repeat count"; + return -EINVAL; + } + + if ((argc == 2) && + (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || + relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { + *error = "service-time ps: invalid relative_throughput value"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + pi->relative_throughput = relative_throughput; + atomic_set(&pi->in_flight_size, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void st_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + * Description: + * Basically, the service time is estimated by: + * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' + * To reduce the calculation, some optimizations are made. + * (See comments inline) + */ +static int st_compare_load(struct path_info *pi1, struct path_info *pi2, + size_t incoming) +{ + size_t sz1, sz2, st1, st2; + + sz1 = atomic_read(&pi1->in_flight_size); + sz2 = atomic_read(&pi2->in_flight_size); + + /* + * Case 1: Both have same throughput value. Choose less loaded path. + */ + if (pi1->relative_throughput == pi2->relative_throughput) + return sz1 - sz2; + + /* + * Case 2a: Both have same load. Choose higher throughput path. + * Case 2b: One path has no throughput value. Choose the other one. + */ + if (sz1 == sz2 || + !pi1->relative_throughput || !pi2->relative_throughput) + return pi2->relative_throughput - pi1->relative_throughput; + + /* + * Case 3: Calculate service time. Choose faster path. + * Service time using pi1: + * st1 = (sz1 + incoming) / pi1->relative_throughput + * Service time using pi2: + * st2 = (sz2 + incoming) / pi2->relative_throughput + * + * To avoid the division, transform the expression to use + * multiplication. + * Because ->relative_throughput > 0 here, if st1 < st2, + * the expressions below are the same meaning: + * (sz1 + incoming) / pi1->relative_throughput < + * (sz2 + incoming) / pi2->relative_throughput + * (sz1 + incoming) * pi2->relative_throughput < + * (sz2 + incoming) * pi1->relative_throughput + * So use the later one. + */ + sz1 += incoming; + sz2 += incoming; + if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || + sz2 >= ST_MAX_INFLIGHT_SIZE)) { + /* + * Size may be too big for multiplying pi->relative_throughput + * and overflow. + * To avoid the overflow and mis-selection, shift down both. 
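+ * The shift is sufficient because relative_throughput is capped at + * ST_MAX_RELATIVE_THROUGHPUT (100), which is less than + * 1 << ST_MAX_RELATIVE_THROUGHPUT_SHIFT (128), so the shifted sizes + * multiplied by the throughput values still fit in a size_t. 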
+ */ + sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + } + st1 = sz1 * pi2->relative_throughput; + st2 = sz2 * pi1->relative_throughput; + if (st1 != st2) + return st1 - st2; + + /* + * Case 4: Service time is equal. Choose higher throughput path. + */ + return pi2->relative_throughput - pi1->relative_throughput; +} + +static struct dm_path *st_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + list_for_each_entry(pi, &s->valid_paths, list) + if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) + best = pi; + + if (!best) + return NULL; + + *repeat_count = best->repeat_count; + + return best->path; +} + +static int st_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_add(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static int st_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_sub(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static struct path_selector_type st_ps = { + .name = "service-time", + .module = THIS_MODULE, + .table_args = 2, + .info_args = 2, + .create = st_create, + .destroy = st_destroy, + .status = st_status, + .add_path = st_add_path, + .fail_path = st_fail_path, + .reinstate_path = st_reinstate_path, + .select_path = st_select_path, + .start_io = st_start_io, + .end_io = st_end_io, +}; + +static int __init dm_st_init(void) +{ + int r = dm_register_path_selector(&st_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " ST_VERSION " loaded"); + + return r; +} + +static void __exit dm_st_exit(void) +{ + int r = dm_unregister_path_selector(&st_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_st_init); +module_exit(dm_st_exit); + +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); +MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c new file mode 100644 index 00000000..3ac41567 --- /dev/null +++ b/drivers/md/dm-snap-persistent.c @@ -0,0 +1,898 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006-2008 Red Hat GmbH + * + * This file is released under the GPL. + */ + +#include "dm-exception-store.h" + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/dm-io.h> + +#define DM_MSG_PREFIX "persistent snapshot" +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ + +/*----------------------------------------------------------------- + * Persistent snapshots, by persistent we mean that the snapshot + * will survive a reboot. + *---------------------------------------------------------------*/ + +/* + * We need to store a record of which parts of the origin have + * been copied to the snapshot device. The snapshot code + * requires that we copy exception chunks to chunk aligned areas + * of the COW store. It makes sense therefore, to store the + * metadata in chunk size blocks. 
+ * + * There is no backward or forward compatibility implemented, + * snapshots with different disk versions than the kernel will + * not be usable. It is expected that "lvcreate" will blank out + * the start of a fresh COW device before calling the snapshot + * constructor. + * + * The first chunk of the COW device just contains the header. + * After this there is a chunk filled with exception metadata, + * followed by as many exception chunks as can fit in the + * metadata areas. + * + * All on disk structures are in little-endian format. The end + * of the exceptions info is indicated by an exception with a + * new_chunk of 0, which is invalid since it would point to the + * header chunk. + */ + +/* + * Magic for persistent snapshots: "SnAp" - Feeble isn't it. + */ +#define SNAP_MAGIC 0x70416e53 + +/* + * The on-disk version of the metadata. + */ +#define SNAPSHOT_DISK_VERSION 1 + +#define NUM_SNAPSHOT_HDR_CHUNKS 1 + +struct disk_header { + __le32 magic; + + /* + * Is this snapshot valid. There is no way of recovering + * an invalid snapshot. + */ + __le32 valid; + + /* + * Simple, incrementing version. no backward + * compatibility. + */ + __le32 version; + + /* In sectors */ + __le32 chunk_size; +} __packed; + +struct disk_exception { + __le64 old_chunk; + __le64 new_chunk; +} __packed; + +struct core_exception { + uint64_t old_chunk; + uint64_t new_chunk; +}; + +struct commit_callback { + void (*callback)(void *, int success); + void *context; +}; + +/* + * The top level structure for a persistent exception store. + */ +struct pstore { + struct dm_exception_store *store; + int version; + int valid; + uint32_t exceptions_per_area; + + /* + * Now that we have an asynchronous kcopyd there is no + * need for large chunk sizes, so it wont hurt to have a + * whole chunks worth of metadata in memory at once. + */ + void *area; + + /* + * An area of zeros used to clear the next area. + */ + void *zero_area; + + /* + * An area used for header. The header can be written + * concurrently with metadata (when invalidating the snapshot), + * so it needs a separate buffer. + */ + void *header_area; + + /* + * Used to keep track of which metadata area the data in + * 'chunk' refers to. + */ + chunk_t current_area; + + /* + * The next free chunk for an exception. + * + * When creating exceptions, all the chunks here and above are + * free. It holds the next chunk to be allocated. On rare + * occasions (e.g. after a system crash) holes can be left in + * the exception store because chunks can be committed out of + * order. + * + * When merging exceptions, it does not necessarily mean all the + * chunks here and above are free. It holds the value it would + * have held if all chunks had been committed in order of + * allocation. Consequently the value may occasionally be + * slightly too low, but since it's only used for 'status' and + * it can never reach its minimum value too early this doesn't + * matter. + */ + + chunk_t next_free; + + /* + * The index of next free exception in the current + * metadata area. + */ + uint32_t current_committed; + + atomic_t pending_count; + uint32_t callback_count; + struct commit_callback *callbacks; + struct dm_io_client *io_client; + + struct workqueue_struct *metadata_wq; +}; + +static int alloc_area(struct pstore *ps) +{ + int r = -ENOMEM; + size_t len; + + len = ps->store->chunk_size << SECTOR_SHIFT; + + /* + * Allocate the chunk_size block of memory that will hold + * a single metadata area. 
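+ * Two further chunk-sized buffers are allocated below: a pre-zeroed + * area used to wipe the next on-disk area and a separate buffer for + * the header, which may be written concurrently with the metadata. 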
+ */ + ps->area = vmalloc(len); + if (!ps->area) + goto err_area; + + ps->zero_area = vzalloc(len); + if (!ps->zero_area) + goto err_zero_area; + + ps->header_area = vmalloc(len); + if (!ps->header_area) + goto err_header_area; + + return 0; + +err_header_area: + vfree(ps->zero_area); + +err_zero_area: + vfree(ps->area); + +err_area: + return r; +} + +static void free_area(struct pstore *ps) +{ + if (ps->area) + vfree(ps->area); + ps->area = NULL; + + if (ps->zero_area) + vfree(ps->zero_area); + ps->zero_area = NULL; + + if (ps->header_area) + vfree(ps->header_area); + ps->header_area = NULL; +} + +struct mdata_req { + struct dm_io_region *where; + struct dm_io_request *io_req; + struct work_struct work; + int result; +}; + +static void do_metadata(struct work_struct *work) +{ + struct mdata_req *req = container_of(work, struct mdata_req, work); + + req->result = dm_io(req->io_req, 1, req->where, NULL); +} + +/* + * Read or write a chunk aligned and sized block of data from a device. + */ +static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, + int metadata) +{ + struct dm_io_region where = { + .bdev = dm_snap_cow(ps->store->snap)->bdev, + .sector = ps->store->chunk_size * chunk, + .count = ps->store->chunk_size, + }; + struct dm_io_request io_req = { + .bi_rw = rw, + .mem.type = DM_IO_VMA, + .mem.ptr.vma = area, + .client = ps->io_client, + .notify.fn = NULL, + }; + struct mdata_req req; + + if (!metadata) + return dm_io(&io_req, 1, &where, NULL); + + req.where = &where; + req.io_req = &io_req; + + /* + * Issue the synchronous I/O from a different thread + * to avoid generic_make_request recursion. + */ + INIT_WORK_ONSTACK(&req.work, do_metadata); + queue_work(ps->metadata_wq, &req.work); + flush_work(&req.work); + + return req.result; +} + +/* + * Convert a metadata area index to a chunk index. + */ +static chunk_t area_location(struct pstore *ps, chunk_t area) +{ + return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); +} + +/* + * Read or write a metadata area. Remembering to skip the first + * chunk which holds the header. 
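+ * area_location() above does the skipping: metadata area 'n' starts + * at chunk NUM_SNAPSHOT_HDR_CHUNKS + n * (exceptions_per_area + 1). 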
+ */ +static int area_io(struct pstore *ps, int rw) +{ + int r; + chunk_t chunk; + + chunk = area_location(ps, ps->current_area); + + r = chunk_io(ps, ps->area, chunk, rw, 0); + if (r) + return r; + + return 0; +} + +static void zero_memory_area(struct pstore *ps) +{ + memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); +} + +static int zero_disk_area(struct pstore *ps, chunk_t area) +{ + return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0); +} + +static int read_header(struct pstore *ps, int *new_snapshot) +{ + int r; + struct disk_header *dh; + unsigned chunk_size; + int chunk_size_supplied = 1; + char *chunk_err; + + /* + * Use default chunk size (or logical_block_size, if larger) + * if none supplied + */ + if (!ps->store->chunk_size) { + ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, + bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> + bdev) >> 9); + ps->store->chunk_mask = ps->store->chunk_size - 1; + ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; + chunk_size_supplied = 0; + } + + ps->io_client = dm_io_client_create(); + if (IS_ERR(ps->io_client)) + return PTR_ERR(ps->io_client); + + r = alloc_area(ps); + if (r) + return r; + + r = chunk_io(ps, ps->header_area, 0, READ, 1); + if (r) + goto bad; + + dh = ps->header_area; + + if (le32_to_cpu(dh->magic) == 0) { + *new_snapshot = 1; + return 0; + } + + if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { + DMWARN("Invalid or corrupt snapshot"); + r = -ENXIO; + goto bad; + } + + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + chunk_size = le32_to_cpu(dh->chunk_size); + + if (ps->store->chunk_size == chunk_size) + return 0; + + if (chunk_size_supplied) + DMWARN("chunk size %u in device metadata overrides " + "table chunk size of %u.", + chunk_size, ps->store->chunk_size); + + /* We had a bogus chunk_size. Fix stuff up. */ + free_area(ps); + + r = dm_exception_store_set_chunk_size(ps->store, chunk_size, + &chunk_err); + if (r) { + DMERR("invalid on-disk chunk size %u: %s.", + chunk_size, chunk_err); + return r; + } + + r = alloc_area(ps); + return r; + +bad: + free_area(ps); + return r; +} + +static int write_header(struct pstore *ps) +{ + struct disk_header *dh; + + memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT); + + dh = ps->header_area; + dh->magic = cpu_to_le32(SNAP_MAGIC); + dh->valid = cpu_to_le32(ps->valid); + dh->version = cpu_to_le32(ps->version); + dh->chunk_size = cpu_to_le32(ps->store->chunk_size); + + return chunk_io(ps, ps->header_area, 0, WRITE, 1); +} + +/* + * Access functions for the disk exceptions, these do the endian conversions. 
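+ * On disk the fields are little-endian __le64; the core_exception + * copies used by the rest of the code hold CPU-endian values. 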
+ */ +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) +{ + BUG_ON(index >= ps->exceptions_per_area); + + return ((struct disk_exception *) ps->area) + index; +} + +static void read_exception(struct pstore *ps, + uint32_t index, struct core_exception *result) +{ + struct disk_exception *de = get_exception(ps, index); + + /* copy it */ + result->old_chunk = le64_to_cpu(de->old_chunk); + result->new_chunk = le64_to_cpu(de->new_chunk); +} + +static void write_exception(struct pstore *ps, + uint32_t index, struct core_exception *e) +{ + struct disk_exception *de = get_exception(ps, index); + + /* copy it */ + de->old_chunk = cpu_to_le64(e->old_chunk); + de->new_chunk = cpu_to_le64(e->new_chunk); +} + +static void clear_exception(struct pstore *ps, uint32_t index) +{ + struct disk_exception *de = get_exception(ps, index); + + /* clear it */ + de->old_chunk = 0; + de->new_chunk = 0; +} + +/* + * Registers the exceptions that are present in the current area. + * 'full' is filled in to indicate if the area has been + * filled. + */ +static int insert_exceptions(struct pstore *ps, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context, + int *full) +{ + int r; + unsigned int i; + struct core_exception e; + + /* presume the area is full */ + *full = 1; + + for (i = 0; i < ps->exceptions_per_area; i++) { + read_exception(ps, i, &e); + + /* + * If the new_chunk is pointing at the start of + * the COW device, where the first metadata area + * is we know that we've hit the end of the + * exceptions. Therefore the area is not full. + */ + if (e.new_chunk == 0LL) { + ps->current_committed = i; + *full = 0; + break; + } + + /* + * Keep track of the start of the free chunks. + */ + if (ps->next_free <= e.new_chunk) + ps->next_free = e.new_chunk + 1; + + /* + * Otherwise we add the exception to the snapshot. + */ + r = callback(callback_context, e.old_chunk, e.new_chunk); + if (r) + return r; + } + + return 0; +} + +static int read_exceptions(struct pstore *ps, + int (*callback)(void *callback_context, chunk_t old, + chunk_t new), + void *callback_context) +{ + int r, full = 1; + + /* + * Keeping reading chunks and inserting exceptions until + * we find a partially full area. + */ + for (ps->current_area = 0; full; ps->current_area++) { + r = area_io(ps, READ); + if (r) + return r; + + r = insert_exceptions(ps, callback, callback_context, &full); + if (r) + return r; + } + + ps->current_area--; + + return 0; +} + +static struct pstore *get_info(struct dm_exception_store *store) +{ + return (struct pstore *) store->context; +} + +static void persistent_usage(struct dm_exception_store *store, + sector_t *total_sectors, + sector_t *sectors_allocated, + sector_t *metadata_sectors) +{ + struct pstore *ps = get_info(store); + + *sectors_allocated = ps->next_free * store->chunk_size; + *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); + + /* + * First chunk is the fixed header. + * Then there are (ps->current_area + 1) metadata chunks, each one + * separated from the next by ps->exceptions_per_area data chunks. 
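+ * For example, with two metadata areas in use (current_area == 1) the + * overhead is (1 + 1 + 1) chunks' worth of sectors. 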
+ */ + *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * + store->chunk_size; +} + +static void persistent_dtr(struct dm_exception_store *store) +{ + struct pstore *ps = get_info(store); + + destroy_workqueue(ps->metadata_wq); + + /* Created in read_header */ + if (ps->io_client) + dm_io_client_destroy(ps->io_client); + free_area(ps); + + /* Allocated in persistent_read_metadata */ + if (ps->callbacks) + vfree(ps->callbacks); + + kfree(ps); +} + +static int persistent_read_metadata(struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context) +{ + int r, uninitialized_var(new_snapshot); + struct pstore *ps = get_info(store); + + /* + * Read the snapshot header. + */ + r = read_header(ps, &new_snapshot); + if (r) + return r; + + /* + * Now we know correct chunk_size, complete the initialisation. + */ + ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / + sizeof(struct disk_exception); + ps->callbacks = dm_vcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks)); + if (!ps->callbacks) + return -ENOMEM; + + /* + * Do we need to setup a new snapshot ? + */ + if (new_snapshot) { + r = write_header(ps); + if (r) { + DMWARN("write_header failed"); + return r; + } + + ps->current_area = 0; + zero_memory_area(ps); + r = zero_disk_area(ps, 0); + if (r) + DMWARN("zero_disk_area(0) failed"); + return r; + } + /* + * Sanity checks. + */ + if (ps->version != SNAPSHOT_DISK_VERSION) { + DMWARN("unable to handle snapshot disk version %d", + ps->version); + return -EINVAL; + } + + /* + * Metadata are valid, but snapshot is invalidated + */ + if (!ps->valid) + return 1; + + /* + * Read the metadata. + */ + r = read_exceptions(ps, callback, callback_context); + + return r; +} + +static int persistent_prepare_exception(struct dm_exception_store *store, + struct dm_exception *e) +{ + struct pstore *ps = get_info(store); + uint32_t stride; + chunk_t next_free; + sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); + + /* Is there enough room ? */ + if (size < ((ps->next_free + 1) * store->chunk_size)) + return -ENOSPC; + + e->new_chunk = ps->next_free; + + /* + * Move onto the next free pending, making sure to take + * into account the location of the metadata chunks. + */ + stride = (ps->exceptions_per_area + 1); + next_free = ++ps->next_free; + if (sector_div(next_free, stride) == 1) + ps->next_free++; + + atomic_inc(&ps->pending_count); + return 0; +} + +static void persistent_commit_exception(struct dm_exception_store *store, + struct dm_exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + unsigned int i; + struct pstore *ps = get_info(store); + struct core_exception ce; + struct commit_callback *cb; + + ce.old_chunk = e->old_chunk; + ce.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &ce); + + /* + * Add the callback to the back of the array. This code + * is the only place where the callback array is + * manipulated, and we know that it will never be called + * multiple times concurrently. + */ + cb = ps->callbacks + ps->callback_count++; + cb->callback = callback; + cb->context = callback_context; + + /* + * If there are exceptions in flight and we have not yet + * filled this metadata area there's nothing more to do. + */ + if (!atomic_dec_and_test(&ps->pending_count) && + (ps->current_committed != ps->exceptions_per_area)) + return; + + /* + * If we completely filled the current area, then wipe the next one. 
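+ * Pre-zeroing the next area ensures that a later read of it finds a + * new_chunk of 0, the end-of-exceptions marker. 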
+ */ + if ((ps->current_committed == ps->exceptions_per_area) && + zero_disk_area(ps, ps->current_area + 1)) + ps->valid = 0; + + /* + * Commit exceptions to disk. + */ + if (ps->valid && area_io(ps, WRITE_FLUSH_FUA)) + ps->valid = 0; + + /* + * Advance to the next area if this one is full. + */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + ps->current_area++; + zero_memory_area(ps); + } + + for (i = 0; i < ps->callback_count; i++) { + cb = ps->callbacks + i; + cb->callback(cb->context, ps->valid); + } + + ps->callback_count = 0; +} + +static int persistent_prepare_merge(struct dm_exception_store *store, + chunk_t *last_old_chunk, + chunk_t *last_new_chunk) +{ + struct pstore *ps = get_info(store); + struct core_exception ce; + int nr_consecutive; + int r; + + /* + * When current area is empty, move back to preceding area. + */ + if (!ps->current_committed) { + /* + * Have we finished? + */ + if (!ps->current_area) + return 0; + + ps->current_area--; + r = area_io(ps, READ); + if (r < 0) + return r; + ps->current_committed = ps->exceptions_per_area; + } + + read_exception(ps, ps->current_committed - 1, &ce); + *last_old_chunk = ce.old_chunk; + *last_new_chunk = ce.new_chunk; + + /* + * Find number of consecutive chunks within the current area, + * working backwards. + */ + for (nr_consecutive = 1; nr_consecutive < ps->current_committed; + nr_consecutive++) { + read_exception(ps, ps->current_committed - 1 - nr_consecutive, + &ce); + if (ce.old_chunk != *last_old_chunk - nr_consecutive || + ce.new_chunk != *last_new_chunk - nr_consecutive) + break; + } + + return nr_consecutive; +} + +static int persistent_commit_merge(struct dm_exception_store *store, + int nr_merged) +{ + int r, i; + struct pstore *ps = get_info(store); + + BUG_ON(nr_merged > ps->current_committed); + + for (i = 0; i < nr_merged; i++) + clear_exception(ps, ps->current_committed - 1 - i); + + r = area_io(ps, WRITE_FLUSH_FUA); + if (r < 0) + return r; + + ps->current_committed -= nr_merged; + + /* + * At this stage, only persistent_usage() uses ps->next_free, so + * we make no attempt to keep ps->next_free strictly accurate + * as exceptions may have been committed out-of-order originally. + * Once a snapshot has become merging, we set it to the value it + * would have held had all the exceptions been committed in order. + * + * ps->current_area does not get reduced by prepare_merge() until + * after commit_merge() has removed the nr_merged previous exceptions. 
+ */ + ps->next_free = area_location(ps, ps->current_area) + + ps->current_committed + 1; + + return 0; +} + +static void persistent_drop_snapshot(struct dm_exception_store *store) +{ + struct pstore *ps = get_info(store); + + ps->valid = 0; + if (write_header(ps)) + DMWARN("write header failed"); +} + +static int persistent_ctr(struct dm_exception_store *store, + unsigned argc, char **argv) +{ + struct pstore *ps; + + /* allocate the pstore */ + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + ps->store = store; + ps->valid = 1; + ps->version = SNAPSHOT_DISK_VERSION; + ps->area = NULL; + ps->zero_area = NULL; + ps->header_area = NULL; + ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */ + ps->current_committed = 0; + + ps->callback_count = 0; + atomic_set(&ps->pending_count, 0); + ps->callbacks = NULL; + + ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); + if (!ps->metadata_wq) { + kfree(ps); + DMERR("couldn't start header metadata update thread"); + return -ENOMEM; + } + + store->context = ps; + + return 0; +} + +static unsigned persistent_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" P %llu", (unsigned long long)store->chunk_size); + } + + return sz; +} + +static struct dm_exception_store_type _persistent_type = { + .name = "persistent", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .prepare_merge = persistent_prepare_merge, + .commit_merge = persistent_commit_merge, + .drop_snapshot = persistent_drop_snapshot, + .usage = persistent_usage, + .status = persistent_status, +}; + +static struct dm_exception_store_type _persistent_compat_type = { + .name = "P", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .prepare_merge = persistent_prepare_merge, + .commit_merge = persistent_commit_merge, + .drop_snapshot = persistent_drop_snapshot, + .usage = persistent_usage, + .status = persistent_status, +}; + +int dm_persistent_snapshot_init(void) +{ + int r; + + r = dm_exception_store_type_register(&_persistent_type); + if (r) { + DMERR("Unable to register persistent exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_persistent_compat_type); + if (r) { + DMERR("Unable to register old-style persistent exception " + "store type"); + dm_exception_store_type_unregister(&_persistent_type); + return r; + } + + return r; +} + +void dm_persistent_snapshot_exit(void) +{ + dm_exception_store_type_unregister(&_persistent_type); + dm_exception_store_type_unregister(&_persistent_compat_type); +} diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c new file mode 100644 index 00000000..1ce9a258 --- /dev/null +++ b/drivers/md/dm-snap-transient.c @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006-2008 Red Hat GmbH + * + * This file is released under the GPL. 
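+ * + * Non-persistent exception store: the exception metadata is kept only + * in memory and never written to the COW device, so the snapshot does + * not survive deactivation or a reboot. 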
+ */ + +#include "dm-exception-store.h" + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/dm-io.h> + +#define DM_MSG_PREFIX "transient snapshot" + +/*----------------------------------------------------------------- + * Implementation of the store for non-persistent snapshots. + *---------------------------------------------------------------*/ +struct transient_c { + sector_t next_free; +}; + +static void transient_dtr(struct dm_exception_store *store) +{ + kfree(store->context); +} + +static int transient_read_metadata(struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context) +{ + return 0; +} + +static int transient_prepare_exception(struct dm_exception_store *store, + struct dm_exception *e) +{ + struct transient_c *tc = store->context; + sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); + + if (size < (tc->next_free + store->chunk_size)) + return -1; + + e->new_chunk = sector_to_chunk(store, tc->next_free); + tc->next_free += store->chunk_size; + + return 0; +} + +static void transient_commit_exception(struct dm_exception_store *store, + struct dm_exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + /* Just succeed */ + callback(callback_context, 1); +} + +static void transient_usage(struct dm_exception_store *store, + sector_t *total_sectors, + sector_t *sectors_allocated, + sector_t *metadata_sectors) +{ + *sectors_allocated = ((struct transient_c *) store->context)->next_free; + *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); + *metadata_sectors = 0; +} + +static int transient_ctr(struct dm_exception_store *store, + unsigned argc, char **argv) +{ + struct transient_c *tc; + + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); + if (!tc) + return -ENOMEM; + + tc->next_free = 0; + store->context = tc; + + return 0; +} + +static unsigned transient_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" N %llu", (unsigned long long)store->chunk_size); + } + + return sz; +} + +static struct dm_exception_store_type _transient_type = { + .name = "transient", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .usage = transient_usage, + .status = transient_status, +}; + +static struct dm_exception_store_type _transient_compat_type = { + .name = "N", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .usage = transient_usage, + .status = transient_status, +}; + +int dm_transient_snapshot_init(void) +{ + int r; + + r = dm_exception_store_type_register(&_transient_type); + if (r) { + DMWARN("Unable to register transient exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_transient_compat_type); + if (r) { + DMWARN("Unable to register old-style transient " + "exception store type"); + dm_exception_store_type_unregister(&_transient_type); + return r; + } + + return r; +} + +void dm_transient_snapshot_exit(void) +{ + 
dm_exception_store_type_unregister(&_transient_type); + dm_exception_store_type_unregister(&_transient_compat_type); +} diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c new file mode 100644 index 00000000..6f758870 --- /dev/null +++ b/drivers/md/dm-snap.c @@ -0,0 +1,2329 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include <linux/blkdev.h> +#include <linux/device-mapper.h> +#include <linux/delay.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kdev_t.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/log2.h> +#include <linux/dm-kcopyd.h> + +#include "dm-exception-store.h" + +#define DM_MSG_PREFIX "snapshots" + +static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; + +#define dm_target_is_snapshot_merge(ti) \ + ((ti)->type->name == dm_snapshot_merge_target_name) + +/* + * The size of the mempool used to track chunks in use. + */ +#define MIN_IOS 256 + +#define DM_TRACKED_CHUNK_HASH_SIZE 16 +#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ + (DM_TRACKED_CHUNK_HASH_SIZE - 1)) + +struct dm_exception_table { + uint32_t hash_mask; + unsigned hash_shift; + struct list_head *table; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + + struct dm_dev *origin; + struct dm_dev *cow; + + struct dm_target *ti; + + /* List of snapshots per Origin */ + struct list_head list; + + /* + * You can't use a snapshot if this is 0 (e.g. if full). + * A snapshot-merge target never clears this. + */ + int valid; + + /* Origin writes don't trigger exceptions until this is set */ + int active; + + atomic_t pending_exceptions_count; + + mempool_t *pending_pool; + + struct dm_exception_table pending; + struct dm_exception_table complete; + + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + + /* Chunks with outstanding reads */ + spinlock_t tracked_chunk_lock; + mempool_t *tracked_chunk_pool; + struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; + + /* The on disk metadata handler */ + struct dm_exception_store *store; + + struct dm_kcopyd_client *kcopyd_client; + + /* Wait for events based on state_bits */ + unsigned long state_bits; + + /* Range of chunks currently being merged. */ + chunk_t first_merging_chunk; + int num_merging_chunks; + + /* + * The merge operation failed if this flag is set. + * Failure modes are handled as follows: + * - I/O error reading the header + * => don't load the target; abort. + * - Header does not have "valid" flag set + * => use the origin; forget about the snapshot. + * - I/O error when reading exceptions + * => don't load the target; abort. + * (We can't use the intermediate origin state.) + * - I/O error while merging + * => stop merging; set merge_failed; process I/O normally. + */ + int merge_failed; + + /* + * Incoming bios that overlap with chunks being merged must wait + * for them to be committed. + */ + struct bio_list bios_queued_during_merge; +}; + +/* + * state_bits: + * RUNNING_MERGE - Merge operation is in progress. + * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; + * cleared afterwards. 
+ */ +#define RUNNING_MERGE 0 +#define SHUTDOWN_MERGE 1 + +struct dm_dev *dm_snap_origin(struct dm_snapshot *s) +{ + return s->origin; +} +EXPORT_SYMBOL(dm_snap_origin); + +struct dm_dev *dm_snap_cow(struct dm_snapshot *s) +{ + return s->cow; +} +EXPORT_SYMBOL(dm_snap_cow); + +static sector_t chunk_to_sector(struct dm_exception_store *store, + chunk_t chunk) +{ + return chunk << store->chunk_shift; +} + +static int bdev_equal(struct block_device *lhs, struct block_device *rhs) +{ + /* + * There is only ever one instance of a particular block + * device so we can compare pointers safely. + */ + return lhs == rhs; +} + +struct dm_snap_pending_exception { + struct dm_exception e; + + /* + * Origin buffers waiting for this to complete are held + * in a bio list + */ + struct bio_list origin_bios; + struct bio_list snapshot_bios; + + /* Pointer back to snapshot context */ + struct dm_snapshot *snap; + + /* + * 1 indicates the exception has already been sent to + * kcopyd. + */ + int started; + + /* + * For writing a complete chunk, bypassing the copy. + */ + struct bio *full_bio; + bio_end_io_t *full_bio_end_io; + void *full_bio_private; +}; + +/* + * Hash table mapping origin volumes to lists of snapshots and + * a lock to protect it + */ +static struct kmem_cache *exception_cache; +static struct kmem_cache *pending_cache; + +struct dm_snap_tracked_chunk { + struct hlist_node node; + chunk_t chunk; +}; + +static struct kmem_cache *tracked_chunk_cache; + +static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s, + chunk_t chunk) +{ + struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool, + GFP_NOIO); + unsigned long flags; + + c->chunk = chunk; + + spin_lock_irqsave(&s->tracked_chunk_lock, flags); + hlist_add_head(&c->node, + &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); + spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); + + return c; +} + +static void stop_tracking_chunk(struct dm_snapshot *s, + struct dm_snap_tracked_chunk *c) +{ + unsigned long flags; + + spin_lock_irqsave(&s->tracked_chunk_lock, flags); + hlist_del(&c->node); + spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); + + mempool_free(c, s->tracked_chunk_pool); +} + +static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) +{ + struct dm_snap_tracked_chunk *c; + struct hlist_node *hn; + int found = 0; + + spin_lock_irq(&s->tracked_chunk_lock); + + hlist_for_each_entry(c, hn, + &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { + if (c->chunk == chunk) { + found = 1; + break; + } + } + + spin_unlock_irq(&s->tracked_chunk_lock); + + return found; +} + +/* + * This conflicting I/O is extremely improbable in the caller, + * so msleep(1) is sufficient and there is no need for a wait queue. + */ +static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) +{ + while (__chunk_is_tracked(s, chunk)) + msleep(1); +} + +/* + * One of these per registered origin, held in the snapshot_origins hash + */ +struct origin { + /* The origin device */ + struct block_device *bdev; + + struct list_head hash_list; + + /* List of snapshots for this origin */ + struct list_head snapshots; +}; + +/* + * Size of the hash table for origin volumes. 
If we make this + * the size of the minors list then it should be nearly perfect + */ +#define ORIGIN_HASH_SIZE 256 +#define ORIGIN_MASK 0xFF +static struct list_head *_origins; +static struct rw_semaphore _origins_lock; + +static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); +static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); +static uint64_t _pending_exceptions_done_count; + +static int init_origin_hash(void) +{ + int i; + + _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), + GFP_KERNEL); + if (!_origins) { + DMERR("unable to allocate memory"); + return -ENOMEM; + } + + for (i = 0; i < ORIGIN_HASH_SIZE; i++) + INIT_LIST_HEAD(_origins + i); + init_rwsem(&_origins_lock); + + return 0; +} + +static void exit_origin_hash(void) +{ + kfree(_origins); +} + +static unsigned origin_hash(struct block_device *bdev) +{ + return bdev->bd_dev & ORIGIN_MASK; +} + +static struct origin *__lookup_origin(struct block_device *origin) +{ + struct list_head *ol; + struct origin *o; + + ol = &_origins[origin_hash(origin)]; + list_for_each_entry (o, ol, hash_list) + if (bdev_equal(o->bdev, origin)) + return o; + + return NULL; +} + +static void __insert_origin(struct origin *o) +{ + struct list_head *sl = &_origins[origin_hash(o->bdev)]; + list_add_tail(&o->hash_list, sl); +} + +/* + * _origins_lock must be held when calling this function. + * Returns number of snapshots registered using the supplied cow device, plus: + * snap_src - a snapshot suitable for use as a source of exception handover + * snap_dest - a snapshot capable of receiving exception handover. + * snap_merge - an existing snapshot-merge target linked to the same origin. + * There can be at most one snapshot-merge target. The parameter is optional. + * + * Possible return values and states of snap_src and snap_dest. + * 0: NULL, NULL - first new snapshot + * 1: snap_src, NULL - normal snapshot + * 2: snap_src, snap_dest - waiting for handover + * 2: snap_src, NULL - handed over, waiting for old to be deleted + * 1: NULL, snap_dest - source got destroyed without handover + */ +static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, + struct dm_snapshot **snap_src, + struct dm_snapshot **snap_dest, + struct dm_snapshot **snap_merge) +{ + struct dm_snapshot *s; + struct origin *o; + int count = 0; + int active; + + o = __lookup_origin(snap->origin->bdev); + if (!o) + goto out; + + list_for_each_entry(s, &o->snapshots, list) { + if (dm_target_is_snapshot_merge(s->ti) && snap_merge) + *snap_merge = s; + if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) + continue; + + down_read(&s->lock); + active = s->active; + up_read(&s->lock); + + if (active) { + if (snap_src) + *snap_src = s; + } else if (snap_dest) + *snap_dest = s; + + count++; + } + +out: + return count; +} + +/* + * On success, returns 1 if this snapshot is a handover destination, + * otherwise returns 0. + */ +static int __validate_exception_handover(struct dm_snapshot *snap) +{ + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + struct dm_snapshot *snap_merge = NULL; + + /* Does snapshot need exceptions handed over to it? */ + if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, + &snap_merge) == 2) || + snap_dest) { + snap->ti->error = "Snapshot cow pairing for exception " + "table handover failed"; + return -EINVAL; + } + + /* + * If no snap_src was found, snap cannot become a handover + * destination. + */ + if (!snap_src) + return 0; + + /* + * Non-snapshot-merge handover? 
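+ * An ordinary snapshot target accepts the handover here; the extra + * checks below apply only when the new target is a snapshot-merge. 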
+ */ + if (!dm_target_is_snapshot_merge(snap->ti)) + return 1; + + /* + * Do not allow more than one merging snapshot. + */ + if (snap_merge) { + snap->ti->error = "A snapshot is already merging."; + return -EINVAL; + } + + if (!snap_src->store->type->prepare_merge || + !snap_src->store->type->commit_merge) { + snap->ti->error = "Snapshot exception store does not " + "support snapshot-merge."; + return -EINVAL; + } + + return 1; +} + +static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) +{ + struct dm_snapshot *l; + + /* Sort the list according to chunk size, largest-first smallest-last */ + list_for_each_entry(l, &o->snapshots, list) + if (l->store->chunk_size < s->store->chunk_size) + break; + list_add_tail(&s->list, &l->list); +} + +/* + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. + * + * Also validate snapshot exception store handovers. + * On success, returns 1 if this registration is a handover destination, + * otherwise returns 0. + */ +static int register_snapshot(struct dm_snapshot *snap) +{ + struct origin *o, *new_o = NULL; + struct block_device *bdev = snap->origin->bdev; + int r = 0; + + new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); + if (!new_o) + return -ENOMEM; + + down_write(&_origins_lock); + + r = __validate_exception_handover(snap); + if (r < 0) { + kfree(new_o); + goto out; + } + + o = __lookup_origin(bdev); + if (o) + kfree(new_o); + else { + /* New origin */ + o = new_o; + + /* Initialise the struct */ + INIT_LIST_HEAD(&o->snapshots); + o->bdev = bdev; + + __insert_origin(o); + } + + __insert_snapshot(o, snap); + +out: + up_write(&_origins_lock); + + return r; +} + +/* + * Move snapshot to correct place in list according to chunk size. + */ +static void reregister_snapshot(struct dm_snapshot *s) +{ + struct block_device *bdev = s->origin->bdev; + + down_write(&_origins_lock); + + list_del(&s->list); + __insert_snapshot(__lookup_origin(bdev), s); + + up_write(&_origins_lock); +} + +static void unregister_snapshot(struct dm_snapshot *s) +{ + struct origin *o; + + down_write(&_origins_lock); + o = __lookup_origin(s->origin->bdev); + + list_del(&s->list); + if (o && list_empty(&o->snapshots)) { + list_del(&o->hash_list); + kfree(o); + } + + up_write(&_origins_lock); +} + +/* + * Implementation of the exception hash tables. + * The lowest hash_shift bits of the chunk number are ignored, allowing + * some consecutive chunks to be grouped together. + */ +static int dm_exception_table_init(struct dm_exception_table *et, + uint32_t size, unsigned hash_shift) +{ + unsigned int i; + + et->hash_shift = hash_shift; + et->hash_mask = size - 1; + et->table = dm_vcalloc(size, sizeof(struct list_head)); + if (!et->table) + return -ENOMEM; + + for (i = 0; i < size; i++) + INIT_LIST_HEAD(et->table + i); + + return 0; +} + +static void dm_exception_table_exit(struct dm_exception_table *et, + struct kmem_cache *mem) +{ + struct list_head *slot; + struct dm_exception *ex, *next; + int i, size; + + size = et->hash_mask + 1; + for (i = 0; i < size; i++) { + slot = et->table + i; + + list_for_each_entry_safe (ex, next, slot, hash_list) + kmem_cache_free(mem, ex); + } + + vfree(et->table); +} + +static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) +{ + return (chunk >> et->hash_shift) & et->hash_mask; +} + +static void dm_remove_exception(struct dm_exception *e) +{ + list_del(&e->hash_list); +} + +/* + * Return the exception data for a sector, or NULL if not + * remapped. 
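+ * A single dm_exception may cover a run of consecutive chunks, hence + * the range check against dm_consecutive_chunk_count() below. 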
+ */ +static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, + chunk_t chunk) +{ + struct list_head *slot; + struct dm_exception *e; + + slot = &et->table[exception_hash(et, chunk)]; + list_for_each_entry (e, slot, hash_list) + if (chunk >= e->old_chunk && + chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) + return e; + + return NULL; +} + +static struct dm_exception *alloc_completed_exception(void) +{ + struct dm_exception *e; + + e = kmem_cache_alloc(exception_cache, GFP_NOIO); + if (!e) + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); + + return e; +} + +static void free_completed_exception(struct dm_exception *e) +{ + kmem_cache_free(exception_cache, e); +} + +static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s) +{ + struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool, + GFP_NOIO); + + atomic_inc(&s->pending_exceptions_count); + pe->snap = s; + + return pe; +} + +static void free_pending_exception(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + + mempool_free(pe, s->pending_pool); + smp_mb__before_atomic_dec(); + atomic_dec(&s->pending_exceptions_count); +} + +static void dm_insert_exception(struct dm_exception_table *eh, + struct dm_exception *new_e) +{ + struct list_head *l; + struct dm_exception *e = NULL; + + l = &eh->table[exception_hash(eh, new_e->old_chunk)]; + + /* Add immediately if this table doesn't support consecutive chunks */ + if (!eh->hash_shift) + goto out; + + /* List is ordered by old_chunk */ + list_for_each_entry_reverse(e, l, hash_list) { + /* Insert after an existing chunk? */ + if (new_e->old_chunk == (e->old_chunk + + dm_consecutive_chunk_count(e) + 1) && + new_e->new_chunk == (dm_chunk_number(e->new_chunk) + + dm_consecutive_chunk_count(e) + 1)) { + dm_consecutive_chunk_count_inc(e); + free_completed_exception(new_e); + return; + } + + /* Insert before an existing chunk? */ + if (new_e->old_chunk == (e->old_chunk - 1) && + new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { + dm_consecutive_chunk_count_inc(e); + e->old_chunk--; + e->new_chunk--; + free_completed_exception(new_e); + return; + } + + if (new_e->old_chunk > e->old_chunk) + break; + } + +out: + list_add(&new_e->hash_list, e ? &e->hash_list : l); +} + +/* + * Callback used by the exception stores to load exceptions when + * initialising. + */ +static int dm_add_exception(void *context, chunk_t old, chunk_t new) +{ + struct dm_snapshot *s = context; + struct dm_exception *e; + + e = alloc_completed_exception(); + if (!e) + return -ENOMEM; + + e->old_chunk = old; + + /* Consecutive_count is implicitly initialised to zero */ + e->new_chunk = new; + + dm_insert_exception(&s->complete, e); + + return 0; +} + +/* + * Return a minimum chunk size of all snapshots that have the specified origin. + * Return zero if the origin has no snapshots. + */ +static sector_t __minimum_chunk_size(struct origin *o) +{ + struct dm_snapshot *snap; + unsigned chunk_size = 0; + + if (o) + list_for_each_entry(snap, &o->snapshots, list) + chunk_size = min_not_zero(chunk_size, + snap->store->chunk_size); + + return chunk_size; +} + +/* + * Hard coded magic. + */ +static int calc_max_buckets(void) +{ + /* use a fixed size of 2MB */ + unsigned long mem = 2 * 1024 * 1024; + mem /= sizeof(struct list_head); + + return mem; +} + +/* + * Allocate room for a suitable hash table. 
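+ * The completed-exception table gets roughly one bucket per chunk of the
+ * smaller of the origin and cow devices, capped by calc_max_buckets() and
+ * rounded down to a power of two; the pending table is kept eight times
+ * smaller, since it only ever holds in-flight exceptions.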
+ */ +static int init_hash_tables(struct dm_snapshot *s) +{ + sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + + /* + * Calculate based on the size of the original volume or + * the COW volume... + */ + cow_dev_size = get_dev_size(s->cow->bdev); + origin_dev_size = get_dev_size(s->origin->bdev); + max_buckets = calc_max_buckets(); + + hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; + hash_size = min(hash_size, max_buckets); + + if (hash_size < 64) + hash_size = 64; + hash_size = rounddown_pow_of_two(hash_size); + if (dm_exception_table_init(&s->complete, hash_size, + DM_CHUNK_CONSECUTIVE_BITS)) + return -ENOMEM; + + /* + * Allocate hash table for in-flight exceptions + * Make this smaller than the real hash table + */ + hash_size >>= 3; + if (hash_size < 64) + hash_size = 64; + + if (dm_exception_table_init(&s->pending, hash_size, 0)) { + dm_exception_table_exit(&s->complete, exception_cache); + return -ENOMEM; + } + + return 0; +} + +static void merge_shutdown(struct dm_snapshot *s) +{ + clear_bit_unlock(RUNNING_MERGE, &s->state_bits); + smp_mb__after_clear_bit(); + wake_up_bit(&s->state_bits, RUNNING_MERGE); +} + +static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) +{ + s->first_merging_chunk = 0; + s->num_merging_chunks = 0; + + return bio_list_get(&s->bios_queued_during_merge); +} + +/* + * Remove one chunk from the index of completed exceptions. + */ +static int __remove_single_exception_chunk(struct dm_snapshot *s, + chunk_t old_chunk) +{ + struct dm_exception *e; + + e = dm_lookup_exception(&s->complete, old_chunk); + if (!e) { + DMERR("Corruption detected: exception for block %llu is " + "on disk but not in memory", + (unsigned long long)old_chunk); + return -EINVAL; + } + + /* + * If this is the only chunk using this exception, remove exception. + */ + if (!dm_consecutive_chunk_count(e)) { + dm_remove_exception(e); + free_completed_exception(e); + return 0; + } + + /* + * The chunk may be either at the beginning or the end of a + * group of consecutive chunks - never in the middle. We are + * removing chunks in the opposite order to that in which they + * were added, so this should always be true. + * Decrement the consecutive chunk counter and adjust the + * starting point if necessary. + */ + if (old_chunk == e->old_chunk) { + e->old_chunk++; + e->new_chunk++; + } else if (old_chunk != e->old_chunk + + dm_consecutive_chunk_count(e)) { + DMERR("Attempt to merge block %llu from the " + "middle of a chunk range [%llu - %llu]", + (unsigned long long)old_chunk, + (unsigned long long)e->old_chunk, + (unsigned long long) + e->old_chunk + dm_consecutive_chunk_count(e)); + return -EINVAL; + } + + dm_consecutive_chunk_count_dec(e); + + return 0; +} + +static void flush_bios(struct bio *bio); + +static int remove_single_exception_chunk(struct dm_snapshot *s) +{ + struct bio *b = NULL; + int r; + chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; + + down_write(&s->lock); + + /* + * Process chunks (and associated exceptions) in reverse order + * so that dm_consecutive_chunk_count_dec() accounting works. 
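+ * e.g. when chunks 5-8 were merged back, chunk 8 is removed first and
+ * chunk 5 last, so each removal trims an end of a consecutive group and
+ * never its middle.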
+ */ + do { + r = __remove_single_exception_chunk(s, old_chunk); + if (r) + goto out; + } while (old_chunk-- > s->first_merging_chunk); + + b = __release_queued_bios_after_merge(s); + +out: + up_write(&s->lock); + if (b) + flush_bios(b); + + return r; +} + +static int origin_write_extent(struct dm_snapshot *merging_snap, + sector_t sector, unsigned chunk_size); + +static void merge_callback(int read_err, unsigned long write_err, + void *context); + +static uint64_t read_pending_exceptions_done_count(void) +{ + uint64_t pending_exceptions_done; + + spin_lock(&_pending_exceptions_done_spinlock); + pending_exceptions_done = _pending_exceptions_done_count; + spin_unlock(&_pending_exceptions_done_spinlock); + + return pending_exceptions_done; +} + +static void increment_pending_exceptions_done_count(void) +{ + spin_lock(&_pending_exceptions_done_spinlock); + _pending_exceptions_done_count++; + spin_unlock(&_pending_exceptions_done_spinlock); + + wake_up_all(&_pending_exceptions_done); +} + +static void snapshot_merge_next_chunks(struct dm_snapshot *s) +{ + int i, linear_chunks; + chunk_t old_chunk, new_chunk; + struct dm_io_region src, dest; + sector_t io_size; + uint64_t previous_count; + + BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); + if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) + goto shut; + + /* + * valid flag never changes during merge, so no lock required. + */ + if (!s->valid) { + DMERR("Snapshot is invalid: can't merge"); + goto shut; + } + + linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, + &new_chunk); + if (linear_chunks <= 0) { + if (linear_chunks < 0) { + DMERR("Read error in exception store: " + "shutting down merge"); + down_write(&s->lock); + s->merge_failed = 1; + up_write(&s->lock); + } + goto shut; + } + + /* Adjust old_chunk and new_chunk to reflect start of linear region */ + old_chunk = old_chunk + 1 - linear_chunks; + new_chunk = new_chunk + 1 - linear_chunks; + + /* + * Use one (potentially large) I/O to copy all 'linear_chunks' + * from the exception store to the origin + */ + io_size = linear_chunks * s->store->chunk_size; + + dest.bdev = s->origin->bdev; + dest.sector = chunk_to_sector(s->store, old_chunk); + dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); + + src.bdev = s->cow->bdev; + src.sector = chunk_to_sector(s->store, new_chunk); + src.count = dest.count; + + /* + * Reallocate any exceptions needed in other snapshots then + * wait for the pending exceptions to complete. + * Each time any pending exception (globally on the system) + * completes we are woken and repeat the process to find out + * if we can proceed. While this may not seem a particularly + * efficient algorithm, it is not expected to have any + * significant impact on performance. + */ + previous_count = read_pending_exceptions_done_count(); + while (origin_write_extent(s, dest.sector, io_size)) { + wait_event(_pending_exceptions_done, + (read_pending_exceptions_done_count() != + previous_count)); + /* Retry after the wait, until all exceptions are done. 
*/ + previous_count = read_pending_exceptions_done_count(); + } + + down_write(&s->lock); + s->first_merging_chunk = old_chunk; + s->num_merging_chunks = linear_chunks; + up_write(&s->lock); + + /* Wait until writes to all 'linear_chunks' drain */ + for (i = 0; i < linear_chunks; i++) + __check_for_conflicting_io(s, old_chunk + i); + + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); + return; + +shut: + merge_shutdown(s); +} + +static void error_bios(struct bio *bio); + +static void merge_callback(int read_err, unsigned long write_err, void *context) +{ + struct dm_snapshot *s = context; + struct bio *b = NULL; + + if (read_err || write_err) { + if (read_err) + DMERR("Read error: shutting down merge."); + else + DMERR("Write error: shutting down merge."); + goto shut; + } + + if (s->store->type->commit_merge(s->store, + s->num_merging_chunks) < 0) { + DMERR("Write error in exception store: shutting down merge"); + goto shut; + } + + if (remove_single_exception_chunk(s) < 0) + goto shut; + + snapshot_merge_next_chunks(s); + + return; + +shut: + down_write(&s->lock); + s->merge_failed = 1; + b = __release_queued_bios_after_merge(s); + up_write(&s->lock); + error_bios(b); + + merge_shutdown(s); +} + +static void start_merge(struct dm_snapshot *s) +{ + if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) + snapshot_merge_next_chunks(s); +} + +static int wait_schedule(void *ptr) +{ + schedule(); + + return 0; +} + +/* + * Stop the merging process and wait until it finishes. + */ +static void stop_merge(struct dm_snapshot *s) +{ + set_bit(SHUTDOWN_MERGE, &s->state_bits); + wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, + TASK_UNINTERRUPTIBLE); + clear_bit(SHUTDOWN_MERGE, &s->state_bits); +} + +/* + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> + */ +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_snapshot *s; + int i; + int r = -EINVAL; + char *origin_path, *cow_path; + unsigned args_used, num_flush_requests = 1; + fmode_t origin_mode = FMODE_READ; + + if (argc != 4) { + ti->error = "requires exactly 4 arguments"; + r = -EINVAL; + goto bad; + } + + if (dm_target_is_snapshot_merge(ti)) { + num_flush_requests = 2; + origin_mode = FMODE_WRITE; + } + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + ti->error = "Cannot allocate private snapshot structure"; + r = -ENOMEM; + goto bad; + } + + origin_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, origin_path, origin_mode, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad_origin; + } + + cow_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); + if (r) { + ti->error = "Cannot get COW device"; + goto bad_cow; + } + + r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); + if (r) { + ti->error = "Couldn't create exception store"; + r = -EINVAL; + goto bad_store; + } + + argv += args_used; + argc -= args_used; + + s->ti = ti; + s->valid = 1; + s->active = 0; + atomic_set(&s->pending_exceptions_count, 0); + init_rwsem(&s->lock); + INIT_LIST_HEAD(&s->list); + spin_lock_init(&s->pe_lock); + s->state_bits = 0; + s->merge_failed = 0; + s->first_merging_chunk = 0; + s->num_merging_chunks = 0; + bio_list_init(&s->bios_queued_during_merge); + + /* Allocate hash table for COW data */ + if (init_hash_tables(s)) { + ti->error = "Unable to allocate hash table space"; + r = -ENOMEM; + goto bad_hash_tables; + } + + s->kcopyd_client = 
dm_kcopyd_client_create(); + if (IS_ERR(s->kcopyd_client)) { + r = PTR_ERR(s->kcopyd_client); + ti->error = "Could not create kcopyd client"; + goto bad_kcopyd; + } + + s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); + if (!s->pending_pool) { + ti->error = "Could not allocate mempool for pending exceptions"; + goto bad_pending_pool; + } + + s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS, + tracked_chunk_cache); + if (!s->tracked_chunk_pool) { + ti->error = "Could not allocate tracked_chunk mempool for " + "tracking reads"; + goto bad_tracked_chunk_pool; + } + + for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) + INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]); + + spin_lock_init(&s->tracked_chunk_lock); + + ti->private = s; + ti->num_flush_requests = num_flush_requests; + + /* Add snapshot to the list of snapshots for this origin */ + /* Exceptions aren't triggered till snapshot_resume() is called */ + r = register_snapshot(s); + if (r == -ENOMEM) { + ti->error = "Snapshot origin struct allocation failed"; + goto bad_load_and_register; + } else if (r < 0) { + /* invalid handover, register_snapshot has set ti->error */ + goto bad_load_and_register; + } + + /* + * Metadata must only be loaded into one table at once, so skip this + * if metadata will be handed over during resume. + * Chunk size will be set during the handover - set it to zero to + * ensure it's ignored. + */ + if (r > 0) { + s->store->chunk_size = 0; + return 0; + } + + r = s->store->type->read_metadata(s->store, dm_add_exception, + (void *)s); + if (r < 0) { + ti->error = "Failed to read snapshot metadata"; + goto bad_read_metadata; + } else if (r > 0) { + s->valid = 0; + DMWARN("Snapshot is marked invalid."); + } + + if (!s->store->chunk_size) { + ti->error = "Chunk size not set"; + goto bad_read_metadata; + } + ti->split_io = s->store->chunk_size; + + return 0; + +bad_read_metadata: + unregister_snapshot(s); + +bad_load_and_register: + mempool_destroy(s->tracked_chunk_pool); + +bad_tracked_chunk_pool: + mempool_destroy(s->pending_pool); + +bad_pending_pool: + dm_kcopyd_client_destroy(s->kcopyd_client); + +bad_kcopyd: + dm_exception_table_exit(&s->pending, pending_cache); + dm_exception_table_exit(&s->complete, exception_cache); + +bad_hash_tables: + dm_exception_store_destroy(s->store); + +bad_store: + dm_put_device(ti, s->cow); + +bad_cow: + dm_put_device(ti, s->origin); + +bad_origin: + kfree(s); + +bad: + return r; +} + +static void __free_exceptions(struct dm_snapshot *s) +{ + dm_kcopyd_client_destroy(s->kcopyd_client); + s->kcopyd_client = NULL; + + dm_exception_table_exit(&s->pending, pending_cache); + dm_exception_table_exit(&s->complete, exception_cache); +} + +static void __handover_exceptions(struct dm_snapshot *snap_src, + struct dm_snapshot *snap_dest) +{ + union { + struct dm_exception_table table_swap; + struct dm_exception_store *store_swap; + } u; + + /* + * Swap all snapshot context information between the two instances. + */ + u.table_swap = snap_dest->complete; + snap_dest->complete = snap_src->complete; + snap_src->complete = u.table_swap; + + u.store_swap = snap_dest->store; + snap_dest->store = snap_src->store; + snap_src->store = u.store_swap; + + snap_dest->store->snap = snap_dest; + snap_src->store->snap = snap_src; + + snap_dest->ti->split_io = snap_dest->store->chunk_size; + snap_dest->valid = snap_src->valid; + + /* + * Set source invalid to ensure it receives no further I/O. 
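+ * Any bio that still reaches the old snapshot will then be failed by
+ * snapshot_map().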
+ */ + snap_src->valid = 0; +} + +static void snapshot_dtr(struct dm_target *ti) +{ +#ifdef CONFIG_DM_DEBUG + int i; +#endif + struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + /* Check whether exception handover must be cancelled */ + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest && (s == snap_src)) { + down_write(&snap_dest->lock); + snap_dest->valid = 0; + up_write(&snap_dest->lock); + DMERR("Cancelling snapshot handover."); + } + up_read(&_origins_lock); + + if (dm_target_is_snapshot_merge(ti)) + stop_merge(s); + + /* Prevent further origin writes from using this snapshot. */ + /* After this returns there can be no new kcopyd jobs. */ + unregister_snapshot(s); + + while (atomic_read(&s->pending_exceptions_count)) + msleep(1); + /* + * Ensure instructions in mempool_destroy aren't reordered + * before atomic_read. + */ + smp_mb(); + +#ifdef CONFIG_DM_DEBUG + for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); +#endif + + mempool_destroy(s->tracked_chunk_pool); + + __free_exceptions(s); + + mempool_destroy(s->pending_pool); + + dm_exception_store_destroy(s->store); + + dm_put_device(ti, s->cow); + + dm_put_device(ti, s->origin); + + kfree(s); +} + +/* + * Flush a list of buffers. + */ +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = n; + } +} + +static int do_origin(struct dm_dev *origin, struct bio *bio); + +/* + * Flush a list of buffers. + */ +static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) +{ + struct bio *n; + int r; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + r = do_origin(s->origin, bio); + if (r == DM_MAPIO_REMAPPED) + generic_make_request(bio); + bio = n; + } +} + +/* + * Error a list of buffers. + */ +static void error_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + bio_io_error(bio); + bio = n; + } +} + +static void __invalidate_snapshot(struct dm_snapshot *s, int err) +{ + if (!s->valid) + return; + + if (err == -EIO) + DMERR("Invalidating snapshot: Error reading/writing."); + else if (err == -ENOMEM) + DMERR("Invalidating snapshot: Unable to allocate exception."); + + if (s->store->type->drop_snapshot) + s->store->type->drop_snapshot(s->store); + + s->valid = 0; + + dm_table_event(s->ti->table); +} + +static void pending_complete(struct dm_snap_pending_exception *pe, int success) +{ + struct dm_exception *e; + struct dm_snapshot *s = pe->snap; + struct bio *origin_bios = NULL; + struct bio *snapshot_bios = NULL; + struct bio *full_bio = NULL; + int error = 0; + + if (!success) { + /* Read/write error - snapshot is unusable */ + down_write(&s->lock); + __invalidate_snapshot(s, -EIO); + error = 1; + goto out; + } + + e = alloc_completed_exception(); + if (!e) { + down_write(&s->lock); + __invalidate_snapshot(s, -ENOMEM); + error = 1; + goto out; + } + *e = pe->e; + + down_write(&s->lock); + if (!s->valid) { + free_completed_exception(e); + error = 1; + goto out; + } + + /* Check for conflicting reads */ + __check_for_conflicting_io(s, pe->e.old_chunk); + + /* + * Add a proper exception, and remove the + * in-flight exception from the list. 
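+ * Both tables are updated while the write lock is held, so lookups done
+ * under the lock always see a consistent view of pending and completed
+ * exceptions.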
+ */ + dm_insert_exception(&s->complete, e); + +out: + dm_remove_exception(&pe->e); + snapshot_bios = bio_list_get(&pe->snapshot_bios); + origin_bios = bio_list_get(&pe->origin_bios); + full_bio = pe->full_bio; + if (full_bio) { + full_bio->bi_end_io = pe->full_bio_end_io; + full_bio->bi_private = pe->full_bio_private; + } + free_pending_exception(pe); + + increment_pending_exceptions_done_count(); + + up_write(&s->lock); + + /* Submit any pending write bios */ + if (error) { + if (full_bio) + bio_io_error(full_bio); + error_bios(snapshot_bios); + } else { + if (full_bio) + bio_endio(full_bio, 0); + flush_bios(snapshot_bios); + } + + retry_origin_bios(s, origin_bios); +} + +static void commit_callback(void *context, int success) +{ + struct dm_snap_pending_exception *pe = context; + + pending_complete(pe, success); +} + +/* + * Called when the copy I/O has finished. kcopyd actually runs + * this code so don't block. + */ +static void copy_callback(int read_err, unsigned long write_err, void *context) +{ + struct dm_snap_pending_exception *pe = context; + struct dm_snapshot *s = pe->snap; + + if (read_err || write_err) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, + commit_callback, pe); +} + +/* + * Dispatches the copy operation to kcopyd. + */ +static void start_copy(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + struct dm_io_region src, dest; + struct block_device *bdev = s->origin->bdev; + sector_t dev_size; + + dev_size = get_dev_size(bdev); + + src.bdev = bdev; + src.sector = chunk_to_sector(s->store, pe->e.old_chunk); + src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); + + dest.bdev = s->cow->bdev; + dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); + dest.count = src.count; + + /* Hand over to kcopyd */ + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); +} + +static void full_bio_end_io(struct bio *bio, int error) +{ + void *callback_data = bio->bi_private; + + dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0); +} + +static void start_full_bio(struct dm_snap_pending_exception *pe, + struct bio *bio) +{ + struct dm_snapshot *s = pe->snap; + void *callback_data; + + pe->full_bio = bio; + pe->full_bio_end_io = bio->bi_end_io; + pe->full_bio_private = bio->bi_private; + + callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, + copy_callback, pe); + + bio->bi_end_io = full_bio_end_io; + bio->bi_private = callback_data; + + generic_make_request(bio); +} + +static struct dm_snap_pending_exception * +__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) +{ + struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); + + if (!e) + return NULL; + + return container_of(e, struct dm_snap_pending_exception, e); +} + +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + * + * NOTE: a write lock must be held on snap->lock before calling + * this. 
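+ * The caller passes in a 'pe' it allocated while the lock was dropped;
+ * if another thread inserted one for this chunk in the meantime, the
+ * spare is freed and the existing entry is returned instead.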
+ */ +static struct dm_snap_pending_exception * +__find_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) +{ + struct dm_snap_pending_exception *pe2; + + pe2 = __lookup_pending_exception(s, chunk); + if (pe2) { + free_pending_exception(pe); + return pe2; + } + + pe->e.old_chunk = chunk; + bio_list_init(&pe->origin_bios); + bio_list_init(&pe->snapshot_bios); + pe->started = 0; + pe->full_bio = NULL; + + if (s->store->type->prepare_exception(s->store, &pe->e)) { + free_pending_exception(pe); + return NULL; + } + + dm_insert_exception(&s->pending, &pe->e); + + return pe; +} + +static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, + struct bio *bio, chunk_t chunk) +{ + bio->bi_bdev = s->cow->bdev; + bio->bi_sector = chunk_to_sector(s->store, + dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_sector & + s->store->chunk_mask); +} + +static int snapshot_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dm_exception *e; + struct dm_snapshot *s = ti->private; + int r = DM_MAPIO_REMAPPED; + chunk_t chunk; + struct dm_snap_pending_exception *pe = NULL; + + if (bio->bi_rw & REQ_FLUSH) { + bio->bi_bdev = s->cow->bdev; + return DM_MAPIO_REMAPPED; + } + + chunk = sector_to_chunk(s->store, bio->bi_sector); + + /* Full snapshots are not usable */ + /* To get here the table must be live so s->active is always set. */ + if (!s->valid) + return -EIO; + + /* FIXME: should only take write lock if we need + * to copy an exception */ + down_write(&s->lock); + + if (!s->valid) { + r = -EIO; + goto out_unlock; + } + + /* If the block is already remapped - use that, else remap it */ + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + remap_exception(s, e, bio, chunk); + goto out_unlock; + } + + /* + * Write to snapshot - higher level takes care of RW/RO + * flags so we should only get this if we are + * writeable. + */ + if (bio_rw(bio) == WRITE) { + pe = __lookup_pending_exception(s, chunk); + if (!pe) { + up_write(&s->lock); + pe = alloc_pending_exception(s); + down_write(&s->lock); + + if (!s->valid) { + free_pending_exception(pe); + r = -EIO; + goto out_unlock; + } + + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + free_pending_exception(pe); + remap_exception(s, e, bio, chunk); + goto out_unlock; + } + + pe = __find_pending_exception(s, pe, chunk); + if (!pe) { + __invalidate_snapshot(s, -ENOMEM); + r = -EIO; + goto out_unlock; + } + } + + remap_exception(s, &pe->e, bio, chunk); + + r = DM_MAPIO_SUBMITTED; + + if (!pe->started && + bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { + pe->started = 1; + up_write(&s->lock); + start_full_bio(pe, bio); + goto out; + } + + bio_list_add(&pe->snapshot_bios, bio); + + if (!pe->started) { + /* this is protected by snap->lock */ + pe->started = 1; + up_write(&s->lock); + start_copy(pe); + goto out; + } + } else { + bio->bi_bdev = s->origin->bdev; + map_context->ptr = track_chunk(s, chunk); + } + +out_unlock: + up_write(&s->lock); +out: + return r; +} + +/* + * A snapshot-merge target behaves like a combination of a snapshot + * target and a snapshot-origin target. It only generates new + * exceptions in other snapshots and not in the one that is being + * merged. + * + * For each chunk, if there is an existing exception, it is used to + * redirect I/O to the cow device. Otherwise I/O is sent to the origin, + * which in turn might generate exceptions in other snapshots. 
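+ * (do_origin() handles that case by walking every snapshot registered
+ * against the origin device.)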
+ * If merging is currently taking place on the chunk in question, the + * I/O is deferred by adding it to s->bios_queued_during_merge. + */ +static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dm_exception *e; + struct dm_snapshot *s = ti->private; + int r = DM_MAPIO_REMAPPED; + chunk_t chunk; + + if (bio->bi_rw & REQ_FLUSH) { + if (!map_context->target_request_nr) + bio->bi_bdev = s->origin->bdev; + else + bio->bi_bdev = s->cow->bdev; + map_context->ptr = NULL; + return DM_MAPIO_REMAPPED; + } + + chunk = sector_to_chunk(s->store, bio->bi_sector); + + down_write(&s->lock); + + /* Full merging snapshots are redirected to the origin */ + if (!s->valid) + goto redirect_to_origin; + + /* If the block is already remapped - use that */ + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + /* Queue writes overlapping with chunks being merged */ + if (bio_rw(bio) == WRITE && + chunk >= s->first_merging_chunk && + chunk < (s->first_merging_chunk + + s->num_merging_chunks)) { + bio->bi_bdev = s->origin->bdev; + bio_list_add(&s->bios_queued_during_merge, bio); + r = DM_MAPIO_SUBMITTED; + goto out_unlock; + } + + remap_exception(s, e, bio, chunk); + + if (bio_rw(bio) == WRITE) + map_context->ptr = track_chunk(s, chunk); + goto out_unlock; + } + +redirect_to_origin: + bio->bi_bdev = s->origin->bdev; + + if (bio_rw(bio) == WRITE) { + up_write(&s->lock); + return do_origin(s->origin, bio); + } + +out_unlock: + up_write(&s->lock); + + return r; +} + +static int snapshot_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct dm_snapshot *s = ti->private; + struct dm_snap_tracked_chunk *c = map_context->ptr; + + if (c) + stop_tracking_chunk(s, c); + + return 0; +} + +static void snapshot_merge_presuspend(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + + stop_merge(s); +} + +static int snapshot_preresume(struct dm_target *ti) +{ + int r = 0; + struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest) { + down_read(&snap_src->lock); + if (s == snap_src) { + DMERR("Unable to resume snapshot source until " + "handover completes."); + r = -EINVAL; + } else if (!dm_suspended(snap_src->ti)) { + DMERR("Unable to perform snapshot handover until " + "source is suspended."); + r = -EINVAL; + } + up_read(&snap_src->lock); + } + up_read(&_origins_lock); + + return r; +} + +static void snapshot_resume(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest) { + down_write(&snap_src->lock); + down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + __handover_exceptions(snap_src, snap_dest); + up_write(&snap_dest->lock); + up_write(&snap_src->lock); + } + up_read(&_origins_lock); + + /* Now we have correct chunk size, reregister */ + reregister_snapshot(s); + + down_write(&s->lock); + s->active = 1; + up_write(&s->lock); +} + +static sector_t get_origin_minimum_chunksize(struct block_device *bdev) +{ + sector_t min_chunksize; + + down_read(&_origins_lock); + min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); + up_read(&_origins_lock); + + return min_chunksize; +} + +static void snapshot_merge_resume(struct dm_target *ti) 
+{ + struct dm_snapshot *s = ti->private; + + /* + * Handover exceptions from existing snapshot. + */ + snapshot_resume(ti); + + /* + * snapshot-merge acts as an origin, so set ti->split_io + */ + ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); + + start_merge(s); +} + +static int snapshot_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + unsigned sz = 0; + struct dm_snapshot *snap = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + + down_write(&snap->lock); + + if (!snap->valid) + DMEMIT("Invalid"); + else if (snap->merge_failed) + DMEMIT("Merge failed"); + else { + if (snap->store->type->usage) { + sector_t total_sectors, sectors_allocated, + metadata_sectors; + snap->store->type->usage(snap->store, + &total_sectors, + &sectors_allocated, + &metadata_sectors); + DMEMIT("%llu/%llu %llu", + (unsigned long long)sectors_allocated, + (unsigned long long)total_sectors, + (unsigned long long)metadata_sectors); + } + else + DMEMIT("Unknown"); + } + + up_write(&snap->lock); + + break; + + case STATUSTYPE_TABLE: + /* + * kdevname returns a static pointer so we need + * to make private copies if the output is to + * make sense. + */ + DMEMIT("%s %s", snap->origin->name, snap->cow->name); + snap->store->type->status(snap->store, type, result + sz, + maxlen - sz); + break; + } + + return 0; +} + +static int snapshot_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_snapshot *snap = ti->private; + int r; + + r = fn(ti, snap->origin, 0, ti->len, data); + + if (!r) + r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); + + return r; +} + + +/*----------------------------------------------------------------- + * Origin methods + *---------------------------------------------------------------*/ + +/* + * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any + * supplied bio was ignored. The caller may submit it immediately. + * (No remapping actually occurs as the origin is always a direct linear + * map.) + * + * If further exceptions are required, DM_MAPIO_SUBMITTED is returned + * and any supplied bio is added to a list to be submitted once all + * the necessary exceptions exist. + */ +static int __origin_write(struct list_head *snapshots, sector_t sector, + struct bio *bio) +{ + int r = DM_MAPIO_REMAPPED; + struct dm_snapshot *snap; + struct dm_exception *e; + struct dm_snap_pending_exception *pe; + struct dm_snap_pending_exception *pe_to_start_now = NULL; + struct dm_snap_pending_exception *pe_to_start_last = NULL; + chunk_t chunk; + + /* Do all the snapshots on this origin */ + list_for_each_entry (snap, snapshots, list) { + /* + * Don't make new exceptions in a merging snapshot + * because it has effectively been deleted + */ + if (dm_target_is_snapshot_merge(snap->ti)) + continue; + + down_write(&snap->lock); + + /* Only deal with valid and active snapshots */ + if (!snap->valid || !snap->active) + goto next_snapshot; + + /* Nothing to do if writing beyond end of snapshot */ + if (sector >= dm_table_get_size(snap->ti->table)) + goto next_snapshot; + + /* + * Remember, different snapshots can have + * different chunk sizes. + */ + chunk = sector_to_chunk(snap->store, sector); + + /* + * Check exception table to see if block + * is already remapped in this snapshot + * and trigger an exception if not.
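+ * An origin write is only passed down (or resubmitted) once every
+ * valid, active, non-merging snapshot on this origin has an exception
+ * covering the chunk, i.e. once the old data has been copied to each
+ * cow device.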
+ */ + e = dm_lookup_exception(&snap->complete, chunk); + if (e) + goto next_snapshot; + + pe = __lookup_pending_exception(snap, chunk); + if (!pe) { + up_write(&snap->lock); + pe = alloc_pending_exception(snap); + down_write(&snap->lock); + + if (!snap->valid) { + free_pending_exception(pe); + goto next_snapshot; + } + + e = dm_lookup_exception(&snap->complete, chunk); + if (e) { + free_pending_exception(pe); + goto next_snapshot; + } + + pe = __find_pending_exception(snap, pe, chunk); + if (!pe) { + __invalidate_snapshot(snap, -ENOMEM); + goto next_snapshot; + } + } + + r = DM_MAPIO_SUBMITTED; + + /* + * If an origin bio was supplied, queue it to wait for the + * completion of this exception, and start this one last, + * at the end of the function. + */ + if (bio) { + bio_list_add(&pe->origin_bios, bio); + bio = NULL; + + if (!pe->started) { + pe->started = 1; + pe_to_start_last = pe; + } + } + + if (!pe->started) { + pe->started = 1; + pe_to_start_now = pe; + } + +next_snapshot: + up_write(&snap->lock); + + if (pe_to_start_now) { + start_copy(pe_to_start_now); + pe_to_start_now = NULL; + } + } + + /* + * Submit the exception against which the bio is queued last, + * to give the other exceptions a head start. + */ + if (pe_to_start_last) + start_copy(pe_to_start_last); + + return r; +} + +/* + * Called on a write from the origin driver. + */ +static int do_origin(struct dm_dev *origin, struct bio *bio) +{ + struct origin *o; + int r = DM_MAPIO_REMAPPED; + + down_read(&_origins_lock); + o = __lookup_origin(origin->bdev); + if (o) + r = __origin_write(&o->snapshots, bio->bi_sector, bio); + up_read(&_origins_lock); + + return r; +} + +/* + * Trigger exceptions in all non-merging snapshots. + * + * The chunk size of the merging snapshot may be larger than the chunk + * size of some other snapshot so we may need to reallocate multiple + * chunks in other snapshots. + * + * We scan all the overlapping exceptions in the other snapshots. + * Returns 1 if anything was reallocated and must be waited for, + * otherwise returns 0. + * + * size must be a multiple of merging_snap's chunk_size. + */ +static int origin_write_extent(struct dm_snapshot *merging_snap, + sector_t sector, unsigned size) +{ + int must_wait = 0; + sector_t n; + struct origin *o; + + /* + * The origin's __minimum_chunk_size() got stored in split_io + * by snapshot_merge_resume(). + */ + down_read(&_origins_lock); + o = __lookup_origin(merging_snap->origin->bdev); + for (n = 0; n < size; n += merging_snap->ti->split_io) + if (__origin_write(&o->snapshots, sector + n, NULL) == + DM_MAPIO_SUBMITTED) + must_wait = 1; + up_read(&_origins_lock); + + return must_wait; +} + +/* + * Origin: maps a linear range of a device, with hooks for snapshotting. + */ + +/* + * Construct an origin mapping: <dev_path> + * The context for an origin is merely a 'struct dm_dev *' + * pointing to the real device. 
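+ *
+ * For example, a hypothetical /dev/sdX1 could be mapped with:
+ *   echo "0 $(blockdev --getsz /dev/sdX1) snapshot-origin /dev/sdX1" | \
+ *       dmsetup create myorigin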
+ */ +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + struct dm_dev *dev; + + if (argc != 1) { + ti->error = "origin: incorrect number of arguments"; + return -EINVAL; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); + if (r) { + ti->error = "Cannot get target device"; + return r; + } + + ti->private = dev; + ti->num_flush_requests = 1; + + return 0; +} + +static void origin_dtr(struct dm_target *ti) +{ + struct dm_dev *dev = ti->private; + dm_put_device(ti, dev); +} + +static int origin_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dm_dev *dev = ti->private; + bio->bi_bdev = dev->bdev; + + if (bio->bi_rw & REQ_FLUSH) + return DM_MAPIO_REMAPPED; + + /* Only tell snapshots if this is a write */ + return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; +} + +/* + * Set the target "split_io" field to the minimum of all the snapshots' + * chunk sizes. + */ +static void origin_resume(struct dm_target *ti) +{ + struct dm_dev *dev = ti->private; + + ti->split_io = get_origin_minimum_chunksize(dev->bdev); +} + +static int origin_status(struct dm_target *ti, status_type_t type, char *result, + unsigned int maxlen) +{ + struct dm_dev *dev = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s", dev->name); + break; + } + + return 0; +} + +static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct dm_dev *dev = ti->private; + struct request_queue *q = bdev_get_queue(dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = dev->bdev; + bvm->bi_sector = bvm->bi_sector; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int origin_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_dev *dev = ti->private; + + return fn(ti, dev, 0, ti->len, data); +} + +static struct target_type origin_target = { + .name = "snapshot-origin", + .version = {1, 7, 1}, + .module = THIS_MODULE, + .ctr = origin_ctr, + .dtr = origin_dtr, + .map = origin_map, + .resume = origin_resume, + .status = origin_status, + .merge = origin_merge, + .iterate_devices = origin_iterate_devices, +}; + +static struct target_type snapshot_target = { + .name = "snapshot", + .version = {1, 10, 0}, + .module = THIS_MODULE, + .ctr = snapshot_ctr, + .dtr = snapshot_dtr, + .map = snapshot_map, + .end_io = snapshot_end_io, + .preresume = snapshot_preresume, + .resume = snapshot_resume, + .status = snapshot_status, + .iterate_devices = snapshot_iterate_devices, +}; + +static struct target_type merge_target = { + .name = dm_snapshot_merge_target_name, + .version = {1, 1, 0}, + .module = THIS_MODULE, + .ctr = snapshot_ctr, + .dtr = snapshot_dtr, + .map = snapshot_merge_map, + .end_io = snapshot_end_io, + .presuspend = snapshot_merge_presuspend, + .preresume = snapshot_preresume, + .resume = snapshot_merge_resume, + .status = snapshot_status, + .iterate_devices = snapshot_iterate_devices, +}; + +static int __init dm_snapshot_init(void) +{ + int r; + + r = dm_exception_store_init(); + if (r) { + DMERR("Failed to initialize exception stores"); + return r; + } + + r = dm_register_target(&snapshot_target); + if (r < 0) { + DMERR("snapshot target register failed %d", r); + goto bad_register_snapshot_target; + } + + r = dm_register_target(&origin_target); + if (r < 0) { + DMERR("Origin target 
register failed %d", r); + goto bad_register_origin_target; + } + + r = dm_register_target(&merge_target); + if (r < 0) { + DMERR("Merge target register failed %d", r); + goto bad_register_merge_target; + } + + r = init_origin_hash(); + if (r) { + DMERR("init_origin_hash failed."); + goto bad_origin_hash; + } + + exception_cache = KMEM_CACHE(dm_exception, 0); + if (!exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad_exception_cache; + } + + pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); + if (!pending_cache) { + DMERR("Couldn't create pending cache."); + r = -ENOMEM; + goto bad_pending_cache; + } + + tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); + if (!tracked_chunk_cache) { + DMERR("Couldn't create cache to track chunks in use."); + r = -ENOMEM; + goto bad_tracked_chunk_cache; + } + + return 0; + +bad_tracked_chunk_cache: + kmem_cache_destroy(pending_cache); +bad_pending_cache: + kmem_cache_destroy(exception_cache); +bad_exception_cache: + exit_origin_hash(); +bad_origin_hash: + dm_unregister_target(&merge_target); +bad_register_merge_target: + dm_unregister_target(&origin_target); +bad_register_origin_target: + dm_unregister_target(&snapshot_target); +bad_register_snapshot_target: + dm_exception_store_exit(); + + return r; +} + +static void __exit dm_snapshot_exit(void) +{ + dm_unregister_target(&snapshot_target); + dm_unregister_target(&origin_target); + dm_unregister_target(&merge_target); + + exit_origin_hash(); + kmem_cache_destroy(pending_cache); + kmem_cache_destroy(exception_cache); + kmem_cache_destroy(tracked_chunk_cache); + + dm_exception_store_exit(); +} + +/* Module hooks */ +module_init(dm_snapshot_init); +module_exit(dm_snapshot_exit); + +MODULE_DESCRIPTION(DM_NAME " snapshot target"); +MODULE_AUTHOR("Joe Thornber"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c new file mode 100644 index 00000000..35c94ff2 --- /dev/null +++ b/drivers/md/dm-stripe.c @@ -0,0 +1,450 @@ +/* + * Copyright (C) 2001-2003 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include <linux/device-mapper.h> + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/log2.h> + +#define DM_MSG_PREFIX "striped" +#define DM_IO_ERROR_THRESHOLD 15 + +struct stripe { + struct dm_dev *dev; + sector_t physical_start; + + atomic_t error_count; +}; + +struct stripe_c { + uint32_t stripes; + int stripes_shift; + sector_t stripes_mask; + + /* The size of this target / num. stripes */ + sector_t stripe_width; + + /* stripe chunk size */ + uint32_t chunk_shift; + sector_t chunk_mask; + + /* Needed for handling events */ + struct dm_target *ti; + + /* Work struct used for triggering events*/ + struct work_struct trigger_event; + + struct stripe stripe[0]; +}; + +/* + * An event is triggered whenever a drive + * drops out of a stripe volume. 
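+ * (stripe_end_io() schedules this work when I/O errors are seen on one
+ * of the stripe devices; userspace can pick the event up with
+ * "dmsetup wait".)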
+ */ +static void trigger_event(struct work_struct *work) +{ + struct stripe_c *sc = container_of(work, struct stripe_c, + trigger_event); + dm_table_event(sc->ti->table); +} + +static inline struct stripe_c *alloc_context(unsigned int stripes) +{ + size_t len; + + if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), + stripes)) + return NULL; + + len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); + + return kmalloc(len, GFP_KERNEL); +} + +/* + * Parse a single <dev> <sector> pair + */ +static int get_stripe(struct dm_target *ti, struct stripe_c *sc, + unsigned int stripe, char **argv) +{ + unsigned long long start; + char dummy; + + if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) + return -EINVAL; + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &sc->stripe[stripe].dev)) + return -ENXIO; + + sc->stripe[stripe].physical_start = start; + + return 0; +} + +/* + * Construct a striped mapping. + * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+ + */ +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct stripe_c *sc; + sector_t width; + uint32_t stripes; + uint32_t chunk_size; + char *end; + int r; + unsigned int i; + + if (argc < 2) { + ti->error = "Not enough arguments"; + return -EINVAL; + } + + stripes = simple_strtoul(argv[0], &end, 10); + if (!stripes || *end) { + ti->error = "Invalid stripe count"; + return -EINVAL; + } + + chunk_size = simple_strtoul(argv[1], &end, 10); + if (*end) { + ti->error = "Invalid chunk_size"; + return -EINVAL; + } + + /* + * chunk_size is a power of two + */ + if (!is_power_of_2(chunk_size) || + (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { + ti->error = "Invalid chunk size"; + return -EINVAL; + } + + if (ti->len & (chunk_size - 1)) { + ti->error = "Target length not divisible by " + "chunk size"; + return -EINVAL; + } + + width = ti->len; + if (sector_div(width, stripes)) { + ti->error = "Target length not divisible by " + "number of stripes"; + return -EINVAL; + } + + /* + * Do we have enough arguments for that many stripes ? + */ + if (argc != (2 + 2 * stripes)) { + ti->error = "Not enough destinations " + "specified"; + return -EINVAL; + } + + sc = alloc_context(stripes); + if (!sc) { + ti->error = "Memory allocation for striped context " + "failed"; + return -ENOMEM; + } + + INIT_WORK(&sc->trigger_event, trigger_event); + + /* Set pointer to dm target; used in trigger_event */ + sc->ti = ti; + sc->stripes = stripes; + sc->stripe_width = width; + + if (stripes & (stripes - 1)) + sc->stripes_shift = -1; + else { + sc->stripes_shift = ffs(stripes) - 1; + sc->stripes_mask = ((sector_t) stripes) - 1; + } + + ti->split_io = chunk_size; + ti->num_flush_requests = stripes; + ti->num_discard_requests = stripes; + + sc->chunk_shift = ffs(chunk_size) - 1; + sc->chunk_mask = ((sector_t) chunk_size) - 1; + + /* + * Get the stripe destinations. 
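+ * argv still points at the stripe count here, so each loop iteration
+ * below first steps argv forward two slots to reach the next
+ * <dev_path> <offset> pair, e.g. ("/dev/sdX1", "0") then ("/dev/sdY1", "0")
+ * for a two-way stripe (hypothetical devices).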
+ */ + for (i = 0; i < stripes; i++) { + argv += 2; + + r = get_stripe(ti, sc, i, argv); + if (r < 0) { + ti->error = "Couldn't parse stripe destination"; + while (i--) + dm_put_device(ti, sc->stripe[i].dev); + kfree(sc); + return r; + } + atomic_set(&(sc->stripe[i].error_count), 0); + } + + ti->private = sc; + + return 0; +} + +static void stripe_dtr(struct dm_target *ti) +{ + unsigned int i; + struct stripe_c *sc = (struct stripe_c *) ti->private; + + for (i = 0; i < sc->stripes; i++) + dm_put_device(ti, sc->stripe[i].dev); + + flush_work_sync(&sc->trigger_event); + kfree(sc); +} + +static void stripe_map_sector(struct stripe_c *sc, sector_t sector, + uint32_t *stripe, sector_t *result) +{ + sector_t offset = dm_target_offset(sc->ti, sector); + sector_t chunk = offset >> sc->chunk_shift; + + if (sc->stripes_shift < 0) + *stripe = sector_div(chunk, sc->stripes); + else { + *stripe = chunk & sc->stripes_mask; + chunk >>= sc->stripes_shift; + } + + *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); +} + +static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, + uint32_t target_stripe, sector_t *result) +{ + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, result); + if (stripe == target_stripe) + return; + *result &= ~sc->chunk_mask; /* round down */ + if (target_stripe < stripe) + *result += sc->chunk_mask + 1; /* next chunk */ +} + +static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, + uint32_t target_stripe) +{ + sector_t begin, end; + + stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); + stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), + target_stripe, &end); + if (begin < end) { + bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; + bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; + bio->bi_size = to_bytes(end - begin); + return DM_MAPIO_REMAPPED; + } else { + /* The range doesn't map to the target stripe */ + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } +} + +static int stripe_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct stripe_c *sc = ti->private; + uint32_t stripe; + unsigned target_request_nr; + + if (bio->bi_rw & REQ_FLUSH) { + target_request_nr = map_context->target_request_nr; + BUG_ON(target_request_nr >= sc->stripes); + bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; + return DM_MAPIO_REMAPPED; + } + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + target_request_nr = map_context->target_request_nr; + BUG_ON(target_request_nr >= sc->stripes); + return stripe_map_discard(sc, bio, target_request_nr); + } + + stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); + + bio->bi_sector += sc->stripe[stripe].physical_start; + bio->bi_bdev = sc->stripe[stripe].dev->bdev; + + return DM_MAPIO_REMAPPED; +} + +/* + * Stripe status: + * + * INFO + * #stripes [stripe_name <stripe_name>] [group word count] + * [error count 'A|D' <error count 'A|D'>] + * + * TABLE + * #stripes [stripe chunk size] + * [stripe_name physical_start <stripe_name physical_start>] + * + */ + +static int stripe_status(struct dm_target *ti, + status_type_t type, char *result, unsigned int maxlen) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + char buffer[sc->stripes + 1]; + unsigned int sz = 0; + unsigned int i; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", sc->stripes); + for (i = 0; i < sc->stripes; i++) { + DMEMIT("%s ", sc->stripe[i].dev->name); + buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ? 
+ 'D' : 'A'; + } + buffer[i] = '\0'; + DMEMIT("1 %s", buffer); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%d %llu", sc->stripes, + (unsigned long long)sc->chunk_mask + 1); + for (i = 0; i < sc->stripes; i++) + DMEMIT(" %s %llu", sc->stripe[i].dev->name, + (unsigned long long)sc->stripe[i].physical_start); + break; + } + return 0; +} + +static int stripe_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + unsigned i; + char major_minor[16]; + struct stripe_c *sc = ti->private; + + if (!error) + return 0; /* I/O complete */ + + if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) + return error; + + if (error == -EOPNOTSUPP) + return error; + + memset(major_minor, 0, sizeof(major_minor)); + sprintf(major_minor, "%d:%d", + MAJOR(disk_devt(bio->bi_bdev->bd_disk)), + MINOR(disk_devt(bio->bi_bdev->bd_disk))); + + /* + * Test to see which stripe drive triggered the event + * and increment error count for all stripes on that device. + * If the error count for a given device exceeds the threshold + * value we will no longer trigger any further events. + */ + for (i = 0; i < sc->stripes; i++) + if (!strcmp(sc->stripe[i].dev->name, major_minor)) { + atomic_inc(&(sc->stripe[i].error_count)); + if (atomic_read(&(sc->stripe[i].error_count)) < + DM_IO_ERROR_THRESHOLD) + schedule_work(&sc->trigger_event); + } + + return error; +} + +static int stripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct stripe_c *sc = ti->private; + int ret = 0; + unsigned i = 0; + + do { + ret = fn(ti, sc->stripe[i].dev, + sc->stripe[i].physical_start, + sc->stripe_width, data); + } while (!ret && ++i < sc->stripes); + + return ret; +} + +static void stripe_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + struct stripe_c *sc = ti->private; + unsigned chunk_size = (sc->chunk_mask + 1) << 9; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * sc->stripes); +} + +static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct stripe_c *sc = ti->private; + sector_t bvm_sector = bvm->bi_sector; + uint32_t stripe; + struct request_queue *q; + + stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector); + + q = bdev_get_queue(sc->stripe[stripe].dev->bdev); + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = sc->stripe[stripe].dev->bdev; + bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static struct target_type stripe_target = { + .name = "striped", + .version = {1, 4, 0}, + .module = THIS_MODULE, + .ctr = stripe_ctr, + .dtr = stripe_dtr, + .map = stripe_map, + .end_io = stripe_end_io, + .status = stripe_status, + .iterate_devices = stripe_iterate_devices, + .io_hints = stripe_io_hints, + .merge = stripe_merge, +}; + +int __init dm_stripe_init(void) +{ + int r; + + r = dm_register_target(&stripe_target); + if (r < 0) { + DMWARN("target registration failed"); + return r; + } + + return r; +} + +void dm_stripe_exit(void) +{ + dm_unregister_target(&stripe_target); +} diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c new file mode 100644 index 00000000..84d2b91e --- /dev/null +++ b/drivers/md/dm-sysfs.c @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. 
+ */ + +#include <linux/sysfs.h> +#include <linux/dm-ioctl.h> +#include "dm.h" + +struct dm_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct mapped_device *, char *); + ssize_t (*store)(struct mapped_device *, char *); +}; + +#define DM_ATTR_RO(_name) \ +struct dm_sysfs_attr dm_attr_##_name = \ + __ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL) + +static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct dm_sysfs_attr *dm_attr; + struct mapped_device *md; + ssize_t ret; + + dm_attr = container_of(attr, struct dm_sysfs_attr, attr); + if (!dm_attr->show) + return -EIO; + + md = dm_get_from_kobject(kobj); + if (!md) + return -EINVAL; + + ret = dm_attr->show(md, page); + dm_put(md); + + return ret; +} + +static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf) +{ + if (dm_copy_name_and_uuid(md, buf, NULL)) + return -EIO; + + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) +{ + if (dm_copy_name_and_uuid(md, NULL, buf)) + return -EIO; + + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) +{ + sprintf(buf, "%d\n", dm_suspended_md(md)); + + return strlen(buf); +} + +static DM_ATTR_RO(name); +static DM_ATTR_RO(uuid); +static DM_ATTR_RO(suspended); + +static struct attribute *dm_attrs[] = { + &dm_attr_name.attr, + &dm_attr_uuid.attr, + &dm_attr_suspended.attr, + NULL, +}; + +static const struct sysfs_ops dm_sysfs_ops = { + .show = dm_attr_show, +}; + +/* + * dm kobject is embedded in mapped_device structure + * no need to define release function here + */ +static struct kobj_type dm_ktype = { + .sysfs_ops = &dm_sysfs_ops, + .default_attrs = dm_attrs, +}; + +/* + * Initialize kobj + * because nobody using md yet, no need to call explicit dm_get/put + */ +int dm_sysfs_init(struct mapped_device *md) +{ + return kobject_init_and_add(dm_kobject(md), &dm_ktype, + &disk_to_dev(dm_disk(md))->kobj, + "%s", "dm"); +} + +/* + * Remove kobj, called after all references removed + */ +void dm_sysfs_exit(struct mapped_device *md) +{ + kobject_put(dm_kobject(md)); +} diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c new file mode 100644 index 00000000..2e227fbf --- /dev/null +++ b/drivers/md/dm-table.c @@ -0,0 +1,1577 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/namei.h> +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/mutex.h> +#include <linux/delay.h> +#include <linux/atomic.h> + +#define DM_MSG_PREFIX "table" + +#define MAX_DEPTH 16 +#define NODE_SIZE L1_CACHE_BYTES +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) + +/* + * The table has always exactly one reference from either mapped_device->map + * or hash_cell->new_map. This reference is not counted in table->holders. + * A pair of dm_create_table/dm_destroy_table functions is used for table + * creation/destruction. + * + * Temporary references from the other code increase table->holders. A pair + * of dm_table_get/dm_table_put functions is used to manipulate it. + * + * When the table is about to be destroyed, we wait for table->holders to + * drop to zero. 
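+ * (dm_table_destroy() simply polls t->holders with msleep(1) until the
+ * count reaches zero before freeing anything.)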
+ */ + +struct dm_table { + struct mapped_device *md; + atomic_t holders; + unsigned type; + + /* btree table */ + unsigned int depth; + unsigned int counts[MAX_DEPTH]; /* in nodes */ + sector_t *index[MAX_DEPTH]; + + unsigned int num_targets; + unsigned int num_allocated; + sector_t *highs; + struct dm_target *targets; + + struct target_type *immutable_target_type; + unsigned integrity_supported:1; + unsigned singleton:1; + + /* + * Indicates the rw permissions for the new logical + * device. This should be a combination of FMODE_READ + * and FMODE_WRITE. + */ + fmode_t mode; + + /* a list of devices used by this table */ + struct list_head devices; + + /* events get handed up using this callback */ + void (*event_fn)(void *); + void *event_context; + + struct dm_md_mempools *mempools; + + struct list_head target_callbacks; +}; + +/* + * Similar to ceiling(log_size(n)) + */ +static unsigned int int_log(unsigned int n, unsigned int base) +{ + int result = 0; + + while (n > 1) { + n = dm_div_up(n, base); + result++; + } + + return result; +} + +/* + * Calculate the index of the child node of the n'th node k'th key. + */ +static inline unsigned int get_child(unsigned int n, unsigned int k) +{ + return (n * CHILDREN_PER_NODE) + k; +} + +/* + * Return the n'th node of level l from table t. + */ +static inline sector_t *get_node(struct dm_table *t, + unsigned int l, unsigned int n) +{ + return t->index[l] + (n * KEYS_PER_NODE); +} + +/* + * Return the highest key that you could lookup from the n'th + * node on level l of the btree. + */ +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) +{ + for (; l < t->depth - 1; l++) + n = get_child(n, CHILDREN_PER_NODE - 1); + + if (n >= t->counts[l]) + return (sector_t) - 1; + + return get_node(t, l, n)[KEYS_PER_NODE - 1]; +} + +/* + * Fills in a level of the btree based on the highs of the level + * below it. + */ +static int setup_btree_index(unsigned int l, struct dm_table *t) +{ + unsigned int n, k; + sector_t *node; + + for (n = 0U; n < t->counts[l]; n++) { + node = get_node(t, l, n); + + for (k = 0U; k < KEYS_PER_NODE; k++) + node[k] = high(t, l + 1, get_child(n, k)); + } + + return 0; +} + +void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) +{ + unsigned long size; + void *addr; + + /* + * Check that we're not going to overflow. + */ + if (nmemb > (ULONG_MAX / elem_size)) + return NULL; + + size = nmemb * elem_size; + addr = vzalloc(size); + + return addr; +} +EXPORT_SYMBOL(dm_vcalloc); + +/* + * highs, and targets are managed as dynamic arrays during a + * table load. + */ +static int alloc_targets(struct dm_table *t, unsigned int num) +{ + sector_t *n_highs; + struct dm_target *n_targets; + int n = t->num_targets; + + /* + * Allocate both the target array and offset array at once. + * Append an empty entry to catch sectors beyond the end of + * the device. 
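+ * (The memset below fills the unused tail of n_highs with (sector_t)-1
+ * sentinels.)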
+ */ + n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) + + sizeof(sector_t)); + if (!n_highs) + return -ENOMEM; + + n_targets = (struct dm_target *) (n_highs + num); + + if (n) { + memcpy(n_highs, t->highs, sizeof(*n_highs) * n); + memcpy(n_targets, t->targets, sizeof(*n_targets) * n); + } + + memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); + vfree(t->highs); + + t->num_allocated = num; + t->highs = n_highs; + t->targets = n_targets; + + return 0; +} + +int dm_table_create(struct dm_table **result, fmode_t mode, + unsigned num_targets, struct mapped_device *md) +{ + struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); + + if (!t) + return -ENOMEM; + + INIT_LIST_HEAD(&t->devices); + INIT_LIST_HEAD(&t->target_callbacks); + atomic_set(&t->holders, 0); + + if (!num_targets) + num_targets = KEYS_PER_NODE; + + num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + + if (alloc_targets(t, num_targets)) { + kfree(t); + t = NULL; + return -ENOMEM; + } + + t->mode = mode; + t->md = md; + *result = t; + return 0; +} + +static void free_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, devices) { + struct dm_dev_internal *dd = + list_entry(tmp, struct dm_dev_internal, list); + DMWARN("dm_table_destroy: dm_put_device call missing for %s", + dd->dm_dev.name); + kfree(dd); + } +} + +void dm_table_destroy(struct dm_table *t) +{ + unsigned int i; + + if (!t) + return; + + while (atomic_read(&t->holders)) + msleep(1); + smp_mb(); + + /* free the indexes */ + if (t->depth >= 2) + vfree(t->index[t->depth - 2]); + + /* free the targets */ + for (i = 0; i < t->num_targets; i++) { + struct dm_target *tgt = t->targets + i; + + if (tgt->type->dtr) + tgt->type->dtr(tgt); + + dm_put_target_type(tgt->type); + } + + vfree(t->highs); + + /* free the device list */ + free_devices(&t->devices); + + dm_free_md_mempools(t->mempools); + + kfree(t); +} + +void dm_table_get(struct dm_table *t) +{ + atomic_inc(&t->holders); +} +EXPORT_SYMBOL(dm_table_get); + +void dm_table_put(struct dm_table *t) +{ + if (!t) + return; + + smp_mb__before_atomic_dec(); + atomic_dec(&t->holders); +} +EXPORT_SYMBOL(dm_table_put); + +/* + * Checks to see if we need to extend highs or targets. + */ +static inline int check_space(struct dm_table *t) +{ + if (t->num_targets >= t->num_allocated) + return alloc_targets(t, t->num_allocated * 2); + + return 0; +} + +/* + * See if we've already got a device in the list. + */ +static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) +{ + struct dm_dev_internal *dd; + + list_for_each_entry (dd, l, list) + if (dd->dm_dev.bdev->bd_dev == dev) + return dd; + + return NULL; +} + +/* + * Open a device so we can use it as a map destination. + */ +static int open_dev(struct dm_dev_internal *d, dev_t dev, + struct mapped_device *md) +{ + static char *_claim_ptr = "I belong to device-mapper"; + struct block_device *bdev; + + int r; + + BUG_ON(d->dm_dev.bdev); + + bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + r = bd_link_disk_holder(bdev, dm_disk(md)); + if (r) { + blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); + return r; + } + + d->dm_dev.bdev = bdev; + return 0; +} + +/* + * Close a device that we've been using. 
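alloc_targets() above carves the offset array and the target array out of a single allocation so the two are grown and freed together. A simplified user-space model of that layout (the resize-and-copy path and the extra sentinel slot of the real code are left out):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned long long sector_t;

struct target {
	sector_t begin;
	sector_t len;
};

/* One allocation: num "highs" offsets followed by num targets. */
static int alloc_pair(unsigned num, sector_t **highs, struct target **targets)
{
	void *mem = calloc(num, sizeof(sector_t) + sizeof(struct target));

	if (!mem)
		return -1;

	*highs = mem;
	*targets = (struct target *)(*highs + num);

	/* unused offset slots are poisoned with all-ones, as in dm */
	memset(*highs, -1, num * sizeof(sector_t));
	return 0;
}

int main(void)
{
	sector_t *highs;
	struct target *targets;

	if (alloc_pair(16, &highs, &targets))
		return 1;

	targets[0].begin = 0;
	targets[0].len = 1024;
	highs[0] = targets[0].begin + targets[0].len - 1;

	printf("first target covers sectors 0..%llu\n", highs[0]);
	free(highs);		/* frees both arrays at once */
	return 0;
}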
+ */ +static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) +{ + if (!d->dm_dev.bdev) + return; + + bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); + blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); + d->dm_dev.bdev = NULL; +} + +/* + * If possible, this checks an area of a destination device is invalid. + */ +static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q; + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; + sector_t dev_size = + i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + unsigned short logical_block_size_sectors = + limits->logical_block_size >> SECTOR_SHIFT; + char b[BDEVNAME_SIZE]; + + /* + * Some devices exist without request functions, + * such as loop devices not yet bound to backing files. + * Forbid the use of such devices. + */ + q = bdev_get_queue(bdev); + if (!q || !q->make_request_fn) { + DMWARN("%s: %s is not yet initialised: " + "start=%llu, len=%llu, dev_size=%llu", + dm_device_name(ti->table->md), bdevname(bdev, b), + (unsigned long long)start, + (unsigned long long)len, + (unsigned long long)dev_size); + return 1; + } + + if (!dev_size) + return 0; + + if ((start >= dev_size) || (start + len > dev_size)) { + DMWARN("%s: %s too small for target: " + "start=%llu, len=%llu, dev_size=%llu", + dm_device_name(ti->table->md), bdevname(bdev, b), + (unsigned long long)start, + (unsigned long long)len, + (unsigned long long)dev_size); + return 1; + } + + if (logical_block_size_sectors <= 1) + return 0; + + if (start & (logical_block_size_sectors - 1)) { + DMWARN("%s: start=%llu not aligned to h/w " + "logical block size %u of %s", + dm_device_name(ti->table->md), + (unsigned long long)start, + limits->logical_block_size, bdevname(bdev, b)); + return 1; + } + + if (len & (logical_block_size_sectors - 1)) { + DMWARN("%s: len=%llu not aligned to h/w " + "logical block size %u of %s", + dm_device_name(ti->table->md), + (unsigned long long)len, + limits->logical_block_size, bdevname(bdev, b)); + return 1; + } + + return 0; +} + +/* + * This upgrades the mode on an already open dm_dev, being + * careful to leave things as they were if we fail to reopen the + * device and not to touch the existing bdev field in case + * it is accessed concurrently inside dm_table_any_congested(). + */ +static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, + struct mapped_device *md) +{ + int r; + struct dm_dev_internal dd_new, dd_old; + + dd_new = dd_old = *dd; + + dd_new.dm_dev.mode |= new_mode; + dd_new.dm_dev.bdev = NULL; + + r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md); + if (r) + return r; + + dd->dm_dev.mode |= new_mode; + close_dev(&dd_old, md); + + return 0; +} + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. 
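device_area_is_invalid() above relies on logical block sizes being powers of two, so "x & (n - 1)" is a cheap x % n when checking that a target's start and length are aligned. A minimal stand-alone version of that test:

#include <stdio.h>

typedef unsigned long long sector_t;

/*
 * Logical block sizes are powers of two, so the low-bits mask tells
 * us whether a sector offset sits on a logical-block boundary.
 */
static int misaligned(sector_t x, unsigned logical_block_size_bytes)
{
	unsigned sectors = logical_block_size_bytes >> 9;	/* 512-byte sectors */

	if (sectors <= 1)
		return 0;		/* 512-byte devices are always aligned */

	return (x & (sectors - 1)) != 0;
}

int main(void)
{
	/* a 4096-byte logical block spans 8 sectors */
	printf("start 8,  4k device: %s\n", misaligned(8, 4096) ? "bad" : "ok");
	printf("start 10, 4k device: %s\n", misaligned(10, 4096) ? "bad" : "ok");
	return 0;
}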
+ */ +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) +{ + int r; + dev_t uninitialized_var(dev); + struct dm_dev_internal *dd; + unsigned int major, minor; + struct dm_table *t = ti->table; + char dummy; + + BUG_ON(!t); + + if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { + /* Extract the major/minor numbers */ + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) + return -EOVERFLOW; + } else { + /* convert the path to a device */ + struct block_device *bdev = lookup_bdev(path); + + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + dev = bdev->bd_dev; + bdput(bdev); + } + + dd = find_device(&t->devices, dev); + if (!dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM; + + dd->dm_dev.mode = mode; + dd->dm_dev.bdev = NULL; + + if ((r = open_dev(dd, dev, t->md))) { + kfree(dd); + return r; + } + + format_dev_t(dd->dm_dev.name, dev); + + atomic_set(&dd->count, 0); + list_add(&dd->list, &t->devices); + + } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) { + r = upgrade_mode(dd, mode, t->md); + if (r) + return r; + } + atomic_inc(&dd->count); + + *result = &dd->dm_dev; + return 0; +} +EXPORT_SYMBOL(dm_get_device); + +int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + char b[BDEVNAME_SIZE]; + + if (unlikely(!q)) { + DMWARN("%s: Cannot set limits for nonexistent device %s", + dm_device_name(ti->table->md), bdevname(bdev, b)); + return 0; + } + + if (bdev_stack_limits(limits, bdev, start) < 0) + DMWARN("%s: adding target device %s caused an alignment inconsistency: " + "physical_block_size=%u, logical_block_size=%u, " + "alignment_offset=%u, start=%llu", + dm_device_name(ti->table->md), bdevname(bdev, b), + q->limits.physical_block_size, + q->limits.logical_block_size, + q->limits.alignment_offset, + (unsigned long long) start << SECTOR_SHIFT); + + /* + * Check if merge fn is supported. + * If not we'll force DM to use PAGE_SIZE or + * smaller I/O, just to be safe. + */ + if (dm_queue_merge_is_compulsory(q) && !ti->type->merge) + blk_limits_max_hw_sectors(limits, + (unsigned int) (PAGE_SIZE >> 9)); + return 0; +} +EXPORT_SYMBOL_GPL(dm_set_device_limits); + +/* + * Decrement a device's use count and remove it if necessary. + */ +void dm_put_device(struct dm_target *ti, struct dm_dev *d) +{ + struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal, + dm_dev); + + if (atomic_dec_and_test(&dd->count)) { + close_dev(dd, ti->table->md); + list_del(&dd->list); + kfree(dd); + } +} +EXPORT_SYMBOL(dm_put_device); + +/* + * Checks to see if the target joins onto the end of the table. + */ +static int adjoin(struct dm_table *table, struct dm_target *ti) +{ + struct dm_target *prev; + + if (!table->num_targets) + return !ti->begin; + + prev = &table->targets[table->num_targets - 1]; + return (ti->begin == (prev->begin + prev->len)); +} + +/* + * Used to dynamically allocate the arg array. + */ +static char **realloc_argv(unsigned *array_size, char **old_argv) +{ + char **argv; + unsigned new_size; + + new_size = *array_size ? 
*array_size * 2 : 64; + argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (argv) { + memcpy(argv, old_argv, *array_size * sizeof(*argv)); + *array_size = new_size; + } + + kfree(old_argv); + return argv; +} + +/* + * Destructively splits up the argument list to pass to ctr. + */ +int dm_split_args(int *argc, char ***argvp, char *input) +{ + char *start, *end = input, *out, **argv = NULL; + unsigned array_size = 0; + + *argc = 0; + + if (!input) { + *argvp = NULL; + return 0; + } + + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + + while (1) { + /* Skip whitespace */ + start = skip_spaces(end); + + if (!*start) + break; /* success, we hit the end */ + + /* 'out' is used to remove any back-quotes */ + end = out = start; + while (*end) { + /* Everything apart from '\0' can be quoted */ + if (*end == '\\' && *(end + 1)) { + *out++ = *(end + 1); + end += 2; + continue; + } + + if (isspace(*end)) + break; /* end of token */ + + *out++ = *end++; + } + + /* have we already filled the array ? */ + if ((*argc + 1) > array_size) { + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + } + + /* we know this is whitespace */ + if (*end) + end++; + + /* terminate the string and put it in the array */ + *out = '\0'; + argv[*argc] = start; + (*argc)++; + } + + *argvp = argv; + return 0; +} + +/* + * Impose necessary and sufficient conditions on a devices's table such + * that any incoming bio which respects its logical_block_size can be + * processed successfully. If it falls across the boundary between + * two or more targets, the size of each piece it gets split into must + * be compatible with the logical_block_size of the target processing it. + */ +static int validate_hardware_logical_block_alignment(struct dm_table *table, + struct queue_limits *limits) +{ + /* + * This function uses arithmetic modulo the logical_block_size + * (in units of 512-byte sectors). + */ + unsigned short device_logical_block_size_sects = + limits->logical_block_size >> SECTOR_SHIFT; + + /* + * Offset of the start of the next table entry, mod logical_block_size. + */ + unsigned short next_target_start = 0; + + /* + * Given an aligned bio that extends beyond the end of a + * target, how many sectors must the next target handle? + */ + unsigned short remaining = 0; + + struct dm_target *uninitialized_var(ti); + struct queue_limits ti_limits; + unsigned i = 0; + + /* + * Check each entry in the table in turn. + */ + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + blk_set_stacking_limits(&ti_limits); + + /* combine all target devices' limits */ + if (ti->type->iterate_devices) + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + /* + * If the remaining sectors fall entirely within this + * table entry are they compatible with its logical_block_size? + */ + if (remaining < ti->len && + remaining & ((ti_limits.logical_block_size >> + SECTOR_SHIFT) - 1)) + break; /* Error */ + + next_target_start = + (unsigned short) ((next_target_start + ti->len) & + (device_logical_block_size_sects - 1)); + remaining = next_target_start ? 
+ device_logical_block_size_sects - next_target_start : 0; + } + + if (remaining) { + DMWARN("%s: table line %u (start sect %llu len %llu) " + "not aligned to h/w logical block size %u", + dm_device_name(table->md), i, + (unsigned long long) ti->begin, + (unsigned long long) ti->len, + limits->logical_block_size); + return -EINVAL; + } + + return 0; +} + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params) +{ + int r = -EINVAL, argc; + char **argv; + struct dm_target *tgt; + + if (t->singleton) { + DMERR("%s: target type %s must appear alone in table", + dm_device_name(t->md), t->targets->type->name); + return -EINVAL; + } + + if ((r = check_space(t))) + return r; + + tgt = t->targets + t->num_targets; + memset(tgt, 0, sizeof(*tgt)); + + if (!len) { + DMERR("%s: zero-length target", dm_device_name(t->md)); + return -EINVAL; + } + + tgt->type = dm_get_target_type(type); + if (!tgt->type) { + DMERR("%s: %s: unknown target type", dm_device_name(t->md), + type); + return -EINVAL; + } + + if (dm_target_needs_singleton(tgt->type)) { + if (t->num_targets) { + DMERR("%s: target type %s must appear alone in table", + dm_device_name(t->md), type); + return -EINVAL; + } + t->singleton = 1; + } + + if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) { + DMERR("%s: target type %s may not be included in read-only tables", + dm_device_name(t->md), type); + return -EINVAL; + } + + if (t->immutable_target_type) { + if (t->immutable_target_type != tgt->type) { + DMERR("%s: immutable target type %s cannot be mixed with other target types", + dm_device_name(t->md), t->immutable_target_type->name); + return -EINVAL; + } + } else if (dm_target_is_immutable(tgt->type)) { + if (t->num_targets) { + DMERR("%s: immutable target type %s cannot be mixed with other target types", + dm_device_name(t->md), tgt->type->name); + return -EINVAL; + } + t->immutable_target_type = tgt->type; + } + + tgt->table = t; + tgt->begin = start; + tgt->len = len; + tgt->error = "Unknown error"; + + /* + * Does this target adjoin the previous one ? + */ + if (!adjoin(t, tgt)) { + tgt->error = "Gap in table"; + r = -EINVAL; + goto bad; + } + + r = dm_split_args(&argc, &argv, params); + if (r) { + tgt->error = "couldn't split parameters (insufficient memory)"; + goto bad; + } + + r = tgt->type->ctr(tgt, argc, argv); + kfree(argv); + if (r) + goto bad; + + t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + + if (!tgt->num_discard_requests && tgt->discards_supported) + DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", + dm_device_name(t->md), type); + + return 0; + + bad: + DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error); + dm_put_target_type(tgt->type); + return r; +} + +/* + * Target argument parsing helpers. 
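dm_split_args() earlier in this file tokenizes the constructor parameter string in place, stripping backslash escapes as it goes. The stand-alone sketch below keeps that whitespace/backslash handling but uses a fixed-size argv instead of the reallocating array; the sample table line is only an illustration.

#include <ctype.h>
#include <stdio.h>

static int split_args(char *input, char *argv[], int max)
{
	int argc = 0;
	char *end = input;

	while (argc < max) {
		char *out, *start;

		while (isspace((unsigned char)*end))	/* skip whitespace */
			end++;
		if (!*end)
			break;				/* hit the end */

		start = out = end;
		while (*end) {
			if (*end == '\\' && end[1]) {	/* unquote "\x" */
				*out++ = end[1];
				end += 2;
				continue;
			}
			if (isspace((unsigned char)*end))
				break;			/* end of token */
			*out++ = *end++;
		}
		if (*end)
			end++;				/* step past the delimiter */
		*out = '\0';				/* terminate in place */
		argv[argc++] = start;
	}
	return argc;
}

int main(void)
{
	char line[] = "0 409600 linear /dev/sdb 0";
	char *argv[8];
	int i, argc = split_args(line, argv, 8);

	for (i = 0; i < argc; i++)
		printf("arg[%d] = %s\n", i, argv[i]);
	return 0;
}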
+ */ +static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error, unsigned grouped) +{ + const char *arg_str = dm_shift_arg(arg_set); + char dummy; + + if (!arg_str || + (sscanf(arg_str, "%u%c", value, &dummy) != 1) || + (*value < arg->min) || + (*value > arg->max) || + (grouped && arg_set->argc < *value)) { + *error = arg->error; + return -EINVAL; + } + + return 0; +} + +int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 0); +} +EXPORT_SYMBOL(dm_read_arg); + +int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 1); +} +EXPORT_SYMBOL(dm_read_arg_group); + +const char *dm_shift_arg(struct dm_arg_set *as) +{ + char *r; + + if (as->argc) { + as->argc--; + r = *as->argv; + as->argv++; + return r; + } + + return NULL; +} +EXPORT_SYMBOL(dm_shift_arg); + +void dm_consume_args(struct dm_arg_set *as, unsigned num_args) +{ + BUG_ON(as->argc < num_args); + as->argc -= num_args; + as->argv += num_args; +} +EXPORT_SYMBOL(dm_consume_args); + +static int dm_table_set_type(struct dm_table *t) +{ + unsigned i; + unsigned bio_based = 0, request_based = 0; + struct dm_target *tgt; + struct dm_dev_internal *dd; + struct list_head *devices; + + for (i = 0; i < t->num_targets; i++) { + tgt = t->targets + i; + if (dm_target_request_based(tgt)) + request_based = 1; + else + bio_based = 1; + + if (bio_based && request_based) { + DMWARN("Inconsistent table: different target types" + " can't be mixed up"); + return -EINVAL; + } + } + + if (bio_based) { + /* We must use this table as bio-based */ + t->type = DM_TYPE_BIO_BASED; + return 0; + } + + BUG_ON(!request_based); /* No targets in this table */ + + /* Non-request-stackable devices can't be used for request-based dm */ + devices = dm_table_get_devices(t); + list_for_each_entry(dd, devices, list) { + if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { + DMWARN("table load rejected: including" + " non-request-stackable devices"); + return -EINVAL; + } + } + + /* + * Request-based dm supports only tables that have a single target now. + * To support multiple targets, request splitting support is needed, + * and that needs lots of changes in the block-layer. + * (e.g. request completion process for partial completion.) 
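The dm_arg_set helpers above let a target walk its arguments with dm_shift_arg(), and a leading count can announce a group of following arguments, which is what dm_read_arg_group() validates. A miniature user-space version of that pattern; the feature strings in main() are placeholders, not real target parameters.

#include <stdio.h>
#include <stdlib.h>

struct arg_set {
	unsigned argc;
	char **argv;
};

/* Take the next argument off the front of the set, or NULL if empty. */
static const char *shift_arg(struct arg_set *as)
{
	if (!as->argc)
		return NULL;
	as->argc--;
	return *as->argv++;
}

int main(void)
{
	char *argv[] = { "3", "writeback", "migration_threshold", "1024" };
	struct arg_set as = { 4, argv };
	unsigned nr_features = strtoul(shift_arg(&as), NULL, 10);

	/* the grouped-argument check: the count may not exceed what is left */
	if (nr_features > as.argc) {
		fprintf(stderr, "not enough feature arguments\n");
		return 1;
	}

	while (nr_features--)
		printf("feature arg: %s\n", shift_arg(&as));
	return 0;
}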
+ */ + if (t->num_targets > 1) { + DMWARN("Request-based dm doesn't support multiple targets yet"); + return -EINVAL; + } + + t->type = DM_TYPE_REQUEST_BASED; + + return 0; +} + +unsigned dm_table_get_type(struct dm_table *t) +{ + return t->type; +} + +struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) +{ + return t->immutable_target_type; +} + +bool dm_table_request_based(struct dm_table *t) +{ + return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; +} + +int dm_table_alloc_md_mempools(struct dm_table *t) +{ + unsigned type = dm_table_get_type(t); + + if (unlikely(type == DM_TYPE_NONE)) { + DMWARN("no table type is set, can't allocate mempools"); + return -EINVAL; + } + + t->mempools = dm_alloc_md_mempools(type, t->integrity_supported); + if (!t->mempools) + return -ENOMEM; + + return 0; +} + +void dm_table_free_md_mempools(struct dm_table *t) +{ + dm_free_md_mempools(t->mempools); + t->mempools = NULL; +} + +struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) +{ + return t->mempools; +} + +static int setup_indexes(struct dm_table *t) +{ + int i; + unsigned int total = 0; + sector_t *indexes; + + /* allocate the space for *all* the indexes */ + for (i = t->depth - 2; i >= 0; i--) { + t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); + total += t->counts[i]; + } + + indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE); + if (!indexes) + return -ENOMEM; + + /* set up internal nodes, bottom-up */ + for (i = t->depth - 2; i >= 0; i--) { + t->index[i] = indexes; + indexes += (KEYS_PER_NODE * t->counts[i]); + setup_btree_index(i, t); + } + + return 0; +} + +/* + * Builds the btree to index the map. + */ +static int dm_table_build_index(struct dm_table *t) +{ + int r = 0; + unsigned int leaf_nodes; + + /* how many indexes will the btree have ? */ + leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); + + /* leaf layer has already been set up */ + t->counts[t->depth - 1] = leaf_nodes; + t->index[t->depth - 1] = t->highs; + + if (t->depth >= 2) + r = setup_indexes(t); + + return r; +} + +/* + * Get a disk whose integrity profile reflects the table's profile. + * If %match_all is true, all devices' profiles must match. + * If %match_all is false, all devices must at least have an + * allocated integrity profile; but uninitialized is ok. + * Returns NULL if integrity support was inconsistent or unavailable. + */ +static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, + bool match_all) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd = NULL; + struct gendisk *prev_disk = NULL, *template_disk = NULL; + + list_for_each_entry(dd, devices, list) { + template_disk = dd->dm_dev.bdev->bd_disk; + if (!blk_get_integrity(template_disk)) + goto no_integrity; + if (!match_all && !blk_integrity_is_initialized(template_disk)) + continue; /* skip uninitialized profiles */ + else if (prev_disk && + blk_integrity_compare(prev_disk, template_disk) < 0) + goto no_integrity; + prev_disk = template_disk; + } + + return template_disk; + +no_integrity: + if (prev_disk) + DMWARN("%s: integrity not set: %s and %s profile mismatch", + dm_device_name(t->md), + prev_disk->disk_name, + template_disk->disk_name); + return NULL; +} + +/* + * Register the mapped device for blk_integrity support if + * the underlying devices have an integrity profile. 
But all devices + * may not have matching profiles (checking all devices isn't reliable + * during table load because this table may use other DM device(s) which + * must be resumed before they will have an initialized integity profile). + * Stacked DM devices force a 2 stage integrity profile validation: + * 1 - during load, validate all initialized integrity profiles match + * 2 - during resume, validate all integrity profiles match + */ +static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) +{ + struct gendisk *template_disk = NULL; + + template_disk = dm_table_get_integrity_disk(t, false); + if (!template_disk) + return 0; + + if (!blk_integrity_is_initialized(dm_disk(md))) { + t->integrity_supported = 1; + return blk_integrity_register(dm_disk(md), NULL); + } + + /* + * If DM device already has an initalized integrity + * profile the new profile should not conflict. + */ + if (blk_integrity_is_initialized(template_disk) && + blk_integrity_compare(dm_disk(md), template_disk) < 0) { + DMWARN("%s: conflict with existing integrity profile: " + "%s profile mismatch", + dm_device_name(t->md), + template_disk->disk_name); + return 1; + } + + /* Preserve existing initialized integrity profile */ + t->integrity_supported = 1; + return 0; +} + +/* + * Prepares the table for use by building the indices, + * setting the type, and allocating mempools. + */ +int dm_table_complete(struct dm_table *t) +{ + int r; + + r = dm_table_set_type(t); + if (r) { + DMERR("unable to set table type"); + return r; + } + + r = dm_table_build_index(t); + if (r) { + DMERR("unable to build btrees"); + return r; + } + + r = dm_table_prealloc_integrity(t, t->md); + if (r) { + DMERR("could not register integrity profile."); + return r; + } + + r = dm_table_alloc_md_mempools(t); + if (r) + DMERR("unable to allocate mempools"); + + return r; +} + +static DEFINE_MUTEX(_event_lock); +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context) +{ + mutex_lock(&_event_lock); + t->event_fn = fn; + t->event_context = context; + mutex_unlock(&_event_lock); +} + +void dm_table_event(struct dm_table *t) +{ + /* + * You can no longer call dm_table_event() from interrupt + * context, use a bottom half instead. + */ + BUG_ON(in_interrupt()); + + mutex_lock(&_event_lock); + if (t->event_fn) + t->event_fn(t->event_context); + mutex_unlock(&_event_lock); +} +EXPORT_SYMBOL(dm_table_event); + +sector_t dm_table_get_size(struct dm_table *t) +{ + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; +} +EXPORT_SYMBOL(dm_table_get_size); + +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) +{ + if (index >= t->num_targets) + return NULL; + + return t->targets + index; +} + +/* + * Search the btree for the correct target. + * + * Caller should check returned pointer with dm_target_is_valid() + * to trap I/O beyond end of device. + */ +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) +{ + unsigned int l, n = 0, k = 0; + sector_t *node; + + for (l = 0; l < t->depth; l++) { + n = get_child(n, k); + node = get_node(t, l, n); + + for (k = 0; k < KEYS_PER_NODE; k++) + if (node[k] >= sector) + break; + } + + return &t->targets[(KEYS_PER_NODE * n) + k]; +} + +/* + * Establish the new table's queue_limits and validate them. 
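dm_table_find_target() above descends the index by scanning each node's keys for the first one that is >= the requested sector. The sketch below shows that per-node scan on a single leaf level (the real lookup repeats it once per level, using get_child() to pick the next node); the sector numbers are made up.

#include <stdio.h>

typedef unsigned long long sector_t;

#define KEYS_PER_NODE 8

/* Return the first slot whose "high" key covers the sector. */
static unsigned find_slot(const sector_t *node, sector_t sector)
{
	unsigned k;

	for (k = 0; k < KEYS_PER_NODE; k++)
		if (node[k] >= sector)
			break;
	return k;
}

int main(void)
{
	/* "highs": last sector of each target, all-ones pads unused slots */
	sector_t leaf[KEYS_PER_NODE] = {
		1023, 4095, 16383, (sector_t)-1, (sector_t)-1,
		(sector_t)-1, (sector_t)-1, (sector_t)-1
	};

	printf("sector 0    -> target %u\n", find_slot(leaf, 0));
	printf("sector 2048 -> target %u\n", find_slot(leaf, 2048));
	printf("sector 9999 -> target %u\n", find_slot(leaf, 9999));
	return 0;
}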
+ */ +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits) +{ + struct dm_target *uninitialized_var(ti); + struct queue_limits ti_limits; + unsigned i = 0; + + blk_set_stacking_limits(limits); + + while (i < dm_table_get_num_targets(table)) { + blk_set_stacking_limits(&ti_limits); + + ti = dm_table_get_target(table, i++); + + if (!ti->type->iterate_devices) + goto combine_limits; + + /* + * Combine queue limits of all the devices this target uses. + */ + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + /* Set I/O hints portion of queue limits */ + if (ti->type->io_hints) + ti->type->io_hints(ti, &ti_limits); + + /* + * Check each device area is consistent with the target's + * overall queue limits. + */ + if (ti->type->iterate_devices(ti, device_area_is_invalid, + &ti_limits)) + return -EINVAL; + +combine_limits: + /* + * Merge this target's queue limits into the overall limits + * for the table. + */ + if (blk_stack_limits(limits, &ti_limits, 0) < 0) + DMWARN("%s: adding target device " + "(start sect %llu len %llu) " + "caused an alignment inconsistency", + dm_device_name(table->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + } + + return validate_hardware_logical_block_alignment(table, limits); +} + +/* + * Set the integrity profile for this device if all devices used have + * matching profiles. We're quite deep in the resume path but still + * don't know if all devices (particularly DM devices this device + * may be stacked on) have matching profiles. Even if the profiles + * don't match we have no way to fail (to resume) at this point. + */ +static void dm_table_set_integrity(struct dm_table *t) +{ + struct gendisk *template_disk = NULL; + + if (!blk_get_integrity(dm_disk(t->md))) + return; + + template_disk = dm_table_get_integrity_disk(t, true); + if (template_disk) + blk_integrity_register(dm_disk(t->md), + blk_get_integrity(template_disk)); + else if (blk_integrity_is_initialized(dm_disk(t->md))) + DMWARN("%s: device no longer has a valid integrity profile", + dm_device_name(t->md)); + else + DMWARN("%s: unable to establish an integrity profile", + dm_device_name(t->md)); +} + +static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned flush = (*(unsigned *)data); + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && (q->flush_flags & flush); +} + +static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Require at least one underlying device to support flushes. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting flushes must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_flush_requests) + continue; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_flush_capable, &flush)) + return 1; + } + + return 0; +} + +static bool dm_table_discard_zeroes_data(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + /* Ensure that all targets supports discard_zeroes_data. 
*/ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (ti->discard_zeroes_data_unsupported) + return 0; + } + + return 1; +} + +static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_nonrot(q); +} + +static bool dm_table_is_nonrot(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + /* Ensure that all underlying device are non-rotational. */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->type->iterate_devices || + !ti->type->iterate_devices(ti, device_is_nonrot, NULL)) + return 0; + } + + return 1; +} + +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits) +{ + unsigned flush = 0; + + /* + * Copy table's limits to the DM device's request_queue + */ + q->limits = *limits; + + if (!dm_table_supports_discards(t)) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + + if (dm_table_supports_flush(t, REQ_FLUSH)) { + flush |= REQ_FLUSH; + if (dm_table_supports_flush(t, REQ_FUA)) + flush |= REQ_FUA; + } + blk_queue_flush(q, flush); + + if (!dm_table_discard_zeroes_data(t)) + q->limits.discard_zeroes_data = 0; + + if (dm_table_is_nonrot(t)) + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + else + queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); + + dm_table_set_integrity(t); + + /* + * QUEUE_FLAG_STACKABLE must be set after all queue settings are + * visible to other CPUs because, once the flag is set, incoming bios + * are processed by request-based dm, which refers to the queue + * settings. + * Until the flag set, bios are passed to bio-based dm and queued to + * md->deferred where queue settings are not needed yet. + * Those bios are passed to request-based dm at the resume time. 
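Note the two different aggregation rules used above: flush support needs at least one capable underlying device, while the non-rotational flag is only set when every device is non-rotational. A small illustration of the two folds, with made-up device capabilities:

#include <stdbool.h>
#include <stdio.h>

struct dev_caps {
	bool flush;	/* honours flush requests */
	bool nonrot;	/* non-rotational (SSD) */
};

int main(void)
{
	struct dev_caps devs[] = {
		{ .flush = true,  .nonrot = true  },
		{ .flush = false, .nonrot = true  },
		{ .flush = false, .nonrot = false },
	};
	bool any_flush = false, all_nonrot = true;
	unsigned i;

	for (i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
		any_flush |= devs[i].flush;	/* one capable device is enough */
		all_nonrot &= devs[i].nonrot;	/* one rotational device spoils it */
	}

	printf("issue flushes: %s\n", any_flush ? "yes" : "no");
	printf("mark queue non-rotational: %s\n", all_nonrot ? "yes" : "no");
	return 0;
}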
+ */ + smp_mb(); + if (dm_table_request_based(t)) + queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); +} + +unsigned int dm_table_get_num_targets(struct dm_table *t) +{ + return t->num_targets; +} + +struct list_head *dm_table_get_devices(struct dm_table *t) +{ + return &t->devices; +} + +fmode_t dm_table_get_mode(struct dm_table *t) +{ + return t->mode; +} +EXPORT_SYMBOL(dm_table_get_mode); + +static void suspend_targets(struct dm_table *t, unsigned postsuspend) +{ + int i = t->num_targets; + struct dm_target *ti = t->targets; + + while (i--) { + if (postsuspend) { + if (ti->type->postsuspend) + ti->type->postsuspend(ti); + } else if (ti->type->presuspend) + ti->type->presuspend(ti); + + ti++; + } +} + +void dm_table_presuspend_targets(struct dm_table *t) +{ + if (!t) + return; + + suspend_targets(t, 0); +} + +void dm_table_postsuspend_targets(struct dm_table *t) +{ + if (!t) + return; + + suspend_targets(t, 1); +} + +int dm_table_resume_targets(struct dm_table *t) +{ + int i, r = 0; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (!ti->type->preresume) + continue; + + r = ti->type->preresume(ti); + if (r) + return r; + } + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (ti->type->resume) + ti->type->resume(ti); + } + + return 0; +} + +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + +int dm_table_any_congested(struct dm_table *t, int bdi_bits) +{ + struct dm_dev_internal *dd; + struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; + int r = 0; + + list_for_each_entry(dd, devices, list) { + struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); + char b[BDEVNAME_SIZE]; + + if (likely(q)) + r |= bdi_congested(&q->backing_dev_info, bdi_bits); + else + DMWARN_LIMIT("%s: any_congested: nonexistent device %s", + dm_device_name(t->md), + bdevname(dd->dm_dev.bdev, b)); + } + + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + + return r; +} + +int dm_table_any_busy_target(struct dm_table *t) +{ + unsigned i; + struct dm_target *ti; + + for (i = 0; i < t->num_targets; i++) { + ti = t->targets + i; + if (ti->type->busy && ti->type->busy(ti)) + return 1; + } + + return 0; +} + +struct mapped_device *dm_table_get_md(struct dm_table *t) +{ + return t->md; +} +EXPORT_SYMBOL(dm_table_get_md); + +static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_discard(q); +} + +bool dm_table_supports_discards(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Unless any target used by the table set discards_supported, + * require at least one underlying device to support discards. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting discard selectively must provide. 
+ */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_discard_requests) + continue; + + if (ti->discards_supported) + return 1; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_discard_capable, NULL)) + return 1; + } + + return 0; +} diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c new file mode 100644 index 00000000..8da366cf --- /dev/null +++ b/drivers/md/dm-target.c @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/bio.h> + +#define DM_MSG_PREFIX "target" + +static LIST_HEAD(_targets); +static DECLARE_RWSEM(_lock); + +#define DM_MOD_NAME_SIZE 32 + +static inline struct target_type *__find_target_type(const char *name) +{ + struct target_type *tt; + + list_for_each_entry(tt, &_targets, list) + if (!strcmp(name, tt->name)) + return tt; + + return NULL; +} + +static struct target_type *get_target_type(const char *name) +{ + struct target_type *tt; + + down_read(&_lock); + + tt = __find_target_type(name); + if (tt && !try_module_get(tt->module)) + tt = NULL; + + up_read(&_lock); + return tt; +} + +static void load_module(const char *name) +{ + request_module("dm-%s", name); +} + +struct target_type *dm_get_target_type(const char *name) +{ + struct target_type *tt = get_target_type(name); + + if (!tt) { + load_module(name); + tt = get_target_type(name); + } + + return tt; +} + +void dm_put_target_type(struct target_type *tt) +{ + down_read(&_lock); + module_put(tt->module); + up_read(&_lock); +} + +int dm_target_iterate(void (*iter_func)(struct target_type *tt, + void *param), void *param) +{ + struct target_type *tt; + + down_read(&_lock); + list_for_each_entry(tt, &_targets, list) + iter_func(tt, param); + up_read(&_lock); + + return 0; +} + +int dm_register_target(struct target_type *tt) +{ + int rv = 0; + + down_write(&_lock); + if (__find_target_type(tt->name)) + rv = -EEXIST; + else + list_add(&tt->list, &_targets); + + up_write(&_lock); + return rv; +} + +void dm_unregister_target(struct target_type *tt) +{ + down_write(&_lock); + if (!__find_target_type(tt->name)) { + DMCRIT("Unregistering unrecognised target: %s", tt->name); + BUG(); + } + + list_del(&tt->list); + + up_write(&_lock); +} + +/* + * io-err: always fails an io, useful for bringing + * up LVs that have holes in them. + */ +static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) +{ + /* + * Return error for discards instead of -EOPNOTSUPP + */ + tt->num_discard_requests = 1; + + return 0; +} + +static void io_err_dtr(struct dm_target *tt) +{ + /* empty */ +} + +static int io_err_map(struct dm_target *tt, struct bio *bio, + union map_info *map_context) +{ + return -EIO; +} + +static struct target_type error_target = { + .name = "error", + .version = {1, 0, 1}, + .ctr = io_err_ctr, + .dtr = io_err_dtr, + .map = io_err_map, +}; + +int __init dm_target_init(void) +{ + return dm_register_target(&error_target); +} + +void dm_target_exit(void) +{ + dm_unregister_target(&error_target); +} + +EXPORT_SYMBOL(dm_register_target); +EXPORT_SYMBOL(dm_unregister_target); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c new file mode 100644 index 00000000..737d3886 --- /dev/null +++ b/drivers/md/dm-thin-metadata.c @@ -0,0 +1,1409 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. 
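The dm-target.c registry above is a plain name-to-target_type lookup list guarded by a reader/writer semaphore, with module refcounting and request_module() on a miss. A miniature user-space version of just the list-plus-rwlock part (no module handling), assuming POSIX threads:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct target_type {
	const char *name;
	struct target_type *next;
};

static struct target_type *targets;
static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

/* Caller must hold the lock in either mode. */
static struct target_type *find_type(const char *name)
{
	struct target_type *tt;

	for (tt = targets; tt; tt = tt->next)
		if (!strcmp(tt->name, name))
			return tt;
	return NULL;
}

static int register_target(struct target_type *tt)
{
	int rv = 0;

	pthread_rwlock_wrlock(&lock);
	if (find_type(tt->name))
		rv = -1;		/* -EEXIST in the driver */
	else {
		tt->next = targets;
		targets = tt;
	}
	pthread_rwlock_unlock(&lock);
	return rv;
}

static struct target_type *get_target_type(const char *name)
{
	struct target_type *tt;

	pthread_rwlock_rdlock(&lock);
	tt = find_type(name);
	pthread_rwlock_unlock(&lock);
	return tt;
}

int main(void)
{
	static struct target_type error_target = { .name = "error" };

	register_target(&error_target);
	printf("lookup 'error':  %s\n", get_target_type("error") ? "found" : "missing");
	printf("lookup 'linear': %s\n", get_target_type("linear") ? "found" : "missing");
	return 0;
}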
+ */ + +#include "dm-thin-metadata.h" +#include "persistent-data/dm-btree.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-disk.h" +#include "persistent-data/dm-transaction-manager.h" + +#include <linux/list.h> +#include <linux/device-mapper.h> +#include <linux/workqueue.h> + +/*-------------------------------------------------------------------------- + * As far as the metadata goes, there is: + * + * - A superblock in block zero, taking up fewer than 512 bytes for + * atomic writes. + * + * - A space map managing the metadata blocks. + * + * - A space map managing the data blocks. + * + * - A btree mapping our internal thin dev ids onto struct disk_device_details. + * + * - A hierarchical btree, with 2 levels which effectively maps (thin + * dev id, virtual block) -> block_time. Block time is a 64-bit + * field holding the time in the low 24 bits, and block in the top 48 + * bits. + * + * BTrees consist solely of btree_nodes, that fill a block. Some are + * internal nodes, as such their values are a __le64 pointing to other + * nodes. Leaf nodes can store data of any reasonable size (ie. much + * smaller than the block size). The nodes consist of the header, + * followed by an array of keys, followed by an array of values. We have + * to binary search on the keys so they're all held together to help the + * cpu cache. + * + * Space maps have 2 btrees: + * + * - One maps a uint64_t onto a struct index_entry. Which points to a + * bitmap block, and has some details about how many free entries there + * are etc. + * + * - The bitmap blocks have a header (for the checksum). Then the rest + * of the block is pairs of bits. With the meaning being: + * + * 0 - ref count is 0 + * 1 - ref count is 1 + * 2 - ref count is 2 + * 3 - ref count is higher than 2 + * + * - If the count is higher than 2 then the ref count is entered in a + * second btree that directly maps the block_address to a uint32_t ref + * count. + * + * The space map metadata variant doesn't have a bitmaps btree. Instead + * it has one single blocks worth of index_entries. This avoids + * recursive issues with the bitmap btree needing to allocate space in + * order to insert. With a small data block size such as 64k the + * metadata support data devices that are hundreds of terrabytes. + * + * The space maps allocate space linearly from front to back. Space that + * is freed in a transaction is never recycled within that transaction. + * To try and avoid fragmenting _free_ space the allocator always goes + * back and fills in gaps. + * + * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks + * from the block manager. + *--------------------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "thin metadata" + +#define THIN_SUPERBLOCK_MAGIC 27022010 +#define THIN_SUPERBLOCK_LOCATION 0 +#define THIN_VERSION 1 +#define THIN_METADATA_CACHE_SIZE 64 +#define SECTOR_TO_BLOCK_SHIFT 3 + +/* This should be plenty */ +#define SPACE_MAP_ROOT_SIZE 128 + +/* + * Little endian on-disk superblock and device details. + */ +struct thin_disk_superblock { + __le32 csum; /* Checksum of superblock except for this field. */ + __le32 flags; + __le64 blocknr; /* This block number, dm_block_t. */ + + __u8 uuid[16]; + __le64 magic; + __le32 version; + __le32 time; + + __le64 trans_id; + + /* + * Root held by userspace transactions. 
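The space-map bitmaps described above keep two bits per data block, with the value 3 meaning "the real count lives in the overflow btree". The packing can be illustrated on a single 64-bit word; the real bitmap blocks also carry a header and checksum, which this sketch ignores.

#include <stdint.h>
#include <stdio.h>

#define ENTRIES_PER_WORD 32	/* 64 bits / 2 bits per entry */

static unsigned get_count(uint64_t word, unsigned index)
{
	return (word >> (index * 2)) & 0x3;
}

static uint64_t set_count(uint64_t word, unsigned index, unsigned count)
{
	word &= ~((uint64_t)0x3 << (index * 2));
	return word | ((uint64_t)(count & 0x3) << (index * 2));
}

int main(void)
{
	uint64_t word = 0;

	word = set_count(word, 0, 1);	/* block 0 referenced once */
	word = set_count(word, 5, 3);	/* block 5: count > 2, see overflow tree */

	printf("block 0 -> %u\n", get_count(word, 0));
	printf("block 5 -> %u%s\n", get_count(word, 5),
	       get_count(word, 5) == 3 ? " (overflow btree holds the real count)" : "");
	return 0;
}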
+ */ + __le64 held_root; + + __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + /* + * 2-level btree mapping (dev_id, (dev block, time)) -> data block + */ + __le64 data_mapping_root; + + /* + * Device detail root mapping dev_id -> device_details + */ + __le64 device_details_root; + + __le32 data_block_size; /* In 512-byte sectors. */ + + __le32 metadata_block_size; /* In 512-byte sectors. */ + __le64 metadata_nr_blocks; + + __le32 compat_flags; + __le32 compat_ro_flags; + __le32 incompat_flags; +} __packed; + +struct disk_device_details { + __le64 mapped_blocks; + __le64 transaction_id; /* When created. */ + __le32 creation_time; + __le32 snapshotted_time; +} __packed; + +struct dm_pool_metadata { + struct hlist_node hash; + + struct block_device *bdev; + struct dm_block_manager *bm; + struct dm_space_map *metadata_sm; + struct dm_space_map *data_sm; + struct dm_transaction_manager *tm; + struct dm_transaction_manager *nb_tm; + + /* + * Two-level btree. + * First level holds thin_dev_t. + * Second level holds mappings. + */ + struct dm_btree_info info; + + /* + * Non-blocking version of the above. + */ + struct dm_btree_info nb_info; + + /* + * Just the top level for deleting whole devices. + */ + struct dm_btree_info tl_info; + + /* + * Just the bottom level for creating new devices. + */ + struct dm_btree_info bl_info; + + /* + * Describes the device details btree. + */ + struct dm_btree_info details_info; + + struct rw_semaphore root_lock; + uint32_t time; + int need_commit; + dm_block_t root; + dm_block_t details_root; + struct list_head thin_devices; + uint64_t trans_id; + unsigned long flags; + sector_t data_block_size; +}; + +struct dm_thin_device { + struct list_head list; + struct dm_pool_metadata *pmd; + dm_thin_id id; + + int open_count; + int changed; + uint64_t mapped_blocks; + uint64_t transaction_id; + uint32_t creation_time; + uint32_t snapshotted_time; +}; + +/*---------------------------------------------------------------- + * superblock validator + *--------------------------------------------------------------*/ + +#define SUPERBLOCK_CSUM_XOR 160774 + +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct thin_disk_superblock *disk_super = dm_block_data(b); + + disk_super->blocknr = cpu_to_le64(dm_block_location(b)); + disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); +} + +static int sb_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct thin_disk_superblock *disk_super = dm_block_data(b); + __le32 csum_le; + + if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { + DMERR("sb_check failed: blocknr %llu: " + "wanted %llu", le64_to_cpu(disk_super->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) { + DMERR("sb_check failed: magic %llu: " + "wanted %llu", le64_to_cpu(disk_super->magic), + (unsigned long long)THIN_SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); + if (csum_le != disk_super->csum) { + DMERR("sb_check failed: csum %u: wanted %u", + le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = 
sb_prepare_for_write, + .check = sb_check +}; + +/*---------------------------------------------------------------- + * Methods for the btree value types + *--------------------------------------------------------------*/ + +static uint64_t pack_block_time(dm_block_t b, uint32_t t) +{ + return (b << 24) | t; +} + +static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) +{ + *b = v >> 24; + *t = v & ((1 << 24) - 1); +} + +static void data_block_inc(void *context, void *value_le) +{ + struct dm_space_map *sm = context; + __le64 v_le; + uint64_t b; + uint32_t t; + + memcpy(&v_le, value_le, sizeof(v_le)); + unpack_block_time(le64_to_cpu(v_le), &b, &t); + dm_sm_inc_block(sm, b); +} + +static void data_block_dec(void *context, void *value_le) +{ + struct dm_space_map *sm = context; + __le64 v_le; + uint64_t b; + uint32_t t; + + memcpy(&v_le, value_le, sizeof(v_le)); + unpack_block_time(le64_to_cpu(v_le), &b, &t); + dm_sm_dec_block(sm, b); +} + +static int data_block_equal(void *context, void *value1_le, void *value2_le) +{ + __le64 v1_le, v2_le; + uint64_t b1, b2; + uint32_t t; + + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + unpack_block_time(le64_to_cpu(v1_le), &b1, &t); + unpack_block_time(le64_to_cpu(v2_le), &b2, &t); + + return b1 == b2; +} + +static void subtree_inc(void *context, void *value) +{ + struct dm_btree_info *info = context; + __le64 root_le; + uint64_t root; + + memcpy(&root_le, value, sizeof(root_le)); + root = le64_to_cpu(root_le); + dm_tm_inc(info->tm, root); +} + +static void subtree_dec(void *context, void *value) +{ + struct dm_btree_info *info = context; + __le64 root_le; + uint64_t root; + + memcpy(&root_le, value, sizeof(root_le)); + root = le64_to_cpu(root_le); + if (dm_btree_del(info, root)) + DMERR("btree delete failed\n"); +} + +static int subtree_equal(void *context, void *value1_le, void *value2_le) +{ + __le64 v1_le, v2_le; + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + + return v1_le == v2_le; +} + +/*----------------------------------------------------------------*/ + +static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) +{ + int r; + unsigned i; + struct dm_block *b; + __le64 *data_le, zero = cpu_to_le64(0); + unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64); + + /* + * We can't use a validator here - it may be all zeroes. 
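The pack_block_time()/unpack_block_time() pair above is the whole on-disk encoding of a mapping: the data block number in the upper bits and a 24-bit time stamp in the low bits. The same arithmetic, reproduced as a stand-alone program:

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_block_time(uint64_t block, uint32_t time)
{
	return (block << 24) | time;	/* time must fit in 24 bits */
}

static void unpack_block_time(uint64_t v, uint64_t *block, uint32_t *time)
{
	*block = v >> 24;
	*time = v & ((1 << 24) - 1);
}

int main(void)
{
	uint64_t v = pack_block_time(123456789ULL, 42);
	uint64_t block;
	uint32_t time;

	unpack_block_time(v, &block, &time);
	printf("block %llu, time %u\n", (unsigned long long)block, time);
	return 0;
}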
+ */ + r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b); + if (r) + return r; + + data_le = dm_block_data(b); + *result = 1; + for (i = 0; i < block_size; i++) { + if (data_le[i] != zero) { + *result = 0; + break; + } + } + + return dm_bm_unlock(b); +} + +static int init_pmd(struct dm_pool_metadata *pmd, + struct dm_block_manager *bm, + dm_block_t nr_blocks, int create) +{ + int r; + struct dm_space_map *sm, *data_sm; + struct dm_transaction_manager *tm; + struct dm_block *sblock; + + if (create) { + r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &tm, &sm, &sblock); + if (r < 0) { + DMERR("tm_create_with_sm failed"); + return r; + } + + data_sm = dm_sm_disk_create(tm, nr_blocks); + if (IS_ERR(data_sm)) { + DMERR("sm_disk_create failed"); + dm_tm_unlock(tm, sblock); + r = PTR_ERR(data_sm); + goto bad; + } + } else { + struct thin_disk_superblock *disk_super = NULL; + size_t space_map_root_offset = + offsetof(struct thin_disk_superblock, metadata_space_map_root); + + r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, space_map_root_offset, + SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock); + if (r < 0) { + DMERR("tm_open_with_sm failed"); + return r; + } + + disk_super = dm_block_data(sblock); + data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root, + sizeof(disk_super->data_space_map_root)); + if (IS_ERR(data_sm)) { + DMERR("sm_disk_open failed"); + r = PTR_ERR(data_sm); + goto bad; + } + } + + + r = dm_tm_unlock(tm, sblock); + if (r < 0) { + DMERR("couldn't unlock superblock"); + goto bad_data_sm; + } + + pmd->bm = bm; + pmd->metadata_sm = sm; + pmd->data_sm = data_sm; + pmd->tm = tm; + pmd->nb_tm = dm_tm_create_non_blocking_clone(tm); + if (!pmd->nb_tm) { + DMERR("could not create clone tm"); + r = -ENOMEM; + goto bad_data_sm; + } + + pmd->info.tm = tm; + pmd->info.levels = 2; + pmd->info.value_type.context = pmd->data_sm; + pmd->info.value_type.size = sizeof(__le64); + pmd->info.value_type.inc = data_block_inc; + pmd->info.value_type.dec = data_block_dec; + pmd->info.value_type.equal = data_block_equal; + + memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); + pmd->nb_info.tm = pmd->nb_tm; + + pmd->tl_info.tm = tm; + pmd->tl_info.levels = 1; + pmd->tl_info.value_type.context = &pmd->info; + pmd->tl_info.value_type.size = sizeof(__le64); + pmd->tl_info.value_type.inc = subtree_inc; + pmd->tl_info.value_type.dec = subtree_dec; + pmd->tl_info.value_type.equal = subtree_equal; + + pmd->bl_info.tm = tm; + pmd->bl_info.levels = 1; + pmd->bl_info.value_type.context = pmd->data_sm; + pmd->bl_info.value_type.size = sizeof(__le64); + pmd->bl_info.value_type.inc = data_block_inc; + pmd->bl_info.value_type.dec = data_block_dec; + pmd->bl_info.value_type.equal = data_block_equal; + + pmd->details_info.tm = tm; + pmd->details_info.levels = 1; + pmd->details_info.value_type.context = NULL; + pmd->details_info.value_type.size = sizeof(struct disk_device_details); + pmd->details_info.value_type.inc = NULL; + pmd->details_info.value_type.dec = NULL; + pmd->details_info.value_type.equal = NULL; + + pmd->root = 0; + + init_rwsem(&pmd->root_lock); + pmd->time = 0; + pmd->need_commit = 0; + pmd->details_root = 0; + pmd->trans_id = 0; + pmd->flags = 0; + INIT_LIST_HEAD(&pmd->thin_devices); + + return 0; + +bad_data_sm: + dm_sm_destroy(data_sm); +bad: + dm_tm_destroy(tm); + dm_sm_destroy(sm); + + return r; +} + +static int __begin_transaction(struct dm_pool_metadata *pmd) +{ + int r; + u32 features; + struct thin_disk_superblock *disk_super; + struct 
dm_block *sblock; + + /* + * __maybe_commit_transaction() resets these + */ + WARN_ON(pmd->need_commit); + + /* + * We re-read the superblock every time. Shouldn't need to do this + * really. + */ + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + pmd->time = le32_to_cpu(disk_super->time); + pmd->root = le64_to_cpu(disk_super->data_mapping_root); + pmd->details_root = le64_to_cpu(disk_super->device_details_root); + pmd->trans_id = le64_to_cpu(disk_super->trans_id); + pmd->flags = le32_to_cpu(disk_super->flags); + pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); + + features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; + if (features) { + DMERR("could not access metadata due to " + "unsupported optional features (%lx).", + (unsigned long)features); + r = -EINVAL; + goto out; + } + + /* + * Check for read-only metadata to skip the following RDWR checks. + */ + if (get_disk_ro(pmd->bdev->bd_disk)) + goto out; + + features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; + if (features) { + DMERR("could not access metadata RDWR due to " + "unsupported optional features (%lx).", + (unsigned long)features); + r = -EINVAL; + } + +out: + dm_bm_unlock(sblock); + return r; +} + +static int __write_changed_details(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_thin_device *td, *tmp; + struct disk_device_details details; + uint64_t key; + + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (!td->changed) + continue; + + key = td->id; + + details.mapped_blocks = cpu_to_le64(td->mapped_blocks); + details.transaction_id = cpu_to_le64(td->transaction_id); + details.creation_time = cpu_to_le32(td->creation_time); + details.snapshotted_time = cpu_to_le32(td->snapshotted_time); + __dm_bless_for_disk(&details); + + r = dm_btree_insert(&pmd->details_info, pmd->details_root, + &key, &details, &pmd->details_root); + if (r) + return r; + + if (td->open_count) + td->changed = 0; + else { + list_del(&td->list); + kfree(td); + } + + pmd->need_commit = 1; + } + + return 0; +} + +static int __commit_transaction(struct dm_pool_metadata *pmd) +{ + /* + * FIXME: Associated pool should be made read-only on failure. + */ + int r; + size_t metadata_len, data_len; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We need to know if the thin_disk_superblock exceeds a 512-byte sector. 
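The BUILD_BUG_ON() just below this comment enforces that the superblock fits in one 512-byte sector, which is what keeps superblock updates effectively atomic. A user-space equivalent using C11 _Static_assert, with the field layout abridged from the struct defined earlier in this file (fixed-width types stand in for the __le types):

#include <stdint.h>

struct demo_superblock {
	uint32_t csum;
	uint32_t flags;
	uint64_t blocknr;
	uint8_t  uuid[16];
	uint64_t magic;
	uint32_t version;
	uint32_t time;
	uint64_t trans_id;
	uint64_t held_root;
	uint8_t  data_space_map_root[128];
	uint8_t  metadata_space_map_root[128];
	uint64_t data_mapping_root;
	uint64_t device_details_root;
	uint32_t data_block_size;
	uint32_t metadata_block_size;
	uint64_t metadata_nr_blocks;
	uint32_t compat_flags;
	uint32_t compat_ro_flags;
	uint32_t incompat_flags;
} __attribute__((packed));

/* Refuse to compile if the superblock ever outgrows one sector. */
_Static_assert(sizeof(struct demo_superblock) <= 512,
	       "superblock must fit in one 512-byte sector");

int main(void) { return 0; }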
+ */ + BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + + r = __write_changed_details(pmd); + if (r < 0) + goto out; + + if (!pmd->need_commit) + goto out; + + r = dm_sm_commit(pmd->data_sm); + if (r < 0) + goto out; + + r = dm_tm_pre_commit(pmd->tm); + if (r < 0) + goto out; + + r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); + if (r < 0) + goto out; + + r = dm_sm_root_size(pmd->data_sm, &data_len); + if (r < 0) + goto out; + + r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + goto out; + + disk_super = dm_block_data(sblock); + disk_super->time = cpu_to_le32(pmd->time); + disk_super->data_mapping_root = cpu_to_le64(pmd->root); + disk_super->device_details_root = cpu_to_le64(pmd->details_root); + disk_super->trans_id = cpu_to_le64(pmd->trans_id); + disk_super->flags = cpu_to_le32(pmd->flags); + + r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, + metadata_len); + if (r < 0) + goto out_locked; + + r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, + data_len); + if (r < 0) + goto out_locked; + + r = dm_tm_commit(pmd->tm, sblock); + if (!r) + pmd->need_commit = 0; + +out: + return r; + +out_locked: + dm_bm_unlock(sblock); + return r; +} + +struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + sector_t data_block_size) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_pool_metadata *pmd; + sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + struct dm_block_manager *bm; + int create; + struct dm_block *sblock; + + pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); + if (!pmd) { + DMERR("could not allocate metadata struct"); + return ERR_PTR(-ENOMEM); + } + + /* + * Max hex locks: + * 3 for btree insert + + * 2 for btree lookup used within space map + */ + bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, + THIN_METADATA_CACHE_SIZE, 5); + if (!bm) { + DMERR("could not create block manager"); + kfree(pmd); + return ERR_PTR(-ENOMEM); + } + + r = superblock_all_zeroes(bm, &create); + if (r) { + dm_block_manager_destroy(bm); + kfree(pmd); + return ERR_PTR(r); + } + + + r = init_pmd(pmd, bm, 0, create); + if (r) { + dm_block_manager_destroy(bm); + kfree(pmd); + return ERR_PTR(r); + } + pmd->bdev = bdev; + + if (!create) { + r = __begin_transaction(pmd); + if (r < 0) + goto bad; + return pmd; + } + + /* + * Create. 
+ */ + r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + goto bad; + + if (bdev_size > THIN_METADATA_MAX_SECTORS) + bdev_size = THIN_METADATA_MAX_SECTORS; + + disk_super = dm_block_data(sblock); + disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); + disk_super->version = cpu_to_le32(THIN_VERSION); + disk_super->time = 0; + disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); + disk_super->data_block_size = cpu_to_le32(data_block_size); + + r = dm_bm_unlock(sblock); + if (r < 0) + goto bad; + + r = dm_btree_empty(&pmd->info, &pmd->root); + if (r < 0) + goto bad; + + r = dm_btree_empty(&pmd->details_info, &pmd->details_root); + if (r < 0) { + DMERR("couldn't create devices root"); + goto bad; + } + + pmd->flags = 0; + pmd->need_commit = 1; + r = dm_pool_commit_metadata(pmd); + if (r < 0) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + goto bad; + } + + return pmd; + +bad: + if (dm_pool_metadata_close(pmd) < 0) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + return ERR_PTR(r); +} + +int dm_pool_metadata_close(struct dm_pool_metadata *pmd) +{ + int r; + unsigned open_devices = 0; + struct dm_thin_device *td, *tmp; + + down_read(&pmd->root_lock); + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (td->open_count) + open_devices++; + else { + list_del(&td->list); + kfree(td); + } + } + up_read(&pmd->root_lock); + + if (open_devices) { + DMERR("attempt to close pmd when %u device(s) are still open", + open_devices); + return -EBUSY; + } + + r = __commit_transaction(pmd); + if (r < 0) + DMWARN("%s: __commit_transaction() failed, error = %d", + __func__, r); + + dm_tm_destroy(pmd->tm); + dm_tm_destroy(pmd->nb_tm); + dm_block_manager_destroy(pmd->bm); + dm_sm_destroy(pmd->metadata_sm); + dm_sm_destroy(pmd->data_sm); + kfree(pmd); + + return 0; +} + +/* + * __open_device: Returns @td corresponding to device with id @dev, + * creating it if @create is set and incrementing @td->open_count. + * On failure, @td is undefined. + */ +static int __open_device(struct dm_pool_metadata *pmd, + dm_thin_id dev, int create, + struct dm_thin_device **td) +{ + int r, changed = 0; + struct dm_thin_device *td2; + uint64_t key = dev; + struct disk_device_details details_le; + + /* + * If the device is already open, return it. + */ + list_for_each_entry(td2, &pmd->thin_devices, list) + if (td2->id == dev) { + /* + * May not create an already-open device. + */ + if (create) + return -EEXIST; + + td2->open_count++; + *td = td2; + return 0; + } + + /* + * Check the device exists. + */ + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &key, &details_le); + if (r) { + if (r != -ENODATA || !create) + return r; + + /* + * Create new device. 
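When the superblock is formatted above, the device size arrives in 512-byte sectors and is converted to metadata blocks by shifting right by SECTOR_TO_BLOCK_SHIFT (3, i.e. eight sectors, or 4KiB, per metadata block). A quick sanity check of the unit conversion, using an arbitrary 16 GiB metadata device:

#include <stdio.h>

#define SECTOR_SHIFT		9	/* 512-byte sectors */
#define SECTOR_TO_BLOCK_SHIFT	3	/* 8 sectors per 4KiB metadata block */

int main(void)
{
	unsigned long long bytes = 16ULL << 30;		/* 16 GiB, illustrative */
	unsigned long long sectors = bytes >> SECTOR_SHIFT;
	unsigned long long metadata_blocks = sectors >> SECTOR_TO_BLOCK_SHIFT;

	printf("%llu bytes = %llu sectors = %llu metadata blocks\n",
	       bytes, sectors, metadata_blocks);
	return 0;
}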
+ */ + changed = 1; + details_le.mapped_blocks = 0; + details_le.transaction_id = cpu_to_le64(pmd->trans_id); + details_le.creation_time = cpu_to_le32(pmd->time); + details_le.snapshotted_time = cpu_to_le32(pmd->time); + } + + *td = kmalloc(sizeof(**td), GFP_NOIO); + if (!*td) + return -ENOMEM; + + (*td)->pmd = pmd; + (*td)->id = dev; + (*td)->open_count = 1; + (*td)->changed = changed; + (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); + (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); + (*td)->creation_time = le32_to_cpu(details_le.creation_time); + (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time); + + list_add(&(*td)->list, &pmd->thin_devices); + + return 0; +} + +static void __close_device(struct dm_thin_device *td) +{ + --td->open_count; +} + +static int __create_thin(struct dm_pool_metadata *pmd, + dm_thin_id dev) +{ + int r; + dm_block_t dev_root; + uint64_t key = dev; + struct disk_device_details details_le; + struct dm_thin_device *td; + __le64 value; + + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &key, &details_le); + if (!r) + return -EEXIST; + + /* + * Create an empty btree for the mappings. + */ + r = dm_btree_empty(&pmd->bl_info, &dev_root); + if (r) + return r; + + /* + * Insert it into the main mapping tree. + */ + value = cpu_to_le64(dev_root); + __dm_bless_for_disk(&value); + r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); + if (r) { + dm_btree_del(&pmd->bl_info, dev_root); + return r; + } + + r = __open_device(pmd, dev, 1, &td); + if (r) { + dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + dm_btree_del(&pmd->bl_info, dev_root); + return r; + } + __close_device(td); + + return r; +} + +int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) +{ + int r; + + down_write(&pmd->root_lock); + r = __create_thin(pmd, dev); + up_write(&pmd->root_lock); + + return r; +} + +static int __set_snapshot_details(struct dm_pool_metadata *pmd, + struct dm_thin_device *snap, + dm_thin_id origin, uint32_t time) +{ + int r; + struct dm_thin_device *td; + + r = __open_device(pmd, origin, 0, &td); + if (r) + return r; + + td->changed = 1; + td->snapshotted_time = time; + + snap->mapped_blocks = td->mapped_blocks; + snap->snapshotted_time = time; + __close_device(td); + + return 0; +} + +static int __create_snap(struct dm_pool_metadata *pmd, + dm_thin_id dev, dm_thin_id origin) +{ + int r; + dm_block_t origin_root; + uint64_t key = origin, dev_key = dev; + struct dm_thin_device *td; + struct disk_device_details details_le; + __le64 value; + + /* check this device is unused */ + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &dev_key, &details_le); + if (!r) + return -EEXIST; + + /* find the mapping tree for the origin */ + r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); + if (r) + return r; + origin_root = le64_to_cpu(value); + + /* clone the origin, an inc will do */ + dm_tm_inc(pmd->tm, origin_root); + + /* insert into the main mapping tree */ + value = cpu_to_le64(origin_root); + __dm_bless_for_disk(&value); + key = dev; + r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); + if (r) { + dm_tm_dec(pmd->tm, origin_root); + return r; + } + + pmd->time++; + + r = __open_device(pmd, dev, 1, &td); + if (r) + goto bad; + + r = __set_snapshot_details(pmd, td, origin, pmd->time); + __close_device(td); + + if (r) + goto bad; + + return 0; + +bad: + dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + 
dm_btree_remove(&pmd->details_info, pmd->details_root, + &key, &pmd->details_root); + return r; +} + +int dm_pool_create_snap(struct dm_pool_metadata *pmd, + dm_thin_id dev, + dm_thin_id origin) +{ + int r; + + down_write(&pmd->root_lock); + r = __create_snap(pmd, dev, origin); + up_write(&pmd->root_lock); + + return r; +} + +static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) +{ + int r; + uint64_t key = dev; + struct dm_thin_device *td; + + /* TODO: failure should mark the transaction invalid */ + r = __open_device(pmd, dev, 0, &td); + if (r) + return r; + + if (td->open_count > 1) { + __close_device(td); + return -EBUSY; + } + + list_del(&td->list); + kfree(td); + r = dm_btree_remove(&pmd->details_info, pmd->details_root, + &key, &pmd->details_root); + if (r) + return r; + + r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + if (r) + return r; + + pmd->need_commit = 1; + + return 0; +} + +int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, + dm_thin_id dev) +{ + int r; + + down_write(&pmd->root_lock); + r = __delete_device(pmd, dev); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t current_id, + uint64_t new_id) +{ + down_write(&pmd->root_lock); + if (pmd->trans_id != current_id) { + up_write(&pmd->root_lock); + DMERR("mismatched transaction id"); + return -EINVAL; + } + + pmd->trans_id = new_id; + pmd->need_commit = 1; + up_write(&pmd->root_lock); + + return 0; +} + +int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t *result) +{ + down_read(&pmd->root_lock); + *result = pmd->trans_id; + up_read(&pmd->root_lock); + + return 0; +} + +static int __get_held_metadata_root(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + *result = le64_to_cpu(disk_super->held_root); + + return dm_bm_unlock(sblock); +} + +int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + + down_read(&pmd->root_lock); + r = __get_held_metadata_root(pmd, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, + struct dm_thin_device **td) +{ + int r; + + down_write(&pmd->root_lock); + r = __open_device(pmd, dev, 0, td); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_close_thin_device(struct dm_thin_device *td) +{ + down_write(&td->pmd->root_lock); + __close_device(td); + up_write(&td->pmd->root_lock); + + return 0; +} + +dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) +{ + return td->id; +} + +static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) +{ + return td->snapshotted_time > time; +} + +int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, + int can_block, struct dm_thin_lookup_result *result) +{ + int r; + uint64_t block_time = 0; + __le64 value; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + + if (can_block) { + down_read(&pmd->root_lock); + r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); + if (!r) + block_time = le64_to_cpu(value); + up_read(&pmd->root_lock); + + } else if (down_read_trylock(&pmd->root_lock)) { + r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value); + if (!r) + block_time = 
le64_to_cpu(value); + up_read(&pmd->root_lock); + + } else + return -EWOULDBLOCK; + + if (!r) { + dm_block_t exception_block; + uint32_t exception_time; + unpack_block_time(block_time, &exception_block, + &exception_time); + result->block = exception_block; + result->shared = __snapshotted_since(td, exception_time); + } + + return r; +} + +static int __insert(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block) +{ + int r, inserted; + __le64 value; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + + pmd->need_commit = 1; + value = cpu_to_le64(pack_block_time(data_block, pmd->time)); + __dm_bless_for_disk(&value); + + r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value, + &pmd->root, &inserted); + if (r) + return r; + + if (inserted) { + td->mapped_blocks++; + td->changed = 1; + } + + return 0; +} + +int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block) +{ + int r; + + down_write(&td->pmd->root_lock); + r = __insert(td, block, data_block); + up_write(&td->pmd->root_lock); + + return r; +} + +static int __remove(struct dm_thin_device *td, dm_block_t block) +{ + int r; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + + r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root); + if (r) + return r; + + td->mapped_blocks--; + td->changed = 1; + pmd->need_commit = 1; + + return 0; +} + +int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) +{ + int r; + + down_write(&td->pmd->root_lock); + r = __remove(td, block); + up_write(&td->pmd->root_lock); + + return r; +} + +int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r; + + down_write(&pmd->root_lock); + + r = dm_sm_new_block(pmd->data_sm, result); + pmd->need_commit = 1; + + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) +{ + int r; + + down_write(&pmd->root_lock); + + r = __commit_transaction(pmd); + if (r <= 0) + goto out; + + /* + * Open the next transaction. 
+ */ + r = __begin_transaction(pmd); +out: + up_write(&pmd->root_lock); + return r; +} + +int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r; + + down_read(&pmd->root_lock); + r = dm_sm_get_nr_free(pmd->data_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + + down_read(&pmd->root_lock); + r = dm_sm_get_nr_free(pmd->metadata_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + + down_read(&pmd->root_lock); + r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) +{ + down_read(&pmd->root_lock); + *result = pmd->data_block_size; + up_read(&pmd->root_lock); + + return 0; +} + +int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r; + + down_read(&pmd->root_lock); + r = dm_sm_get_nr_blocks(pmd->data_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) +{ + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + *result = td->mapped_blocks; + up_read(&pmd->root_lock); + + return 0; +} + +static int __highest_block(struct dm_thin_device *td, dm_block_t *result) +{ + int r; + __le64 value_le; + dm_block_t thin_root; + struct dm_pool_metadata *pmd = td->pmd; + + r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le); + if (r) + return r; + + thin_root = le64_to_cpu(value_le); + + return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result); +} + +int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, + dm_block_t *result) +{ + int r; + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + r = __highest_block(td, result); + up_read(&pmd->root_lock); + + return r; +} + +static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +{ + int r; + dm_block_t old_count; + + r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count); + if (r) + return r; + + if (new_count == old_count) + return 0; + + if (new_count < old_count) { + DMERR("cannot reduce size of data device"); + return -EINVAL; + } + + r = dm_sm_extend(pmd->data_sm, new_count - old_count); + if (!r) + pmd->need_commit = 1; + + return r; +} + +int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +{ + int r; + + down_write(&pmd->root_lock); + r = __resize_data_dev(pmd, new_count); + up_write(&pmd->root_lock); + + return r; +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h new file mode 100644 index 00000000..ed4725e6 --- /dev/null +++ b/drivers/md/dm-thin-metadata.h @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2010-2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_THIN_METADATA_H +#define DM_THIN_METADATA_H + +#include "persistent-data/dm-block-manager.h" + +#define THIN_METADATA_BLOCK_SIZE 4096 + +/* + * The metadata device is currently limited in size. + * + * We have one block of index, which can hold 255 index entries. Each + * index entry contains allocation info about 16k metadata blocks. + */ +#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) + +/* + * A metadata device larger than 16GB triggers a warning. 
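+ *
+ * For reference, with 4KB metadata blocks the hard limit above
+ * works out to 255 * 16384 * 8 = 33423360 sectors, i.e. a shade
+ * under the 16GB warning threshold defined below.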
+ */ +#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) + +/*----------------------------------------------------------------*/ + +struct dm_pool_metadata; +struct dm_thin_device; + +/* + * Device identifier + */ +typedef uint64_t dm_thin_id; + +/* + * Reopens or creates a new, empty metadata volume. + */ +struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + sector_t data_block_size); + +int dm_pool_metadata_close(struct dm_pool_metadata *pmd); + +/* + * Compat feature flags. Any incompat flags beyond the ones + * specified below will prevent use of the thin metadata. + */ +#define THIN_FEATURE_COMPAT_SUPP 0UL +#define THIN_FEATURE_COMPAT_RO_SUPP 0UL +#define THIN_FEATURE_INCOMPAT_SUPP 0UL + +/* + * Device creation/deletion. + */ +int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev); + +/* + * An internal snapshot. + * + * You can only snapshot a quiesced origin i.e. one that is either + * suspended or not instanced at all. + */ +int dm_pool_create_snap(struct dm_pool_metadata *pmd, dm_thin_id dev, + dm_thin_id origin); + +/* + * Deletes a virtual device from the metadata. It _is_ safe to call this + * when that device is open. Operations on that device will just start + * failing. You still need to call close() on the device. + */ +int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, + dm_thin_id dev); + +/* + * Commits _all_ metadata changes: device creation, deletion, mapping + * updates. + */ +int dm_pool_commit_metadata(struct dm_pool_metadata *pmd); + +/* + * Set/get userspace transaction id. + */ +int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t current_id, + uint64_t new_id); + +int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t *result); + +/* + * Hold/get root for userspace transaction. + */ +int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd); + +int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, + dm_block_t *result); + +/* + * Actions on a single virtual device. + */ + +/* + * Opening the same device more than once will fail with -EBUSY. + */ +int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, + struct dm_thin_device **td); + +int dm_pool_close_thin_device(struct dm_thin_device *td); + +dm_thin_id dm_thin_dev_id(struct dm_thin_device *td); + +struct dm_thin_lookup_result { + dm_block_t block; + int shared; +}; + +/* + * Returns: + * -EWOULDBLOCK iff @can_block is set and would block. + * -ENODATA iff that mapping is not present. + * 0 success + */ +int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, + int can_block, struct dm_thin_lookup_result *result); + +/* + * Obtain an unused block. + */ +int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result); + +/* + * Insert or remove block. + */ +int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block); + +int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); + +/* + * Queries. 
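+ *
+ * Illustrative example (error handling elided): the number of
+ * allocated data blocks can be derived as
+ *
+ *    dm_block_t nr_free, nr_total;
+ *    dm_pool_get_free_block_count(pmd, &nr_free);
+ *    dm_pool_get_data_dev_size(pmd, &nr_total);
+ *    allocated = nr_total - nr_free;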
+ */ +int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, + dm_block_t *highest_mapped); + +int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result); + +int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result); + +int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); + +/* + * Returns -ENOSPC if the new size is too small and already allocated + * blocks would be lost. + */ +int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c new file mode 100644 index 00000000..eb3d138f --- /dev/null +++ b/drivers/md/dm-thin.c @@ -0,0 +1,2774 @@ +/* + * Copyright (C) 2011 Red Hat UK. + * + * This file is released under the GPL. + */ + +#include "dm-thin-metadata.h" + +#include <linux/device-mapper.h> +#include <linux/dm-io.h> +#include <linux/dm-kcopyd.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> + +#define DM_MSG_PREFIX "thin" + +/* + * Tunable constants + */ +#define ENDIO_HOOK_POOL_SIZE 10240 +#define DEFERRED_SET_SIZE 64 +#define MAPPING_POOL_SIZE 1024 +#define PRISON_CELLS 1024 +#define COMMIT_PERIOD HZ + +/* + * The block size of the device holding pool data must be + * between 64KB and 1GB. + */ +#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT) +#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) + +/* + * Device id is restricted to 24 bits. + */ +#define MAX_DEV_ID ((1 << 24) - 1) + +/* + * How do we handle breaking sharing of data blocks? + * ================================================= + * + * We use a standard copy-on-write btree to store the mappings for the + * devices (note I'm talking about copy-on-write of the metadata here, not + * the data). When you take an internal snapshot you clone the root node + * of the origin btree. After this there is no concept of an origin or a + * snapshot. They are just two device trees that happen to point to the + * same data blocks. + * + * When we get a write in we decide if it's to a shared data block using + * some timestamp magic. If it is, we have to break sharing. + * + * Let's say we write to a shared block in what was the origin. The + * steps are: + * + * i) plug io further to this physical block. (see bio_prison code). + * + * ii) quiesce any read io to that shared data block. Obviously + * including all devices that share this block. (see deferred_set code) + * + * iii) copy the data block to a newly allocate block. This step can be + * missed out if the io covers the block. (schedule_copy). + * + * iv) insert the new mapping into the origin's btree + * (process_prepared_mapping). This act of inserting breaks some + * sharing of btree nodes between the two devices. Breaking sharing only + * effects the btree of that specific device. Btrees for the other + * devices that share the block never change. The btree for the origin + * device as it was after the last commit is untouched, ie. we're using + * persistent data structures in the functional programming sense. 
+ * + * v) unplug io to this physical block, including the io that triggered + * the breaking of sharing. + * + * Steps (ii) and (iii) occur in parallel. + * + * The metadata _doesn't_ need to be committed before the io continues. We + * get away with this because the io is always written to a _new_ block. + * If there's a crash, then: + * + * - The origin mapping will point to the old origin block (the shared + * one). This will contain the data as it was before the io that triggered + * the breaking of sharing came in. + * + * - The snap mapping still points to the old block. As it would after + * the commit. + * + * The downside of this scheme is the timestamp magic isn't perfect, and + * will continue to think that data block in the snapshot device is shared + * even after the write to the origin has broken sharing. I suspect data + * blocks will typically be shared by many different devices, so we're + * breaking sharing n + 1 times, rather than n, where n is the number of + * devices that reference this data block. At the moment I think the + * benefits far, far outweigh the disadvantages. + */ + +/*----------------------------------------------------------------*/ + +/* + * Sometimes we can't deal with a bio straight away. We put them in prison + * where they can't cause any mischief. Bios are put in a cell identified + * by a key, multiple bios can be in the same cell. When the cell is + * subsequently unlocked the bios become available. + */ +struct bio_prison; + +struct cell_key { + int virtual; + dm_thin_id dev; + dm_block_t block; +}; + +struct cell { + struct hlist_node list; + struct bio_prison *prison; + struct cell_key key; + struct bio *holder; + struct bio_list bios; +}; + +struct bio_prison { + spinlock_t lock; + mempool_t *cell_pool; + + unsigned nr_buckets; + unsigned hash_mask; + struct hlist_head *cells; +}; + +static uint32_t calc_nr_buckets(unsigned nr_cells) +{ + uint32_t n = 128; + + nr_cells /= 4; + nr_cells = min(nr_cells, 8192u); + + while (n < nr_cells) + n <<= 1; + + return n; +} + +/* + * @nr_cells should be the number of cells you want in use _concurrently_. + * Don't confuse it with the number of distinct keys. 
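+ *
+ * calc_nr_buckets() sizes the hash table at roughly a quarter of
+ * nr_cells, rounded up to a power of two (minimum 128, capped at
+ * 8192); e.g. the pool's PRISON_CELLS of 1024 gives 256 buckets.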
+ */ +static struct bio_prison *prison_create(unsigned nr_cells) +{ + unsigned i; + uint32_t nr_buckets = calc_nr_buckets(nr_cells); + size_t len = sizeof(struct bio_prison) + + (sizeof(struct hlist_head) * nr_buckets); + struct bio_prison *prison = kmalloc(len, GFP_KERNEL); + + if (!prison) + return NULL; + + spin_lock_init(&prison->lock); + prison->cell_pool = mempool_create_kmalloc_pool(nr_cells, + sizeof(struct cell)); + if (!prison->cell_pool) { + kfree(prison); + return NULL; + } + + prison->nr_buckets = nr_buckets; + prison->hash_mask = nr_buckets - 1; + prison->cells = (struct hlist_head *) (prison + 1); + for (i = 0; i < nr_buckets; i++) + INIT_HLIST_HEAD(prison->cells + i); + + return prison; +} + +static void prison_destroy(struct bio_prison *prison) +{ + mempool_destroy(prison->cell_pool); + kfree(prison); +} + +static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key) +{ + const unsigned long BIG_PRIME = 4294967291UL; + uint64_t hash = key->block * BIG_PRIME; + + return (uint32_t) (hash & prison->hash_mask); +} + +static int keys_equal(struct cell_key *lhs, struct cell_key *rhs) +{ + return (lhs->virtual == rhs->virtual) && + (lhs->dev == rhs->dev) && + (lhs->block == rhs->block); +} + +static struct cell *__search_bucket(struct hlist_head *bucket, + struct cell_key *key) +{ + struct cell *cell; + struct hlist_node *tmp; + + hlist_for_each_entry(cell, tmp, bucket, list) + if (keys_equal(&cell->key, key)) + return cell; + + return NULL; +} + +/* + * This may block if a new cell needs allocating. You must ensure that + * cells will be unlocked even if the calling thread is blocked. + * + * Returns 1 if the cell was already held, 0 if @inmate is the new holder. + */ +static int bio_detain(struct bio_prison *prison, struct cell_key *key, + struct bio *inmate, struct cell **ref) +{ + int r = 1; + unsigned long flags; + uint32_t hash = hash_key(prison, key); + struct cell *cell, *cell2; + + BUG_ON(hash > prison->nr_buckets); + + spin_lock_irqsave(&prison->lock, flags); + + cell = __search_bucket(prison->cells + hash, key); + if (cell) { + bio_list_add(&cell->bios, inmate); + goto out; + } + + /* + * Allocate a new cell + */ + spin_unlock_irqrestore(&prison->lock, flags); + cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); + spin_lock_irqsave(&prison->lock, flags); + + /* + * We've been unlocked, so we have to double check that + * nobody else has inserted this cell in the meantime. + */ + cell = __search_bucket(prison->cells + hash, key); + if (cell) { + mempool_free(cell2, prison->cell_pool); + bio_list_add(&cell->bios, inmate); + goto out; + } + + /* + * Use new cell. 
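+ * We dropped the lock to allocate and the re-check above found no
+ * existing cell, so install cell2 with the inmate as its holder.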
+ */ + cell = cell2; + + cell->prison = prison; + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = inmate; + bio_list_init(&cell->bios); + hlist_add_head(&cell->list, prison->cells + hash); + + r = 0; + +out: + spin_unlock_irqrestore(&prison->lock, flags); + + *ref = cell; + + return r; +} + +/* + * @inmates must have been initialised prior to this call + */ +static void __cell_release(struct cell *cell, struct bio_list *inmates) +{ + struct bio_prison *prison = cell->prison; + + hlist_del(&cell->list); + + if (inmates) { + bio_list_add(inmates, cell->holder); + bio_list_merge(inmates, &cell->bios); + } + + mempool_free(cell, prison->cell_pool); +} + +static void cell_release(struct cell *cell, struct bio_list *bios) +{ + unsigned long flags; + struct bio_prison *prison = cell->prison; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release(cell, bios); + spin_unlock_irqrestore(&prison->lock, flags); +} + +/* + * There are a couple of places where we put a bio into a cell briefly + * before taking it out again. In these situations we know that no other + * bio may be in the cell. This function releases the cell, and also does + * a sanity check. + */ +static void __cell_release_singleton(struct cell *cell, struct bio *bio) +{ + BUG_ON(cell->holder != bio); + BUG_ON(!bio_list_empty(&cell->bios)); + + __cell_release(cell, NULL); +} + +static void cell_release_singleton(struct cell *cell, struct bio *bio) +{ + unsigned long flags; + struct bio_prison *prison = cell->prison; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release_singleton(cell, bio); + spin_unlock_irqrestore(&prison->lock, flags); +} + +/* + * Sometimes we don't want the holder, just the additional bios. + */ +static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +{ + struct bio_prison *prison = cell->prison; + + hlist_del(&cell->list); + bio_list_merge(inmates, &cell->bios); + + mempool_free(cell, prison->cell_pool); +} + +static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +{ + unsigned long flags; + struct bio_prison *prison = cell->prison; + + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); +} + +static void cell_error(struct cell *cell) +{ + struct bio_prison *prison = cell->prison; + struct bio_list bios; + struct bio *bio; + unsigned long flags; + + bio_list_init(&bios); + + spin_lock_irqsave(&prison->lock, flags); + __cell_release(cell, &bios); + spin_unlock_irqrestore(&prison->lock, flags); + + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); +} + +/*----------------------------------------------------------------*/ + +/* + * We use the deferred set to keep track of pending reads to shared blocks. + * We do this to ensure the new mapping caused by a write isn't performed + * until these prior reads have completed. Otherwise the insertion of the + * new mapping could free the old block that the read bios are mapped to. 
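+ *
+ * The rough usage pattern is:
+ *
+ *    entry = ds_inc(ds);           a read to a shared block is issued
+ *    ds_add_work(ds, &work_item);  a new mapping queues behind those reads
+ *    ds_dec(entry, &sweep_list);   a read completes; any work that is now
+ *                                  unblocked is swept onto sweep_list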
+ */ + +struct deferred_set; +struct deferred_entry { + struct deferred_set *ds; + unsigned count; + struct list_head work_items; +}; + +struct deferred_set { + spinlock_t lock; + unsigned current_entry; + unsigned sweeper; + struct deferred_entry entries[DEFERRED_SET_SIZE]; +}; + +static void ds_init(struct deferred_set *ds) +{ + int i; + + spin_lock_init(&ds->lock); + ds->current_entry = 0; + ds->sweeper = 0; + for (i = 0; i < DEFERRED_SET_SIZE; i++) { + ds->entries[i].ds = ds; + ds->entries[i].count = 0; + INIT_LIST_HEAD(&ds->entries[i].work_items); + } +} + +static struct deferred_entry *ds_inc(struct deferred_set *ds) +{ + unsigned long flags; + struct deferred_entry *entry; + + spin_lock_irqsave(&ds->lock, flags); + entry = ds->entries + ds->current_entry; + entry->count++; + spin_unlock_irqrestore(&ds->lock, flags); + + return entry; +} + +static unsigned ds_next(unsigned index) +{ + return (index + 1) % DEFERRED_SET_SIZE; +} + +static void __sweep(struct deferred_set *ds, struct list_head *head) +{ + while ((ds->sweeper != ds->current_entry) && + !ds->entries[ds->sweeper].count) { + list_splice_init(&ds->entries[ds->sweeper].work_items, head); + ds->sweeper = ds_next(ds->sweeper); + } + + if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) + list_splice_init(&ds->entries[ds->sweeper].work_items, head); +} + +static void ds_dec(struct deferred_entry *entry, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&entry->ds->lock, flags); + BUG_ON(!entry->count); + --entry->count; + __sweep(entry->ds, head); + spin_unlock_irqrestore(&entry->ds->lock, flags); +} + +/* + * Returns 1 if deferred or 0 if no pending items to delay job. + */ +static int ds_add_work(struct deferred_set *ds, struct list_head *work) +{ + int r = 1; + unsigned long flags; + unsigned next_entry; + + spin_lock_irqsave(&ds->lock, flags); + if ((ds->sweeper == ds->current_entry) && + !ds->entries[ds->current_entry].count) + r = 0; + else { + list_add(work, &ds->entries[ds->current_entry].work_items); + next_entry = ds_next(ds->current_entry); + if (!ds->entries[next_entry].count) + ds->current_entry = next_entry; + } + spin_unlock_irqrestore(&ds->lock, flags); + + return r; +} + +/*----------------------------------------------------------------*/ + +/* + * Key building. + */ +static void build_data_key(struct dm_thin_device *td, + dm_block_t b, struct cell_key *key) +{ + key->virtual = 0; + key->dev = dm_thin_dev_id(td); + key->block = b; +} + +static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, + struct cell_key *key) +{ + key->virtual = 1; + key->dev = dm_thin_dev_id(td); + key->block = b; +} + +/*----------------------------------------------------------------*/ + +/* + * A pool device ties together a metadata device and a data device. It + * also provides the interface for creating and destroying internal + * devices. 
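+ *
+ * A pool target table line takes roughly this form (pool_ctr()
+ * further down does the authoritative parsing):
+ *
+ *    thin-pool <metadata dev> <data dev> <data block size (sectors)>
+ *              <low water mark (blocks)> [<#feature args> [<arg>]*]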
+ */ +struct new_mapping; + +struct pool_features { + unsigned zero_new_blocks:1; + unsigned discard_enabled:1; + unsigned discard_passdown:1; +}; + +struct pool { + struct list_head list; + struct dm_target *ti; /* Only set if a pool target is bound */ + + struct mapped_device *pool_md; + struct block_device *md_dev; + struct dm_pool_metadata *pmd; + + uint32_t sectors_per_block; + unsigned block_shift; + dm_block_t offset_mask; + dm_block_t low_water_blocks; + + struct pool_features pf; + unsigned low_water_triggered:1; /* A dm event has been sent */ + unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ + + struct bio_prison *prison; + struct dm_kcopyd_client *copier; + + struct workqueue_struct *wq; + struct work_struct worker; + struct delayed_work waker; + + unsigned ref_count; + unsigned long last_commit_jiffies; + + spinlock_t lock; + struct bio_list deferred_bios; + struct bio_list deferred_flush_bios; + struct list_head prepared_mappings; + struct list_head prepared_discards; + + struct bio_list retry_on_resume_list; + + struct deferred_set shared_read_ds; + struct deferred_set all_io_ds; + + struct new_mapping *next_mapping; + mempool_t *mapping_pool; + mempool_t *endio_hook_pool; +}; + +/* + * Target context for a pool. + */ +struct pool_c { + struct dm_target *ti; + struct pool *pool; + struct dm_dev *data_dev; + struct dm_dev *metadata_dev; + struct dm_target_callbacks callbacks; + + dm_block_t low_water_blocks; + struct pool_features pf; +}; + +/* + * Target context for a thin. + */ +struct thin_c { + struct dm_dev *pool_dev; + struct dm_dev *origin_dev; + dm_thin_id dev_id; + + struct pool *pool; + struct dm_thin_device *td; +}; + +/*----------------------------------------------------------------*/ + +/* + * A global list of pools that uses a struct mapped_device as a key. 
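+ * Protected by dm_thin_pool_table.mutex.  Lookups are done either by
+ * the pool's mapped_device or by its metadata block_device.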
+ */ +static struct dm_thin_pool_table { + struct mutex mutex; + struct list_head pools; +} dm_thin_pool_table; + +static void pool_table_init(void) +{ + mutex_init(&dm_thin_pool_table.mutex); + INIT_LIST_HEAD(&dm_thin_pool_table.pools); +} + +static void __pool_table_insert(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + list_add(&pool->list, &dm_thin_pool_table.pools); +} + +static void __pool_table_remove(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + list_del(&pool->list); +} + +static struct pool *__pool_table_lookup(struct mapped_device *md) +{ + struct pool *pool = NULL, *tmp; + + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + + list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { + if (tmp->pool_md == md) { + pool = tmp; + break; + } + } + + return pool; +} + +static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) +{ + struct pool *pool = NULL, *tmp; + + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + + list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { + if (tmp->md_dev == md_dev) { + pool = tmp; + break; + } + } + + return pool; +} + +/*----------------------------------------------------------------*/ + +struct endio_hook { + struct thin_c *tc; + struct deferred_entry *shared_read_entry; + struct deferred_entry *all_io_entry; + struct new_mapping *overwrite_mapping; +}; + +static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + bio_list_merge(&bios, master); + bio_list_init(master); + + while ((bio = bio_list_pop(&bios))) { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + if (h->tc == tc) + bio_endio(bio, DM_ENDIO_REQUEUE); + else + bio_list_add(master, bio); + } +} + +static void requeue_io(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + __requeue_bio_list(tc, &pool->deferred_bios); + __requeue_bio_list(tc, &pool->retry_on_resume_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +/* + * This section of code contains the logic for processing a thin device's IO. + * Much of the code depends on pool object resources (lists, workqueues, etc) + * but most is exclusively called from the thin target rather than the thin-pool + * target. + */ + +static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) +{ + return bio->bi_sector >> tc->pool->block_shift; +} + +static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) +{ + struct pool *pool = tc->pool; + + bio->bi_bdev = tc->pool_dev->bdev; + bio->bi_sector = (block << pool->block_shift) + + (bio->bi_sector & pool->offset_mask); +} + +static void remap_to_origin(struct thin_c *tc, struct bio *bio) +{ + bio->bi_bdev = tc->origin_dev->bdev; +} + +static void issue(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + /* + * Batch together any FUA/FLUSH bios we find and then issue + * a single commit for them in process_deferred_bios(). 
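+ * A FLUSH/FUA bio implies that everything written before it must be
+ * durable, which for a thin device includes the mappings themselves,
+ * hence the metadata commit before these bios are released.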
+ */ + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { + spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->deferred_flush_bios, bio); + spin_unlock_irqrestore(&pool->lock, flags); + } else + generic_make_request(bio); +} + +static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) +{ + remap_to_origin(tc, bio); + issue(tc, bio); +} + +static void remap_and_issue(struct thin_c *tc, struct bio *bio, + dm_block_t block) +{ + remap(tc, bio, block); + issue(tc, bio); +} + +/* + * wake_worker() is used when new work is queued and when pool_resume is + * ready to continue deferred IO processing. + */ +static void wake_worker(struct pool *pool) +{ + queue_work(pool->wq, &pool->worker); +} + +/*----------------------------------------------------------------*/ + +/* + * Bio endio functions. + */ +struct new_mapping { + struct list_head list; + + unsigned quiesced:1; + unsigned prepared:1; + unsigned pass_discard:1; + + struct thin_c *tc; + dm_block_t virt_block; + dm_block_t data_block; + struct cell *cell, *cell2; + int err; + + /* + * If the bio covers the whole area of a block then we can avoid + * zeroing or copying. Instead this bio is hooked. The bio will + * still be in the cell, so care has to be taken to avoid issuing + * the bio twice. + */ + struct bio *bio; + bio_end_io_t *saved_bi_end_io; +}; + +static void __maybe_add_mapping(struct new_mapping *m) +{ + struct pool *pool = m->tc->pool; + + if (m->quiesced && m->prepared) { + list_add(&m->list, &pool->prepared_mappings); + wake_worker(pool); + } +} + +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + unsigned long flags; + struct new_mapping *m = context; + struct pool *pool = m->tc->pool; + + m->err = read_err || write_err ? -EIO : 0; + + spin_lock_irqsave(&pool->lock, flags); + m->prepared = 1; + __maybe_add_mapping(m); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static void overwrite_endio(struct bio *bio, int err) +{ + unsigned long flags; + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct new_mapping *m = h->overwrite_mapping; + struct pool *pool = m->tc->pool; + + m->err = err; + + spin_lock_irqsave(&pool->lock, flags); + m->prepared = 1; + __maybe_add_mapping(m); + spin_unlock_irqrestore(&pool->lock, flags); +} + +/*----------------------------------------------------------------*/ + +/* + * Workqueue. + */ + +/* + * Prepared mapping jobs. + */ + +/* + * This sends the bios in the cell back to the deferred_bios list. + */ +static void cell_defer(struct thin_c *tc, struct cell *cell, + dm_block_t data_block) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + cell_release(cell, &pool->deferred_bios); + spin_unlock_irqrestore(&tc->pool->lock, flags); + + wake_worker(pool); +} + +/* + * Same as cell_defer above, except it omits one particular detainee, + * a write bio that covers the block and has already been processed. 
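+ * That bio is the cell's holder, so everything else is released via
+ * cell_release_no_holder().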
+ */ +static void cell_defer_except(struct thin_c *tc, struct cell *cell) +{ + struct bio_list bios; + struct pool *pool = tc->pool; + unsigned long flags; + + bio_list_init(&bios); + + spin_lock_irqsave(&pool->lock, flags); + cell_release_no_holder(cell, &pool->deferred_bios); + spin_unlock_irqrestore(&pool->lock, flags); + + wake_worker(pool); +} + +static void process_prepared_mapping(struct new_mapping *m) +{ + struct thin_c *tc = m->tc; + struct bio *bio; + int r; + + bio = m->bio; + if (bio) + bio->bi_end_io = m->saved_bi_end_io; + + if (m->err) { + cell_error(m->cell); + return; + } + + /* + * Commit the prepared block into the mapping btree. + * Any I/O for this block arriving after this point will get + * remapped to it directly. + */ + r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); + if (r) { + DMERR("dm_thin_insert_block() failed"); + cell_error(m->cell); + return; + } + + /* + * Release any bios held while the block was being provisioned. + * If we are processing a write bio that completely covers the block, + * we already processed it so can ignore it now when processing + * the bios in the cell. + */ + if (bio) { + cell_defer_except(tc, m->cell); + bio_endio(bio, 0); + } else + cell_defer(tc, m->cell, m->data_block); + + list_del(&m->list); + mempool_free(m, tc->pool->mapping_pool); +} + +static void process_prepared_discard(struct new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + + r = dm_thin_remove_block(tc->td, m->virt_block); + if (r) + DMERR("dm_thin_remove_block() failed"); + + /* + * Pass the discard down to the underlying device? + */ + if (m->pass_discard) + remap_and_issue(tc, m->bio, m->data_block); + else + bio_endio(m->bio, 0); + + cell_defer_except(tc, m->cell); + cell_defer_except(tc, m->cell2); + mempool_free(m, tc->pool->mapping_pool); +} + +static void process_prepared(struct pool *pool, struct list_head *head, + void (*fn)(struct new_mapping *)) +{ + unsigned long flags; + struct list_head maps; + struct new_mapping *m, *tmp; + + INIT_LIST_HEAD(&maps); + spin_lock_irqsave(&pool->lock, flags); + list_splice_init(head, &maps); + spin_unlock_irqrestore(&pool->lock, flags); + + list_for_each_entry_safe(m, tmp, &maps, list) + fn(m); +} + +/* + * Deferred bio jobs. + */ +static int io_overlaps_block(struct pool *pool, struct bio *bio) +{ + return !(bio->bi_sector & pool->offset_mask) && + (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); + +} + +static int io_overwrites_block(struct pool *pool, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + io_overlaps_block(pool, bio); +} + +static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, + bio_end_io_t *fn) +{ + *save = bio->bi_end_io; + bio->bi_end_io = fn; +} + +static int ensure_next_mapping(struct pool *pool) +{ + if (pool->next_mapping) + return 0; + + pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); + + return pool->next_mapping ? 
0 : -ENOMEM; +} + +static struct new_mapping *get_next_mapping(struct pool *pool) +{ + struct new_mapping *r = pool->next_mapping; + + BUG_ON(!pool->next_mapping); + + pool->next_mapping = NULL; + + return r; +} + +static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, + struct dm_dev *origin, dm_block_t data_origin, + dm_block_t data_dest, + struct cell *cell, struct bio *bio) +{ + int r; + struct pool *pool = tc->pool; + struct new_mapping *m = get_next_mapping(pool); + + INIT_LIST_HEAD(&m->list); + m->quiesced = 0; + m->prepared = 0; + m->tc = tc; + m->virt_block = virt_block; + m->data_block = data_dest; + m->cell = cell; + m->err = 0; + m->bio = NULL; + + if (!ds_add_work(&pool->shared_read_ds, &m->list)) + m->quiesced = 1; + + /* + * IO to pool_dev remaps to the pool target's data_dev. + * + * If the whole block of data is being overwritten, we can issue the + * bio immediately. Otherwise we use kcopyd to clone the data first. + */ + if (io_overwrites_block(pool, bio)) { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + remap_and_issue(tc, bio, data_dest); + } else { + struct dm_io_region from, to; + + from.bdev = origin->bdev; + from.sector = data_origin * pool->sectors_per_block; + from.count = pool->sectors_per_block; + + to.bdev = tc->pool_dev->bdev; + to.sector = data_dest * pool->sectors_per_block; + to.count = pool->sectors_per_block; + + r = dm_kcopyd_copy(pool->copier, &from, 1, &to, + 0, copy_complete, m); + if (r < 0) { + mempool_free(m, pool->mapping_pool); + DMERR("dm_kcopyd_copy() failed"); + cell_error(cell); + } + } +} + +static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_origin, dm_block_t data_dest, + struct cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->pool_dev, + data_origin, data_dest, cell, bio); +} + +static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_dest, + struct cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio); +} + +static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_block, struct cell *cell, + struct bio *bio) +{ + struct pool *pool = tc->pool; + struct new_mapping *m = get_next_mapping(pool); + + INIT_LIST_HEAD(&m->list); + m->quiesced = 1; + m->prepared = 0; + m->tc = tc; + m->virt_block = virt_block; + m->data_block = data_block; + m->cell = cell; + m->err = 0; + m->bio = NULL; + + /* + * If the whole block of data is being overwritten or we are not + * zeroing pre-existing data, we can issue the bio immediately. + * Otherwise we use kcopyd to zero the data first. 
+ */ + if (!pool->pf.zero_new_blocks) + process_prepared_mapping(m); + + else if (io_overwrites_block(pool, bio)) { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + remap_and_issue(tc, bio, data_block); + + } else { + int r; + struct dm_io_region to; + + to.bdev = tc->pool_dev->bdev; + to.sector = data_block * pool->sectors_per_block; + to.count = pool->sectors_per_block; + + r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); + if (r < 0) { + mempool_free(m, pool->mapping_pool); + DMERR("dm_kcopyd_zero() failed"); + cell_error(cell); + } + } +} + +static int alloc_data_block(struct thin_c *tc, dm_block_t *result) +{ + int r; + dm_block_t free_blocks; + unsigned long flags; + struct pool *pool = tc->pool; + + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); + if (r) + return r; + + if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { + DMWARN("%s: reached low water mark, sending event.", + dm_device_name(pool->pool_md)); + spin_lock_irqsave(&pool->lock, flags); + pool->low_water_triggered = 1; + spin_unlock_irqrestore(&pool->lock, flags); + dm_table_event(pool->ti->table); + } + + if (!free_blocks) { + if (pool->no_free_space) + return -ENOSPC; + else { + /* + * Try to commit to see if that will free up some + * more space. + */ + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + return r; + } + + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); + if (r) + return r; + + /* + * If we still have no space we set a flag to avoid + * doing all this checking and return -ENOSPC. + */ + if (!free_blocks) { + DMWARN("%s: no free space available.", + dm_device_name(pool->pool_md)); + spin_lock_irqsave(&pool->lock, flags); + pool->no_free_space = 1; + spin_unlock_irqrestore(&pool->lock, flags); + return -ENOSPC; + } + } + } + + r = dm_pool_alloc_data_block(pool->pmd, result); + if (r) + return r; + + return 0; +} + +/* + * If we have run out of space, queue bios until the device is + * resumed, presumably after having been reloaded with more space. + */ +static void retry_on_resume(struct bio *bio) +{ + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct thin_c *tc = h->tc; + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->retry_on_resume_list, bio); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static void no_space(struct cell *cell) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + cell_release(cell, &bios); + + while ((bio = bio_list_pop(&bios))) + retry_on_resume(bio); +} + +static void process_discard(struct thin_c *tc, struct bio *bio) +{ + int r; + unsigned long flags; + struct pool *pool = tc->pool; + struct cell *cell, *cell2; + struct cell_key key, key2; + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_lookup_result lookup_result; + struct new_mapping *m; + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool->prison, &key, bio, &cell)) + return; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + /* + * Check nobody is fiddling with this pool block. This can + * happen if someone's in the process of breaking sharing + * on this block. 
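+ * If the data block is already held, the discard is parked in that
+ * cell below and revisited once the cell is released.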
+ */ + build_data_key(tc->td, lookup_result.block, &key2); + if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { + cell_release_singleton(cell, bio); + break; + } + + if (io_overlaps_block(pool, bio)) { + /* + * IO may still be going to the destination block. We must + * quiesce before we can do the removal. + */ + m = get_next_mapping(pool); + m->tc = tc; + m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; + m->virt_block = block; + m->data_block = lookup_result.block; + m->cell = cell; + m->cell2 = cell2; + m->err = 0; + m->bio = bio; + + if (!ds_add_work(&pool->all_io_ds, &m->list)) { + spin_lock_irqsave(&pool->lock, flags); + list_add(&m->list, &pool->prepared_discards); + spin_unlock_irqrestore(&pool->lock, flags); + wake_worker(pool); + } + } else { + /* + * This path is hit if people are ignoring + * limits->discard_granularity. It ignores any + * part of the discard that is in a subsequent + * block. + */ + sector_t offset = bio->bi_sector - (block << pool->block_shift); + unsigned remaining = (pool->sectors_per_block - offset) << 9; + bio->bi_size = min(bio->bi_size, remaining); + + cell_release_singleton(cell, bio); + cell_release_singleton(cell2, bio); + remap_and_issue(tc, bio, lookup_result.block); + } + break; + + case -ENODATA: + /* + * It isn't provisioned, just forget it. + */ + cell_release_singleton(cell, bio); + bio_endio(bio, 0); + break; + + default: + DMERR("discard: find block unexpectedly returned %d", r); + cell_release_singleton(cell, bio); + bio_io_error(bio); + break; + } +} + +static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, + struct cell_key *key, + struct dm_thin_lookup_result *lookup_result, + struct cell *cell) +{ + int r; + dm_block_t data_block; + + r = alloc_data_block(tc, &data_block); + switch (r) { + case 0: + schedule_internal_copy(tc, block, lookup_result->block, + data_block, cell, bio); + break; + + case -ENOSPC: + no_space(cell); + break; + + default: + DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); + cell_error(cell); + break; + } +} + +static void process_shared_bio(struct thin_c *tc, struct bio *bio, + dm_block_t block, + struct dm_thin_lookup_result *lookup_result) +{ + struct cell *cell; + struct pool *pool = tc->pool; + struct cell_key key; + + /* + * If cell is already occupied, then sharing is already in the process + * of being broken so we have nothing further to do here. + */ + build_data_key(tc->td, lookup_result->block, &key); + if (bio_detain(pool->prison, &key, bio, &cell)) + return; + + if (bio_data_dir(bio) == WRITE) + break_sharing(tc, bio, block, &key, lookup_result, cell); + else { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + + h->shared_read_entry = ds_inc(&pool->shared_read_ds); + + cell_release_singleton(cell, bio); + remap_and_issue(tc, bio, lookup_result->block); + } +} + +static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, + struct cell *cell) +{ + int r; + dm_block_t data_block; + + /* + * Remap empty bios (flushes) immediately, without provisioning. + */ + if (!bio->bi_size) { + cell_release_singleton(cell, bio); + remap_and_issue(tc, bio, 0); + return; + } + + /* + * Fill read bios with zeroes and complete them immediately. 
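+ * An unprovisioned block has no backing data, so a read can be
+ * answered with zeroes without allocating anything.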
+ */ + if (bio_data_dir(bio) == READ) { + zero_fill_bio(bio); + cell_release_singleton(cell, bio); + bio_endio(bio, 0); + return; + } + + r = alloc_data_block(tc, &data_block); + switch (r) { + case 0: + if (tc->origin_dev) + schedule_external_copy(tc, block, data_block, cell, bio); + else + schedule_zero(tc, block, data_block, cell, bio); + break; + + case -ENOSPC: + no_space(cell); + break; + + default: + DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); + cell_error(cell); + break; + } +} + +static void process_bio(struct thin_c *tc, struct bio *bio) +{ + int r; + dm_block_t block = get_bio_block(tc, bio); + struct cell *cell; + struct cell_key key; + struct dm_thin_lookup_result lookup_result; + + /* + * If cell is already occupied, then the block is already + * being provisioned so we have nothing further to do here. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool->prison, &key, bio, &cell)) + return; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + /* + * We can release this cell now. This thread is the only + * one that puts bios into a cell, and we know there were + * no preceding bios. + */ + /* + * TODO: this will probably have to change when discard goes + * back in. + */ + cell_release_singleton(cell, bio); + + if (lookup_result.shared) + process_shared_bio(tc, bio, block, &lookup_result); + else + remap_and_issue(tc, bio, lookup_result.block); + break; + + case -ENODATA: + if (bio_data_dir(bio) == READ && tc->origin_dev) { + cell_release_singleton(cell, bio); + remap_to_origin_and_issue(tc, bio); + } else + provision_block(tc, bio, block, cell); + break; + + default: + DMERR("dm_thin_find_block() failed, error = %d", r); + cell_release_singleton(cell, bio); + bio_io_error(bio); + break; + } +} + +static int need_commit_due_to_time(struct pool *pool) +{ + return jiffies < pool->last_commit_jiffies || + jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; +} + +static void process_deferred_bios(struct pool *pool) +{ + unsigned long flags; + struct bio *bio; + struct bio_list bios; + int r; + + bio_list_init(&bios); + + spin_lock_irqsave(&pool->lock, flags); + bio_list_merge(&bios, &pool->deferred_bios); + bio_list_init(&pool->deferred_bios); + spin_unlock_irqrestore(&pool->lock, flags); + + while ((bio = bio_list_pop(&bios))) { + struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct thin_c *tc = h->tc; + + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (ensure_next_mapping(pool)) { + spin_lock_irqsave(&pool->lock, flags); + bio_list_merge(&pool->deferred_bios, &bios); + spin_unlock_irqrestore(&pool->lock, flags); + + break; + } + + if (bio->bi_rw & REQ_DISCARD) + process_discard(tc, bio); + else + process_bio(tc, bio); + } + + /* + * If there are any deferred flush bios, we must commit + * the metadata before issuing them. 
+ */ + bio_list_init(&bios); + spin_lock_irqsave(&pool->lock, flags); + bio_list_merge(&bios, &pool->deferred_flush_bios); + bio_list_init(&pool->deferred_flush_bios); + spin_unlock_irqrestore(&pool->lock, flags); + + if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) + return; + + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); + return; + } + pool->last_commit_jiffies = jiffies; + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); +} + +static void do_worker(struct work_struct *ws) +{ + struct pool *pool = container_of(ws, struct pool, worker); + + process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); + process_prepared(pool, &pool->prepared_discards, process_prepared_discard); + process_deferred_bios(pool); +} + +/* + * We want to commit periodically so that not too much + * unwritten data builds up. + */ +static void do_waker(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); + wake_worker(pool); + queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); +} + +/*----------------------------------------------------------------*/ + +/* + * Mapping functions. + */ + +/* + * Called only while mapping a thin bio to hand it over to the workqueue. + */ +static void thin_defer_bio(struct thin_c *tc, struct bio *bio) +{ + unsigned long flags; + struct pool *pool = tc->pool; + + spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->deferred_bios, bio); + spin_unlock_irqrestore(&pool->lock, flags); + + wake_worker(pool); +} + +static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); + + h->tc = tc; + h->shared_read_entry = NULL; + h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); + h->overwrite_mapping = NULL; + + return h; +} + +/* + * Non-blocking function called from the thin target's map function. + */ +static int thin_bio_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + int r; + struct thin_c *tc = ti->private; + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_device *td = tc->td; + struct dm_thin_lookup_result result; + + map_context->ptr = thin_hook_bio(tc, bio); + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { + thin_defer_bio(tc, bio); + return DM_MAPIO_SUBMITTED; + } + + r = dm_thin_find_block(td, block, 0, &result); + + /* + * Note that we defer readahead too. + */ + switch (r) { + case 0: + if (unlikely(result.shared)) { + /* + * We have a race condition here between the + * result.shared value returned by the lookup and + * snapshot creation, which may cause new + * sharing. + * + * To avoid this always quiesce the origin before + * taking the snap. You want to do this anyway to + * ensure a consistent application view + * (i.e. lockfs). + * + * More distant ancestors are irrelevant. The + * shared flag will be set in their case. + */ + thin_defer_bio(tc, bio); + r = DM_MAPIO_SUBMITTED; + } else { + remap(tc, bio, result.block); + r = DM_MAPIO_REMAPPED; + } + break; + + case -ENODATA: + /* + * In future, the failed dm_thin_find_block above could + * provide the hint to load the metadata into cache. 
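+ * For now both this case and -EWOULDBLOCK simply defer the bio to
+ * the worker thread, which is allowed to block on the metadata.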
+ */ + case -EWOULDBLOCK: + thin_defer_bio(tc, bio); + r = DM_MAPIO_SUBMITTED; + break; + } + + return r; +} + +static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + int r; + unsigned long flags; + struct pool_c *pt = container_of(cb, struct pool_c, callbacks); + + spin_lock_irqsave(&pt->pool->lock, flags); + r = !bio_list_empty(&pt->pool->retry_on_resume_list); + spin_unlock_irqrestore(&pt->pool->lock, flags); + + if (!r) { + struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); + r = bdi_congested(&q->backing_dev_info, bdi_bits); + } + + return r; +} + +static void __requeue_bios(struct pool *pool) +{ + bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); + bio_list_init(&pool->retry_on_resume_list); +} + +/*---------------------------------------------------------------- + * Binding of control targets to a pool object + *--------------------------------------------------------------*/ +static int bind_control_target(struct pool *pool, struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + + pool->ti = ti; + pool->low_water_blocks = pt->low_water_blocks; + pool->pf = pt->pf; + + /* + * If discard_passdown was enabled verify that the data device + * supports discards. Disable discard_passdown if not; otherwise + * -EOPNOTSUPP will be returned. + */ + if (pt->pf.discard_passdown) { + struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); + if (!q || !blk_queue_discard(q)) { + char buf[BDEVNAME_SIZE]; + DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", + bdevname(pt->data_dev->bdev, buf)); + pool->pf.discard_passdown = 0; + } + } + + return 0; +} + +static void unbind_control_target(struct pool *pool, struct dm_target *ti) +{ + if (pool->ti == ti) + pool->ti = NULL; +} + +/*---------------------------------------------------------------- + * Pool creation + *--------------------------------------------------------------*/ +/* Initialize pool features. 
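+ * Defaults to zeroing new blocks and to discards enabled with
+ * passdown; the optional pool feature arguments can switch each of
+ * these off.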
*/ +static void pool_features_init(struct pool_features *pf) +{ + pf->zero_new_blocks = 1; + pf->discard_enabled = 1; + pf->discard_passdown = 1; +} + +static void __pool_destroy(struct pool *pool) +{ + __pool_table_remove(pool); + + if (dm_pool_metadata_close(pool->pmd) < 0) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + + prison_destroy(pool->prison); + dm_kcopyd_client_destroy(pool->copier); + + if (pool->wq) + destroy_workqueue(pool->wq); + + if (pool->next_mapping) + mempool_free(pool->next_mapping, pool->mapping_pool); + mempool_destroy(pool->mapping_pool); + mempool_destroy(pool->endio_hook_pool); + kfree(pool); +} + +static struct pool *pool_create(struct mapped_device *pool_md, + struct block_device *metadata_dev, + unsigned long block_size, char **error) +{ + int r; + void *err_p; + struct pool *pool; + struct dm_pool_metadata *pmd; + + pmd = dm_pool_metadata_open(metadata_dev, block_size); + if (IS_ERR(pmd)) { + *error = "Error creating metadata object"; + return (struct pool *)pmd; + } + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) { + *error = "Error allocating memory for pool"; + err_p = ERR_PTR(-ENOMEM); + goto bad_pool; + } + + pool->pmd = pmd; + pool->sectors_per_block = block_size; + pool->block_shift = ffs(block_size) - 1; + pool->offset_mask = block_size - 1; + pool->low_water_blocks = 0; + pool_features_init(&pool->pf); + pool->prison = prison_create(PRISON_CELLS); + if (!pool->prison) { + *error = "Error creating pool's bio prison"; + err_p = ERR_PTR(-ENOMEM); + goto bad_prison; + } + + pool->copier = dm_kcopyd_client_create(); + if (IS_ERR(pool->copier)) { + r = PTR_ERR(pool->copier); + *error = "Error creating pool's kcopyd client"; + err_p = ERR_PTR(r); + goto bad_kcopyd_client; + } + + /* + * Create singlethreaded workqueue that will service all devices + * that use this metadata. 
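+ * An ordered workqueue runs at most one work item at a time, so
+ * do_worker() and do_waker() never execute concurrently.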
+ */ + pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!pool->wq) { + *error = "Error creating pool's workqueue"; + err_p = ERR_PTR(-ENOMEM); + goto bad_wq; + } + + INIT_WORK(&pool->worker, do_worker); + INIT_DELAYED_WORK(&pool->waker, do_waker); + spin_lock_init(&pool->lock); + bio_list_init(&pool->deferred_bios); + bio_list_init(&pool->deferred_flush_bios); + INIT_LIST_HEAD(&pool->prepared_mappings); + INIT_LIST_HEAD(&pool->prepared_discards); + pool->low_water_triggered = 0; + pool->no_free_space = 0; + bio_list_init(&pool->retry_on_resume_list); + ds_init(&pool->shared_read_ds); + ds_init(&pool->all_io_ds); + + pool->next_mapping = NULL; + pool->mapping_pool = + mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping)); + if (!pool->mapping_pool) { + *error = "Error creating pool's mapping mempool"; + err_p = ERR_PTR(-ENOMEM); + goto bad_mapping_pool; + } + + pool->endio_hook_pool = + mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook)); + if (!pool->endio_hook_pool) { + *error = "Error creating pool's endio_hook mempool"; + err_p = ERR_PTR(-ENOMEM); + goto bad_endio_hook_pool; + } + pool->ref_count = 1; + pool->last_commit_jiffies = jiffies; + pool->pool_md = pool_md; + pool->md_dev = metadata_dev; + __pool_table_insert(pool); + + return pool; + +bad_endio_hook_pool: + mempool_destroy(pool->mapping_pool); +bad_mapping_pool: + destroy_workqueue(pool->wq); +bad_wq: + dm_kcopyd_client_destroy(pool->copier); +bad_kcopyd_client: + prison_destroy(pool->prison); +bad_prison: + kfree(pool); +bad_pool: + if (dm_pool_metadata_close(pmd)) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + + return err_p; +} + +static void __pool_inc(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + pool->ref_count++; +} + +static void __pool_dec(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + BUG_ON(!pool->ref_count); + if (!--pool->ref_count) + __pool_destroy(pool); +} + +static struct pool *__pool_find(struct mapped_device *pool_md, + struct block_device *metadata_dev, + unsigned long block_size, char **error, + int *created) +{ + struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); + + if (pool) { + if (pool->pool_md != pool_md) + return ERR_PTR(-EBUSY); + __pool_inc(pool); + + } else { + pool = __pool_table_lookup(pool_md); + if (pool) { + if (pool->md_dev != metadata_dev) + return ERR_PTR(-EINVAL); + __pool_inc(pool); + + } else { + pool = pool_create(pool_md, metadata_dev, block_size, error); + *created = 1; + } + } + + return pool; +} + +/*---------------------------------------------------------------- + * Pool target methods + *--------------------------------------------------------------*/ +static void pool_dtr(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + + mutex_lock(&dm_thin_pool_table.mutex); + + unbind_control_target(pt->pool, ti); + __pool_dec(pt->pool); + dm_put_device(ti, pt->metadata_dev); + dm_put_device(ti, pt->data_dev); + kfree(pt); + + mutex_unlock(&dm_thin_pool_table.mutex); +} + +static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, + struct dm_target *ti) +{ + int r; + unsigned argc; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, 3, "Invalid number of pool feature arguments"}, + }; + + /* + * No feature arguments supplied. 
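+ * The feature block is optional; leaving it out keeps the defaults set
+ * up by pool_features_init().  When present it is a count followed by
+ * that many feature words, e.g. "2 ignore_discard skip_block_zeroing".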
+ */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + while (argc && !r) { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "skip_block_zeroing")) { + pf->zero_new_blocks = 0; + continue; + } else if (!strcasecmp(arg_name, "ignore_discard")) { + pf->discard_enabled = 0; + continue; + } else if (!strcasecmp(arg_name, "no_discard_passdown")) { + pf->discard_passdown = 0; + continue; + } + + ti->error = "Unrecognised pool feature requested"; + r = -EINVAL; + } + + return r; +} + +/* + * thin-pool <metadata dev> <data dev> + * <data block size (sectors)> + * <low water mark (blocks)> + * [<#feature args> [<arg>]*] + * + * Optional feature arguments are: + * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. + * ignore_discard: disable discard + * no_discard_passdown: don't pass discards down to the data device + */ +static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int r, pool_created = 0; + struct pool_c *pt; + struct pool *pool; + struct pool_features pf; + struct dm_arg_set as; + struct dm_dev *data_dev; + unsigned long block_size; + dm_block_t low_water_blocks; + struct dm_dev *metadata_dev; + sector_t metadata_dev_size; + char b[BDEVNAME_SIZE]; + + /* + * FIXME Remove validation from scope of lock. + */ + mutex_lock(&dm_thin_pool_table.mutex); + + if (argc < 4) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto out_unlock; + } + as.argc = argc; + as.argv = argv; + + r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); + if (r) { + ti->error = "Error opening metadata block device"; + goto out_unlock; + } + + metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", + bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); + + r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); + if (r) { + ti->error = "Error getting data device"; + goto out_metadata; + } + + if (kstrtoul(argv[2], 10, &block_size) || !block_size || + block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || + block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || + !is_power_of_2(block_size)) { + ti->error = "Invalid block size"; + r = -EINVAL; + goto out; + } + + if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { + ti->error = "Invalid low water mark"; + r = -EINVAL; + goto out; + } + + /* + * Set default pool features. + */ + pool_features_init(&pf); + + dm_consume_args(&as, 4); + r = parse_pool_features(&as, &pf, ti); + if (r) + goto out; + + pt = kzalloc(sizeof(*pt), GFP_KERNEL); + if (!pt) { + r = -ENOMEM; + goto out; + } + + pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, + block_size, &ti->error, &pool_created); + if (IS_ERR(pool)) { + r = PTR_ERR(pool); + goto out_free_pt; + } + + /* + * 'pool_created' reflects whether this is the first table load. + * Top level discard support is not allowed to be changed after + * initial load. This would require a pool reload to trigger thin + * device changes. 
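+ * (In practice this means toggling ignore_discard on a table reload
+ * is rejected with -EINVAL just below.)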
+ */ + if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { + ti->error = "Discard support cannot be disabled once enabled"; + r = -EINVAL; + goto out_flags_changed; + } + + pt->pool = pool; + pt->ti = ti; + pt->metadata_dev = metadata_dev; + pt->data_dev = data_dev; + pt->low_water_blocks = low_water_blocks; + pt->pf = pf; + ti->num_flush_requests = 1; + /* + * Only need to enable discards if the pool should pass + * them down to the data device. The thin device's discard + * processing will cause mappings to be removed from the btree. + */ + if (pf.discard_enabled && pf.discard_passdown) { + ti->num_discard_requests = 1; + /* + * Setting 'discards_supported' circumvents the normal + * stacking of discard limits (this keeps the pool and + * thin devices' discard limits consistent). + */ + ti->discards_supported = 1; + } + ti->private = pt; + + pt->callbacks.congested_fn = pool_is_congested; + dm_table_add_target_callbacks(ti->table, &pt->callbacks); + + mutex_unlock(&dm_thin_pool_table.mutex); + + return 0; + +out_flags_changed: + __pool_dec(pool); +out_free_pt: + kfree(pt); +out: + dm_put_device(ti, data_dev); +out_metadata: + dm_put_device(ti, metadata_dev); +out_unlock: + mutex_unlock(&dm_thin_pool_table.mutex); + + return r; +} + +static int pool_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + /* + * As this is a singleton target, ti->begin is always zero. + */ + spin_lock_irqsave(&pool->lock, flags); + bio->bi_bdev = pt->data_dev->bdev; + r = DM_MAPIO_REMAPPED; + spin_unlock_irqrestore(&pool->lock, flags); + + return r; +} + +/* + * Retrieves the number of blocks of the data device from + * the superblock and compares it to the actual device size, + * thus resizing the data device in case it has grown. + * + * This both copes with opening preallocated data devices in the ctr + * being followed by a resume + * -and- + * calling the resume method individually after userspace has + * grown the data device in reaction to a table event. + */ +static int pool_preresume(struct dm_target *ti) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + dm_block_t data_size, sb_data_size; + + /* + * Take control of the pool object. 
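+ * bind_control_target() copies the low water mark and feature flags
+ * from this pool_c into the pool, and may clear discard_passdown if
+ * the data device turns out not to support discards.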
+ */ + r = bind_control_target(pool, ti); + if (r) + return r; + + data_size = ti->len >> pool->block_shift; + r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); + if (r) { + DMERR("failed to retrieve data device size"); + return r; + } + + if (data_size < sb_data_size) { + DMERR("pool target too small, is %llu blocks (expected %llu)", + data_size, sb_data_size); + return -EINVAL; + + } else if (data_size > sb_data_size) { + r = dm_pool_resize_data_dev(pool->pmd, data_size); + if (r) { + DMERR("failed to resize data device"); + return r; + } + + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + return r; + } + } + + return 0; +} + +static void pool_resume(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + pool->low_water_triggered = 0; + pool->no_free_space = 0; + __requeue_bios(pool); + spin_unlock_irqrestore(&pool->lock, flags); + + do_waker(&pool->waker.work); +} + +static void pool_postsuspend(struct dm_target *ti) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + cancel_delayed_work(&pool->waker); + flush_workqueue(pool->wq); + + r = dm_pool_commit_metadata(pool->pmd); + if (r < 0) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/ + } +} + +static int check_arg_count(unsigned argc, unsigned args_required) +{ + if (argc != args_required) { + DMWARN("Message received with %u arguments instead of %u.", + argc, args_required); + return -EINVAL; + } + + return 0; +} + +static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) +{ + if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && + *dev_id <= MAX_DEV_ID) + return 0; + + if (warning) + DMWARN("Message received with invalid device id: %s", arg); + + return -EINVAL; +} + +static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + int r; + + r = check_arg_count(argc, 2); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = dm_pool_create_thin(pool->pmd, dev_id); + if (r) { + DMWARN("Creation of new thinly-provisioned device with id %s failed.", + argv[1]); + return r; + } + + return 0; +} + +static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + dm_thin_id origin_dev_id; + int r; + + r = check_arg_count(argc, 3); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = read_dev_id(argv[2], &origin_dev_id, 1); + if (r) + return r; + + r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); + if (r) { + DMWARN("Creation of new snapshot %s of device %s failed.", + argv[1], argv[2]); + return r; + } + + return 0; +} + +static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + int r; + + r = check_arg_count(argc, 2); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = dm_pool_delete_thin_device(pool->pmd, dev_id); + if (r) + DMWARN("Deletion of thin device %s failed.", argv[1]); + + return r; +} + +static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) +{ + dm_thin_id old_id, new_id; + int r; + + r = check_arg_count(argc, 3); + if (r) + return r; + + if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { + 
DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); + return -EINVAL; + } + + if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { + DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); + return -EINVAL; + } + + r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); + if (r) { + DMWARN("Failed to change transaction id from %s to %s.", + argv[1], argv[2]); + return r; + } + + return 0; +} + +/* + * Messages supported: + * create_thin <dev_id> + * create_snap <dev_id> <origin_id> + * delete <dev_id> + * trim <dev_id> <new_size_in_sectors> + * set_transaction_id <current_trans_id> <new_trans_id> + */ +static int pool_message(struct dm_target *ti, unsigned argc, char **argv) +{ + int r = -EINVAL; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + if (!strcasecmp(argv[0], "create_thin")) + r = process_create_thin_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "create_snap")) + r = process_create_snap_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "delete")) + r = process_delete_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "set_transaction_id")) + r = process_set_transaction_id_mesg(argc, argv, pool); + + else + DMWARN("Unrecognised thin pool target message received: %s", argv[0]); + + if (!r) { + r = dm_pool_commit_metadata(pool->pmd); + if (r) + DMERR("%s message: dm_pool_commit_metadata() failed, error = %d", + argv[0], r); + } + + return r; +} + +/* + * Status line is: + * <transaction id> <used metadata sectors>/<total metadata sectors> + * <used data sectors>/<total data sectors> <held metadata root> + */ +static int pool_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + int r, count; + unsigned sz = 0; + uint64_t transaction_id; + dm_block_t nr_free_blocks_data; + dm_block_t nr_free_blocks_metadata; + dm_block_t nr_blocks_data; + dm_block_t nr_blocks_metadata; + dm_block_t held_root; + char buf[BDEVNAME_SIZE]; + char buf2[BDEVNAME_SIZE]; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + switch (type) { + case STATUSTYPE_INFO: + r = dm_pool_get_metadata_transaction_id(pool->pmd, + &transaction_id); + if (r) + return r; + + r = dm_pool_get_free_metadata_block_count(pool->pmd, + &nr_free_blocks_metadata); + if (r) + return r; + + r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); + if (r) + return r; + + r = dm_pool_get_free_block_count(pool->pmd, + &nr_free_blocks_data); + if (r) + return r; + + r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); + if (r) + return r; + + r = dm_pool_get_held_metadata_root(pool->pmd, &held_root); + if (r) + return r; + + DMEMIT("%llu %llu/%llu %llu/%llu ", + (unsigned long long)transaction_id, + (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), + (unsigned long long)nr_blocks_metadata, + (unsigned long long)(nr_blocks_data - nr_free_blocks_data), + (unsigned long long)nr_blocks_data); + + if (held_root) + DMEMIT("%llu", held_root); + else + DMEMIT("-"); + + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %s %lu %llu ", + format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), + format_dev_t(buf2, pt->data_dev->bdev->bd_dev), + (unsigned long)pool->sectors_per_block, + (unsigned long long)pt->low_water_blocks); + + count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + + !pt->pf.discard_passdown; + DMEMIT("%u ", count); + + if (!pool->pf.zero_new_blocks) + DMEMIT("skip_block_zeroing "); + + if (!pool->pf.discard_enabled) + DMEMIT("ignore_discard "); 
+ + if (!pt->pf.discard_passdown) + DMEMIT("no_discard_passdown "); + + break; + } + + return 0; +} + +static int pool_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct pool_c *pt = ti->private; + + return fn(ti, pt->data_dev, 0, ti->len, data); +} + +static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct pool_c *pt = ti->private; + struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = pt->data_dev->bdev; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static void set_discard_limits(struct pool *pool, struct queue_limits *limits) +{ + /* + * FIXME: these limits may be incompatible with the pool's data device + */ + limits->max_discard_sectors = pool->sectors_per_block; + + /* + * This is just a hint, and not enforced. We have to cope with + * bios that overlap 2 blocks. + */ + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; + limits->discard_zeroes_data = pool->pf.zero_new_blocks; +} + +static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + blk_limits_io_min(limits, 0); + blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + if (pool->pf.discard_enabled) + set_discard_limits(pool, limits); +} + +static struct target_type pool_target = { + .name = "thin-pool", + .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | + DM_TARGET_IMMUTABLE, + .version = {1, 1, 0}, + .module = THIS_MODULE, + .ctr = pool_ctr, + .dtr = pool_dtr, + .map = pool_map, + .postsuspend = pool_postsuspend, + .preresume = pool_preresume, + .resume = pool_resume, + .message = pool_message, + .status = pool_status, + .merge = pool_merge, + .iterate_devices = pool_iterate_devices, + .io_hints = pool_io_hints, +}; + +/*---------------------------------------------------------------- + * Thin target methods + *--------------------------------------------------------------*/ +static void thin_dtr(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + mutex_lock(&dm_thin_pool_table.mutex); + + __pool_dec(tc->pool); + dm_pool_close_thin_device(tc->td); + dm_put_device(ti, tc->pool_dev); + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); + kfree(tc); + + mutex_unlock(&dm_thin_pool_table.mutex); +} + +/* + * Thin target parameters: + * + * <pool_dev> <dev_id> [origin_dev] + * + * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) + * dev_id: the internal device identifier + * origin_dev: a device external to the pool that should act as the origin + * + * If the pool device has discards disabled, they get disabled for the thin + * device as well. 
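+ *
+ * An illustrative table line (the pool path and device id are example
+ * values only; the id must already have been created via the pool's
+ * "create_thin" message):
+ *
+ *   0 2097152 thin /dev/mapper/pool 1
+ *
+ * which presents a 1 GiB (2097152 sector) thin device backed by
+ * internal device 1 of that pool.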
+ */ +static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int r; + struct thin_c *tc; + struct dm_dev *pool_dev, *origin_dev; + struct mapped_device *pool_md; + + mutex_lock(&dm_thin_pool_table.mutex); + + if (argc != 2 && argc != 3) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto out_unlock; + } + + tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); + if (!tc) { + ti->error = "Out of memory"; + r = -ENOMEM; + goto out_unlock; + } + + if (argc == 3) { + r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); + if (r) { + ti->error = "Error opening origin device"; + goto bad_origin_dev; + } + tc->origin_dev = origin_dev; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); + if (r) { + ti->error = "Error opening pool device"; + goto bad_pool_dev; + } + tc->pool_dev = pool_dev; + + if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { + ti->error = "Invalid device id"; + r = -EINVAL; + goto bad_common; + } + + pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); + if (!pool_md) { + ti->error = "Couldn't get pool mapped device"; + r = -EINVAL; + goto bad_common; + } + + tc->pool = __pool_table_lookup(pool_md); + if (!tc->pool) { + ti->error = "Couldn't find pool object"; + r = -EINVAL; + goto bad_pool_lookup; + } + __pool_inc(tc->pool); + + r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); + if (r) { + ti->error = "Couldn't open thin internal device"; + goto bad_thin_open; + } + + ti->split_io = tc->pool->sectors_per_block; + ti->num_flush_requests = 1; + + /* In case the pool supports discards, pass them on. */ + if (tc->pool->pf.discard_enabled) { + ti->discards_supported = 1; + ti->num_discard_requests = 1; + } + + dm_put(pool_md); + + mutex_unlock(&dm_thin_pool_table.mutex); + + return 0; + +bad_thin_open: + __pool_dec(tc->pool); +bad_pool_lookup: + dm_put(pool_md); +bad_common: + dm_put_device(ti, tc->pool_dev); +bad_pool_dev: + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); +bad_origin_dev: + kfree(tc); +out_unlock: + mutex_unlock(&dm_thin_pool_table.mutex); + + return r; +} + +static int thin_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + bio->bi_sector = dm_target_offset(ti, bio->bi_sector); + + return thin_bio_map(ti, bio, map_context); +} + +static int thin_endio(struct dm_target *ti, + struct bio *bio, int err, + union map_info *map_context) +{ + unsigned long flags; + struct endio_hook *h = map_context->ptr; + struct list_head work; + struct new_mapping *m, *tmp; + struct pool *pool = h->tc->pool; + + if (h->shared_read_entry) { + INIT_LIST_HEAD(&work); + ds_dec(h->shared_read_entry, &work); + + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry_safe(m, tmp, &work, list) { + list_del(&m->list); + m->quiesced = 1; + __maybe_add_mapping(m); + } + spin_unlock_irqrestore(&pool->lock, flags); + } + + if (h->all_io_entry) { + INIT_LIST_HEAD(&work); + ds_dec(h->all_io_entry, &work); + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry_safe(m, tmp, &work, list) + list_add(&m->list, &pool->prepared_discards); + spin_unlock_irqrestore(&pool->lock, flags); + } + + mempool_free(h, pool->endio_hook_pool); + + return 0; +} + +static void thin_postsuspend(struct dm_target *ti) +{ + if (dm_noflush_suspending(ti)) + requeue_io((struct thin_c *)ti->private); +} + +/* + * <nr mapped sectors> <highest mapped sector> + */ +static int thin_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + int r; + ssize_t sz = 0; 
+ dm_block_t mapped, highest; + char buf[BDEVNAME_SIZE]; + struct thin_c *tc = ti->private; + + if (!tc->td) + DMEMIT("-"); + else { + switch (type) { + case STATUSTYPE_INFO: + r = dm_thin_get_mapped_count(tc->td, &mapped); + if (r) + return r; + + r = dm_thin_get_highest_mapped_block(tc->td, &highest); + if (r < 0) + return r; + + DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); + if (r) + DMEMIT("%llu", ((highest + 1) * + tc->pool->sectors_per_block) - 1); + else + DMEMIT("-"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %lu", + format_dev_t(buf, tc->pool_dev->bdev->bd_dev), + (unsigned long) tc->dev_id); + if (tc->origin_dev) + DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); + break; + } + } + + return 0; +} + +static int thin_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + dm_block_t blocks; + struct thin_c *tc = ti->private; + + /* + * We can't call dm_pool_get_data_dev_size() since that blocks. So + * we follow a more convoluted path through to the pool's target. + */ + if (!tc->pool->ti) + return 0; /* nothing is bound */ + + blocks = tc->pool->ti->len >> tc->pool->block_shift; + if (blocks) + return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); + + return 0; +} + +static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct thin_c *tc = ti->private; + struct pool *pool = tc->pool; + + blk_limits_io_min(limits, 0); + blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + set_discard_limits(pool, limits); +} + +static struct target_type thin_target = { + .name = "thin", + .version = {1, 1, 0}, + .module = THIS_MODULE, + .ctr = thin_ctr, + .dtr = thin_dtr, + .map = thin_map, + .end_io = thin_endio, + .postsuspend = thin_postsuspend, + .status = thin_status, + .iterate_devices = thin_iterate_devices, + .io_hints = thin_io_hints, +}; + +/*----------------------------------------------------------------*/ + +static int __init dm_thin_init(void) +{ + int r; + + pool_table_init(); + + r = dm_register_target(&thin_target); + if (r) + return r; + + r = dm_register_target(&pool_target); + if (r) + dm_unregister_target(&thin_target); + + return r; +} + +static void dm_thin_exit(void) +{ + dm_unregister_target(&thin_target); + dm_unregister_target(&pool_target); +} + +module_init(dm_thin_init); +module_exit(dm_thin_exit); + +MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c new file mode 100644 index 00000000..8efe033b --- /dev/null +++ b/drivers/md/dm-uevent.c @@ -0,0 +1,219 @@ +/* + * Device Mapper Uevent Support (dm-uevent) + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright IBM Corporation, 2007 + * Author: Mike Anderson <andmike@linux.vnet.ibm.com> + */ +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/kobject.h> +#include <linux/dm-ioctl.h> +#include <linux/export.h> + +#include "dm.h" +#include "dm-uevent.h" + +#define DM_MSG_PREFIX "uevent" + +static const struct { + enum dm_uevent_type type; + enum kobject_action action; + char *name; +} _dm_uevent_type_names[] = { + {DM_UEVENT_PATH_FAILED, KOBJ_CHANGE, "PATH_FAILED"}, + {DM_UEVENT_PATH_REINSTATED, KOBJ_CHANGE, "PATH_REINSTATED"}, +}; + +static struct kmem_cache *_dm_event_cache; + +struct dm_uevent { + struct mapped_device *md; + enum kobject_action action; + struct kobj_uevent_env ku_env; + struct list_head elist; + char name[DM_NAME_LEN]; + char uuid[DM_UUID_LEN]; +}; + +static void dm_uevent_free(struct dm_uevent *event) +{ + kmem_cache_free(_dm_event_cache, event); +} + +static struct dm_uevent *dm_uevent_alloc(struct mapped_device *md) +{ + struct dm_uevent *event; + + event = kmem_cache_zalloc(_dm_event_cache, GFP_ATOMIC); + if (!event) + return NULL; + + INIT_LIST_HEAD(&event->elist); + event->md = md; + + return event; +} + +static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md, + struct dm_target *ti, + enum kobject_action action, + const char *dm_action, + const char *path, + unsigned nr_valid_paths) +{ + struct dm_uevent *event; + + event = dm_uevent_alloc(md); + if (!event) { + DMERR("%s: dm_uevent_alloc() failed", __func__); + goto err_nomem; + } + + event->action = action; + + if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) { + DMERR("%s: add_uevent_var() for DM_TARGET failed", + __func__); + goto err_add; + } + + if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) { + DMERR("%s: add_uevent_var() for DM_ACTION failed", + __func__); + goto err_add; + } + + if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u", + dm_next_uevent_seq(md))) { + DMERR("%s: add_uevent_var() for DM_SEQNUM failed", + __func__); + goto err_add; + } + + if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) { + DMERR("%s: add_uevent_var() for DM_PATH failed", __func__); + goto err_add; + } + + if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d", + nr_valid_paths)) { + DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed", + __func__); + goto err_add; + } + + return event; + +err_add: + dm_uevent_free(event); +err_nomem: + return ERR_PTR(-ENOMEM); +} + +/** + * dm_send_uevents - send uevents for given list + * + * @events: list of events to send + * @kobj: kobject generating event + * + */ +void dm_send_uevents(struct list_head *events, struct kobject *kobj) +{ + int r; + struct dm_uevent *event, *next; + + list_for_each_entry_safe(event, next, events, elist) { + list_del_init(&event->elist); + + /* + * When a device is being removed this copy fails and we + * discard these unsent events. 
+ */ + if (dm_copy_name_and_uuid(event->md, event->name, + event->uuid)) { + DMINFO("%s: skipping sending uevent for lost device", + __func__); + goto uevent_free; + } + + if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) { + DMERR("%s: add_uevent_var() for DM_NAME failed", + __func__); + goto uevent_free; + } + + if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) { + DMERR("%s: add_uevent_var() for DM_UUID failed", + __func__); + goto uevent_free; + } + + r = kobject_uevent_env(kobj, event->action, event->ku_env.envp); + if (r) + DMERR("%s: kobject_uevent_env failed", __func__); +uevent_free: + dm_uevent_free(event); + } +} +EXPORT_SYMBOL_GPL(dm_send_uevents); + +/** + * dm_path_uevent - called to create a new path event and queue it + * + * @event_type: path event type enum + * @ti: pointer to a dm_target + * @path: string containing pathname + * @nr_valid_paths: number of valid paths remaining + * + */ +void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, + const char *path, unsigned nr_valid_paths) +{ + struct mapped_device *md = dm_table_get_md(ti->table); + struct dm_uevent *event; + + if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { + DMERR("%s: Invalid event_type %d", __func__, event_type); + return; + } + + event = dm_build_path_uevent(md, ti, + _dm_uevent_type_names[event_type].action, + _dm_uevent_type_names[event_type].name, + path, nr_valid_paths); + if (IS_ERR(event)) + return; + + dm_uevent_add(md, &event->elist); +} +EXPORT_SYMBOL_GPL(dm_path_uevent); + +int dm_uevent_init(void) +{ + _dm_event_cache = KMEM_CACHE(dm_uevent, 0); + if (!_dm_event_cache) + return -ENOMEM; + + DMINFO("version 1.0.3"); + + return 0; +} + +void dm_uevent_exit(void) +{ + kmem_cache_destroy(_dm_event_cache); +} diff --git a/drivers/md/dm-uevent.h b/drivers/md/dm-uevent.h new file mode 100644 index 00000000..2eccc8bd --- /dev/null +++ b/drivers/md/dm-uevent.h @@ -0,0 +1,59 @@ +/* + * Device Mapper Uevent Support + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright IBM Corporation, 2007 + * Author: Mike Anderson <andmike@linux.vnet.ibm.com> + */ +#ifndef DM_UEVENT_H +#define DM_UEVENT_H + +enum dm_uevent_type { + DM_UEVENT_PATH_FAILED, + DM_UEVENT_PATH_REINSTATED, +}; + +#ifdef CONFIG_DM_UEVENT + +extern int dm_uevent_init(void); +extern void dm_uevent_exit(void); +extern void dm_send_uevents(struct list_head *events, struct kobject *kobj); +extern void dm_path_uevent(enum dm_uevent_type event_type, + struct dm_target *ti, const char *path, + unsigned nr_valid_paths); + +#else + +static inline int dm_uevent_init(void) +{ + return 0; +} +static inline void dm_uevent_exit(void) +{ +} +static inline void dm_send_uevents(struct list_head *events, + struct kobject *kobj) +{ +} +static inline void dm_path_uevent(enum dm_uevent_type event_type, + struct dm_target *ti, const char *path, + unsigned nr_valid_paths) +{ +} + +#endif /* CONFIG_DM_UEVENT */ + +#endif /* DM_UEVENT_H */ diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c new file mode 100644 index 00000000..fa365d39 --- /dev/null +++ b/drivers/md/dm-verity.c @@ -0,0 +1,913 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * This file is released under the GPLv2. + * + * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set + * default prefetch value. Data are read in "prefetch_cluster" chunks from the + * hash device. Setting this greatly improves performance when data and hash + * are on the same disk on different partitions on devices with poor random + * access behavior. + */ + +#include "dm-bufio.h" + +#include <linux/module.h> +#include <linux/device-mapper.h> +#include <crypto/hash.h> + +#define DM_MSG_PREFIX "verity" + +#define DM_VERITY_IO_VEC_INLINE 16 +#define DM_VERITY_MEMPOOL_SIZE 4 +#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 + +#define DM_VERITY_MAX_LEVELS 63 + +static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; + +module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_shash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + unsigned salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + unsigned digest_size; /* digest size for the current hash algorithm */ + unsigned shash_descsize;/* the size of temporary space for crypto */ + int hash_failed; /* set to 1 if hash of any block failed */ + + mempool_t *io_mempool; /* mempool of struct dm_verity_io */ + mempool_t *vec_mempool; /* mempool of bio vector */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. 
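+ * Level 0 holds the hashes of the data blocks themselves; the single
+ * block at level (levels - 1) is the one covered by root_digest.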
*/ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; +}; + +struct dm_verity_io { + struct dm_verity *v; + struct bio *bio; + + /* original values of bio->bi_end_io and bio->bi_private */ + bio_end_io_t *orig_bi_end_io; + void *orig_bi_private; + + sector_t block; + unsigned n_blocks; + + /* saved bio vector */ + struct bio_vec *io_vec; + unsigned io_vec_size; + + struct work_struct work; + + /* A space for short vectors; longer vectors are allocated separately. */ + struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE]; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_desc[v->shash_descsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). + */ +}; + +static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) +{ + return (struct shash_desc *)(io + 1); +} + +static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize; +} + +static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; +} + +/* + * Auxiliary structure appended to each dm-bufio buffer. If the value + * hash_verified is nonzero, hash of the block has been verified. + * + * The variable hash_verified is set to 0 when allocating the buffer, then + * it can be changed to 1 and it is never reset to 0 again. + * + * There is no lock around this value, a race condition can at worst cause + * that multiple processes verify the hash of the same buffer simultaneously + * and write 1 to hash_verified simultaneously. + * This condition is harmless, so we don't need locking. + */ +struct buffer_aux { + int hash_verified; +}; + +/* + * Initialize struct buffer_aux for a freshly created buffer. + */ +static void dm_bufio_alloc_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + + aux->hash_verified = 0; +} + +/* + * Translate input sector number to the sector number on the target device. + */ +static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector) +{ + return v->data_start + dm_target_offset(v->ti, bi_sector); +} + +/* + * Return hash position of a specified block at a specified tree level + * (0 is the lowest level). + * The lowest "hash_per_block_bits"-bits of the result denote hash position + * inside a hash block. The remaining bits denote location of the hash block. + */ +static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, + int level) +{ + return block >> (level * v->hash_per_block_bits); +} + +static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, + sector_t *hash_block, unsigned *offset) +{ + sector_t position = verity_position_at_level(v, block, level); + unsigned idx; + + *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits); + + if (!offset) + return; + + idx = position & ((1 << v->hash_per_block_bits) - 1); + if (!v->version) + *offset = idx * v->digest_size; + else + *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits); +} + +/* + * Verify hash of a metadata block pertaining to the specified data block + * ("block" argument) at a specified level ("level" argument). + * + * On successful return, io_want_digest(v, io) contains the hash value for + * a lower tree level or for the data block (if we're at the lowest leve). 
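+ * The caller then checks that digest against the next lower hash
+ * block, or against the data block itself, in verity_verify_io().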
+ * + * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. + * If "skip_unverified" is false, unverified buffer is hashed and verified + * against current value of io_want_digest(v, io). + */ +static int verity_verify_level(struct dm_verity_io *io, sector_t block, + int level, bool skip_unverified) +{ + struct dm_verity *v = io->v; + struct dm_buffer *buf; + struct buffer_aux *aux; + u8 *data; + int r; + sector_t hash_block; + unsigned offset; + + verity_hash_at_level(v, block, level, &hash_block, &offset); + + data = dm_bufio_read(v->bufio, hash_block, &buf); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + + aux = dm_bufio_get_aux_data(buf); + + if (!aux->hash_verified) { + struct shash_desc *desc; + u8 *result; + + if (skip_unverified) { + r = 1; + goto release_ret_r; + } + + desc = io_hash_desc(v, io); + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + r = crypto_shash_init(desc); + if (r < 0) { + DMERR("crypto_shash_init failed: %d", r); + goto release_ret_r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + } + + r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + + if (!v->version) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + } + + result = io_real_digest(v, io); + r = crypto_shash_final(desc, result); + if (r < 0) { + DMERR("crypto_shash_final failed: %d", r); + goto release_ret_r; + } + if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { + DMERR_LIMIT("metadata block %llu is corrupted", + (unsigned long long)hash_block); + v->hash_failed = 1; + r = -EIO; + goto release_ret_r; + } else + aux->hash_verified = 1; + } + + data += offset; + + memcpy(io_want_digest(v, io), data, v->digest_size); + + dm_bufio_release(buf); + return 0; + +release_ret_r: + dm_bufio_release(buf); + + return r; +} + +/* + * Verify one "dm_verity_io" structure. + */ +static int verity_verify_io(struct dm_verity_io *io) +{ + struct dm_verity *v = io->v; + unsigned b; + int i; + unsigned vector = 0, offset = 0; + + for (b = 0; b < io->n_blocks; b++) { + struct shash_desc *desc; + u8 *result; + int r; + unsigned todo; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 0 and we fall back to whole + * chain verification. 
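+ * (To be precise, verity_verify_level() returns 1, not 0, when the
+ * hash block is still unverified; a return of 0 means it was already
+ * verified and io_want_digest() holds the hash to check the data
+ * block against.)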
+ */ + int r = verity_verify_level(io, io->block + b, 0, true); + if (likely(!r)) + goto test_block_hash; + if (r < 0) + return r; + } + + memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); + + for (i = v->levels - 1; i >= 0; i--) { + int r = verity_verify_level(io, io->block + b, i, false); + if (unlikely(r)) + return r; + } + +test_block_hash: + desc = io_hash_desc(v, io); + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + r = crypto_shash_init(desc); + if (r < 0) { + DMERR("crypto_shash_init failed: %d", r); + return r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + todo = 1 << v->data_dev_block_bits; + do { + struct bio_vec *bv; + u8 *page; + unsigned len; + + BUG_ON(vector >= io->io_vec_size); + bv = &io->io_vec[vector]; + page = kmap_atomic(bv->bv_page); + len = bv->bv_len - offset; + if (likely(len >= todo)) + len = todo; + r = crypto_shash_update(desc, + page + bv->bv_offset + offset, len); + kunmap_atomic(page); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + offset += len; + if (likely(offset == bv->bv_len)) { + offset = 0; + vector++; + } + todo -= len; + } while (todo); + + if (!v->version) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + result = io_real_digest(v, io); + r = crypto_shash_final(desc, result); + if (r < 0) { + DMERR("crypto_shash_final failed: %d", r); + return r; + } + if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { + DMERR_LIMIT("data block %llu is corrupted", + (unsigned long long)(io->block + b)); + v->hash_failed = 1; + return -EIO; + } + } + BUG_ON(vector != io->io_vec_size); + BUG_ON(offset); + + return 0; +} + +/* + * End one "io" structure with a given error. + */ +static void verity_finish_io(struct dm_verity_io *io, int error) +{ + struct bio *bio = io->bio; + struct dm_verity *v = io->v; + + bio->bi_end_io = io->orig_bi_end_io; + bio->bi_private = io->orig_bi_private; + + if (io->io_vec != io->io_vec_inline) + mempool_free(io->io_vec, v->vec_mempool); + + mempool_free(io, v->io_mempool); + + bio_endio(bio, error); +} + +static void verity_work(struct work_struct *w) +{ + struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); + + verity_finish_io(io, verity_verify_io(io)); +} + +static void verity_end_io(struct bio *bio, int error) +{ + struct dm_verity_io *io = bio->bi_private; + + if (error) { + verity_finish_io(io, error); + return; + } + + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); +} + +/* + * Prefetch buffers for the specified io. + * The root buffer is not prefetched, it is assumed that it will be cached + * all the time. 
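+ * The loop below therefore starts at level (levels - 2), and for the
+ * lowest level the range is widened to boundaries of the
+ * prefetch_cluster module parameter.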
+ */ +static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) +{ + int i; + + for (i = v->levels - 2; i >= 0; i--) { + sector_t hash_block_start; + sector_t hash_block_end; + verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); + verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); + if (!i) { + unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster; + + cluster >>= v->data_dev_block_bits; + if (unlikely(!cluster)) + goto no_prefetch_cluster; + + if (unlikely(cluster & (cluster - 1))) + cluster = 1 << (fls(cluster) - 1); + + hash_block_start &= ~(sector_t)(cluster - 1); + hash_block_end |= cluster - 1; + if (unlikely(hash_block_end >= v->hash_blocks)) + hash_block_end = v->hash_blocks - 1; + } +no_prefetch_cluster: + dm_bufio_prefetch(v->bufio, hash_block_start, + hash_block_end - hash_block_start + 1); + } +} + +/* + * Bio map function. It allocates dm_verity_io structure and bio vector and + * fills them. Then it issues prefetches and the I/O. + */ +static int verity_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dm_verity *v = ti->private; + struct dm_verity_io *io; + + bio->bi_bdev = v->data_dev->bdev; + bio->bi_sector = verity_map_sector(v, bio->bi_sector); + + if (((unsigned)bio->bi_sector | bio_sectors(bio)) & + ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { + DMERR_LIMIT("unaligned io"); + return -EIO; + } + + if ((bio->bi_sector + bio_sectors(bio)) >> + (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { + DMERR_LIMIT("io out of range"); + return -EIO; + } + + if (bio_data_dir(bio) == WRITE) + return -EIO; + + io = mempool_alloc(v->io_mempool, GFP_NOIO); + io->v = v; + io->bio = bio; + io->orig_bi_end_io = bio->bi_end_io; + io->orig_bi_private = bio->bi_private; + io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); + io->n_blocks = bio->bi_size >> v->data_dev_block_bits; + + bio->bi_end_io = verity_end_io; + bio->bi_private = io; + io->io_vec_size = bio->bi_vcnt - bio->bi_idx; + if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) + io->io_vec = io->io_vec_inline; + else + io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); + memcpy(io->io_vec, bio_iovec(bio), + io->io_vec_size * sizeof(struct bio_vec)); + + verity_prefetch_io(v, io); + + generic_make_request(bio); + + return DM_MAPIO_SUBMITTED; +} + +/* + * Status: V (valid) or C (corruption found) + */ +static int verity_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct dm_verity *v = ti->private; + unsigned sz = 0; + unsigned x; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%c", v->hash_failed ? 'C' : 'V'); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %s %s %u %u %llu %llu %s ", + v->version, + v->data_dev->name, + v->hash_dev->name, + 1 << v->data_dev_block_bits, + 1 << v->hash_dev_block_bits, + (unsigned long long)v->data_blocks, + (unsigned long long)v->hash_start, + v->alg_name + ); + for (x = 0; x < v->digest_size; x++) + DMEMIT("%02x", v->root_digest[x]); + DMEMIT(" "); + if (!v->salt_size) + DMEMIT("-"); + else + for (x = 0; x < v->salt_size; x++) + DMEMIT("%02x", v->salt[x]); + break; + } + + return 0; +} + +static int verity_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg) +{ + struct dm_verity *v = ti->private; + int r = 0; + + if (v->data_start || + ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? 
: __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, + cmd, arg); +} + +static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct dm_verity *v = ti->private; + struct request_queue *q = bdev_get_queue(v->data_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = v->data_dev->bdev; + bvm->bi_sector = verity_map_sector(v, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int verity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_verity *v = ti->private; + + return fn(ti, v->data_dev, v->data_start, ti->len, data); +} + +static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_verity *v = ti->private; + + if (limits->logical_block_size < 1 << v->data_dev_block_bits) + limits->logical_block_size = 1 << v->data_dev_block_bits; + + if (limits->physical_block_size < 1 << v->data_dev_block_bits) + limits->physical_block_size = 1 << v->data_dev_block_bits; + + blk_limits_io_min(limits, limits->logical_block_size); +} + +static void verity_dtr(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + + if (v->verify_wq) + destroy_workqueue(v->verify_wq); + + if (v->vec_mempool) + mempool_destroy(v->vec_mempool); + + if (v->io_mempool) + mempool_destroy(v->io_mempool); + + if (v->bufio) + dm_bufio_client_destroy(v->bufio); + + kfree(v->salt); + kfree(v->root_digest); + + if (v->tfm) + crypto_free_shash(v->tfm); + + kfree(v->alg_name); + + if (v->hash_dev) + dm_put_device(ti, v->hash_dev); + + if (v->data_dev) + dm_put_device(ti, v->data_dev); + + kfree(v); +} + +/* + * Target parameters: + * <version> The current format is version 1. + * Vsn 0 is compatible with original Chromium OS releases. + * <data device> + * <hash device> + * <data block size> + * <hash block size> + * <the number of data blocks> + * <hash start block> + * <algorithm> + * <digest> + * <salt> Hex string or "-" if no salt. 
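+ *
+ * An illustrative table line (device names, digest and salt are
+ * placeholders, not values from this patch):
+ *
+ *   0 409600 verity 1 /dev/sdX1 /dev/sdX2 4096 4096 51200 1 sha256
+ *       <64-hex-digit root digest> <hex salt, or "-">
+ *
+ * maps 51200 4KiB data blocks (409600 sectors), with the hash tree
+ * starting at block 1 of the hash device.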
+ */ +static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + struct dm_verity *v; + unsigned num; + unsigned long long num_ll; + int r; + int i; + sector_t hash_position; + char dummy; + + v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); + if (!v) { + ti->error = "Cannot allocate verity structure"; + return -ENOMEM; + } + ti->private = v; + v->ti = ti; + + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { + ti->error = "Device must be readonly"; + r = -EINVAL; + goto bad; + } + + if (argc != 10) { + ti->error = "Invalid argument count: exactly 10 arguments required"; + r = -EINVAL; + goto bad; + } + + if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 || + num < 0 || num > 1) { + ti->error = "Invalid version"; + r = -EINVAL; + goto bad; + } + v->version = num; + + r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); + if (r) { + ti->error = "Data device lookup failed"; + goto bad; + } + + r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); + if (r) { + ti->error = "Data device lookup failed"; + goto bad; + } + + if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 || + !num || (num & (num - 1)) || + num < bdev_logical_block_size(v->data_dev->bdev) || + num > PAGE_SIZE) { + ti->error = "Invalid data device block size"; + r = -EINVAL; + goto bad; + } + v->data_dev_block_bits = ffs(num) - 1; + + if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || + !num || (num & (num - 1)) || + num < bdev_logical_block_size(v->hash_dev->bdev) || + num > INT_MAX) { + ti->error = "Invalid hash device block size"; + r = -EINVAL; + goto bad; + } + v->hash_dev_block_bits = ffs(num) - 1; + + if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || + num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) != + (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) { + ti->error = "Invalid data blocks"; + r = -EINVAL; + goto bad; + } + v->data_blocks = num_ll; + + if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) { + ti->error = "Data device is too small"; + r = -EINVAL; + goto bad; + } + + if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || + num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) != + (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) { + ti->error = "Invalid hash start"; + r = -EINVAL; + goto bad; + } + v->hash_start = num_ll; + + v->alg_name = kstrdup(argv[7], GFP_KERNEL); + if (!v->alg_name) { + ti->error = "Cannot allocate algorithm name"; + r = -ENOMEM; + goto bad; + } + + v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); + if (IS_ERR(v->tfm)) { + ti->error = "Cannot initialize hash function"; + r = PTR_ERR(v->tfm); + v->tfm = NULL; + goto bad; + } + v->digest_size = crypto_shash_digestsize(v->tfm); + if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { + ti->error = "Digest size too big"; + r = -EINVAL; + goto bad; + } + v->shash_descsize = + sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); + + v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); + if (!v->root_digest) { + ti->error = "Cannot allocate root digest"; + r = -ENOMEM; + goto bad; + } + if (strlen(argv[8]) != v->digest_size * 2 || + hex2bin(v->root_digest, argv[8], v->digest_size)) { + ti->error = "Invalid root digest"; + r = -EINVAL; + goto bad; + } + + if (strcmp(argv[9], "-")) { + v->salt_size = strlen(argv[9]) / 2; + v->salt = kmalloc(v->salt_size, GFP_KERNEL); + if (!v->salt) { + ti->error = "Cannot allocate salt"; + r = -ENOMEM; + goto bad; + } + if (strlen(argv[9]) != v->salt_size * 2 || + hex2bin(v->salt, argv[9], v->salt_size)) { + ti->error = "Invalid 
salt"; + r = -EINVAL; + goto bad; + } + } + + v->hash_per_block_bits = + fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1; + + v->levels = 0; + if (v->data_blocks) + while (v->hash_per_block_bits * v->levels < 64 && + (unsigned long long)(v->data_blocks - 1) >> + (v->hash_per_block_bits * v->levels)) + v->levels++; + + if (v->levels > DM_VERITY_MAX_LEVELS) { + ti->error = "Too many tree levels"; + r = -E2BIG; + goto bad; + } + + hash_position = v->hash_start; + for (i = v->levels - 1; i >= 0; i--) { + sector_t s; + v->hash_level_block[i] = hash_position; + s = verity_position_at_level(v, v->data_blocks, i); + s = (s >> v->hash_per_block_bits) + + !!(s & ((1 << v->hash_per_block_bits) - 1)); + if (hash_position + s < hash_position) { + ti->error = "Hash device offset overflow"; + r = -E2BIG; + goto bad; + } + hash_position += s; + } + v->hash_blocks = hash_position; + + v->bufio = dm_bufio_client_create(v->hash_dev->bdev, + 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), + dm_bufio_alloc_callback, NULL); + if (IS_ERR(v->bufio)) { + ti->error = "Cannot initialize dm-bufio"; + r = PTR_ERR(v->bufio); + v->bufio = NULL; + goto bad; + } + + if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { + ti->error = "Hash device is too small"; + r = -E2BIG; + goto bad; + } + + v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, + sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2); + if (!v->io_mempool) { + ti->error = "Cannot allocate io mempool"; + r = -ENOMEM; + goto bad; + } + + v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, + BIO_MAX_PAGES * sizeof(struct bio_vec)); + if (!v->vec_mempool) { + ti->error = "Cannot allocate vector mempool"; + r = -ENOMEM; + goto bad; + } + + /* WQ_UNBOUND greatly improves performance when running on ramdisk */ + v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); + if (!v->verify_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + return 0; + +bad: + verity_dtr(ti); + + return r; +} + +static struct target_type verity_target = { + .name = "verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + +static int __init dm_verity_init(void) +{ + int r; + + r = dm_register_target(&verity_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_verity_exit(void) +{ + dm_unregister_target(&verity_target); +} + +module_init(dm_verity_init); +module_exit(dm_verity_exit); + +MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); +MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>"); +MODULE_AUTHOR("Will Drewry <wad@chromium.org>"); +MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c new file mode 100644 index 00000000..cc2b3cb8 --- /dev/null +++ b/drivers/md/dm-zero.c @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2003 Christophe Saout <christophe@saout.de> + * + * This file is released under the GPL. 
+ */ + +#include <linux/device-mapper.h> + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bio.h> + +#define DM_MSG_PREFIX "zero" + +/* + * Construct a dummy mapping that only returns zeros + */ +static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + if (argc != 0) { + ti->error = "No arguments required"; + return -EINVAL; + } + + /* + * Silently drop discards, avoiding -EOPNOTSUPP. + */ + ti->num_discard_requests = 1; + + return 0; +} + +/* + * Return zeros only on reads + */ +static int zero_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + switch(bio_rw(bio)) { + case READ: + zero_fill_bio(bio); + break; + case READA: + /* readahead of null bytes only wastes buffer cache */ + return -EIO; + case WRITE: + /* writes get silently dropped */ + break; + } + + bio_endio(bio, 0); + + /* accepted bio, don't make new request */ + return DM_MAPIO_SUBMITTED; +} + +static struct target_type zero_target = { + .name = "zero", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = zero_ctr, + .map = zero_map, +}; + +static int __init dm_zero_init(void) +{ + int r = dm_register_target(&zero_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_zero_exit(void) +{ + dm_unregister_target(&zero_target); +} + +module_init(dm_zero_init) +module_exit(dm_zero_exit) + +MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); +MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.c b/drivers/md/dm.c new file mode 100644 index 00000000..e24143cc --- /dev/null +++ b/drivers/md/dm.c @@ -0,0 +1,2780 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-uevent.h" + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/moduleparam.h> +#include <linux/blkpg.h> +#include <linux/bio.h> +#include <linux/mempool.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/hdreg.h> +#include <linux/delay.h> + +#include <trace/events/block.h> + +#define DM_MSG_PREFIX "core" + +#ifdef CONFIG_PRINTK +/* + * ratelimit state to be used in DMXXX_LIMIT(). + */ +DEFINE_RATELIMIT_STATE(dm_ratelimit_state, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); +EXPORT_SYMBOL(dm_ratelimit_state); +#endif + +/* + * Cookies are numeric values sent with CHANGE and REMOVE + * uevents while resuming, removing or renaming the device. + */ +#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" +#define DM_COOKIE_LENGTH 24 + +static const char *_name = DM_NAME; + +static unsigned int major = 0; +static unsigned int _major = 0; + +static DEFINE_IDR(_minor_idr); + +static DEFINE_SPINLOCK(_minor_lock); +/* + * For bio-based dm. + * One of these is allocated per bio. + */ +struct dm_io { + struct mapped_device *md; + int error; + atomic_t io_count; + struct bio *bio; + unsigned long start_time; + spinlock_t endio_lock; +}; + +/* + * For bio-based dm. + * One of these is allocated per target within a bio. Hopefully + * this will be simplified out one day. + */ +struct dm_target_io { + struct dm_io *io; + struct dm_target *ti; + union map_info info; +}; + +/* + * For request-based dm. + * One of these is allocated per request. 
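+ *
+ * 'orig' points at the original request submitted to the dm device, while
+ * 'clone' is embedded in the tio rather than allocated separately, so
+ * freeing the tio (see free_rq_clone() below) releases the clone with it.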
+ */ +struct dm_rq_target_io { + struct mapped_device *md; + struct dm_target *ti; + struct request *orig, clone; + int error; + union map_info info; +}; + +/* + * For request-based dm. + * One of these is allocated per bio. + */ +struct dm_rq_clone_bio_info { + struct bio *orig; + struct dm_rq_target_io *tio; +}; + +union map_info *dm_get_mapinfo(struct bio *bio) +{ + if (bio && bio->bi_private) + return &((struct dm_target_io *)bio->bi_private)->info; + return NULL; +} + +union map_info *dm_get_rq_mapinfo(struct request *rq) +{ + if (rq && rq->end_io_data) + return &((struct dm_rq_target_io *)rq->end_io_data)->info; + return NULL; +} +EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); + +#define MINOR_ALLOCED ((void *)-1) + +/* + * Bits for the md->flags field. + */ +#define DMF_BLOCK_IO_FOR_SUSPEND 0 +#define DMF_SUSPENDED 1 +#define DMF_FROZEN 2 +#define DMF_FREEING 3 +#define DMF_DELETING 4 +#define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_MERGE_IS_OPTIONAL 6 + +/* + * Work processed by per-device workqueue. + */ +struct mapped_device { + struct rw_semaphore io_lock; + struct mutex suspend_lock; + rwlock_t map_lock; + atomic_t holders; + atomic_t open_count; + + unsigned long flags; + + struct request_queue *queue; + unsigned type; + /* Protect queue and type against concurrent access. */ + struct mutex type_lock; + + struct target_type *immutable_target_type; + + struct gendisk *disk; + char name[16]; + + void *interface_ptr; + + /* + * A list of ios that arrived while we were suspended. + */ + atomic_t pending[2]; + wait_queue_head_t wait; + struct work_struct work; + struct bio_list deferred; + spinlock_t deferred_lock; + + /* + * Processing queue (flush) + */ + struct workqueue_struct *wq; + + /* + * The current mapping. + */ + struct dm_table *map; + + /* + * io objects are allocated from here. + */ + mempool_t *io_pool; + mempool_t *tio_pool; + + struct bio_set *bs; + + /* + * Event handling. + */ + atomic_t event_nr; + wait_queue_head_t eventq; + atomic_t uevent_seq; + struct list_head uevent_list; + spinlock_t uevent_lock; /* Protect access to uevent_list */ + + /* + * freeze/thaw support require holding onto a super block + */ + struct super_block *frozen_sb; + struct block_device *bdev; + + /* forced geometry settings */ + struct hd_geometry geometry; + + /* sysfs handle */ + struct kobject kobj; + + /* zero-length flush that will be cloned and submitted to targets */ + struct bio flush_bio; +}; + +/* + * For mempools pre-allocation at the table loading time. 
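+ *
+ * These pools are created while a table is being loaded (see
+ * dm_alloc_md_mempools() at the end of this file) and are handed over to
+ * the mapped_device by __bind_mempools(), which simply steals the pointers:
+ *
+ *	md->io_pool = p->io_pool;
+ *	p->io_pool = NULL;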
+ */ +struct dm_md_mempools { + mempool_t *io_pool; + mempool_t *tio_pool; + struct bio_set *bs; +}; + +#define MIN_IOS 256 +static struct kmem_cache *_io_cache; +static struct kmem_cache *_tio_cache; +static struct kmem_cache *_rq_tio_cache; +static struct kmem_cache *_rq_bio_info_cache; + +static int __init local_init(void) +{ + int r = -ENOMEM; + + /* allocate a slab for the dm_ios */ + _io_cache = KMEM_CACHE(dm_io, 0); + if (!_io_cache) + return r; + + /* allocate a slab for the target ios */ + _tio_cache = KMEM_CACHE(dm_target_io, 0); + if (!_tio_cache) + goto out_free_io_cache; + + _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); + if (!_rq_tio_cache) + goto out_free_tio_cache; + + _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); + if (!_rq_bio_info_cache) + goto out_free_rq_tio_cache; + + r = dm_uevent_init(); + if (r) + goto out_free_rq_bio_info_cache; + + _major = major; + r = register_blkdev(_major, _name); + if (r < 0) + goto out_uevent_exit; + + if (!_major) + _major = r; + + return 0; + +out_uevent_exit: + dm_uevent_exit(); +out_free_rq_bio_info_cache: + kmem_cache_destroy(_rq_bio_info_cache); +out_free_rq_tio_cache: + kmem_cache_destroy(_rq_tio_cache); +out_free_tio_cache: + kmem_cache_destroy(_tio_cache); +out_free_io_cache: + kmem_cache_destroy(_io_cache); + + return r; +} + +static void local_exit(void) +{ + kmem_cache_destroy(_rq_bio_info_cache); + kmem_cache_destroy(_rq_tio_cache); + kmem_cache_destroy(_tio_cache); + kmem_cache_destroy(_io_cache); + unregister_blkdev(_major, _name); + dm_uevent_exit(); + + _major = 0; + + DMINFO("cleaned up"); +} + +static int (*_inits[])(void) __initdata = { + local_init, + dm_target_init, + dm_linear_init, + dm_stripe_init, + dm_io_init, + dm_kcopyd_init, + dm_interface_init, +}; + +static void (*_exits[])(void) = { + local_exit, + dm_target_exit, + dm_linear_exit, + dm_stripe_exit, + dm_io_exit, + dm_kcopyd_exit, + dm_interface_exit, +}; + +static int __init dm_init(void) +{ + const int count = ARRAY_SIZE(_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i](); + if (r) + goto bad; + } + + return 0; + + bad: + while (i--) + _exits[i](); + + return r; +} + +static void __exit dm_exit(void) +{ + int i = ARRAY_SIZE(_exits); + + while (i--) + _exits[i](); + + /* + * Should be empty by this point. + */ + idr_remove_all(&_minor_idr); + idr_destroy(&_minor_idr); +} + +/* + * Block device functions + */ +int dm_deleting_md(struct mapped_device *md) +{ + return test_bit(DMF_DELETING, &md->flags); +} + +static int dm_blk_open(struct block_device *bdev, fmode_t mode) +{ + struct mapped_device *md; + + spin_lock(&_minor_lock); + + md = bdev->bd_disk->private_data; + if (!md) + goto out; + + if (test_bit(DMF_FREEING, &md->flags) || + dm_deleting_md(md)) { + md = NULL; + goto out; + } + + dm_get(md); + atomic_inc(&md->open_count); + +out: + spin_unlock(&_minor_lock); + + return md ? 0 : -ENXIO; +} + +static int dm_blk_close(struct gendisk *disk, fmode_t mode) +{ + struct mapped_device *md = disk->private_data; + + spin_lock(&_minor_lock); + + atomic_dec(&md->open_count); + dm_put(md); + + spin_unlock(&_minor_lock); + + return 0; +} + +int dm_open_count(struct mapped_device *md) +{ + return atomic_read(&md->open_count); +} + +/* + * Guarantees nothing is using the device before it's deleted. 
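+ *
+ * A caller removing the device (the dm ioctl code, outside this file) is
+ * expected to do roughly the following sketch:
+ *
+ *	if (dm_lock_for_deletion(md))
+ *		return -EBUSY;
+ *	dm_destroy(md);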
+ */ +int dm_lock_for_deletion(struct mapped_device *md) +{ + int r = 0; + + spin_lock(&_minor_lock); + + if (dm_open_count(md)) + r = -EBUSY; + else + set_bit(DMF_DELETING, &md->flags); + + spin_unlock(&_minor_lock); + + return r; +} + +static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + + return dm_get_geometry(md, geo); +} + +static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + struct dm_table *map = dm_get_live_table(md); + struct dm_target *tgt; + int r = -ENOTTY; + + if (!map || !dm_table_get_size(map)) + goto out; + + /* We only support devices that have a single target */ + if (dm_table_get_num_targets(map) != 1) + goto out; + + tgt = dm_table_get_target(map, 0); + + if (dm_suspended_md(md)) { + r = -EAGAIN; + goto out; + } + + if (tgt->type->ioctl) + r = tgt->type->ioctl(tgt, cmd, arg); + +out: + dm_table_put(map); + + return r; +} + +static struct dm_io *alloc_io(struct mapped_device *md) +{ + return mempool_alloc(md->io_pool, GFP_NOIO); +} + +static void free_io(struct mapped_device *md, struct dm_io *io) +{ + mempool_free(io, md->io_pool); +} + +static void free_tio(struct mapped_device *md, struct dm_target_io *tio) +{ + mempool_free(tio, md->tio_pool); +} + +static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, + gfp_t gfp_mask) +{ + return mempool_alloc(md->tio_pool, gfp_mask); +} + +static void free_rq_tio(struct dm_rq_target_io *tio) +{ + mempool_free(tio, tio->md->tio_pool); +} + +static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) +{ + return mempool_alloc(md->io_pool, GFP_ATOMIC); +} + +static void free_bio_info(struct dm_rq_clone_bio_info *info) +{ + mempool_free(info, info->tio->md->io_pool); +} + +static int md_in_flight(struct mapped_device *md) +{ + return atomic_read(&md->pending[READ]) + + atomic_read(&md->pending[WRITE]); +} + +static void start_io_acct(struct dm_io *io) +{ + struct mapped_device *md = io->md; + int cpu; + int rw = bio_data_dir(io->bio); + + io->start_time = jiffies; + + cpu = part_stat_lock(); + part_round_stats(cpu, &dm_disk(md)->part0); + part_stat_unlock(); + atomic_set(&dm_disk(md)->part0.in_flight[rw], + atomic_inc_return(&md->pending[rw])); +} + +static void end_io_acct(struct dm_io *io) +{ + struct mapped_device *md = io->md; + struct bio *bio = io->bio; + unsigned long duration = jiffies - io->start_time; + int pending, cpu; + int rw = bio_data_dir(bio); + + cpu = part_stat_lock(); + part_round_stats(cpu, &dm_disk(md)->part0); + part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); + part_stat_unlock(); + + /* + * After this is decremented the bio must not be touched if it is + * a flush. + */ + pending = atomic_dec_return(&md->pending[rw]); + atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); + pending += atomic_read(&md->pending[rw^0x1]); + + /* nudge anyone waiting on suspend queue */ + if (!pending) + wake_up(&md->wait); +} + +/* + * Add the bio to the list of deferred io. 
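+ *
+ * _dm_request() below relies on this while the device is suspending,
+ * roughly:
+ *
+ *	if (test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+ *		if (bio_rw(bio) != READA)
+ *			queue_io(md, bio);
+ *	}
+ *
+ * The deferred list is drained again by dm_wq_work().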
+ */ +static void queue_io(struct mapped_device *md, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&md->deferred_lock, flags); + bio_list_add(&md->deferred, bio); + spin_unlock_irqrestore(&md->deferred_lock, flags); + queue_work(md->wq, &md->work); +} + +/* + * Everyone (including functions in this file), should use this + * function to access the md->map field, and make sure they call + * dm_table_put() when finished. + */ +struct dm_table *dm_get_live_table(struct mapped_device *md) +{ + struct dm_table *t; + unsigned long flags; + + read_lock_irqsave(&md->map_lock, flags); + t = md->map; + if (t) + dm_table_get(t); + read_unlock_irqrestore(&md->map_lock, flags); + + return t; +} + +/* + * Get the geometry associated with a dm device + */ +int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) +{ + *geo = md->geometry; + + return 0; +} + +/* + * Set the geometry of a device. + */ +int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) +{ + sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; + + if (geo->start > sz) { + DMWARN("Start sector is beyond the geometry limits."); + return -EINVAL; + } + + md->geometry = *geo; + + return 0; +} + +/*----------------------------------------------------------------- + * CRUD START: + * A more elegant soln is in the works that uses the queue + * merge fn, unfortunately there are a couple of changes to + * the block layer that I want to make for this. So in the + * interests of getting something for people to use I give + * you this clearly demarcated crap. + *---------------------------------------------------------------*/ + +static int __noflush_suspending(struct mapped_device *md) +{ + return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); +} + +/* + * Decrements the number of outstanding ios that a bio has been + * cloned into, completing the original io if necc. + */ +static void dec_pending(struct dm_io *io, int error) +{ + unsigned long flags; + int io_error; + struct bio *bio; + struct mapped_device *md = io->md; + + /* Push-back supersedes any I/O errors */ + if (unlikely(error)) { + spin_lock_irqsave(&io->endio_lock, flags); + if (!(io->error > 0 && __noflush_suspending(md))) + io->error = error; + spin_unlock_irqrestore(&io->endio_lock, flags); + } + + if (atomic_dec_and_test(&io->io_count)) { + if (io->error == DM_ENDIO_REQUEUE) { + /* + * Target requested pushing back the I/O. + */ + spin_lock_irqsave(&md->deferred_lock, flags); + if (__noflush_suspending(md)) + bio_list_add_head(&md->deferred, io->bio); + else + /* noflush suspend was interrupted. */ + io->error = -EIO; + spin_unlock_irqrestore(&md->deferred_lock, flags); + } + + io_error = io->error; + bio = io->bio; + end_io_acct(io); + free_io(md, io); + + if (io_error == DM_ENDIO_REQUEUE) + return; + + if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { + /* + * Preflush done for flush with data, reissue + * without REQ_FLUSH. 
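+ *
+ * (For such a bio, __split_and_process_bio() initially submitted only the
+ * empty flush cloned from md->flush_bio; once that preflush has completed
+ * on every target, the data payload is resubmitted here as a plain write.)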
+ */ + bio->bi_rw &= ~REQ_FLUSH; + queue_io(md, bio); + } else { + /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio, io_error); + bio_endio(bio, io_error); + } + } +} + +static void clone_endio(struct bio *bio, int error) +{ + int r = 0; + struct dm_target_io *tio = bio->bi_private; + struct dm_io *io = tio->io; + struct mapped_device *md = tio->io->md; + dm_endio_fn endio = tio->ti->type->end_io; + + if (!bio_flagged(bio, BIO_UPTODATE) && !error) + error = -EIO; + + if (endio) { + r = endio(tio->ti, bio, error, &tio->info); + if (r < 0 || r == DM_ENDIO_REQUEUE) + /* + * error and requeue request are handled + * in dec_pending(). + */ + error = r; + else if (r == DM_ENDIO_INCOMPLETE) + /* The target will handle the io */ + return; + else if (r) { + DMWARN("unimplemented target endio return value: %d", r); + BUG(); + } + } + + /* + * Store md for cleanup instead of tio which is about to get freed. + */ + bio->bi_private = md->bs; + + free_tio(md, tio); + bio_put(bio); + dec_pending(io, error); +} + +/* + * Partial completion handling for request-based dm + */ +static void end_clone_bio(struct bio *clone, int error) +{ + struct dm_rq_clone_bio_info *info = clone->bi_private; + struct dm_rq_target_io *tio = info->tio; + struct bio *bio = info->orig; + unsigned int nr_bytes = info->orig->bi_size; + + bio_put(clone); + + if (tio->error) + /* + * An error has already been detected on the request. + * Once error occurred, just let clone->end_io() handle + * the remainder. + */ + return; + else if (error) { + /* + * Don't notice the error to the upper layer yet. + * The error handling decision is made by the target driver, + * when the request is completed. + */ + tio->error = error; + return; + } + + /* + * I/O for the bio successfully completed. + * Notice the data completion to the upper layer. + */ + + /* + * bios are processed from the head of the list. + * So the completing bio should always be rq->bio. + * If it's not, something wrong is happening. + */ + if (tio->orig->bio != bio) + DMERR("bio completion is going in the middle of the request"); + + /* + * Update the original request. + * Do not use blk_end_request() here, because it may complete + * the original request before the clone, and break the ordering. + */ + blk_update_request(tio->orig, 0, nr_bytes); +} + +/* + * Don't touch any member of the md after calling this function because + * the md may be freed in dm_put() at the end of this function. + * Or do dm_get() before calling this function and dm_put() later. + */ +static void rq_completed(struct mapped_device *md, int rw, int run_queue) +{ + atomic_dec(&md->pending[rw]); + + /* nudge anyone waiting on suspend queue */ + if (!md_in_flight(md)) + wake_up(&md->wait); + + if (run_queue) + blk_run_queue(md->queue); + + /* + * dm_put() must be at the end of this function. See the comment above + */ + dm_put(md); +} + +static void free_rq_clone(struct request *clone) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + + blk_rq_unprep_clone(clone); + free_rq_tio(tio); +} + +/* + * Complete the clone and the original request. + * Must be called without queue lock. 
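+ *
+ * The ordering below is deliberate: the clone is torn down first
+ * (free_rq_clone()), then the original request is finished with
+ * blk_end_request_all(), and finally rq_completed() drops the md reference
+ * that map_request() took for the in-flight I/O.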
+ */ +static void dm_end_request(struct request *clone, int error) +{ + int rw = rq_data_dir(clone); + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + rq->errors = clone->errors; + rq->resid_len = clone->resid_len; + + if (rq->sense) + /* + * We are using the sense buffer of the original + * request. + * So setting the length of the sense data is enough. + */ + rq->sense_len = clone->sense_len; + } + + free_rq_clone(clone); + blk_end_request_all(rq, error); + rq_completed(md, rw, true); +} + +static void dm_unprep_request(struct request *rq) +{ + struct request *clone = rq->special; + + rq->special = NULL; + rq->cmd_flags &= ~REQ_DONTPREP; + + free_rq_clone(clone); +} + +/* + * Requeue the original request of a clone. + */ +void dm_requeue_unmapped_request(struct request *clone) +{ + int rw = rq_data_dir(clone); + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + struct request_queue *q = rq->q; + unsigned long flags; + + dm_unprep_request(rq); + + spin_lock_irqsave(q->queue_lock, flags); + blk_requeue_request(q, rq); + spin_unlock_irqrestore(q->queue_lock, flags); + + rq_completed(md, rw, 0); +} +EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); + +static void __stop_queue(struct request_queue *q) +{ + blk_stop_queue(q); +} + +static void stop_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __stop_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void __start_queue(struct request_queue *q) +{ + if (blk_queue_stopped(q)) + blk_start_queue(q); +} + +static void start_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __start_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void dm_done(struct request *clone, int error, bool mapped) +{ + int r = error; + struct dm_rq_target_io *tio = clone->end_io_data; + dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; + + if (mapped && rq_end_io) + r = rq_end_io(tio->ti, clone, error, &tio->info); + + if (r <= 0) + /* The target wants to complete the I/O */ + dm_end_request(clone, r); + else if (r == DM_ENDIO_INCOMPLETE) + /* The target will handle the I/O */ + return; + else if (r == DM_ENDIO_REQUEUE) + /* The target wants to requeue the I/O */ + dm_requeue_unmapped_request(clone); + else { + DMWARN("unimplemented target endio return value: %d", r); + BUG(); + } +} + +/* + * Request completion handler for request-based dm + */ +static void dm_softirq_done(struct request *rq) +{ + bool mapped = true; + struct request *clone = rq->completion_data; + struct dm_rq_target_io *tio = clone->end_io_data; + + if (rq->cmd_flags & REQ_FAILED) + mapped = false; + + dm_done(clone, tio->error, mapped); +} + +/* + * Complete the clone and the original request with the error status + * through softirq context. + */ +static void dm_complete_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct request *rq = tio->orig; + + tio->error = error; + rq->completion_data = clone; + blk_complete_request(rq); +} + +/* + * Complete the not-mapped clone and the original request with the error status + * through softirq context. + * Target's rq_end_io() function isn't called. + * This may be used when the target's map_rq() function fails. 
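+ *
+ * map_request() below is the caller in this file: if ->map_rq() returns an
+ * unrecognised (negative) value, it invokes dm_kill_unmapped_request(clone, r)
+ * so the original request is failed without the target's rq_end_io being run.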
+ */ +void dm_kill_unmapped_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct request *rq = tio->orig; + + rq->cmd_flags |= REQ_FAILED; + dm_complete_request(clone, error); +} +EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); + +/* + * Called with the queue lock held + */ +static void end_clone_request(struct request *clone, int error) +{ + /* + * For just cleaning up the information of the queue in which + * the clone was dispatched. + * The clone is *NOT* freed actually here because it is alloced from + * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. + */ + __blk_put_request(clone->q, clone); + + /* + * Actual request completion is done in a softirq context which doesn't + * hold the queue lock. Otherwise, deadlock could occur because: + * - another request may be submitted by the upper level driver + * of the stacking during the completion + * - the submission which requires queue lock may be done + * against this queue + */ + dm_complete_request(clone, error); +} + +/* + * Return maximum size of I/O possible at the supplied sector up to the current + * target boundary. + */ +static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) +{ + sector_t target_offset = dm_target_offset(ti, sector); + + return ti->len - target_offset; +} + +static sector_t max_io_len(sector_t sector, struct dm_target *ti) +{ + sector_t len = max_io_len_target_boundary(sector, ti); + + /* + * Does the target need to split even further ? + */ + if (ti->split_io) { + sector_t boundary; + sector_t offset = dm_target_offset(ti, sector); + boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) + - offset; + if (len > boundary) + len = boundary; + } + + return len; +} + +static void __map_bio(struct dm_target *ti, struct bio *clone, + struct dm_target_io *tio) +{ + int r; + sector_t sector; + struct mapped_device *md; + + clone->bi_end_io = clone_endio; + clone->bi_private = tio; + + /* + * Map the clone. If r == 0 we don't need to do + * anything, the target has assumed ownership of + * this io. + */ + atomic_inc(&tio->io->io_count); + sector = clone->bi_sector; + r = ti->type->map(ti, clone, &tio->info); + if (r == DM_MAPIO_REMAPPED) { + /* the bio has been remapped so dispatch it */ + + trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, + tio->io->bio->bi_bdev->bd_dev, sector); + + generic_make_request(clone); + } else if (r < 0 || r == DM_MAPIO_REQUEUE) { + /* error the io and bail out, or requeue it if needed */ + md = tio->io->md; + dec_pending(tio->io, r); + /* + * Store bio_set for cleanup. + */ + clone->bi_end_io = NULL; + clone->bi_private = md->bs; + bio_put(clone); + free_tio(md, tio); + } else if (r) { + DMWARN("unimplemented target map return value: %d", r); + BUG(); + } +} + +struct clone_info { + struct mapped_device *md; + struct dm_table *map; + struct bio *bio; + struct dm_io *io; + sector_t sector; + sector_t sector_count; + unsigned short idx; +}; + +static void dm_bio_destructor(struct bio *bio) +{ + struct bio_set *bs = bio->bi_private; + + bio_free(bio, bs); +} + +/* + * Creates a little bio that just does part of a bvec. 
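+ *
+ * Used by __clone_and_map() when a single bvec straddles a target boundary:
+ * only 'len' sectors starting at 'offset' inside the bvec are cloned, and
+ * the clone carries one io_vec of its own, flagged BIO_CLONED.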
+ */ +static struct bio *split_bvec(struct bio *bio, sector_t sector, + unsigned short idx, unsigned int offset, + unsigned int len, struct bio_set *bs) +{ + struct bio *clone; + struct bio_vec *bv = bio->bi_io_vec + idx; + + clone = bio_alloc_bioset(GFP_NOIO, 1, bs); + clone->bi_destructor = dm_bio_destructor; + *clone->bi_io_vec = *bv; + + clone->bi_sector = sector; + clone->bi_bdev = bio->bi_bdev; + clone->bi_rw = bio->bi_rw; + clone->bi_vcnt = 1; + clone->bi_size = to_bytes(len); + clone->bi_io_vec->bv_offset = offset; + clone->bi_io_vec->bv_len = clone->bi_size; + clone->bi_flags |= 1 << BIO_CLONED; + + if (bio_integrity(bio)) { + bio_integrity_clone(clone, bio, GFP_NOIO, bs); + bio_integrity_trim(clone, + bio_sector_offset(bio, idx, offset), len); + } + + return clone; +} + +/* + * Creates a bio that consists of range of complete bvecs. + */ +static struct bio *clone_bio(struct bio *bio, sector_t sector, + unsigned short idx, unsigned short bv_count, + unsigned int len, struct bio_set *bs) +{ + struct bio *clone; + + clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); + __bio_clone(clone, bio); + clone->bi_destructor = dm_bio_destructor; + clone->bi_sector = sector; + clone->bi_idx = idx; + clone->bi_vcnt = idx + bv_count; + clone->bi_size = to_bytes(len); + clone->bi_flags &= ~(1 << BIO_SEG_VALID); + + if (bio_integrity(bio)) { + bio_integrity_clone(clone, bio, GFP_NOIO, bs); + + if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) + bio_integrity_trim(clone, + bio_sector_offset(bio, idx, 0), len); + } + + return clone; +} + +static struct dm_target_io *alloc_tio(struct clone_info *ci, + struct dm_target *ti) +{ + struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); + + tio->io = ci->io; + tio->ti = ti; + memset(&tio->info, 0, sizeof(tio->info)); + + return tio; +} + +static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, + unsigned request_nr, sector_t len) +{ + struct dm_target_io *tio = alloc_tio(ci, ti); + struct bio *clone; + + tio->info.target_request_nr = request_nr; + + /* + * Discard requests require the bio's inline iovecs be initialized. + * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush + * and discard, so no need for concern about wasted bvec allocations. + */ + clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); + __bio_clone(clone, ci->bio); + clone->bi_destructor = dm_bio_destructor; + if (len) { + clone->bi_sector = ci->sector; + clone->bi_size = to_bytes(len); + } + + __map_bio(ti, clone, tio); +} + +static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, + unsigned num_requests, sector_t len) +{ + unsigned request_nr; + + for (request_nr = 0; request_nr < num_requests; request_nr++) + __issue_target_request(ci, ti, request_nr, len); +} + +static int __clone_and_map_empty_flush(struct clone_info *ci) +{ + unsigned target_nr = 0; + struct dm_target *ti; + + BUG_ON(bio_has_data(ci->bio)); + while ((ti = dm_table_get_target(ci->map, target_nr++))) + __issue_target_requests(ci, ti, ti->num_flush_requests, 0); + + return 0; +} + +/* + * Perform all io with a single clone. 
+ */ +static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) +{ + struct bio *clone, *bio = ci->bio; + struct dm_target_io *tio; + + tio = alloc_tio(ci, ti); + clone = clone_bio(bio, ci->sector, ci->idx, + bio->bi_vcnt - ci->idx, ci->sector_count, + ci->md->bs); + __map_bio(ti, clone, tio); + ci->sector_count = 0; +} + +static int __clone_and_map_discard(struct clone_info *ci) +{ + struct dm_target *ti; + sector_t len; + + do { + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; + + /* + * Even though the device advertised discard support, + * that does not mean every target supports it, and + * reconfiguration might also have changed that since the + * check was performed. + */ + if (!ti->num_discard_requests) + return -EOPNOTSUPP; + + len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + + __issue_target_requests(ci, ti, ti->num_discard_requests, len); + + ci->sector += len; + } while (ci->sector_count -= len); + + return 0; +} + +static int __clone_and_map(struct clone_info *ci) +{ + struct bio *clone, *bio = ci->bio; + struct dm_target *ti; + sector_t len = 0, max; + struct dm_target_io *tio; + + if (unlikely(bio->bi_rw & REQ_DISCARD)) + return __clone_and_map_discard(ci); + + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; + + max = max_io_len(ci->sector, ti); + + if (ci->sector_count <= max) { + /* + * Optimise for the simple case where we can do all of + * the remaining io with a single clone. + */ + __clone_and_map_simple(ci, ti); + + } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { + /* + * There are some bvecs that don't span targets. + * Do as many of these as possible. + */ + int i; + sector_t remaining = max; + sector_t bv_len; + + for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { + bv_len = to_sector(bio->bi_io_vec[i].bv_len); + + if (bv_len > remaining) + break; + + remaining -= bv_len; + len += bv_len; + } + + tio = alloc_tio(ci, ti); + clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, + ci->md->bs); + __map_bio(ti, clone, tio); + + ci->sector += len; + ci->sector_count -= len; + ci->idx = i; + + } else { + /* + * Handle a bvec that must be split between two or more targets. + */ + struct bio_vec *bv = bio->bi_io_vec + ci->idx; + sector_t remaining = to_sector(bv->bv_len); + unsigned int offset = 0; + + do { + if (offset) { + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; + + max = max_io_len(ci->sector, ti); + } + + len = min(remaining, max); + + tio = alloc_tio(ci, ti); + clone = split_bvec(bio, ci->sector, ci->idx, + bv->bv_offset + offset, len, + ci->md->bs); + + __map_bio(ti, clone, tio); + + ci->sector += len; + ci->sector_count -= len; + offset += to_bytes(len); + } while (remaining -= len); + + ci->idx++; + } + + return 0; +} + +/* + * Split the bio into several clones and submit it to targets. 
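+ *
+ * Flush bios take a special path: ci.bio is pointed at md->flush_bio and
+ * __clone_and_map_empty_flush() sends one empty clone per target.  All
+ * other bios loop over __clone_and_map() until ci.sector_count is consumed.
+ * The extra reference set on io_count here is dropped by the final
+ * dec_pending().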
+ */ +static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) +{ + struct clone_info ci; + int error = 0; + + ci.map = dm_get_live_table(md); + if (unlikely(!ci.map)) { + bio_io_error(bio); + return; + } + + ci.md = md; + ci.io = alloc_io(md); + ci.io->error = 0; + atomic_set(&ci.io->io_count, 1); + ci.io->bio = bio; + ci.io->md = md; + spin_lock_init(&ci.io->endio_lock); + ci.sector = bio->bi_sector; + ci.idx = bio->bi_idx; + + start_io_acct(ci.io); + if (bio->bi_rw & REQ_FLUSH) { + ci.bio = &ci.md->flush_bio; + ci.sector_count = 0; + error = __clone_and_map_empty_flush(&ci); + /* dec_pending submits any data associated with flush */ + } else { + ci.bio = bio; + ci.sector_count = bio_sectors(bio); + while (ci.sector_count && !error) + error = __clone_and_map(&ci); + } + + /* drop the extra reference count */ + dec_pending(ci.io, error); + dm_table_put(ci.map); +} +/*----------------------------------------------------------------- + * CRUD END + *---------------------------------------------------------------*/ + +static int dm_merge_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_live_table(md); + struct dm_target *ti; + sector_t max_sectors; + int max_size = 0; + + if (unlikely(!map)) + goto out; + + ti = dm_table_find_target(map, bvm->bi_sector); + if (!dm_target_is_valid(ti)) + goto out_table; + + /* + * Find maximum amount of I/O that won't need splitting + */ + max_sectors = min(max_io_len(bvm->bi_sector, ti), + (sector_t) BIO_MAX_SECTORS); + max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; + if (max_size < 0) + max_size = 0; + + /* + * merge_bvec_fn() returns number of bytes + * it can accept at this offset + * max is precomputed maximal io size + */ + if (max_size && ti->type->merge) + max_size = ti->type->merge(ti, bvm, biovec, max_size); + /* + * If the target doesn't support merge method and some of the devices + * provided their merge_bvec method (we know this by looking at + * queue_max_hw_sectors), then we can't allow bios with multiple vector + * entries. So always set max_size to 0, and the code below allows + * just one page. + */ + else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) + + max_size = 0; + +out_table: + dm_table_put(map); + +out: + /* + * Always allow an entire first page + */ + if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) + max_size = biovec->bv_len; + + return max_size; +} + +/* + * The request function that just remaps the bio built up by + * dm_merge_bvec. 
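+ *
+ * Only bio-based devices reach this path; dm_request() below routes
+ * request-based devices (dm_request_based(), i.e. a stackable queue) to
+ * blk_queue_bio() instead.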
+ */ +static void _dm_request(struct request_queue *q, struct bio *bio) +{ + int rw = bio_data_dir(bio); + struct mapped_device *md = q->queuedata; + int cpu; + + down_read(&md->io_lock); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); + part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + + /* if we're suspended, we have to queue this io for later */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { + up_read(&md->io_lock); + + if (bio_rw(bio) != READA) + queue_io(md, bio); + else + bio_io_error(bio); + return; + } + + __split_and_process_bio(md, bio); + up_read(&md->io_lock); + return; +} + +static int dm_request_based(struct mapped_device *md) +{ + return blk_queue_stackable(md->queue); +} + +static void dm_request(struct request_queue *q, struct bio *bio) +{ + struct mapped_device *md = q->queuedata; + + if (dm_request_based(md)) + blk_queue_bio(q, bio); + else + _dm_request(q, bio); +} + +void dm_dispatch_request(struct request *rq) +{ + int r; + + if (blk_queue_io_stat(rq->q)) + rq->cmd_flags |= REQ_IO_STAT; + + rq->start_time = jiffies; + r = blk_insert_cloned_request(rq->q, rq); + if (r) + dm_complete_request(rq, r); +} +EXPORT_SYMBOL_GPL(dm_dispatch_request); + +static void dm_rq_bio_destructor(struct bio *bio) +{ + struct dm_rq_clone_bio_info *info = bio->bi_private; + struct mapped_device *md = info->tio->md; + + free_bio_info(info); + bio_free(bio, md->bs); +} + +static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, + void *data) +{ + struct dm_rq_target_io *tio = data; + struct mapped_device *md = tio->md; + struct dm_rq_clone_bio_info *info = alloc_bio_info(md); + + if (!info) + return -ENOMEM; + + info->orig = bio_orig; + info->tio = tio; + bio->bi_end_io = end_clone_bio; + bio->bi_private = info; + bio->bi_destructor = dm_rq_bio_destructor; + + return 0; +} + +static int setup_clone(struct request *clone, struct request *rq, + struct dm_rq_target_io *tio) +{ + int r; + + r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + dm_rq_bio_constructor, tio); + if (r) + return r; + + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; + clone->buffer = rq->buffer; + clone->end_io = end_clone_request; + clone->end_io_data = tio; + + return 0; +} + +static struct request *clone_rq(struct request *rq, struct mapped_device *md, + gfp_t gfp_mask) +{ + struct request *clone; + struct dm_rq_target_io *tio; + + tio = alloc_rq_tio(md, gfp_mask); + if (!tio) + return NULL; + + tio->md = md; + tio->ti = NULL; + tio->orig = rq; + tio->error = 0; + memset(&tio->info, 0, sizeof(tio->info)); + + clone = &tio->clone; + if (setup_clone(clone, rq, tio)) { + /* -ENOMEM */ + free_rq_tio(tio); + return NULL; + } + + return clone; +} + +/* + * Called with the queue lock held. 
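+ *
+ * The clone is allocated with GFP_ATOMIC; if that fails we return
+ * BLKPREP_DEFER so the block layer retries the prep later instead of
+ * failing the request outright.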
+ */ +static int dm_prep_fn(struct request_queue *q, struct request *rq) +{ + struct mapped_device *md = q->queuedata; + struct request *clone; + + if (unlikely(rq->special)) { + DMWARN("Already has something in rq->special."); + return BLKPREP_KILL; + } + + clone = clone_rq(rq, md, GFP_ATOMIC); + if (!clone) + return BLKPREP_DEFER; + + rq->special = clone; + rq->cmd_flags |= REQ_DONTPREP; + + return BLKPREP_OK; +} + +/* + * Returns: + * 0 : the request has been processed (not requeued) + * !0 : the request has been requeued + */ +static int map_request(struct dm_target *ti, struct request *clone, + struct mapped_device *md) +{ + int r, requeued = 0; + struct dm_rq_target_io *tio = clone->end_io_data; + + /* + * Hold the md reference here for the in-flight I/O. + * We can't rely on the reference count by device opener, + * because the device may be closed during the request completion + * when all bios are completed. + * See the comment in rq_completed() too. + */ + dm_get(md); + + tio->ti = ti; + r = ti->type->map_rq(ti, clone, &tio->info); + switch (r) { + case DM_MAPIO_SUBMITTED: + /* The target has taken the I/O to submit by itself later */ + break; + case DM_MAPIO_REMAPPED: + /* The target has remapped the I/O so dispatch it */ + trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), + blk_rq_pos(tio->orig)); + dm_dispatch_request(clone); + break; + case DM_MAPIO_REQUEUE: + /* The target wants to requeue the I/O */ + dm_requeue_unmapped_request(clone); + requeued = 1; + break; + default: + if (r > 0) { + DMWARN("unimplemented target map return value: %d", r); + BUG(); + } + + /* The target wants to complete the I/O */ + dm_kill_unmapped_request(clone, r); + break; + } + + return requeued; +} + +/* + * q->request_fn for request-based dm. + * Called with the queue lock held. + */ +static void dm_request_fn(struct request_queue *q) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_live_table(md); + struct dm_target *ti; + struct request *rq, *clone; + sector_t pos; + + /* + * For suspend, check blk_queue_stopped() and increment + * ->pending within a single queue_lock not to increment the + * number of in-flight I/Os after the queue is stopped in + * dm_suspend(). 
+ */ + while (!blk_queue_stopped(q)) { + rq = blk_peek_request(q); + if (!rq) + goto delay_and_out; + + /* always use block 0 to find the target for flushes for now */ + pos = 0; + if (!(rq->cmd_flags & REQ_FLUSH)) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); + BUG_ON(!dm_target_is_valid(ti)); + + if (ti->type->busy && ti->type->busy(ti)) + goto delay_and_out; + + blk_start_request(rq); + clone = rq->special; + atomic_inc(&md->pending[rq_data_dir(clone)]); + + spin_unlock(q->queue_lock); + if (map_request(ti, clone, md)) + goto requeued; + + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); + } + + goto out; + +requeued: + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); + +delay_and_out: + blk_delay_queue(q, HZ / 10); +out: + dm_table_put(map); + + return; +} + +int dm_underlying_device_busy(struct request_queue *q) +{ + return blk_lld_busy(q); +} +EXPORT_SYMBOL_GPL(dm_underlying_device_busy); + +static int dm_lld_busy(struct request_queue *q) +{ + int r; + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_live_table(md); + + if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) + r = 1; + else + r = dm_table_any_busy_target(map); + + dm_table_put(map); + + return r; +} + +static int dm_any_congested(void *congested_data, int bdi_bits) +{ + int r = bdi_bits; + struct mapped_device *md = congested_data; + struct dm_table *map; + + if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { + map = dm_get_live_table(md); + if (map) { + /* + * Request-based dm cares about only own queue for + * the query about congestion status of request_queue + */ + if (dm_request_based(md)) + r = md->queue->backing_dev_info.state & + bdi_bits; + else + r = dm_table_any_congested(map, bdi_bits); + + dm_table_put(map); + } + } + + return r; +} + +/*----------------------------------------------------------------- + * An IDR is used to keep track of allocated minor numbers. + *---------------------------------------------------------------*/ +static void free_minor(int minor) +{ + spin_lock(&_minor_lock); + idr_remove(&_minor_idr, minor); + spin_unlock(&_minor_lock); +} + +/* + * See if the device with a specific minor # is free. + */ +static int specific_minor(int minor) +{ + int r, m; + + if (minor >= (1 << MINORBITS)) + return -EINVAL; + + r = idr_pre_get(&_minor_idr, GFP_KERNEL); + if (!r) + return -ENOMEM; + + spin_lock(&_minor_lock); + + if (idr_find(&_minor_idr, minor)) { + r = -EBUSY; + goto out; + } + + r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); + if (r) + goto out; + + if (m != minor) { + idr_remove(&_minor_idr, m); + r = -EBUSY; + goto out; + } + +out: + spin_unlock(&_minor_lock); + return r; +} + +static int next_free_minor(int *minor) +{ + int r, m; + + r = idr_pre_get(&_minor_idr, GFP_KERNEL); + if (!r) + return -ENOMEM; + + spin_lock(&_minor_lock); + + r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); + if (r) + goto out; + + if (m >= (1 << MINORBITS)) { + idr_remove(&_minor_idr, m); + r = -ENOSPC; + goto out; + } + + *minor = m; + +out: + spin_unlock(&_minor_lock); + return r; +} + +static const struct block_device_operations dm_blk_dops; + +static void dm_wq_work(struct work_struct *work); + +static void dm_init_md_queue(struct mapped_device *md) +{ + /* + * Request-based dm devices cannot be stacked on top of bio-based dm + * devices. The type of this dm device has not been decided yet. + * The type is decided at the first table loading time. 
+ * To prevent problematic device stacking, clear the queue flag + * for request stacking support until then. + * + * This queue is new, so no concurrency on the queue_flags. + */ + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); + + md->queue->queuedata = md; + md->queue->backing_dev_info.congested_fn = dm_any_congested; + md->queue->backing_dev_info.congested_data = md; + blk_queue_make_request(md->queue, dm_request); + blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); + blk_queue_merge_bvec(md->queue, dm_merge_bvec); +} + +/* + * Allocate and initialise a blank device with a given minor. + */ +static struct mapped_device *alloc_dev(int minor) +{ + int r; + struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); + void *old_md; + + if (!md) { + DMWARN("unable to allocate device, out of memory."); + return NULL; + } + + if (!try_module_get(THIS_MODULE)) + goto bad_module_get; + + /* get a minor number for the dev */ + if (minor == DM_ANY_MINOR) + r = next_free_minor(&minor); + else + r = specific_minor(minor); + if (r < 0) + goto bad_minor; + + md->type = DM_TYPE_NONE; + init_rwsem(&md->io_lock); + mutex_init(&md->suspend_lock); + mutex_init(&md->type_lock); + spin_lock_init(&md->deferred_lock); + rwlock_init(&md->map_lock); + atomic_set(&md->holders, 1); + atomic_set(&md->open_count, 0); + atomic_set(&md->event_nr, 0); + atomic_set(&md->uevent_seq, 0); + INIT_LIST_HEAD(&md->uevent_list); + spin_lock_init(&md->uevent_lock); + + md->queue = blk_alloc_queue(GFP_KERNEL); + if (!md->queue) + goto bad_queue; + + dm_init_md_queue(md); + + md->disk = alloc_disk(1); + if (!md->disk) + goto bad_disk; + + atomic_set(&md->pending[0], 0); + atomic_set(&md->pending[1], 0); + init_waitqueue_head(&md->wait); + INIT_WORK(&md->work, dm_wq_work); + init_waitqueue_head(&md->eventq); + + md->disk->major = _major; + md->disk->first_minor = minor; + md->disk->fops = &dm_blk_dops; + md->disk->queue = md->queue; + md->disk->private_data = md; + sprintf(md->disk->disk_name, "dm-%d", minor); + add_disk(md->disk); + format_dev_t(md->name, MKDEV(_major, minor)); + + md->wq = alloc_workqueue("kdmflush", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); + if (!md->wq) + goto bad_thread; + + md->bdev = bdget_disk(md->disk, 0); + if (!md->bdev) + goto bad_bdev; + + bio_init(&md->flush_bio); + md->flush_bio.bi_bdev = md->bdev; + md->flush_bio.bi_rw = WRITE_FLUSH; + + /* Populate the mapping, nobody knows we exist yet */ + spin_lock(&_minor_lock); + old_md = idr_replace(&_minor_idr, md, minor); + spin_unlock(&_minor_lock); + + BUG_ON(old_md != MINOR_ALLOCED); + + return md; + +bad_bdev: + destroy_workqueue(md->wq); +bad_thread: + del_gendisk(md->disk); + put_disk(md->disk); +bad_disk: + blk_cleanup_queue(md->queue); +bad_queue: + free_minor(minor); +bad_minor: + module_put(THIS_MODULE); +bad_module_get: + kfree(md); + return NULL; +} + +static void unlock_fs(struct mapped_device *md); + +static void free_dev(struct mapped_device *md) +{ + int minor = MINOR(disk_devt(md->disk)); + + unlock_fs(md); + bdput(md->bdev); + destroy_workqueue(md->wq); + if (md->tio_pool) + mempool_destroy(md->tio_pool); + if (md->io_pool) + mempool_destroy(md->io_pool); + if (md->bs) + bioset_free(md->bs); + blk_integrity_unregister(md->disk); + del_gendisk(md->disk); + free_minor(minor); + + spin_lock(&_minor_lock); + md->disk->private_data = NULL; + spin_unlock(&_minor_lock); + + put_disk(md->disk); + blk_cleanup_queue(md->queue); + module_put(THIS_MODULE); + kfree(md); +} + +static void __bind_mempools(struct mapped_device *md, struct 
dm_table *t) +{ + struct dm_md_mempools *p; + + if (md->io_pool && md->tio_pool && md->bs) + /* the md already has necessary mempools */ + goto out; + + p = dm_table_get_md_mempools(t); + BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); + + md->io_pool = p->io_pool; + p->io_pool = NULL; + md->tio_pool = p->tio_pool; + p->tio_pool = NULL; + md->bs = p->bs; + p->bs = NULL; + +out: + /* mempool bind completed, now no need any mempools in the table */ + dm_table_free_md_mempools(t); +} + +/* + * Bind a table to the device. + */ +static void event_callback(void *context) +{ + unsigned long flags; + LIST_HEAD(uevents); + struct mapped_device *md = (struct mapped_device *) context; + + spin_lock_irqsave(&md->uevent_lock, flags); + list_splice_init(&md->uevent_list, &uevents); + spin_unlock_irqrestore(&md->uevent_lock, flags); + + dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); + + atomic_inc(&md->event_nr); + wake_up(&md->eventq); +} + +/* + * Protected by md->suspend_lock obtained by dm_swap_table(). + */ +static void __set_size(struct mapped_device *md, sector_t size) +{ + set_capacity(md->disk, size); + + i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); +} + +/* + * Return 1 if the queue has a compulsory merge_bvec_fn function. + * + * If this function returns 0, then the device is either a non-dm + * device without a merge_bvec_fn, or it is a dm device that is + * able to split any bios it receives that are too big. + */ +int dm_queue_merge_is_compulsory(struct request_queue *q) +{ + struct mapped_device *dev_md; + + if (!q->merge_bvec_fn) + return 0; + + if (q->make_request_fn == dm_request) { + dev_md = q->queuedata; + if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) + return 0; + } + + return 1; +} + +static int dm_device_merge_is_compulsory(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + + return dm_queue_merge_is_compulsory(q); +} + +/* + * Return 1 if it is acceptable to ignore merge_bvec_fn based + * on the properties of the underlying devices. + */ +static int dm_table_merge_is_optional(struct dm_table *table) +{ + unsigned i = 0; + struct dm_target *ti; + + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) + return 0; + } + + return 1; +} + +/* + * Returns old map, which caller must destroy. + */ +static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, + struct queue_limits *limits) +{ + struct dm_table *old_map; + struct request_queue *q = md->queue; + sector_t size; + unsigned long flags; + int merge_is_optional; + + size = dm_table_get_size(t); + + /* + * Wipe any geometry if the size of the table changed. + */ + if (size != get_capacity(md->disk)) + memset(&md->geometry, 0, sizeof(md->geometry)); + + __set_size(md, size); + + dm_table_event_callback(t, event_callback, md); + + /* + * The queue hasn't been stopped yet, if the old table type wasn't + * for request-based during suspension. So stop it to prevent + * I/O mapping before resume. + * This must be done before setting the queue restrictions, + * because request-based dm may be run just after the setting. 
+ */ + if (dm_table_request_based(t) && !blk_queue_stopped(q)) + stop_queue(q); + + __bind_mempools(md, t); + + merge_is_optional = dm_table_merge_is_optional(t); + + write_lock_irqsave(&md->map_lock, flags); + old_map = md->map; + md->map = t; + md->immutable_target_type = dm_table_get_immutable_target_type(t); + + dm_table_set_restrictions(t, q, limits); + if (merge_is_optional) + set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); + else + clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); + write_unlock_irqrestore(&md->map_lock, flags); + + return old_map; +} + +/* + * Returns unbound table for the caller to free. + */ +static struct dm_table *__unbind(struct mapped_device *md) +{ + struct dm_table *map = md->map; + unsigned long flags; + + if (!map) + return NULL; + + dm_table_event_callback(map, NULL, NULL); + write_lock_irqsave(&md->map_lock, flags); + md->map = NULL; + write_unlock_irqrestore(&md->map_lock, flags); + + return map; +} + +/* + * Constructor for a new device. + */ +int dm_create(int minor, struct mapped_device **result) +{ + struct mapped_device *md; + + md = alloc_dev(minor); + if (!md) + return -ENXIO; + + dm_sysfs_init(md); + + *result = md; + return 0; +} + +/* + * Functions to manage md->type. + * All are required to hold md->type_lock. + */ +void dm_lock_md_type(struct mapped_device *md) +{ + mutex_lock(&md->type_lock); +} + +void dm_unlock_md_type(struct mapped_device *md) +{ + mutex_unlock(&md->type_lock); +} + +void dm_set_md_type(struct mapped_device *md, unsigned type) +{ + md->type = type; +} + +unsigned dm_get_md_type(struct mapped_device *md) +{ + return md->type; +} + +struct target_type *dm_get_immutable_target_type(struct mapped_device *md) +{ + return md->immutable_target_type; +} + +/* + * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
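+ *
+ * This upgrades the bio-based queue allocated in alloc_dev() in place via
+ * blk_init_allocated_queue().  Note the return convention: 1 on success
+ * (or if the queue was already initialized), 0 on failure;
+ * dm_setup_md_queue() turns the failure into -EINVAL.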
+ */ +static int dm_init_request_based_queue(struct mapped_device *md) +{ + struct request_queue *q = NULL; + + if (md->queue->elevator) + return 1; + + /* Fully initialize the queue */ + q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); + if (!q) + return 0; + + md->queue = q; + dm_init_md_queue(md); + blk_queue_softirq_done(md->queue, dm_softirq_done); + blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_lld_busy(md->queue, dm_lld_busy); + + elv_register_queue(md->queue); + + return 1; +} + +/* + * Setup the DM device's queue based on md's type + */ +int dm_setup_md_queue(struct mapped_device *md) +{ + if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && + !dm_init_request_based_queue(md)) { + DMWARN("Cannot initialize queue for request-based mapped device"); + return -EINVAL; + } + + return 0; +} + +static struct mapped_device *dm_find_md(dev_t dev) +{ + struct mapped_device *md; + unsigned minor = MINOR(dev); + + if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) + return NULL; + + spin_lock(&_minor_lock); + + md = idr_find(&_minor_idr, minor); + if (md && (md == MINOR_ALLOCED || + (MINOR(disk_devt(dm_disk(md))) != minor) || + dm_deleting_md(md) || + test_bit(DMF_FREEING, &md->flags))) { + md = NULL; + goto out; + } + +out: + spin_unlock(&_minor_lock); + + return md; +} + +struct mapped_device *dm_get_md(dev_t dev) +{ + struct mapped_device *md = dm_find_md(dev); + + if (md) + dm_get(md); + + return md; +} +EXPORT_SYMBOL_GPL(dm_get_md); + +void *dm_get_mdptr(struct mapped_device *md) +{ + return md->interface_ptr; +} + +void dm_set_mdptr(struct mapped_device *md, void *ptr) +{ + md->interface_ptr = ptr; +} + +void dm_get(struct mapped_device *md) +{ + atomic_inc(&md->holders); + BUG_ON(test_bit(DMF_FREEING, &md->flags)); +} + +const char *dm_device_name(struct mapped_device *md) +{ + return md->name; +} +EXPORT_SYMBOL_GPL(dm_device_name); + +static void __dm_destroy(struct mapped_device *md, bool wait) +{ + struct dm_table *map; + + might_sleep(); + + spin_lock(&_minor_lock); + map = dm_get_live_table(md); + idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); + set_bit(DMF_FREEING, &md->flags); + spin_unlock(&_minor_lock); + + if (!dm_suspended_md(md)) { + dm_table_presuspend_targets(map); + dm_table_postsuspend_targets(map); + } + + /* + * Rare, but there may be I/O requests still going to complete, + * for example. Wait for all references to disappear. + * No one should increment the reference count of the mapped_device, + * after the mapped_device state becomes DMF_FREEING. + */ + if (wait) + while (atomic_read(&md->holders)) + msleep(1); + else if (atomic_read(&md->holders)) + DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", + dm_device_name(md), atomic_read(&md->holders)); + + dm_sysfs_exit(md); + dm_table_put(map); + dm_table_destroy(__unbind(md)); + free_dev(md); +} + +void dm_destroy(struct mapped_device *md) +{ + __dm_destroy(md, true); +} + +void dm_destroy_immediate(struct mapped_device *md) +{ + __dm_destroy(md, false); +} + +void dm_put(struct mapped_device *md) +{ + atomic_dec(&md->holders); +} +EXPORT_SYMBOL_GPL(dm_put); + +static int dm_wait_for_completion(struct mapped_device *md, int interruptible) +{ + int r = 0; + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&md->wait, &wait); + + while (1) { + set_current_state(interruptible); + + if (!md_in_flight(md)) + break; + + if (interruptible == TASK_INTERRUPTIBLE && + signal_pending(current)) { + r = -EINTR; + break; + } + + io_schedule(); + } + set_current_state(TASK_RUNNING); + + remove_wait_queue(&md->wait, &wait); + + return r; +} + +/* + * Process the deferred bios + */ +static void dm_wq_work(struct work_struct *work) +{ + struct mapped_device *md = container_of(work, struct mapped_device, + work); + struct bio *c; + + down_read(&md->io_lock); + + while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { + spin_lock_irq(&md->deferred_lock); + c = bio_list_pop(&md->deferred); + spin_unlock_irq(&md->deferred_lock); + + if (!c) + break; + + up_read(&md->io_lock); + + if (dm_request_based(md)) + generic_make_request(c); + else + __split_and_process_bio(md, c); + + down_read(&md->io_lock); + } + + up_read(&md->io_lock); +} + +static void dm_queue_flush(struct mapped_device *md) +{ + clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + smp_mb__after_clear_bit(); + queue_work(md->wq, &md->work); +} + +/* + * Swap in a new table, returning the old one for the caller to destroy. + */ +struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) +{ + struct dm_table *map = ERR_PTR(-EINVAL); + struct queue_limits limits; + int r; + + mutex_lock(&md->suspend_lock); + + /* device must be suspended */ + if (!dm_suspended_md(md)) + goto out; + + r = dm_calculate_queue_limits(table, &limits); + if (r) { + map = ERR_PTR(r); + goto out; + } + + map = __bind(md, table, &limits); + +out: + mutex_unlock(&md->suspend_lock); + return map; +} + +/* + * Functions to lock and unlock any filesystem running on the + * device. + */ +static int lock_fs(struct mapped_device *md) +{ + int r; + + WARN_ON(md->frozen_sb); + + md->frozen_sb = freeze_bdev(md->bdev); + if (IS_ERR(md->frozen_sb)) { + r = PTR_ERR(md->frozen_sb); + md->frozen_sb = NULL; + return r; + } + + set_bit(DMF_FROZEN, &md->flags); + + return 0; +} + +static void unlock_fs(struct mapped_device *md) +{ + if (!test_bit(DMF_FROZEN, &md->flags)) + return; + + thaw_bdev(md->bdev, md->frozen_sb); + md->frozen_sb = NULL; + clear_bit(DMF_FROZEN, &md->flags); +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. 
+ */ +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +{ + struct dm_table *map = NULL; + int r = 0; + int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; + int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; + + mutex_lock(&md->suspend_lock); + + if (dm_suspended_md(md)) { + r = -EINVAL; + goto out_unlock; + } + + map = dm_get_live_table(md); + + /* + * DMF_NOFLUSH_SUSPENDING must be set before presuspend. + * This flag is cleared before dm_suspend returns. + */ + if (noflush) + set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + + /* This does not get reverted if there's an error later. */ + dm_table_presuspend_targets(map); + + /* + * Flush I/O to the device. + * Any I/O submitted after lock_fs() may not be flushed. + * noflush takes precedence over do_lockfs. + * (lock_fs() flushes I/Os and waits for them to complete.) + */ + if (!noflush && do_lockfs) { + r = lock_fs(md); + if (r) + goto out; + } + + /* + * Here we must make sure that no processes are submitting requests + * to target drivers i.e. no one may be executing + * __split_and_process_bio. This is called from dm_request and + * dm_wq_work. + * + * To get all processes out of __split_and_process_bio in dm_request, + * we take the write lock. To prevent any process from reentering + * __split_and_process_bio from dm_request and quiesce the thread + * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call + * flush_workqueue(md->wq). + */ + down_write(&md->io_lock); + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + up_write(&md->io_lock); + + /* + * Stop md->queue before flushing md->wq in case request-based + * dm defers requests to md->wq from md->queue. + */ + if (dm_request_based(md)) + stop_queue(md->queue); + + flush_workqueue(md->wq); + + /* + * At this point no more requests are entering target request routines. + * We call dm_wait_for_completion to wait for all existing requests + * to finish. + */ + r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); + + down_write(&md->io_lock); + if (noflush) + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + up_write(&md->io_lock); + + /* were we interrupted ? */ + if (r < 0) { + dm_queue_flush(md); + + if (dm_request_based(md)) + start_queue(md->queue); + + unlock_fs(md); + goto out; /* pushback list is already flushed, so skip flush */ + } + + /* + * If dm_wait_for_completion returned 0, the device is completely + * quiescent now. There is no request-processing activity. All new + * requests are being added to md->deferred list. + */ + + set_bit(DMF_SUSPENDED, &md->flags); + + dm_table_postsuspend_targets(map); + +out: + dm_table_put(map); + +out_unlock: + mutex_unlock(&md->suspend_lock); + return r; +} + +int dm_resume(struct mapped_device *md) +{ + int r = -EINVAL; + struct dm_table *map = NULL; + + mutex_lock(&md->suspend_lock); + if (!dm_suspended_md(md)) + goto out; + + map = dm_get_live_table(md); + if (!map || !dm_table_get_size(map)) + goto out; + + r = dm_table_resume_targets(map); + if (r) + goto out; + + dm_queue_flush(md); + + /* + * Flushing deferred I/Os must be done after targets are resumed + * so that mapping of targets can work correctly. + * Request-based dm is queueing the deferred I/Os in its request_queue. + */ + if (dm_request_based(md)) + start_queue(md->queue); + + unlock_fs(md); + + clear_bit(DMF_SUSPENDED, &md->flags); + + r = 0; +out: + dm_table_put(map); + mutex_unlock(&md->suspend_lock); + + return r; +} + +/*----------------------------------------------------------------- + * Event notification. 
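+ *
+ * dm_kobject_uevent() below can tag a change event with a cookie so that
+ * userspace (udev rules driven by libdevmapper, for example) can match
+ * the uevent to the ioctl that caused it.  With a non-zero cookie the
+ * uevent environment gains a single extra variable, e.g. (assuming
+ * DM_COOKIE_ENV_VAR_NAME expands to "DM_COOKIE"):
+ *
+ *	DM_COOKIE=4290674688
+ *
+ * A cookie of 0 falls back to a plain kobject_uevent() with no extra
+ * environment.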
+ *---------------------------------------------------------------*/ +int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie) +{ + char udev_cookie[DM_COOKIE_LENGTH]; + char *envp[] = { udev_cookie, NULL }; + + if (!cookie) + return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + else { + snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", + DM_COOKIE_ENV_VAR_NAME, cookie); + return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, + action, envp); + } +} + +uint32_t dm_next_uevent_seq(struct mapped_device *md) +{ + return atomic_add_return(1, &md->uevent_seq); +} + +uint32_t dm_get_event_nr(struct mapped_device *md) +{ + return atomic_read(&md->event_nr); +} + +int dm_wait_event(struct mapped_device *md, int event_nr) +{ + return wait_event_interruptible(md->eventq, + (event_nr != atomic_read(&md->event_nr))); +} + +void dm_uevent_add(struct mapped_device *md, struct list_head *elist) +{ + unsigned long flags; + + spin_lock_irqsave(&md->uevent_lock, flags); + list_add(elist, &md->uevent_list); + spin_unlock_irqrestore(&md->uevent_lock, flags); +} + +/* + * The gendisk is only valid as long as you have a reference + * count on 'md'. + */ +struct gendisk *dm_disk(struct mapped_device *md) +{ + return md->disk; +} + +struct kobject *dm_kobject(struct mapped_device *md) +{ + return &md->kobj; +} + +/* + * struct mapped_device should not be exported outside of dm.c + * so use this check to verify that kobj is part of md structure + */ +struct mapped_device *dm_get_from_kobject(struct kobject *kobj) +{ + struct mapped_device *md; + + md = container_of(kobj, struct mapped_device, kobj); + if (&md->kobj != kobj) + return NULL; + + if (test_bit(DMF_FREEING, &md->flags) || + dm_deleting_md(md)) + return NULL; + + dm_get(md); + return md; +} + +int dm_suspended_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED, &md->flags); +} + +int dm_suspended(struct dm_target *ti) +{ + return dm_suspended_md(dm_table_get_md(ti->table)); +} +EXPORT_SYMBOL_GPL(dm_suspended); + +int dm_noflush_suspending(struct dm_target *ti) +{ + return __noflush_suspending(dm_table_get_md(ti->table)); +} +EXPORT_SYMBOL_GPL(dm_noflush_suspending); + +struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) +{ + struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); + unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; + + if (!pools) + return NULL; + + pools->io_pool = (type == DM_TYPE_BIO_BASED) ? + mempool_create_slab_pool(MIN_IOS, _io_cache) : + mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); + if (!pools->io_pool) + goto free_pools_and_out; + + pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? 
+ mempool_create_slab_pool(MIN_IOS, _tio_cache) : + mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); + if (!pools->tio_pool) + goto free_io_pool_and_out; + + pools->bs = bioset_create(pool_size, 0); + if (!pools->bs) + goto free_tio_pool_and_out; + + if (integrity && bioset_integrity_create(pools->bs, pool_size)) + goto free_bioset_and_out; + + return pools; + +free_bioset_and_out: + bioset_free(pools->bs); + +free_tio_pool_and_out: + mempool_destroy(pools->tio_pool); + +free_io_pool_and_out: + mempool_destroy(pools->io_pool); + +free_pools_and_out: + kfree(pools); + + return NULL; +} + +void dm_free_md_mempools(struct dm_md_mempools *pools) +{ + if (!pools) + return; + + if (pools->io_pool) + mempool_destroy(pools->io_pool); + + if (pools->tio_pool) + mempool_destroy(pools->tio_pool); + + if (pools->bs) + bioset_free(pools->bs); + + kfree(pools); +} + +static const struct block_device_operations dm_blk_dops = { + .open = dm_blk_open, + .release = dm_blk_close, + .ioctl = dm_blk_ioctl, + .getgeo = dm_blk_getgeo, + .owner = THIS_MODULE +}; + +EXPORT_SYMBOL(dm_get_mapinfo); + +/* + * module hooks + */ +module_init(dm_init); +module_exit(dm_exit); + +module_param(major, uint, 0); +MODULE_PARM_DESC(major, "The major number of the device mapper"); +MODULE_DESCRIPTION(DM_NAME " driver"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h new file mode 100644 index 00000000..b7dacd59 --- /dev/null +++ b/drivers/md/dm.h @@ -0,0 +1,159 @@ +/* + * Internal header file for device mapper + * + * Copyright (C) 2001, 2002 Sistina Software + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#ifndef DM_INTERNAL_H +#define DM_INTERNAL_H + +#include <linux/fs.h> +#include <linux/device-mapper.h> +#include <linux/list.h> +#include <linux/blkdev.h> +#include <linux/hdreg.h> + +/* + * Suspend feature flags + */ +#define DM_SUSPEND_LOCKFS_FLAG (1 << 0) +#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) + +/* + * Type of table and mapped_device's mempool + */ +#define DM_TYPE_NONE 0 +#define DM_TYPE_BIO_BASED 1 +#define DM_TYPE_REQUEST_BASED 2 + +/* + * List of devices that a metadevice uses and should open/close. + */ +struct dm_dev_internal { + struct list_head list; + atomic_t count; + struct dm_dev dm_dev; +}; + +struct dm_table; +struct dm_md_mempools; + +/*----------------------------------------------------------------- + * Internal table functions. 
+ *---------------------------------------------------------------*/ +void dm_table_destroy(struct dm_table *t); +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context); +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits); +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits); +struct list_head *dm_table_get_devices(struct dm_table *t); +void dm_table_presuspend_targets(struct dm_table *t); +void dm_table_postsuspend_targets(struct dm_table *t); +int dm_table_resume_targets(struct dm_table *t); +int dm_table_any_congested(struct dm_table *t, int bdi_bits); +int dm_table_any_busy_target(struct dm_table *t); +unsigned dm_table_get_type(struct dm_table *t); +struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); +bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_discards(struct dm_table *t); +int dm_table_alloc_md_mempools(struct dm_table *t); +void dm_table_free_md_mempools(struct dm_table *t); +struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); + +int dm_queue_merge_is_compulsory(struct request_queue *q); + +void dm_lock_md_type(struct mapped_device *md); +void dm_unlock_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, unsigned type); +unsigned dm_get_md_type(struct mapped_device *md); +struct target_type *dm_get_immutable_target_type(struct mapped_device *md); + +int dm_setup_md_queue(struct mapped_device *md); + +/* + * To check the return value from dm_table_find_target(). + */ +#define dm_target_is_valid(t) ((t)->table) + +/* + * To check whether the target type is request-based or not (bio-based). + */ +#define dm_target_request_based(t) ((t)->type->map_rq != NULL) + +/*----------------------------------------------------------------- + * A registry of target types. + *---------------------------------------------------------------*/ +int dm_target_init(void); +void dm_target_exit(void); +struct target_type *dm_get_target_type(const char *name); +void dm_put_target_type(struct target_type *tt); +int dm_target_iterate(void (*iter_func)(struct target_type *tt, + void *param), void *param); + +int dm_split_args(int *argc, char ***argvp, char *input); + +/* + * Is this mapped_device being deleted? + */ +int dm_deleting_md(struct mapped_device *md); + +/* + * Is this mapped_device suspended? + */ +int dm_suspended_md(struct mapped_device *md); + +/* + * The device-mapper can be driven through one of two interfaces; + * ioctl or filesystem, depending which patch you have applied. 
+ */ +int dm_interface_init(void); +void dm_interface_exit(void); + +/* + * sysfs interface + */ +int dm_sysfs_init(struct mapped_device *md); +void dm_sysfs_exit(struct mapped_device *md); +struct kobject *dm_kobject(struct mapped_device *md); +struct mapped_device *dm_get_from_kobject(struct kobject *kobj); + +/* + * Targets for linear and striped mappings + */ +int dm_linear_init(void); +void dm_linear_exit(void); + +int dm_stripe_init(void); +void dm_stripe_exit(void); + +/* + * mapped_device operations + */ +void dm_destroy(struct mapped_device *md); +void dm_destroy_immediate(struct mapped_device *md); +int dm_open_count(struct mapped_device *md); +int dm_lock_for_deletion(struct mapped_device *md); + +int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie); + +int dm_io_init(void); +void dm_io_exit(void); + +int dm_kcopyd_init(void); +void dm_kcopyd_exit(void); + +/* + * Mempool operations + */ +struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity); +void dm_free_md_mempools(struct dm_md_mempools *pools); + +#endif diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c new file mode 100644 index 00000000..45135f69 --- /dev/null +++ b/drivers/md/faulty.c @@ -0,0 +1,367 @@ +/* + * faulty.c : Multiple Devices driver for Linux + * + * Copyright (C) 2004 Neil Brown + * + * fautly-device-simulator personality for md + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +/* + * The "faulty" personality causes some requests to fail. + * + * Possible failure modes are: + * reads fail "randomly" but succeed on retry + * writes fail "randomly" but succeed on retry + * reads for some address fail and then persist until a write + * reads for some address fail and then persist irrespective of write + * writes for some address fail and persist + * all writes fail + * + * Different modes can be active at a time, but only + * one can be set at array creation. Others can be added later. + * A mode can be one-shot or recurrent with the recurrence being + * once in every N requests. + * The bottom 5 bits of the "layout" indicate the mode. The + * remainder indicate a period, or 0 for one-shot. + * + * There is an implementation limit on the number of concurrently + * persisting-faulty blocks. When a new fault is requested that would + * exceed the limit, it is ignored. + * All current faults can be clear using a layout of "0". + * + * Requests are always sent to the device. If they are to fail, + * we clone the bio and insert a new b_end_io into the chain. 
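+ *
+ * Worked example of the layout encoding: to get transient read failures
+ * once in every 10 requests, pick mode ReadTransient (1) and period 10,
+ * giving
+ *
+ *	layout = (10 << ModeShift) | ReadTransient = (10 << 5) | 1 = 321
+ *
+ * reshape() below decodes this again with "layout & ModeMask" and
+ * "layout >> ModeShift".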
+ */ + +#define WriteTransient 0 +#define ReadTransient 1 +#define WritePersistent 2 +#define ReadPersistent 3 +#define WriteAll 4 /* doesn't go to device */ +#define ReadFixable 5 +#define Modes 6 + +#define ClearErrors 31 +#define ClearFaults 30 + +#define AllPersist 100 /* internal use only */ +#define NoPersist 101 + +#define ModeMask 0x1f +#define ModeShift 5 + +#define MaxFault 50 +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/raid/md_u.h> +#include <linux/slab.h> +#include "md.h" +#include <linux/seq_file.h> + + +static void faulty_fail(struct bio *bio, int error) +{ + struct bio *b = bio->bi_private; + + b->bi_size = bio->bi_size; + b->bi_sector = bio->bi_sector; + + bio_put(bio); + + bio_io_error(b); +} + +struct faulty_conf { + int period[Modes]; + atomic_t counters[Modes]; + sector_t faults[MaxFault]; + int modes[MaxFault]; + int nfaults; + struct md_rdev *rdev; +}; + +static int check_mode(struct faulty_conf *conf, int mode) +{ + if (conf->period[mode] == 0 && + atomic_read(&conf->counters[mode]) <= 0) + return 0; /* no failure, no decrement */ + + + if (atomic_dec_and_test(&conf->counters[mode])) { + if (conf->period[mode]) + atomic_set(&conf->counters[mode], conf->period[mode]); + return 1; + } + return 0; +} + +static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) +{ + /* If we find a ReadFixable sector, we fix it ... */ + int i; + for (i=0; i<conf->nfaults; i++) + if (conf->faults[i] >= start && + conf->faults[i] < end) { + /* found it ... */ + switch (conf->modes[i] * 2 + dir) { + case WritePersistent*2+WRITE: return 1; + case ReadPersistent*2+READ: return 1; + case ReadFixable*2+READ: return 1; + case ReadFixable*2+WRITE: + conf->modes[i] = NoPersist; + return 0; + case AllPersist*2+READ: + case AllPersist*2+WRITE: return 1; + default: + return 0; + } + } + return 0; +} + +static void add_sector(struct faulty_conf *conf, sector_t start, int mode) +{ + int i; + int n = conf->nfaults; + for (i=0; i<conf->nfaults; i++) + if (conf->faults[i] == start) { + switch(mode) { + case NoPersist: conf->modes[i] = mode; return; + case WritePersistent: + if (conf->modes[i] == ReadPersistent || + conf->modes[i] == ReadFixable) + conf->modes[i] = AllPersist; + else + conf->modes[i] = WritePersistent; + return; + case ReadPersistent: + if (conf->modes[i] == WritePersistent) + conf->modes[i] = AllPersist; + else + conf->modes[i] = ReadPersistent; + return; + case ReadFixable: + if (conf->modes[i] == WritePersistent || + conf->modes[i] == ReadPersistent) + conf->modes[i] = AllPersist; + else + conf->modes[i] = ReadFixable; + return; + } + } else if (conf->modes[i] == NoPersist) + n = i; + + if (n >= MaxFault) + return; + conf->faults[n] = start; + conf->modes[n] = mode; + if (conf->nfaults == n) + conf->nfaults = n+1; +} + +static void make_request(struct mddev *mddev, struct bio *bio) +{ + struct faulty_conf *conf = mddev->private; + int failit = 0; + + if (bio_data_dir(bio) == WRITE) { + /* write request */ + if (atomic_read(&conf->counters[WriteAll])) { + /* special case - don't decrement, don't generic_make_request, + * just fail immediately + */ + bio_endio(bio, -EIO); + return; + } + + if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), + WRITE)) + failit = 1; + if (check_mode(conf, WritePersistent)) { + add_sector(conf, bio->bi_sector, WritePersistent); + failit = 1; + } + if (check_mode(conf, WriteTransient)) + failit = 1; + } else { + /* read request */ + if (check_sector(conf, bio->bi_sector, 
bio->bi_sector + (bio->bi_size>>9), + READ)) + failit = 1; + if (check_mode(conf, ReadTransient)) + failit = 1; + if (check_mode(conf, ReadPersistent)) { + add_sector(conf, bio->bi_sector, ReadPersistent); + failit = 1; + } + if (check_mode(conf, ReadFixable)) { + add_sector(conf, bio->bi_sector, ReadFixable); + failit = 1; + } + } + if (failit) { + struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); + + b->bi_bdev = conf->rdev->bdev; + b->bi_private = bio; + b->bi_end_io = faulty_fail; + bio = b; + } else + bio->bi_bdev = conf->rdev->bdev; + + generic_make_request(bio); +} + +static void status(struct seq_file *seq, struct mddev *mddev) +{ + struct faulty_conf *conf = mddev->private; + int n; + + if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) + seq_printf(seq, " WriteTransient=%d(%d)", + n, conf->period[WriteTransient]); + + if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) + seq_printf(seq, " ReadTransient=%d(%d)", + n, conf->period[ReadTransient]); + + if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) + seq_printf(seq, " WritePersistent=%d(%d)", + n, conf->period[WritePersistent]); + + if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) + seq_printf(seq, " ReadPersistent=%d(%d)", + n, conf->period[ReadPersistent]); + + + if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) + seq_printf(seq, " ReadFixable=%d(%d)", + n, conf->period[ReadFixable]); + + if ((n=atomic_read(&conf->counters[WriteAll])) != 0) + seq_printf(seq, " WriteAll"); + + seq_printf(seq, " nfaults=%d", conf->nfaults); +} + + +static int reshape(struct mddev *mddev) +{ + int mode = mddev->new_layout & ModeMask; + int count = mddev->new_layout >> ModeShift; + struct faulty_conf *conf = mddev->private; + + if (mddev->new_layout < 0) + return 0; + + /* new layout */ + if (mode == ClearFaults) + conf->nfaults = 0; + else if (mode == ClearErrors) { + int i; + for (i=0 ; i < Modes ; i++) { + conf->period[i] = 0; + atomic_set(&conf->counters[i], 0); + } + } else if (mode < Modes) { + conf->period[mode] = count; + if (!count) count++; + atomic_set(&conf->counters[mode], count); + } else + return -EINVAL; + mddev->new_layout = -1; + mddev->layout = -1; /* makes sure further changes come through */ + return 0; +} + +static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(raid_disks, + "%s does not support generic reshape\n", __func__); + + if (sectors == 0) + return mddev->dev_sectors; + + return sectors; +} + +static int run(struct mddev *mddev) +{ + struct md_rdev *rdev; + int i; + struct faulty_conf *conf; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) + return -ENOMEM; + + for (i=0; i<Modes; i++) { + atomic_set(&conf->counters[i], 0); + conf->period[i] = 0; + } + conf->nfaults = 0; + + rdev_for_each(rdev, mddev) + conf->rdev = rdev; + + md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); + mddev->private = conf; + + reshape(mddev); + + return 0; +} + +static int stop(struct mddev *mddev) +{ + struct faulty_conf *conf = mddev->private; + + kfree(conf); + mddev->private = NULL; + return 0; +} + +static struct md_personality faulty_personality = +{ + .name = "faulty", + .level = LEVEL_FAULTY, + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .check_reshape = reshape, + .size = faulty_size, +}; + +static int __init raid_init(void) +{ + return register_md_personality(&faulty_personality); +} + +static void raid_exit(void) +{ 
+ unregister_md_personality(&faulty_personality); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fault injection personality for MD"); +MODULE_ALIAS("md-personality-10"); /* faulty */ +MODULE_ALIAS("md-faulty"); +MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/linear.c b/drivers/md/linear.c new file mode 100644 index 00000000..fa211d80 --- /dev/null +++ b/drivers/md/linear.c @@ -0,0 +1,369 @@ +/* + linear.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + <zyngier@ufr-info-p7.ibp.fr> or + <maz@gloups.fdn.fr> + + Linear mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include <linux/blkdev.h> +#include <linux/raid/md_u.h> +#include <linux/seq_file.h> +#include <linux/module.h> +#include <linux/slab.h> +#include "md.h" +#include "linear.h" + +/* + * find which device holds a particular offset + */ +static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) +{ + int lo, mid, hi; + struct linear_conf *conf; + + lo = 0; + hi = mddev->raid_disks - 1; + conf = rcu_dereference(mddev->private); + + /* + * Binary Search + */ + + while (hi > lo) { + + mid = (hi + lo) / 2; + if (sector < conf->disks[mid].end_sector) + hi = mid; + else + lo = mid + 1; + } + + return conf->disks + lo; +} + +/** + * linear_mergeable_bvec -- tell bio layer if two requests can be merged + * @q: request queue + * @bvm: properties of new bio + * @biovec: the request that could be merged to it. 
+ * + * Return amount of bytes we can take at this offset + */ +static int linear_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + struct dev_info *dev0; + unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int maxbytes = biovec->bv_len; + struct request_queue *subq; + + rcu_read_lock(); + dev0 = which_dev(mddev, sector); + maxsectors = dev0->end_sector - sector; + subq = bdev_get_queue(dev0->rdev->bdev); + if (subq->merge_bvec_fn) { + bvm->bi_bdev = dev0->rdev->bdev; + bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; + maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, + biovec)); + } + rcu_read_unlock(); + + if (maxsectors < bio_sectors) + maxsectors = 0; + else + maxsectors -= bio_sectors; + + if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) + return maxbytes; + + if (maxsectors > (maxbytes >> 9)) + return maxbytes; + else + return maxsectors << 9; +} + +static int linear_congested(void *data, int bits) +{ + struct mddev *mddev = data; + struct linear_conf *conf; + int i, ret = 0; + + if (mddev_congested(mddev, bits)) + return 1; + + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + + for (i = 0; i < mddev->raid_disks && !ret ; i++) { + struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + + rcu_read_unlock(); + return ret; +} + +static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct linear_conf *conf; + sector_t array_sectors; + + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + array_sectors = conf->array_sectors; + rcu_read_unlock(); + + return array_sectors; +} + +static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) +{ + struct linear_conf *conf; + struct md_rdev *rdev; + int i, cnt; + + conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), + GFP_KERNEL); + if (!conf) + return NULL; + + cnt = 0; + conf->array_sectors = 0; + + rdev_for_each(rdev, mddev) { + int j = rdev->raid_disk; + struct dev_info *disk = conf->disks + j; + sector_t sectors; + + if (j < 0 || j >= raid_disks || disk->rdev) { + printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); + goto out; + } + + disk->rdev = rdev; + if (mddev->chunk_sectors) { + sectors = rdev->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev->sectors = sectors * mddev->chunk_sectors; + } + + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + conf->array_sectors += rdev->sectors; + cnt++; + + } + if (cnt != raid_disks) { + printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); + goto out; + } + + /* + * Here we calculate the device offsets. 
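+ *
+ * Each entry records the array sector at which that member ends, so
+ * with three members of 100, 250 and 50 sectors the end_sector values
+ * come out as 100, 350 and 400.  which_dev() can then binary-search for
+ * the first entry whose end_sector is greater than the requested sector
+ * (sector 120 maps to the second member, for instance).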
+ */ + conf->disks[0].end_sector = conf->disks[0].rdev->sectors; + + for (i = 1; i < raid_disks; i++) + conf->disks[i].end_sector = + conf->disks[i-1].end_sector + + conf->disks[i].rdev->sectors; + + return conf; + +out: + kfree(conf); + return NULL; +} + +static int linear_run (struct mddev *mddev) +{ + struct linear_conf *conf; + int ret; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + conf = linear_conf(mddev, mddev->raid_disks); + + if (!conf) + return 1; + mddev->private = conf; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + + blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); + mddev->queue->backing_dev_info.congested_fn = linear_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + + ret = md_integrity_register(mddev); + if (ret) { + kfree(conf); + mddev->private = NULL; + } + return ret; +} + +static int linear_add(struct mddev *mddev, struct md_rdev *rdev) +{ + /* Adding a drive to a linear array allows the array to grow. + * It is permitted if the new drive has a matching superblock + * already on it, with raid_disk equal to raid_disks. + * It is achieved by creating a new linear_private_data structure + * and swapping it in in-place of the current one. + * The current one is never freed until the array is stopped. + * This avoids races. + */ + struct linear_conf *newconf, *oldconf; + + if (rdev->saved_raid_disk != mddev->raid_disks) + return -EINVAL; + + rdev->raid_disk = rdev->saved_raid_disk; + rdev->saved_raid_disk = -1; + + newconf = linear_conf(mddev,mddev->raid_disks+1); + + if (!newconf) + return -ENOMEM; + + oldconf = rcu_dereference(mddev->private); + mddev->raid_disks++; + rcu_assign_pointer(mddev->private, newconf); + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + kfree_rcu(oldconf, rcu); + return 0; +} + +static int linear_stop (struct mddev *mddev) +{ + struct linear_conf *conf = mddev->private; + + /* + * We do not require rcu protection here since + * we hold reconfig_mutex for both linear_add and + * linear_stop, so they cannot race. + * We should make sure any old 'conf's are properly + * freed though. + */ + rcu_barrier(); + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + kfree(conf); + mddev->private = NULL; + + return 0; +} + +static void linear_make_request(struct mddev *mddev, struct bio *bio) +{ + struct dev_info *tmp_dev; + sector_t start_sector; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; + } + + rcu_read_lock(); + tmp_dev = which_dev(mddev, bio->bi_sector); + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + + + if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) + || (bio->bi_sector < start_sector))) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR + "md/linear:%s: make_request: Sector %llu out of bounds on " + "dev %s: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_sector, + bdevname(tmp_dev->rdev->bdev, b), + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); + rcu_read_unlock(); + bio_io_error(bio); + return; + } + if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > + tmp_dev->end_sector)) { + /* This bio crosses a device boundary, so we have to + * split it. 
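+ *
+ * bio_split() cuts the bio at the boundary: bp->bio1 covers the sectors
+ * up to tmp_dev->end_sector and bp->bio2 the remainder, and each half is
+ * resubmitted through linear_make_request() so it can be mapped against
+ * its own member device.  A 16-sector bio starting 8 sectors before the
+ * boundary, for example, is split into two 8-sector bios.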
+ */ + struct bio_pair *bp; + sector_t end_sector = tmp_dev->end_sector; + + rcu_read_unlock(); + + bp = bio_split(bio, end_sector - bio->bi_sector); + + linear_make_request(mddev, &bp->bio1); + linear_make_request(mddev, &bp->bio2); + bio_pair_release(bp); + return; + } + + bio->bi_bdev = tmp_dev->rdev->bdev; + bio->bi_sector = bio->bi_sector - start_sector + + tmp_dev->rdev->data_offset; + rcu_read_unlock(); + generic_make_request(bio); +} + +static void linear_status (struct seq_file *seq, struct mddev *mddev) +{ + + seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); +} + + +static struct md_personality linear_personality = +{ + .name = "linear", + .level = LEVEL_LINEAR, + .owner = THIS_MODULE, + .make_request = linear_make_request, + .run = linear_run, + .stop = linear_stop, + .status = linear_status, + .hot_add_disk = linear_add, + .size = linear_size, +}; + +static int __init linear_init (void) +{ + return register_md_personality (&linear_personality); +} + +static void linear_exit (void) +{ + unregister_md_personality (&linear_personality); +} + + +module_init(linear_init); +module_exit(linear_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD"); +MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ +MODULE_ALIAS("md-linear"); +MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 00000000..b685ddd7 --- /dev/null +++ b/drivers/md/linear.h @@ -0,0 +1,15 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +struct dev_info { + struct md_rdev *rdev; + sector_t end_sector; +}; + +struct linear_conf +{ + struct rcu_head rcu; + sector_t array_sectors; + struct dev_info disks[0]; +}; +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c new file mode 100644 index 00000000..2b30ffdb --- /dev/null +++ b/drivers/md/md.c @@ -0,0 +1,8342 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> + - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> + - kerneld support by Boris Tobotras <boris@xtalk.msk.su> + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> + - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown <neilb@cse.unsw.edu.au>. + + - persistent bitmap code + Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#include <linux/kthread.h> +#include <linux/blkdev.h> +#include <linux/sysctl.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/hdreg.h> +#include <linux/proc_fs.h> +#include <linux/random.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/file.h> +#include <linux/compat.h> +#include <linux/delay.h> +#include <linux/raid/md_p.h> +#include <linux/raid/md_u.h> +#include <linux/slab.h> +#include "md.h" +#include "bitmap.h" + +#ifndef MODULE +static void autostart_arrays(int part); +#endif + +/* pers_list is a list of registered personalities protected + * by pers_lock. + * pers_lock does extra service to protect accesses to + * mddev->thread when the mutex cannot be held. + */ +static LIST_HEAD(pers_list); +static DEFINE_SPINLOCK(pers_lock); + +static void md_print_devices(void); + +static DECLARE_WAIT_QUEUE_HEAD(resync_wait); +static struct workqueue_struct *md_wq; +static struct workqueue_struct *md_misc_wq; + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } + +/* + * Default number of read corrections we'll attempt on an rdev + * before ejecting it from the array. We divide the read error + * count by 2 for every hour elapsed between read errors. + */ +#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 1000 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwidth if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + * or /sys/block/mdX/md/sync_speed_{min,max} + */ + +static int sysctl_speed_limit_min = 1000; +static int sysctl_speed_limit_max = 200000; +static inline int speed_min(struct mddev *mddev) +{ + return mddev->sync_speed_min ? + mddev->sync_speed_min : sysctl_speed_limit_min; +} + +static inline int speed_max(struct mddev *mddev) +{ + return mddev->sync_speed_max ? 
+ mddev->sync_speed_max : sysctl_speed_limit_max; +} + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + { + .procname = "speed_limit_min", + .data = &sysctl_speed_limit_min, + .maxlen = sizeof(int), + .mode = S_IRUGO|S_IWUSR, + .proc_handler = proc_dointvec, + }, + { + .procname = "speed_limit_max", + .data = &sysctl_speed_limit_max, + .maxlen = sizeof(int), + .mode = S_IRUGO|S_IWUSR, + .proc_handler = proc_dointvec, + }, + { } +}; + +static ctl_table raid_dir_table[] = { + { + .procname = "raid", + .maxlen = 0, + .mode = S_IRUGO|S_IXUGO, + .child = raid_table, + }, + { } +}; + +static ctl_table raid_root_table[] = { + { + .procname = "dev", + .maxlen = 0, + .mode = 0555, + .child = raid_dir_table, + }, + { } +}; + +static const struct block_device_operations md_fops; + +static int start_readonly; + +/* bio_clone_mddev + * like bio_clone, but with a local bio set + */ + +static void mddev_bio_destructor(struct bio *bio) +{ + struct mddev *mddev, **mddevp; + + mddevp = (void*)bio; + mddev = mddevp[-1]; + + bio_free(bio, mddev->bio_set); +} + +struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + struct mddev *mddev) +{ + struct bio *b; + struct mddev **mddevp; + + if (!mddev || !mddev->bio_set) + return bio_alloc(gfp_mask, nr_iovecs); + + b = bio_alloc_bioset(gfp_mask, nr_iovecs, + mddev->bio_set); + if (!b) + return NULL; + mddevp = (void*)b; + mddevp[-1] = mddev; + b->bi_destructor = mddev_bio_destructor; + return b; +} +EXPORT_SYMBOL_GPL(bio_alloc_mddev); + +struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, + struct mddev *mddev) +{ + struct bio *b; + struct mddev **mddevp; + + if (!mddev || !mddev->bio_set) + return bio_clone(bio, gfp_mask); + + b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, + mddev->bio_set); + if (!b) + return NULL; + mddevp = (void*)b; + mddevp[-1] = mddev; + b->bi_destructor = mddev_bio_destructor; + __bio_clone(b, bio); + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); + + if (ret < 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL_GPL(bio_clone_mddev); + +void md_trim_bio(struct bio *bio, int offset, int size) +{ + /* 'bio' is a cloned bio which we need to trim to match + * the given offset and size. 
+ * This requires adjusting bi_sector, bi_size, and bi_io_vec + */ + int i; + struct bio_vec *bvec; + int sofar = 0; + + size <<= 9; + if (offset == 0 && size == bio->bi_size) + return; + + bio->bi_sector += offset; + bio->bi_size = size; + offset <<= 9; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + + while (bio->bi_idx < bio->bi_vcnt && + bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { + /* remove this whole bio_vec */ + offset -= bio->bi_io_vec[bio->bi_idx].bv_len; + bio->bi_idx++; + } + if (bio->bi_idx < bio->bi_vcnt) { + bio->bi_io_vec[bio->bi_idx].bv_offset += offset; + bio->bi_io_vec[bio->bi_idx].bv_len -= offset; + } + /* avoid any complications with bi_idx being non-zero*/ + if (bio->bi_idx) { + memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, + (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); + bio->bi_vcnt -= bio->bi_idx; + bio->bi_idx = 0; + } + /* Make sure vcnt and last bv are not too big */ + bio_for_each_segment(bvec, bio, i) { + if (sofar + bvec->bv_len > size) + bvec->bv_len = size - sofar; + if (bvec->bv_len == 0) { + bio->bi_vcnt = i; + break; + } + sofar += bvec->bv_len; + } +} +EXPORT_SYMBOL_GPL(md_trim_bio); + +/* + * We have a system wide 'event count' that is incremented + * on any 'interesting' event, and readers of /proc/mdstat + * can use 'poll' or 'select' to find out when the event + * count increases. + * + * Events are: + * start array, stop array, error, add device, remove device, + * start build, activate spare + */ +static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); +static atomic_t md_event_count; +void md_new_event(struct mddev *mddev) +{ + atomic_inc(&md_event_count); + wake_up(&md_event_waiters); +} +EXPORT_SYMBOL_GPL(md_new_event); + +/* Alternate version that can be called from interrupts + * when calling sysfs_notify isn't needed. + */ +static void md_new_event_inintr(struct mddev *mddev) +{ + atomic_inc(&md_event_count); + wake_up(&md_event_waiters); +} + +/* + * Enables to iterate over all existing md arrays + * all_mddevs_lock protects this list. + */ +static LIST_HEAD(all_mddevs); +static DEFINE_SPINLOCK(all_mddevs_lock); + + +/* + * iterates through all used mddevs in the system. + * We take care to grab the all_mddevs_lock whenever navigating + * the list, and to always hold a refcount when unlocked. + * Any code which breaks out of this loop while own + * a reference to the current mddev and must mddev_put it. + */ +#define for_each_mddev(_mddev,_tmp) \ + \ + for (({ spin_lock(&all_mddevs_lock); \ + _tmp = all_mddevs.next; \ + _mddev = NULL;}); \ + ({ if (_tmp != &all_mddevs) \ + mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ + spin_unlock(&all_mddevs_lock); \ + if (_mddev) mddev_put(_mddev); \ + _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ + _tmp != &all_mddevs;}); \ + ({ spin_lock(&all_mddevs_lock); \ + _tmp = _tmp->next;}) \ + ) + + +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. 
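+ *
+ * The pairing is: md_make_request() holds mddev->active_io across the
+ * ->make_request() call, while mddev_suspend() sets ->suspended and then
+ * waits for active_io to reach zero before asking the personality to
+ * quiesce; any bio arriving in the meantime sleeps on sb_wait until
+ * mddev_resume() clears ->suspended and wakes it.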
+ */ +static void md_make_request(struct request_queue *q, struct bio *bio) +{ + const int rw = bio_data_dir(bio); + struct mddev *mddev = q->queuedata; + int cpu; + unsigned int sectors; + + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { + bio_io_error(bio); + return; + } + smp_rmb(); /* Ensure implications of 'active' are visible */ + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + + /* + * save the sectors now since our bio can + * go away inside make_request + */ + sectors = bio_sectors(bio); + mddev->pers->make_request(mddev, bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); + part_stat_unlock(); + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} + +/* mddev_suspend makes sure no new requests are submitted + * to the device, and that any requests that have been submitted + * are completely handled. + * Once ->stop is called and completes, the module will be completely + * unused. + */ +void mddev_suspend(struct mddev *mddev) +{ + BUG_ON(mddev->suspended); + mddev->suspended = 1; + synchronize_rcu(); + wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + mddev->pers->quiesce(mddev, 1); + + del_timer_sync(&mddev->safemode_timer); +} +EXPORT_SYMBOL_GPL(mddev_suspend); + +void mddev_resume(struct mddev *mddev) +{ + mddev->suspended = 0; + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); + + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ +} +EXPORT_SYMBOL_GPL(mddev_resume); + +int mddev_congested(struct mddev *mddev, int bits) +{ + return mddev->suspended; +} +EXPORT_SYMBOL(mddev_congested); + +/* + * Generic flush handling for md + */ + +static void md_end_flush(struct bio *bio, int err) +{ + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; + + rdev_dec_pending(rdev, mddev); + + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pre-request flush has finished */ + queue_work(md_wq, &mddev->flush_work); + } + bio_put(bio); +} + +static void md_submit_flush_data(struct work_struct *ws); + +static void submit_flushes(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct md_rdev *rdev; + + INIT_WORK(&mddev->flush_work, md_submit_flush_data); + atomic_set(&mddev->flush_pending, 1); + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + /* Take two references, one is dropped + * when request finishes, one after + * we reclaim rcu_read_lock + */ + struct bio *bi; + atomic_inc(&rdev->nr_pending); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); + bi->bi_end_io = md_end_flush; + bi->bi_private = rdev; + bi->bi_bdev = rdev->bdev; + atomic_inc(&mddev->flush_pending); + submit_bio(WRITE_FLUSH, bi); + rcu_read_lock(); + rdev_dec_pending(rdev, mddev); + } + rcu_read_unlock(); + if (atomic_dec_and_test(&mddev->flush_pending)) + queue_work(md_wq, &mddev->flush_work); +} + +static void md_submit_flush_data(struct work_struct *ws) +{ + struct mddev *mddev = 
container_of(ws, struct mddev, flush_work); + struct bio *bio = mddev->flush_bio; + + if (bio->bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio, 0); + else { + bio->bi_rw &= ~REQ_FLUSH; + mddev->pers->make_request(mddev, bio); + } + + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); +} + +void md_flush_request(struct mddev *mddev, struct bio *bio) +{ + spin_lock_irq(&mddev->write_lock); + wait_event_lock_irq(mddev->sb_wait, + !mddev->flush_bio, + mddev->write_lock, /*nothing*/); + mddev->flush_bio = bio; + spin_unlock_irq(&mddev->write_lock); + + INIT_WORK(&mddev->flush_work, submit_flushes); + queue_work(md_wq, &mddev->flush_work); +} +EXPORT_SYMBOL(md_flush_request); + +/* Support for plugging. + * This mirrors the plugging support in request_queue, but does not + * require having a whole queue or request structures. + * We allocate an md_plug_cb for each md device and each thread it gets + * plugged on. This links tot the private plug_handle structure in the + * personality data where we keep a count of the number of outstanding + * plugs so other code can see if a plug is active. + */ +struct md_plug_cb { + struct blk_plug_cb cb; + struct mddev *mddev; +}; + +static void plugger_unplug(struct blk_plug_cb *cb) +{ + struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); + if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) + md_wakeup_thread(mdcb->mddev->thread); + kfree(mdcb); +} + +/* Check that an unplug wakeup will come shortly. + * If not, wakeup the md thread immediately + */ +int mddev_check_plugged(struct mddev *mddev) +{ + struct blk_plug *plug = current->plug; + struct md_plug_cb *mdcb; + + if (!plug) + return 0; + + list_for_each_entry(mdcb, &plug->cb_list, cb.list) { + if (mdcb->cb.callback == plugger_unplug && + mdcb->mddev == mddev) { + /* Already on the list, move to top */ + if (mdcb != list_first_entry(&plug->cb_list, + struct md_plug_cb, + cb.list)) + list_move(&mdcb->cb.list, &plug->cb_list); + return 1; + } + } + /* Not currently on the callback list */ + mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); + if (!mdcb) + return 0; + + mdcb->mddev = mddev; + mdcb->cb.callback = plugger_unplug; + atomic_inc(&mddev->plug_cnt); + list_add(&mdcb->cb.list, &plug->cb_list); + return 1; +} +EXPORT_SYMBOL_GPL(mddev_check_plugged); + +static inline struct mddev *mddev_get(struct mddev *mddev) +{ + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_delayed_delete(struct work_struct *ws); + +static void mddev_put(struct mddev *mddev) +{ + struct bio_set *bs = NULL; + + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->raid_disks && list_empty(&mddev->disks) && + mddev->ctime == 0 && !mddev->hold_active) { + /* Array is not configured at all, and not held active, + * so destroy it */ + list_del_init(&mddev->all_mddevs); + bs = mddev->bio_set; + mddev->bio_set = NULL; + if (mddev->gendisk) { + /* We did a probe so need to clean up. Call + * queue_work inside the spinlock so that + * flush_workqueue() after mddev_find will + * succeed in waiting for the work to be done. 
+ */ + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + queue_work(md_misc_wq, &mddev->del_work); + } else + kfree(mddev); + } + spin_unlock(&all_mddevs_lock); + if (bs) + bioset_free(bs); +} + +void mddev_init(struct mddev *mddev) +{ + mutex_init(&mddev->open_mutex); + mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->bitmap_info.mutex); + INIT_LIST_HEAD(&mddev->disks); + INIT_LIST_HEAD(&mddev->all_mddevs); + init_timer(&mddev->safemode_timer); + atomic_set(&mddev->active, 1); + atomic_set(&mddev->openers, 0); + atomic_set(&mddev->active_io, 0); + atomic_set(&mddev->plug_cnt, 0); + spin_lock_init(&mddev->write_lock); + atomic_set(&mddev->flush_pending, 0); + init_waitqueue_head(&mddev->sb_wait); + init_waitqueue_head(&mddev->recovery_wait); + mddev->reshape_position = MaxSector; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->level = LEVEL_NONE; +} +EXPORT_SYMBOL_GPL(mddev_init); + +static struct mddev * mddev_find(dev_t unit) +{ + struct mddev *mddev, *new = NULL; + + if (unit && MAJOR(unit) != MD_MAJOR) + unit &= ~((1<<MdpMinorShift)-1); + + retry: + spin_lock(&all_mddevs_lock); + + if (unit) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; + } + + if (new) { + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + new->hold_active = UNTIL_IOCTL; + return new; + } + } else if (new) { + /* find an unused unit number */ + static int next_minor = 512; + int start = next_minor; + int is_free = 0; + int dev = 0; + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) { + /* Oh dear, all in use. */ + spin_unlock(&all_mddevs_lock); + kfree(new); + return NULL; + } + + is_free = 1; + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == dev) { + is_free = 0; + break; + } + } + new->unit = dev; + new->md_minor = MINOR(dev); + new->hold_active = UNTIL_STOP; + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + return new; + } + spin_unlock(&all_mddevs_lock); + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->unit = unit; + if (MAJOR(unit) == MD_MAJOR) + new->md_minor = MINOR(unit); + else + new->md_minor = MINOR(unit) >> MdpMinorShift; + + mddev_init(new); + + goto retry; +} + +static inline int mddev_lock(struct mddev * mddev) +{ + return mutex_lock_interruptible(&mddev->reconfig_mutex); +} + +static inline int mddev_is_locked(struct mddev *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); +} + +static inline int mddev_trylock(struct mddev * mddev) +{ + return mutex_trylock(&mddev->reconfig_mutex); +} + +static struct attribute_group md_redundancy_group; + +static void mddev_unlock(struct mddev * mddev) +{ + if (mddev->to_remove) { + /* These cannot be removed under reconfig_mutex as + * an access to the files will try to take reconfig_mutex + * while holding the file unremovable, which leads to + * a deadlock. + * So hold set sysfs_active while the remove in happeing, + * and anything else which might set ->to_remove or my + * otherwise change the sysfs namespace will fail with + * -EBUSY if sysfs_active is still set. + * We set sysfs_active under reconfig_mutex and elsewhere + * test it under the same mutex to ensure its correct value + * is seen. 
+ */ + struct attribute_group *to_remove = mddev->to_remove; + mddev->to_remove = NULL; + mddev->sysfs_active = 1; + mutex_unlock(&mddev->reconfig_mutex); + + if (mddev->kobj.sd) { + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } + } + mddev->sysfs_active = 0; + } else + mutex_unlock(&mddev->reconfig_mutex); + + /* As we've dropped the mutex we need a spinlock to + * make sure the thread doesn't disappear + */ + spin_lock(&pers_lock); + md_wakeup_thread(mddev->thread); + spin_unlock(&pers_lock); +} + +static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} + +static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + +static struct md_personality *find_pers(int level, char *clevel) +{ + struct md_personality *pers; + list_for_each_entry(pers, &pers_list, list) { + if (level != LEVEL_NONE && pers->level == level) + return pers; + if (strcmp(pers->name, clevel)==0) + return pers; + } + return NULL; +} + +/* return the offset of the super block in 512byte sectors */ +static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) +{ + sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; + return MD_NEW_SIZE_SECTORS(num_sectors); +} + +static int alloc_disk_sb(struct md_rdev * rdev) +{ + if (rdev->sb_page) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(KERN_ALERT "md: out of memory.\n"); + return -ENOMEM; + } + + return 0; +} + +static void free_disk_sb(struct md_rdev * rdev) +{ + if (rdev->sb_page) { + put_page(rdev->sb_page); + rdev->sb_loaded = 0; + rdev->sb_page = NULL; + rdev->sb_start = 0; + rdev->sectors = 0; + } + if (rdev->bb_page) { + put_page(rdev->bb_page); + rdev->bb_page = NULL; + } +} + + +static void super_written(struct bio *bio, int error) +{ + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; + + if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk("md: super_written gets error=%d, uptodate=%d\n", + error, test_bit(BIO_UPTODATE, &bio->bi_flags)); + WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); + md_error(mddev, rdev); + } + + if (atomic_dec_and_test(&mddev->pending_writes)) + wake_up(&mddev->sb_wait); + bio_put(bio); +} + +void md_super_write(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page) +{ + /* write first size bytes of page to sector of rdev + * Increment mddev->pending_writes before returning + * and decrement it on completion, waking up sb_wait + * if zero is reached. + * If an error occurred, call md_error + */ + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + + bio->bi_bdev = rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); + bio->bi_private = rdev; + bio->bi_end_io = super_written; + + atomic_inc(&mddev->pending_writes); + submit_bio(WRITE_FLUSH_FUA, bio); +} + +void md_super_wait(struct mddev *mddev) +{ + /* wait for all superblock writes that were scheduled to complete */ + DEFINE_WAIT(wq); + for(;;) { + prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); + if (atomic_read(&mddev->pending_writes)==0) + break; + schedule(); + } + finish_wait(&mddev->sb_wait, &wq); +} + +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion*)bio->bi_private); +} + +int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op) +{ + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); + struct completion event; + int ret; + + rw |= REQ_SYNC; + + bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? + rdev->meta_bdev : rdev->bdev; + if (metadata_op) + bio->bi_sector = sector + rdev->sb_start; + else + bio->bi_sector = sector + rdev->data_offset; + bio_add_page(bio, page, size, 0); + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); + wait_for_completion(&event); + + ret = test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ret; +} +EXPORT_SYMBOL_GPL(sync_page_io); + +static int read_disk_sb(struct md_rdev * rdev, int size) +{ + char b[BDEVNAME_SIZE]; + if (!rdev->sb_page) { + MD_BUG(); + return -EINVAL; + } + if (rdev->sb_loaded) + return 0; + + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) + goto fail; + rdev->sb_loaded = 1; + return 0; + +fail: + printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", + bdevname(rdev->bdev,b)); + return -EINVAL; +} + +static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + return sb1->set_uuid0 == sb2->set_uuid0 && + sb1->set_uuid1 == sb2->set_uuid1 && + sb1->set_uuid2 == sb2->set_uuid2 && + sb1->set_uuid3 == sb2->set_uuid3; +} + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); +abort: + kfree(tmp1); + kfree(tmp2); + return ret; +} + + +static u32 md_csum_fold(u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + return (csum & 0xffff) + (csum >> 16); +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + u64 newcsum = 0; + u32 *sb32 = (u32*)sb; + int i; + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + + for (i = 0; i < MD_SB_BYTES/4 ; i++) + newcsum += sb32[i]; + csum = (newcsum & 0xffffffff) + (newcsum>>32); + + +#ifdef CONFIG_ALPHA + /* This used to use csum_partial, which was wrong for several + * reasons including that different results are returned on + * different architectures. It isn't critical that we get exactly + * the same return value as before (we always csum_fold before + * testing, and that removes any differences). However as we + * know that csum_partial always returned a 16bit value on + * alphas, do a fold to maximise conformity to previous behaviour. 
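+ *
+ * md_csum_fold() above just folds the 32-bit sum to 16 bits with an
+ * end-around carry, e.g. 0x1234abcd -> 0xabcd + 0x1234 = 0xbe01 (the
+ * second fold only matters when that addition itself carries).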
+ */ + sb->sb_csum = md_csum_fold(disk_csum); +#else + sb->sb_csum = disk_csum; +#endif + return csum; +} + + +/* + * Handle superblock details. + * We want to be able to handle multiple superblock formats + * so we have a common interface to them all, and an array of + * different handlers. + * We rely on user-space to write the initial superblock, and support + * reading and updating of superblocks. + * Interface methods are: + * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) + * loads and validates a superblock on dev. + * if refdev != NULL, compare superblocks on both devices + * Return: + * 0 - dev has a superblock that is compatible with refdev + * 1 - dev has a superblock that is compatible and newer than refdev + * so dev should be used as the refdev in future + * -EINVAL superblock incompatible or invalid + * -othererror e.g. -EIO + * + * int validate_super(struct mddev *mddev, struct md_rdev *dev) + * Verify that dev is acceptable into mddev. + * The first time, mddev->raid_disks will be 0, and data from + * dev should be merged in. Subsequent calls check that dev + * is new enough. Return 0 or -EINVAL + * + * void sync_super(struct mddev *mddev, struct md_rdev *dev) + * Update the superblock for rdev with data in mddev + * This does not write to disc. + * + */ + +struct super_type { + char *name; + struct module *owner; + int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, + int minor_version); + int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); + void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); + unsigned long long (*rdev_size_change)(struct md_rdev *rdev, + sector_t num_sectors); +}; + +/* + * Check that the given mddev has no bitmap. + * + * This function is called from the run method of all personalities that do not + * support bitmaps. It prints an error message and returns non-zero if mddev + * has a bitmap. Otherwise, it returns 0. + * + */ +int md_check_no_bitmap(struct mddev *mddev) +{ + if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) + return 0; + printk(KERN_ERR "%s: bitmaps are not supported for %s\n", + mdname(mddev), mddev->pers->name); + return 1; +} +EXPORT_SYMBOL(md_check_no_bitmap); + +/* + * load_super for 0.90.0 + */ +static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) +{ + char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + mdp_super_t *sb; + int ret; + + /* + * Calculate the position of the superblock (512byte sectors), + * it's at the end of the disk. + * + * It also happens to be a multiple of 4Kb. 
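+	 * (calc_dev_sboffset() applies MD_NEW_SIZE_SECTORS(): the device size
+	 * is rounded down to a 64KiB boundary and one further 64KiB
+	 * reservation is stepped back, so the 0.90 superblock always starts
+	 * on a 64KiB-aligned offset at least 64KiB before the end.)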
+ */ + rdev->sb_start = calc_dev_sboffset(rdev); + + ret = read_disk_sb(rdev, MD_SB_BYTES); + if (ret) return ret; + + ret = -EINVAL; + + bdevname(rdev->bdev, b); + sb = page_address(rdev->sb_page); + + if (sb->md_magic != MD_SB_MAGIC) { + printk(KERN_ERR "md: invalid raid superblock magic on %s\n", + b); + goto abort; + } + + if (sb->major_version != 0 || + sb->minor_version < 90 || + sb->minor_version > 91) { + printk(KERN_WARNING "Bad version number %d.%d on %s\n", + sb->major_version, sb->minor_version, + b); + goto abort; + } + + if (sb->raid_disks <= 0) + goto abort; + + if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { + printk(KERN_WARNING "md: invalid superblock checksum on %s\n", + b); + goto abort; + } + + rdev->preferred_minor = sb->md_minor; + rdev->data_offset = 0; + rdev->sb_size = MD_SB_BYTES; + rdev->badblocks.shift = -1; + + if (sb->level == LEVEL_MULTIPATH) + rdev->desc_nr = -1; + else + rdev->desc_nr = sb->this_disk.number; + + if (!refdev) { + ret = 1; + } else { + __u64 ev1, ev2; + mdp_super_t *refsb = page_address(refdev->sb_page); + if (!uuid_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + b, bdevname(refdev->bdev,b2)); + goto abort; + } + if (!sb_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has same UUID" + " but different superblock to %s\n", + b, bdevname(refdev->bdev, b2)); + goto abort; + } + ev1 = md_event(sb); + ev2 = md_event(refsb); + if (ev1 > ev2) + ret = 1; + else + ret = 0; + } + rdev->sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that */ + if (rdev->sectors >= (2ULL << 32)) + rdev->sectors = (2ULL << 32) - 2; + + if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) + /* "this cannot possibly happen" ... */ + ret = -EINVAL; + + abort: + return ret; +} + +/* + * validate_super for 0.90.0 + */ +static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) +{ + mdp_disk_t *desc; + mdp_super_t *sb = page_address(rdev->sb_page); + __u64 ev1 = md_event(sb); + + rdev->raid_disk = -1; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + + if (mddev->raid_disks == 0) { + mddev->major_version = 0; + mddev->minor_version = sb->minor_version; + mddev->patch_version = sb->patch_version; + mddev->external = 0; + mddev->chunk_sectors = sb->chunk_size >> 9; + mddev->ctime = sb->ctime; + mddev->utime = sb->utime; + mddev->level = sb->level; + mddev->clevel[0] = 0; + mddev->layout = sb->layout; + mddev->raid_disks = sb->raid_disks; + mddev->dev_sectors = ((sector_t)sb->size) * 2; + mddev->events = ev1; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + + if (mddev->minor_version >= 91) { + mddev->reshape_position = sb->reshape_position; + mddev->delta_disks = sb->delta_disks; + mddev->new_level = sb->new_level; + mddev->new_layout = sb->new_layout; + mddev->new_chunk_sectors = sb->new_chunk >> 9; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + } + + if (sb->state & (1<<MD_SB_CLEAN)) + mddev->recovery_cp = MaxSector; + else { + if (sb->events_hi == sb->cp_events_hi && + sb->events_lo == sb->cp_events_lo) { + mddev->recovery_cp = sb->recovery_cp; + } else + mddev->recovery_cp = 0; + } + + memcpy(mddev->uuid+0, &sb->set_uuid0, 4); + memcpy(mddev->uuid+4, &sb->set_uuid1, 4); + memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 
+ memcpy(mddev->uuid+12,&sb->set_uuid3, 4); + + mddev->max_disks = MD_SB_DISKS; + + if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && + mddev->bitmap_info.file == NULL) + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; + + } else if (mddev->pers == NULL) { + /* Insist on good event counter while assembling, except + * for spares (which don't need an event count) */ + ++ev1; + if (sb->disks[rdev->desc_nr].state & ( + (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) + if (ev1 < mddev->events) + return -EINVAL; + } else if (mddev->bitmap) { + /* if adding to array with a bitmap, then we can accept an + * older device ... but not too old. + */ + if (ev1 < mddev->bitmap->events_cleared) + return 0; + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } + + if (mddev->level != LEVEL_MULTIPATH) { + desc = sb->disks + rdev->desc_nr; + + if (desc->state & (1<<MD_DISK_FAULTY)) + set_bit(Faulty, &rdev->flags); + else if (desc->state & (1<<MD_DISK_SYNC) /* && + desc->raid_disk < mddev->raid_disks */) { + set_bit(In_sync, &rdev->flags); + rdev->raid_disk = desc->raid_disk; + } else if (desc->state & (1<<MD_DISK_ACTIVE)) { + /* active but not in sync implies recovery up to + * reshape position. We don't know exactly where + * that is, so set to zero for now */ + if (mddev->minor_version >= 91) { + rdev->recovery_offset = 0; + rdev->raid_disk = desc->raid_disk; + } + } + if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + } else /* MULTIPATH are always insync */ + set_bit(In_sync, &rdev->flags); + return 0; +} + +/* + * sync_super for 0.90.0 + */ +static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) +{ + mdp_super_t *sb; + struct md_rdev *rdev2; + int next_spare = mddev->raid_disks; + + + /* make rdev->sb match mddev data.. + * + * 1/ zero out disks + * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); + * 3/ any empty disks < next_spare become removed + * + * disks[0] gets initialised to REMOVED because + * we cannot be sure from other fields if it has + * been initialised or not. 
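+	 * Note: devices counted as active keep desc_nr == raid_disk (always
+	 * below raid_disks), while spares and faulty devices are numbered
+	 * from next_spare (which starts at raid_disks) upwards, so the two
+	 * ranges cannot collide.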
+ */ + int i; + int active=0, working=0,failed=0,spare=0,nr_disks=0; + + rdev->sb_size = MD_SB_BYTES; + + sb = page_address(rdev->sb_page); + + memset(sb, 0, sizeof(*sb)); + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = mddev->major_version; + sb->patch_version = mddev->patch_version; + sb->gvalid_words = 0; /* ignored */ + memcpy(&sb->set_uuid0, mddev->uuid+0, 4); + memcpy(&sb->set_uuid1, mddev->uuid+4, 4); + memcpy(&sb->set_uuid2, mddev->uuid+8, 4); + memcpy(&sb->set_uuid3, mddev->uuid+12,4); + + sb->ctime = mddev->ctime; + sb->level = mddev->level; + sb->size = mddev->dev_sectors / 2; + sb->raid_disks = mddev->raid_disks; + sb->md_minor = mddev->md_minor; + sb->not_persistent = 0; + sb->utime = mddev->utime; + sb->state = 0; + sb->events_hi = (mddev->events>>32); + sb->events_lo = (u32)mddev->events; + + if (mddev->reshape_position == MaxSector) + sb->minor_version = 90; + else { + sb->minor_version = 91; + sb->reshape_position = mddev->reshape_position; + sb->new_level = mddev->new_level; + sb->delta_disks = mddev->delta_disks; + sb->new_layout = mddev->new_layout; + sb->new_chunk = mddev->new_chunk_sectors << 9; + } + mddev->minor_version = sb->minor_version; + if (mddev->in_sync) + { + sb->recovery_cp = mddev->recovery_cp; + sb->cp_events_hi = (mddev->events>>32); + sb->cp_events_lo = (u32)mddev->events; + if (mddev->recovery_cp == MaxSector) + sb->state = (1<< MD_SB_CLEAN); + } else + sb->recovery_cp = 0; + + sb->layout = mddev->layout; + sb->chunk_size = mddev->chunk_sectors << 9; + + if (mddev->bitmap && mddev->bitmap_info.file == NULL) + sb->state |= (1<<MD_SB_BITMAP_PRESENT); + + sb->disks[0].state = (1<<MD_DISK_REMOVED); + rdev_for_each(rdev2, mddev) { + mdp_disk_t *d; + int desc_nr; + int is_active = test_bit(In_sync, &rdev2->flags); + + if (rdev2->raid_disk >= 0 && + sb->minor_version >= 91) + /* we have nowhere to store the recovery_offset, + * but if it is not below the reshape_position, + * we can piggy-back on that. 
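+			 * (super_90_validate() undoes this on load: a device
+			 * marked ACTIVE but not SYNC is treated as recovering
+			 * and its recovery_offset is simply reset to 0.)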
+ */ + is_active = 1; + if (rdev2->raid_disk < 0 || + test_bit(Faulty, &rdev2->flags)) + is_active = 0; + if (is_active) + desc_nr = rdev2->raid_disk; + else + desc_nr = next_spare++; + rdev2->desc_nr = desc_nr; + d = &sb->disks[rdev2->desc_nr]; + nr_disks++; + d->number = rdev2->desc_nr; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + if (is_active) + d->raid_disk = rdev2->raid_disk; + else + d->raid_disk = rdev2->desc_nr; /* compatibility */ + if (test_bit(Faulty, &rdev2->flags)) + d->state = (1<<MD_DISK_FAULTY); + else if (is_active) { + d->state = (1<<MD_DISK_ACTIVE); + if (test_bit(In_sync, &rdev2->flags)) + d->state |= (1<<MD_DISK_SYNC); + active++; + working++; + } else { + d->state = 0; + spare++; + working++; + } + if (test_bit(WriteMostly, &rdev2->flags)) + d->state |= (1<<MD_DISK_WRITEMOSTLY); + } + /* now set the "removed" and "faulty" bits on any missing devices */ + for (i=0 ; i < mddev->raid_disks ; i++) { + mdp_disk_t *d = &sb->disks[i]; + if (d->state == 0 && d->number == 0) { + d->number = i; + d->raid_disk = i; + d->state = (1<<MD_DISK_REMOVED); + d->state |= (1<<MD_DISK_FAULTY); + failed++; + } + } + sb->nr_disks = nr_disks; + sb->active_disks = active; + sb->working_disks = working; + sb->failed_disks = failed; + sb->spare_disks = spare; + + sb->this_disk = sb->disks[rdev->desc_nr]; + sb->sb_csum = calc_sb_csum(sb); +} + +/* + * rdev_size_change for 0.90.0 + */ +static unsigned long long +super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) +{ + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) + return 0; /* component must fit device */ + if (rdev->mddev->bitmap_info.offset) + return 0; /* can't move bitmap */ + rdev->sb_start = calc_dev_sboffset(rdev); + if (!num_sectors || num_sectors > rdev->sb_start) + num_sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * 4TB == 2^32 KB, or 2*2^32 sectors. + */ + if (num_sectors >= (2ULL << 32)) + num_sectors = (2ULL << 32) - 2; + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + rdev->sb_page); + md_super_wait(rdev->mddev); + return num_sectors; +} + + +/* + * version 1 superblock + */ + +static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) +{ + __le32 disk_csum; + u32 csum; + unsigned long long newcsum; + int size = 256 + le32_to_cpu(sb->max_dev)*2; + __le32 *isuper = (__le32*)sb; + int i; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + newcsum = 0; + for (i=0; size>=4; size -= 4 ) + newcsum += le32_to_cpu(*isuper++); + + if (size == 2) + newcsum += le16_to_cpu(*(__le16*) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); + sb->sb_csum = disk_csum; + return cpu_to_le32(csum); +} + +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); +static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) +{ + struct mdp_superblock_1 *sb; + int ret; + sector_t sb_start; + char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + int bmask; + + /* + * Calculate the position of the superblock in 512byte sectors. + * It is always aligned to a 4K boundary and + * depeding on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. 
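+	 * For example, with minor_version 0 on a 1000005-sector device:
+	 * 1000005 - 16 = 999989, aligned down to 8 sectors gives
+	 * sb_start = 999984, i.e. roughly 10.5KiB below the end.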
+ */ + switch(minor_version) { + case 0: + sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; + sb_start -= 8*2; + sb_start &= ~(sector_t)(4*2-1); + break; + case 1: + sb_start = 0; + break; + case 2: + sb_start = 8; + break; + default: + return -EINVAL; + } + rdev->sb_start = sb_start; + + /* superblock is rarely larger than 1K, but it can be larger, + * and it is safe to read 4k, so we do that + */ + ret = read_disk_sb(rdev, 4096); + if (ret) return ret; + + + sb = page_address(rdev->sb_page); + + if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || + sb->major_version != cpu_to_le32(1) || + le32_to_cpu(sb->max_dev) > (4096-256)/2 || + le64_to_cpu(sb->super_offset) != rdev->sb_start || + (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) + return -EINVAL; + + if (calc_sb_1_csum(sb) != sb->sb_csum) { + printk("md: invalid superblock checksum on %s\n", + bdevname(rdev->bdev,b)); + return -EINVAL; + } + if (le64_to_cpu(sb->data_size) < 10) { + printk("md: data_size too small on %s\n", + bdevname(rdev->bdev,b)); + return -EINVAL; + } + + rdev->preferred_minor = 0xffff; + rdev->data_offset = le64_to_cpu(sb->data_offset); + atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); + + rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + + if (minor_version + && rdev->data_offset < sb_start + (rdev->sb_size/512)) + return -EINVAL; + + if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) + rdev->desc_nr = -1; + else + rdev->desc_nr = le32_to_cpu(sb->dev_number); + + if (!rdev->bb_page) { + rdev->bb_page = alloc_page(GFP_KERNEL); + if (!rdev->bb_page) + return -ENOMEM; + } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && + rdev->badblocks.count == 0) { + /* need to load the bad block list. + * Currently we limit it to one page. 
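+		 * Each 64-bit entry packs the start sector in the upper 54
+		 * bits and the length in the low 10 bits, both scaled by
+		 * bblog_shift; an all-ones entry marks an unused slot.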
+ */ + s32 offset; + sector_t bb_sector; + u64 *bbp; + int i; + int sectors = le16_to_cpu(sb->bblog_size); + if (sectors > (PAGE_SIZE / 512)) + return -EINVAL; + offset = le32_to_cpu(sb->bblog_offset); + if (offset == 0) + return -EINVAL; + bb_sector = (long long)offset; + if (!sync_page_io(rdev, bb_sector, sectors << 9, + rdev->bb_page, READ, true)) + return -EIO; + bbp = (u64 *)page_address(rdev->bb_page); + rdev->badblocks.shift = sb->bblog_shift; + for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { + u64 bb = le64_to_cpu(*bbp); + int count = bb & (0x3ff); + u64 sector = bb >> 10; + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + if (bb + 1 == 0) + break; + if (md_set_badblocks(&rdev->badblocks, + sector, count, 1) == 0) + return -EINVAL; + } + } else if (sb->bblog_offset == 0) + rdev->badblocks.shift = -1; + + if (!refdev) { + ret = 1; + } else { + __u64 ev1, ev2; + struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); + + if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || + sb->level != refsb->level || + sb->layout != refsb->layout || + sb->chunksize != refsb->chunksize) { + printk(KERN_WARNING "md: %s has strangely different" + " superblock to %s\n", + bdevname(rdev->bdev,b), + bdevname(refdev->bdev,b2)); + return -EINVAL; + } + ev1 = le64_to_cpu(sb->events); + ev2 = le64_to_cpu(refsb->events); + + if (ev1 > ev2) + ret = 1; + else + ret = 0; + } + if (minor_version) + rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - + le64_to_cpu(sb->data_offset); + else + rdev->sectors = rdev->sb_start; + if (rdev->sectors < le64_to_cpu(sb->data_size)) + return -EINVAL; + rdev->sectors = le64_to_cpu(sb->data_size); + if (le64_to_cpu(sb->size) > rdev->sectors) + return -EINVAL; + return ret; +} + +static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + __u64 ev1 = le64_to_cpu(sb->events); + + rdev->raid_disk = -1; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + + if (mddev->raid_disks == 0) { + mddev->major_version = 1; + mddev->patch_version = 0; + mddev->external = 0; + mddev->chunk_sectors = le32_to_cpu(sb->chunksize); + mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); + mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->level = le32_to_cpu(sb->level); + mddev->clevel[0] = 0; + mddev->layout = le32_to_cpu(sb->layout); + mddev->raid_disks = le32_to_cpu(sb->raid_disks); + mddev->dev_sectors = le64_to_cpu(sb->size); + mddev->events = ev1; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 1024 >> 9; + + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + memcpy(mddev->uuid, sb->set_uuid, 16); + + mddev->max_disks = (4096-256)/2; + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && + mddev->bitmap_info.file == NULL ) + mddev->bitmap_info.offset = + (__s32)le32_to_cpu(sb->bitmap_offset); + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + mddev->delta_disks = le32_to_cpu(sb->delta_disks); + mddev->new_level = le32_to_cpu(sb->new_level); + mddev->new_layout = le32_to_cpu(sb->new_layout); + mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + } + + } else if (mddev->pers == NULL) { + 
/* Insist of good event counter while assembling, except for + * spares (which don't need an event count) */ + ++ev1; + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + if (ev1 < mddev->events) + return -EINVAL; + } else if (mddev->bitmap) { + /* If adding to array with a bitmap, then we can accept an + * older device, but not too old. + */ + if (ev1 < mddev->bitmap->events_cleared) + return 0; + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } + if (mddev->level != LEVEL_MULTIPATH) { + int role; + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = 0xffff; + rdev->desc_nr = -1; + } else + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + switch(role) { + case 0xffff: /* spare */ + break; + case 0xfffe: /* faulty */ + set_bit(Faulty, &rdev->flags); + break; + default: + if ((le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + else + set_bit(In_sync, &rdev->flags); + rdev->raid_disk = role; + break; + } + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); + } else /* MULTIPATH are always insync */ + set_bit(In_sync, &rdev->flags); + + return 0; +} + +static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mdp_superblock_1 *sb; + struct md_rdev *rdev2; + int max_dev, i; + /* make rdev->sb match mddev and rdev data. */ + + sb = page_address(rdev->sb_page); + + sb->feature_map = 0; + sb->pad0 = 0; + sb->recovery_offset = cpu_to_le64(0); + memset(sb->pad1, 0, sizeof(sb->pad1)); + memset(sb->pad3, 0, sizeof(sb->pad3)); + + sb->utime = cpu_to_le64((__u64)mddev->utime); + sb->events = cpu_to_le64(mddev->events); + if (mddev->in_sync) + sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else + sb->resync_offset = cpu_to_le64(0); + + sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); + + sb->raid_disks = cpu_to_le32(mddev->raid_disks); + sb->size = cpu_to_le64(mddev->dev_sectors); + sb->chunksize = cpu_to_le32(mddev->chunk_sectors); + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + + if (test_bit(WriteMostly, &rdev->flags)) + sb->devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + + if (mddev->bitmap && mddev->bitmap_info.file == NULL) { + sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); + sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); + } + + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags)) { + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); + } + if (test_bit(Replacement, &rdev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_REPLACEMENT); + + if (mddev->reshape_position != MaxSector) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->reshape_position = cpu_to_le64(mddev->reshape_position); + sb->new_layout = cpu_to_le32(mddev->new_layout); + sb->delta_disks = cpu_to_le32(mddev->delta_disks); + sb->new_level = cpu_to_le32(mddev->new_level); + sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); + } + + if (rdev->badblocks.count == 0) + /* Nothing to do for bad blocks*/ ; + else if (sb->bblog_offset == 0) + /* Cannot record bad blocks on this device */ + md_error(mddev, rdev); + else { + struct 
badblocks *bb = &rdev->badblocks; + u64 *bbp = (u64 *)page_address(rdev->bb_page); + u64 *p = bb->page; + sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + if (bb->changed) { + unsigned seq; + +retry: + seq = read_seqbegin(&bb->lock); + + memset(bbp, 0xff, PAGE_SIZE); + + for (i = 0 ; i < bb->count ; i++) { + u64 internal_bb = *p++; + u64 store_bb = ((BB_OFFSET(internal_bb) << 10) + | BB_LEN(internal_bb)); + *bbp++ = cpu_to_le64(store_bb); + } + bb->changed = 0; + if (read_seqretry(&bb->lock, seq)) + goto retry; + + bb->sector = (rdev->sb_start + + (int)le32_to_cpu(sb->bblog_offset)); + bb->size = le16_to_cpu(sb->bblog_size); + } + } + + max_dev = 0; + rdev_for_each(rdev2, mddev) + if (rdev2->desc_nr+1 > max_dev) + max_dev = rdev2->desc_nr+1; + + if (max_dev > le32_to_cpu(sb->max_dev)) { + int bmask; + sb->max_dev = cpu_to_le32(max_dev); + rdev->sb_size = max_dev * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + } else + max_dev = le32_to_cpu(sb->max_dev); + + for (i=0; i<max_dev;i++) + sb->dev_roles[i] = cpu_to_le16(0xfffe); + + rdev_for_each(rdev2, mddev) { + i = rdev2->desc_nr; + if (test_bit(Faulty, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(0xfffe); + else if (test_bit(In_sync, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (rdev2->raid_disk >= 0) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else + sb->dev_roles[i] = cpu_to_le16(0xffff); + } + + sb->sb_csum = calc_sb_1_csum(sb); +} + +static unsigned long long +super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) +{ + struct mdp_superblock_1 *sb; + sector_t max_sectors; + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) + return 0; /* component must fit device */ + if (rdev->sb_start < rdev->data_offset) { + /* minor versions 1 and 2; superblock before data */ + max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; + max_sectors -= rdev->data_offset; + if (!num_sectors || num_sectors > max_sectors) + num_sectors = max_sectors; + } else if (rdev->mddev->bitmap_info.offset) { + /* minor version 0 with bitmap we can't move */ + return 0; + } else { + /* minor version 0; superblock after data */ + sector_t sb_start; + sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; + sb_start &= ~(sector_t)(4*2 - 1); + max_sectors = rdev->sectors + sb_start - rdev->sb_start; + if (!num_sectors || num_sectors > max_sectors) + num_sectors = max_sectors; + rdev->sb_start = sb_start; + } + sb = page_address(rdev->sb_page); + sb->data_size = cpu_to_le64(num_sectors); + sb->super_offset = rdev->sb_start; + sb->sb_csum = calc_sb_1_csum(sb); + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + rdev->sb_page); + md_super_wait(rdev->mddev); + return num_sectors; +} + +static struct super_type super_types[] = { + [0] = { + .name = "0.90.0", + .owner = THIS_MODULE, + .load_super = super_90_load, + .validate_super = super_90_validate, + .sync_super = super_90_sync, + .rdev_size_change = super_90_rdev_size_change, + }, + [1] = { + .name = "md-1", + .owner = THIS_MODULE, + .load_super = super_1_load, + .validate_super = super_1_validate, + .sync_super = super_1_sync, + .rdev_size_change = super_1_rdev_size_change, + }, +}; + +static void sync_super(struct mddev *mddev, struct md_rdev *rdev) +{ + if (mddev->sync_super) { + mddev->sync_super(mddev, rdev); + return; + } + + BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); + + 
super_types[mddev->major_version].sync_super(mddev, rdev); +} + +static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) +{ + struct md_rdev *rdev, *rdev2; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev1) + rdev_for_each_rcu(rdev2, mddev2) + if (rdev->bdev->bd_contains == + rdev2->bdev->bd_contains) { + rcu_read_unlock(); + return 1; + } + rcu_read_unlock(); + return 0; +} + +static LIST_HEAD(pending_raid_disks); + +/* + * Try to register data integrity profile for an mddev + * + * This is called when an array is started and after a disk has been kicked + * from the array. It only succeeds if all working and active component devices + * are integrity capable with matching profiles. + */ +int md_integrity_register(struct mddev *mddev) +{ + struct md_rdev *rdev, *reference = NULL; + + if (list_empty(&mddev->disks)) + return 0; /* nothing to do */ + if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register, or already is */ + rdev_for_each(rdev, mddev) { + /* skip spares and non-functional disks */ + if (test_bit(Faulty, &rdev->flags)) + continue; + if (rdev->raid_disk < 0) + continue; + if (!reference) { + /* Use the first rdev as the reference */ + reference = rdev; + continue; + } + /* does this rdev's profile match the reference profile? */ + if (blk_integrity_compare(reference->bdev->bd_disk, + rdev->bdev->bd_disk) < 0) + return -EINVAL; + } + if (!reference || !bdev_get_integrity(reference->bdev)) + return 0; + /* + * All component devices are integrity capable and have matching + * profiles, register the common profile for the md device. + */ + if (blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)) != 0) { + printk(KERN_ERR "md: failed to register integrity for %s\n", + mdname(mddev)); + return -EINVAL; + } + printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); + if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { + printk(KERN_ERR "md: failed to create integrity pool for %s\n", + mdname(mddev)); + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(md_integrity_register); + +/* Disable data integrity if non-capable/non-matching disk is being added */ +void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); + struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); + + if (!bi_mddev) /* nothing to do */ + return; + if (rdev->raid_disk < 0) /* skip spares */ + return; + if (bi_rdev && blk_integrity_compare(mddev->gendisk, + rdev->bdev->bd_disk) >= 0) + return; + printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); + blk_integrity_unregister(mddev->gendisk); +} +EXPORT_SYMBOL(md_integrity_add_rdev); + +static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) +{ + char b[BDEVNAME_SIZE]; + struct kobject *ko; + char *s; + int err; + + if (rdev->mddev) { + MD_BUG(); + return -EINVAL; + } + + /* prevent duplicates */ + if (find_rdev(mddev, rdev->bdev->bd_dev)) + return -EEXIST; + + /* make sure rdev->sectors exceeds mddev->dev_sectors */ + if (rdev->sectors && (mddev->dev_sectors == 0 || + rdev->sectors < mddev->dev_sectors)) { + if (mddev->pers) { + /* Cannot change size, so fail + * If mddev->level <= 0, then we don't care + * about aligning sizes (e.g. linear) + */ + if (mddev->level > 0) + return -ENOSPC; + } else + mddev->dev_sectors = rdev->sectors; + } + + /* Verify rdev->desc_nr is unique. 
+ * If it is -1, assign a free number, else + * check number is not in use + */ + if (rdev->desc_nr < 0) { + int choice = 0; + if (mddev->pers) choice = mddev->raid_disks; + while (find_rdev_nr(mddev, choice)) + choice++; + rdev->desc_nr = choice; + } else { + if (find_rdev_nr(mddev, rdev->desc_nr)) + return -EBUSY; + } + if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + printk(KERN_WARNING "md: %s: array is limited to %d devices\n", + mdname(mddev), mddev->max_disks); + return -EBUSY; + } + bdevname(rdev->bdev,b); + while ( (s=strchr(b, '/')) != NULL) + *s = '!'; + + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", b); + + if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) + goto fail; + + ko = &part_to_dev(rdev->bdev->bd_part)->kobj; + if (sysfs_create_link(&rdev->kobj, ko, "block")) + /* failure here is OK */; + rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + + list_add_rcu(&rdev->same_set, &mddev->disks); + bd_link_disk_holder(rdev->bdev, mddev->gendisk); + + /* May as well allow recovery to be retried once */ + mddev->recovery_disabled++; + + return 0; + + fail: + printk(KERN_WARNING "md: failed to register dev-%s for %s\n", + b, mdname(mddev)); + return err; +} + +static void md_delayed_delete(struct work_struct *ws) +{ + struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); + kobject_del(&rdev->kobj); + kobject_put(&rdev->kobj); +} + +static void unbind_rdev_from_array(struct md_rdev * rdev) +{ + char b[BDEVNAME_SIZE]; + if (!rdev->mddev) { + MD_BUG(); + return; + } + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); + list_del_rcu(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); + rdev->mddev = NULL; + sysfs_remove_link(&rdev->kobj, "block"); + sysfs_put(rdev->sysfs_state); + rdev->sysfs_state = NULL; + kfree(rdev->badblocks.page); + rdev->badblocks.count = 0; + rdev->badblocks.page = NULL; + /* We need to delay this, otherwise we can deadlock when + * writing to 'remove' to "dev/state". We also need + * to delay it due to rcu usage. + */ + synchronize_rcu(); + INIT_WORK(&rdev->del_work, md_delayed_delete); + kobject_get(&rdev->kobj); + queue_work(md_misc_wq, &rdev->del_work); +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by bd_claiming the device. + */ +static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) +{ + int err = 0; + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? 
(struct md_rdev *)lock_rdev : rdev); + if (IS_ERR(bdev)) { + printk(KERN_ERR "md: could not open %s.\n", + __bdevname(dev, b)); + return PTR_ERR(bdev); + } + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(struct md_rdev *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +void md_autodetect_dev(dev_t dev); + +static void export_rdev(struct md_rdev * rdev) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_INFO "md: export_rdev(%s)\n", + bdevname(rdev->bdev,b)); + if (rdev->mddev) + MD_BUG(); + free_disk_sb(rdev); +#ifndef MODULE + if (test_bit(AutoDetected, &rdev->flags)) + md_autodetect_dev(rdev->bdev->bd_dev); +#endif + unlock_rdev(rdev); + kobject_put(&rdev->kobj); +} + +static void kick_rdev_from_array(struct md_rdev * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(struct mddev *mddev) +{ + struct md_rdev *rdev, *tmp; + + rdev_for_each_safe(rdev, tmp, mddev) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); + mddev->raid_disks = 0; + mddev->major_version = 0; +} + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb_90(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO + "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", + sb->level, sb->size, sb->nr_disks, sb->raid_disks, + sb->md_minor, sb->layout, sb->chunk_size); + printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" + " FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + printk(KERN_INFO); + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || + desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(KERN_INFO "md: THIS: "); + print_desc(&sb->this_disk); +} + +static void print_sb_1(struct mdp_superblock_1 *sb) +{ + __u8 *uuid; + + uuid = sb->set_uuid; + printk(KERN_INFO + "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" + "md: Name: \"%s\" CT:%llu\n", + le32_to_cpu(sb->major_version), + le32_to_cpu(sb->feature_map), + uuid, + sb->set_name, + (unsigned long long)le64_to_cpu(sb->ctime) + & MD_SUPERBLOCK_1_TIME_SEC_MASK); + + uuid = sb->device_uuid; + printk(KERN_INFO + "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" + " RO:%llu\n" + "md: Dev:%08x UUID: %pU\n" + "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" + "md: (MaxDev:%u) \n", + le32_to_cpu(sb->level), + (unsigned long long)le64_to_cpu(sb->size), + le32_to_cpu(sb->raid_disks), + le32_to_cpu(sb->layout), + le32_to_cpu(sb->chunksize), + (unsigned long long)le64_to_cpu(sb->data_offset), + (unsigned long long)le64_to_cpu(sb->data_size), + (unsigned long long)le64_to_cpu(sb->super_offset), + (unsigned long long)le64_to_cpu(sb->recovery_offset), + le32_to_cpu(sb->dev_number), + uuid, + sb->devflags, + (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, + (unsigned long 
long)le64_to_cpu(sb->events), + (unsigned long long)le64_to_cpu(sb->resync_offset), + le32_to_cpu(sb->sb_csum), + le32_to_cpu(sb->max_dev) + ); +} + +static void print_rdev(struct md_rdev *rdev, int major_version) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", + bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, + test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), + rdev->desc_nr); + if (rdev->sb_loaded) { + printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); + switch (major_version) { + case 0: + print_sb_90(page_address(rdev->sb_page)); + break; + case 1: + print_sb_1(page_address(rdev->sb_page)); + break; + } + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +static void md_print_devices(void) +{ + struct list_head *tmp; + struct md_rdev *rdev; + struct mddev *mddev; + char b[BDEVNAME_SIZE]; + + printk("\n"); + printk("md: **********************************\n"); + printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); + printk("md: **********************************\n"); + for_each_mddev(mddev, tmp) { + + if (mddev->bitmap) + bitmap_print_sb(mddev->bitmap); + else + printk("%s: ", mdname(mddev)); + rdev_for_each(rdev, mddev) + printk("<%s>", bdevname(rdev->bdev,b)); + printk("\n"); + + rdev_for_each(rdev, mddev) + print_rdev(rdev, mddev->major_version); + } + printk("md: **********************************\n"); + printk("\n"); +} + + +static void sync_sbs(struct mddev * mddev, int nospares) +{ + /* Update each superblock (in-memory image), but + * if we are allowed to, skip spares which already + * have the right event counter, or have one earlier + * (which would mean they aren't being marked as dirty + * with the rest of the array) + */ + struct md_rdev *rdev; + rdev_for_each(rdev, mddev) { + if (rdev->sb_events == mddev->events || + (nospares && + rdev->raid_disk < 0 && + rdev->sb_events+1 == mddev->events)) { + /* Don't update this superblock */ + rdev->sb_loaded = 2; + } else { + sync_super(mddev, rdev); + rdev->sb_loaded = 1; + } + } +} + +static void md_update_sb(struct mddev * mddev, int force_change) +{ + struct md_rdev *rdev; + int sync_req; + int nospares = 0; + int any_badblocks_changed = 0; + +repeat: + /* First make sure individual recovery_offsets are correct */ + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } + if (!mddev->persistent) { + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_DEVS, &mddev->flags); + if (!mddev->external) { + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) { + rdev->badblocks.changed = 0; + md_ack_all_badblocks(&rdev->badblocks); + md_error(mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } + } + wake_up(&mddev->sb_wait); + return; + } + + spin_lock_irq(&mddev->write_lock); + + mddev->utime = get_seconds(); + + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + /* just a clean<-> dirty transition, possibly leave spares alone, + * though if events isn't the right even/odd, we will have to do + * spares after all + */ + nospares = 1; + if (force_change) + nospares = 0; + if (mddev->degraded) + /* If the array is degraded, 
then skipping spares is both + * dangerous and fairly pointless. + * Dangerous because a device that was removed from the array + * might have a event_count that still looks up-to-date, + * so it can be re-added without a resync. + * Pointless because if there are any spares to skip, + * then a recovery will happen and soon that array won't + * be degraded any more and the spare can go back to sleep then. + */ + nospares = 0; + + sync_req = mddev->in_sync; + + /* If this is just a dirty<->clean transition, and the array is clean + * and 'events' is odd, we can roll back to the previous clean state */ + if (nospares + && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && mddev->can_decrease_events + && mddev->events != 1) { + mddev->events--; + mddev->can_decrease_events = 0; + } else { + /* otherwise we have to go forward and ... */ + mddev->events ++; + mddev->can_decrease_events = nospares; + } + + if (!mddev->events) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->events --; + } + + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) + any_badblocks_changed++; + if (test_bit(Faulty, &rdev->flags)) + set_bit(FaultRecorded, &rdev->flags); + } + + sync_sbs(mddev, nospares); + spin_unlock_irq(&mddev->write_lock); + + pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", + mdname(mddev), mddev->in_sync); + + bitmap_update_sb(mddev->bitmap); + rdev_for_each(rdev, mddev) { + char b[BDEVNAME_SIZE]; + + if (rdev->sb_loaded != 1) + continue; /* no noise on spare devices */ + + if (!test_bit(Faulty, &rdev->flags) && + rdev->saved_raid_disk == -1) { + md_super_write(mddev,rdev, + rdev->sb_start, rdev->sb_size, + rdev->sb_page); + pr_debug("md: (write) %s's sb offset: %llu\n", + bdevname(rdev->bdev, b), + (unsigned long long)rdev->sb_start); + rdev->sb_events = mddev->events; + if (rdev->badblocks.size) { + md_super_write(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page); + rdev->badblocks.size = 0; + } + + } else if (test_bit(Faulty, &rdev->flags)) + pr_debug("md: %s (skipping faulty)\n", + bdevname(rdev->bdev, b)); + else + pr_debug("(skipping incremental s/r "); + + if (mddev->level == LEVEL_MULTIPATH) + /* only need to write one superblock... */ + break; + } + md_super_wait(mddev); + /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ + + spin_lock_irq(&mddev->write_lock); + if (mddev->in_sync != sync_req || + test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + /* have to write it out again */ + spin_unlock_irq(&mddev->write_lock); + goto repeat; + } + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + spin_unlock_irq(&mddev->write_lock); + wake_up(&mddev->sb_wait); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(FaultRecorded, &rdev->flags)) + clear_bit(Blocked, &rdev->flags); + + if (any_badblocks_changed) + md_ack_all_badblocks(&rdev->badblocks); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } +} + +/* words written to sysfs files may, or may not, be \n terminated. + * We want to accept with case. For this we use cmd_match. + */ +static int cmd_match(const char *cmd, const char *str) +{ + /* See if cmd, written into a sysfs file, matches + * str. 
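+	 * For example "idle\n" and "idle" both match "idle", while
+	 * "idle2" and "idl" do not.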
They must either be the same, or cmd can + * have a trailing newline + */ + while (*cmd && *str && *cmd == *str) { + cmd++; + str++; + } + if (*cmd == '\n') + cmd++; + if (*str || *cmd) + return 0; + return 1; +} + +struct rdev_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct md_rdev *, char *); + ssize_t (*store)(struct md_rdev *, const char *, size_t); +}; + +static ssize_t +state_show(struct md_rdev *rdev, char *page) +{ + char *sep = ""; + size_t len = 0; + + if (test_bit(Faulty, &rdev->flags) || + rdev->badblocks.unacked_exist) { + len+= sprintf(page+len, "%sfaulty",sep); + sep = ","; + } + if (test_bit(In_sync, &rdev->flags)) { + len += sprintf(page+len, "%sin_sync",sep); + sep = ","; + } + if (test_bit(WriteMostly, &rdev->flags)) { + len += sprintf(page+len, "%swrite_mostly",sep); + sep = ","; + } + if (test_bit(Blocked, &rdev->flags) || + (rdev->badblocks.unacked_exist + && !test_bit(Faulty, &rdev->flags))) { + len += sprintf(page+len, "%sblocked", sep); + sep = ","; + } + if (!test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) { + len += sprintf(page+len, "%sspare", sep); + sep = ","; + } + if (test_bit(WriteErrorSeen, &rdev->flags)) { + len += sprintf(page+len, "%swrite_error", sep); + sep = ","; + } + if (test_bit(WantReplacement, &rdev->flags)) { + len += sprintf(page+len, "%swant_replacement", sep); + sep = ","; + } + if (test_bit(Replacement, &rdev->flags)) { + len += sprintf(page+len, "%sreplacement", sep); + sep = ","; + } + + return len+sprintf(page+len, "\n"); +} + +static ssize_t +state_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + /* can write + * faulty - simulates an error + * remove - disconnects the device + * writemostly - sets write_mostly + * -writemostly - clears write_mostly + * blocked - sets the Blocked flags + * -blocked - clears the Blocked and possibly simulates an error + * insync - sets Insync providing device isn't active + * write_error - sets WriteErrorSeen + * -write_error - clears WriteErrorSeen + */ + int err = -EINVAL; + if (cmd_match(buf, "faulty") && rdev->mddev->pers) { + md_error(rdev->mddev, rdev); + if (test_bit(Faulty, &rdev->flags)) + err = 0; + else + err = -EBUSY; + } else if (cmd_match(buf, "remove")) { + if (rdev->raid_disk >= 0) + err = -EBUSY; + else { + struct mddev *mddev = rdev->mddev; + kick_rdev_from_array(rdev); + if (mddev->pers) + md_update_sb(mddev, 1); + md_new_event(mddev); + err = 0; + } + } else if (cmd_match(buf, "writemostly")) { + set_bit(WriteMostly, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-writemostly")) { + clear_bit(WriteMostly, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "blocked")) { + set_bit(Blocked, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-blocked")) { + if (!test_bit(Faulty, &rdev->flags) && + rdev->badblocks.unacked_exist) { + /* metadata handler doesn't understand badblocks, + * so we need to fail the device + */ + md_error(rdev->mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + + err = 0; + } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { + set_bit(In_sync, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "write_error")) { + set_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-write_error")) { + clear_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "want_replacement")) 
{ + /* Any non-spare device that is not a replacement can + * become want_replacement at any time, but we then need to + * check if recovery is needed. + */ + if (rdev->raid_disk >= 0 && + !test_bit(Replacement, &rdev->flags)) + set_bit(WantReplacement, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + err = 0; + } else if (cmd_match(buf, "-want_replacement")) { + /* Clearing 'want_replacement' is always allowed. + * Once replacements starts it is too late though. + */ + err = 0; + clear_bit(WantReplacement, &rdev->flags); + } else if (cmd_match(buf, "replacement")) { + /* Can only set a device as a replacement when array has not + * yet been started. Once running, replacement is automatic + * from spares, or by assigning 'slot'. + */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + set_bit(Replacement, &rdev->flags); + err = 0; + } + } else if (cmd_match(buf, "-replacement")) { + /* Similarly, can only clear Replacement before start */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + clear_bit(Replacement, &rdev->flags); + err = 0; + } + } + if (!err) + sysfs_notify_dirent_safe(rdev->sysfs_state); + return err ? err : len; +} +static struct rdev_sysfs_entry rdev_state = +__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); + +static ssize_t +errors_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); +} + +static ssize_t +errors_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + if (*buf && (*e == 0 || *e == '\n')) { + atomic_set(&rdev->corrected_errors, n); + return len; + } + return -EINVAL; +} +static struct rdev_sysfs_entry rdev_errors = +__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); + +static ssize_t +slot_show(struct md_rdev *rdev, char *page) +{ + if (rdev->raid_disk < 0) + return sprintf(page, "none\n"); + else + return sprintf(page, "%d\n", rdev->raid_disk); +} + +static ssize_t +slot_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + char *e; + int err; + int slot = simple_strtoul(buf, &e, 10); + if (strncmp(buf, "none", 4)==0) + slot = -1; + else if (e==buf || (*e && *e!= '\n')) + return -EINVAL; + if (rdev->mddev->pers && slot == -1) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. + * For now we only support removing + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ + if (rdev->mddev->pers->hot_remove_disk == NULL) + return -EINVAL; + err = rdev->mddev->pers-> + hot_remove_disk(rdev->mddev, rdev); + if (err) + return err; + sysfs_unlink_rdev(rdev->mddev, rdev); + rdev->raid_disk = -1; + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + } else if (rdev->mddev->pers) { + /* Activating a spare .. or possibly reactivating + * if we ever get bitmaps working here. 
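+		 * The requested slot must fall inside the current or
+		 * post-reshape geometry (hence the -ENOSPC check below) and
+		 * the array must not be in the middle of a recovery.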
+		 */
+
+		if (rdev->raid_disk != -1)
+			return -EBUSY;
+
+		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
+			return -EBUSY;
+
+		if (rdev->mddev->pers->hot_add_disk == NULL)
+			return -EINVAL;
+
+		if (slot >= rdev->mddev->raid_disks &&
+		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+			return -ENOSPC;
+
+		rdev->raid_disk = slot;
+		if (test_bit(In_sync, &rdev->flags))
+			rdev->saved_raid_disk = slot;
+		else
+			rdev->saved_raid_disk = -1;
+		clear_bit(In_sync, &rdev->flags);
+		err = rdev->mddev->pers->
+			hot_add_disk(rdev->mddev, rdev);
+		if (err) {
+			rdev->raid_disk = -1;
+			return err;
+		} else
+			sysfs_notify_dirent_safe(rdev->sysfs_state);
+		if (sysfs_link_rdev(rdev->mddev, rdev))
+			/* failure here is OK */;
+		/* don't wakeup anyone, leave that to userspace. */
+	} else {
+		if (slot >= rdev->mddev->raid_disks &&
+		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+			return -ENOSPC;
+		rdev->raid_disk = slot;
+		/* assume it is working */
+		clear_bit(Faulty, &rdev->flags);
+		clear_bit(WriteMostly, &rdev->flags);
+		set_bit(In_sync, &rdev->flags);
+		sysfs_notify_dirent_safe(rdev->sysfs_state);
+	}
+	return len;
+}
+
+
+static struct rdev_sysfs_entry rdev_slot =
+__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
+
+static ssize_t
+offset_show(struct md_rdev *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
+}
+
+static ssize_t
+offset_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long offset = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers && rdev->raid_disk >= 0)
+		return -EBUSY;
+	if (rdev->sectors && rdev->mddev->external)
+		/* Must set offset before size, so overlap checks
+		 * can be sane */
+		return -EBUSY;
+	rdev->data_offset = offset;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_offset =
+__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
+
+static ssize_t
+rdev_size_show(struct md_rdev *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
+}
+
+static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
+{
+	/* check if two start/length pairs overlap */
+	if (s1+l1 <= s2)
+		return 0;
+	if (s2+l2 <= s1)
+		return 0;
+	return 1;
+}
+
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+	unsigned long long blocks;
+	sector_t new;
+
+	if (strict_strtoull(buf, 10, &blocks) < 0)
+		return -EINVAL;
+
+	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+		return -EINVAL; /* sector conversion overflow */
+
+	new = blocks * 2;
+	if (new != blocks * 2)
+		return -EINVAL; /* unsigned long long to sector_t overflow */
+
+	*sectors = new;
+	return 0;
+}
+
+static ssize_t
+rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	struct mddev *my_mddev = rdev->mddev;
+	sector_t oldsectors = rdev->sectors;
+	sector_t sectors;
+
+	if (strict_blocks_to_sectors(buf, &sectors) < 0)
+		return -EINVAL;
+	if (my_mddev->pers && rdev->raid_disk >= 0) {
+		if (my_mddev->persistent) {
+			sectors = super_types[my_mddev->major_version].
+ rdev_size_change(rdev, sectors); + if (!sectors) + return -EBUSY; + } else if (!sectors) + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - + rdev->data_offset; + } + if (sectors < my_mddev->dev_sectors) + return -EINVAL; /* component must fit device */ + + rdev->sectors = sectors; + if (sectors > oldsectors && my_mddev->external) { + /* need to check that all other rdevs with the same ->bdev + * do not overlap. We need to unlock the mddev to avoid + * a deadlock. We have already changed rdev->sectors, and if + * we have to change it back, we will have the lock again. + */ + struct mddev *mddev; + int overlap = 0; + struct list_head *tmp; + + mddev_unlock(my_mddev); + for_each_mddev(mddev, tmp) { + struct md_rdev *rdev2; + + mddev_lock(mddev); + rdev_for_each(rdev2, mddev) + if (rdev->bdev == rdev2->bdev && + rdev != rdev2 && + overlaps(rdev->data_offset, rdev->sectors, + rdev2->data_offset, + rdev2->sectors)) { + overlap = 1; + break; + } + mddev_unlock(mddev); + if (overlap) { + mddev_put(mddev); + break; + } + } + mddev_lock(my_mddev); + if (overlap) { + /* Someone else could have slipped in a size + * change here, but doing so is just silly. + * We put oldsectors back because we *know* it is + * safe, and trust userspace not to race with + * itself + */ + rdev->sectors = oldsectors; + return -EBUSY; + } + } + return len; +} + +static struct rdev_sysfs_entry rdev_size = +__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); + + +static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) +{ + unsigned long long recovery_start = rdev->recovery_offset; + + if (test_bit(In_sync, &rdev->flags) || + recovery_start == MaxSector) + return sprintf(page, "none\n"); + + return sprintf(page, "%llu\n", recovery_start); +} + +static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long recovery_start; + + if (cmd_match(buf, "none")) + recovery_start = MaxSector; + else if (strict_strtoull(buf, 10, &recovery_start)) + return -EINVAL; + + if (rdev->mddev->pers && + rdev->raid_disk >= 0) + return -EBUSY; + + rdev->recovery_offset = recovery_start; + if (recovery_start == MaxSector) + set_bit(In_sync, &rdev->flags); + else + clear_bit(In_sync, &rdev->flags); + return len; +} + +static struct rdev_sysfs_entry rdev_recovery_start = +__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack); +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); + +static ssize_t bb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 0); +} +static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + int rv = badblocks_store(&rdev->badblocks, page, len, 0); + /* Maybe that ack was all we needed */ + if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) + wake_up(&rdev->blocked_wait); + return rv; +} +static struct rdev_sysfs_entry rdev_bad_blocks = +__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); + + +static ssize_t ubb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 1); +} +static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + return badblocks_store(&rdev->badblocks, page, len, 1); +} +static struct rdev_sysfs_entry rdev_unack_bad_blocks = +__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); + +static struct attribute *rdev_default_attrs[] = { + 
&rdev_state.attr, + &rdev_errors.attr, + &rdev_slot.attr, + &rdev_offset.attr, + &rdev_size.attr, + &rdev_recovery_start.attr, + &rdev_bad_blocks.attr, + &rdev_unack_bad_blocks.attr, + NULL, +}; +static ssize_t +rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct mddev *mddev = rdev->mddev; + ssize_t rv; + + if (!entry->show) + return -EIO; + + rv = mddev ? mddev_lock(mddev) : -EBUSY; + if (!rv) { + if (rdev->mddev == NULL) + rv = -EBUSY; + else + rv = entry->show(rdev, page); + mddev_unlock(mddev); + } + return rv; +} + +static ssize_t +rdev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + ssize_t rv; + struct mddev *mddev = rdev->mddev; + + if (!entry->store) + return -EIO; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + rv = mddev ? mddev_lock(mddev): -EBUSY; + if (!rv) { + if (rdev->mddev == NULL) + rv = -EBUSY; + else + rv = entry->store(rdev, page, length); + mddev_unlock(mddev); + } + return rv; +} + +static void rdev_free(struct kobject *ko) +{ + struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); + kfree(rdev); +} +static const struct sysfs_ops rdev_sysfs_ops = { + .show = rdev_attr_show, + .store = rdev_attr_store, +}; +static struct kobj_type rdev_ktype = { + .release = rdev_free, + .sysfs_ops = &rdev_sysfs_ops, + .default_attrs = rdev_default_attrs, +}; + +int md_rdev_init(struct md_rdev *rdev) +{ + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; + rdev->last_read_error.tv_sec = 0; + rdev->last_read_error.tv_nsec = 0; + rdev->sb_loaded = 0; + rdev->bb_page = NULL; + atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); + atomic_set(&rdev->corrected_errors, 0); + + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); + + /* Add space to store bad block list. + * This reserves the space even on arrays where it cannot + * be used - I wonder if that matters + */ + rdev->badblocks.count = 0; + rdev->badblocks.shift = 0; + rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); + seqlock_init(&rdev->badblocks.lock); + if (rdev->badblocks.page == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(md_rdev_init); +/* + * Import a device. If 'super_format' >= 0, then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. 
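+ *
+ * super_format -1 means a 0.90 hot-add or a non-persistent device: no
+ * superblock is read and bad-block tracking is disabled.  super_format -2
+ * likewise reads no superblock, and makes lock_rdev() claim the device
+ * with a holder common to md rather than this particular rdev.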
+ */ +static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) +{ + char b[BDEVNAME_SIZE]; + int err; + struct md_rdev *rdev; + sector_t size; + + rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for new device!\n"); + return ERR_PTR(-ENOMEM); + } + + err = md_rdev_init(rdev); + if (err) + goto abort_free; + err = alloc_disk_sb(rdev); + if (err) + goto abort_free; + + err = lock_rdev(rdev, newdev, super_format == -2); + if (err) + goto abort_free; + + kobject_init(&rdev->kobj, &rdev_ktype); + + size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; + if (!size) { + printk(KERN_WARNING + "md: %s has zero or unknown size, marking faulty!\n", + bdevname(rdev->bdev,b)); + err = -EINVAL; + goto abort_free; + } + + if (super_format >= 0) { + err = super_types[super_format]. + load_super(rdev, NULL, super_minor); + if (err == -EINVAL) { + printk(KERN_WARNING + "md: %s does not have a valid v%d.%d " + "superblock, not importing!\n", + bdevname(rdev->bdev,b), + super_format, super_minor); + goto abort_free; + } + if (err < 0) { + printk(KERN_WARNING + "md: could not read %s's sb, not importing!\n", + bdevname(rdev->bdev,b)); + goto abort_free; + } + } + if (super_format == -1) + /* hot-add for 0.90, or non-persistent: so no badblocks */ + rdev->badblocks.shift = -1; + + return rdev; + +abort_free: + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + kfree(rdev->badblocks.page); + kfree(rdev); + return ERR_PTR(err); +} + +/* + * Check a full RAID array for plausibility + */ + + +static void analyze_sbs(struct mddev * mddev) +{ + int i; + struct md_rdev *rdev, *freshest, *tmp; + char b[BDEVNAME_SIZE]; + + freshest = NULL; + rdev_for_each_safe(rdev, tmp, mddev) + switch (super_types[mddev->major_version]. + load_super(rdev, freshest, mddev->minor_version)) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + printk( KERN_ERR \ + "md: fatal superblock inconsistency in %s" + " -- removing from array\n", + bdevname(rdev->bdev,b)); + kick_rdev_from_array(rdev); + } + + + super_types[mddev->major_version]. + validate_super(mddev, freshest); + + i = 0; + rdev_for_each_safe(rdev, tmp, mddev) { + if (mddev->max_disks && + (rdev->desc_nr >= mddev->max_disks || + i > mddev->max_disks)) { + printk(KERN_WARNING + "md: %s: %s: only %d devices permitted\n", + mdname(mddev), bdevname(rdev->bdev, b), + mddev->max_disks); + kick_rdev_from_array(rdev); + continue; + } + if (rdev != freshest) + if (super_types[mddev->major_version]. + validate_super(mddev, rdev)) { + printk(KERN_WARNING "md: kicking non-fresh %s" + " from array!\n", + bdevname(rdev->bdev,b)); + kick_rdev_from_array(rdev); + continue; + } + if (mddev->level == LEVEL_MULTIPATH) { + rdev->desc_nr = i++; + rdev->raid_disk = rdev->desc_nr; + set_bit(In_sync, &rdev->flags); + } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { + rdev->raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + } + } +} + +/* Read a fixed-point number. + * Numbers in sysfs attributes should be in "standard" units where + * possible, so time should be in seconds. + * However we internally use a a much smaller unit such as + * milliseconds or jiffies. + * This function takes a decimal number with a possible fractional + * component, and produces an integer which is the result of + * multiplying that number by 10^'scale'. + * all without any floating-point arithmetic. 
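+ *
+ * For example, parsing "0.200" with scale=3 yields 200; this is how
+ * safe_delay_store() below converts a value given in seconds into
+ * milliseconds.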
+ */ +int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) +{ + unsigned long result = 0; + long decimals = -1; + while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { + if (*cp == '.') + decimals = 0; + else if (decimals < scale) { + unsigned int value; + value = *cp - '0'; + result = result * 10 + value; + if (decimals >= 0) + decimals++; + } + cp++; + } + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + if (decimals < 0) + decimals = 0; + while (decimals < scale) { + result *= 10; + decimals ++; + } + *res = result; + return 0; +} + + +static void md_safemode_timeout(unsigned long data); + +static ssize_t +safe_delay_show(struct mddev *mddev, char *page) +{ + int msec = (mddev->safemode_delay*1000)/HZ; + return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); +} +static ssize_t +safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) +{ + unsigned long msec; + + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) + return -EINVAL; + if (msec == 0) + mddev->safemode_delay = 0; + else { + unsigned long old_delay = mddev->safemode_delay; + mddev->safemode_delay = (msec*HZ)/1000; + if (mddev->safemode_delay == 0) + mddev->safemode_delay = 1; + if (mddev->safemode_delay < old_delay) + md_safemode_timeout((unsigned long)mddev); + } + return len; +} +static struct md_sysfs_entry md_safe_delay = +__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); + +static ssize_t +level_show(struct mddev *mddev, char *page) +{ + struct md_personality *p = mddev->pers; + if (p) + return sprintf(page, "%s\n", p->name); + else if (mddev->clevel[0]) + return sprintf(page, "%s\n", mddev->clevel); + else if (mddev->level != LEVEL_NONE) + return sprintf(page, "%d\n", mddev->level); + else + return 0; +} + +static ssize_t +level_store(struct mddev *mddev, const char *buf, size_t len) +{ + char clevel[16]; + ssize_t rv = len; + struct md_personality *pers; + long level; + void *priv; + struct md_rdev *rdev; + + if (mddev->pers == NULL) { + if (len == 0) + return 0; + if (len >= sizeof(mddev->clevel)) + return -ENOSPC; + strncpy(mddev->clevel, buf, len); + if (mddev->clevel[len-1] == '\n') + len--; + mddev->clevel[len] = 0; + mddev->level = LEVEL_NONE; + return rv; + } + + /* request to change the personality. Need to ensure: + * - array is not engaged in resync/recovery/reshape + * - old personality can be suspended + * - new personality will access other array. + */ + + if (mddev->sync_thread || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) + return -EBUSY; + + if (!mddev->pers->quiesce) { + printk(KERN_WARNING "md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); + return -EINVAL; + } + + /* Now find the new personality */ + if (len == 0 || len >= sizeof(clevel)) + return -EINVAL; + strncpy(clevel, buf, len); + if (clevel[len-1] == '\n') + len--; + clevel[len] = 0; + if (strict_strtol(clevel, 10, &level)) + level = LEVEL_NONE; + + if (request_module("md-%s", clevel) != 0) + request_module("md-level-%s", clevel); + spin_lock(&pers_lock); + pers = find_pers(level, clevel); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + printk(KERN_WARNING "md: personality %s not loaded\n", clevel); + return -EINVAL; + } + spin_unlock(&pers_lock); + + if (pers == mddev->pers) { + /* Nothing to do! 
*/ + module_put(pers->owner); + return rv; + } + if (!pers->takeover) { + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", + mdname(mddev), clevel); + return -EINVAL; + } + + rdev_for_each(rdev, mddev) + rdev->new_raid_disk = rdev->raid_disk; + + /* ->takeover must set new_* and/or delta_disks + * if it succeeds, and may set them when it fails. + */ + priv = pers->takeover(mddev); + if (IS_ERR(priv)) { + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks -= mddev->delta_disks; + mddev->delta_disks = 0; + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s would not accept array\n", + mdname(mddev), clevel); + return PTR_ERR(priv); + } + + /* Looks like we have a winner */ + mddev_suspend(mddev); + mddev->pers->stop(mddev); + + if (mddev->pers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); + } + if (mddev->pers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + + if (mddev->pers->sync_request == NULL && + mddev->external) { + /* We are converting from a no-redundancy array + * to a redundancy array and metadata is managed + * externally so we need to be sure that writes + * won't block due to a need to transition + * clean->dirty + * until external management is started. + */ + mddev->in_sync = 0; + mddev->safemode_delay = 0; + mddev->safemode = 0; + } + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk >= mddev->raid_disks) + rdev->new_raid_disk = -1; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + sysfs_unlink_rdev(mddev, rdev); + } + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + rdev->raid_disk = rdev->new_raid_disk; + if (rdev->raid_disk < 0) + clear_bit(In_sync, &rdev->flags); + else { + if (sysfs_link_rdev(mddev, rdev)) + printk(KERN_WARNING "md: cannot register rd%d" + " for %s after level change\n", + rdev->raid_disk, mdname(mddev)); + } + } + + module_put(mddev->pers->owner); + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->degraded = 0; + if (mddev->pers->sync_request == NULL) { + /* this is now an array without redundancy, so + * it must always be in_sync + */ + mddev->in_sync = 1; + del_timer_sync(&mddev->safemode_timer); + } + pers->run(mddev); + mddev_resume(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + sysfs_notify(&mddev->kobj, NULL, "level"); + md_new_event(mddev); + return rv; +} + +static struct md_sysfs_entry md_level = +__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); + + +static ssize_t +layout_show(struct mddev *mddev, char *page) +{ + /* just a number, not meaningful for all levels */ + if (mddev->reshape_position != MaxSector && + mddev->layout != 
mddev->new_layout) + return sprintf(page, "%d (%d)\n", + mddev->new_layout, mddev->layout); + return sprintf(page, "%d\n", mddev->layout); +} + +static ssize_t +layout_store(struct mddev *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + if (mddev->pers) { + int err; + if (mddev->pers->check_reshape == NULL) + return -EBUSY; + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_layout = mddev->layout; + return err; + } + } else { + mddev->new_layout = n; + if (mddev->reshape_position == MaxSector) + mddev->layout = n; + } + return len; +} +static struct md_sysfs_entry md_layout = +__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); + + +static ssize_t +raid_disks_show(struct mddev *mddev, char *page) +{ + if (mddev->raid_disks == 0) + return 0; + if (mddev->reshape_position != MaxSector && + mddev->delta_disks != 0) + return sprintf(page, "%d (%d)\n", mddev->raid_disks, + mddev->raid_disks - mddev->delta_disks); + return sprintf(page, "%d\n", mddev->raid_disks); +} + +static int update_raid_disks(struct mddev *mddev, int raid_disks); + +static ssize_t +raid_disks_store(struct mddev *mddev, const char *buf, size_t len) +{ + char *e; + int rv = 0; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + if (mddev->pers) + rv = update_raid_disks(mddev, n); + else if (mddev->reshape_position != MaxSector) { + int olddisks = mddev->raid_disks - mddev->delta_disks; + mddev->delta_disks = n - olddisks; + mddev->raid_disks = n; + } else + mddev->raid_disks = n; + return rv ? rv : len; +} +static struct md_sysfs_entry md_raid_disks = +__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); + +static ssize_t +chunk_size_show(struct mddev *mddev, char *page) +{ + if (mddev->reshape_position != MaxSector && + mddev->chunk_sectors != mddev->new_chunk_sectors) + return sprintf(page, "%d (%d)\n", + mddev->new_chunk_sectors << 9, + mddev->chunk_sectors << 9); + return sprintf(page, "%d\n", mddev->chunk_sectors << 9); +} + +static ssize_t +chunk_size_store(struct mddev *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + if (mddev->pers) { + int err; + if (mddev->pers->check_reshape == NULL) + return -EBUSY; + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_chunk_sectors = mddev->chunk_sectors; + return err; + } + } else { + mddev->new_chunk_sectors = n >> 9; + if (mddev->reshape_position == MaxSector) + mddev->chunk_sectors = n >> 9; + } + return len; +} +static struct md_sysfs_entry md_chunk_size = +__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); + +static ssize_t +resync_start_show(struct mddev *mddev, char *page) +{ + if (mddev->recovery_cp == MaxSector) + return sprintf(page, "none\n"); + return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); +} + +static ssize_t +resync_start_store(struct mddev *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long n = simple_strtoull(buf, &e, 10); + + if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + return -EBUSY; + if (cmd_match(buf, "none")) + n = MaxSector; + else if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + mddev->recovery_cp = n; + return len; +} +static struct md_sysfs_entry md_resync_start = 
+__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); + +/* + * The array state can be: + * + * clear + * No devices, no size, no level + * Equivalent to STOP_ARRAY ioctl + * inactive + * May have some settings, but array is not active + * all IO results in error + * When written, doesn't tear down array, but just stops it + * suspended (not supported yet) + * All IO requests will block. The array can be reconfigured. + * Writing this, if accepted, will block until array is quiescent + * readonly + * no resync can happen. no superblocks get written. + * write requests fail + * read-auto + * like readonly, but behaves like 'clean' on a write request. + * + * clean - no pending writes, but otherwise active. + * When written to inactive array, starts without resync + * If a write request arrives then + * if metadata is known, mark 'dirty' and switch to 'active'. + * if not known, block and switch to write-pending + * If written to an active array that has pending writes, then fails. + * active + * fully active: IO and resync can be happening. + * When written to inactive array, starts with resync + * + * write-pending + * clean, but writes are blocked waiting for 'active' to be written. + * + * active-idle + * like active, but no writes have been seen for a while (100msec). + * + */ +enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, + write_pending, active_idle, bad_word}; +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", + "write-pending", "active-idle", NULL }; + +static int match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (cmd_match(word, list[n])) + break; + return n; +} + +static ssize_t +array_state_show(struct mddev *mddev, char *page) +{ + enum array_state st = inactive; + + if (mddev->pers) + switch(mddev->ro) { + case 1: + st = readonly; + break; + case 2: + st = read_auto; + break; + case 0: + if (mddev->in_sync) + st = clean; + else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) + st = write_pending; + else if (mddev->safemode) + st = active_idle; + else + st = active; + } + else { + if (list_empty(&mddev->disks) && + mddev->raid_disks == 0 && + mddev->dev_sectors == 0) + st = clear; + else + st = inactive; + } + return sprintf(page, "%s\n", array_states[st]); +} + +static int do_md_stop(struct mddev * mddev, int ro, int is_open); +static int md_set_readonly(struct mddev * mddev, int is_open); +static int do_md_run(struct mddev * mddev); +static int restart_array(struct mddev *mddev); + +static ssize_t +array_state_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err = -EINVAL; + enum array_state st = match_word(buf, array_states); + switch(st) { + case bad_word: + break; + case clear: + /* stopping an active array */ + if (atomic_read(&mddev->openers) > 0) + return -EBUSY; + err = do_md_stop(mddev, 0, 0); + break; + case inactive: + /* stopping an active array */ + if (mddev->pers) { + if (atomic_read(&mddev->openers) > 0) + return -EBUSY; + err = do_md_stop(mddev, 2, 0); + } else + err = 0; /* already inactive */ + break; + case suspended: + break; /* not supported yet */ + case readonly: + if (mddev->pers) + err = md_set_readonly(mddev, 0); + else { + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + err = do_md_run(mddev); + } + break; + case read_auto: + if (mddev->pers) { + if (mddev->ro == 0) + err = md_set_readonly(mddev, 0); + else if (mddev->ro == 1) + err = restart_array(mddev); + if (err == 0) { + 
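+				/* ro == 2 is "auto-read-only": the block
+				 * device stays writable and md switches to
+				 * read-write on the first write request. */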
mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); + } + } else { + mddev->ro = 2; + err = do_md_run(mddev); + } + break; + case clean: + if (mddev->pers) { + restart_array(mddev); + spin_lock_irq(&mddev->write_lock); + if (atomic_read(&mddev->writes_pending) == 0) { + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + if (mddev->safemode == 1) + mddev->safemode = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + err = 0; + } else + err = -EBUSY; + spin_unlock_irq(&mddev->write_lock); + } else + err = -EINVAL; + break; + case active: + if (mddev->pers) { + restart_array(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + wake_up(&mddev->sb_wait); + err = 0; + } else { + mddev->ro = 0; + set_disk_ro(mddev->gendisk, 0); + err = do_md_run(mddev); + } + break; + case write_pending: + case active_idle: + /* these cannot be set */ + break; + } + if (err) + return err; + else { + if (mddev->hold_active == UNTIL_IOCTL) + mddev->hold_active = 0; + sysfs_notify_dirent_safe(mddev->sysfs_state); + return len; + } +} +static struct md_sysfs_entry md_array_state = +__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); + +static ssize_t +max_corrected_read_errors_show(struct mddev *mddev, char *page) { + return sprintf(page, "%d\n", + atomic_read(&mddev->max_corr_read_errors)); +} + +static ssize_t +max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (*buf && (*e == 0 || *e == '\n')) { + atomic_set(&mddev->max_corr_read_errors, n); + return len; + } + return -EINVAL; +} + +static struct md_sysfs_entry max_corr_read_errors = +__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, + max_corrected_read_errors_store); + +static ssize_t +null_show(struct mddev *mddev, char *page) +{ + return -EINVAL; +} + +static ssize_t +new_dev_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* buf must be %d:%d\n? giving major and minor numbers */ + /* The new device is added to the array. + * If the array has a persistent superblock, we read the + * superblock to initialise info and check validity. + * Otherwise, only checking done is that in bind_rdev_to_array, + * which mainly checks size. + */ + char *e; + int major = simple_strtoul(buf, &e, 10); + int minor; + dev_t dev; + struct md_rdev *rdev; + int err; + + if (!*buf || *e != ':' || !e[1] || e[1] == '\n') + return -EINVAL; + minor = simple_strtoul(e+1, &e, 10); + if (*e && *e != '\n') + return -EINVAL; + dev = MKDEV(major, minor); + if (major != MAJOR(dev) || + minor != MINOR(dev)) + return -EOVERFLOW; + + + if (mddev->persistent) { + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); + err = super_types[mddev->major_version] + .load_super(rdev, rdev0, mddev->minor_version); + if (err < 0) + goto out; + } + } else if (mddev->external) + rdev = md_import_device(dev, -2, -1); + else + rdev = md_import_device(dev, -1, -1); + + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + err = bind_rdev_to_array(rdev, mddev); + out: + if (err) + export_rdev(rdev); + return err ? 
err : len;
+}
+
+static struct md_sysfs_entry md_new_device =
+__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
+
+static ssize_t
+bitmap_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	char *end;
+	unsigned long chunk, end_chunk;
+
+	if (!mddev->bitmap)
+		goto out;
+	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
+	while (*buf) {
+		chunk = end_chunk = simple_strtoul(buf, &end, 0);
+		if (buf == end) break;
+		if (*end == '-') { /* range */
+			buf = end + 1;
+			end_chunk = simple_strtoul(buf, &end, 0);
+			if (buf == end) break;
+		}
+		if (*end && !isspace(*end)) break;
+		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
+		buf = skip_spaces(end);
+	}
+	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
+out:
+	return len;
+}
+
+static struct md_sysfs_entry md_bitmap =
+__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
+
+static ssize_t
+size_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		(unsigned long long)mddev->dev_sectors / 2);
+}
+
+static int update_size(struct mddev *mddev, sector_t num_sectors);
+
+static ssize_t
+size_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	/* If array is inactive, we can reduce the component size, but
+	 * not increase it (except from 0).
+	 * If array is active, we can try an on-line resize
+	 */
+	sector_t sectors;
+	int err = strict_blocks_to_sectors(buf, &sectors);
+
+	if (err < 0)
+		return err;
+	if (mddev->pers) {
+		err = update_size(mddev, sectors);
+		md_update_sb(mddev, 1);
+	} else {
+		if (mddev->dev_sectors == 0 ||
+		    mddev->dev_sectors > sectors)
+			mddev->dev_sectors = sectors;
+		else
+			err = -ENOSPC;
+	}
+	return err ? err : len;
+}
+
+static struct md_sysfs_entry md_size =
+__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
+
+
+/* Metadata version.
+ * This is one of
+ *   'none' for arrays with no metadata (good luck...)
+ *   'external' for arrays with externally managed metadata,
+ * or N.M for internally known formats
+ */
+static ssize_t
+metadata_show(struct mddev *mddev, char *page)
+{
+	if (mddev->persistent)
+		return sprintf(page, "%d.%d\n",
+			       mddev->major_version, mddev->minor_version);
+	else if (mddev->external)
+		return sprintf(page, "external:%s\n", mddev->metadata_type);
+	else
+		return sprintf(page, "none\n");
+}
+
+static ssize_t
+metadata_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	int major, minor;
+	char *e;
+	/* Changing the details of 'external' metadata is
+	 * always permitted.  Otherwise there must be
+	 * no devices attached to the array.
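+ *
+ * Accepted values are "none", "external:<type>", or an internal
+ * major.minor version string such as "0.90" or "1.2".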
+ */ + if (mddev->external && strncmp(buf, "external:", 9) == 0) + ; + else if (!list_empty(&mddev->disks)) + return -EBUSY; + + if (cmd_match(buf, "none")) { + mddev->persistent = 0; + mddev->external = 0; + mddev->major_version = 0; + mddev->minor_version = 90; + return len; + } + if (strncmp(buf, "external:", 9) == 0) { + size_t namelen = len-9; + if (namelen >= sizeof(mddev->metadata_type)) + namelen = sizeof(mddev->metadata_type)-1; + strncpy(mddev->metadata_type, buf+9, namelen); + mddev->metadata_type[namelen] = 0; + if (namelen && mddev->metadata_type[namelen-1] == '\n') + mddev->metadata_type[--namelen] = 0; + mddev->persistent = 0; + mddev->external = 1; + mddev->major_version = 0; + mddev->minor_version = 90; + return len; + } + major = simple_strtoul(buf, &e, 10); + if (e==buf || *e != '.') + return -EINVAL; + buf = e+1; + minor = simple_strtoul(buf, &e, 10); + if (e==buf || (*e && *e != '\n') ) + return -EINVAL; + if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) + return -ENOENT; + mddev->major_version = major; + mddev->minor_version = minor; + mddev->persistent = 1; + mddev->external = 0; + return len; +} + +static struct md_sysfs_entry md_metadata = +__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static ssize_t +action_show(struct mddev *mddev, char *page) +{ + char *type = "idle"; + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + type = "frozen"; + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + type = "reshape"; + else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + type = "resync"; + else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + type = "check"; + else + type = "repair"; + } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + type = "recover"; + } + return sprintf(page, "%s\n", type); +} + +static void reap_sync_thread(struct mddev *mddev); + +static ssize_t +action_store(struct mddev *mddev, const char *page, size_t len) +{ + if (!mddev->pers || !mddev->pers->sync_request) + return -EINVAL; + + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + reap_sync_thread(mddev); + } + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return -EBUSY; + else if (cmd_match(page, "resync")) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + else if (cmd_match(page, "recover")) { + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } else if (cmd_match(page, "reshape")) { + int err; + if (mddev->pers->start_reshape == NULL) + return -EINVAL; + err = mddev->pers->start_reshape(mddev); + if (err) + return err; + sysfs_notify(&mddev->kobj, NULL, "degraded"); + } else { + if (cmd_match(page, "check")) + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + else if (!cmd_match(page, "repair")) + return -EINVAL; + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); + return len; +} + +static ssize_t 
+mismatch_cnt_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long) mddev->resync_mismatches); +} + +static struct md_sysfs_entry md_scan_mode = +__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); + + +static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); + +static ssize_t +sync_min_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", speed_min(mddev), + mddev->sync_speed_min ? "local": "system"); +} + +static ssize_t +sync_min_store(struct mddev *mddev, const char *buf, size_t len) +{ + int min; + char *e; + if (strncmp(buf, "system", 6)==0) { + mddev->sync_speed_min = 0; + return len; + } + min = simple_strtoul(buf, &e, 10); + if (buf == e || (*e && *e != '\n') || min <= 0) + return -EINVAL; + mddev->sync_speed_min = min; + return len; +} + +static struct md_sysfs_entry md_sync_min = +__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); + +static ssize_t +sync_max_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", speed_max(mddev), + mddev->sync_speed_max ? "local": "system"); +} + +static ssize_t +sync_max_store(struct mddev *mddev, const char *buf, size_t len) +{ + int max; + char *e; + if (strncmp(buf, "system", 6)==0) { + mddev->sync_speed_max = 0; + return len; + } + max = simple_strtoul(buf, &e, 10); + if (buf == e || (*e && *e != '\n') || max <= 0) + return -EINVAL; + mddev->sync_speed_max = max; + return len; +} + +static struct md_sysfs_entry md_sync_max = +__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); + +static ssize_t +degraded_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->degraded); +} +static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); + +static ssize_t +sync_force_parallel_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->parallel_resync); +} + +static ssize_t +sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) +{ + long n; + + if (strict_strtol(buf, 10, &n)) + return -EINVAL; + + if (n != 0 && n != 1) + return -EINVAL; + + mddev->parallel_resync = n; + + if (mddev->sync_thread) + wake_up(&resync_wait); + + return len; +} + +/* force parallel resync, even with shared block devices */ +static struct md_sysfs_entry md_sync_force_parallel = +__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, + sync_force_parallel_show, sync_force_parallel_store); + +static ssize_t +sync_speed_show(struct mddev *mddev, char *page) +{ + unsigned long resync, dt, db; + if (mddev->curr_resync == 0) + return sprintf(page, "none\n"); + resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); + dt = (jiffies - mddev->resync_mark) / HZ; + if (!dt) dt++; + db = resync - mddev->resync_mark_cnt; + return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ +} + +static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); + +static ssize_t +sync_completed_show(struct mddev *mddev, char *page) +{ + unsigned long long max_sectors, resync; + + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return sprintf(page, "none\n"); + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_sectors = mddev->resync_max_sectors; + else + max_sectors = mddev->dev_sectors; + + resync = mddev->curr_resync_completed; + return sprintf(page, "%llu / %llu\n", resync, max_sectors); +} + +static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); + +static ssize_t +min_sync_show(struct mddev *mddev, char *page) +{ + return 
sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_min); +} +static ssize_t +min_sync_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long long min; + if (strict_strtoull(buf, 10, &min)) + return -EINVAL; + if (min > mddev->resync_max) + return -EINVAL; + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + /* Must be a multiple of chunk_size */ + if (mddev->chunk_sectors) { + sector_t temp = min; + if (sector_div(temp, mddev->chunk_sectors)) + return -EINVAL; + } + mddev->resync_min = min; + + return len; +} + +static struct md_sysfs_entry md_min_sync = +__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); + +static ssize_t +max_sync_show(struct mddev *mddev, char *page) +{ + if (mddev->resync_max == MaxSector) + return sprintf(page, "max\n"); + else + return sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_max); +} +static ssize_t +max_sync_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (strncmp(buf, "max", 3) == 0) + mddev->resync_max = MaxSector; + else { + unsigned long long max; + if (strict_strtoull(buf, 10, &max)) + return -EINVAL; + if (max < mddev->resync_min) + return -EINVAL; + if (max < mddev->resync_max && + mddev->ro == 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + /* Must be a multiple of chunk_size */ + if (mddev->chunk_sectors) { + sector_t temp = max; + if (sector_div(temp, mddev->chunk_sectors)) + return -EINVAL; + } + mddev->resync_max = max; + } + wake_up(&mddev->recovery_wait); + return len; +} + +static struct md_sysfs_entry md_max_sync = +__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); + +static ssize_t +suspend_lo_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); +} + +static ssize_t +suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_lo; + + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + return -EINVAL; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + + mddev->suspend_lo = new; + if (new >= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; +} +static struct md_sysfs_entry md_suspend_lo = +__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); + + +static ssize_t +suspend_hi_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); +} + +static ssize_t +suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_hi; + + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + return -EINVAL; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + + mddev->suspend_hi = new; + if (new <= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; +} +static struct md_sysfs_entry md_suspend_hi = +__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); + +static ssize_t +reshape_position_show(struct mddev *mddev, char *page) +{ + if (mddev->reshape_position != MaxSector) + return 
sprintf(page, "%llu\n",
+		       (unsigned long long)mddev->reshape_position);
+	strcpy(page, "none\n");
+	return 5;
+}
+
+static ssize_t
+reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long new = simple_strtoull(buf, &e, 10);
+	if (mddev->pers)
+		return -EBUSY;
+	if (buf == e || (*e && *e != '\n'))
+		return -EINVAL;
+	mddev->reshape_position = new;
+	mddev->delta_disks = 0;
+	mddev->new_level = mddev->level;
+	mddev->new_layout = mddev->layout;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
+	return len;
+}
+
+static struct md_sysfs_entry md_reshape_position =
+__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
+       reshape_position_store);
+
+static ssize_t
+array_size_show(struct mddev *mddev, char *page)
+{
+	if (mddev->external_size)
+		return sprintf(page, "%llu\n",
+			       (unsigned long long)mddev->array_sectors/2);
+	else
+		return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	sector_t sectors;
+
+	if (strncmp(buf, "default", 7) == 0) {
+		if (mddev->pers)
+			sectors = mddev->pers->size(mddev, 0, 0);
+		else
+			sectors = mddev->array_sectors;
+
+		mddev->external_size = 0;
+	} else {
+		if (strict_blocks_to_sectors(buf, &sectors) < 0)
+			return -EINVAL;
+		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+			return -E2BIG;
+
+		mddev->external_size = 1;
+	}
+
+	mddev->array_sectors = sectors;
+	if (mddev->pers) {
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+		revalidate_disk(mddev->gendisk);
+	}
+	return len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+       array_size_store);
+
+static struct attribute *md_default_attrs[] = {
+	&md_level.attr,
+	&md_layout.attr,
+	&md_raid_disks.attr,
+	&md_chunk_size.attr,
+	&md_size.attr,
+	&md_resync_start.attr,
+	&md_metadata.attr,
+	&md_new_device.attr,
+	&md_safe_delay.attr,
+	&md_array_state.attr,
+	&md_reshape_position.attr,
+	&md_array_size.attr,
+	&max_corr_read_errors.attr,
+	NULL,
+};
+
+static struct attribute *md_redundancy_attrs[] = {
+	&md_scan_mode.attr,
+	&md_mismatches.attr,
+	&md_sync_min.attr,
+	&md_sync_max.attr,
+	&md_sync_speed.attr,
+	&md_sync_force_parallel.attr,
+	&md_sync_completed.attr,
+	&md_min_sync.attr,
+	&md_max_sync.attr,
+	&md_suspend_lo.attr,
+	&md_suspend_hi.attr,
+	&md_bitmap.attr,
+	&md_degraded.attr,
+	NULL,
+};
+static struct attribute_group md_redundancy_group = {
+	.name = NULL,
+	.attrs = md_redundancy_attrs,
+};
+
+
+static ssize_t
+md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
+	ssize_t rv;
+
+	if (!entry->show)
+		return -EIO;
+	spin_lock(&all_mddevs_lock);
+	if (list_empty(&mddev->all_mddevs)) {
+		spin_unlock(&all_mddevs_lock);
+		return -EBUSY;
+	}
+	mddev_get(mddev);
+	spin_unlock(&all_mddevs_lock);
+
+	rv = mddev_lock(mddev);
+	if (!rv) {
+		rv = entry->show(mddev, page);
+		mddev_unlock(mddev);
+	}
+	mddev_put(mddev);
+	return rv;
+}
+
+static ssize_t
+md_attr_store(struct kobject *kobj, struct attribute *attr,
+	      const char *page, size_t length)
+{
+	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
+	ssize_t rv;
+
+	if (!entry->store)
+		return -EIO;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	spin_lock(&all_mddevs_lock);
+	if
(list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + rv = mddev_lock(mddev); + if (!rv) { + rv = entry->store(mddev, page, length); + mddev_unlock(mddev); + } + mddev_put(mddev); + return rv; +} + +static void md_free(struct kobject *ko) +{ + struct mddev *mddev = container_of(ko, struct mddev, kobj); + + if (mddev->sysfs_state) + sysfs_put(mddev->sysfs_state); + + if (mddev->gendisk) { + del_gendisk(mddev->gendisk); + put_disk(mddev->gendisk); + } + if (mddev->queue) + blk_cleanup_queue(mddev->queue); + + kfree(mddev); +} + +static const struct sysfs_ops md_sysfs_ops = { + .show = md_attr_show, + .store = md_attr_store, +}; +static struct kobj_type md_ktype = { + .release = md_free, + .sysfs_ops = &md_sysfs_ops, + .default_attrs = md_default_attrs, +}; + +int mdp_major = 0; + +static void mddev_delayed_delete(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, del_work); + + sysfs_remove_group(&mddev->kobj, &md_bitmap_group); + kobject_del(&mddev->kobj); + kobject_put(&mddev->kobj); +} + +static int md_alloc(dev_t dev, char *name) +{ + static DEFINE_MUTEX(disks_mutex); + struct mddev *mddev = mddev_find(dev); + struct gendisk *disk; + int partitioned; + int shift; + int unit; + int error; + + if (!mddev) + return -ENODEV; + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; + + /* wait for any previous instance of this device to be + * completely removed (mddev_delayed_delete). + */ + flush_workqueue(md_misc_wq); + + mutex_lock(&disks_mutex); + error = -EEXIST; + if (mddev->gendisk) + goto abort; + + if (name) { + /* Need to ensure that 'name' is not a duplicate. + */ + struct mddev *mddev2; + spin_lock(&all_mddevs_lock); + + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) + if (mddev2->gendisk && + strcmp(mddev2->gendisk->disk_name, name) == 0) { + spin_unlock(&all_mddevs_lock); + goto abort; + } + spin_unlock(&all_mddevs_lock); + } + + error = -ENOMEM; + mddev->queue = blk_alloc_queue(GFP_KERNEL); + if (!mddev->queue) + goto abort; + mddev->queue->queuedata = mddev; + + blk_queue_make_request(mddev->queue, md_make_request); + blk_set_stacking_limits(&mddev->queue->limits); + + disk = alloc_disk(1 << shift); + if (!disk) { + blk_cleanup_queue(mddev->queue); + mddev->queue = NULL; + goto abort; + } + disk->major = MAJOR(mddev->unit); + disk->first_minor = unit << shift; + if (name) + strcpy(disk->disk_name, name); + else if (partitioned) + sprintf(disk->disk_name, "md_d%d", unit); + else + sprintf(disk->disk_name, "md%d", unit); + disk->fops = &md_fops; + disk->private_data = mddev; + disk->queue = mddev->queue; + blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + /* Allow extended partitions. This makes the + * 'mdp' device redundant, but we can't really + * remove it now. 
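+ * (GENHD_FL_EXT_DEVT lets partitions be allocated from the extended,
+ * dynamic dev_t range, so no minor numbers need to be reserved up front.)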
+ */ + disk->flags |= GENHD_FL_EXT_DEVT; + mddev->gendisk = disk; + /* As soon as we call add_disk(), another thread could get + * through to md_open, so make sure it doesn't get too far + */ + mutex_lock(&mddev->open_mutex); + add_disk(disk); + + error = kobject_init_and_add(&mddev->kobj, &md_ktype, + &disk_to_dev(disk)->kobj, "%s", "md"); + if (error) { + /* This isn't possible, but as kobject_init_and_add is marked + * __must_check, we must do something with the result + */ + printk(KERN_WARNING "md: cannot register %s/md - name in use\n", + disk->disk_name); + error = 0; + } + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_bitmap_group)) + printk(KERN_DEBUG "pointless warning\n"); + mutex_unlock(&mddev->open_mutex); + abort: + mutex_unlock(&disks_mutex); + if (!error && mddev->kobj.sd) { + kobject_uevent(&mddev->kobj, KOBJ_ADD); + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + } + mddev_put(mddev); + return error; +} + +static struct kobject *md_probe(dev_t dev, int *part, void *data) +{ + md_alloc(dev, NULL); + return NULL; +} + +static int add_named_array(const char *val, struct kernel_param *kp) +{ + /* val must be "md_*" where * is not all digits. + * We allocate an array with a large free minor number, and + * set the name to val. val must not already be an active name. + */ + int len = strlen(val); + char buf[DISK_NAME_LEN]; + + while (len && val[len-1] == '\n') + len--; + if (len >= DISK_NAME_LEN) + return -E2BIG; + strlcpy(buf, val, len+1); + if (strncmp(buf, "md_", 3) != 0) + return -EINVAL; + return md_alloc(0, buf); +} + +static void md_safemode_timeout(unsigned long data) +{ + struct mddev *mddev = (struct mddev *) data; + + if (!atomic_read(&mddev->writes_pending)) { + mddev->safemode = 1; + if (mddev->external) + sysfs_notify_dirent_safe(mddev->sysfs_state); + } + md_wakeup_thread(mddev->thread); +} + +static int start_dirty_degraded; + +int md_run(struct mddev *mddev) +{ + int err; + struct md_rdev *rdev; + struct md_personality *pers; + + if (list_empty(&mddev->disks)) + /* cannot run an array with no devices.. */ + return -EINVAL; + + if (mddev->pers) + return -EBUSY; + /* Cannot run until previous stop completes properly */ + if (mddev->sysfs_active) + return -EBUSY; + + /* + * Analyze all RAID superblock(s) + */ + if (!mddev->raid_disks) { + if (!mddev->persistent) + return -EINVAL; + analyze_sbs(mddev); + } + + if (mddev->level != LEVEL_NONE) + request_module("md-level-%d", mddev->level); + else if (mddev->clevel[0]) + request_module("md-%s", mddev->clevel); + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + */ + rdev_for_each(rdev, mddev) { + if (test_bit(Faulty, &rdev->flags)) + continue; + sync_blockdev(rdev->bdev); + invalidate_bdev(rdev->bdev); + + /* perform some consistency tests on the device. + * We don't want the data to overlap the metadata, + * Internal Bitmap issues have been handled elsewhere. 
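+ * When the metadata lives on a separate device (rdev->meta_bdev)
+ * there is nothing to check here.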
+ */ + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { + if (mddev->dev_sectors && + rdev->data_offset + mddev->dev_sectors + > rdev->sb_start) { + printk("md: %s: data overlaps metadata\n", + mdname(mddev)); + return -EINVAL; + } + } else { + if (rdev->sb_start + rdev->sb_size/512 + > rdev->data_offset) { + printk("md: %s: metadata overlaps data\n", + mdname(mddev)); + return -EINVAL; + } + } + sysfs_notify_dirent_safe(rdev->sysfs_state); + } + + if (mddev->bio_set == NULL) + mddev->bio_set = bioset_create(BIO_POOL_SIZE, + sizeof(struct mddev *)); + + spin_lock(&pers_lock); + pers = find_pers(mddev->level, mddev->clevel); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + if (mddev->level != LEVEL_NONE) + printk(KERN_WARNING "md: personality for level %d is not loaded!\n", + mddev->level); + else + printk(KERN_WARNING "md: personality for level %s is not loaded!\n", + mddev->clevel); + return -EINVAL; + } + mddev->pers = pers; + spin_unlock(&pers_lock); + if (mddev->level != pers->level) { + mddev->level = pers->level; + mddev->new_level = pers->level; + } + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + + if (mddev->reshape_position != MaxSector && + pers->start_reshape == NULL) { + /* This personality cannot handle reshaping... */ + mddev->pers = NULL; + module_put(pers->owner); + return -EINVAL; + } + + if (pers->sync_request) { + /* Warn if this is a potentially silly + * configuration. + */ + char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + struct md_rdev *rdev2; + int warned = 0; + + rdev_for_each(rdev, mddev) + rdev_for_each(rdev2, mddev) { + if (rdev < rdev2 && + rdev->bdev->bd_contains == + rdev2->bdev->bd_contains) { + printk(KERN_WARNING + "%s: WARNING: %s appears to be" + " on the same physical disk as" + " %s.\n", + mdname(mddev), + bdevname(rdev->bdev,b), + bdevname(rdev2->bdev,b2)); + warned = 1; + } + } + + if (warned) + printk(KERN_WARNING + "True protection against single-disk" + " failure might be compromised.\n"); + } + + mddev->recovery = 0; + /* may be over-ridden by personality */ + mddev->resync_max_sectors = mddev->dev_sectors; + + mddev->ok_start_degraded = start_dirty_degraded; + + if (start_readonly && mddev->ro == 0) + mddev->ro = 2; /* read-only, but switch on first write */ + + err = mddev->pers->run(mddev); + if (err) + printk(KERN_ERR "md: pers->run() failed ...\n"); + else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, "%s: default size too small," + " but 'external_size' not in effect?\n", __func__); + printk(KERN_ERR + "md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + err = -EINVAL; + mddev->pers->stop(mddev); + } + if (err == 0 && mddev->pers->sync_request) { + err = bitmap_create(mddev); + if (err) { + printk(KERN_ERR "%s: failed to create bitmap (%d)\n", + mdname(mddev), err); + mddev->pers->stop(mddev); + } + } + if (err) { + module_put(mddev->pers->owner); + mddev->pers = NULL; + bitmap_destroy(mddev); + return err; + } + if (mddev->pers->sync_request) { + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); + } else if (mddev->ro == 2) /* auto-readonly not meaningful */ + mddev->ro = 0; + + 
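+	/* Remaining run-time initialisation: reset the pending-write counter,
+	 * arm the safemode timer (~200 ms default) and link the member
+	 * devices into sysfs.
+	 */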
atomic_set(&mddev->writes_pending,0); + atomic_set(&mddev->max_corr_read_errors, + MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); + mddev->safemode = 0; + mddev->safemode_timer.function = md_safemode_timeout; + mddev->safemode_timer.data = (unsigned long) mddev; + mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + mddev->in_sync = 1; + smp_wmb(); + mddev->ready = 1; + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + + if (mddev->flags) + md_update_sb(mddev, 0); + + md_new_event(mddev); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); + sysfs_notify(&mddev->kobj, NULL, "degraded"); + return 0; +} +EXPORT_SYMBOL_GPL(md_run); + +static int do_md_run(struct mddev *mddev) +{ + int err; + + err = md_run(mddev); + if (err) + goto out; + err = bitmap_load(mddev); + if (err) { + bitmap_destroy(mddev); + goto out; + } + + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + mddev->changed = 1; + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); +out: + return err; +} + +static int restart_array(struct mddev *mddev) +{ + struct gendisk *disk = mddev->gendisk; + + /* Complain if it has no devices */ + if (list_empty(&mddev->disks)) + return -ENXIO; + if (!mddev->pers) + return -EINVAL; + if (!mddev->ro) + return -EBUSY; + mddev->safemode = 0; + mddev->ro = 0; + set_disk_ro(disk, 0); + printk(KERN_INFO "md: %s switched to read-write mode.\n", + mdname(mddev)); + /* Kick recovery or resync if necessary */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); + sysfs_notify_dirent_safe(mddev->sysfs_state); + return 0; +} + +/* similar to deny_write_access, but accounts for our holding a reference + * to the file ourselves */ +static int deny_bitmap_write_access(struct file * file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) > 1) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_set(&inode->i_writecount, -1); + spin_unlock(&inode->i_lock); + + return 0; +} + +void restore_bitmap_write_access(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + atomic_set(&inode->i_writecount, 1); + spin_unlock(&inode->i_lock); +} + +static void md_clean(struct mddev *mddev) +{ + mddev->array_sectors = 0; + mddev->external_size = 0; + mddev->dev_sectors = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->ro = 0; + mddev->metadata_type[0] = 0; + mddev->chunk_sectors = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->can_decrease_events = 0; + mddev->delta_disks = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + mddev->curr_resync = 0; + mddev->resync_mismatches = 0; + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->changed = 0; + mddev->degraded = 
0; + mddev->safemode = 0; + mddev->merge_check_needed = 0; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; +} + +static void __md_stop_writes(struct mddev *mddev) +{ + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + reap_sync_thread(mddev); + } + + del_timer_sync(&mddev->safemode_timer); + + bitmap_flush(mddev); + md_super_wait(mddev); + + if (!mddev->in_sync || mddev->flags) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev, 1); + } +} + +void md_stop_writes(struct mddev *mddev) +{ + mddev_lock(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); +} +EXPORT_SYMBOL_GPL(md_stop_writes); + +void md_stop(struct mddev *mddev) +{ + mddev->ready = 0; + mddev->pers->stop(mddev); + if (mddev->pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(mddev->pers->owner); + mddev->pers = NULL; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +} +EXPORT_SYMBOL_GPL(md_stop); + +static int md_set_readonly(struct mddev *mddev, int is_open) +{ + int err = 0; + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > is_open) { + printk("md: %s still in use.\n",mdname(mddev)); + err = -EBUSY; + goto out; + } + if (mddev->pers) { + __md_stop_writes(mddev); + + err = -ENXIO; + if (mddev->ro==1) + goto out; + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_state); + err = 0; + } +out: + mutex_unlock(&mddev->open_mutex); + return err; +} + +/* mode: + * 0 - completely stop and dis-assemble array + * 2 - stop but do not disassemble array + */ +static int do_md_stop(struct mddev * mddev, int mode, int is_open) +{ + struct gendisk *disk = mddev->gendisk; + struct md_rdev *rdev; + + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > is_open || + mddev->sysfs_active) { + printk("md: %s still in use.\n",mdname(mddev)); + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } + + if (mddev->pers) { + if (mddev->ro) + set_disk_ro(disk, 0); + + __md_stop_writes(mddev); + md_stop(mddev); + mddev->queue->merge_bvec_fn = NULL; + mddev->queue->backing_dev_info.congested_fn = NULL; + + /* tell userspace to handle 'inactive' */ + sysfs_notify_dirent_safe(mddev->sysfs_state); + + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + sysfs_unlink_rdev(mddev, rdev); + + set_capacity(disk, 0); + mutex_unlock(&mddev->open_mutex); + mddev->changed = 1; + revalidate_disk(disk); + + if (mddev->ro) + mddev->ro = 0; + } else + mutex_unlock(&mddev->open_mutex); + /* + * Free resources if final stop + */ + if (mode == 0) { + printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); + + bitmap_destroy(mddev); + if (mddev->bitmap_info.file) { + restore_bitmap_write_access(mddev->bitmap_info.file); + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; + } + mddev->bitmap_info.offset = 0; + + export_array(mddev); + + md_clean(mddev); + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + if (mddev->hold_active == UNTIL_STOP) + mddev->hold_active = 0; + } + blk_integrity_unregister(disk); + md_new_event(mddev); + sysfs_notify_dirent_safe(mddev->sysfs_state); + return 0; +} + +#ifndef MODULE +static void autorun_array(struct mddev *mddev) +{ + struct md_rdev *rdev; + int err; + + if 
(list_empty(&mddev->disks)) + return; + + printk(KERN_INFO "md: running: "); + + rdev_for_each(rdev, mddev) { + char b[BDEVNAME_SIZE]; + printk("<%s>", bdevname(rdev->bdev,b)); + } + printk("\n"); + + err = do_md_run(mddev); + if (err) { + printk(KERN_WARNING "md: do_md_run() returned %d\n", err); + do_md_stop(mddev, 0, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in pending_raid_disks) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(int part) +{ + struct md_rdev *rdev0, *rdev, *tmp; + struct mddev *mddev; + char b[BDEVNAME_SIZE]; + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + int unit; + dev_t dev; + LIST_HEAD(candidates); + rdev0 = list_entry(pending_raid_disks.next, + struct md_rdev, same_set); + + printk(KERN_INFO "md: considering %s ...\n", + bdevname(rdev0->bdev,b)); + INIT_LIST_HEAD(&candidates); + rdev_for_each_list(rdev, tmp, &pending_raid_disks) + if (super_90_load(rdev, rdev0, 0) >= 0) { + printk(KERN_INFO "md: adding %s ...\n", + bdevname(rdev->bdev,b)); + list_move(&rdev->same_set, &candidates); + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + if (part) { + dev = MKDEV(mdp_major, + rdev0->preferred_minor << MdpMinorShift); + unit = MINOR(dev) >> MdpMinorShift; + } else { + dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); + unit = MINOR(dev); + } + if (rdev0->preferred_minor != unit) { + printk(KERN_INFO "md: unit number in %s is bad: %d\n", + bdevname(rdev0->bdev, b), rdev0->preferred_minor); + break; + } + + md_probe(dev, NULL, NULL); + mddev = mddev_find(dev); + if (!mddev || !mddev->gendisk) { + if (mddev) + mddev_put(mddev); + printk(KERN_ERR + "md: cannot allocate memory for md drive.\n"); + break; + } + if (mddev_lock(mddev)) + printk(KERN_WARNING "md: %s locked, cannot run\n", + mdname(mddev)); + else if (mddev->raid_disks || mddev->major_version + || !list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: %s already running, cannot run %s\n", + mdname(mddev), bdevname(rdev0->bdev,b)); + mddev_unlock(mddev); + } else { + printk(KERN_INFO "md: created %s\n", mdname(mddev)); + mddev->persistent = 1; + rdev_for_each_list(rdev, tmp, &candidates) { + list_del_init(&rdev->same_set); + if (bind_rdev_to_array(rdev, mddev)) + export_rdev(rdev); + } + autorun_array(mddev); + mddev_unlock(mddev); + } + /* on success, candidates will be empty, on error + * it won't... + */ + rdev_for_each_list(rdev, tmp, &candidates) { + list_del_init(&rdev->same_set); + export_rdev(rdev); + } + mddev_put(mddev); + } + printk(KERN_INFO "md: ... 
autorun DONE.\n"); +} +#endif /* !MODULE */ + +static int get_version(void __user * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +static int get_array_info(struct mddev * mddev, void __user * arg) +{ + mdu_array_info_t info; + int nr,working,insync,failed,spare; + struct md_rdev *rdev; + + nr=working=insync=failed=spare=0; + rdev_for_each(rdev, mddev) { + nr++; + if (test_bit(Faulty, &rdev->flags)) + failed++; + else { + working++; + if (test_bit(In_sync, &rdev->flags)) + insync++; + else + spare++; + } + } + + info.major_version = mddev->major_version; + info.minor_version = mddev->minor_version; + info.patch_version = MD_PATCHLEVEL_VERSION; + info.ctime = mddev->ctime; + info.level = mddev->level; + info.size = mddev->dev_sectors / 2; + if (info.size != mddev->dev_sectors / 2) /* overflow */ + info.size = -1; + info.nr_disks = nr; + info.raid_disks = mddev->raid_disks; + info.md_minor = mddev->md_minor; + info.not_persistent= !mddev->persistent; + + info.utime = mddev->utime; + info.state = 0; + if (mddev->in_sync) + info.state = (1<<MD_SB_CLEAN); + if (mddev->bitmap && mddev->bitmap_info.offset) + info.state = (1<<MD_SB_BITMAP_PRESENT); + info.active_disks = insync; + info.working_disks = working; + info.failed_disks = failed; + info.spare_disks = spare; + + info.layout = mddev->layout; + info.chunk_size = mddev->chunk_sectors << 9; + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static int get_bitmap_file(struct mddev * mddev, void __user * arg) +{ + mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ + char *ptr, *buf = NULL; + int err = -ENOMEM; + + if (md_allow_write(mddev)) + file = kmalloc(sizeof(*file), GFP_NOIO); + else + file = kmalloc(sizeof(*file), GFP_KERNEL); + + if (!file) + goto out; + + /* bitmap disabled, zero the first byte and copy out */ + if (!mddev->bitmap || !mddev->bitmap->file) { + file->pathname[0] = '\0'; + goto copy_out; + } + + buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); + if (!buf) + goto out; + + ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); + if (IS_ERR(ptr)) + goto out; + + strcpy(file->pathname, ptr); + +copy_out: + err = 0; + if (copy_to_user(arg, file, sizeof(*file))) + err = -EFAULT; +out: + kfree(buf); + kfree(file); + return err; +} + +static int get_disk_info(struct mddev * mddev, void __user * arg) +{ + mdu_disk_info_t info; + struct md_rdev *rdev; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + rdev = find_rdev_nr(mddev, info.number); + if (rdev) { + info.major = MAJOR(rdev->bdev->bd_dev); + info.minor = MINOR(rdev->bdev->bd_dev); + info.raid_disk = rdev->raid_disk; + info.state = 0; + if (test_bit(Faulty, &rdev->flags)) + info.state |= (1<<MD_DISK_FAULTY); + else if (test_bit(In_sync, &rdev->flags)) { + info.state |= (1<<MD_DISK_ACTIVE); + info.state |= (1<<MD_DISK_SYNC); + } + if (test_bit(WriteMostly, &rdev->flags)) + info.state |= (1<<MD_DISK_WRITEMOSTLY); + } else { + info.major = info.minor = 0; + info.raid_disk = -1; + info.state = (1<<MD_DISK_REMOVED); + } + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) +{ + char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + struct md_rdev *rdev; + dev_t dev = MKDEV(info->major,info->minor); + + if (info->major != MAJOR(dev) 
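+	    /* Editor's note (illustrative): MKDEV() packs major/minor into
+	     * the kernel's 32-bit dev_t (12-bit major, 20-bit minor), so a
+	     * value from userspace that does not survive this round-trip
+	     * is rejected with -EOVERFLOW rather than silently truncated.
+	     */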
|| info->minor != MINOR(dev)) + return -EOVERFLOW; + + if (!mddev->raid_disks) { + int err; + /* expecting a device which has a superblock */ + rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: md_import_device returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + if (!list_empty(&mddev->disks)) { + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); + err = super_types[mddev->major_version] + .load_super(rdev, rdev0, mddev->minor_version); + if (err < 0) { + printk(KERN_WARNING + "md: %s has different UUID to %s\n", + bdevname(rdev->bdev,b), + bdevname(rdev0->bdev,b2)); + export_rdev(rdev); + return -EINVAL; + } + } + err = bind_rdev_to_array(rdev, mddev); + if (err) + export_rdev(rdev); + return err; + } + + /* + * add_new_disk can be used once the array is assembled + * to add "hot spares". They must already have a superblock + * written + */ + if (mddev->pers) { + int err; + if (!mddev->pers->hot_add_disk) { + printk(KERN_WARNING + "%s: personality does not support diskops!\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->persistent) + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + else + rdev = md_import_device(dev, -1, -1); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: md_import_device returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + /* set saved_raid_disk if appropriate */ + if (!mddev->persistent) { + if (info->state & (1<<MD_DISK_SYNC) && + info->raid_disk < mddev->raid_disks) { + rdev->raid_disk = info->raid_disk; + set_bit(In_sync, &rdev->flags); + } else + rdev->raid_disk = -1; + } else + super_types[mddev->major_version]. + validate_super(mddev, rdev); + if ((info->state & (1<<MD_DISK_SYNC)) && + (!test_bit(In_sync, &rdev->flags) || + rdev->raid_disk != info->raid_disk)) { + /* This was a hot-add request, but events doesn't + * match, so reject it. + */ + export_rdev(rdev); + return -EINVAL; + } + + if (test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = rdev->raid_disk; + else + rdev->saved_raid_disk = -1; + + clear_bit(In_sync, &rdev->flags); /* just to be sure */ + if (info->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + else + clear_bit(WriteMostly, &rdev->flags); + + rdev->raid_disk = -1; + err = bind_rdev_to_array(rdev, mddev); + if (!err && !mddev->pers->hot_remove_disk) { + /* If there is hot_add_disk but no hot_remove_disk + * then added disks for geometry changes, + * and should be added immediately. + */ + super_types[mddev->major_version]. 
+ validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); + if (err) + unbind_rdev_from_array(rdev); + } + if (err) + export_rdev(rdev); + else + sysfs_notify_dirent_safe(rdev->sysfs_state); + + md_update_sb(mddev, 1); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!err) + md_new_event(mddev); + md_wakeup_thread(mddev->thread); + return err; + } + + /* otherwise, add_new_disk is only allowed + * for major_version==0 superblocks + */ + if (mddev->major_version != 0) { + printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", + mdname(mddev)); + return -EINVAL; + } + + if (!(info->state & (1<<MD_DISK_FAULTY))) { + int err; + rdev = md_import_device(dev, -1, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: error, md_import_device() returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + rdev->desc_nr = info->number; + if (info->raid_disk < mddev->raid_disks) + rdev->raid_disk = info->raid_disk; + else + rdev->raid_disk = -1; + + if (rdev->raid_disk < mddev->raid_disks) + if (info->state & (1<<MD_DISK_SYNC)) + set_bit(In_sync, &rdev->flags); + + if (info->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + + if (!mddev->persistent) { + printk(KERN_INFO "md: nonpersistent superblock ...\n"); + rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + } else + rdev->sb_start = calc_dev_sboffset(rdev); + rdev->sectors = rdev->sb_start; + + err = bind_rdev_to_array(rdev, mddev); + if (err) { + export_rdev(rdev); + return err; + } + } + + return 0; +} + +static int hot_remove_disk(struct mddev * mddev, dev_t dev) +{ + char b[BDEVNAME_SIZE]; + struct md_rdev *rdev; + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->raid_disk >= 0) + goto busy; + + kick_rdev_from_array(rdev); + md_update_sb(mddev, 1); + md_new_event(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", + bdevname(rdev->bdev,b), mdname(mddev)); + return -EBUSY; +} + +static int hot_add_disk(struct mddev * mddev, dev_t dev) +{ + char b[BDEVNAME_SIZE]; + int err; + struct md_rdev *rdev; + + if (!mddev->pers) + return -ENODEV; + + if (mddev->major_version != 0) { + printk(KERN_WARNING "%s: HOT_ADD may only be used with" + " version-0 superblocks.\n", + mdname(mddev)); + return -EINVAL; + } + if (!mddev->pers->hot_add_disk) { + printk(KERN_WARNING + "%s: personality does not support diskops!\n", + mdname(mddev)); + return -EINVAL; + } + + rdev = md_import_device(dev, -1, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: error, md_import_device() returned %ld\n", + PTR_ERR(rdev)); + return -EINVAL; + } + + if (mddev->persistent) + rdev->sb_start = calc_dev_sboffset(rdev); + else + rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + + rdev->sectors = rdev->sb_start; + + if (test_bit(Faulty, &rdev->flags)) { + printk(KERN_WARNING + "md: can not hot-add faulty %s disk to %s!\n", + bdevname(rdev->bdev,b), mdname(mddev)); + err = -EINVAL; + goto abort_export; + } + clear_bit(In_sync, &rdev->flags); + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + err = bind_rdev_to_array(rdev, mddev); + if (err) + goto abort_export; + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + + rdev->raid_disk = -1; + + md_update_sb(mddev, 1); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
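+	 *
+	 * (Editor's note, illustrative: the set_bit(MD_RECOVERY_NEEDED) +
+	 * md_wakeup_thread() pair below is the standard way to ask the
+	 * array's thread to run md_check_recovery(), which then picks the
+	 * new spare up via remove_and_add_spares() and starts recovery.)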
+ */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_new_event(mddev); + return 0; + +abort_export: + export_rdev(rdev); + return err; +} + +static int set_bitmap_file(struct mddev *mddev, int fd) +{ + int err; + + if (mddev->pers) { + if (!mddev->pers->quiesce) + return -EBUSY; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + /* we should be able to change the bitmap.. */ + } + + + if (fd >= 0) { + if (mddev->bitmap) + return -EEXIST; /* cannot add when bitmap is present */ + mddev->bitmap_info.file = fget(fd); + + if (mddev->bitmap_info.file == NULL) { + printk(KERN_ERR "%s: error: failed to get bitmap file\n", + mdname(mddev)); + return -EBADF; + } + + err = deny_bitmap_write_access(mddev->bitmap_info.file); + if (err) { + printk(KERN_ERR "%s: error: bitmap file is already in use\n", + mdname(mddev)); + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; + return err; + } + mddev->bitmap_info.offset = 0; /* file overrides offset */ + } else if (mddev->bitmap == NULL) + return -ENOENT; /* cannot remove what isn't there */ + err = 0; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + if (fd >= 0) { + err = bitmap_create(mddev); + if (!err) + err = bitmap_load(mddev); + } + if (fd < 0 || err) { + bitmap_destroy(mddev); + fd = -1; /* make sure to put the file */ + } + mddev->pers->quiesce(mddev, 0); + } + if (fd < 0) { + if (mddev->bitmap_info.file) { + restore_bitmap_write_access(mddev->bitmap_info.file); + fput(mddev->bitmap_info.file); + } + mddev->bitmap_info.file = NULL; + } + + return err; +} + +/* + * set_array_info is used two different ways + * The original usage is when creating a new array. + * In this usage, raid_disks is > 0 and it together with + * level, size, not_persistent,layout,chunksize determine the + * shape of the array. + * This will always create an array with a type-0.90.0 superblock. + * The newer usage is when assembling an array. + * In this case raid_disks will be 0, and the major_version field is + * use to determine which style super-blocks are to be found on the devices. + * The minor and patch _version numbers are also kept incase the + * super_block handler wishes to interpret them. + */ +static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) +{ + + if (info->raid_disks == 0) { + /* just setting version number for superblock loading */ + if (info->major_version < 0 || + info->major_version >= ARRAY_SIZE(super_types) || + super_types[info->major_version].name == NULL) { + /* maybe try to auto-load a module? */ + printk(KERN_INFO + "md: superblock version %d not known\n", + info->major_version); + return -EINVAL; + } + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; + mddev->persistent = !info->not_persistent; + /* ensure mddev_put doesn't delete this now that there + * is some minimal configuration. + */ + mddev->ctime = get_seconds(); + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; + mddev->minor_version = MD_MINOR_VERSION; + mddev->patch_version = MD_PATCHLEVEL_VERSION; + mddev->ctime = get_seconds(); + + mddev->level = info->level; + mddev->clevel[0] = 0; + mddev->dev_sectors = 2 * (sector_t)info->size; + mddev->raid_disks = info->raid_disks; + /* don't set md_minor, it is determined by which /dev/md* was + * openned + */ + if (info->state & (1<<MD_SB_CLEAN)) + mddev->recovery_cp = MaxSector; + else + mddev->recovery_cp = 0; + mddev->persistent = ! 
info->not_persistent; + mddev->external = 0; + + mddev->layout = info->layout; + mddev->chunk_sectors = info->chunk_size >> 9; + + mddev->max_disks = MD_SB_DISKS; + + if (mddev->persistent) + mddev->flags = 0; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.offset = 0; + + mddev->reshape_position = MaxSector; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(mddev->uuid, 16); + + mddev->new_level = mddev->level; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->new_layout = mddev->layout; + mddev->delta_disks = 0; + + return 0; +} + +void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) +{ + WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + + if (mddev->external_size) + return; + + mddev->array_sectors = array_sectors; +} +EXPORT_SYMBOL(md_set_array_sectors); + +static int update_size(struct mddev *mddev, sector_t num_sectors) +{ + struct md_rdev *rdev; + int rv; + int fit = (num_sectors == 0); + + if (mddev->pers->resize == NULL) + return -EINVAL; + /* The "num_sectors" is the number of sectors of each device that + * is used. This can only make sense for arrays with redundancy. + * linear and raid0 always use whatever space is available. We can only + * consider changing this number if no resync or reconstruction is + * happening, and if the new size is acceptable. It must fit before the + * sb_start or, if that is <data_offset, it must fit before the size + * of each device. If num_sectors is zero, we find the largest size + * that fits. + */ + if (mddev->sync_thread) + return -EBUSY; + if (mddev->bitmap) + /* Sorry, cannot grow a bitmap yet, just remove it, + * grow, and re-add. + */ + return -EBUSY; + rdev_for_each(rdev, mddev) { + sector_t avail = rdev->sectors; + + if (fit && (num_sectors == 0 || num_sectors > avail)) + num_sectors = avail; + if (avail < num_sectors) + return -ENOSPC; + } + rv = mddev->pers->resize(mddev, num_sectors); + if (!rv) + revalidate_disk(mddev->gendisk); + return rv; +} + +static int update_raid_disks(struct mddev *mddev, int raid_disks) +{ + int rv; + /* change the number of raid disks */ + if (mddev->pers->check_reshape == NULL) + return -EINVAL; + if (raid_disks <= 0 || + (mddev->max_disks && raid_disks >= mddev->max_disks)) + return -EINVAL; + if (mddev->sync_thread || mddev->reshape_position != MaxSector) + return -EBUSY; + mddev->delta_disks = raid_disks - mddev->raid_disks; + + rv = mddev->pers->check_reshape(mddev); + if (rv < 0) + mddev->delta_disks = 0; + return rv; +} + + +/* + * update_array_info is used to change the configuration of an + * on-line array. + * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size + * fields in the info are checked against the array. + * Any differences that cannot be handled will cause an error. + * Normally, only one change can be managed at a time. 
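+ *
+ * (Editor's sketch of typical userspace flows, not part of this commit:
+ * creating an array is SET_ARRAY_INFO with raid_disks > 0, then
+ * ADD_NEW_DISK per device, then RUN_ARRAY; reconfiguring a running one
+ * is GET_ARRAY_INFO, changing exactly one field such as raid_disks, then
+ * SET_ARRAY_INFO again -- the cnt check below rejects requests that try
+ * to change more than one thing at a time with -EINVAL.)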
+ */ +static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) +{ + int rv = 0; + int cnt = 0; + int state = 0; + + /* calculate expected state,ignoring low bits */ + if (mddev->bitmap && mddev->bitmap_info.offset) + state |= (1 << MD_SB_BITMAP_PRESENT); + + if (mddev->major_version != info->major_version || + mddev->minor_version != info->minor_version || +/* mddev->patch_version != info->patch_version || */ + mddev->ctime != info->ctime || + mddev->level != info->level || +/* mddev->layout != info->layout || */ + !mddev->persistent != info->not_persistent|| + mddev->chunk_sectors != info->chunk_size >> 9 || + /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ + ((state^info->state) & 0xfffffe00) + ) + return -EINVAL; + /* Check there is only one change */ + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + cnt++; + if (mddev->raid_disks != info->raid_disks) + cnt++; + if (mddev->layout != info->layout) + cnt++; + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) + cnt++; + if (cnt == 0) + return 0; + if (cnt > 1) + return -EINVAL; + + if (mddev->layout != info->layout) { + /* Change layout + * we don't need to do anything at the md level, the + * personality will take care of it all. + */ + if (mddev->pers->check_reshape == NULL) + return -EINVAL; + else { + mddev->new_layout = info->layout; + rv = mddev->pers->check_reshape(mddev); + if (rv) + mddev->new_layout = mddev->layout; + return rv; + } + } + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + rv = update_size(mddev, (sector_t)info->size * 2); + + if (mddev->raid_disks != info->raid_disks) + rv = update_raid_disks(mddev, info->raid_disks); + + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { + if (mddev->pers->quiesce == NULL) + return -EINVAL; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { + /* add the bitmap */ + if (mddev->bitmap) + return -EEXIST; + if (mddev->bitmap_info.default_offset == 0) + return -EINVAL; + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; + mddev->pers->quiesce(mddev, 1); + rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); + if (rv) + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } else { + /* remove the bitmap */ + if (!mddev->bitmap) + return -ENOENT; + if (mddev->bitmap->file) + return -EINVAL; + mddev->pers->quiesce(mddev, 1); + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + mddev->bitmap_info.offset = 0; + } + } + md_update_sb(mddev, 1); + return rv; +} + +static int set_disk_faulty(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + if (mddev->pers == NULL) + return -ENODEV; + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENODEV; + + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + return -EBUSY; + return 0; +} + +/* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... 
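+ *
+ * (Editor's worked example, illustrative: with the fixed 2 heads and
+ * 4 sectors below, a 1 GiB array -- array_sectors = 2097152 -- is
+ * reported as 2097152 / 8 = 262144 cylinders.)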
;-) + */ +static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct mddev *mddev = bdev->bd_disk->private_data; + + geo->heads = 2; + geo->sectors = 4; + geo->cylinders = mddev->array_sectors / 8; + return 0; +} + +static int md_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + void __user *argp = (void __user *)arg; + struct mddev *mddev = NULL; + int ro; + + switch (cmd) { + case RAID_VERSION: + case GET_ARRAY_INFO: + case GET_DISK_INFO: + break; + default: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version(argp); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; + +#ifndef MODULE + case RAID_AUTORUN: + err = 0; + autostart_arrays(arg); + goto done; +#endif + default:; + } + + /* + * Commands creating/starting a new array: + */ + + mddev = bdev->bd_disk->private_data; + + if (!mddev) { + BUG(); + goto abort; + } + + err = mddev_lock(mddev); + if (err) { + printk(KERN_INFO + "md: ioctl lock interrupted, reason %d, cmd %d\n", + err, cmd); + goto abort; + } + + switch (cmd) + { + case SET_ARRAY_INFO: + { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't update" + " array info. %d\n", err); + goto abort_unlock; + } + goto done_unlock; + } + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array %s already has disks!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array %s already initialised!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, + * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ + if ((!mddev->raid_disks && !mddev->external) + && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY + && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE + && cmd != GET_BITMAP_FILE) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, argp); + goto done_unlock; + + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, argp); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop(mddev, 0, 1); + goto done_unlock; + + case STOP_ARRAY_RO: + err = md_set_readonly(mddev, 1); + goto done_unlock; + + case BLKROSET: + if (get_user(ro, (int __user *)(arg))) { + err = -EFAULT; + goto done_unlock; + } + err = -EINVAL; + + /* if the bdev is going readonly the value of mddev->ro + * does not matter, no writes are coming + */ + if (ro) + goto done_unlock; + + /* are we are already prepared for writes? 
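+	 * (Editor's note, illustrative: mddev->ro is a tri-state -- 0 means
+	 * read-write, 1 read-only, 2 "auto-read-only", i.e. read-only until
+	 * the first write arrives.  A BLKROSET back to rw on an ro==1 array
+	 * only promotes it to state 2 here; the md-specific ioctls further
+	 * down flip 2 back to 0 before touching the superblock.)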
*/ + if (mddev->ro != 1) + goto done_unlock; + + /* transitioning to readauto need only happen for + * arrays that call md_write_start + */ + if (mddev->pers) { + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); + } + } + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow them on read-only arrays. + * However non-MD ioctls (e.g. get-size) will still come through + * here and hit the 'default' below, so only disallow + * 'md' ioctls, and switch to rw mode if started auto-readonly. + */ + if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { + if (mddev->ro == 2) { + mddev->ro = 0; + sysfs_notify_dirent_safe(mddev->sysfs_state); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else { + err = -EROFS; + goto abort_unlock; + } + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, new_decode_dev(arg)); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto done_unlock; + + case RUN_ARRAY: + err = do_md_run(mddev); + goto done_unlock; + + case SET_BITMAP_FILE: + err = set_bitmap_file(mddev, (int)arg); + goto done_unlock; + + default: + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + if (mddev->hold_active == UNTIL_IOCTL && + err != -EINVAL) + mddev->hold_active = 0; + mddev_unlock(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} +#ifdef CONFIG_COMPAT +static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* These take in integer arg, do not convert */ + break; + default: + arg = (unsigned long)compat_ptr(arg); + break; + } + + return md_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static int md_open(struct block_device *bdev, fmode_t mode) +{ + /* + * Succeed if we can lock the mddev, which confirms that + * it isn't being stopped right now. + */ + struct mddev *mddev = mddev_find(bdev->bd_dev); + int err; + + if (mddev->gendisk != bdev->bd_disk) { + /* we are racing with mddev_put which is discarding this + * bd_disk. 
+ */ + mddev_put(mddev); + /* Wait until bdev->bd_disk is definitely gone */ + flush_workqueue(md_misc_wq); + /* Then retry the open from the top */ + return -ERESTARTSYS; + } + BUG_ON(mddev != bdev->bd_disk->private_data); + + if ((err = mutex_lock_interruptible(&mddev->open_mutex))) + goto out; + + err = 0; + atomic_inc(&mddev->openers); + mutex_unlock(&mddev->open_mutex); + + check_disk_change(bdev); + out: + return err; +} + +static int md_release(struct gendisk *disk, fmode_t mode) +{ + struct mddev *mddev = disk->private_data; + + BUG_ON(!mddev); + atomic_dec(&mddev->openers); + mddev_put(mddev); + + return 0; +} + +static int md_media_changed(struct gendisk *disk) +{ + struct mddev *mddev = disk->private_data; + + return mddev->changed; +} + +static int md_revalidate(struct gendisk *disk) +{ + struct mddev *mddev = disk->private_data; + + mddev->changed = 0; + return 0; +} +static const struct block_device_operations md_fops = +{ + .owner = THIS_MODULE, + .open = md_open, + .release = md_release, + .ioctl = md_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = md_compat_ioctl, +#endif + .getgeo = md_getgeo, + .media_changed = md_media_changed, + .revalidate_disk= md_revalidate, +}; + +static int md_thread(void * arg) +{ + struct md_thread *thread = arg; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + + allow_signal(SIGKILL); + while (!kthread_should_stop()) { + + /* We need to wait INTERRUPTIBLE so that + * we don't add to the load-average. 
+ * That means we need to be sure no signals are + * pending + */ + if (signal_pending(current)) + flush_signals(current); + + wait_event_interruptible_timeout + (thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags) + || kthread_should_stop(), + thread->timeout); + + clear_bit(THREAD_WAKEUP, &thread->flags); + if (!kthread_should_stop()) + thread->run(thread->mddev); + } + + return 0; +} + +void md_wakeup_thread(struct md_thread *thread) +{ + if (thread) { + pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); + } +} + +struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, + const char *name) +{ + struct md_thread *thread; + + thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); + if (!thread) + return NULL; + + init_waitqueue_head(&thread->wqueue); + + thread->run = run; + thread->mddev = mddev; + thread->timeout = MAX_SCHEDULE_TIMEOUT; + thread->tsk = kthread_run(md_thread, thread, + "%s_%s", + mdname(thread->mddev), + name ?: mddev->pers->name); + if (IS_ERR(thread->tsk)) { + kfree(thread); + return NULL; + } + return thread; +} + +void md_unregister_thread(struct md_thread **threadp) +{ + struct md_thread *thread = *threadp; + if (!thread) + return; + pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); + /* Locking ensures that mddev_unlock does not wake_up a + * non-existent thread + */ + spin_lock(&pers_lock); + *threadp = NULL; + spin_unlock(&pers_lock); + + kthread_stop(thread->tsk); + kfree(thread); +} + +void md_error(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!mddev) { + MD_BUG(); + return; + } + + if (!rdev || test_bit(Faulty, &rdev->flags)) + return; + + if (!mddev->pers || !mddev->pers->error_handler) + return; + mddev->pers->error_handler(mddev,rdev); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); + md_new_event_inintr(mddev); +} + +/* seq_file implementation /proc/mdstat */ + +static void status_unused(struct seq_file *seq) +{ + int i = 0; + struct md_rdev *rdev; + + seq_printf(seq, "unused devices: "); + + list_for_each_entry(rdev, &pending_raid_disks, same_set) { + char b[BDEVNAME_SIZE]; + i++; + seq_printf(seq, "%s ", + bdevname(rdev->bdev,b)); + } + if (!i) + seq_printf(seq, "<none>"); + + seq_printf(seq, "\n"); +} + + +static void status_resync(struct seq_file *seq, struct mddev * mddev) +{ + sector_t max_sectors, resync, res; + unsigned long dt, db; + sector_t rt; + int scale; + unsigned int per_milli; + + resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_sectors = mddev->resync_max_sectors; + else + max_sectors = mddev->dev_sectors; + + /* + * Should not happen. + */ + if (!max_sectors) { + MD_BUG(); + return; + } + /* Pick 'scale' such that (resync>>scale)*1000 will fit + * in a sector_t, and (max_sectors>>scale) will fit in a + * u32, as those are the requirements for sector_div. 
+ * Thus 'scale' must be at least 10 + */ + scale = 10; + if (sizeof(sector_t) > sizeof(unsigned long)) { + while ( max_sectors/2 > (1ULL<<(scale+32))) + scale++; + } + res = (resync>>scale)*1000; + sector_div(res, (u32)((max_sectors>>scale)+1)); + + per_milli = res; + { + int i, x = per_milli/50, y = 20-x; + seq_printf(seq, "["); + for (i = 0; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", + (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? + "reshape" : + (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? + "check" : + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? + "resync" : "recovery"))), + per_milli/10, per_milli % 10, + (unsigned long long) resync/2, + (unsigned long long) max_sectors/2); + + /* + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + * + * rt is a sector_t, so could be 32bit or 64bit. + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid losing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) + - mddev->resync_mark_cnt; + + rt = max_sectors - resync; /* number of remaining sectors */ + sector_div(rt, db/32+1); + rt *= dt; + rt >>= 5; + + seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, + ((unsigned long)rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/2/dt); +} + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + struct mddev *mddev; + + if (l >= 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + spin_lock(&all_mddevs_lock); + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, struct mddev, all_mddevs); + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + return mddev; + } + spin_unlock(&all_mddevs_lock); + if (!l--) + return (void*)2;/* tail */ + return NULL; +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + struct mddev *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + spin_lock(&all_mddevs_lock); + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + spin_unlock(&all_mddevs_lock); + + if (v != (void*)1) + mddev_put(mddev); + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + struct mddev *mddev = v; + + if (mddev && v != (void*)1 && v != (void*)2) + mddev_put(mddev); +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + struct mddev *mddev = v; + sector_t sectors; + struct md_rdev *rdev; + + if (v == (void*)1) { + struct md_personality *pers; + seq_printf(seq, "Personalities : "); + spin_lock(&pers_lock); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); + + spin_unlock(&pers_lock); + seq_printf(seq, "\n"); + seq->poll_event = atomic_read(&md_event_count); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + if 
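+	/* Editor's worked example for status_resync() above (illustrative,
+	 * not part of this commit): if 200,000 sectors completed since a
+	 * mark taken 30 seconds ago and 1,000,000 sectors remain, then
+	 * dt = 30, db = 200000 and rt ~= 1000000 * 30 / 200000 = 150
+	 * seconds, shown as roughly "finish=2.4min" after the integer
+	 * arithmetic, with a speed line of about db/2/dt = 3333K/sec.
+	 */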
(mddev_lock(mddev) < 0) + return -EINTR; + + if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { + seq_printf(seq, "%s : %sactive", mdname(mddev), + mddev->pers ? "" : "in"); + if (mddev->pers) { + if (mddev->ro==1) + seq_printf(seq, " (read-only)"); + if (mddev->ro==2) + seq_printf(seq, " (auto-read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + sectors = 0; + rdev_for_each(rdev, mddev) { + char b[BDEVNAME_SIZE]; + seq_printf(seq, " %s[%d]", + bdevname(rdev->bdev,b), rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) + seq_printf(seq, "(W)"); + if (test_bit(Faulty, &rdev->flags)) { + seq_printf(seq, "(F)"); + continue; + } + if (rdev->raid_disk < 0) + seq_printf(seq, "(S)"); /* spare */ + if (test_bit(Replacement, &rdev->flags)) + seq_printf(seq, "(R)"); + sectors += rdev->sectors; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %llu blocks", + (unsigned long long) + mddev->array_sectors / 2); + else + seq_printf(seq, "\n %llu blocks", + (unsigned long long)sectors / 2); + } + if (mddev->persistent) { + if (mddev->major_version != 0 || + mddev->minor_version != 90) { + seq_printf(seq," super %d.%d", + mddev->major_version, + mddev->minor_version); + } + } else if (mddev->external) + seq_printf(seq, " super external:%s", + mddev->metadata_type); + else + seq_printf(seq, " super non-persistent"); + + if (mddev->pers) { + mddev->pers->status(seq, mddev); + seq_printf(seq, "\n "); + if (mddev->pers->sync_request) { + if (mddev->curr_resync > 2) { + status_resync(seq, mddev); + seq_printf(seq, "\n "); + } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + seq_printf(seq, "\tresync=DELAYED\n "); + else if (mddev->recovery_cp < MaxSector) + seq_printf(seq, "\tresync=PENDING\n "); + } + } else + seq_printf(seq, "\n "); + + bitmap_status(seq, mddev->bitmap); + + seq_printf(seq, "\n"); + } + mddev_unlock(mddev); + + return 0; +} + +static const struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int error; + + error = seq_open(file, &md_seq_ops); + if (error) + return error; + + seq = file->private_data; + seq->poll_event = atomic_read(&md_event_count); + return error; +} + +static unsigned int mdstat_poll(struct file *filp, poll_table *wait) +{ + struct seq_file *seq = filp->private_data; + int mask; + + poll_wait(filp, &md_event_waiters, wait); + + /* always allow read */ + mask = POLLIN | POLLRDNORM; + + if (seq->poll_event != atomic_read(&md_event_count)) + mask |= POLLERR | POLLPRI; + return mask; +} + +static const struct file_operations md_seq_fops = { + .owner = THIS_MODULE, + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, + .poll = mdstat_poll, +}; + +int register_md_personality(struct md_personality *p) +{ + spin_lock(&pers_lock); + list_add_tail(&p->list, &pers_list); + printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); + spin_unlock(&pers_lock); + return 0; +} + +int unregister_md_personality(struct md_personality *p) +{ + printk(KERN_INFO "md: %s personality unregistered\n", p->name); + spin_lock(&pers_lock); + list_del_init(&p->list); + spin_unlock(&pers_lock); + return 0; +} + +static int is_mddev_idle(struct mddev *mddev, int init) +{ + struct md_rdev * rdev; + int idle; + int curr_events; + + idle = 1; + rcu_read_lock(); + 
rdev_for_each_rcu(rdev, mddev) { + struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&disk->sync_io); + /* sync IO will cause sync_io to increase before the disk_stats + * as sync_io is counted when a request starts, and + * disk_stats is counted when it completes. + * So resync activity will cause curr_events to be smaller than + * when there was no such activity. + * non-sync IO will cause disk_stat to increase without + * increasing sync_io so curr_events will (eventually) + * be larger than it was before. Once it becomes + * substantially larger, the test below will cause + * the array to appear non-idle, and resync will slow + * down. + * If there is a lot of outstanding resync activity when + * we set last_event to curr_events, then all that activity + * completing might cause the array to appear non-idle + * and resync will be slowed down even though there might + * not have been non-resync activity. This will only + * happen once though. 'last_events' will soon reflect + * the state where there is little or no outstanding + * resync requests, and further resync activity will + * always make curr_events less than last_events. + * + */ + if (init || curr_events - rdev->last_events > 64) { + rdev->last_events = curr_events; + idle = 0; + } + } + rcu_read_unlock(); + return idle; +} + +void md_done_sync(struct mddev *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_wakeup_thread(mddev->thread); + // stop recovery, signal do_sync .... + } +} + + +/* md_write_start(mddev, bi) + * If we need to update some array metadata (e.g. 'active' flag + * in superblock) before writing, schedule a superblock update + * and wait for it to complete. + */ +void md_write_start(struct mddev *mddev, struct bio *bi) +{ + int did_change = 0; + if (bio_data_dir(bi) != WRITE) + return; + + BUG_ON(mddev->ro == 1); + if (mddev->ro == 2) { + /* need to switch to read/write */ + mddev->ro = 0; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); + did_change = 1; + } + atomic_inc(&mddev->writes_pending); + if (mddev->safemode == 1) + mddev->safemode = 0; + if (mddev->in_sync) { + spin_lock_irq(&mddev->write_lock); + if (mddev->in_sync) { + mddev->in_sync = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); + md_wakeup_thread(mddev->thread); + did_change = 1; + } + spin_unlock_irq(&mddev->write_lock); + } + if (did_change) + sysfs_notify_dirent_safe(mddev->sysfs_state); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); +} + +void md_write_end(struct mddev *mddev) +{ + if (atomic_dec_and_test(&mddev->writes_pending)) { + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else if (mddev->safemode_delay) + mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); + } +} + +/* md_allow_write(mddev) + * Calling this ensures that the array is marked 'active' so that writes + * may proceed without blocking. It is important to call this before + * attempting a GFP_KERNEL allocation while holding the mddev lock. + * Must be called with mddev_lock held. 
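+ *
+ * (Editor's note, illustrative: one caller is get_bitmap_file() earlier
+ * in this file -- when this returns -EAGAIN it allocates with GFP_NOIO
+ * instead of GFP_KERNEL, so the allocation cannot trigger writeback
+ * against an array that has not yet been marked active.)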
+ * + * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock + * is dropped, so return -EAGAIN after notifying userspace. + */ +int md_allow_write(struct mddev *mddev) +{ + if (!mddev->pers) + return 0; + if (mddev->ro) + return 0; + if (!mddev->pers->sync_request) + return 0; + + spin_lock_irq(&mddev->write_lock); + if (mddev->in_sync) { + mddev->in_sync = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); + if (mddev->safemode_delay && + mddev->safemode == 0) + mddev->safemode = 1; + spin_unlock_irq(&mddev->write_lock); + md_update_sb(mddev, 0); + sysfs_notify_dirent_safe(mddev->sysfs_state); + } else + spin_unlock_irq(&mddev->write_lock); + + if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) + return -EAGAIN; + else + return 0; +} +EXPORT_SYMBOL_GPL(md_allow_write); + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +void md_do_sync(struct mddev *mddev) +{ + struct mddev *mddev2; + unsigned int currspeed = 0, + window; + sector_t max_sectors,j, io_sectors; + unsigned long mark[SYNC_MARKS]; + sector_t mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct list_head *tmp; + sector_t last_check; + int skipped = 0; + struct md_rdev *rdev; + char *desc; + + /* just incase thread restarts... */ + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return; + if (mddev->ro) /* never try to sync a read-only array */ + return; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + desc = "data-check"; + else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + desc = "requested-resync"; + else + desc = "resync"; + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + desc = "reshape"; + else + desc = "recovery"; + + /* we overload curr_resync somewhat here. + * 0 == not engaged in resync at all + * 2 == checking that there is no conflict with another sync + * 1 == like 2, but have yielded to allow conflicting resync to + * commense + * other == active in resync - this many blocks + * + * Before starting a resync we must have set curr_resync to + * 2, and then checked that every "conflicting" array has curr_resync + * less than ours. When we find one that is the same or higher + * we wait on resync_wait. To avoid deadlock, we reduce curr_resync + * to 1 if we choose to yield (based arbitrarily on address of mddev structure). + * This will mean we have to start checking from the beginning again. 
+ * + */ + + do { + mddev->curr_resync = 2; + + try_again: + if (kthread_should_stop()) + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto skip; + for_each_mddev(mddev2, tmp) { + if (mddev2 == mddev) + continue; + if (!mddev->parallel_resync + && mddev2->curr_resync + && match_mddev_units(mddev, mddev2)) { + DEFINE_WAIT(wq); + if (mddev < mddev2 && mddev->curr_resync == 2) { + /* arbitrarily yield */ + mddev->curr_resync = 1; + wake_up(&resync_wait); + } + if (mddev > mddev2 && mddev->curr_resync == 1) + /* no need to wait here, we can wait the next + * time 'round when curr_resync == 2 + */ + continue; + /* We need to wait 'interruptible' so as not to + * contribute to the load average, and not to + * be caught by 'softlockup' + */ + prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); + if (!kthread_should_stop() && + mddev2->curr_resync >= mddev->curr_resync) { + printk(KERN_INFO "md: delaying %s of %s" + " until %s has finished (they" + " share one or more physical units)\n", + desc, mdname(mddev), mdname(mddev2)); + mddev_put(mddev2); + if (signal_pending(current)) + flush_signals(current); + schedule(); + finish_wait(&resync_wait, &wq); + goto try_again; + } + finish_wait(&resync_wait, &wq); + } + } + } while (mddev->curr_resync < 2); + + j = 0; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* resync follows the size requested by the personality, + * which defaults to physical size, but can be virtual size + */ + max_sectors = mddev->resync_max_sectors; + mddev->resync_mismatches = 0; + /* we don't use the checkpoint if there's a bitmap */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + j = mddev->resync_min; + else if (!mddev->bitmap) + j = mddev->recovery_cp; + + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->dev_sectors; + else { + /* recovery follows the physical size of devices */ + max_sectors = mddev->dev_sectors; + j = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < j) + j = rdev->recovery_offset; + rcu_read_unlock(); + } + + printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ speed:" + " %d KB/sec/disk.\n", speed_min(mddev)); + printk(KERN_INFO "md: using maximum available idle IO bandwidth " + "(but not more than %d KB/sec) for %s.\n", + speed_max(mddev), desc); + + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ + + io_sectors = 0; + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = io_sectors; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = 32*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", + window/2, (unsigned long long)max_sectors/2); + + atomic_set(&mddev->recovery_active, 0); + last_check = 0; + + if (j>2) { + printk(KERN_INFO + "md: resuming %s of %s from checkpoint.\n", + desc, mdname(mddev)); + mddev->curr_resync = j; + } + mddev->curr_resync_completed = j; + + while (j < max_sectors) { + sector_t sectors; + + skipped = 0; + + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + (j - mddev->curr_resync_completed)*2 + >= 
mddev->resync_max - mddev->curr_resync_completed + )) { + /* time to update curr_resync_completed */ + wait_event(mddev->recovery_wait, + atomic_read(&mddev->recovery_active) == 0); + mddev->curr_resync_completed = j; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + } + + while (j >= mddev->resync_max && !kthread_should_stop()) { + /* As this condition is controlled by user-space, + * we can block indefinitely, so use '_interruptible' + * to avoid triggering warnings. + */ + flush_signals(current); /* just in case */ + wait_event_interruptible(mddev->recovery_wait, + mddev->resync_max > j + || kthread_should_stop()); + } + + if (kthread_should_stop()) + goto interrupted; + + sectors = mddev->pers->sync_request(mddev, j, &skipped, + currspeed < speed_min(mddev)); + if (sectors == 0) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + } + + if (!skipped) { /* actual IO requested */ + io_sectors += sectors; + atomic_add(sectors, &mddev->recovery_active); + } + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + + j += sectors; + if (j>1) mddev->curr_resync = j; + mddev->curr_mark_cnt = io_sectors; + if (last_check == 0) + /* this is the earliest that rebuild will be + * visible in /proc/mdstat + */ + md_new_event(mddev); + + if (last_check + window > io_sectors || j == max_sectors) + continue; + + last_check = io_sectors; + repeat: + if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (kthread_should_stop()) + goto interrupted; + + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. 
(things like an + * e2fsck being done on the RAID array should execute fast) + */ + cond_resched(); + + currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 + /((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > speed_min(mddev)) { + if ((currspeed > speed_max(mddev)) || + !is_mddev_idle(mddev, 0)) { + msleep(500); + goto repeat; + } + } + } + printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); + /* + * this also signals 'finished resyncing' to md_stop + */ + out: + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + /* tell personality that we are finished */ + mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); + + if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && + mddev->curr_resync > 2) { + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + if (mddev->curr_resync >= mddev->recovery_cp) { + printk(KERN_INFO + "md: checkpointing %s of %s.\n", + desc, mdname(mddev)); + mddev->recovery_cp = + mddev->curr_resync_completed; + } + } else + mddev->recovery_cp = MaxSector; + } else { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } + } + skip: + set_bit(MD_CHANGE_DEVS, &mddev->flags); + + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* We completed so min/max setting can be forgotten if used. */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = mddev->curr_resync_completed; + mddev->curr_resync = 0; + wake_up(&resync_wait); + set_bit(MD_RECOVERY_DONE, &mddev->recovery); + md_wakeup_thread(mddev->thread); + return; + + interrupted: + /* + * got a signal, exit. + */ + printk(KERN_INFO + "md: md_do_sync() got signal ... exiting\n"); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + +} +EXPORT_SYMBOL_GPL(md_do_sync); + +static int remove_and_add_spares(struct mddev *mddev) +{ + struct md_rdev *rdev; + int spares = 0; + int removed = 0; + + mddev->curr_resync_completed = 0; + + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + (test_bit(Faulty, &rdev->flags) || + ! 
test_bit(In_sync, &rdev->flags)) && + atomic_read(&rdev->nr_pending)==0) { + if (mddev->pers->hot_remove_disk( + mddev, rdev) == 0) { + sysfs_unlink_rdev(mddev, rdev); + rdev->raid_disk = -1; + removed++; + } + } + if (removed) + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk < 0 + && !test_bit(Faulty, &rdev->flags)) { + rdev->recovery_offset = 0; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; + spares++; + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } + } + } + return spares; +} + +static void reap_sync_thread(struct mddev *mddev) +{ + struct md_rdev *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + + /* If array is no-longer degraded, then any saved_raid_disk + * information must be scrapped. Also if any device is now + * In_sync we must scrape the saved_raid_disk for that device + * do the superblock for an incrementally recovered device + * written out. + */ + rdev_for_each(rdev, mddev) + if (!mddev->degraded || + test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = -1; + + md_update_sb(mddev, 1); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); +} + +/* + * This routine is regularly called by all per-raid-array threads to + * deal with generic issues like resync and super-block update. + * Raid personalities that don't have a thread (linear/raid0) do not + * need this as they never do any recovery or update the superblock. + * + * It does not do any resync itself, but rather "forks" off other threads + * to do that as needed. + * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in + * "->recovery" and create a thread at ->sync_thread. + * When the thread finishes it sets MD_RECOVERY_DONE + * and wakeups up this thread which will reap the thread and finish up. + * This thread also removes any faulty devices (with nr_pending == 0). + * + * The overall approach is: + * 1/ if the superblock needs updating, update it. + * 2/ If a recovery thread is running, don't do anything else. + * 3/ If recovery has finished, clean up, possibly marking spares active. + * 4/ If there are any faulty devices, remove them. + * 5/ If array is degraded, try to add spares devices + * 6/ If array has spares or is not in-sync, start a resync thread. 
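+ *
+ * (Editor's note, illustrative: each personality's daemon thread -- for
+ * example raid1d or raid5d -- calls md_check_recovery(mddev) once per
+ * loop iteration, so the steps above run regularly without needing a
+ * dedicated timer.)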
+ */ +void md_check_recovery(struct mddev *mddev) +{ + if (mddev->suspended) + return; + + if (mddev->bitmap) + bitmap_daemon_work(mddev); + + if (signal_pending(current)) { + if (mddev->pers->sync_request && !mddev->external) { + printk(KERN_INFO "md: %s in immediate safe mode\n", + mdname(mddev)); + mddev->safemode = 2; + } + flush_signals(current); + } + + if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return; + if ( ! ( + (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + (mddev->external == 0 && mddev->safemode == 1) || + (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) + && !mddev->in_sync && mddev->recovery_cp == MaxSector) + )) + return; + + if (mddev_trylock(mddev)) { + int spares = 0; + + if (mddev->ro) { + /* Only thing we do on a ro array is remove + * failed devices. + */ + struct md_rdev *rdev; + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + if (mddev->pers->hot_remove_disk( + mddev, rdev) == 0) { + sysfs_unlink_rdev(mddev, rdev); + rdev->raid_disk = -1; + } + } + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + + if (!mddev->external) { + int did_change = 0; + spin_lock_irq(&mddev->write_lock); + if (mddev->safemode && + !atomic_read(&mddev->writes_pending) && + !mddev->in_sync && + mddev->recovery_cp == MaxSector) { + mddev->in_sync = 1; + did_change = 1; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + if (mddev->safemode == 1) + mddev->safemode = 0; + spin_unlock_irq(&mddev->write_lock); + if (did_change) + sysfs_notify_dirent_safe(mddev->sysfs_state); + } + + if (mddev->flags) + md_update_sb(mddev, 0); + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + if (mddev->sync_thread) { + reap_sync_thread(mddev); + goto unlock; + } + /* Set RUNNING before clearing NEEDED to avoid + * any transients in the value of "sync_action". + */ + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + /* Clear some bits that don't mean anything, but + * might be left set + */ + clear_bit(MD_RECOVERY_INTR, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + + if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + goto unlock; + /* no recovery is running. + * remove any failed drives, then + * add spares if possible. + * Spare are also removed and re-added, to allow + * the personality to fail the re-add. + */ + + if (mddev->reshape_position != MaxSector) { + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) + /* Cannot proceed */ + goto unlock; + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + } else if ((spares = remove_and_add_spares(mddev))) { + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + } else if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + /* nothing to be done ... 
*/ + goto unlock; + + if (mddev->pers->sync_request) { + if (spares && mddev->bitmap && ! mddev->bitmap->file) { + /* We are adding a device or devices to an array + * which has the bitmap stored on all devices. + * So make sure all bitmap pages get written + */ + bitmap_write_all(mddev->bitmap); + } + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + } else + md_wakeup_thread(mddev->sync_thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); + } + unlock: + if (!mddev->sync_thread) { + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, + &mddev->recovery)) + if (mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); + } + mddev_unlock(mddev); + } +} + +void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + sysfs_notify_dirent_safe(rdev->sysfs_state); + wait_event_timeout(rdev->blocked_wait, + !test_bit(Blocked, &rdev->flags) && + !test_bit(BlockedBadBlocks, &rdev->flags), + msecs_to_jiffies(5000)); + rdev_dec_pending(rdev, mddev); +} +EXPORT_SYMBOL(md_wait_for_blocked_rdev); + + +/* Bad block management. + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so md_is_badblock + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * We return + * 0 if there are no known bad blocks in the range + * 1 if there are known bad block which are all acknowledged + * -1 if there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo = 0; + u64 *p = bb->page; + int rv = 0; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. 
for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + /* This could still be the one, earlier ranges + * could not. */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(md_is_badblock); + +/* + * Add a range of bad blocks to the table. + * This might extend the table, or might contract it + * if two adjacent ranges can be merged. + * We binary-search to find the 'insertion' point, then + * decide how best to handle it. + */ +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 1; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. 
+ * Maybe we can merge with the start of that range */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' */ + if (bb->count >= MD_MAX_BADBLOCKS) { + /* No room for more */ + rv = 0; + break; + } else { + int this_sectors = sectors; + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irq(&bb->lock); + + return rv; +} + +int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int acknowledged) +{ + int rv = md_set_badblocks(&rdev->badblocks, + s + rdev->data_offset, sectors, acknowledged); + if (rv) { + /* Make sure they get written out promptly */ + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + md_wakeup_thread(rdev->mddev->thread); + } + return rv; +} +EXPORT_SYMBOL_GPL(rdev_set_badblocks); + +/* + * Remove a range of bad blocks from the table. + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + */ +static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. 
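+ *
+ * Illustrative example (values chosen only for clarity): clearing
+ * sectors 100-109 from a single bad range covering 90-119 splits it
+ * into 90-99 and 110-119, which is why the table may need to grow by
+ * one entry, while clearing 100-129 from the same range simply
+ * truncates it to 90-99.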
+ */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MD_MAX_BADBLOCKS) { + rv = 0; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} + +int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) +{ + return md_clear_badblocks(&rdev->badblocks, + s + rdev->data_offset, + sectors); +} +EXPORT_SYMBOL_GPL(rdev_clear_badblocks); + +/* + * Acknowledge all bad blocks in a list. + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void md_ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(md_ack_all_badblocks); + +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} + +#define DO_DEBUG 1 + +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) +{ + unsigned long long sector; + int length; + char newline; +#ifdef DO_DEBUG + /* Allow clearing via sysfs *only* for testing/debugging. 
+ * Normally only a successful write may clear a badblock
+ */
+ int clear = 0;
+ if (page[0] == '-') {
+ clear = 1;
+ page++;
+ }
+#endif /* DO_DEBUG */
+
+ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
+ case 3:
+ if (newline != '\n')
+ return -EINVAL;
+ case 2:
+ if (length <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+#ifdef DO_DEBUG
+ if (clear) {
+ md_clear_badblocks(bb, sector, length);
+ return len;
+ }
+#endif /* DO_DEBUG */
+ if (md_set_badblocks(bb, sector, length, !unack))
+ return len;
+ else
+ return -ENOSPC;
+}
+
+static int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct list_head *tmp;
+ struct mddev *mddev;
+ int need_delay = 0;
+
+ for_each_mddev(mddev, tmp) {
+ if (mddev_trylock(mddev)) {
+ if (mddev->pers)
+ __md_stop_writes(mddev);
+ mddev->safemode = 2;
+ mddev_unlock(mddev);
+ }
+ need_delay = 1;
+ }
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ if (need_delay)
+ mdelay(1000*1);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block md_notifier = {
+ .notifier_call = md_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+ proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
+}
+
+static int __init md_init(void)
+{
+ int ret = -ENOMEM;
+
+ md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
+ if (!md_wq)
+ goto err_wq;
+
+ md_misc_wq = alloc_workqueue("md_misc", 0, 0);
+ if (!md_misc_wq)
+ goto err_misc_wq;
+
+ if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+ goto err_md;
+
+ if ((ret = register_blkdev(0, "mdp")) < 0)
+ goto err_mdp;
+ mdp_major = ret;
+
+ blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
+ md_probe, NULL, NULL);
+ blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
+ md_probe, NULL, NULL);
+
+ register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table);
+
+ md_geninit();
+ return 0;
+
+err_mdp:
+ unregister_blkdev(MD_MAJOR, "md");
+err_md:
+ destroy_workqueue(md_misc_wq);
+err_misc_wq:
+ destroy_workqueue(md_wq);
+err_wq:
+ return ret;
+}
+
+#ifndef MODULE
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
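+ *
+ * (Background note, added for clarity and describing behaviour that
+ * lives outside this file: the partition-scanning code is expected to
+ * call md_autodetect_dev() for partitions marked as Linux raid
+ * autodetect; autostart_arrays() below then imports each queued
+ * device and hands the result to autorun_devices().)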
+ */ + +static LIST_HEAD(all_detected_devices); +struct detected_devices_node { + struct list_head list; + dev_t dev; +}; + +void md_autodetect_dev(dev_t dev) +{ + struct detected_devices_node *node_detected_dev; + + node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); + if (node_detected_dev) { + node_detected_dev->dev = dev; + list_add_tail(&node_detected_dev->list, &all_detected_devices); + } else { + printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" + ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); + } +} + + +static void autostart_arrays(int part) +{ + struct md_rdev *rdev; + struct detected_devices_node *node_detected_dev; + dev_t dev; + int i_scanned, i_passed; + + i_scanned = 0; + i_passed = 0; + + printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + + while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { + i_scanned++; + node_detected_dev = list_entry(all_detected_devices.next, + struct detected_devices_node, list); + list_del(&node_detected_dev->list); + dev = node_detected_dev->dev; + kfree(node_detected_dev); + rdev = md_import_device(dev,0, 90); + if (IS_ERR(rdev)) + continue; + + if (test_bit(Faulty, &rdev->flags)) { + MD_BUG(); + continue; + } + set_bit(AutoDetected, &rdev->flags); + list_add(&rdev->same_set, &pending_raid_disks); + i_passed++; + } + + printk(KERN_INFO "md: Scanned %d and added %d devices.\n", + i_scanned, i_passed); + + autorun_devices(part); +} + +#endif /* !MODULE */ + +static __exit void md_exit(void) +{ + struct mddev *mddev; + struct list_head *tmp; + + blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); + blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); + + unregister_blkdev(MD_MAJOR,"md"); + unregister_blkdev(mdp_major, "mdp"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); + remove_proc_entry("mdstat", NULL); + for_each_mddev(mddev, tmp) { + export_array(mddev); + mddev->hold_active = 0; + } + destroy_workqueue(md_misc_wq); + destroy_workqueue(md_wq); +} + +subsys_initcall(md_init); +module_exit(md_exit) + +static int get_ro(char *buffer, struct kernel_param *kp) +{ + return sprintf(buffer, "%d", start_readonly); +} +static int set_ro(const char *val, struct kernel_param *kp) +{ + char *e; + int num = simple_strtoul(val, &e, 10); + if (*val && (*e == '\0' || *e == '\n')) { + start_readonly = num; + return 0; + } + return -EINVAL; +} + +module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); +module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); + +module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); + +EXPORT_SYMBOL(register_md_personality); +EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL(md_error); +EXPORT_SYMBOL(md_done_sync); +EXPORT_SYMBOL(md_write_start); +EXPORT_SYMBOL(md_write_end); +EXPORT_SYMBOL(md_register_thread); +EXPORT_SYMBOL(md_unregister_thread); +EXPORT_SYMBOL(md_wakeup_thread); +EXPORT_SYMBOL(md_check_recovery); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MD RAID framework"); +MODULE_ALIAS("md"); +MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 00000000..1c2063cc --- /dev/null +++ b/drivers/md/md.h @@ -0,0 +1,627 @@ +/* + md.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at 
your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_MD_H +#define _MD_MD_H + +#include <linux/blkdev.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/workqueue.h> + +#define MaxSector (~(sector_t)0) + +/* Bad block numbers are stored sorted in a single page. + * 64bits is used for each block or extent. + * 54 bits are sector number, 9 bits are extent size, + * 1 bit is an 'acknowledged' flag. + */ +#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) + +/* + * MD's 'extended' device + */ +struct md_rdev { + struct list_head same_set; /* RAID devices within the same set */ + + sector_t sectors; /* Device size (in 512bytes sectors) */ + struct mddev *mddev; /* RAID array if running */ + int last_events; /* IO event timestamp */ + + /* + * If meta_bdev is non-NULL, it means that a separate device is + * being used to store the metadata (superblock/bitmap) which + * would otherwise be contained on the same device as the data (bdev). + */ + struct block_device *meta_bdev; + struct block_device *bdev; /* block device handle */ + + struct page *sb_page, *bb_page; + int sb_loaded; + __u64 sb_events; + sector_t data_offset; /* start of data in array */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + int sb_size; /* bytes in the superblock */ + int preferred_minor; /* autorun support */ + + struct kobject kobj; + + /* A device can be in one of three states based on two flags: + * Not working: faulty==1 in_sync==0 + * Fully working: faulty==0 in_sync==1 + * Working, but not + * in sync with array + * faulty==0 in_sync==0 + * + * It can never have faulty==1, in_sync==1 + * This reduces the burden of testing multiple flags in many cases + */ + + unsigned long flags; /* bit set of 'enum flag_bits' bits. */ + wait_queue_head_t blocked_wait; + + int desc_nr; /* descriptor index in the superblock */ + int raid_disk; /* role of device in array */ + int new_raid_disk; /* role that the device will have in + * the array after a level-change completes. + */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ + + atomic_t nr_pending; /* number of pending requests. + * only maintained for arrays that + * support hot removal + */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ + struct timespec last_read_error; /* monotonic time since our + * last read error + */ + atomic_t corrected_errors; /* number of corrected read errors, + * for reporting to userspace and storing + * in superblock. + */ + struct work_struct del_work; /* used for delayed sysfs removal */ + + struct sysfs_dirent *sysfs_state; /* handle for 'state' + * sysfs entry */ + + struct badblocks { + int count; /* count of bad blocks */ + int unacked_exist; /* there probably are unacknowledged + * bad blocks. 
This is only cleared + * when a read discovers none + */ + int shift; /* shift from sectors to block size + * a -ve shift means badblocks are + * disabled.*/ + u64 *page; /* badblock list */ + int changed; + seqlock_t lock; + + sector_t sector; + sector_t size; /* in sectors */ + } badblocks; +}; +enum flag_bits { + Faulty, /* device is known to have a fault */ + In_sync, /* device is in_sync with rest of array */ + Unmerged, /* device is being added to array and should + * be considerred for bvec_merge_fn but not + * yet for actual IO + */ + WriteMostly, /* Avoid reading if at all possible */ + AutoDetected, /* added by auto-detect */ + Blocked, /* An error occurred but has not yet + * been acknowledged by the metadata + * handler, so don't allow writes + * until it is cleared */ + WriteErrorSeen, /* A write error has been seen on this + * device + */ + FaultRecorded, /* Intermediate state for clearing + * Blocked. The Fault is/will-be + * recorded in the metadata, but that + * metadata hasn't been stored safely + * on disk yet. + */ + BlockedBadBlocks, /* A writer is blocked because they + * found an unacknowledged bad-block. + * This can safely be cleared at any + * time, and the writer will re-check. + * It may be set at any time, and at + * worst the writer will timeout and + * re-check. So setting it as + * accurately as possible is good, but + * not absolutely critical. + */ + WantReplacement, /* This device is a candidate to be + * hot-replaced, either because it has + * reported some faults, or because + * of explicit request. + */ + Replacement, /* This device is a replacement for + * a want_replacement device with same + * raid_disk number. + */ +}; + +#define BB_LEN_MASK (0x00000000000001FFULL) +#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) +#define BB_ACK_MASK (0x8000000000000000ULL) +#define BB_MAX_LEN 512 +#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) +#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) +#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) +#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + +extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); +static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + if (unlikely(rdev->badblocks.count)) { + int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, + sectors, + first_bad, bad_sectors); + if (rv) + *first_bad -= rdev->data_offset; + return rv; + } + return 0; +} +extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int acknowledged); +extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); +extern void md_ack_all_badblocks(struct badblocks *bb); + +struct mddev { + void *private; + struct md_personality *pers; + dev_t unit; + int md_minor; + struct list_head disks; + unsigned long flags; +#define MD_CHANGE_DEVS 0 /* Some device status has changed */ +#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ +#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ +#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ + + int suspended; + atomic_t active_io; + int ro; + int sysfs_active; /* set when sysfs deletes + * are happening, so run/ + * takeover/stop are not safe + */ + int ready; /* See when safe to pass + * IO requests down */ + struct gendisk *gendisk; + + struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 + + /* Superblock 
information */ + int major_version, + minor_version, + patch_version; + int persistent; + int external; /* metadata is + * managed externally */ + char metadata_type[17]; /* externally set*/ + int chunk_sectors; + time_t ctime, utime; + int level, layout; + char clevel[16]; + int raid_disks; + int max_disks; + sector_t dev_sectors; /* used size of + * component devices */ + sector_t array_sectors; /* exported array size */ + int external_size; /* size managed + * externally */ + __u64 events; + /* If the last 'event' was simply a clean->dirty transition, and + * we didn't write it to the spares, then it is safe and simple + * to just decrement the event count on a dirty->clean transition. + * So we record that possibility here. + */ + int can_decrease_events; + + char uuid[16]; + + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout; + int new_chunk_sectors; + + atomic_t plug_cnt; /* If device is expecting + * more bios soon. + */ + struct md_thread *thread; /* management thread */ + struct md_thread *sync_thread; /* doing resync or reconstruct */ + sector_t curr_resync; /* last block scheduled */ + /* As resync requests can complete out of order, we cannot easily track + * how much resync has been completed. So we occasionally pause until + * everything completes, then set curr_resync_completed to curr_resync. + * As such it may be well behind the real resync mark, but it is a value + * we are certain of. + */ + sector_t curr_resync_completed; + unsigned long resync_mark; /* a recent timestamp */ + sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t curr_mark_cnt; /* blocks scheduled now */ + + sector_t resync_max_sectors; /* may be set by personality */ + + sector_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; + /* if zero, use the system-wide default */ + int sync_speed_min; + int sync_speed_max; + + /* resync even though the same disks are shared among md-devices */ + int parallel_resync; + + int ok_start_degraded; + /* recovery/resync flags + * NEEDED: we might need to start a resync/recover + * RUNNING: a thread is running, or about to be started + * SYNC: actually doing a resync, not a recovery + * RECOVER: doing recovery, or need to try it. + * INTR: resync needs to be aborted for some reason + * DONE: thread is done and is waiting to be reaped + * REQUEST: user-space has requested a sync (used with SYNC) + * CHECK: user-space request for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC or RESHAPE are set, then it is a recovery. + */ +#define MD_RECOVERY_RUNNING 0 +#define MD_RECOVERY_SYNC 1 +#define MD_RECOVERY_RECOVER 2 +#define MD_RECOVERY_INTR 3 +#define MD_RECOVERY_DONE 4 +#define MD_RECOVERY_NEEDED 5 +#define MD_RECOVERY_REQUESTED 6 +#define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 + + unsigned long recovery; + /* If a RAID personality determines that recovery (of a particular + * device) will fail due to a read error on the source device, it + * takes a copy of this number and does not attempt recovery again + * until this number changes. 
+ */ + int recovery_disabled; + + int in_sync; /* know to not need resync */ + /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so + * that we are never stopping an array while it is open. + * 'reconfig_mutex' protects all other reconfiguration. + * These locks are separate due to conflicting interactions + * with bdev->bd_mutex. + * Lock ordering is: + * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk + * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open + */ + struct mutex open_mutex; + struct mutex reconfig_mutex; + atomic_t active; /* general refcount */ + atomic_t openers; /* number of active opens */ + + int changed; /* True if we might need to + * reread partition info */ + int degraded; /* whether md should consider + * adding a spare + */ + int merge_check_needed; /* at least one + * member device + * has a + * merge_bvec_fn */ + + atomic_t recovery_active; /* blocks scheduled, but not written */ + wait_queue_head_t recovery_wait; + sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ + sector_t resync_max; /* resync should pause + * when it gets here */ + + struct sysfs_dirent *sysfs_state; /* handle for 'array_state' + * file in sysfs. + */ + struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + spinlock_t write_lock; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ + + unsigned int safemode; /* if set, update "clean" superblock + * when no writes pending. + */ + unsigned int safemode_delay; + struct timer_list safemode_timer; + atomic_t writes_pending; + struct request_queue *queue; /* for plugging ... */ + + struct bitmap *bitmap; /* the bitmap for the device */ + struct { + struct file *file; /* the bitmap file */ + loff_t offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + * For external metadata, offset + * from start of device. + */ + loff_t default_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ + struct mutex mutex; + unsigned long chunksize; + unsigned long daemon_sleep; /* how many jiffies between updates? */ + unsigned long max_write_behind; /* write-behind mode */ + int external; + } bitmap_info; + + atomic_t max_corr_read_errors; /* max read retries */ + struct list_head all_mddevs; + + struct attribute_group *to_remove; + + struct bio_set *bio_set; + + /* Generic flush handling. + * The last to finish preflush schedules a worker to submit + * the rest of the request (without the REQ_FLUSH flag). 
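+ *
+ * Rough sketch of the intended flow (illustrative only): a bio with
+ * REQ_FLUSH reaches md_flush_request(), which stashes it in
+ * ->flush_bio, submits an empty flush to each active rdev and counts
+ * them in ->flush_pending; the last completion schedules ->flush_work,
+ * which re-issues the original bio without the REQ_FLUSH flag.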
+ */ + struct bio *flush_bio; + atomic_t flush_pending; + struct work_struct flush_work; + struct work_struct event_work; /* used by dm to report failure event */ + void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); +}; + + +static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +} + +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + +struct md_personality +{ + char *name; + int level; + struct list_head list; + struct module *owner; + void (*make_request)(struct mddev *mddev, struct bio *bio); + int (*run)(struct mddev *mddev); + int (*stop)(struct mddev *mddev); + void (*status)(struct seq_file *seq, struct mddev *mddev); + /* error_handler must set ->faulty and clear ->in_sync + * if appropriate, and should abort recovery if needed + */ + void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); + int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); + int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); + int (*spare_active) (struct mddev *mddev); + sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); + int (*resize) (struct mddev *mddev, sector_t sectors); + sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); + int (*check_reshape) (struct mddev *mddev); + int (*start_reshape) (struct mddev *mddev); + void (*finish_reshape) (struct mddev *mddev); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (struct mddev *mddev, int state); + /* takeover is used to transition an array from one + * personality to another. The new personality must be able + * to handle the data in the current layout. + * e.g. 2drive raid1 -> 2drive raid5 + * ndrive raid5 -> degraded n+1drive raid6 with special layout + * If the takeover succeeds, a new 'private' structure is returned. + * This needs to be installed and then ->run used to activate the + * array. + */ + void *(*takeover) (struct mddev *mddev); +}; + + +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct mddev *, char *); + ssize_t (*store)(struct mddev *, const char *, size_t); +}; +extern struct attribute_group md_bitmap_group; + +static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) +{ + if (sd) + return sysfs_get_dirent(sd, NULL, name); + return sd; +} +static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) +{ + if (sd) + sysfs_notify_dirent(sd); +} + +static inline char * mdname (struct mddev * mddev) +{ + return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; +} + +static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + char nm[20]; + if (!test_bit(Replacement, &rdev->flags)) { + sprintf(nm, "rd%d", rdev->raid_disk); + return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } else + return 0; +} + +static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + char nm[20]; + if (!test_bit(Replacement, &rdev->flags)) { + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } +} + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. 
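+ *
+ * Illustrative sketch (the list name is only an example):
+ *     rdev_for_each_list(rdev, tmp, &candidates)
+ *             list_del_init(&rdev->same_set);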
+ */ +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + +/* + * iterates through the 'same array disks' ringlist + */ +#define rdev_for_each(rdev, mddev) \ + list_for_each_entry(rdev, &((mddev)->disks), same_set) + +#define rdev_for_each_safe(rdev, tmp, mddev) \ + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) + +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + +struct md_thread { + void (*run) (struct mddev *mddev); + struct mddev *mddev; + wait_queue_head_t wqueue; + unsigned long flags; + struct task_struct *tsk; + unsigned long timeout; +}; + +#define THREAD_WAKEUP 0 + +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + +static inline void safe_put_page(struct page *p) +{ + if (p) put_page(p); +} + +extern int register_md_personality(struct md_personality *p); +extern int unregister_md_personality(struct md_personality *p); +extern struct md_thread *md_register_thread( + void (*run)(struct mddev *mddev), + struct mddev *mddev, + const char *name); +extern void md_unregister_thread(struct md_thread **threadp); +extern void md_wakeup_thread(struct md_thread *thread); +extern void md_check_recovery(struct mddev *mddev); +extern void md_write_start(struct mddev *mddev, struct bio *bi); +extern void md_write_end(struct mddev *mddev); +extern void md_done_sync(struct mddev *mddev, int blocks, int ok); +extern void md_error(struct mddev *mddev, struct md_rdev *rdev); + +extern int mddev_congested(struct mddev *mddev, int bits); +extern void md_flush_request(struct mddev *mddev, struct bio *bio); +extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(struct mddev *mddev); +extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op); +extern void md_do_sync(struct mddev *mddev); +extern void md_new_event(struct mddev *mddev); +extern int md_allow_write(struct mddev *mddev); +extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); +extern int md_check_no_bitmap(struct mddev *mddev); +extern int md_integrity_register(struct mddev *mddev); +extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); +extern void restore_bitmap_write_access(struct file *file); + +extern void mddev_init(struct mddev *mddev); +extern int md_run(struct mddev *mddev); +extern void md_stop(struct mddev *mddev); +extern void md_stop_writes(struct mddev *mddev); +extern int md_rdev_init(struct md_rdev *rdev); + +extern void mddev_suspend(struct mddev *mddev); +extern void mddev_resume(struct mddev *mddev); +extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, + struct mddev *mddev); +extern struct 
bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + struct mddev *mddev); +extern int mddev_check_plugged(struct mddev *mddev); +extern void md_trim_bio(struct bio *bio, int offset, int size); +#endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c new file mode 100644 index 00000000..9339e67f --- /dev/null +++ b/drivers/md/multipath.c @@ -0,0 +1,557 @@ +/* + * multipath.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * MULTIPATH management functions. + * + * derived from raid1.c. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/raid/md_u.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include "md.h" +#include "multipath.h" + +#define MAX_WORK_PER_DISK 128 + +#define NR_RESERVED_BUFS 32 + + +static int multipath_map (struct mpconf *conf) +{ + int i, disks = conf->raid_disks; + + /* + * Later we do read balancing on the read side + * now we use the first available disk. + */ + + rcu_read_lock(); + for (i = 0; i < disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + if (rdev && test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + return i; + } + } + rcu_read_unlock(); + + printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); + return (-1); +} + +static void multipath_reschedule_retry (struct multipath_bh *mp_bh) +{ + unsigned long flags; + struct mddev *mddev = mp_bh->mddev; + struct mpconf *conf = mddev->private; + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&mp_bh->retry_list, &conf->retry_list); + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(mddev->thread); +} + + +/* + * multipath_end_bh_io() is called when we have finished servicing a multipathed + * operation and are ready to return a success/failure code to the buffer + * cache layer. 
+ */ +static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) +{ + struct bio *bio = mp_bh->master_bio; + struct mpconf *conf = mp_bh->mddev->private; + + bio_endio(bio, err); + mempool_free(mp_bh, conf->pool); +} + +static void multipath_end_request(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct multipath_bh *mp_bh = bio->bi_private; + struct mpconf *conf = mp_bh->mddev->private; + struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; + + if (uptodate) + multipath_end_bh_io(mp_bh, 0); + else if (!(bio->bi_rw & REQ_RAHEAD)) { + /* + * oops, IO error: + */ + char b[BDEVNAME_SIZE]; + md_error (mp_bh->mddev, rdev); + printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", + bdevname(rdev->bdev,b), + (unsigned long long)bio->bi_sector); + multipath_reschedule_retry(mp_bh); + } else + multipath_end_bh_io(mp_bh, error); + rdev_dec_pending(rdev, conf->mddev); +} + +static void multipath_make_request(struct mddev *mddev, struct bio * bio) +{ + struct mpconf *conf = mddev->private; + struct multipath_bh * mp_bh; + struct multipath_info *multipath; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; + } + + mp_bh = mempool_alloc(conf->pool, GFP_NOIO); + + mp_bh->master_bio = bio; + mp_bh->mddev = mddev; + + mp_bh->path = multipath_map(conf); + if (mp_bh->path < 0) { + bio_endio(bio, -EIO); + mempool_free(mp_bh, conf->pool); + return; + } + multipath = conf->multipaths + mp_bh->path; + + mp_bh->bio = *bio; + mp_bh->bio.bi_sector += multipath->rdev->data_offset; + mp_bh->bio.bi_bdev = multipath->rdev->bdev; + mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; + mp_bh->bio.bi_end_io = multipath_end_request; + mp_bh->bio.bi_private = mp_bh; + generic_make_request(&mp_bh->bio); + return; +} + +static void multipath_status (struct seq_file *seq, struct mddev *mddev) +{ + struct mpconf *conf = mddev->private; + int i; + + seq_printf (seq, " [%d/%d] [", conf->raid_disks, + conf->raid_disks - mddev->degraded); + for (i = 0; i < conf->raid_disks; i++) + seq_printf (seq, "%s", + conf->multipaths[i].rdev && + test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_"); + seq_printf (seq, "]"); +} + +static int multipath_congested(void *data, int bits) +{ + struct mddev *mddev = data; + struct mpconf *conf = mddev->private; + int i, ret = 0; + + if (mddev_congested(mddev, bits)) + return 1; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks ; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + /* Just like multipath_map, we just check the + * first available device + */ + break; + } + } + rcu_read_unlock(); + return ret; +} + +/* + * Careful, this can execute in IRQ contexts as well! + */ +static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + char b[BDEVNAME_SIZE]; + + if (conf->raid_disks - mddev->degraded <= 1) { + /* + * Uh oh, we can do nothing if this is our last path, but + * first check if this is a queued request for a device + * which has just failed. + */ + printk(KERN_ALERT + "multipath: only one IO path left and IO error.\n"); + /* leave it active... 
it's all we have */ + return; + } + /* + * Mark disk as unusable + */ + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT "multipath: IO failure on %s," + " disabling IO path.\n" + "multipath: Operation continuing" + " on %d IO paths.\n", + bdevname(rdev->bdev, b), + conf->raid_disks - mddev->degraded); +} + +static void print_multipath_conf (struct mpconf *conf) +{ + int i; + struct multipath_info *tmp; + + printk("MULTIPATH conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + conf->raid_disks); + + for (i = 0; i < conf->raid_disks; i++) { + char b[BDEVNAME_SIZE]; + tmp = conf->multipaths + i; + if (tmp->rdev) + printk(" disk%d, o:%d, dev:%s\n", + i,!test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev,b)); + } +} + + +static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + struct request_queue *q; + int err = -EEXIST; + int path; + struct multipath_info *p; + int first = 0; + int last = mddev->raid_disks - 1; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + print_multipath_conf(conf); + + for (path = first; path <= last; path++) + if ((p=conf->multipaths+path)->rdev == NULL) { + q = rdev->bdev->bd_disk->queue; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + /* as we don't honour merge_bvec_fn, we must never risk + * violating it, so limit ->max_segments to one, lying + * within a single page. + * (Note: it is very unlikely that a device with + * merge_bvec_fn will be involved in multipath.) + */ + if (q->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } + + spin_lock_irq(&conf->device_lock); + mddev->degraded--; + rdev->raid_disk = path; + set_bit(In_sync, &rdev->flags); + spin_unlock_irq(&conf->device_lock); + rcu_assign_pointer(p->rdev, rdev); + err = 0; + md_integrity_add_rdev(rdev, mddev); + break; + } + + print_multipath_conf(conf); + + return err; +} + +static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct multipath_info *p = conf->multipaths + number; + + print_multipath_conf(conf); + + if (rdev == p->rdev) { + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + printk(KERN_ERR "hot-remove-disk, slot %d is identified" + " but is still operational!\n", number); + err = -EBUSY; + goto abort; + } + p->rdev = NULL; + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + err = md_integrity_register(mddev); + } +abort: + + print_multipath_conf(conf); + return err; +} + + + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working multipaths. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. 
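+ *
+ * (Clarifying note: this personality has no sync_request method, so
+ * in practice the main job here is item 1 - re-issuing a failed
+ * request on another working path chosen by multipath_map(); any
+ * superblock updates happen through the md_check_recovery() call at
+ * the top of the function.)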
+ */ + +static void multipathd (struct mddev *mddev) +{ + struct multipath_bh *mp_bh; + struct bio *bio; + unsigned long flags; + struct mpconf *conf = mddev->private; + struct list_head *head = &conf->retry_list; + + md_check_recovery(mddev); + for (;;) { + char b[BDEVNAME_SIZE]; + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) + break; + mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); + list_del(head->prev); + spin_unlock_irqrestore(&conf->device_lock, flags); + + bio = &mp_bh->bio; + bio->bi_sector = mp_bh->master_bio->bi_sector; + + if ((mp_bh->path = multipath_map (conf))<0) { + printk(KERN_ALERT "multipath: %s: unrecoverable IO read" + " error for block %llu\n", + bdevname(bio->bi_bdev,b), + (unsigned long long)bio->bi_sector); + multipath_end_bh_io(mp_bh, -EIO); + } else { + printk(KERN_ERR "multipath: %s: redirecting sector %llu" + " to another IO path\n", + bdevname(bio->bi_bdev,b), + (unsigned long long)bio->bi_sector); + *bio = *(mp_bh->master_bio); + bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; + bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; + bio->bi_rw |= REQ_FAILFAST_TRANSPORT; + bio->bi_end_io = multipath_end_request; + bio->bi_private = mp_bh; + generic_make_request(bio); + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return mddev->dev_sectors; +} + +static int multipath_run (struct mddev *mddev) +{ + struct mpconf *conf; + int disk_idx; + struct multipath_info *disk; + struct md_rdev *rdev; + int working_disks; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + if (mddev->level != LEVEL_MULTIPATH) { + printk("multipath: %s: raid level not set to multipath IO (%d)\n", + mdname(mddev), mddev->level); + goto out; + } + /* + * copy the already verified devices into our private MULTIPATH + * bookkeeping area. 
[whatever we allocate in multipath_run(), + * should be freed in multipath_stop()] + */ + + conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); + mddev->private = conf; + if (!conf) { + printk(KERN_ERR + "multipath: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out; + } + + conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, + GFP_KERNEL); + if (!conf->multipaths) { + printk(KERN_ERR + "multipath: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out_free_conf; + } + + working_disks = 0; + rdev_for_each(rdev, mddev) { + disk_idx = rdev->raid_disk; + if (disk_idx < 0 || + disk_idx >= mddev->raid_disks) + continue; + + disk = conf->multipaths + disk_idx; + disk->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + /* as we don't honour merge_bvec_fn, we must never risk + * violating it, not that we ever expect a device with + * a merge_bvec_fn to be involved in multipath */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } + + if (!test_bit(Faulty, &rdev->flags)) + working_disks++; + } + + conf->raid_disks = mddev->raid_disks; + conf->mddev = mddev; + spin_lock_init(&conf->device_lock); + INIT_LIST_HEAD(&conf->retry_list); + + if (!working_disks) { + printk(KERN_ERR "multipath: no operational IO paths for %s\n", + mdname(mddev)); + goto out_free_conf; + } + mddev->degraded = conf->raid_disks - working_disks; + + conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, + sizeof(struct multipath_bh)); + if (conf->pool == NULL) { + printk(KERN_ERR + "multipath: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out_free_conf; + } + + { + mddev->thread = md_register_thread(multipathd, mddev, NULL); + if (!mddev->thread) { + printk(KERN_ERR "multipath: couldn't allocate thread" + " for %s\n", mdname(mddev)); + goto out_free_conf; + } + } + + printk(KERN_INFO + "multipath: array %s active with %d out of %d IO paths\n", + mdname(mddev), conf->raid_disks - mddev->degraded, + mddev->raid_disks); + /* + * Ok, everything is just fine now + */ + md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); + + mddev->queue->backing_dev_info.congested_fn = multipath_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + + if (md_integrity_register(mddev)) + goto out_free_conf; + + return 0; + +out_free_conf: + if (conf->pool) + mempool_destroy(conf->pool); + kfree(conf->multipaths); + kfree(conf); + mddev->private = NULL; +out: + return -EIO; +} + + +static int multipath_stop (struct mddev *mddev) +{ + struct mpconf *conf = mddev->private; + + md_unregister_thread(&mddev->thread); + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + mempool_destroy(conf->pool); + kfree(conf->multipaths); + kfree(conf); + mddev->private = NULL; + return 0; +} + +static struct md_personality multipath_personality = +{ + .name = "multipath", + .level = LEVEL_MULTIPATH, + .owner = THIS_MODULE, + .make_request = multipath_make_request, + .run = multipath_run, + .stop = multipath_stop, + .status = multipath_status, + .error_handler = multipath_error, + .hot_add_disk = multipath_add_disk, + .hot_remove_disk= multipath_remove_disk, + .size = multipath_size, +}; + +static int __init multipath_init (void) +{ + return register_md_personality (&multipath_personality); +} + +static void __exit multipath_exit (void) +{ + unregister_md_personality (&multipath_personality); +} + 
+module_init(multipath_init); +module_exit(multipath_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("simple multi-path personality for MD"); +MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ +MODULE_ALIAS("md-multipath"); +MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h new file mode 100644 index 00000000..717c60f6 --- /dev/null +++ b/drivers/md/multipath.h @@ -0,0 +1,31 @@ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +struct multipath_info { + struct md_rdev *rdev; +}; + +struct mpconf { + struct mddev *mddev; + struct multipath_info *multipaths; + int raid_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t *pool; +}; + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + struct mddev *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig new file mode 100644 index 00000000..ceb35905 --- /dev/null +++ b/drivers/md/persistent-data/Kconfig @@ -0,0 +1,8 @@ +config DM_PERSISTENT_DATA + tristate + depends on BLK_DEV_DM && EXPERIMENTAL + select LIBCRC32C + select DM_BUFIO + ---help--- + Library providing immutable on-disk data structure support for + device-mapper targets such as the thin provisioning target. diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile new file mode 100644 index 00000000..cfa95f66 --- /dev/null +++ b/drivers/md/persistent-data/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o +dm-persistent-data-objs := \ + dm-block-manager.o \ + dm-space-map-checker.o \ + dm-space-map-common.o \ + dm-space-map-disk.o \ + dm-space-map-metadata.o \ + dm-transaction-manager.o \ + dm-btree.o \ + dm-btree-remove.o \ + dm-btree-spine.o diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c new file mode 100644 index 00000000..0317ecdc --- /dev/null +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -0,0 +1,620 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#include "dm-block-manager.h" +#include "dm-persistent-data-internal.h" +#include "../dm-bufio.h" + +#include <linux/crc32c.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/device-mapper.h> +#include <linux/stacktrace.h> + +#define DM_MSG_PREFIX "block manager" + +/*----------------------------------------------------------------*/ + +/* + * This is a read/write semaphore with a couple of differences. + * + * i) There is a restriction on the number of concurrent read locks that + * may be held at once. This is just an implementation detail. + * + * ii) Recursive locking attempts are detected and return EINVAL. A stack + * trace is also emitted for the previous lock aquisition. + * + * iii) Priority is given to write locks. 
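+ *
+ * Behavioural sketch (illustrative): bl_down_read() succeeds while
+ * fewer than MAX_HOLDERS readers hold the lock and no one is waiting;
+ * bl_down_write() waits for the count to drop to zero and then sets
+ * it to -1; waiting writers are queued at the head of the waiter
+ * list, which is what gives write locks their priority.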
+ */ +#define MAX_HOLDERS 4 +#define MAX_STACK 10 + +typedef unsigned long stack_entries[MAX_STACK]; + +struct block_lock { + spinlock_t lock; + __s32 count; + struct list_head waiters; + struct task_struct *holders[MAX_HOLDERS]; + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + struct stack_trace traces[MAX_HOLDERS]; + stack_entries entries[MAX_HOLDERS]; +#endif +}; + +struct waiter { + struct list_head list; + struct task_struct *task; + int wants_write; +}; + +static unsigned __find_holder(struct block_lock *lock, + struct task_struct *task) +{ + unsigned i; + + for (i = 0; i < MAX_HOLDERS; i++) + if (lock->holders[i] == task) + break; + + BUG_ON(i == MAX_HOLDERS); + return i; +} + +/* call this *after* you increment lock->count */ +static void __add_holder(struct block_lock *lock, struct task_struct *task) +{ + unsigned h = __find_holder(lock, NULL); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + struct stack_trace *t; +#endif + + get_task_struct(task); + lock->holders[h] = task; + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + t = lock->traces + h; + t->nr_entries = 0; + t->max_entries = MAX_STACK; + t->entries = lock->entries[h]; + t->skip = 2; + save_stack_trace(t); +#endif +} + +/* call this *before* you decrement lock->count */ +static void __del_holder(struct block_lock *lock, struct task_struct *task) +{ + unsigned h = __find_holder(lock, task); + lock->holders[h] = NULL; + put_task_struct(task); +} + +static int __check_holder(struct block_lock *lock) +{ + unsigned i; +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + static struct stack_trace t; + static stack_entries entries; +#endif + + for (i = 0; i < MAX_HOLDERS; i++) { + if (lock->holders[i] == current) { + DMERR("recursive lock detected in pool metadata"); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + DMERR("previously held here:"); + print_stack_trace(lock->traces + i, 4); + + DMERR("subsequent aquisition attempted here:"); + t.nr_entries = 0; + t.max_entries = MAX_STACK; + t.entries = entries; + t.skip = 3; + save_stack_trace(&t); + print_stack_trace(&t, 4); +#endif + return -EINVAL; + } + } + + return 0; +} + +static void __wait(struct waiter *w) +{ + for (;;) { + set_task_state(current, TASK_UNINTERRUPTIBLE); + + if (!w->task) + break; + + schedule(); + } + + set_task_state(current, TASK_RUNNING); +} + +static void __wake_waiter(struct waiter *w) +{ + struct task_struct *task; + + list_del(&w->list); + task = w->task; + smp_mb(); + w->task = NULL; + wake_up_process(task); +} + +/* + * We either wake a few readers or a single writer. 
+ */ +static void __wake_many(struct block_lock *lock) +{ + struct waiter *w, *tmp; + + BUG_ON(lock->count < 0); + list_for_each_entry_safe(w, tmp, &lock->waiters, list) { + if (lock->count >= MAX_HOLDERS) + return; + + if (w->wants_write) { + if (lock->count > 0) + return; /* still read locked */ + + lock->count = -1; + __add_holder(lock, w->task); + __wake_waiter(w); + return; + } + + lock->count++; + __add_holder(lock, w->task); + __wake_waiter(w); + } +} + +static void bl_init(struct block_lock *lock) +{ + int i; + + spin_lock_init(&lock->lock); + lock->count = 0; + INIT_LIST_HEAD(&lock->waiters); + for (i = 0; i < MAX_HOLDERS; i++) + lock->holders[i] = NULL; +} + +static int __available_for_read(struct block_lock *lock) +{ + return lock->count >= 0 && + lock->count < MAX_HOLDERS && + list_empty(&lock->waiters); +} + +static int bl_down_read(struct block_lock *lock) +{ + int r; + struct waiter w; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) { + spin_unlock(&lock->lock); + return r; + } + + if (__available_for_read(lock)) { + lock->count++; + __add_holder(lock, current); + spin_unlock(&lock->lock); + return 0; + } + + get_task_struct(current); + + w.task = current; + w.wants_write = 0; + list_add_tail(&w.list, &lock->waiters); + spin_unlock(&lock->lock); + + __wait(&w); + put_task_struct(current); + return 0; +} + +static int bl_down_read_nonblock(struct block_lock *lock) +{ + int r; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) + goto out; + + if (__available_for_read(lock)) { + lock->count++; + __add_holder(lock, current); + r = 0; + } else + r = -EWOULDBLOCK; + +out: + spin_unlock(&lock->lock); + return r; +} + +static void bl_up_read(struct block_lock *lock) +{ + spin_lock(&lock->lock); + BUG_ON(lock->count <= 0); + __del_holder(lock, current); + --lock->count; + if (!list_empty(&lock->waiters)) + __wake_many(lock); + spin_unlock(&lock->lock); +} + +static int bl_down_write(struct block_lock *lock) +{ + int r; + struct waiter w; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) { + spin_unlock(&lock->lock); + return r; + } + + if (lock->count == 0 && list_empty(&lock->waiters)) { + lock->count = -1; + __add_holder(lock, current); + spin_unlock(&lock->lock); + return 0; + } + + get_task_struct(current); + w.task = current; + w.wants_write = 1; + + /* + * Writers given priority. We know there's only one mutator in the + * system, so ignoring the ordering reversal. + */ + list_add(&w.list, &lock->waiters); + spin_unlock(&lock->lock); + + __wait(&w); + put_task_struct(current); + + return 0; +} + +static void bl_up_write(struct block_lock *lock) +{ + spin_lock(&lock->lock); + __del_holder(lock, current); + lock->count = 0; + if (!list_empty(&lock->waiters)) + __wake_many(lock); + spin_unlock(&lock->lock); +} + +static void report_recursive_bug(dm_block_t b, int r) +{ + if (r == -EINVAL) + DMERR("recursive acquisition of block %llu requested.", + (unsigned long long) b); +} + +/*----------------------------------------------------------------*/ + +/* + * Block manager is currently implemented using dm-bufio. struct + * dm_block_manager and struct dm_block map directly onto a couple of + * structs in the bufio interface. I want to retain the freedom to move + * away from bufio in the future. So these structs are just cast within + * this .c file, rather than making it through to the public interface. 
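To make the recursive-lock detection above concrete, here is a hypothetical caller-side fragment (bm is assumed to be an existing struct dm_block_manager *; dm_bm_read_lock() and dm_bm_unlock() are the public entry points defined later in this file):

	struct dm_block *first, *second;
	int r;

	r = dm_bm_read_lock(bm, 5, NULL, &first);	/* 0: read lock taken, current recorded as a holder */
	r = dm_bm_read_lock(bm, 5, NULL, &second);	/* -EINVAL: __check_holder() finds current already in
							 * holders[], and report_recursive_bug() logs
							 * "recursive acquisition of block 5 requested." */
	dm_bm_unlock(first);				/* only the first lock is actually held */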
+ */ +static struct dm_buffer *to_buffer(struct dm_block *b) +{ + return (struct dm_buffer *) b; +} + +static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm) +{ + return (struct dm_bufio_client *) bm; +} + +dm_block_t dm_block_location(struct dm_block *b) +{ + return dm_bufio_get_block_number(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_block_location); + +void *dm_block_data(struct dm_block *b) +{ + return dm_bufio_get_block_data(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_block_data); + +struct buffer_aux { + struct dm_block_validator *validator; + struct block_lock lock; + int write_locked; +}; + +static void dm_block_manager_alloc_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + aux->validator = NULL; + bl_init(&aux->lock); +} + +static void dm_block_manager_write_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + if (aux->validator) { + aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf, + dm_bufio_get_block_size(dm_bufio_get_client(buf))); + } +} + +/*---------------------------------------------------------------- + * Public interface + *--------------------------------------------------------------*/ +struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, + unsigned block_size, + unsigned cache_size, + unsigned max_held_per_thread) +{ + return (struct dm_block_manager *) + dm_bufio_client_create(bdev, block_size, max_held_per_thread, + sizeof(struct buffer_aux), + dm_block_manager_alloc_callback, + dm_block_manager_write_callback); +} +EXPORT_SYMBOL_GPL(dm_block_manager_create); + +void dm_block_manager_destroy(struct dm_block_manager *bm) +{ + return dm_bufio_client_destroy(to_bufio(bm)); +} +EXPORT_SYMBOL_GPL(dm_block_manager_destroy); + +unsigned dm_bm_block_size(struct dm_block_manager *bm) +{ + return dm_bufio_get_block_size(to_bufio(bm)); +} +EXPORT_SYMBOL_GPL(dm_bm_block_size); + +dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) +{ + return dm_bufio_get_device_size(to_bufio(bm)); +} + +static int dm_bm_validate_buffer(struct dm_block_manager *bm, + struct dm_buffer *buf, + struct buffer_aux *aux, + struct dm_block_validator *v) +{ + if (unlikely(!aux->validator)) { + int r; + if (!v) + return 0; + r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm))); + if (unlikely(r)) + return r; + aux->validator = v; + } else { + if (unlikely(aux->validator != v)) { + DMERR("validator mismatch (old=%s vs new=%s) for block %llu", + aux->validator->name, v ? 
v->name : "NULL", + (unsigned long long) + dm_bufio_get_block_number(buf)); + return -EINVAL; + } + } + + return 0; +} +int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_read(&aux->lock); + if (unlikely(r)) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + + aux->write_locked = 0; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_read_lock); + +int dm_bm_write_lock(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); + if (r) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + + aux->write_locked = 1; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_write(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_write_lock); + +int dm_bm_read_try_lock(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + if (unlikely(!p)) + return -EWOULDBLOCK; + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_read_nonblock(&aux->lock); + if (r < 0) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + aux->write_locked = 0; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} + +int dm_bm_write_lock_zero(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + struct buffer_aux *aux; + void *p; + + p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + memset(p, 0, dm_bm_block_size(bm)); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); + if (r) { + dm_bufio_release(to_buffer(*result)); + return r; + } + + aux->write_locked = 1; + aux->validator = v; + + return 0; +} + +int dm_bm_unlock(struct dm_block *b) +{ + struct buffer_aux *aux; + aux = dm_bufio_get_aux_data(to_buffer(b)); + + if (aux->write_locked) { + dm_bufio_mark_buffer_dirty(to_buffer(b)); + bl_up_write(&aux->lock); + } else + bl_up_read(&aux->lock); + + dm_bufio_release(to_buffer(b)); + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_unlock); + +int dm_bm_unlock_move(struct dm_block *b, dm_block_t n) +{ + struct buffer_aux *aux; + + aux = dm_bufio_get_aux_data(to_buffer(b)); + + if (aux->write_locked) { + dm_bufio_mark_buffer_dirty(to_buffer(b)); + bl_up_write(&aux->lock); + } else + bl_up_read(&aux->lock); + + dm_bufio_release_move(to_buffer(b), n); + return 0; +} + +int 
dm_bm_flush_and_unlock(struct dm_block_manager *bm, + struct dm_block *superblock) +{ + int r; + + r = dm_bufio_write_dirty_buffers(to_bufio(bm)); + if (unlikely(r)) + return r; + r = dm_bufio_issue_flush(to_bufio(bm)); + if (unlikely(r)) + return r; + + dm_bm_unlock(superblock); + + r = dm_bufio_write_dirty_buffers(to_bufio(bm)); + if (unlikely(r)) + return r; + r = dm_bufio_issue_flush(to_bufio(bm)); + if (unlikely(r)) + return r; + + return 0; +} + +u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) +{ + return crc32c(~(u32) 0, data, len) ^ init_xor; +} +EXPORT_SYMBOL_GPL(dm_bm_checksum); + +/*----------------------------------------------------------------*/ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_DESCRIPTION("Immutable metadata library for dm"); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h new file mode 100644 index 00000000..924833d2 --- /dev/null +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_BLOCK_MANAGER_H +#define _LINUX_DM_BLOCK_MANAGER_H + +#include <linux/types.h> +#include <linux/blkdev.h> + +/*----------------------------------------------------------------*/ + +/* + * Block number. + */ +typedef uint64_t dm_block_t; +struct dm_block; + +dm_block_t dm_block_location(struct dm_block *b); +void *dm_block_data(struct dm_block *b); + +/*----------------------------------------------------------------*/ + +/* + * @name should be a unique identifier for the block manager, no longer + * than 32 chars. + * + * @max_held_per_thread should be the maximum number of locks, read or + * write, that an individual thread holds at any one time. + */ +struct dm_block_manager; +struct dm_block_manager *dm_block_manager_create( + struct block_device *bdev, unsigned block_size, + unsigned cache_size, unsigned max_held_per_thread); +void dm_block_manager_destroy(struct dm_block_manager *bm); + +unsigned dm_bm_block_size(struct dm_block_manager *bm); +dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm); + +/*----------------------------------------------------------------*/ + +/* + * The validator allows the caller to verify newly-read data and modify + * the data just before writing, e.g. to calculate checksums. It's + * important to be consistent with your use of validators. The only time + * you can change validators is if you call dm_bm_write_lock_zero. + */ +struct dm_block_validator { + const char *name; + void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); + + /* + * Return 0 if the checksum is valid or < 0 on error. + */ + int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); +}; + +/*----------------------------------------------------------------*/ + +/* + * You can have multiple concurrent readers or a single writer holding a + * block lock. + */ + +/* + * dm_bm_lock() locks a block and returns through @result a pointer to + * memory that holds a copy of that block. If you have write-locked the + * block then any changes you make to memory pointed to by @result will be + * written back to the disk sometime after dm_bm_unlock is called. 
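A rough usage sketch of the locking calls declared just below (bm, blk_nr, my_validator, payload and payload_len are assumed to exist; error handling trimmed):

	struct dm_block *b;
	int r;

	r = dm_bm_read_lock(bm, blk_nr, my_validator, &b);	/* read + validate + shared lock */
	if (!r) {
		const void *data = dm_block_data(b);		/* stable while the lock is held */
		/* ... inspect data ... */
		dm_bm_unlock(b);
	}

	r = dm_bm_write_lock_zero(bm, blk_nr, my_validator, &b); /* no disk read, data starts zeroed */
	if (!r) {
		memcpy(dm_block_data(b), payload, payload_len);	/* reaches disk some time after unlock */
		dm_bm_unlock(b);				/* marks the buffer dirty and drops the lock */
	}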
+ */ +int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * The *_try_lock variants return -EWOULDBLOCK if the block isn't + * available immediately. + */ +int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * Use dm_bm_write_lock_zero() when you know you're going to + * overwrite the block completely. It saves a disk read. + */ +int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +int dm_bm_unlock(struct dm_block *b); + +/* + * An optimisation; we often want to copy a block's contents to a new + * block. eg, as part of the shadowing operation. It's far better for + * bufio to do this move behind the scenes than hold 2 locks and memcpy the + * data. + */ +int dm_bm_unlock_move(struct dm_block *b, dm_block_t n); + +/* + * It's a common idiom to have a superblock that should be committed last. + * + * @superblock should be write-locked on entry. It will be unlocked during + * this function. All dirty blocks are guaranteed to be written and flushed + * before the superblock. + * + * This method always blocks. + */ +int dm_bm_flush_and_unlock(struct dm_block_manager *bm, + struct dm_block *superblock); + +u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_BLOCK_MANAGER_H */ diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h new file mode 100644 index 00000000..5709bfea --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_BTREE_INTERNAL_H +#define DM_BTREE_INTERNAL_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * We'll need 2 accessor functions for n->csum and n->blocknr + * to support dm-btree-spine.c in that case. + */ + +enum node_flags { + INTERNAL_NODE = 1, + LEAF_NODE = 1 << 1 +}; + +/* + * Every btree node begins with this structure. Make sure it's a multiple + * of 8-bytes in size, otherwise the 64bit keys will be mis-aligned. + */ +struct node_header { + __le32 csum; + __le32 flags; + __le64 blocknr; /* Block this node is supposed to live in. */ + + __le32 nr_entries; + __le32 max_entries; + __le32 value_size; + __le32 padding; +} __packed; + +struct node { + struct node_header header; + __le64 keys[0]; +} __packed; + + +void inc_children(struct dm_transaction_manager *tm, struct node *n, + struct dm_btree_value_type *vt); + +int new_block(struct dm_btree_info *info, struct dm_block **result); +int unlock_block(struct dm_btree_info *info, struct dm_block *b); + +/* + * Spines keep track of the rolling locks. There are 2 variants, read-only + * and one that uses shadowing. These are separate structs to allow the + * type checker to spot misuse, for example accidentally calling read_lock + * on a shadow spine. 
+ */ +struct ro_spine { + struct dm_btree_info *info; + + int count; + struct dm_block *nodes[2]; +}; + +void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); +int exit_ro_spine(struct ro_spine *s); +int ro_step(struct ro_spine *s, dm_block_t new_child); +struct node *ro_node(struct ro_spine *s); + +struct shadow_spine { + struct dm_btree_info *info; + + int count; + struct dm_block *nodes[2]; + + dm_block_t root; +}; + +void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info); +int exit_shadow_spine(struct shadow_spine *s); + +int shadow_step(struct shadow_spine *s, dm_block_t b, + struct dm_btree_value_type *vt); + +/* + * The spine must have at least one entry before calling this. + */ +struct dm_block *shadow_current(struct shadow_spine *s); + +/* + * The spine must have at least two entries before calling this. + */ +struct dm_block *shadow_parent(struct shadow_spine *s); + +int shadow_has_parent(struct shadow_spine *s); + +int shadow_root(struct shadow_spine *s); + +/* + * Some inlines. + */ +static inline __le64 *key_ptr(struct node *n, uint32_t index) +{ + return n->keys + index; +} + +static inline void *value_base(struct node *n) +{ + return &n->keys[le32_to_cpu(n->header.max_entries)]; +} + +static inline void *value_ptr(struct node *n, uint32_t index) +{ + uint32_t value_size = le32_to_cpu(n->header.value_size); + return value_base(n) + (value_size * index); +} + +/* + * Assumes the values are suitably-aligned and converts to core format. + */ +static inline uint64_t value64(struct node *n, uint32_t index) +{ + __le64 *values_le = value_base(n); + + return le64_to_cpu(values_le[index]); +} + +/* + * Searching for a key within a single node. + */ +int lower_bound(struct node *n, uint64_t key); + +extern struct dm_block_validator btree_node_validator; + +#endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c new file mode 100644 index 00000000..aa71e235 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -0,0 +1,590 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-btree.h" +#include "dm-btree-internal.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> + +/* + * Removing an entry from a btree + * ============================== + * + * A very important constraint for our btree is that no node, except the + * root, may have fewer than a certain number of entries. + * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). + * + * Ensuring this is complicated by the way we want to only ever hold the + * locks on 2 nodes concurrently, and only change nodes in a top to bottom + * fashion. + * + * Each node may have a left or right sibling. When decending the spine, + * if a node contains only MIN_ENTRIES then we try and increase this to at + * least MIN_ENTRIES + 1. We do this in the following ways: + * + * [A] No siblings => this can only happen if the node is the root, in which + * case we copy the childs contents over the root. 
+ * + * [B] No left sibling + * ==> rebalance(node, right sibling) + * + * [C] No right sibling + * ==> rebalance(left sibling, node) + * + * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD + * ==> delete node adding it's contents to left and right + * + * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD + * ==> rebalance(left, node, right) + * + * After these operations it's possible that the our original node no + * longer contains the desired sub tree. For this reason this rebalancing + * is performed on the children of the current node. This also avoids + * having a special case for the root. + * + * Once this rebalancing has occurred we can then step into the child node + * for internal nodes. Or delete the entry for leaf nodes. + */ + +/* + * Some little utilities for moving node data around. + */ +static void node_shift(struct node *n, int shift) +{ + uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); + uint32_t value_size = le32_to_cpu(n->header.value_size); + + if (shift < 0) { + shift = -shift; + BUG_ON(shift > nr_entries); + BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); + memmove(key_ptr(n, 0), + key_ptr(n, shift), + (nr_entries - shift) * sizeof(__le64)); + memmove(value_ptr(n, 0), + value_ptr(n, shift), + (nr_entries - shift) * value_size); + } else { + BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); + memmove(key_ptr(n, shift), + key_ptr(n, 0), + nr_entries * sizeof(__le64)); + memmove(value_ptr(n, shift), + value_ptr(n, 0), + nr_entries * value_size); + } +} + +static void node_copy(struct node *left, struct node *right, int shift) +{ + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t value_size = le32_to_cpu(left->header.value_size); + BUG_ON(value_size != le32_to_cpu(right->header.value_size)); + + if (shift < 0) { + shift = -shift; + BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries)); + memcpy(key_ptr(left, nr_left), + key_ptr(right, 0), + shift * sizeof(__le64)); + memcpy(value_ptr(left, nr_left), + value_ptr(right, 0), + shift * value_size); + } else { + BUG_ON(shift > le32_to_cpu(right->header.max_entries)); + memcpy(key_ptr(right, 0), + key_ptr(left, nr_left - shift), + shift * sizeof(__le64)); + memcpy(value_ptr(right, 0), + value_ptr(left, nr_left - shift), + shift * value_size); + } +} + +/* + * Delete a specific entry from a leaf node. 
+ */ +static void delete_at(struct node *n, unsigned index) +{ + unsigned nr_entries = le32_to_cpu(n->header.nr_entries); + unsigned nr_to_copy = nr_entries - (index + 1); + uint32_t value_size = le32_to_cpu(n->header.value_size); + BUG_ON(index >= nr_entries); + + if (nr_to_copy) { + memmove(key_ptr(n, index), + key_ptr(n, index + 1), + nr_to_copy * sizeof(__le64)); + + memmove(value_ptr(n, index), + value_ptr(n, index + 1), + nr_to_copy * value_size); + } + + n->header.nr_entries = cpu_to_le32(nr_entries - 1); +} + +static unsigned merge_threshold(struct node *n) +{ + return le32_to_cpu(n->header.max_entries) / 3; +} + +struct child { + unsigned index; + struct dm_block *block; + struct node *n; +}; + +static struct dm_btree_value_type le64_type = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL +}; + +static int init_child(struct dm_btree_info *info, struct node *parent, + unsigned index, struct child *result) +{ + int r, inc; + dm_block_t root; + + result->index = index; + root = value64(parent, index); + + r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, + &result->block, &inc); + if (r) + return r; + + result->n = dm_block_data(result->block); + + if (inc) + inc_children(info->tm, result->n, &le64_type); + + *((__le64 *) value_ptr(parent, index)) = + cpu_to_le64(dm_block_location(result->block)); + + return 0; +} + +static int exit_child(struct dm_btree_info *info, struct child *c) +{ + return dm_tm_unlock(info->tm, c->block); +} + +static void shift(struct node *left, struct node *right, int count) +{ + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); + + BUG_ON(max_entries != r_max_entries); + BUG_ON(nr_left - count > max_entries); + BUG_ON(nr_right + count > max_entries); + + if (!count) + return; + + if (count > 0) { + node_shift(right, count); + node_copy(left, right, count); + } else { + node_copy(left, right, count); + node_shift(right, count); + } + + left->header.nr_entries = cpu_to_le32(nr_left - count); + right->header.nr_entries = cpu_to_le32(nr_right + count); +} + +static void __rebalance2(struct dm_btree_info *info, struct node *parent, + struct child *l, struct child *r) +{ + struct node *left = l->n; + struct node *right = r->n; + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + unsigned threshold = 2 * merge_threshold(left) + 1; + + if (nr_left + nr_right < threshold) { + /* + * Merge + */ + node_copy(left, right, -nr_right); + left->header.nr_entries = cpu_to_le32(nr_left + nr_right); + delete_at(parent, r->index); + + /* + * We need to decrement the right block, but not it's + * children, since they're still referenced by left. + */ + dm_tm_dec(info->tm, dm_block_location(r->block)); + } else { + /* + * Rebalance. 
+ */ + unsigned target_left = (nr_left + nr_right) / 2; + shift(left, right, nr_left - target_left); + *key_ptr(parent, r->index) = right->keys[0]; + } +} + +static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, + unsigned left_index) +{ + int r; + struct node *parent; + struct child left, right; + + parent = dm_block_data(shadow_current(s)); + + r = init_child(info, parent, left_index, &left); + if (r) + return r; + + r = init_child(info, parent, left_index + 1, &right); + if (r) { + exit_child(info, &left); + return r; + } + + __rebalance2(info, parent, &left, &right); + + r = exit_child(info, &left); + if (r) { + exit_child(info, &right); + return r; + } + + return exit_child(info, &right); +} + +/* + * We dump as many entries from center as possible into left, then the rest + * in right, then rebalance2. This wastes some cpu, but I want something + * simple atm. + */ +static void delete_center_node(struct dm_btree_info *info, struct node *parent, + struct child *l, struct child *c, struct child *r, + struct node *left, struct node *center, struct node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned shift = min(max_entries - nr_left, nr_center); + + BUG_ON(nr_left + shift > max_entries); + node_copy(left, center, -shift); + left->header.nr_entries = cpu_to_le32(nr_left + shift); + + if (shift != nr_center) { + shift = nr_center - shift; + BUG_ON((nr_right + shift) > max_entries); + node_shift(right, shift); + node_copy(center, right, shift); + right->header.nr_entries = cpu_to_le32(nr_right + shift); + } + *key_ptr(parent, r->index) = right->keys[0]; + + delete_at(parent, c->index); + r->index--; + + dm_tm_dec(info->tm, dm_block_location(c->block)); + __rebalance2(info, parent, l, r); +} + +/* + * Redistributes entries among 3 sibling nodes. 
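As a purely illustrative trace of the function that follows: with entry counts (nr_left, nr_center, nr_right) = (2, 2, 8) the target is 12 / 3 = 4; since nr_left < nr_right, two entries are first pulled from the front of the centre node into the left node (leaving the centre empty), then four entries are pulled from the front of the right node into the centre, giving counts of (4, 4, 4), after which the parent's separator keys are refreshed from keys[0] of the centre and right children.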
+ */ +static void redistribute3(struct dm_btree_info *info, struct node *parent, + struct child *l, struct child *c, struct child *r, + struct node *left, struct node *center, struct node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + int s; + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned target = (nr_left + nr_center + nr_right) / 3; + BUG_ON(target > max_entries); + + if (nr_left < nr_right) { + s = nr_left - target; + + if (s < 0 && nr_center < -s) { + /* not enough in central node */ + shift(left, center, nr_center); + s = nr_center - target; + shift(left, right, s); + nr_right += s; + } else + shift(left, center, s); + + shift(center, right, target - nr_right); + + } else { + s = target - nr_right; + if (s > 0 && nr_center < s) { + /* not enough in central node */ + shift(center, right, nr_center); + s = target - nr_center; + shift(left, right, s); + nr_left -= s; + } else + shift(center, right, s); + + shift(left, center, nr_left - target); + } + + *key_ptr(parent, c->index) = center->keys[0]; + *key_ptr(parent, r->index) = right->keys[0]; +} + +static void __rebalance3(struct dm_btree_info *info, struct node *parent, + struct child *l, struct child *c, struct child *r) +{ + struct node *left = l->n; + struct node *center = c->n; + struct node *right = r->n; + + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_center = le32_to_cpu(center->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + + unsigned threshold = merge_threshold(left) * 4 + 1; + + BUG_ON(left->header.max_entries != center->header.max_entries); + BUG_ON(center->header.max_entries != right->header.max_entries); + + if ((nr_left + nr_center + nr_right) < threshold) + delete_center_node(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); + else + redistribute3(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); +} + +static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, + unsigned left_index) +{ + int r; + struct node *parent = dm_block_data(shadow_current(s)); + struct child left, center, right; + + /* + * FIXME: fill out an array? 
+ */ + r = init_child(info, parent, left_index, &left); + if (r) + return r; + + r = init_child(info, parent, left_index + 1, ¢er); + if (r) { + exit_child(info, &left); + return r; + } + + r = init_child(info, parent, left_index + 2, &right); + if (r) { + exit_child(info, &left); + exit_child(info, ¢er); + return r; + } + + __rebalance3(info, parent, &left, ¢er, &right); + + r = exit_child(info, &left); + if (r) { + exit_child(info, ¢er); + exit_child(info, &right); + return r; + } + + r = exit_child(info, ¢er); + if (r) { + exit_child(info, &right); + return r; + } + + r = exit_child(info, &right); + if (r) + return r; + + return 0; +} + +static int get_nr_entries(struct dm_transaction_manager *tm, + dm_block_t b, uint32_t *result) +{ + int r; + struct dm_block *block; + struct node *n; + + r = dm_tm_read_lock(tm, b, &btree_node_validator, &block); + if (r) + return r; + + n = dm_block_data(block); + *result = le32_to_cpu(n->header.nr_entries); + + return dm_tm_unlock(tm, block); +} + +static int rebalance_children(struct shadow_spine *s, + struct dm_btree_info *info, uint64_t key) +{ + int i, r, has_left_sibling, has_right_sibling; + uint32_t child_entries; + struct node *n; + + n = dm_block_data(shadow_current(s)); + + if (le32_to_cpu(n->header.nr_entries) == 1) { + struct dm_block *child; + dm_block_t b = value64(n, 0); + + r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); + if (r) + return r; + + memcpy(n, dm_block_data(child), + dm_bm_block_size(dm_tm_get_bm(info->tm))); + r = dm_tm_unlock(info->tm, child); + if (r) + return r; + + dm_tm_dec(info->tm, dm_block_location(child)); + return 0; + } + + i = lower_bound(n, key); + if (i < 0) + return -ENODATA; + + r = get_nr_entries(info->tm, value64(n, i), &child_entries); + if (r) + return r; + + has_left_sibling = i > 0; + has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); + + if (!has_left_sibling) + r = rebalance2(s, info, i); + + else if (!has_right_sibling) + r = rebalance2(s, info, i - 1); + + else + r = rebalance3(s, info, i - 1); + + return r; +} + +static int do_leaf(struct node *n, uint64_t key, unsigned *index) +{ + int i = lower_bound(n, key); + + if ((i < 0) || + (i >= le32_to_cpu(n->header.nr_entries)) || + (le64_to_cpu(n->keys[i]) != key)) + return -ENODATA; + + *index = i; + + return 0; +} + +/* + * Prepares for removal from one level of the hierarchy. The caller must + * call delete_at() to remove the entry at index. + */ +static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, + struct dm_btree_value_type *vt, dm_block_t root, + uint64_t key, unsigned *index) +{ + int i = *index, r; + struct node *n; + + for (;;) { + r = shadow_step(s, root, vt); + if (r < 0) + break; + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. 
+ */ + if (shadow_has_parent(s)) { + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + n = dm_block_data(shadow_current(s)); + + if (le32_to_cpu(n->header.flags) & LEAF_NODE) + return do_leaf(n, key, index); + + r = rebalance_children(s, info, key); + if (r) + break; + + n = dm_block_data(shadow_current(s)); + if (le32_to_cpu(n->header.flags) & LEAF_NODE) + return do_leaf(n, key, index); + + i = lower_bound(n, key); + + /* + * We know the key is present, or else + * rebalance_children would have returned + * -ENODATA + */ + root = value64(n, i); + } + + return r; +} + +int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, dm_block_t *new_root) +{ + unsigned level, last_level = info->levels - 1; + int index = 0, r = 0; + struct shadow_spine spine; + struct node *n; + + init_shadow_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + r = remove_raw(&spine, info, + (level == last_level ? + &info->value_type : &le64_type), + root, keys[level], (unsigned *)&index); + if (r < 0) + break; + + n = dm_block_data(shadow_current(&spine)); + if (level != last_level) { + root = value64(n, index); + continue; + } + + BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); + + if (info->value_type.dec) + info->value_type.dec(info->value_type.context, + value_ptr(n, index)); + + delete_at(n, index); + } + + *new_root = shadow_root(&spine); + exit_shadow_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_remove); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c new file mode 100644 index 00000000..d9a7912e --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#include "dm-btree-internal.h" +#include "dm-transaction-manager.h" + +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "btree spine" + +/*----------------------------------------------------------------*/ + +#define BTREE_CSUM_XOR 121107 + +static int node_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size); + +static void node_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct node *n = dm_block_data(b); + struct node_header *h = &n->header; + + h->blocknr = cpu_to_le64(dm_block_location(b)); + h->csum = cpu_to_le32(dm_bm_checksum(&h->flags, + block_size - sizeof(__le32), + BTREE_CSUM_XOR)); + + BUG_ON(node_check(v, b, 4096)); +} + +static int node_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct node *n = dm_block_data(b); + struct node_header *h = &n->header; + size_t value_size; + __le32 csum_disk; + uint32_t flags; + + if (dm_block_location(b) != le64_to_cpu(h->blocknr)) { + DMERR("node_check failed blocknr %llu wanted %llu", + le64_to_cpu(h->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags, + block_size - sizeof(__le32), + BTREE_CSUM_XOR)); + if (csum_disk != h->csum) { + DMERR("node_check failed csum %u wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); + return -EILSEQ; + } + + value_size = le32_to_cpu(h->value_size); + + if (sizeof(struct node_header) + + (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) { + DMERR("node_check failed: max_entries too large"); + return -EILSEQ; + } + + if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) { + DMERR("node_check failed, too many entries"); + return -EILSEQ; + } + + /* + * The node must be either INTERNAL or LEAF. 
+ */ + flags = le32_to_cpu(h->flags); + if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) { + DMERR("node_check failed, node is neither INTERNAL or LEAF"); + return -EILSEQ; + } + + return 0; +} + +struct dm_block_validator btree_node_validator = { + .name = "btree_node", + .prepare_for_write = node_prepare_for_write, + .check = node_check +}; + +/*----------------------------------------------------------------*/ + +static int bn_read_lock(struct dm_btree_info *info, dm_block_t b, + struct dm_block **result) +{ + return dm_tm_read_lock(info->tm, b, &btree_node_validator, result); +} + +static int bn_shadow(struct dm_btree_info *info, dm_block_t orig, + struct dm_btree_value_type *vt, + struct dm_block **result) +{ + int r, inc; + + r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator, + result, &inc); + if (!r && inc) + inc_children(info->tm, dm_block_data(*result), vt); + + return r; +} + +int new_block(struct dm_btree_info *info, struct dm_block **result) +{ + return dm_tm_new_block(info->tm, &btree_node_validator, result); +} + +int unlock_block(struct dm_btree_info *info, struct dm_block *b) +{ + return dm_tm_unlock(info->tm, b); +} + +/*----------------------------------------------------------------*/ + +void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info) +{ + s->info = info; + s->count = 0; + s->nodes[0] = NULL; + s->nodes[1] = NULL; +} + +int exit_ro_spine(struct ro_spine *s) +{ + int r = 0, i; + + for (i = 0; i < s->count; i++) { + int r2 = unlock_block(s->info, s->nodes[i]); + if (r2 < 0) + r = r2; + } + + return r; +} + +int ro_step(struct ro_spine *s, dm_block_t new_child) +{ + int r; + + if (s->count == 2) { + r = unlock_block(s->info, s->nodes[0]); + if (r < 0) + return r; + s->nodes[0] = s->nodes[1]; + s->count--; + } + + r = bn_read_lock(s->info, new_child, s->nodes + s->count); + if (!r) + s->count++; + + return r; +} + +struct node *ro_node(struct ro_spine *s) +{ + struct dm_block *block; + + BUG_ON(!s->count); + block = s->nodes[s->count - 1]; + + return dm_block_data(block); +} + +/*----------------------------------------------------------------*/ + +void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info) +{ + s->info = info; + s->count = 0; +} + +int exit_shadow_spine(struct shadow_spine *s) +{ + int r = 0, i; + + for (i = 0; i < s->count; i++) { + int r2 = unlock_block(s->info, s->nodes[i]); + if (r2 < 0) + r = r2; + } + + return r; +} + +int shadow_step(struct shadow_spine *s, dm_block_t b, + struct dm_btree_value_type *vt) +{ + int r; + + if (s->count == 2) { + r = unlock_block(s->info, s->nodes[0]); + if (r < 0) + return r; + s->nodes[0] = s->nodes[1]; + s->count--; + } + + r = bn_shadow(s->info, b, vt, s->nodes + s->count); + if (!r) { + if (!s->count) + s->root = dm_block_location(s->nodes[0]); + + s->count++; + } + + return r; +} + +struct dm_block *shadow_current(struct shadow_spine *s) +{ + BUG_ON(!s->count); + + return s->nodes[s->count - 1]; +} + +struct dm_block *shadow_parent(struct shadow_spine *s) +{ + BUG_ON(s->count != 2); + + return s->count == 2 ? s->nodes[0] : NULL; +} + +int shadow_has_parent(struct shadow_spine *s) +{ + return s->count >= 2; +} + +int shadow_root(struct shadow_spine *s) +{ + return s->root; +} diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c new file mode 100644 index 00000000..d12b2cc5 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree.c @@ -0,0 +1,804 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. 
+ * + * This file is released under the GPL. + */ + +#include "dm-btree-internal.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "btree" + +/*---------------------------------------------------------------- + * Array manipulation + *--------------------------------------------------------------*/ +static void memcpy_disk(void *dest, const void *src, size_t len) + __dm_written_to_disk(src) +{ + memcpy(dest, src, len); + __dm_unbless_for_disk(src); +} + +static void array_insert(void *base, size_t elt_size, unsigned nr_elts, + unsigned index, void *elt) + __dm_written_to_disk(elt) +{ + if (index < nr_elts) + memmove(base + (elt_size * (index + 1)), + base + (elt_size * index), + (nr_elts - index) * elt_size); + + memcpy_disk(base + (elt_size * index), elt, elt_size); +} + +/*----------------------------------------------------------------*/ + +/* makes the assumption that no two keys are the same. */ +static int bsearch(struct node *n, uint64_t key, int want_hi) +{ + int lo = -1, hi = le32_to_cpu(n->header.nr_entries); + + while (hi - lo > 1) { + int mid = lo + ((hi - lo) / 2); + uint64_t mid_key = le64_to_cpu(n->keys[mid]); + + if (mid_key == key) + return mid; + + if (mid_key < key) + lo = mid; + else + hi = mid; + } + + return want_hi ? hi : lo; +} + +int lower_bound(struct node *n, uint64_t key) +{ + return bsearch(n, key, 0); +} + +void inc_children(struct dm_transaction_manager *tm, struct node *n, + struct dm_btree_value_type *vt) +{ + unsigned i; + uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); + + if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) + for (i = 0; i < nr_entries; i++) + dm_tm_inc(tm, value64(n, i)); + else if (vt->inc) + for (i = 0; i < nr_entries; i++) + vt->inc(vt->context, value_ptr(n, i)); +} + +static int insert_at(size_t value_size, struct node *node, unsigned index, + uint64_t key, void *value) + __dm_written_to_disk(value) +{ + uint32_t nr_entries = le32_to_cpu(node->header.nr_entries); + __le64 key_le = cpu_to_le64(key); + + if (index > nr_entries || + index >= le32_to_cpu(node->header.max_entries)) { + DMERR("too many entries in btree node for insert"); + __dm_unbless_for_disk(value); + return -ENOMEM; + } + + __dm_bless_for_disk(&key_le); + + array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le); + array_insert(value_base(node), value_size, nr_entries, index, value); + node->header.nr_entries = cpu_to_le32(nr_entries + 1); + + return 0; +} + +/*----------------------------------------------------------------*/ + +/* + * We want 3n entries (for some n). This works more nicely for repeated + * insert remove loops than (2n + 1). 
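As a worked example of the arithmetic (illustrative numbers only): with a 4096-byte block, the 32-byte node_header leaves 4064 bytes; with 8-byte values each key/value pair occupies 16 bytes, so 254 pairs fit, and calc_max_entries() below rounds that down to 3 * 84 = 252 entries per node.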
+ */ +static uint32_t calc_max_entries(size_t value_size, size_t block_size) +{ + uint32_t total, n; + size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */ + + block_size -= sizeof(struct node_header); + total = block_size / elt_size; + n = total / 3; /* rounds down */ + + return 3 * n; +} + +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) +{ + int r; + struct dm_block *b; + struct node *n; + size_t block_size; + uint32_t max_entries; + + r = new_block(info, &b); + if (r < 0) + return r; + + block_size = dm_bm_block_size(dm_tm_get_bm(info->tm)); + max_entries = calc_max_entries(info->value_type.size, block_size); + + n = dm_block_data(b); + memset(n, 0, block_size); + n->header.flags = cpu_to_le32(LEAF_NODE); + n->header.nr_entries = cpu_to_le32(0); + n->header.max_entries = cpu_to_le32(max_entries); + n->header.value_size = cpu_to_le32(info->value_type.size); + + *root = dm_block_location(b); + return unlock_block(info, b); +} +EXPORT_SYMBOL_GPL(dm_btree_empty); + +/*----------------------------------------------------------------*/ + +/* + * Deletion uses a recursive algorithm, since we have limited stack space + * we explicitly manage our own stack on the heap. + */ +#define MAX_SPINE_DEPTH 64 +struct frame { + struct dm_block *b; + struct node *n; + unsigned level; + unsigned nr_children; + unsigned current_child; +}; + +struct del_stack { + struct dm_transaction_manager *tm; + int top; + struct frame spine[MAX_SPINE_DEPTH]; +}; + +static int top_frame(struct del_stack *s, struct frame **f) +{ + if (s->top < 0) { + DMERR("btree deletion stack empty"); + return -EINVAL; + } + + *f = s->spine + s->top; + + return 0; +} + +static int unprocessed_frames(struct del_stack *s) +{ + return s->top >= 0; +} + +static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) +{ + int r; + uint32_t ref_count; + + if (s->top >= MAX_SPINE_DEPTH - 1) { + DMERR("btree deletion stack out of memory"); + return -ENOMEM; + } + + r = dm_tm_ref(s->tm, b, &ref_count); + if (r) + return r; + + if (ref_count > 1) + /* + * This is a shared node, so we can just decrement it's + * reference counter and leave the children. 
+ */ + dm_tm_dec(s->tm, b); + + else { + struct frame *f = s->spine + ++s->top; + + r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); + if (r) { + s->top--; + return r; + } + + f->n = dm_block_data(f->b); + f->level = level; + f->nr_children = le32_to_cpu(f->n->header.nr_entries); + f->current_child = 0; + } + + return 0; +} + +static void pop_frame(struct del_stack *s) +{ + struct frame *f = s->spine + s->top--; + + dm_tm_dec(s->tm, dm_block_location(f->b)); + dm_tm_unlock(s->tm, f->b); +} + +int dm_btree_del(struct dm_btree_info *info, dm_block_t root) +{ + int r; + struct del_stack *s; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + s->tm = info->tm; + s->top = -1; + + r = push_frame(s, root, 1); + if (r) + goto out; + + while (unprocessed_frames(s)) { + uint32_t flags; + struct frame *f; + dm_block_t b; + + r = top_frame(s, &f); + if (r) + goto out; + + if (f->current_child >= f->nr_children) { + pop_frame(s); + continue; + } + + flags = le32_to_cpu(f->n->header.flags); + if (flags & INTERNAL_NODE) { + b = value64(f->n, f->current_child); + f->current_child++; + r = push_frame(s, b, f->level); + if (r) + goto out; + + } else if (f->level != (info->levels - 1)) { + b = value64(f->n, f->current_child); + f->current_child++; + r = push_frame(s, b, f->level + 1); + if (r) + goto out; + + } else { + if (info->value_type.dec) { + unsigned i; + + for (i = 0; i < f->nr_children; i++) + info->value_type.dec(info->value_type.context, + value_ptr(f->n, i)); + } + f->current_child = f->nr_children; + } + } + +out: + kfree(s); + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_del); + +/*----------------------------------------------------------------*/ + +static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, + int (*search_fn)(struct node *, uint64_t), + uint64_t *result_key, void *v, size_t value_size) +{ + int i, r; + uint32_t flags, nr_entries; + + do { + r = ro_step(s, block); + if (r < 0) + return r; + + i = search_fn(ro_node(s), key); + + flags = le32_to_cpu(ro_node(s)->header.flags); + nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries); + if (i < 0 || i >= nr_entries) + return -ENODATA; + + if (flags & INTERNAL_NODE) + block = value64(ro_node(s), i); + + } while (!(flags & LEAF_NODE)); + + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + memcpy(v, value_ptr(ro_node(s), i), value_size); + + return 0; +} + +int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value_le) +{ + unsigned level, last_level = info->levels - 1; + int r = -ENODATA; + uint64_t rkey; + __le64 internal_value_le; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + size_t size; + void *value_p; + + if (level == last_level) { + value_p = value_le; + size = info->value_type.size; + + } else { + value_p = &internal_value_le; + size = sizeof(uint64_t); + } + + r = btree_lookup_raw(&spine, root, keys[level], + lower_bound, &rkey, + value_p, size); + + if (!r) { + if (rkey != keys[level]) { + exit_ro_spine(&spine); + return -ENODATA; + } + } else { + exit_ro_spine(&spine); + return r; + } + + root = le64_to_cpu(internal_value_le); + } + exit_ro_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_lookup); + +/* + * Splits a node by creating a sibling node and shifting half the nodes + * contents across. Assumes there is a parent node, and it has room for + * another child. 
+ * + * Before: + * +--------+ + * | Parent | + * +--------+ + * | + * v + * +----------+ + * | A ++++++ | + * +----------+ + * + * + * After: + * +--------+ + * | Parent | + * +--------+ + * | | + * v +------+ + * +---------+ | + * | A* +++ | v + * +---------+ +-------+ + * | B +++ | + * +-------+ + * + * Where A* is a shadow of A. + */ +static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, + unsigned parent_index, uint64_t key) +{ + int r; + size_t size; + unsigned nr_left, nr_right; + struct dm_block *left, *right, *parent; + struct node *ln, *rn, *pn; + __le64 location; + + left = shadow_current(s); + + r = new_block(s->info, &right); + if (r < 0) + return r; + + ln = dm_block_data(left); + rn = dm_block_data(right); + + nr_left = le32_to_cpu(ln->header.nr_entries) / 2; + nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left; + + ln->header.nr_entries = cpu_to_le32(nr_left); + + rn->header.flags = ln->header.flags; + rn->header.nr_entries = cpu_to_le32(nr_right); + rn->header.max_entries = ln->header.max_entries; + rn->header.value_size = ln->header.value_size; + memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0])); + + size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? + sizeof(uint64_t) : s->info->value_type.size; + memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), + size * nr_right); + + /* + * Patch up the parent + */ + parent = shadow_parent(s); + + pn = dm_block_data(parent); + location = cpu_to_le64(dm_block_location(left)); + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(pn, parent_index), + &location, sizeof(__le64)); + + location = cpu_to_le64(dm_block_location(right)); + __dm_bless_for_disk(&location); + + r = insert_at(sizeof(__le64), pn, parent_index + 1, + le64_to_cpu(rn->keys[0]), &location); + if (r) + return r; + + if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, right); + s->nodes[1] = left; + } else { + unlock_block(s->info, left); + s->nodes[1] = right; + } + + return 0; +} + +/* + * Splits a node by creating two new children beneath the given node. + * + * Before: + * +----------+ + * | A ++++++ | + * +----------+ + * + * + * After: + * +------------+ + * | A (shadow) | + * +------------+ + * | | + * +------+ +----+ + * | | + * v v + * +-------+ +-------+ + * | B +++ | | C +++ | + * +-------+ +-------+ + */ +static int btree_split_beneath(struct shadow_spine *s, uint64_t key) +{ + int r; + size_t size; + unsigned nr_left, nr_right; + struct dm_block *left, *right, *new_parent; + struct node *pn, *ln, *rn; + __le64 val; + + new_parent = shadow_current(s); + + r = new_block(s->info, &left); + if (r < 0) + return r; + + r = new_block(s->info, &right); + if (r < 0) { + /* FIXME: put left */ + return r; + } + + pn = dm_block_data(new_parent); + ln = dm_block_data(left); + rn = dm_block_data(right); + + nr_left = le32_to_cpu(pn->header.nr_entries) / 2; + nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left; + + ln->header.flags = pn->header.flags; + ln->header.nr_entries = cpu_to_le32(nr_left); + ln->header.max_entries = pn->header.max_entries; + ln->header.value_size = pn->header.value_size; + + rn->header.flags = pn->header.flags; + rn->header.nr_entries = cpu_to_le32(nr_right); + rn->header.max_entries = pn->header.max_entries; + rn->header.value_size = pn->header.value_size; + + memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); + memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0])); + + size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 
+ sizeof(__le64) : s->info->value_type.size; + memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); + memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), + nr_right * size); + + /* new_parent should just point to l and r now */ + pn->header.flags = cpu_to_le32(INTERNAL_NODE); + pn->header.nr_entries = cpu_to_le32(2); + pn->header.max_entries = cpu_to_le32( + calc_max_entries(sizeof(__le64), + dm_bm_block_size( + dm_tm_get_bm(s->info->tm)))); + pn->header.value_size = cpu_to_le32(sizeof(__le64)); + + val = cpu_to_le64(dm_block_location(left)); + __dm_bless_for_disk(&val); + pn->keys[0] = ln->keys[0]; + memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); + + val = cpu_to_le64(dm_block_location(right)); + __dm_bless_for_disk(&val); + pn->keys[1] = rn->keys[0]; + memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); + + /* + * rejig the spine. This is ugly, since it knows too + * much about the spine + */ + if (s->nodes[0] != new_parent) { + unlock_block(s->info, s->nodes[0]); + s->nodes[0] = new_parent; + } + if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, right); + s->nodes[1] = left; + } else { + unlock_block(s->info, left); + s->nodes[1] = right; + } + s->count = 2; + + return 0; +} + +static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, + struct dm_btree_value_type *vt, + uint64_t key, unsigned *index) +{ + int r, i = *index, top = 1; + struct node *node; + + for (;;) { + r = shadow_step(s, root, vt); + if (r < 0) + return r; + + node = dm_block_data(shadow_current(s)); + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */ + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + node = dm_block_data(shadow_current(s)); + + if (node->header.nr_entries == node->header.max_entries) { + if (top) + r = btree_split_beneath(s, key); + else + r = btree_split_sibling(s, root, i, key); + + if (r < 0) + return r; + } + + node = dm_block_data(shadow_current(s)); + + i = lower_bound(node, key); + + if (le32_to_cpu(node->header.flags) & LEAF_NODE) + break; + + if (i < 0) { + /* change the bounds on the lowest key */ + node->keys[0] = cpu_to_le64(key); + i = 0; + } + + root = value64(node, i); + top = 0; + } + + if (i < 0 || le64_to_cpu(node->keys[i]) != key) + i++; + + *index = i; + return 0; +} + +static int insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value) +{ + int r, need_insert; + unsigned level, index = -1, last_level = info->levels - 1; + dm_block_t block = root; + struct shadow_spine spine; + struct node *n; + struct dm_btree_value_type le64_type; + + le64_type.context = NULL; + le64_type.size = sizeof(__le64); + le64_type.inc = NULL; + le64_type.dec = NULL; + le64_type.equal = NULL; + + init_shadow_spine(&spine, info); + + for (level = 0; level < (info->levels - 1); level++) { + r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index); + if (r < 0) + goto bad; + + n = dm_block_data(shadow_current(&spine)); + need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) || + (le64_to_cpu(n->keys[index]) != keys[level])); + + if (need_insert) { + dm_block_t new_tree; + __le64 new_le; + + r = dm_btree_empty(info, &new_tree); + if (r < 0) + goto bad; + 
+ new_le = cpu_to_le64(new_tree); + __dm_bless_for_disk(&new_le); + + r = insert_at(sizeof(uint64_t), n, index, + keys[level], &new_le); + if (r) + goto bad; + } + + if (level < last_level) + block = value64(n, index); + } + + r = btree_insert_raw(&spine, block, &info->value_type, + keys[level], &index); + if (r < 0) + goto bad; + + n = dm_block_data(shadow_current(&spine)); + need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) || + (le64_to_cpu(n->keys[index]) != keys[level])); + + if (need_insert) { + if (inserted) + *inserted = 1; + + r = insert_at(info->value_type.size, n, index, + keys[level], value); + if (r) + goto bad_unblessed; + } else { + if (inserted) + *inserted = 0; + + if (info->value_type.dec && + (!info->value_type.equal || + !info->value_type.equal( + info->value_type.context, + value_ptr(n, index), + value))) { + info->value_type.dec(info->value_type.context, + value_ptr(n, index)); + } + memcpy_disk(value_ptr(n, index), + value, info->value_type.size); + } + + *new_root = shadow_root(&spine); + exit_shadow_spine(&spine); + + return 0; + +bad: + __dm_unbless_for_disk(value); +bad_unblessed: + exit_shadow_spine(&spine); + return r; +} + +int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + return insert(info, root, keys, value, new_root, NULL); +} +EXPORT_SYMBOL_GPL(dm_btree_insert); + +int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value) +{ + return insert(info, root, keys, value, new_root, inserted); +} +EXPORT_SYMBOL_GPL(dm_btree_insert_notify); + +/*----------------------------------------------------------------*/ + +static int find_highest_key(struct ro_spine *s, dm_block_t block, + uint64_t *result_key, dm_block_t *next_block) +{ + int i, r; + uint32_t flags; + + do { + r = ro_step(s, block); + if (r < 0) + return r; + + flags = le32_to_cpu(ro_node(s)->header.flags); + i = le32_to_cpu(ro_node(s)->header.nr_entries); + if (!i) + return -ENODATA; + else + i--; + + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + if (next_block || flags & INTERNAL_NODE) + block = value64(ro_node(s), i); + + } while (flags & INTERNAL_NODE); + + if (next_block) + *next_block = block; + return 0; +} + +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + int r = 0, count = 0, level; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + r = find_highest_key(&spine, root, result_keys + level, + level == info->levels - 1 ? NULL : &root); + if (r == -ENODATA) { + r = 0; + break; + + } else if (r) + break; + + count++; + } + exit_ro_spine(&spine); + + return r ? r : count; +} +EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h new file mode 100644 index 00000000..ae02c844 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree.h @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_BTREE_H +#define _LINUX_DM_BTREE_H + +#include "dm-block-manager.h" + +struct dm_transaction_manager; + +/*----------------------------------------------------------------*/ + +/* + * Annotations used to check on-disk metadata is handled as little-endian. 
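+ * With sparse (__CHECKER__) they expand to acquire/release context
+ * tracking; in normal builds they compile away to nothing.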
+ */ +#ifdef __CHECKER__ +# define __dm_written_to_disk(x) __releases(x) +# define __dm_reads_from_disk(x) __acquires(x) +# define __dm_bless_for_disk(x) __acquire(x) +# define __dm_unbless_for_disk(x) __release(x) +#else +# define __dm_written_to_disk(x) +# define __dm_reads_from_disk(x) +# define __dm_bless_for_disk(x) +# define __dm_unbless_for_disk(x) +#endif + +/*----------------------------------------------------------------*/ + +/* + * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized + * values. + */ + +/* + * Infomation about the values stored within the btree. + */ +struct dm_btree_value_type { + void *context; + + /* + * The size in bytes of each value. + */ + uint32_t size; + + /* + * Any of these methods can be safely set to NULL if you do not + * need the corresponding feature. + */ + + /* + * The btree is making a duplicate of the value, for instance + * because previously-shared btree nodes have now diverged. + * @value argument is the new copy that the copy function may modify. + * (Probably it just wants to increment a reference count + * somewhere.) This method is _not_ called for insertion of a new + * value: It is assumed the ref count is already 1. + */ + void (*inc)(void *context, void *value); + + /* + * This value is being deleted. The btree takes care of freeing + * the memory pointed to by @value. Often the del function just + * needs to decrement a reference count somewhere. + */ + void (*dec)(void *context, void *value); + + /* + * A test for equality between two values. When a value is + * overwritten with a new one, the old one has the dec method + * called _unless_ the new and old value are deemed equal. + */ + int (*equal)(void *context, void *value1, void *value2); +}; + +/* + * The shape and contents of a btree. + */ +struct dm_btree_info { + struct dm_transaction_manager *tm; + + /* + * Number of nested btrees. (Not the depth of a single tree.) + */ + unsigned levels; + struct dm_btree_value_type value_type; +}; + +/* + * Set up an empty tree. O(1). + */ +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root); + +/* + * Delete a tree. O(n) - this is the slow one! It can also block, so + * please don't call it on an IO path. + */ +int dm_btree_del(struct dm_btree_info *info, dm_block_t root); + +/* + * All the lookup functions return -ENODATA if the key cannot be found. + */ + +/* + * Tries to find a key that matches exactly. O(ln(n)) + */ +int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value_le); + +/* + * Insertion (or overwrite an existing value). O(ln(n)) + */ +int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * A variant of insert that indicates whether it actually inserted or just + * overwrote. Useful if you're keeping track of the number of entries in a + * tree. + */ +int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value); + +/* + * Remove a key if present. This doesn't remove empty sub trees. Normally + * subtrees represent a separate entity, like a snapshot map, so this is + * correct behaviour. O(ln(n)). + */ +int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, dm_block_t *new_root); + +/* + * Returns < 0 on failure. Otherwise the number of key entries that have + * been filled out. 
Remember trees can have zero entries, and as such have + * no highest key. + */ +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys); + +#endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/persistent-data/dm-persistent-data-internal.h b/drivers/md/persistent-data/dm-persistent-data-internal.h new file mode 100644 index 00000000..c49e26ff --- /dev/null +++ b/drivers/md/persistent-data/dm-persistent-data-internal.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _DM_PERSISTENT_DATA_INTERNAL_H +#define _DM_PERSISTENT_DATA_INTERNAL_H + +#include "dm-block-manager.h" + +static inline unsigned dm_hash_block(dm_block_t b, unsigned hash_mask) +{ + const unsigned BIG_PRIME = 4294967291UL; + + return (((unsigned) b) * BIG_PRIME) & hash_mask; +} + +#endif /* _PERSISTENT_DATA_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c new file mode 100644 index 00000000..fc90c116 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-checker.c @@ -0,0 +1,446 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map-checker.h" + +#include <linux/device-mapper.h> +#include <linux/export.h> +#include <linux/vmalloc.h> + +#ifdef CONFIG_DM_DEBUG_SPACE_MAPS + +#define DM_MSG_PREFIX "space map checker" + +/*----------------------------------------------------------------*/ + +struct count_array { + dm_block_t nr; + dm_block_t nr_free; + + uint32_t *counts; +}; + +static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count) +{ + if (b >= ca->nr) + return -EINVAL; + + *count = ca->counts[b]; + return 0; +} + +static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r) +{ + if (b >= ca->nr) + return -EINVAL; + + *r = ca->counts[b] > 1; + return 0; +} + +static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count) +{ + uint32_t old_count; + + if (b >= ca->nr) + return -EINVAL; + + old_count = ca->counts[b]; + + if (!count && old_count) + ca->nr_free++; + + else if (count && !old_count) + ca->nr_free--; + + ca->counts[b] = count; + return 0; +} + +static int ca_inc_block(struct count_array *ca, dm_block_t b) +{ + if (b >= ca->nr) + return -EINVAL; + + ca_set_count(ca, b, ca->counts[b] + 1); + return 0; +} + +static int ca_dec_block(struct count_array *ca, dm_block_t b) +{ + if (b >= ca->nr) + return -EINVAL; + + BUG_ON(ca->counts[b] == 0); + ca_set_count(ca, b, ca->counts[b] - 1); + return 0; +} + +static int ca_create(struct count_array *ca, struct dm_space_map *sm) +{ + int r; + dm_block_t nr_blocks; + + r = dm_sm_get_nr_blocks(sm, &nr_blocks); + if (r) + return r; + + ca->nr = nr_blocks; + ca->nr_free = nr_blocks; + + if (!nr_blocks) + ca->counts = NULL; + else { + ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks); + if (!ca->counts) + return -ENOMEM; + } + + return 0; +} + +static void ca_destroy(struct count_array *ca) +{ + vfree(ca->counts); +} + +static int ca_load(struct count_array *ca, struct dm_space_map *sm) +{ + int r; + uint32_t count; + dm_block_t nr_blocks, i; + + r = dm_sm_get_nr_blocks(sm, &nr_blocks); + if (r) + return r; + + BUG_ON(ca->nr != nr_blocks); + + DMWARN("Loading debug space map from disk. 
This may take some time"); + for (i = 0; i < nr_blocks; i++) { + r = dm_sm_get_count(sm, i, &count); + if (r) { + DMERR("load failed"); + return r; + } + + ca_set_count(ca, i, count); + } + DMWARN("Load complete"); + + return 0; +} + +static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) +{ + dm_block_t nr_blocks = ca->nr + extra_blocks; + uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks); + if (!counts) + return -ENOMEM; + + if (ca->counts) { + memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); + ca_destroy(ca); + } + ca->nr = nr_blocks; + ca->nr_free += extra_blocks; + ca->counts = counts; + return 0; +} + +static int ca_commit(struct count_array *old, struct count_array *new) +{ + if (old->nr != new->nr) { + BUG_ON(old->nr > new->nr); + ca_extend(old, new->nr - old->nr); + } + + BUG_ON(old->nr != new->nr); + old->nr_free = new->nr_free; + memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr); + return 0; +} + +/*----------------------------------------------------------------*/ + +struct sm_checker { + struct dm_space_map sm; + + struct count_array old_counts; + struct count_array counts; + + struct dm_space_map *real_sm; +}; + +static void sm_checker_destroy(struct dm_space_map *sm) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + + dm_sm_destroy(smc->real_sm); + ca_destroy(&smc->old_counts); + ca_destroy(&smc->counts); + kfree(smc); +} + +static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_get_nr_blocks(smc->real_sm, count); + if (!r) + BUG_ON(smc->old_counts.nr != *count); + return r; +} + +static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_get_nr_free(smc->real_sm, count); + if (!r) { + /* + * Slow, but we know it's correct. 
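+		 * Every block is scanned, and a block only counts as free if
+		 * it is free in both the committed and the current count arrays.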
+ */ + dm_block_t b, n = 0; + for (b = 0; b < smc->old_counts.nr; b++) + if (smc->old_counts.counts[b] == 0 && + smc->counts.counts[b] == 0) + n++; + + if (n != *count) + DMERR("free block counts differ, checker %u, sm-disk:%u", + (unsigned) n, (unsigned) *count); + } + return r; +} + +static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_new_block(smc->real_sm, b); + + if (!r) { + BUG_ON(*b >= smc->old_counts.nr); + BUG_ON(smc->old_counts.counts[*b] != 0); + BUG_ON(*b >= smc->counts.nr); + BUG_ON(smc->counts.counts[*b] != 0); + ca_set_count(&smc->counts, *b, 1); + } + + return r; +} + +static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_inc_block(smc->real_sm, b); + int r2 = ca_inc_block(&smc->counts, b); + BUG_ON(r != r2); + return r; +} + +static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_dec_block(smc->real_sm, b); + int r2 = ca_dec_block(&smc->counts, b); + BUG_ON(r != r2); + return r; +} + +static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + uint32_t result2 = 0; + int r = dm_sm_get_count(smc->real_sm, b, result); + int r2 = ca_get_count(&smc->counts, b, &result2); + + BUG_ON(r != r2); + if (!r) + BUG_ON(*result != result2); + return r; +} + +static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int result2 = 0; + int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result); + int r2 = ca_count_more_than_one(&smc->counts, b, &result2); + + BUG_ON(r != r2); + if (!r) + BUG_ON(!(*result) && result2); + return r; +} + +static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + uint32_t old_rc; + int r = dm_sm_set_count(smc->real_sm, b, count); + int r2; + + BUG_ON(b >= smc->counts.nr); + old_rc = smc->counts.counts[b]; + r2 = ca_set_count(&smc->counts, b, count); + BUG_ON(r != r2); + + return r; +} + +static int sm_checker_commit(struct dm_space_map *sm) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r; + + r = dm_sm_commit(smc->real_sm); + if (r) + return r; + + r = ca_commit(&smc->old_counts, &smc->counts); + if (r) + return r; + + return 0; +} + +static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_extend(smc->real_sm, extra_blocks); + if (r) + return r; + + return ca_extend(&smc->counts, extra_blocks); +} + +static int sm_checker_root_size(struct dm_space_map *sm, size_t *result) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + return dm_sm_root_size(smc->real_sm, result); +} + +static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len); +} + +/*----------------------------------------------------------------*/ + +static struct dm_space_map ops_ = { + .destroy = sm_checker_destroy, + .get_nr_blocks = sm_checker_get_nr_blocks, 
+ .get_nr_free = sm_checker_get_nr_free, + .inc_block = sm_checker_inc_block, + .dec_block = sm_checker_dec_block, + .new_block = sm_checker_new_block, + .get_count = sm_checker_get_count, + .count_is_more_than_one = sm_checker_count_more_than_one, + .set_count = sm_checker_set_count, + .commit = sm_checker_commit, + .extend = sm_checker_extend, + .root_size = sm_checker_root_size, + .copy_root = sm_checker_copy_root +}; + +struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) +{ + int r; + struct sm_checker *smc; + + if (IS_ERR_OR_NULL(sm)) + return ERR_PTR(-EINVAL); + + smc = kmalloc(sizeof(*smc), GFP_KERNEL); + if (!smc) + return ERR_PTR(-ENOMEM); + + memcpy(&smc->sm, &ops_, sizeof(smc->sm)); + r = ca_create(&smc->old_counts, sm); + if (r) { + kfree(smc); + return ERR_PTR(r); + } + + r = ca_create(&smc->counts, sm); + if (r) { + ca_destroy(&smc->old_counts); + kfree(smc); + return ERR_PTR(r); + } + + smc->real_sm = sm; + + r = ca_load(&smc->counts, sm); + if (r) { + ca_destroy(&smc->counts); + ca_destroy(&smc->old_counts); + kfree(smc); + return ERR_PTR(r); + } + + r = ca_commit(&smc->old_counts, &smc->counts); + if (r) { + ca_destroy(&smc->counts); + ca_destroy(&smc->old_counts); + kfree(smc); + return ERR_PTR(r); + } + + return &smc->sm; +} +EXPORT_SYMBOL_GPL(dm_sm_checker_create); + +struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) +{ + int r; + struct sm_checker *smc; + + if (IS_ERR_OR_NULL(sm)) + return ERR_PTR(-EINVAL); + + smc = kmalloc(sizeof(*smc), GFP_KERNEL); + if (!smc) + return ERR_PTR(-ENOMEM); + + memcpy(&smc->sm, &ops_, sizeof(smc->sm)); + r = ca_create(&smc->old_counts, sm); + if (r) { + kfree(smc); + return ERR_PTR(r); + } + + r = ca_create(&smc->counts, sm); + if (r) { + ca_destroy(&smc->old_counts); + kfree(smc); + return ERR_PTR(r); + } + + smc->real_sm = sm; + return &smc->sm; +} +EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); + +/*----------------------------------------------------------------*/ + +#else + +struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) +{ + return sm; +} +EXPORT_SYMBOL_GPL(dm_sm_checker_create); + +struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) +{ + return sm; +} +EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h new file mode 100644 index 00000000..444dccf6 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-checker.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H +#define SNAPSHOTS_SPACE_MAP_CHECKER_H + +#include "dm-space-map.h" + +/*----------------------------------------------------------------*/ + +/* + * This space map wraps a real on-disk space map, and verifies all of its + * operations. It uses a lot of memory, so only use if you have a specific + * problem that you're debugging. + * + * Ownership of @sm passes. 
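+ * i.e. the returned map destroys @sm when it is itself destroyed.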
+ */ +struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm); +struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c new file mode 100644 index 00000000..ff3beed6 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -0,0 +1,702 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map-common.h" +#include "dm-transaction-manager.h" + +#include <linux/bitops.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "space map common" + +/*----------------------------------------------------------------*/ + +/* + * Index validator. + */ +#define INDEX_CSUM_XOR 160478 + +static void index_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_metadata_index *mi_le = dm_block_data(b); + + mi_le->blocknr = cpu_to_le64(dm_block_location(b)); + mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding, + block_size - sizeof(__le32), + INDEX_CSUM_XOR)); +} + +static int index_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_metadata_index *mi_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) { + DMERR("index_check failed blocknr %llu wanted %llu", + le64_to_cpu(mi_le->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding, + block_size - sizeof(__le32), + INDEX_CSUM_XOR)); + if (csum_disk != mi_le->csum) { + DMERR("index_check failed csum %u wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator index_validator = { + .name = "index", + .prepare_for_write = index_prepare_for_write, + .check = index_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Bitmap validator + */ +#define BITMAP_CSUM_XOR 240779 + +static void bitmap_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_bitmap_header *disk_header = dm_block_data(b); + + disk_header->blocknr = cpu_to_le64(dm_block_location(b)); + disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, + block_size - sizeof(__le32), + BITMAP_CSUM_XOR)); +} + +static int bitmap_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_bitmap_header *disk_header = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) { + DMERR("bitmap check failed blocknr %llu wanted %llu", + le64_to_cpu(disk_header->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, + block_size - sizeof(__le32), + BITMAP_CSUM_XOR)); + if (csum_disk != disk_header->csum) { + DMERR("bitmap check failed csum %u wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator dm_sm_bitmap_validator = { + .name = "sm_bitmap", + .prepare_for_write = bitmap_prepare_for_write, + .check = bitmap_check +}; + +/*----------------------------------------------------------------*/ + +#define ENTRIES_PER_WORD 32 +#define ENTRIES_SHIFT 5 + +static void 
*dm_bitmap_data(struct dm_block *b) +{ + return dm_block_data(b) + sizeof(struct disk_bitmap_header); +} + +#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL + +static unsigned bitmap_word_used(void *addr, unsigned b) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + uint64_t bits = le64_to_cpu(*w_le); + uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH; + + return !(~bits & mask); +} + +static unsigned sm_lookup_bitmap(void *addr, unsigned b) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + unsigned hi, lo; + + b = (b & (ENTRIES_PER_WORD - 1)) << 1; + hi = !!test_bit_le(b, (void *) w_le); + lo = !!test_bit_le(b + 1, (void *) w_le); + return (hi << 1) | lo; +} + +static void sm_set_bitmap(void *addr, unsigned b, unsigned val) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + b = (b & (ENTRIES_PER_WORD - 1)) << 1; + + if (val & 2) + __set_bit_le(b, (void *) w_le); + else + __clear_bit_le(b, (void *) w_le); + + if (val & 1) + __set_bit_le(b + 1, (void *) w_le); + else + __clear_bit_le(b + 1, (void *) w_le); +} + +static int sm_find_free(void *addr, unsigned begin, unsigned end, + unsigned *result) +{ + while (begin < end) { + if (!(begin & (ENTRIES_PER_WORD - 1)) && + bitmap_word_used(addr, begin)) { + begin += ENTRIES_PER_WORD; + continue; + } + + if (!sm_lookup_bitmap(addr, begin)) { + *result = begin; + return 0; + } + + begin++; + } + + return -ENOSPC; +} + +/*----------------------------------------------------------------*/ + +static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + ll->tm = tm; + + ll->bitmap_info.tm = tm; + ll->bitmap_info.levels = 1; + + /* + * Because the new bitmap blocks are created via a shadow + * operation, the old entry has already had its reference count + * decremented and we don't need the btree to do any bookkeeping. 
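+	 * That is why inc/dec/equal are left as NULL below.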
+ */ + ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry); + ll->bitmap_info.value_type.inc = NULL; + ll->bitmap_info.value_type.dec = NULL; + ll->bitmap_info.value_type.equal = NULL; + + ll->ref_count_info.tm = tm; + ll->ref_count_info.levels = 1; + ll->ref_count_info.value_type.size = sizeof(uint32_t); + ll->ref_count_info.value_type.inc = NULL; + ll->ref_count_info.value_type.dec = NULL; + ll->ref_count_info.value_type.equal = NULL; + + ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm)); + + if (ll->block_size > (1 << 30)) { + DMERR("block size too big to hold bitmaps"); + return -EINVAL; + } + + ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) * + ENTRIES_PER_BYTE; + ll->nr_blocks = 0; + ll->bitmap_root = 0; + ll->ref_count_root = 0; + + return 0; +} + +int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) +{ + int r; + dm_block_t i, nr_blocks, nr_indexes; + unsigned old_blocks, blocks; + + nr_blocks = ll->nr_blocks + extra_blocks; + old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block); + blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block); + + nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block); + if (nr_indexes > ll->max_entries(ll)) { + DMERR("space map too large"); + return -EINVAL; + } + + for (i = old_blocks; i < blocks; i++) { + struct dm_block *b; + struct disk_index_entry idx; + + r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); + if (r < 0) + return r; + idx.blocknr = cpu_to_le64(dm_block_location(b)); + + r = dm_tm_unlock(ll->tm, b); + if (r < 0) + return r; + + idx.nr_free = cpu_to_le32(ll->entries_per_block); + idx.none_free_before = 0; + + r = ll->save_ie(ll, i, &idx); + if (r < 0) + return r; + } + + ll->nr_blocks = nr_blocks; + return 0; +} + +int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + int r; + dm_block_t index = b; + struct disk_index_entry ie_disk; + struct dm_block *blk; + + b = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ie_disk); + if (r < 0) + return r; + + r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &blk); + if (r < 0) + return r; + + *result = sm_lookup_bitmap(dm_bitmap_data(blk), b); + + return dm_tm_unlock(ll->tm, blk); +} + +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + __le32 le_rc; + int r = sm_ll_lookup_bitmap(ll, b, result); + + if (r) + return r; + + if (*result != 3) + return r; + + r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); + if (r < 0) + return r; + + *result = le32_to_cpu(le_rc); + + return r; +} + +int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, + dm_block_t end, dm_block_t *result) +{ + int r; + struct disk_index_entry ie_disk; + dm_block_t i, index_begin = begin; + dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block); + + /* + * FIXME: Use shifts + */ + begin = do_div(index_begin, ll->entries_per_block); + end = do_div(end, ll->entries_per_block); + + for (i = index_begin; i < index_end; i++, begin = 0) { + struct dm_block *blk; + unsigned position; + uint32_t bit_end; + + r = ll->load_ie(ll, i, &ie_disk); + if (r < 0) + return r; + + if (le32_to_cpu(ie_disk.nr_free) == 0) + continue; + + r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &blk); + if (r < 0) + return r; + + bit_end = (i == index_end - 1) ? 
end : ll->entries_per_block; + + r = sm_find_free(dm_bitmap_data(blk), + max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)), + bit_end, &position); + if (r == -ENOSPC) { + /* + * This might happen because we started searching + * part way through the bitmap. + */ + dm_tm_unlock(ll->tm, blk); + continue; + + } else if (r < 0) { + dm_tm_unlock(ll->tm, blk); + return r; + } + + r = dm_tm_unlock(ll->tm, blk); + if (r < 0) + return r; + + *result = i * ll->entries_per_block + (dm_block_t) position; + return 0; + } + + return -ENOSPC; +} + +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, enum allocation_event *ev) +{ + int r; + uint32_t bit, old; + struct dm_block *nb; + dm_block_t index = b; + struct disk_index_entry ie_disk; + void *bm_le; + int inc; + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ie_disk); + if (r < 0) + return r; + + r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &nb, &inc); + if (r < 0) { + DMERR("dm_tm_shadow_block() failed"); + return r; + } + ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); + + bm_le = dm_bitmap_data(nb); + old = sm_lookup_bitmap(bm_le, bit); + + if (ref_count <= 2) { + sm_set_bitmap(bm_le, bit, ref_count); + + r = dm_tm_unlock(ll->tm, nb); + if (r < 0) + return r; + + if (old > 2) { + r = dm_btree_remove(&ll->ref_count_info, + ll->ref_count_root, + &b, &ll->ref_count_root); + if (r) + return r; + } + + } else { + __le32 le_rc = cpu_to_le32(ref_count); + + sm_set_bitmap(bm_le, bit, 3); + r = dm_tm_unlock(ll->tm, nb); + if (r < 0) + return r; + + __dm_bless_for_disk(&le_rc); + r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, + &b, &le_rc, &ll->ref_count_root); + if (r < 0) { + DMERR("ref count insert failed"); + return r; + } + } + + if (ref_count && !old) { + *ev = SM_ALLOC; + ll->nr_allocated++; + ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1); + if (le32_to_cpu(ie_disk.none_free_before) == bit) + ie_disk.none_free_before = cpu_to_le32(bit + 1); + + } else if (old && !ref_count) { + *ev = SM_FREE; + ll->nr_allocated--; + ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1); + ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); + } + + return ll->save_ie(ll, index, &ie_disk); +} + +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + int r; + uint32_t rc; + + r = sm_ll_lookup(ll, b, &rc); + if (r) + return r; + + return sm_ll_insert(ll, b, rc + 1, ev); +} + +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + int r; + uint32_t rc; + + r = sm_ll_lookup(ll, b, &rc); + if (r) + return r; + + if (!rc) + return -EINVAL; + + return sm_ll_insert(ll, b, rc - 1, ev); +} + +int sm_ll_commit(struct ll_disk *ll) +{ + return ll->commit(ll); +} + +/*----------------------------------------------------------------*/ + +static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + memcpy(ie, ll->mi_le.index + index, sizeof(*ie)); + return 0; +} + +static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); + return 0; +} + +static int metadata_ll_init_index(struct ll_disk *ll) +{ + int r; + struct dm_block *b; + + r = dm_tm_new_block(ll->tm, &index_validator, &b); + if (r < 0) + return r; + + memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); + ll->bitmap_root = dm_block_location(b); + + 
return dm_tm_unlock(ll->tm, b); +} + +static int metadata_ll_open(struct ll_disk *ll) +{ + int r; + struct dm_block *block; + + r = dm_tm_read_lock(ll->tm, ll->bitmap_root, + &index_validator, &block); + if (r) + return r; + + memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le)); + return dm_tm_unlock(ll->tm, block); +} + +static dm_block_t metadata_ll_max_entries(struct ll_disk *ll) +{ + return MAX_METADATA_BITMAPS; +} + +static int metadata_ll_commit(struct ll_disk *ll) +{ + int r, inc; + struct dm_block *b; + + r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc); + if (r) + return r; + + memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); + ll->bitmap_root = dm_block_location(b); + + return dm_tm_unlock(ll->tm, b); +} + +int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + int r; + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = metadata_ll_load_ie; + ll->save_ie = metadata_ll_save_ie; + ll->init_index = metadata_ll_init_index; + ll->open_index = metadata_ll_open; + ll->max_entries = metadata_ll_max_entries; + ll->commit = metadata_ll_commit; + + ll->nr_blocks = 0; + ll->nr_allocated = 0; + + r = ll->init_index(ll); + if (r < 0) + return r; + + r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); + if (r < 0) + return r; + + return 0; +} + +int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct disk_sm_root *smr = root_le; + + if (len < sizeof(struct disk_sm_root)) { + DMERR("sm_metadata root too small"); + return -ENOMEM; + } + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = metadata_ll_load_ie; + ll->save_ie = metadata_ll_save_ie; + ll->init_index = metadata_ll_init_index; + ll->open_index = metadata_ll_open; + ll->max_entries = metadata_ll_max_entries; + ll->commit = metadata_ll_commit; + + ll->nr_blocks = le64_to_cpu(smr->nr_blocks); + ll->nr_allocated = le64_to_cpu(smr->nr_allocated); + ll->bitmap_root = le64_to_cpu(smr->bitmap_root); + ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + + return ll->open_index(ll); +} + +/*----------------------------------------------------------------*/ + +static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); +} + +static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + __dm_bless_for_disk(ie); + return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, + &index, ie, &ll->bitmap_root); +} + +static int disk_ll_init_index(struct ll_disk *ll) +{ + return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root); +} + +static int disk_ll_open(struct ll_disk *ll) +{ + /* nothing to do */ + return 0; +} + +static dm_block_t disk_ll_max_entries(struct ll_disk *ll) +{ + return -1ULL; +} + +static int disk_ll_commit(struct ll_disk *ll) +{ + return 0; +} + +int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + int r; + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = disk_ll_load_ie; + ll->save_ie = disk_ll_save_ie; + ll->init_index = disk_ll_init_index; + ll->open_index = disk_ll_open; + ll->max_entries = disk_ll_max_entries; + ll->commit = disk_ll_commit; + + ll->nr_blocks = 0; + ll->nr_allocated = 0; + + r = ll->init_index(ll); + if (r < 0) + return r; + + r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); + if (r < 0) + return r; + + return 0; +} + 
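The bitmap encoding used above packs 2 bits per block: values 0-2 are literal reference counts, while 3 means the real count lives in the separate ref-count btree. Below is a stand-alone, userspace-style sketch of that packing; the names lookup2/set2 and the LSB-first bit layout are illustrative only, and the in-kernel sm_lookup_bitmap()/sm_set_bitmap() additionally handle little-endian bit ordering and the disk_bitmap_header.

#include <stdint.h>
#include <stdio.h>

#define ENTRIES_PER_WORD 32	/* 64-bit word / 2 bits per entry */

/* Read the 2-bit counter for entry b. */
static unsigned lookup2(const uint64_t *words, unsigned b)
{
	unsigned shift = (b % ENTRIES_PER_WORD) * 2;

	return (words[b / ENTRIES_PER_WORD] >> shift) & 3;
}

/* Write a 2-bit counter (0-3) for entry b. */
static void set2(uint64_t *words, unsigned b, unsigned val)
{
	unsigned shift = (b % ENTRIES_PER_WORD) * 2;
	uint64_t *w = words + b / ENTRIES_PER_WORD;

	*w = (*w & ~((uint64_t)3 << shift)) | ((uint64_t)(val & 3) << shift);
}

int main(void)
{
	uint64_t bitmap[2] = { 0, 0 };	/* room for 64 entries */

	set2(bitmap, 5, 2);	/* block 5 is referenced twice */
	set2(bitmap, 40, 3);	/* block 40: real count kept in the ref-count btree */

	printf("entry 5  -> %u\n", lookup2(bitmap, 5));		/* prints 2 */
	printf("entry 40 -> %u\n", lookup2(bitmap, 40));	/* prints 3 */
	return 0;
}

A stored count of 3 is what sends sm_ll_lookup() to the ref-count btree for the real value.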
+int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct disk_sm_root *smr = root_le; + + if (len < sizeof(struct disk_sm_root)) { + DMERR("sm_metadata root too small"); + return -ENOMEM; + } + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = disk_ll_load_ie; + ll->save_ie = disk_ll_save_ie; + ll->init_index = disk_ll_init_index; + ll->open_index = disk_ll_open; + ll->max_entries = disk_ll_max_entries; + ll->commit = disk_ll_commit; + + ll->nr_blocks = le64_to_cpu(smr->nr_blocks); + ll->nr_allocated = le64_to_cpu(smr->nr_allocated); + ll->bitmap_root = le64_to_cpu(smr->bitmap_root); + ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + + return ll->open_index(ll); +} + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h new file mode 100644 index 00000000..8f220821 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_SPACE_MAP_COMMON_H +#define DM_SPACE_MAP_COMMON_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * Low level disk format + * + * Bitmap btree + * ------------ + * + * Each value stored in the btree is an index_entry. This points to a + * block that is used as a bitmap. Within the bitmap hold 2 bits per + * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and + * REF_COUNT = many. + * + * Refcount btree + * -------------- + * + * Any entry that has a ref count higher than 2 gets entered in the ref + * count tree. The leaf values for this tree is the 32-bit ref count. + */ + +struct disk_index_entry { + __le64 blocknr; + __le32 nr_free; + __le32 none_free_before; +} __packed; + + +#define MAX_METADATA_BITMAPS 255 +struct disk_metadata_index { + __le32 csum; + __le32 padding; + __le64 blocknr; + + struct disk_index_entry index[MAX_METADATA_BITMAPS]; +} __packed; + +struct ll_disk; + +typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result); +typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie); +typedef int (*init_index_fn)(struct ll_disk *ll); +typedef int (*open_index_fn)(struct ll_disk *ll); +typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll); +typedef int (*commit_fn)(struct ll_disk *ll); + +struct ll_disk { + struct dm_transaction_manager *tm; + struct dm_btree_info bitmap_info; + struct dm_btree_info ref_count_info; + + uint32_t block_size; + uint32_t entries_per_block; + dm_block_t nr_blocks; + dm_block_t nr_allocated; + + /* + * bitmap_root may be a btree root or a simple index. 
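+	 * It is a btree root for the disk space map and the location of
+	 * the single index block for the metadata space map.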
+ */ + dm_block_t bitmap_root; + + dm_block_t ref_count_root; + + struct disk_metadata_index mi_le; + load_ie_fn load_ie; + save_ie_fn save_ie; + init_index_fn init_index; + open_index_fn open_index; + max_index_entries_fn max_entries; + commit_fn commit; +}; + +struct disk_sm_root { + __le64 nr_blocks; + __le64 nr_allocated; + __le64 bitmap_root; + __le64 ref_count_root; +} __packed; + +#define ENTRIES_PER_BYTE 4 + +struct disk_bitmap_header { + __le32 csum; + __le32 not_used; + __le64 blocknr; +} __packed; + +enum allocation_event { + SM_NONE, + SM_ALLOC, + SM_FREE, +}; + +/*----------------------------------------------------------------*/ + +int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks); +int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); +int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, + dm_block_t end, dm_block_t *result); +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); +int sm_ll_commit(struct ll_disk *ll); + +int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm); +int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len); + +int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm); +int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len); + +/*----------------------------------------------------------------*/ + +#endif /* DM_SPACE_MAP_COMMON_H */ diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c new file mode 100644 index 00000000..3d0ed533 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map-checker.h" +#include "dm-space-map-common.h" +#include "dm-space-map-disk.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "space map disk" + +/*----------------------------------------------------------------*/ + +/* + * Space map interface. 
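+ * sm_disk keeps the committed view in old_ll so that new allocations
+ * never reuse a block freed within the current transaction.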
+ */ +struct sm_disk { + struct dm_space_map sm; + + struct ll_disk ll; + struct ll_disk old_ll; + + dm_block_t begin; + dm_block_t nr_allocated_this_transaction; +}; + +static void sm_disk_destroy(struct dm_space_map *sm) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + kfree(smd); +} + +static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + return sm_ll_extend(&smd->ll, extra_blocks); +} + +static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + *count = smd->old_ll.nr_blocks; + + return 0; +} + +static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction; + + return 0; +} + +static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + return sm_ll_lookup(&smd->ll, b, result); +} + +static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b, + int *result) +{ + int r; + uint32_t count; + + r = sm_disk_get_count(sm, b, &count); + if (r) + return r; + + return count > 1; +} + +static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + int r; + uint32_t old_count; + enum allocation_event ev; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_insert(&smd->ll, b, count, &ev); + if (!r) { + switch (ev) { + case SM_NONE: + break; + + case SM_ALLOC: + /* + * This _must_ be free in the prior transaction + * otherwise we've lost atomicity. + */ + smd->nr_allocated_this_transaction++; + break; + + case SM_FREE: + /* + * It's only free if it's also free in the last + * transaction. + */ + r = sm_ll_lookup(&smd->old_ll, b, &old_count); + if (r) + return r; + + if (!old_count) + smd->nr_allocated_this_transaction--; + break; + } + } + + return r; +} + +static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + int r; + enum allocation_event ev; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_inc(&smd->ll, b, &ev); + if (!r && (ev == SM_ALLOC)) + /* + * This _must_ be free in the prior transaction + * otherwise we've lost atomicity. + */ + smd->nr_allocated_this_transaction++; + + return r; +} + +static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + int r; + uint32_t old_count; + enum allocation_event ev; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_dec(&smd->ll, b, &ev); + if (!r && (ev == SM_FREE)) { + /* + * It's only free if it's also free in the last + * transaction. 
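+		 * A block allocated in an earlier transaction was never
+		 * counted in nr_allocated_this_transaction, so the counter
+		 * is only decremented when the block was free at last commit.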
+ */ + r = sm_ll_lookup(&smd->old_ll, b, &old_count); + if (r) + return r; + + if (!old_count) + smd->nr_allocated_this_transaction--; + } + + return r; +} + +static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + int r; + enum allocation_event ev; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + /* FIXME: we should loop round a couple of times */ + r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b); + if (r) + return r; + + smd->begin = *b + 1; + r = sm_ll_inc(&smd->ll, *b, &ev); + if (!r) { + BUG_ON(ev != SM_ALLOC); + smd->nr_allocated_this_transaction++; + } + + return r; +} + +static int sm_disk_commit(struct dm_space_map *sm) +{ + int r; + dm_block_t nr_free; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_disk_get_nr_free(sm, &nr_free); + if (r) + return r; + + r = sm_ll_commit(&smd->ll); + if (r) + return r; + + memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + + r = sm_disk_get_nr_free(sm, &nr_free); + if (r) + return r; + + return 0; +} + +static int sm_disk_root_size(struct dm_space_map *sm, size_t *result) +{ + *result = sizeof(struct disk_sm_root); + + return 0; +} + +static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + struct disk_sm_root root_le; + + root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks); + root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated); + root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root); + root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root); + + if (max < sizeof(root_le)) + return -ENOSPC; + + memcpy(where_le, &root_le, sizeof(root_le)); + + return 0; +} + +/*----------------------------------------------------------------*/ + +static struct dm_space_map ops = { + .destroy = sm_disk_destroy, + .extend = sm_disk_extend, + .get_nr_blocks = sm_disk_get_nr_blocks, + .get_nr_free = sm_disk_get_nr_free, + .get_count = sm_disk_get_count, + .count_is_more_than_one = sm_disk_count_is_more_than_one, + .set_count = sm_disk_set_count, + .inc_block = sm_disk_inc_block, + .dec_block = sm_disk_dec_block, + .new_block = sm_disk_new_block, + .commit = sm_disk_commit, + .root_size = sm_disk_root_size, + .copy_root = sm_disk_copy_root +}; + +static struct dm_space_map *dm_sm_disk_create_real( + struct dm_transaction_manager *tm, + dm_block_t nr_blocks) +{ + int r; + struct sm_disk *smd; + + smd = kmalloc(sizeof(*smd), GFP_KERNEL); + if (!smd) + return ERR_PTR(-ENOMEM); + + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + memcpy(&smd->sm, &ops, sizeof(smd->sm)); + + r = sm_ll_new_disk(&smd->ll, tm); + if (r) + goto bad; + + r = sm_ll_extend(&smd->ll, nr_blocks); + if (r) + goto bad; + + r = sm_disk_commit(&smd->sm); + if (r) + goto bad; + + return &smd->sm; + +bad: + kfree(smd); + return ERR_PTR(r); +} + +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks) +{ + struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); + struct dm_space_map *smc; + + if (IS_ERR_OR_NULL(sm)) + return sm; + + smc = dm_sm_checker_create_fresh(sm); + if (IS_ERR(smc)) + dm_sm_destroy(sm); + + return smc; +} +EXPORT_SYMBOL_GPL(dm_sm_disk_create); + +static struct dm_space_map *dm_sm_disk_open_real( + struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct sm_disk *smd; + + smd = kmalloc(sizeof(*smd), GFP_KERNEL); + if (!smd) + return 
ERR_PTR(-ENOMEM); + + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + memcpy(&smd->sm, &ops, sizeof(smd->sm)); + + r = sm_ll_open_disk(&smd->ll, tm, root_le, len); + if (r) + goto bad; + + r = sm_disk_commit(&smd->sm); + if (r) + goto bad; + + return &smd->sm; + +bad: + kfree(smd); + return ERR_PTR(r); +} + +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + return dm_sm_checker_create( + dm_sm_disk_open_real(tm, root_le, len)); +} +EXPORT_SYMBOL_GPL(dm_sm_disk_open); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-space-map-disk.h b/drivers/md/persistent-data/dm-space-map-disk.h new file mode 100644 index 00000000..447a0a9a --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-disk.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_SPACE_MAP_DISK_H +#define _LINUX_DM_SPACE_MAP_DISK_H + +#include "dm-block-manager.h" + +struct dm_space_map; +struct dm_transaction_manager; + +/* + * Unfortunately we have to use two-phase construction due to the cycle + * between the tm and sm. + */ +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks); + +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root, size_t len); + +#endif /* _LINUX_DM_SPACE_MAP_DISK_H */ diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c new file mode 100644 index 00000000..e89ae5e7 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -0,0 +1,596 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map.h" +#include "dm-space-map-common.h" +#include "dm-space-map-metadata.h" + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "space map metadata" + +/*----------------------------------------------------------------*/ + +/* + * Space map interface. + * + * The low level disk format is written using the standard btree and + * transaction manager. This means that performing disk operations may + * cause us to recurse into the space map in order to allocate new blocks. + * For this reason we have a pool of pre-allocated blocks large enough to + * service any metadata_ll_disk operation. + */ + +/* + * FIXME: we should calculate this based on the size of the device. + * Only the metadata space map needs this functionality. 
+ */ +#define MAX_RECURSIVE_ALLOCATIONS 1024 + +enum block_op_type { + BOP_INC, + BOP_DEC +}; + +struct block_op { + enum block_op_type type; + dm_block_t block; +}; + +struct sm_metadata { + struct dm_space_map sm; + + struct ll_disk ll; + struct ll_disk old_ll; + + dm_block_t begin; + + unsigned recursion_count; + unsigned allocated_this_transaction; + unsigned nr_uncommitted; + struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; +}; + +static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) +{ + struct block_op *op; + + if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { + DMERR("too many recursive allocations"); + return -ENOMEM; + } + + op = smm->uncommitted + smm->nr_uncommitted++; + op->type = type; + op->block = b; + + return 0; +} + +static int commit_bop(struct sm_metadata *smm, struct block_op *op) +{ + int r = 0; + enum allocation_event ev; + + switch (op->type) { + case BOP_INC: + r = sm_ll_inc(&smm->ll, op->block, &ev); + break; + + case BOP_DEC: + r = sm_ll_dec(&smm->ll, op->block, &ev); + break; + } + + return r; +} + +static void in(struct sm_metadata *smm) +{ + smm->recursion_count++; +} + +static int out(struct sm_metadata *smm) +{ + int r = 0; + + /* + * If we're not recursing then very bad things are happening. + */ + if (!smm->recursion_count) { + DMERR("lost track of recursion depth"); + return -ENOMEM; + } + + if (smm->recursion_count == 1 && smm->nr_uncommitted) { + while (smm->nr_uncommitted && !r) { + smm->nr_uncommitted--; + r = commit_bop(smm, smm->uncommitted + + smm->nr_uncommitted); + if (r) + break; + } + } + + smm->recursion_count--; + + return r; +} + +/* + * When using the out() function above, we often want to combine an error + * code for the operation run in the recursive context with that from + * out(). + */ +static int combine_errors(int r1, int r2) +{ + return r1 ? r1 : r2; +} + +static int recursing(struct sm_metadata *smm) +{ + return smm->recursion_count; +} + +static void sm_metadata_destroy(struct dm_space_map *sm) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + kfree(smm); +} + +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + DMERR("doesn't support extend"); + return -EINVAL; +} + +static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->ll.nr_blocks; + + return 0; +} + +static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated - + smm->allocated_this_transaction; + + return 0; +} + +static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + int r, i; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + unsigned adjustment = 0; + + /* + * We may have some uncommitted adjustments to add. This list + * should always be really short. 
+ */ + for (i = 0; i < smm->nr_uncommitted; i++) { + struct block_op *op = smm->uncommitted + i; + + if (op->block != b) + continue; + + switch (op->type) { + case BOP_INC: + adjustment++; + break; + + case BOP_DEC: + adjustment--; + break; + } + } + + r = sm_ll_lookup(&smm->ll, b, result); + if (r) + return r; + + *result += adjustment; + + return 0; +} + +static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + int r, i, adjustment = 0; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + uint32_t rc; + + /* + * We may have some uncommitted adjustments to add. This list + * should always be really short. + */ + for (i = 0; i < smm->nr_uncommitted; i++) { + struct block_op *op = smm->uncommitted + i; + + if (op->block != b) + continue; + + switch (op->type) { + case BOP_INC: + adjustment++; + break; + + case BOP_DEC: + adjustment--; + break; + } + } + + if (adjustment > 1) { + *result = 1; + return 0; + } + + r = sm_ll_lookup_bitmap(&smm->ll, b, &rc); + if (r) + return r; + + if (rc == 3) + /* + * We err on the side of caution, and always return true. + */ + *result = 1; + else + *result = rc + adjustment > 1; + + return 0; +} + +static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + int r, r2; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (smm->recursion_count) { + DMERR("cannot recurse set_count()"); + return -EINVAL; + } + + in(smm); + r = sm_ll_insert(&smm->ll, b, count, &ev); + r2 = out(smm); + + return combine_errors(r, r2); +} + +static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + int r, r2 = 0; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (recursing(smm)) + r = add_bop(smm, BOP_INC, b); + else { + in(smm); + r = sm_ll_inc(&smm->ll, b, &ev); + r2 = out(smm); + } + + return combine_errors(r, r2); +} + +static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + int r, r2 = 0; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (recursing(smm)) + r = add_bop(smm, BOP_DEC, b); + else { + in(smm); + r = sm_ll_dec(&smm->ll, b, &ev); + r2 = out(smm); + } + + return combine_errors(r, r2); +} + +static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) +{ + int r, r2 = 0; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b); + if (r) + return r; + + smm->begin = *b + 1; + + if (recursing(smm)) + r = add_bop(smm, BOP_INC, *b); + else { + in(smm); + r = sm_ll_inc(&smm->ll, *b, &ev); + r2 = out(smm); + } + + if (!r) + smm->allocated_this_transaction++; + + return combine_errors(r, r2); +} + +static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + int r = sm_metadata_new_block_(sm, b); + if (r) + DMERR("out of metadata space"); + return r; +} + +static int sm_metadata_commit(struct dm_space_map *sm) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = sm_ll_commit(&smm->ll); + if (r) + return r; + + memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); + smm->begin = 0; + smm->allocated_this_transaction = 0; + + return 0; +} + +static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result) +{ + *result = sizeof(struct disk_sm_root); + + return 0; +} + +static int 
sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + struct disk_sm_root root_le; + + root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks); + root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated); + root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root); + root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root); + + if (max < sizeof(root_le)) + return -ENOSPC; + + memcpy(where_le, &root_le, sizeof(root_le)); + + return 0; +} + +static struct dm_space_map ops = { + .destroy = sm_metadata_destroy, + .extend = sm_metadata_extend, + .get_nr_blocks = sm_metadata_get_nr_blocks, + .get_nr_free = sm_metadata_get_nr_free, + .get_count = sm_metadata_get_count, + .count_is_more_than_one = sm_metadata_count_is_more_than_one, + .set_count = sm_metadata_set_count, + .inc_block = sm_metadata_inc_block, + .dec_block = sm_metadata_dec_block, + .new_block = sm_metadata_new_block, + .commit = sm_metadata_commit, + .root_size = sm_metadata_root_size, + .copy_root = sm_metadata_copy_root +}; + +/*----------------------------------------------------------------*/ + +/* + * When a new space map is created that manages its own space. We use + * this tiny bootstrap allocator. + */ +static void sm_bootstrap_destroy(struct dm_space_map *sm) +{ +} + +static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + DMERR("boostrap doesn't support extend"); + + return -EINVAL; +} + +static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + return smm->ll.nr_blocks; +} + +static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->ll.nr_blocks - smm->begin; + + return 0; +} + +static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + return b < smm->begin ? 1 : 0; +} + +static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + *result = 0; + + return 0; +} + +static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + DMERR("boostrap doesn't support set_count"); + + return -EINVAL; +} + +static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + /* + * We know the entire device is unused. 
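+	 * Blocks are therefore handed out sequentially from smm->begin.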
+ */ + if (smm->begin == smm->ll.nr_blocks) + return -ENOSPC; + + *b = smm->begin++; + + return 0; +} + +static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + return add_bop(smm, BOP_INC, b); +} + +static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + return add_bop(smm, BOP_DEC, b); +} + +static int sm_bootstrap_commit(struct dm_space_map *sm) +{ + return 0; +} + +static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) +{ + DMERR("boostrap doesn't support root_size"); + + return -EINVAL; +} + +static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, + size_t max) +{ + DMERR("boostrap doesn't support copy_root"); + + return -EINVAL; +} + +static struct dm_space_map bootstrap_ops = { + .destroy = sm_bootstrap_destroy, + .extend = sm_bootstrap_extend, + .get_nr_blocks = sm_bootstrap_get_nr_blocks, + .get_nr_free = sm_bootstrap_get_nr_free, + .get_count = sm_bootstrap_get_count, + .count_is_more_than_one = sm_bootstrap_count_is_more_than_one, + .set_count = sm_bootstrap_set_count, + .inc_block = sm_bootstrap_inc_block, + .dec_block = sm_bootstrap_dec_block, + .new_block = sm_bootstrap_new_block, + .commit = sm_bootstrap_commit, + .root_size = sm_bootstrap_root_size, + .copy_root = sm_bootstrap_copy_root +}; + +/*----------------------------------------------------------------*/ + +struct dm_space_map *dm_sm_metadata_init(void) +{ + struct sm_metadata *smm; + + smm = kmalloc(sizeof(*smm), GFP_KERNEL); + if (!smm) + return ERR_PTR(-ENOMEM); + + memcpy(&smm->sm, &ops, sizeof(smm->sm)); + + return &smm->sm; +} + +int dm_sm_metadata_create(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + dm_block_t nr_blocks, + dm_block_t superblock) +{ + int r; + dm_block_t i; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + smm->begin = superblock + 1; + smm->recursion_count = 0; + smm->allocated_this_transaction = 0; + smm->nr_uncommitted = 0; + + memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); + + r = sm_ll_new_metadata(&smm->ll, tm); + if (r) + return r; + + r = sm_ll_extend(&smm->ll, nr_blocks); + if (r) + return r; + + memcpy(&smm->sm, &ops, sizeof(smm->sm)); + + /* + * Now we need to update the newly created data structures with the + * allocated blocks that they were built from. + */ + for (i = superblock; !r && i < smm->begin; i++) + r = sm_ll_inc(&smm->ll, i, &ev); + + if (r) + return r; + + return sm_metadata_commit(sm); +} + +int dm_sm_metadata_open(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = sm_ll_open_metadata(&smm->ll, tm, root_le, len); + if (r) + return r; + + smm->begin = 0; + smm->recursion_count = 0; + smm->allocated_this_transaction = 0; + smm->nr_uncommitted = 0; + + memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); + return 0; +} diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h new file mode 100644 index 00000000..39bba080 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-metadata.h @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#ifndef DM_SPACE_MAP_METADATA_H +#define DM_SPACE_MAP_METADATA_H + +#include "dm-transaction-manager.h" + +/* + * Unfortunately we have to use two-phase construction due to the cycle + * between the tm and sm. + */ +struct dm_space_map *dm_sm_metadata_init(void); + +/* + * Create a fresh space map. + */ +int dm_sm_metadata_create(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + dm_block_t nr_blocks, + dm_block_t superblock); + +/* + * Open from a previously-recorded root. + */ +int dm_sm_metadata_open(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + void *root_le, size_t len); + +#endif /* DM_SPACE_MAP_METADATA_H */ diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h new file mode 100644 index 00000000..1cbfc6b1 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map.h @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_SPACE_MAP_H +#define _LINUX_DM_SPACE_MAP_H + +#include "dm-block-manager.h" + +/* + * struct dm_space_map keeps a record of how many times each block in a device + * is referenced. It needs to be fixed on disk as part of the transaction. + */ +struct dm_space_map { + void (*destroy)(struct dm_space_map *sm); + + /* + * You must commit before allocating the newly added space. + */ + int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks); + + /* + * Extensions do not appear in this count until after commit has + * been called. + */ + int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count); + + /* + * Space maps must never allocate a block from the previous + * transaction, in case we need to rollback. This complicates the + * semantics of get_nr_free(), it should return the number of blocks + * that are available for allocation _now_. For instance you may + * have blocks with a zero reference count that will not be + * available for allocation until after the next commit. + */ + int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count); + + int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result); + int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b, + int *result); + int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count); + + int (*commit)(struct dm_space_map *sm); + + int (*inc_block)(struct dm_space_map *sm, dm_block_t b); + int (*dec_block)(struct dm_space_map *sm, dm_block_t b); + + /* + * new_block will increment the returned block. + */ + int (*new_block)(struct dm_space_map *sm, dm_block_t *b); + + /* + * The root contains all the information needed to fix the space map. + * Generally this info is small, so squirrel it away in a disk block + * along with other info. 
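+ * For example, a caller's commit path might do (an illustrative + * sketch only; sb_data and root_offset are placeholder names for the + * caller's superblock buffer): + * + *	dm_sm_root_size(sm, &len); + *	dm_sm_copy_root(sm, sb_data + root_offset, len);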
+ */ + int (*root_size)(struct dm_space_map *sm, size_t *result); + int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len); +}; + +/*----------------------------------------------------------------*/ + +static inline void dm_sm_destroy(struct dm_space_map *sm) +{ + sm->destroy(sm); +} + +static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + return sm->extend(sm, extra_blocks); +} + +static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + return sm->get_nr_blocks(sm, count); +} + +static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + return sm->get_nr_free(sm, count); +} + +static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + return sm->get_count(sm, b, result); +} + +static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + return sm->count_is_more_than_one(sm, b, result); +} + +static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + return sm->set_count(sm, b, count); +} + +static inline int dm_sm_commit(struct dm_space_map *sm) +{ + return sm->commit(sm); +} + +static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + return sm->inc_block(sm, b); +} + +static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + return sm->dec_block(sm, b); +} + +static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + return sm->new_block(sm, b); +} + +static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result) +{ + return sm->root_size(sm, result); +} + +static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) +{ + return sm->copy_root(sm, copy_to_here_le, len); +} + +#endif /* _LINUX_DM_SPACE_MAP_H */ diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c new file mode 100644 index 00000000..ba54aacf --- /dev/null +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -0,0 +1,407 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#include "dm-transaction-manager.h" +#include "dm-space-map.h" +#include "dm-space-map-checker.h" +#include "dm-space-map-disk.h" +#include "dm-space-map-metadata.h" +#include "dm-persistent-data-internal.h" + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "transaction manager" + +/*----------------------------------------------------------------*/ + +struct shadow_info { + struct hlist_node hlist; + dm_block_t where; +}; + +/* + * It would be nice if we scaled with the size of transaction. + */ +#define HASH_SIZE 256 +#define HASH_MASK (HASH_SIZE - 1) + +struct dm_transaction_manager { + int is_clone; + struct dm_transaction_manager *real; + + struct dm_block_manager *bm; + struct dm_space_map *sm; + + spinlock_t lock; + struct hlist_head buckets[HASH_SIZE]; +}; + +/*----------------------------------------------------------------*/ + +static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) +{ + int r = 0; + unsigned bucket = dm_hash_block(b, HASH_MASK); + struct shadow_info *si; + struct hlist_node *n; + + spin_lock(&tm->lock); + hlist_for_each_entry(si, n, tm->buckets + bucket, hlist) + if (si->where == b) { + r = 1; + break; + } + spin_unlock(&tm->lock); + + return r; +} + +/* + * This can silently fail if there's no memory. 
We're ok with this since + * creating redundant shadows causes no harm. + */ +static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) +{ + unsigned bucket; + struct shadow_info *si; + + si = kmalloc(sizeof(*si), GFP_NOIO); + if (si) { + si->where = b; + bucket = dm_hash_block(b, HASH_MASK); + spin_lock(&tm->lock); + hlist_add_head(&si->hlist, tm->buckets + bucket); + spin_unlock(&tm->lock); + } +} + +static void wipe_shadow_table(struct dm_transaction_manager *tm) +{ + struct shadow_info *si; + struct hlist_node *n, *tmp; + struct hlist_head *bucket; + int i; + + spin_lock(&tm->lock); + for (i = 0; i < HASH_SIZE; i++) { + bucket = tm->buckets + i; + hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) + kfree(si); + + INIT_HLIST_HEAD(bucket); + } + + spin_unlock(&tm->lock); +} + +/*----------------------------------------------------------------*/ + +static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, + struct dm_space_map *sm) +{ + int i; + struct dm_transaction_manager *tm; + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (!tm) + return ERR_PTR(-ENOMEM); + + tm->is_clone = 0; + tm->real = NULL; + tm->bm = bm; + tm->sm = sm; + + spin_lock_init(&tm->lock); + for (i = 0; i < HASH_SIZE; i++) + INIT_HLIST_HEAD(tm->buckets + i); + + return tm; +} + +struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real) +{ + struct dm_transaction_manager *tm; + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (tm) { + tm->is_clone = 1; + tm->real = real; + } + + return tm; +} +EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); + +void dm_tm_destroy(struct dm_transaction_manager *tm) +{ + if (!tm->is_clone) + wipe_shadow_table(tm); + + kfree(tm); +} +EXPORT_SYMBOL_GPL(dm_tm_destroy); + +int dm_tm_pre_commit(struct dm_transaction_manager *tm) +{ + int r; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_commit(tm->sm); + if (r < 0) + return r; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_tm_pre_commit); + +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + wipe_shadow_table(tm); + + return dm_bm_flush_and_unlock(tm->bm, root); +} +EXPORT_SYMBOL_GPL(dm_tm_commit); + +int dm_tm_new_block(struct dm_transaction_manager *tm, + struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + dm_block_t new_block; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_new_block(tm->sm, &new_block); + if (r < 0) + return r; + + r = dm_bm_write_lock_zero(tm->bm, new_block, v, result); + if (r < 0) { + dm_sm_dec_block(tm->sm, new_block); + return r; + } + + /* + * New blocks count as shadows in that they don't need to be + * shadowed again. 
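+ * They were allocated within this transaction, so no committed data + * can refer to them yet.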
+ */ + insert_shadow(tm, new_block); + + return 0; +} + +static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + dm_block_t new; + struct dm_block *orig_block; + + r = dm_sm_new_block(tm->sm, &new); + if (r < 0) + return r; + + r = dm_sm_dec_block(tm->sm, orig); + if (r < 0) + return r; + + r = dm_bm_read_lock(tm->bm, orig, v, &orig_block); + if (r < 0) + return r; + + r = dm_bm_unlock_move(orig_block, new); + if (r < 0) { + dm_bm_unlock(orig_block); + return r; + } + + return dm_bm_write_lock(tm->bm, new, v, result); +} + +int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, struct dm_block **result, + int *inc_children) +{ + int r; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children); + if (r < 0) + return r; + + if (is_shadow(tm, orig) && !*inc_children) + return dm_bm_write_lock(tm->bm, orig, v, result); + + r = __shadow_block(tm, orig, v, result); + if (r < 0) + return r; + insert_shadow(tm, dm_block_location(*result)); + + return r; +} + +int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **blk) +{ + if (tm->is_clone) + return dm_bm_read_try_lock(tm->real->bm, b, v, blk); + + return dm_bm_read_lock(tm->bm, b, v, blk); +} + +int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) +{ + return dm_bm_unlock(b); +} +EXPORT_SYMBOL_GPL(dm_tm_unlock); + +void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_inc_block(tm->sm, b); +} +EXPORT_SYMBOL_GPL(dm_tm_inc); + +void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) +{ + /* + * The non-blocking clone doesn't support this. 
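+ * Adjusting a reference count may need to read or write space map + * blocks, which could block.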
+ */ + BUG_ON(tm->is_clone); + + dm_sm_dec_block(tm->sm, b); +} +EXPORT_SYMBOL_GPL(dm_tm_dec); + +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, + uint32_t *result) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + return dm_sm_get_count(tm->sm, b, result); +} + +struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) +{ + return tm->bm; +} + +/*----------------------------------------------------------------*/ + +static int dm_tm_create_internal(struct dm_block_manager *bm, + dm_block_t sb_location, + struct dm_block_validator *sb_validator, + size_t root_offset, size_t root_max_len, + struct dm_transaction_manager **tm, + struct dm_space_map **sm, + struct dm_block **sblock, + int create) +{ + int r; + struct dm_space_map *inner; + + inner = dm_sm_metadata_init(); + if (IS_ERR(inner)) + return PTR_ERR(inner); + + *tm = dm_tm_create(bm, inner); + if (IS_ERR(*tm)) { + dm_sm_destroy(inner); + return PTR_ERR(*tm); + } + + if (create) { + r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location, + sb_validator, sblock); + if (r < 0) { + DMERR("couldn't lock superblock"); + goto bad1; + } + + r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm), + sb_location); + if (r) { + DMERR("couldn't create metadata space map"); + goto bad2; + } + + *sm = dm_sm_checker_create(inner); + if (IS_ERR(*sm)) { + r = PTR_ERR(*sm); + goto bad2; + } + + } else { + r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, + sb_validator, sblock); + if (r < 0) { + DMERR("couldn't lock superblock"); + goto bad1; + } + + r = dm_sm_metadata_open(inner, *tm, + dm_block_data(*sblock) + root_offset, + root_max_len); + if (r) { + DMERR("couldn't open metadata space map"); + goto bad2; + } + + *sm = dm_sm_checker_create(inner); + if (IS_ERR(*sm)) { + r = PTR_ERR(*sm); + goto bad2; + } + } + + return 0; + +bad2: + dm_tm_unlock(*tm, *sblock); +bad1: + dm_tm_destroy(*tm); + dm_sm_destroy(inner); + return r; +} + +int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + struct dm_block_validator *sb_validator, + struct dm_transaction_manager **tm, + struct dm_space_map **sm, struct dm_block **sblock) +{ + return dm_tm_create_internal(bm, sb_location, sb_validator, + 0, 0, tm, sm, sblock, 1); +} +EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); + +int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + struct dm_block_validator *sb_validator, + size_t root_offset, size_t root_max_len, + struct dm_transaction_manager **tm, + struct dm_space_map **sm, struct dm_block **sblock) +{ + return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset, + root_max_len, tm, sm, sblock, 0); +} +EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h new file mode 100644 index 00000000..6da78487 --- /dev/null +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_TRANSACTION_MANAGER_H +#define _LINUX_DM_TRANSACTION_MANAGER_H + +#include "dm-block-manager.h" + +struct dm_transaction_manager; +struct dm_space_map; + +/*----------------------------------------------------------------*/ + +/* + * This manages the scope of a transaction. It also enforces immutability + * of the on-disk data structures by limiting access to writeable blocks. 
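+ * A block written in a previous, committed transaction is never + * modified in place; it is shadowed (copied to a fresh block) first.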
+ * + * Clients should not fiddle with the block manager directly. + */ + +void dm_tm_destroy(struct dm_transaction_manager *tm); + +/* + * The non-blocking version of a transaction manager is intended for use in + * fast path code that needs to do lookups e.g. a dm mapping function. + * You create the non-blocking variant from a normal tm. The interface is + * the same, except that most functions will just return -EWOULDBLOCK. + * Methods that return void yet may block should not be called on a clone + * viz. dm_tm_inc, dm_tm_dec. Call dm_tm_destroy() as you would with a normal + * tm when you've finished with it. You may not destroy the original prior + * to clones. + */ +struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real); + +/* + * We use a 2-phase commit here. + * + * i) In the first phase the block manager is told to start flushing, and + * the changes to the space map are written to disk. You should interrogate + * your particular space map to get detail of its root node etc. to be + * included in your superblock. + * + * ii) @root will be committed last. You shouldn't use more than the + * first 512 bytes of @root if you wish the transaction to survive a power + * failure. You *must* have a write lock held on @root for both stage (i) + * and (ii). The commit will drop the write lock. + */ +int dm_tm_pre_commit(struct dm_transaction_manager *tm); +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root); + +/* + * These methods are the only way to get hold of a writeable block. + */ + +/* + * dm_tm_new_block() is pretty self-explanatory. Make sure you do actually + * write to the whole of @data before you unlock, otherwise you could get + * a data leak. (The other option is for tm_new_block() to zero new blocks + * before handing them out, which will be redundant in most, if not all, + * cases). + * Zeroes the new block and returns with write lock held. + */ +int dm_tm_new_block(struct dm_transaction_manager *tm, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * dm_tm_shadow_block() allocates a new block and copies the data from @orig + * to it. It then decrements the reference count on original block. Use + * this to update the contents of a block in a data structure, don't + * confuse this with a clone - you shouldn't access the orig block after + * this operation. Because the tm knows the scope of the transaction it + * can optimise requests for a shadow of a shadow to a no-op. Don't forget + * to unlock when you've finished with the shadow. + * + * The @inc_children flag is used to tell the caller whether it needs to + * adjust reference counts for children. (Data in the block may refer to + * other blocks.) + * + * Shadowing implicitly drops a reference on @orig so you must not have + * it locked when you call this. + */ +int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, + struct dm_block **result, int *inc_children); + +/* + * Read access. You can lock any block you want. If there's a write lock + * on it outstanding then it'll block. + */ +int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); + +/* + * Functions for altering the reference count of a block directly. 
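+ * A typical use is after dm_tm_shadow_block() returns with + * *inc_children set: the caller then calls dm_tm_inc() on each block + * that the newly shadowed block references.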
+ */ +void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b); + +void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b); + +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, + uint32_t *result); + +struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); + +/* + * A little utility that ties the knot by producing a transaction manager + * that has a space map managed by the transaction manager... + * + * Returns a tm that has an open transaction to write the new disk sm. + * Caller should store the new sm root and commit. + */ +int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + struct dm_block_validator *sb_validator, + struct dm_transaction_manager **tm, + struct dm_space_map **sm, struct dm_block **sblock); + +int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + struct dm_block_validator *sb_validator, + size_t root_offset, size_t root_max_len, + struct dm_transaction_manager **tm, + struct dm_space_map **sm, struct dm_block **sblock); + +#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c new file mode 100644 index 00000000..de63a1fc --- /dev/null +++ b/drivers/md/raid0.c @@ -0,0 +1,739 @@ +/* + raid0.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + <zyngier@ufr-info-p7.ibp.fr> or + <maz@gloups.fdn.fr> + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + + + RAID-0 management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#include <linux/blkdev.h> +#include <linux/seq_file.h> +#include <linux/module.h> +#include <linux/slab.h> +#include "md.h" +#include "raid0.h" +#include "raid5.h" + +static int raid0_congested(void *data, int bits) +{ + struct mddev *mddev = data; + struct r0conf *conf = mddev->private; + struct md_rdev **devlist = conf->devlist; + int raid_disks = conf->strip_zone[0].nb_dev; + int i, ret = 0; + + if (mddev_congested(mddev, bits)) + return 1; + + for (i = 0; i < raid_disks && !ret ; i++) { + struct request_queue *q = bdev_get_queue(devlist[i]->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + } + return ret; +} + +/* + * inform the user of the raid configuration +*/ +static void dump_zones(struct mddev *mddev) +{ + int j, k; + sector_t zone_size = 0; + sector_t zone_start = 0; + char b[BDEVNAME_SIZE]; + struct r0conf *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; + printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", + mdname(mddev), + conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); + for (j = 0; j < conf->nr_strip_zones; j++) { + printk(KERN_INFO "md: zone%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + printk(KERN_CONT "%s%s", k?"/":"", + bdevname(conf->devlist[j*raid_disks + + k]->bdev, b)); + printk(KERN_CONT "]\n"); + + zone_size = conf->strip_zone[j].zone_end - zone_start; + printk(KERN_INFO " zone-offset=%10lluKB, " + "device-offset=%10lluKB, size=%10lluKB\n", + (unsigned long long)zone_start>>1, + (unsigned long long)conf->strip_zone[j].dev_start>>1, + (unsigned long long)zone_size>>1); + zone_start = conf->strip_zone[j].zone_end; + } + printk(KERN_INFO "\n"); +} + +static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) +{ + int i, c, err; + sector_t curr_zone_end, sectors; + struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev; + struct strip_zone *zone; + int cnt; + char b[BDEVNAME_SIZE]; + char b2[BDEVNAME_SIZE]; + struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + + if (!conf) + return -ENOMEM; + rdev_for_each(rdev1, mddev) { + pr_debug("md/raid0:%s: looking at %s\n", + mdname(mddev), + bdevname(rdev1->bdev, b)); + c = 0; + + /* round size to chunk_size */ + sectors = rdev1->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev1->sectors = sectors * mddev->chunk_sectors; + + rdev_for_each(rdev2, mddev) { + pr_debug("md/raid0:%s: comparing %s(%llu)" + " with %s(%llu)\n", + mdname(mddev), + bdevname(rdev1->bdev,b), + (unsigned long long)rdev1->sectors, + bdevname(rdev2->bdev,b2), + (unsigned long long)rdev2->sectors); + if (rdev2 == rdev1) { + pr_debug("md/raid0:%s: END\n", + mdname(mddev)); + break; + } + if (rdev2->sectors == rdev1->sectors) { + /* + * Not unique, don't count it as a new + * group + */ + pr_debug("md/raid0:%s: EQUAL\n", + mdname(mddev)); + c = 1; + break; + } + pr_debug("md/raid0:%s: NOT EQUAL\n", + mdname(mddev)); + } + if (!c) { + pr_debug("md/raid0:%s: ==> UNIQUE\n", + mdname(mddev)); + conf->nr_strip_zones++; + pr_debug("md/raid0:%s: %d zones\n", + mdname(mddev), conf->nr_strip_zones); + } + } + pr_debug("md/raid0:%s: FINAL %d zones\n", + mdname(mddev), conf->nr_strip_zones); + err = -ENOMEM; + conf->strip_zone = kzalloc(sizeof(struct strip_zone)* + conf->nr_strip_zones, GFP_KERNEL); + if (!conf->strip_zone) + goto abort; + conf->devlist = kzalloc(sizeof(struct md_rdev*)* + conf->nr_strip_zones*mddev->raid_disks, + GFP_KERNEL); + if (!conf->devlist) + goto abort; + + /* The first zone must contain all devices, so here we check that + * 
there is a proper alignment of slots to devices and find them all + */ + zone = &conf->strip_zone[0]; + cnt = 0; + smallest = NULL; + dev = conf->devlist; + err = -EINVAL; + rdev_for_each(rdev1, mddev) { + int j = rdev1->raid_disk; + + if (mddev->level == 10) { + /* taking over a raid10-n2 array */ + j /= 2; + rdev1->new_raid_disk = j; + } + + if (mddev->level == 1) { + /* taiking over a raid1 array- + * we have only one active disk + */ + j = 0; + rdev1->new_raid_disk = j; + } + + if (j < 0 || j >= mddev->raid_disks) { + printk(KERN_ERR "md/raid0:%s: bad disk number %d - " + "aborting!\n", mdname(mddev), j); + goto abort; + } + if (dev[j]) { + printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " + "aborting!\n", mdname(mddev), j); + goto abort; + } + dev[j] = rdev1; + + disk_stack_limits(mddev->gendisk, rdev1->bdev, + rdev1->data_offset << 9); + + if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) + conf->has_merge_bvec = 1; + + if (!smallest || (rdev1->sectors < smallest->sectors)) + smallest = rdev1; + cnt++; + } + if (cnt != mddev->raid_disks) { + printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " + "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); + goto abort; + } + zone->nb_dev = cnt; + zone->zone_end = smallest->sectors * cnt; + + curr_zone_end = zone->zone_end; + + /* now do the other zones */ + for (i = 1; i < conf->nr_strip_zones; i++) + { + int j; + + zone = conf->strip_zone + i; + dev = conf->devlist + i * mddev->raid_disks; + + pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i); + zone->dev_start = smallest->sectors; + smallest = NULL; + c = 0; + + for (j=0; j<cnt; j++) { + rdev = conf->devlist[j]; + if (rdev->sectors <= zone->dev_start) { + pr_debug("md/raid0:%s: checking %s ... nope\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + continue; + } + pr_debug("md/raid0:%s: checking %s ..." 
+ " contained as device %d\n", + mdname(mddev), + bdevname(rdev->bdev, b), c); + dev[c] = rdev; + c++; + if (!smallest || rdev->sectors < smallest->sectors) { + smallest = rdev; + pr_debug("md/raid0:%s: (%llu) is smallest!.\n", + mdname(mddev), + (unsigned long long)rdev->sectors); + } + } + + zone->nb_dev = c; + sectors = (smallest->sectors - zone->dev_start) * c; + pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", + mdname(mddev), + zone->nb_dev, (unsigned long long)sectors); + + curr_zone_end += sectors; + zone->zone_end = curr_zone_end; + + pr_debug("md/raid0:%s: current zone start: %llu\n", + mdname(mddev), + (unsigned long long)smallest->sectors); + } + mddev->queue->backing_dev_info.congested_fn = raid0_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size + */ + if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", + mdname(mddev), + mddev->chunk_sectors << 9); + goto abort; + } + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); + *private_conf = conf; + + return 0; +abort: + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); + *private_conf = NULL; + return err; +} + +/* Find the zone which holds a particular offset + * Update *sectorp to be an offset in that zone + */ +static struct strip_zone *find_zone(struct r0conf *conf, + sector_t *sectorp) +{ + int i; + struct strip_zone *z = conf->strip_zone; + sector_t sector = *sectorp; + + for (i = 0; i < conf->nr_strip_zones; i++) + if (sector < z[i].zone_end) { + if (i) + *sectorp = sector - z[i-1].zone_end; + return z + i; + } + BUG(); +} + +/* + * remaps the bio to the target device. we separate two flows. + * power 2 flow and a general flow for the sake of perfromance +*/ +static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, + sector_t sector, sector_t *sector_offset) +{ + unsigned int sect_in_chunk; + sector_t chunk; + struct r0conf *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; + unsigned int chunk_sects = mddev->chunk_sectors; + + if (is_power_of_2(chunk_sects)) { + int chunksect_bits = ffz(~chunk_sects); + /* find the sector offset inside the chunk */ + sect_in_chunk = sector & (chunk_sects - 1); + sector >>= chunksect_bits; + /* chunk in zone */ + chunk = *sector_offset; + /* quotient is the chunk in real device*/ + sector_div(chunk, zone->nb_dev << chunksect_bits); + } else{ + sect_in_chunk = sector_div(sector, chunk_sects); + chunk = *sector_offset; + sector_div(chunk, chunk_sects * zone->nb_dev); + } + /* + * position the bio over the real device + * real sector = chunk in device + starting of zone + * + the position in the chunk + */ + *sector_offset = (chunk * chunk_sects) + sect_in_chunk; + return conf->devlist[(zone - conf->strip_zone)*raid_disks + + sector_div(sector, zone->nb_dev)]; +} + +/** + * raid0_mergeable_bvec -- tell bio layer if two requests can be merged + * @q: request queue + * @bvm: properties of new bio + * @biovec: the request that could be merged to it. 
+ * + * Return amount of bytes we can accept at this offset + */ +static int raid0_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + struct r0conf *conf = mddev->private; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + sector_t sector_offset = sector; + int max; + unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int bio_sectors = bvm->bi_size >> 9; + struct strip_zone *zone; + struct md_rdev *rdev; + struct request_queue *subq; + + if (is_power_of_2(chunk_sectors)) + max = (chunk_sectors - ((sector & (chunk_sectors-1)) + + bio_sectors)) << 9; + else + max = (chunk_sectors - (sector_div(sector, chunk_sectors) + + bio_sectors)) << 9; + if (max < 0) + max = 0; /* bio_add cannot handle a negative return */ + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + if (max < biovec->bv_len) + /* too small already, no need to check further */ + return max; + if (!conf->has_merge_bvec) + return max; + + /* May need to check subordinate device */ + sector = sector_offset; + zone = find_zone(mddev->private, §or_offset); + rdev = map_sector(mddev, zone, sector, §or_offset); + subq = bdev_get_queue(rdev->bdev); + if (subq->merge_bvec_fn) { + bvm->bi_bdev = rdev->bdev; + bvm->bi_sector = sector_offset + zone->dev_start + + rdev->data_offset; + return min(max, subq->merge_bvec_fn(subq, bvm, biovec)); + } else + return max; +} + +static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + sector_t array_sectors = 0; + struct md_rdev *rdev; + + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + rdev_for_each(rdev, mddev) + array_sectors += rdev->sectors; + + return array_sectors; +} + +static int raid0_stop(struct mddev *mddev); + +static int raid0_run(struct mddev *mddev) +{ + struct r0conf *conf; + int ret; + + if (mddev->chunk_sectors == 0) { + printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", + mdname(mddev)); + return -EINVAL; + } + if (md_check_no_bitmap(mddev)) + return -EINVAL; + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + + /* if private is not null, we are here after takeover */ + if (mddev->private == NULL) { + ret = create_strip_zones(mddev, &conf); + if (ret < 0) + return ret; + mddev->private = conf; + } + conf = mddev->private; + + /* calculate array device size */ + md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); + + printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", + mdname(mddev), + (unsigned long long)mddev->array_sectors); + /* calculate the max read-ahead size. + * For read-ahead of large files to be effective, we need to + * readahead at least twice a whole stripe. i.e. number of devices + * multiplied by chunk size times 2. + * If an individual device has an ra_pages greater than the + * chunk size, then we will not drive that device as hard as it + * wants. We consider this a configuration error: a larger + * chunksize should be used in that case. 
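+ * For example, with 4 member disks and 512K chunks one stripe is + * 2MB, so read-ahead is raised to at least 4MB worth of pages.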
+ */ + { + int stripe = mddev->raid_disks * + (mddev->chunk_sectors << 9) / PAGE_SIZE; + if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) + mddev->queue->backing_dev_info.ra_pages = 2* stripe; + } + + blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); + dump_zones(mddev); + + ret = md_integrity_register(mddev); + if (ret) + raid0_stop(mddev); + + return ret; +} + +static int raid0_stop(struct mddev *mddev) +{ + struct r0conf *conf = mddev->private; + + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); + mddev->private = NULL; + return 0; +} + +/* + * Is io distribute over 1 or more chunks ? +*/ +static inline int is_io_in_chunk_boundary(struct mddev *mddev, + unsigned int chunk_sects, struct bio *bio) +{ + if (likely(is_power_of_2(chunk_sects))) { + return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) + + (bio->bi_size >> 9)); + } else{ + sector_t sector = bio->bi_sector; + return chunk_sects >= (sector_div(sector, chunk_sects) + + (bio->bi_size >> 9)); + } +} + +static void raid0_make_request(struct mddev *mddev, struct bio *bio) +{ + unsigned int chunk_sects; + sector_t sector_offset; + struct strip_zone *zone; + struct md_rdev *tmp_dev; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; + } + + chunk_sects = mddev->chunk_sectors; + if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { + sector_t sector = bio->bi_sector; + struct bio_pair *bp; + /* Sanity check -- queue functions should prevent this happening */ + if (bio->bi_vcnt != 1 || + bio->bi_idx != 0) + goto bad_map; + /* This is a one page bio that upper layers + * refuse to split for us, so we need to split it. + */ + if (likely(is_power_of_2(chunk_sects))) + bp = bio_split(bio, chunk_sects - (sector & + (chunk_sects-1))); + else + bp = bio_split(bio, chunk_sects - + sector_div(sector, chunk_sects)); + raid0_make_request(mddev, &bp->bio1); + raid0_make_request(mddev, &bp->bio2); + bio_pair_release(bp); + return; + } + + sector_offset = bio->bi_sector; + zone = find_zone(mddev->private, §or_offset); + tmp_dev = map_sector(mddev, zone, bio->bi_sector, + §or_offset); + bio->bi_bdev = tmp_dev->bdev; + bio->bi_sector = sector_offset + zone->dev_start + + tmp_dev->data_offset; + + generic_make_request(bio); + return; + +bad_map: + printk("md/raid0:%s: make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", + mdname(mddev), chunk_sects / 2, + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + + bio_io_error(bio); + return; +} + +static void raid0_status(struct seq_file *seq, struct mddev *mddev) +{ + seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); + return; +} + +static void *raid0_takeover_raid45(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct r0conf *priv_conf; + + if (mddev->degraded != 1) { + printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! 
Degraded disks: %d\n", + mdname(mddev), + mddev->degraded); + return ERR_PTR(-EINVAL); + } + + rdev_for_each(rdev, mddev) { + /* check slot number for a disk */ + if (rdev->raid_disk == mddev->raid_disks-1) { + printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks--; + mddev->delta_disks = -1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + +static void *raid0_takeover_raid10(struct mddev *mddev) +{ + struct r0conf *priv_conf; + + /* Check layout: + * - far_copies must be 1 + * - near_copies must be 2 + * - disks number must be even + * - all mirrors must be already degraded + */ + if (mddev->layout != ((1 << 8) + 2)) { + printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", + mdname(mddev), + mddev->layout); + return ERR_PTR(-EINVAL); + } + if (mddev->raid_disks & 1) { + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + if (mddev->degraded != (mddev->raid_disks>>1)) { + printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = - mddev->raid_disks / 2; + mddev->raid_disks += mddev->delta_disks; + mddev->degraded = 0; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + +static void *raid0_takeover_raid1(struct mddev *mddev) +{ + struct r0conf *priv_conf; + int chunksect; + + /* Check layout: + * - (N - 1) mirror drives must be already faulty + */ + if ((mddev->raid_disks - 1) != mddev->degraded) { + printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* + * a raid1 doesn't have the notion of chunk size, so + * figure out the largest suitable size we can use. + */ + chunksect = 64 * 2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect - 1))) + chunksect >>= 1; + + if ((chunksect << 9) < PAGE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = chunksect; + mddev->chunk_sectors = chunksect; + mddev->delta_disks = 1 - mddev->raid_disks; + mddev->raid_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + +static void *raid0_takeover(struct mddev *mddev) +{ + /* raid0 can take over: + * raid4 - if all data disks are active. 
+ * raid5 - providing it is Raid4 layout and one disk is faulty + * raid10 - assuming we have all necessary active disks + * raid1 - with (N -1) mirror drives faulty + */ + if (mddev->level == 4) + return raid0_takeover_raid45(mddev); + + if (mddev->level == 5) { + if (mddev->layout == ALGORITHM_PARITY_N) + return raid0_takeover_raid45(mddev); + + printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", + mdname(mddev), ALGORITHM_PARITY_N); + } + + if (mddev->level == 10) + return raid0_takeover_raid10(mddev); + + if (mddev->level == 1) + return raid0_takeover_raid1(mddev); + + printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", + mddev->level); + + return ERR_PTR(-EINVAL); +} + +static void raid0_quiesce(struct mddev *mddev, int state) +{ +} + +static struct md_personality raid0_personality= +{ + .name = "raid0", + .level = 0, + .owner = THIS_MODULE, + .make_request = raid0_make_request, + .run = raid0_run, + .stop = raid0_stop, + .status = raid0_status, + .size = raid0_size, + .takeover = raid0_takeover, + .quiesce = raid0_quiesce, +}; + +static int __init raid0_init (void) +{ + return register_md_personality (&raid0_personality); +} + +static void raid0_exit (void) +{ + unregister_md_personality (&raid0_personality); +} + +module_init(raid0_init); +module_exit(raid0_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID0 (striping) personality for MD"); +MODULE_ALIAS("md-personality-2"); /* RAID0 */ +MODULE_ALIAS("md-raid0"); +MODULE_ALIAS("md-level-0"); diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 00000000..05539d9c --- /dev/null +++ b/drivers/md/raid0.h @@ -0,0 +1,19 @@ +#ifndef _RAID0_H +#define _RAID0_H + +struct strip_zone { + sector_t zone_end; /* Start of the next zone (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + int nb_dev; /* # of devices attached to the zone */ +}; + +struct r0conf { + struct strip_zone *strip_zone; + struct md_rdev **devlist; /* lists of rdevs, pointed to + * by strip_zone->dev */ + int nr_strip_zones; + int has_merge_bvec; /* at least one member has + * a merge_bvec_fn */ +}; + +#endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c new file mode 100644 index 00000000..d7e95772 --- /dev/null +++ b/drivers/md/raid1.c @@ -0,0 +1,2953 @@ +/* + * raid1.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * RAID-1 management functions. + * + * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 + * + * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk> + * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> + * + * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support + * bitmapped intelligence in resync: + * + * - bitmap marked during normal i/o + * - bitmap used to skip nondirty blocks during sync + * + * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology: + * - persistent bitmap code + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/ratelimit.h> +#include "md.h" +#include "raid1.h" +#include "bitmap.h" + +/* + * Number of guaranteed r1bios in case of extreme VM load: + */ +#define NR_RAID1_BIOS 256 + +/* When there are this many requests queue to be written by + * the raid1 thread, we become 'congested' to provide back-pressure + * for writeback. + */ +static int max_queued_requests = 1024; + +static void allow_barrier(struct r1conf *conf); +static void lower_barrier(struct r1conf *conf); + +static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) +{ + struct pool_info *pi = data; + int size = offsetof(struct r1bio, bios[pi->raid_disks]); + + /* allocate a r1bio with room for raid_disks entries in the bios array */ + return kzalloc(size, gfp_flags); +} + +static void r1bio_pool_free(void *r1_bio, void *data) +{ + kfree(r1_bio); +} + +#define RESYNC_BLOCK_SIZE (64*1024) +//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +#define RESYNC_WINDOW (2048*1024) + +static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) +{ + struct pool_info *pi = data; + struct page *page; + struct r1bio *r1_bio; + struct bio *bio; + int i, j; + + r1_bio = r1bio_pool_alloc(gfp_flags, pi); + if (!r1_bio) + return NULL; + + /* + * Allocate bios : 1 for reading, n-1 for writing + */ + for (j = pi->raid_disks ; j-- ; ) { + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); + if (!bio) + goto out_free_bio; + r1_bio->bios[j] = bio; + } + /* + * Allocate RESYNC_PAGES data pages and attach them to + * the first bio. + * If this is a user-requested check/repair, allocate + * RESYNC_PAGES for each bio. 
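+ * (check/repair reads every device and compares the copies, so each + * bio needs its own pages; a plain resync can share one set of pages + * between the bios)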
+ */ + if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) + j = pi->raid_disks; + else + j = 1; + while(j--) { + bio = r1_bio->bios[j]; + for (i = 0; i < RESYNC_PAGES; i++) { + page = alloc_page(gfp_flags); + if (unlikely(!page)) + goto out_free_pages; + + bio->bi_io_vec[i].bv_page = page; + bio->bi_vcnt = i+1; + } + } + /* If not user-requests, copy the page pointers to all bios */ + if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { + for (i=0; i<RESYNC_PAGES ; i++) + for (j=1; j<pi->raid_disks; j++) + r1_bio->bios[j]->bi_io_vec[i].bv_page = + r1_bio->bios[0]->bi_io_vec[i].bv_page; + } + + r1_bio->master_bio = NULL; + + return r1_bio; + +out_free_pages: + for (j=0 ; j < pi->raid_disks; j++) + for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++) + put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); + j = -1; +out_free_bio: + while (++j < pi->raid_disks) + bio_put(r1_bio->bios[j]); + r1bio_pool_free(r1_bio, data); + return NULL; +} + +static void r1buf_pool_free(void *__r1_bio, void *data) +{ + struct pool_info *pi = data; + int i,j; + struct r1bio *r1bio = __r1_bio; + + for (i = 0; i < RESYNC_PAGES; i++) + for (j = pi->raid_disks; j-- ;) { + if (j == 0 || + r1bio->bios[j]->bi_io_vec[i].bv_page != + r1bio->bios[0]->bi_io_vec[i].bv_page) + safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); + } + for (i=0 ; i < pi->raid_disks; i++) + bio_put(r1bio->bios[i]); + + r1bio_pool_free(r1bio, data); +} + +static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) +{ + int i; + + for (i = 0; i < conf->raid_disks * 2; i++) { + struct bio **bio = r1_bio->bios + i; + if (!BIO_SPECIAL(*bio)) + bio_put(*bio); + *bio = NULL; + } +} + +static void free_r1bio(struct r1bio *r1_bio) +{ + struct r1conf *conf = r1_bio->mddev->private; + + put_all_bios(conf, r1_bio); + mempool_free(r1_bio, conf->r1bio_pool); +} + +static void put_buf(struct r1bio *r1_bio) +{ + struct r1conf *conf = r1_bio->mddev->private; + int i; + + for (i = 0; i < conf->raid_disks * 2; i++) { + struct bio *bio = r1_bio->bios[i]; + if (bio->bi_end_io) + rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); + } + + mempool_free(r1_bio, conf->r1buf_pool); + + lower_barrier(conf); +} + +static void reschedule_retry(struct r1bio *r1_bio) +{ + unsigned long flags; + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&r1_bio->retry_list, &conf->retry_list); + conf->nr_queued ++; + spin_unlock_irqrestore(&conf->device_lock, flags); + + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void call_bio_endio(struct r1bio *r1_bio) +{ + struct bio *bio = r1_bio->master_bio; + int done; + struct r1conf *conf = r1_bio->mddev->private; + + if (bio->bi_phys_segments) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + bio->bi_phys_segments--; + done = (bio->bi_phys_segments == 0); + spin_unlock_irqrestore(&conf->device_lock, flags); + } else + done = 1; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (done) { + bio_endio(bio, 0); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. 
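+ * allow_barrier() decrements nr_pending and wakes anyone waiting + * on the barrier.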
+ */ + allow_barrier(conf); + } +} + +static void raid_end_bio_io(struct r1bio *r1_bio) +{ + struct bio *bio = r1_bio->master_bio; + + /* if nobody has done the final endio yet, do it now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + pr_debug("raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_sector, + (unsigned long long) bio->bi_sector + + (bio->bi_size >> 9) - 1); + + call_bio_endio(r1_bio); + } + free_r1bio(r1_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. + */ +static inline void update_head_pos(int disk, struct r1bio *r1_bio) +{ + struct r1conf *conf = r1_bio->mddev->private; + + conf->mirrors[disk].head_position = + r1_bio->sector + (r1_bio->sectors); +} + +/* + * Find the disk number which triggered given bio + */ +static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) +{ + int mirror; + struct r1conf *conf = r1_bio->mddev->private; + int raid_disks = conf->raid_disks; + + for (mirror = 0; mirror < raid_disks * 2; mirror++) + if (r1_bio->bios[mirror] == bio) + break; + + BUG_ON(mirror == raid_disks * 2); + update_head_pos(mirror, r1_bio); + + return mirror; +} + +static void raid1_end_read_request(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r1bio *r1_bio = bio->bi_private; + int mirror; + struct r1conf *conf = r1_bio->mddev->private; + + mirror = r1_bio->read_disk; + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + update_head_pos(mirror, r1_bio); + + if (uptodate) + set_bit(R1BIO_Uptodate, &r1_bio->state); + else { + /* If all other devices have failed, we want to return + * the error upwards rather than fail the last device. 
+ * Here we redefine "uptodate" to mean "Don't want to retry" + */ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (r1_bio->mddev->degraded == conf->raid_disks || + (r1_bio->mddev->degraded == conf->raid_disks-1 && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + + if (uptodate) + raid_end_bio_io(r1_bio); + else { + /* + * oops, read error: + */ + char b[BDEVNAME_SIZE]; + printk_ratelimited( + KERN_ERR "md/raid1:%s: %s: " + "rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(conf->mirrors[mirror].rdev->bdev, + b), + (unsigned long long)r1_bio->sector); + set_bit(R1BIO_ReadError, &r1_bio->state); + reschedule_retry(r1_bio); + } + + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); +} + +static void close_write(struct r1bio *r1_bio) +{ + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = r1_bio->behind_page_count; + while (i--) + safe_put_page(r1_bio->behind_bvecs[i].bv_page); + kfree(r1_bio->behind_bvecs); + r1_bio->behind_bvecs = NULL; + } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(r1_bio->mddev); +} + +static void r1_bio_write_done(struct r1bio *r1_bio) +{ + if (!atomic_dec_and_test(&r1_bio->remaining)) + return; + + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + close_write(r1_bio); + if (test_bit(R1BIO_MadeGood, &r1_bio->state)) + reschedule_retry(r1_bio); + else + raid_end_bio_io(r1_bio); + } +} + +static void raid1_end_write_request(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r1bio *r1_bio = bio->bi_private; + int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + struct r1conf *conf = r1_bio->mddev->private; + struct bio *to_put = NULL; + + mirror = find_bio_disk(r1_bio, bio); + + /* + * 'one mirror IO has finished' event handler: + */ + if (!uptodate) { + set_bit(WriteErrorSeen, + &conf->mirrors[mirror].rdev->flags); + if (!test_and_set_bit(WantReplacement, + &conf->mirrors[mirror].rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + conf->mddev->recovery); + + set_bit(R1BIO_WriteError, &r1_bio->state); + } else { + /* + * Set R1BIO_Uptodate in our master bio, so that we + * will return a good error code for to the higher + * levels even if IO on some other mirrored buffer + * fails. + * + * The 'master' represents the composite IO operation + * to user-side. So if something waits for IO, then it + * will wait for the 'master' bio. + */ + sector_t first_bad; + int bad_sectors; + + r1_bio->bios[mirror] = NULL; + to_put = bio; + set_bit(R1BIO_Uptodate, &r1_bio->state); + + /* Maybe we can clear some bad blocks. */ + if (is_badblock(conf->mirrors[mirror].rdev, + r1_bio->sector, r1_bio->sectors, + &first_bad, &bad_sectors)) { + r1_bio->bios[mirror] = IO_MADE_GOOD; + set_bit(R1BIO_MadeGood, &r1_bio->state); + } + } + + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* + * In behind mode, we ACK the master bio once the I/O + * has safely reached all non-writemostly + * disks. 
Setting the Returned bit ensures that this + * gets done only once -- we don't ever want to return + * -EIO here, instead we'll wait + */ + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + pr_debug("raid1: behind end write sectors" + " %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + call_bio_endio(r1_bio); + } + } + } + if (r1_bio->bios[mirror] == NULL) + rdev_dec_pending(conf->mirrors[mirror].rdev, + conf->mddev); + + /* + * Let's see if all mirrored write operations have finished + * already. + */ + r1_bio_write_done(r1_bio); + + if (to_put) + bio_put(to_put); +} + + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. + * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. + * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ +static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) +{ + const sector_t this_sector = r1_bio->sector; + int sectors; + int best_good_sectors; + int start_disk; + int best_disk; + int i; + sector_t best_dist; + struct md_rdev *rdev; + int choose_first; + + rcu_read_lock(); + /* + * Check if we can balance. We can balance on the whole + * device if no resync is going on, or below the resync window. + * We take the first readable disk when above the resync window. + */ + retry: + sectors = r1_bio->sectors; + best_disk = -1; + best_dist = MaxSector; + best_good_sectors = 0; + + if (conf->mddev->recovery_cp < MaxSector && + (this_sector + sectors >= conf->next_resync)) { + choose_first = 1; + start_disk = 0; + } else { + choose_first = 0; + start_disk = conf->last_used; + } + + for (i = 0 ; i < conf->raid_disks * 2 ; i++) { + sector_t dist; + sector_t first_bad; + int bad_sectors; + + int disk = start_disk + i; + if (disk >= conf->raid_disks) + disk -= conf->raid_disks; + + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (r1_bio->bios[disk] == IO_BLOCKED + || rdev == NULL + || test_bit(Unmerged, &rdev->flags) + || test_bit(Faulty, &rdev->flags)) + continue; + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < this_sector + sectors) + continue; + if (test_bit(WriteMostly, &rdev->flags)) { + /* Don't balance among write-mostly, just + * use the first as a last resort */ + if (best_disk < 0) { + if (is_badblock(rdev, this_sector, sectors, + &first_bad, &bad_sectors)) { + if (first_bad < this_sector) + /* Cannot use this */ + continue; + best_good_sectors = first_bad - this_sector; + } else + best_good_sectors = sectors; + best_disk = disk; + } + continue; + } + /* This is a reasonable device to use. It might + * even be best. 
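+ * Check it for known bad blocks before falling back to the + * head-position heuristic.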
+ */ + if (is_badblock(rdev, this_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* already have a better device */ + continue; + if (first_bad <= this_sector) { + /* cannot read here. If this is the 'primary' + * device, then we must not read beyond + * bad_sectors from another device.. + */ + bad_sectors -= (this_sector - first_bad); + if (choose_first && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; + + } else { + sector_t good_sectors = first_bad - this_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_disk = disk; + } + if (choose_first) + break; + } + continue; + } else + best_good_sectors = sectors; + + dist = abs(this_sector - conf->mirrors[disk].head_position); + if (choose_first + /* Don't change to another disk for sequential reads */ + || conf->next_seq_sect == this_sector + || dist == 0 + /* If device is idle, use it */ + || atomic_read(&rdev->nr_pending) == 0) { + best_disk = disk; + break; + } + if (dist < best_dist) { + best_dist = dist; + best_disk = disk; + } + } + + if (best_disk >= 0) { + rdev = rcu_dereference(conf->mirrors[best_disk].rdev); + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (test_bit(Faulty, &rdev->flags)) { + /* cannot risk returning a device that failed + * before we inc'ed nr_pending + */ + rdev_dec_pending(rdev, conf->mddev); + goto retry; + } + sectors = best_good_sectors; + conf->next_seq_sect = this_sector + sectors; + conf->last_used = best_disk; + } + rcu_read_unlock(); + *max_sectors = sectors; + + return best_disk; +} + +static int raid1_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + struct r1conf *conf = mddev->private; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int max = biovec->bv_len; + + if (mddev->merge_check_needed) { + int disk; + rcu_read_lock(); + for (disk = 0; disk < conf->raid_disks * 2; disk++) { + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = sector + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + } + rcu_read_unlock(); + } + return max; + +} + +int md_raid1_congested(struct mddev *mddev, int bits) +{ + struct r1conf *conf = mddev->private; + int i, ret = 0; + + if ((bits & (1 << BDI_async_congested)) && + conf->pending_count >= max_queued_requests) + return 1; + + rcu_read_lock(); + for (i = 0; i < conf->raid_disks * 2; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = bdev_get_queue(rdev->bdev); + + BUG_ON(!q); + + /* Note the '|| 1' - when read_balance prefers + * non-congested targets, it can be removed + */ + if ((bits & (1<<BDI_async_congested)) || 1) + ret |= bdi_congested(&q->backing_dev_info, bits); + else + ret &= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(md_raid1_congested); + +static int raid1_congested(void *data, int bits) +{ + struct mddev *mddev = data; + + return mddev_congested(mddev, bits) || + md_raid1_congested(mddev, bits); +} + +static void flush_pending_writes(struct r1conf *conf) +{ + /* Any writes that have been queued but are 
awaiting
+ * bitmap updates get flushed here.
+ */
+ spin_lock_irq(&conf->device_lock);
+
+ if (conf->pending_bio_list.head) {
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ conf->pending_count = 0;
+ spin_unlock_irq(&conf->device_lock);
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ } else
+ spin_unlock_irq(&conf->device_lock);
+}
+
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down. This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'. When that returns there
+ * is no background IO happening. It must arrange to call
+ * allow_barrier when it has finished its IO.
+ * Background IO calls must call raise_barrier. Once that returns
+ * there is no normal IO happening. It must arrange to call
+ * lower_barrier when the particular background IO completes.
+ */
+#define RESYNC_DEPTH 32
+
+static void raise_barrier(struct r1conf *conf)
+{
+ spin_lock_irq(&conf->resync_lock);
+
+ /* Wait until no block IO is waiting */
+ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+ conf->resync_lock, );
+
+ /* block any new IO from starting */
+ conf->barrier++;
+
+ /* Now wait for all pending IO to complete */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+ conf->resync_lock, );
+
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(struct r1conf *conf)
+{
+ unsigned long flags;
+ BUG_ON(conf->barrier <= 0);
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->barrier--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(struct r1conf *conf)
+{
+ spin_lock_irq(&conf->resync_lock);
+ if (conf->barrier) {
+ conf->nr_waiting++;
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
+ conf->resync_lock,
+ );
+ conf->nr_waiting--;
+ }
+ conf->nr_pending++;
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void allow_barrier(struct r1conf *conf)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->nr_pending--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void freeze_array(struct r1conf *conf)
+{
+ /* stop sync IO and normal IO and wait for everything to
+ * go quiet. 
+ * We increment barrier and nr_waiting, and then + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. + */ + spin_lock_irq(&conf->resync_lock); + conf->barrier++; + conf->nr_waiting++; + wait_event_lock_irq(conf->wait_barrier, + conf->nr_pending == conf->nr_queued+1, + conf->resync_lock, + flush_pending_writes(conf)); + spin_unlock_irq(&conf->resync_lock); +} +static void unfreeze_array(struct r1conf *conf) +{ + /* reverse the effect of the freeze */ + spin_lock_irq(&conf->resync_lock); + conf->barrier--; + conf->nr_waiting--; + wake_up(&conf->wait_barrier); + spin_unlock_irq(&conf->resync_lock); +} + + +/* duplicate the data pages for behind I/O + */ +static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) +{ + int i; + struct bio_vec *bvec; + struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), + GFP_NOIO); + if (unlikely(!bvecs)) + return; + + bio_for_each_segment(bvec, bio, i) { + bvecs[i] = *bvec; + bvecs[i].bv_page = alloc_page(GFP_NOIO); + if (unlikely(!bvecs[i].bv_page)) + goto do_sync_io; + memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(bvecs[i].bv_page); + kunmap(bvec->bv_page); + } + r1_bio->behind_bvecs = bvecs; + r1_bio->behind_page_count = bio->bi_vcnt; + set_bit(R1BIO_BehindIO, &r1_bio->state); + return; + +do_sync_io: + for (i = 0; i < bio->bi_vcnt; i++) + if (bvecs[i].bv_page) + put_page(bvecs[i].bv_page); + kfree(bvecs); + pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); +} + +static void make_request(struct mddev *mddev, struct bio * bio) +{ + struct r1conf *conf = mddev->private; + struct mirror_info *mirror; + struct r1bio *r1_bio; + struct bio *read_bio; + int i, disks; + struct bitmap *bitmap; + unsigned long flags; + const int rw = bio_data_dir(bio); + const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); + struct md_rdev *blocked_rdev; + int plugged; + int first_clone; + int sectors_handled; + int max_sectors; + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + + md_write_start(mddev, bio); /* wait on superblock update early */ + + if (bio_data_dir(bio) == WRITE && + bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && + bio->bi_sector < mddev->suspend_hi) { + /* As the suspend_* range is controlled by + * userspace, we want an interruptible + * wait. + */ + DEFINE_WAIT(w); + for (;;) { + flush_signals(current); + prepare_to_wait(&conf->wait_barrier, + &w, TASK_INTERRUPTIBLE); + if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || + bio->bi_sector >= mddev->suspend_hi) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + + wait_barrier(conf); + + bitmap = mddev->bitmap; + + /* + * make_request() can abort the operation when READA is being + * used and no empty request is available. 
+ * + */ + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = bio->bi_size >> 9; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_sector; + + /* We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + + if (rw == READ) { + /* + * read balancing logic: + */ + int rdisk; + +read_again: + rdisk = read_balance(conf, r1_bio, &max_sectors); + + if (rdisk < 0) { + /* couldn't find anywhere to read from */ + raid_end_bio_io(r1_bio); + return; + } + mirror = conf->mirrors + rdisk; + + if (test_bit(WriteMostly, &mirror->rdev->flags) && + bitmap) { + /* Reading from a write-mostly device must + * take care not to over-take any writes + * that are 'behind' + */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + r1_bio->read_disk = rdisk; + + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, + max_sectors); + + r1_bio->bios[rdisk] = read_bio; + + read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; + read_bio->bi_bdev = mirror->rdev->bdev; + read_bio->bi_end_io = raid1_end_read_request; + read_bio->bi_rw = READ | do_sync; + read_bio->bi_private = r1_bio; + + if (max_sectors < r1_bio->sectors) { + /* could not read all from this device, so we will + * need another r1_bio. + */ + + sectors_handled = (r1_bio->sector + max_sectors + - bio->bi_sector); + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + /* Cannot call generic_make_request directly + * as that will be queued in __make_request + * and subsequent mempool_alloc might block waiting + * for it. So hand bio over to raid1d. + */ + reschedule_retry(r1_bio); + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_sector + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); + return; + } + + /* + * WRITE: + */ + if (conf->pending_count >= max_queued_requests) { + md_wakeup_thread(mddev->thread); + wait_event(conf->wait_barrier, + conf->pending_count < max_queued_requests); + } + /* first select target devices under rcu_lock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + * If there are known/acknowledged bad blocks on any device on + * which we have seen a write error, we want to avoid writing those + * blocks. + * This potentially requires several writes to write around + * the bad blocks. Each set of writes gets it's own r1bio + * with a set of bios attached. 
+ */ + plugged = mddev_check_plugged(mddev); + + disks = conf->raid_disks * 2; + retry_write: + blocked_rdev = NULL; + rcu_read_lock(); + max_sectors = r1_bio->sectors; + for (i = 0; i < disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + r1_bio->bios[i] = NULL; + if (!rdev || test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags)) { + if (i < conf->raid_disks) + set_bit(R1BIO_Degraded, &r1_bio->state); + continue; + } + + atomic_inc(&rdev->nr_pending); + if (test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, r1_bio->sector, + max_sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* mustn't write here until the bad block is + * acknowledged*/ + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + if (is_bad && first_bad <= r1_bio->sector) { + /* Cannot write here at all */ + bad_sectors -= (r1_bio->sector - first_bad); + if (bad_sectors < max_sectors) + /* mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; + rdev_dec_pending(rdev, mddev); + /* We don't set R1BIO_Degraded as that + * only applies if the disk is + * missing, so it might be re-added, + * and we want to know to recover this + * chunk. + * In this case the device is here, + * and the fact that this chunk is not + * in-sync is recorded in the bad + * block log + */ + continue; + } + if (is_bad) { + int good_sectors = first_bad - r1_bio->sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } + } + r1_bio->bios[i] = bio; + } + rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + /* Wait for this device to become unblocked */ + int j; + + for (j = 0; j < i; j++) + if (r1_bio->bios[j]) + rdev_dec_pending(conf->mirrors[j].rdev, mddev); + r1_bio->state = 0; + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + + if (max_sectors < r1_bio->sectors) { + /* We are splitting this write into multiple parts, so + * we need to prepare for allocating another r1_bio. + */ + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + } + sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; + + atomic_set(&r1_bio->remaining, 1); + atomic_set(&r1_bio->behind_remaining, 0); + + first_clone = 1; + for (i = 0; i < disks; i++) { + struct bio *mbio; + if (!r1_bio->bios[i]) + continue; + + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); + + if (first_clone) { + /* do behind I/O ? + * Not if there are too many, or cannot + * allocate memory, or a reader on WriteMostly + * is waiting for behind writes to flush */ + if (bitmap && + (atomic_read(&bitmap->behind_writes) + < mddev->bitmap_info.max_write_behind) && + !waitqueue_active(&bitmap->behind_wait)) + alloc_behind_pages(mbio, r1_bio); + + bitmap_startwrite(bitmap, r1_bio->sector, + r1_bio->sectors, + test_bit(R1BIO_BehindIO, + &r1_bio->state)); + first_clone = 0; + } + if (r1_bio->behind_bvecs) { + struct bio_vec *bvec; + int j; + + /* Yes, I really want the '__' version so that + * we clear any unused pointer in the io_vec, rather + * than leave them unchanged. 
This is important + * because when we come to free the pages, we won't + * know the original bi_idx, so we just free + * them all + */ + __bio_for_each_segment(bvec, mbio, j, 0) + bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; + if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + atomic_inc(&r1_bio->behind_remaining); + } + + r1_bio->bios[i] = mbio; + + mbio->bi_sector = (r1_bio->sector + + conf->mirrors[i].rdev->data_offset); + mbio->bi_bdev = conf->mirrors[i].rdev->bdev; + mbio->bi_end_io = raid1_end_write_request; + mbio->bi_rw = WRITE | do_flush_fua | do_sync; + mbio->bi_private = r1_bio; + + atomic_inc(&r1_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + /* Mustn't call r1_bio_write_done before this next test, + * as it could result in the bio being freed. + */ + if (sectors_handled < (bio->bi_size >> 9)) { + r1_bio_write_done(r1_bio); + /* We need another r1_bio. It has already been counted + * in bio->bi_phys_segments + */ + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + r1_bio->master_bio = bio; + r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_sector + sectors_handled; + goto retry_write; + } + + r1_bio_write_done(r1_bio); + + /* In case raid1d snuck in to freeze_array */ + wake_up(&conf->wait_barrier); + + if (do_sync || !bitmap || !plugged) + md_wakeup_thread(mddev->thread); +} + +static void status(struct seq_file *seq, struct mddev *mddev) +{ + struct r1conf *conf = mddev->private; + int i; + + seq_printf(seq, " [%d/%d] [", conf->raid_disks, + conf->raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + seq_printf(seq, "%s", + rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); + seq_printf(seq, "]"); +} + + +static void error(struct mddev *mddev, struct md_rdev *rdev) +{ + char b[BDEVNAME_SIZE]; + struct r1conf *conf = mddev->private; + + /* + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know. + * else mark the drive as failed + */ + if (test_bit(In_sync, &rdev->flags) + && (conf->raid_disks - mddev->degraded) == 1) { + /* + * Don't fail the drive, act as though we were just a + * normal single drive. + * However don't try a recovery from this drive as + * it is very likely to fail. + */ + conf->recovery_disabled = mddev->recovery_disabled; + return; + } + set_bit(Blocked, &rdev->flags); + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + set_bit(Faulty, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * if recovery is running, make sure it aborts. 
+ */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + } else + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); +} + +static void print_conf(struct r1conf *conf) +{ + int i; + + printk(KERN_DEBUG "RAID1 conf printout:\n"); + if (!conf) { + printk(KERN_DEBUG "(!conf)\n"); + return; + } + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + conf->raid_disks); + + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + char b[BDEVNAME_SIZE]; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev) + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); + } + rcu_read_unlock(); +} + +static void close_sync(struct r1conf *conf) +{ + wait_barrier(conf); + allow_barrier(conf); + + mempool_destroy(conf->r1buf_pool); + conf->r1buf_pool = NULL; +} + +static int raid1_spare_active(struct mddev *mddev) +{ + int i; + struct r1conf *conf = mddev->private; + int count = 0; + unsigned long flags; + + /* + * Find all failed disks within the RAID1 configuration + * and mark them readable. + * Called under mddev lock, so rcu protection not needed. + */ + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = conf->mirrors[i].rdev; + struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; + if (repl + && repl->recovery_offset == MaxSector + && !test_bit(Faulty, &repl->flags) + && !test_and_set_bit(In_sync, &repl->flags)) { + /* replacement has just become active */ + if (!rdev || + !test_and_clear_bit(In_sync, &rdev->flags)) + count++; + if (rdev) { + /* Replaced device not technically + * faulty, but we need to be sure + * it gets removed and never re-added + */ + set_bit(Faulty, &rdev->flags); + sysfs_notify_dirent_safe( + rdev->sysfs_state); + } + } + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_and_set_bit(In_sync, &rdev->flags)) { + count++; + sysfs_notify_dirent_safe(rdev->sysfs_state); + } + } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); + + print_conf(conf); + return count; +} + + +static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r1conf *conf = mddev->private; + int err = -EEXIST; + int mirror = 0; + struct mirror_info *p; + int first = 0; + int last = conf->raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); + + if (mddev->recovery_disabled == conf->recovery_disabled) + return -EBUSY; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } + + for (mirror = first; mirror <= last; mirror++) { + p = conf->mirrors+mirror; + if (!p->rdev) { + + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + p->head_position = 0; + rdev->raid_disk = mirror; + err = 0; + /* As all devices are equivalent, we don't need a full recovery + * if this was recently any drive of the array + */ + if (rdev->saved_raid_disk < 0) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; + } + if (test_bit(WantReplacement, &p->rdev->flags) && + p[conf->raid_disks].rdev == NULL) { + /* Add this device as a replacement */ + 
clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = mirror; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + break; + } + } + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. + */ + synchronize_sched(); + raise_barrier(conf); + lower_barrier(conf); + clear_bit(Unmerged, &rdev->flags); + } + md_integrity_add_rdev(rdev, mddev); + print_conf(conf); + return err; +} + +static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r1conf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct mirror_info *p = conf->mirrors+ number; + + if (rdev != p->rdev) + p = conf->mirrors + conf->raid_disks + number; + + print_conf(conf); + if (rdev == p->rdev) { + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + /* Only remove non-faulty devices if recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != conf->recovery_disabled && + mddev->degraded < conf->raid_disks) { + err = -EBUSY; + goto abort; + } + p->rdev = NULL; + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } else if (conf->mirrors[conf->raid_disks + number].rdev) { + /* We just removed a device that is being replaced. + * Move down the replacement. We drain all IO before + * doing this to avoid confusion. + */ + struct md_rdev *repl = + conf->mirrors[conf->raid_disks + number].rdev; + raise_barrier(conf); + clear_bit(Replacement, &repl->flags); + p->rdev = repl; + conf->mirrors[conf->raid_disks + number].rdev = NULL; + lower_barrier(conf); + clear_bit(WantReplacement, &rdev->flags); + } else + clear_bit(WantReplacement, &rdev->flags); + err = md_integrity_register(mddev); + } +abort: + + print_conf(conf); + return err; +} + + +static void end_sync_read(struct bio *bio, int error) +{ + struct r1bio *r1_bio = bio->bi_private; + + update_head_pos(r1_bio->read_disk, r1_bio); + + /* + * we have read a block, now it needs to be re-written, + * or re-read if the read failed. + * We don't do much here, just schedule handling by raid1d + */ + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_bit(R1BIO_Uptodate, &r1_bio->state); + + if (atomic_dec_and_test(&r1_bio->remaining)) + reschedule_retry(r1_bio); +} + +static void end_sync_write(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r1bio *r1_bio = bio->bi_private; + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + int mirror=0; + sector_t first_bad; + int bad_sectors; + + mirror = find_bio_disk(r1_bio, bio); + + if (!uptodate) { + sector_t sync_blocks = 0; + sector_t s = r1_bio->sector; + long sectors_to_go = r1_bio->sectors; + /* make sure these bits doesn't get cleared. 
*/ + do { + bitmap_end_sync(mddev->bitmap, s, + &sync_blocks, 1); + s += sync_blocks; + sectors_to_go -= sync_blocks; + } while (sectors_to_go > 0); + set_bit(WriteErrorSeen, + &conf->mirrors[mirror].rdev->flags); + if (!test_and_set_bit(WantReplacement, + &conf->mirrors[mirror].rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + mddev->recovery); + set_bit(R1BIO_WriteError, &r1_bio->state); + } else if (is_badblock(conf->mirrors[mirror].rdev, + r1_bio->sector, + r1_bio->sectors, + &first_bad, &bad_sectors) && + !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, + r1_bio->sector, + r1_bio->sectors, + &first_bad, &bad_sectors) + ) + set_bit(R1BIO_MadeGood, &r1_bio->state); + + if (atomic_dec_and_test(&r1_bio->remaining)) { + int s = r1_bio->sectors; + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); + } + } +} + +static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, + int sectors, struct page *page, int rw) +{ + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == WRITE) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + rdev->mddev->recovery); + } + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} + +static int fix_sync_read_error(struct r1bio *r1_bio) +{ + /* Try some synchronous reads of other devices to get + * good data, much like with normal read errors. Only + * read into the pages we already have so we don't + * need to re-issue the read request. + * We don't need to freeze the array, because being in an + * active sync request, there is no normal IO, and + * no overlapping syncs. + * We don't need to check is_badblock() again as we + * made sure that anything with a bad block in range + * will have bi_end_io clear. + */ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + struct bio *bio = r1_bio->bios[r1_bio->read_disk]; + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int idx = 0; + + while(sectors) { + int s = sectors; + int d = r1_bio->read_disk; + int success = 0; + struct md_rdev *rdev; + int start; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + do { + if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, sect, s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false)) { + success = 1; + break; + } + } + d++; + if (d == conf->raid_disks * 2) + d = 0; + } while (!success && d != r1_bio->read_disk); + + if (!success) { + char b[BDEVNAME_SIZE]; + int abort = 0; + /* Cannot read from anywhere, this block is lost. + * Record a bad block on each device. If that doesn't + * work just disable and interrupt the recovery. + * Don't fail devices as that won't really help. 
+ */ + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" + " for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev, b), + (unsigned long long)r1_bio->sector); + for (d = 0; d < conf->raid_disks * 2; d++) { + rdev = conf->mirrors[d].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + abort = 1; + } + if (abort) { + conf->recovery_disabled = + mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_done_sync(mddev, r1_bio->sectors, 0); + put_buf(r1_bio); + return 0; + } + /* Try next page */ + sectors -= s; + sect += s; + idx++; + continue; + } + + start = d; + /* write it back and re-read */ + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks * 2; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (r1_sync_page_io(rdev, sect, s, + bio->bi_io_vec[idx].bv_page, + WRITE) == 0) { + r1_bio->bios[d]->bi_end_io = NULL; + rdev_dec_pending(rdev, mddev); + } + } + d = start; + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks * 2; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (r1_sync_page_io(rdev, sect, s, + bio->bi_io_vec[idx].bv_page, + READ) != 0) + atomic_add(s, &rdev->corrected_errors); + } + sectors -= s; + sect += s; + idx ++; + } + set_bit(R1BIO_Uptodate, &r1_bio->state); + set_bit(BIO_UPTODATE, &bio->bi_flags); + return 1; +} + +static int process_checks(struct r1bio *r1_bio) +{ + /* We have read all readable devices. If we haven't + * got the block, then there is no hope left. + * If we have, then we want to do a comparison + * and skip the write if everything is the same. + * If any blocks failed to read, then we need to + * attempt an over-write + */ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + int primary; + int i; + int vcnt; + + for (primary = 0; primary < conf->raid_disks * 2; primary++) + if (r1_bio->bios[primary]->bi_end_io == end_sync_read && + test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { + r1_bio->bios[primary]->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[primary].rdev, mddev); + break; + } + r1_bio->read_disk = primary; + vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); + for (i = 0; i < conf->raid_disks * 2; i++) { + int j; + struct bio *pbio = r1_bio->bios[primary]; + struct bio *sbio = r1_bio->bios[i]; + int size; + + if (r1_bio->bios[i]->bi_end_io != end_sync_read) + continue; + + if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { + for (j = vcnt; j-- ; ) { + struct page *p, *s; + p = pbio->bi_io_vec[j].bv_page; + s = sbio->bi_io_vec[j].bv_page; + if (memcmp(page_address(p), + page_address(s), + sbio->bi_io_vec[j].bv_len)) + break; + } + } else + j = 0; + if (j >= 0) + mddev->resync_mismatches += r1_bio->sectors; + if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) + && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { + /* No need to write to this device. 
*/ + sbio->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[i].rdev, mddev); + continue; + } + /* fixup the bio for reuse */ + sbio->bi_vcnt = vcnt; + sbio->bi_size = r1_bio->sectors << 9; + sbio->bi_idx = 0; + sbio->bi_phys_segments = 0; + sbio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bi_next = NULL; + sbio->bi_sector = r1_bio->sector + + conf->mirrors[i].rdev->data_offset; + sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), + page_address(pbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + } + } + return 0; +} + +static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) +{ + struct r1conf *conf = mddev->private; + int i; + int disks = conf->raid_disks * 2; + struct bio *bio, *wbio; + + bio = r1_bio->bios[r1_bio->read_disk]; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + /* ouch - failed to read all of that. */ + if (!fix_sync_read_error(r1_bio)) + return; + + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (process_checks(r1_bio) < 0) + return; + /* + * schedule writes + */ + atomic_set(&r1_bio->remaining, 1); + for (i = 0; i < disks ; i++) { + wbio = r1_bio->bios[i]; + if (wbio->bi_end_io == NULL || + (wbio->bi_end_io == end_sync_read && + (i == r1_bio->read_disk || + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) + continue; + + wbio->bi_rw = WRITE; + wbio->bi_end_io = end_sync_write; + atomic_inc(&r1_bio->remaining); + md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); + + generic_make_request(wbio); + } + + if (atomic_dec_and_test(&r1_bio->remaining)) { + /* if we're here, all write(s) have completed, so clean up */ + md_done_sync(mddev, r1_bio->sectors, 1); + put_buf(r1_bio); + } +} + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array synchronising. + */ + +static void fix_read_error(struct r1conf *conf, int read_disk, + sector_t sect, int sectors) +{ + struct mddev *mddev = conf->mddev; + while(sectors) { + int s = sectors; + int d = read_disk; + int success = 0; + int start; + struct md_rdev *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... 
+ */ + sector_t first_bad; + int bad_sectors; + + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + is_badblock(rdev, sect, s, + &first_bad, &bad_sectors) == 0 && + sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) + success = 1; + else { + d++; + if (d == conf->raid_disks * 2) + d = 0; + } + } while (!success && d != read_disk); + + if (!success) { + /* Cannot read from anywhere - mark it bad */ + struct md_rdev *rdev = conf->mirrors[read_disk].rdev; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + md_error(mddev, rdev); + break; + } + /* write it back and re-read */ + start = d; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks * 2; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) + r1_sync_page_io(rdev, sect, s, + conf->tmppage, WRITE); + } + d = start; + while (d != read_disk) { + char b[BDEVNAME_SIZE]; + if (d==0) + d = conf->raid_disks * 2; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (r1_sync_page_io(rdev, sect, s, + conf->tmppage, READ)) { + atomic_add(s, &rdev->corrected_errors); + printk(KERN_INFO + "md/raid1:%s: read error corrected " + "(%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect + + rdev->data_offset), + bdevname(rdev->bdev, b)); + } + } + } + sectors -= s; + sect += s; + } +} + +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion *)bio->bi_private); +} + +static int submit_bio_wait(int rw, struct bio *bio) +{ + struct completion event; + rw |= REQ_SYNC; + + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio->bi_flags); +} + +static int narrow_write_error(struct r1bio *r1_bio, int i) +{ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + struct md_rdev *rdev = conf->mirrors[i].rdev; + int vcnt, idx; + struct bio_vec *vec; + + /* bio has the data to be written to device 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this somehow. + * + * We currently own a reference on the rdev. 
+ */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r1_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = 1 << rdev->badblocks.shift; + sector = r1_bio->sector; + sectors = ((sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + vcnt = r1_bio->behind_page_count; + vec = r1_bio->behind_bvecs; + idx = 0; + while (vec[idx].bv_page == NULL) + idx++; + } else { + vcnt = r1_bio->master_bio->bi_vcnt; + vec = r1_bio->master_bio->bi_io_vec; + idx = r1_bio->master_bio->bi_idx; + } + while (sect_to_write) { + struct bio *wbio; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors'*/ + + wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); + memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); + wbio->bi_sector = r1_bio->sector; + wbio->bi_rw = WRITE; + wbio->bi_vcnt = vcnt; + wbio->bi_size = r1_bio->sectors << 9; + wbio->bi_idx = idx; + + md_trim_bio(wbio, sector - r1_bio->sector, sectors); + wbio->bi_sector += rdev->data_offset; + wbio->bi_bdev = rdev->bdev; + if (submit_bio_wait(WRITE, wbio) == 0) + /* failure! */ + ok = rdev_set_badblocks(rdev, sector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + +static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) +{ + int m; + int s = r1_bio->sectors; + for (m = 0; m < conf->raid_disks * 2 ; m++) { + struct md_rdev *rdev = conf->mirrors[m].rdev; + struct bio *bio = r1_bio->bios[m]; + if (bio->bi_end_io == NULL) + continue; + if (test_bit(BIO_UPTODATE, &bio->bi_flags) && + test_bit(R1BIO_MadeGood, &r1_bio->state)) { + rdev_clear_badblocks(rdev, r1_bio->sector, s); + } + if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && + test_bit(R1BIO_WriteError, &r1_bio->state)) { + if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) + md_error(conf->mddev, rdev); + } + } + put_buf(r1_bio); + md_done_sync(conf->mddev, s, 1); +} + +static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) +{ + int m; + for (m = 0; m < conf->raid_disks * 2 ; m++) + if (r1_bio->bios[m] == IO_MADE_GOOD) { + struct md_rdev *rdev = conf->mirrors[m].rdev; + rdev_clear_badblocks(rdev, + r1_bio->sector, + r1_bio->sectors); + rdev_dec_pending(rdev, conf->mddev); + } else if (r1_bio->bios[m] != NULL) { + /* This drive got a write error. We need to + * narrow down and record precise write + * errors. + */ + if (!narrow_write_error(r1_bio, m)) { + md_error(conf->mddev, + conf->mirrors[m].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } + rdev_dec_pending(conf->mirrors[m].rdev, + conf->mddev); + } + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); +} + +static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) +{ + int disk; + int max_sectors; + struct mddev *mddev = conf->mddev; + struct bio *bio; + char b[BDEVNAME_SIZE]; + struct md_rdev *rdev; + + clear_bit(R1BIO_ReadError, &r1_bio->state); + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. 
+ * This is all done synchronously while the array is + * frozen + */ + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, r1_bio->sectors); + unfreeze_array(conf); + } else + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + + bio = r1_bio->bios[r1_bio->read_disk]; + bdevname(bio->bi_bdev, b); +read_more: + disk = read_balance(conf, r1_bio, &max_sectors); + if (disk == -1) { + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), b, (unsigned long long)r1_bio->sector); + raid_end_bio_io(r1_bio); + } else { + const unsigned long do_sync + = r1_bio->master_bio->bi_rw & REQ_SYNC; + if (bio) { + r1_bio->bios[r1_bio->read_disk] = + mddev->ro ? IO_BLOCKED : NULL; + bio_put(bio); + } + r1_bio->read_disk = disk; + bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); + r1_bio->bios[r1_bio->read_disk] = bio; + rdev = conf->mirrors[disk].rdev; + printk_ratelimited(KERN_ERR + "md/raid1:%s: redirecting sector %llu" + " to other mirror: %s\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev, b)); + bio->bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_end_io = raid1_end_read_request; + bio->bi_rw = READ | do_sync; + bio->bi_private = r1_bio; + if (max_sectors < r1_bio->sectors) { + /* Drat - have to split this up more */ + struct bio *mbio = r1_bio->master_bio; + int sectors_handled = (r1_bio->sector + max_sectors + - mbio->bi_sector); + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (mbio->bi_phys_segments == 0) + mbio->bi_phys_segments = 2; + else + mbio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + generic_make_request(bio); + bio = NULL; + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = mbio; + r1_bio->sectors = (mbio->bi_size >> 9) + - sectors_handled; + r1_bio->state = 0; + set_bit(R1BIO_ReadError, &r1_bio->state); + r1_bio->mddev = mddev; + r1_bio->sector = mbio->bi_sector + sectors_handled; + + goto read_more; + } else + generic_make_request(bio); + } +} + +static void raid1d(struct mddev *mddev) +{ + struct r1bio *r1_bio; + unsigned long flags; + struct r1conf *conf = mddev->private; + struct list_head *head = &conf->retry_list; + struct blk_plug plug; + + md_check_recovery(mddev); + + blk_start_plug(&plug); + for (;;) { + + if (atomic_read(&mddev->plug_cnt) == 0) + flush_pending_writes(conf); + + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); + break; + } + r1_bio = list_entry(head->prev, struct r1bio, retry_list); + list_del(head->prev); + conf->nr_queued--; + spin_unlock_irqrestore(&conf->device_lock, flags); + + mddev = r1_bio->mddev; + conf = mddev->private; + if (test_bit(R1BIO_IsSync, &r1_bio->state)) { + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + handle_sync_write_finished(conf, r1_bio); + else + sync_request_write(mddev, r1_bio); + } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + handle_write_finished(conf, r1_bio); + else if (test_bit(R1BIO_ReadError, &r1_bio->state)) + handle_read_error(conf, r1_bio); + else + /* just a partial read to be scheduled from separate + * context + */ + generic_make_request(r1_bio->bios[r1_bio->read_disk]); + + cond_resched(); + if (mddev->flags & 
~(1<<MD_CHANGE_PENDING)) + md_check_recovery(mddev); + } + blk_finish_plug(&plug); +} + + +static int init_resync(struct r1conf *conf) +{ + int buffs; + + buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; + BUG_ON(conf->r1buf_pool); + conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, + conf->poolinfo); + if (!conf->r1buf_pool) + return -ENOMEM; + conf->next_resync = 0; + return 0; +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * + * This is achieved by tracking pending requests and a 'barrier' concept + * that can be installed to exclude normal IO requests. + */ + +static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + struct bio *bio; + sector_t max_sector, nr_sectors; + int disk = -1; + int i; + int wonly = -1; + int write_targets = 0, read_targets = 0; + sector_t sync_blocks; + int still_degraded = 0; + int good_sectors = RESYNC_SECTORS; + int min_bad = 0; /* number of sectors that are bad in all devices */ + + if (!conf->r1buf_pool) + if (init_resync(conf)) + return 0; + + max_sector = mddev->dev_sectors; + if (sector_nr >= max_sector) { + /* If we aborted, we need to abort the + * sync on the 'current' bitmap chunk (there will + * only be one in raid1 resync. + * We can find the current addess in mddev->curr_resync + */ + if (mddev->curr_resync < max_sector) /* aborted */ + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* completed sync */ + conf->fullsync = 0; + + bitmap_close_sync(mddev->bitmap); + close_sync(conf); + return 0; + } + + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + return max_sector - sector_nr; + } + /* before building a request, check if we can skip these blocks.. + * This call the bitmap_start_sync doesn't actually record anything + */ + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* We can skip this block, and probably several more */ + *skipped = 1; + return sync_blocks; + } + /* + * If there is non-resync activity waiting for a turn, + * and resync is going fast enough, + * then let it though before starting on this new sync request. + */ + if (!go_faster && conf->nr_waiting) + msleep_interruptible(1000); + + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + raise_barrier(conf); + + conf->next_resync = sector_nr; + + rcu_read_lock(); + /* + * If we get a correctably read error during resync or recovery, + * we might want to read from a different device. So we + * flag all drives that could conceivably be read from for READ, + * and any others (which will be non-In_sync devices) for WRITE. + * If a read fails, we try reading from something else for which READ + * is OK. 
+ */ + + r1_bio->mddev = mddev; + r1_bio->sector = sector_nr; + r1_bio->state = 0; + set_bit(R1BIO_IsSync, &r1_bio->state); + + for (i = 0; i < conf->raid_disks * 2; i++) { + struct md_rdev *rdev; + bio = r1_bio->bios[i]; + + /* take from bio_init */ + bio->bi_next = NULL; + bio->bi_flags &= ~(BIO_POOL_MASK-1); + bio->bi_flags |= 1 << BIO_UPTODATE; + bio->bi_rw = READ; + bio->bi_vcnt = 0; + bio->bi_idx = 0; + bio->bi_phys_segments = 0; + bio->bi_size = 0; + bio->bi_end_io = NULL; + bio->bi_private = NULL; + + rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev == NULL || + test_bit(Faulty, &rdev->flags)) { + if (i < conf->raid_disks) + still_degraded = 1; + } else if (!test_bit(In_sync, &rdev->flags)) { + bio->bi_rw = WRITE; + bio->bi_end_io = end_sync_write; + write_targets ++; + } else { + /* may need to read from here */ + sector_t first_bad = MaxSector; + int bad_sectors; + + if (is_badblock(rdev, sector_nr, good_sectors, + &first_bad, &bad_sectors)) { + if (first_bad > sector_nr) + good_sectors = first_bad - sector_nr; + else { + bad_sectors -= (sector_nr - first_bad); + if (min_bad == 0 || + min_bad > bad_sectors) + min_bad = bad_sectors; + } + } + if (sector_nr < first_bad) { + if (test_bit(WriteMostly, &rdev->flags)) { + if (wonly < 0) + wonly = i; + } else { + if (disk < 0) + disk = i; + } + bio->bi_rw = READ; + bio->bi_end_io = end_sync_read; + read_targets++; + } + } + if (bio->bi_end_io) { + atomic_inc(&rdev->nr_pending); + bio->bi_sector = sector_nr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_private = r1_bio; + } + } + rcu_read_unlock(); + if (disk < 0) + disk = wonly; + r1_bio->read_disk = disk; + + if (read_targets == 0 && min_bad > 0) { + /* These sectors are bad on all InSync devices, so we + * need to mark them bad on all write targets + */ + int ok = 1; + for (i = 0 ; i < conf->raid_disks * 2 ; i++) + if (r1_bio->bios[i]->bi_end_io == end_sync_write) { + struct md_rdev *rdev = conf->mirrors[i].rdev; + ok = rdev_set_badblocks(rdev, sector_nr, + min_bad, 0 + ) && ok; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + *skipped = 1; + put_buf(r1_bio); + + if (!ok) { + /* Cannot record the badblocks, so need to + * abort the resync. + * If there are multiple read targets, could just + * fail the really bad ones ??? 
+ */ + conf->recovery_disabled = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return 0; + } else + return min_bad; + + } + if (min_bad > 0 && min_bad < good_sectors) { + /* only resync enough to reach the next bad->good + * transition */ + good_sectors = min_bad; + } + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) + /* extra read targets are also write targets */ + write_targets += read_targets-1; + + if (write_targets == 0 || read_targets == 0) { + /* There is nowhere to write, so all non-sync + * drives must be failed - so we are finished + */ + sector_t rv = max_sector - sector_nr; + *skipped = 1; + put_buf(r1_bio); + return rv; + } + + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ + if (max_sector > sector_nr + good_sectors) + max_sector = sector_nr + good_sectors; + nr_sectors = 0; + sync_blocks = 0; + do { + struct page *page; + int len = PAGE_SIZE; + if (sector_nr + (len>>9) > max_sector) + len = (max_sector - sector_nr) << 9; + if (len == 0) + break; + if (sync_blocks == 0) { + if (!bitmap_start_sync(mddev->bitmap, sector_nr, + &sync_blocks, still_degraded) && + !conf->fullsync && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + break; + BUG_ON(sync_blocks < (PAGE_SIZE>>9)); + if ((len >> 9) > sync_blocks) + len = sync_blocks<<9; + } + + for (i = 0 ; i < conf->raid_disks * 2; i++) { + bio = r1_bio->bios[i]; + if (bio->bi_end_io) { + page = bio->bi_io_vec[bio->bi_vcnt].bv_page; + if (bio_add_page(bio, page, len, 0) == 0) { + /* stop here */ + bio->bi_io_vec[bio->bi_vcnt].bv_page = page; + while (i > 0) { + i--; + bio = r1_bio->bios[i]; + if (bio->bi_end_io==NULL) + continue; + /* remove last page from this bio */ + bio->bi_vcnt--; + bio->bi_size -= len; + bio->bi_flags &= ~(1<< BIO_SEG_VALID); + } + goto bio_full; + } + } + } + nr_sectors += len>>9; + sector_nr += len>>9; + sync_blocks -= (len>>9); + } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); + bio_full: + r1_bio->sectors = nr_sectors; + + /* For a user-requested sync, we read all readable devices and do a + * compare + */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + atomic_set(&r1_bio->remaining, read_targets); + for (i = 0; i < conf->raid_disks * 2; i++) { + bio = r1_bio->bios[i]; + if (bio->bi_end_io == end_sync_read) { + md_sync_acct(bio->bi_bdev, nr_sectors); + generic_make_request(bio); + } + } + } else { + atomic_set(&r1_bio->remaining, 1); + bio = r1_bio->bios[r1_bio->read_disk]; + md_sync_acct(bio->bi_bdev, nr_sectors); + generic_make_request(bio); + + } + return nr_sectors; +} + +static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + if (sectors) + return sectors; + + return mddev->dev_sectors; +} + +static struct r1conf *setup_conf(struct mddev *mddev) +{ + struct r1conf *conf; + int i; + struct mirror_info *disk; + struct md_rdev *rdev; + int err = -ENOMEM; + + conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); + if (!conf) + goto abort; + + conf->mirrors = kzalloc(sizeof(struct mirror_info) + * mddev->raid_disks * 2, + GFP_KERNEL); + if (!conf->mirrors) + goto abort; + + conf->tmppage = alloc_page(GFP_KERNEL); + if (!conf->tmppage) + goto abort; + + conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); + if (!conf->poolinfo) + goto abort; + conf->poolinfo->raid_disks = mddev->raid_disks * 2; + conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, + r1bio_pool_free, + conf->poolinfo); + if (!conf->r1bio_pool) + goto abort; + + 
conf->poolinfo->mddev = mddev; + + err = -EINVAL; + spin_lock_init(&conf->device_lock); + rdev_for_each(rdev, mddev) { + struct request_queue *q; + int disk_idx = rdev->raid_disk; + if (disk_idx >= mddev->raid_disks + || disk_idx < 0) + continue; + if (test_bit(Replacement, &rdev->flags)) + disk = conf->mirrors + conf->raid_disks + disk_idx; + else + disk = conf->mirrors + disk_idx; + + if (disk->rdev) + goto abort; + disk->rdev = rdev; + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; + + disk->head_position = 0; + } + conf->raid_disks = mddev->raid_disks; + conf->mddev = mddev; + INIT_LIST_HEAD(&conf->retry_list); + + spin_lock_init(&conf->resync_lock); + init_waitqueue_head(&conf->wait_barrier); + + bio_list_init(&conf->pending_bio_list); + conf->pending_count = 0; + conf->recovery_disabled = mddev->recovery_disabled - 1; + + err = -EIO; + conf->last_used = -1; + for (i = 0; i < conf->raid_disks * 2; i++) { + + disk = conf->mirrors + i; + + if (i < conf->raid_disks && + disk[conf->raid_disks].rdev) { + /* This slot has a replacement. */ + if (!disk->rdev) { + /* No original, just make the replacement + * a recovering spare + */ + disk->rdev = + disk[conf->raid_disks].rdev; + disk[conf->raid_disks].rdev = NULL; + } else if (!test_bit(In_sync, &disk->rdev->flags)) + /* Original is not in_sync - bad */ + goto abort; + } + + if (!disk->rdev || + !test_bit(In_sync, &disk->rdev->flags)) { + disk->head_position = 0; + if (disk->rdev) + conf->fullsync = 1; + } else if (conf->last_used < 0) + /* + * The first working device is used as a + * starting point to read balancing. + */ + conf->last_used = i; + } + + if (conf->last_used < 0) { + printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", + mdname(mddev)); + goto abort; + } + err = -ENOMEM; + conf->thread = md_register_thread(raid1d, mddev, NULL); + if (!conf->thread) { + printk(KERN_ERR + "md/raid1:%s: couldn't allocate thread\n", + mdname(mddev)); + goto abort; + } + + return conf; + + abort: + if (conf) { + if (conf->r1bio_pool) + mempool_destroy(conf->r1bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf->poolinfo); + kfree(conf); + } + return ERR_PTR(err); +} + +static int stop(struct mddev *mddev); +static int run(struct mddev *mddev) +{ + struct r1conf *conf; + int i; + struct md_rdev *rdev; + int ret; + + if (mddev->level != 1) { + printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", + mdname(mddev), mddev->level); + return -EIO; + } + if (mddev->reshape_position != MaxSector) { + printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", + mdname(mddev)); + return -EIO; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. 
[whatever we allocate in run(), + * should be freed in stop()] + */ + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + + if (IS_ERR(conf)) + return PTR_ERR(conf); + + rdev_for_each(rdev, mddev) { + if (!mddev->gendisk) + continue; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } + + mddev->degraded = 0; + for (i=0; i < conf->raid_disks; i++) + if (conf->mirrors[i].rdev == NULL || + !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || + test_bit(Faulty, &conf->mirrors[i].rdev->flags)) + mddev->degraded++; + + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; + + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "md/raid1:%s: not clean" + " -- starting background reconstruction\n", + mdname(mddev)); + printk(KERN_INFO + "md/raid1:%s: active with %d out of %d mirrors\n", + mdname(mddev), mddev->raid_disks - mddev->degraded, + mddev->raid_disks); + + /* + * Ok, everything is just fine now + */ + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + + md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); + + if (mddev->queue) { + mddev->queue->backing_dev_info.congested_fn = raid1_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); + } + + ret = md_integrity_register(mddev); + if (ret) + stop(mddev); + return ret; +} + +static int stop(struct mddev *mddev) +{ + struct r1conf *conf = mddev->private; + struct bitmap *bitmap = mddev->bitmap; + + /* wait for behind writes to complete */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); + /* need to kick something here to make sure I/O goes? */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + + raise_barrier(conf); + lower_barrier(conf); + + md_unregister_thread(&mddev->thread); + if (conf->r1bio_pool) + mempool_destroy(conf->r1bio_pool); + kfree(conf->mirrors); + kfree(conf->poolinfo); + kfree(conf); + mddev->private = NULL; + return 0; +} + +static int raid1_resize(struct mddev *mddev, sector_t sectors) +{ + /* no resync is happening, and there is enough space + * on all devices, so we can resize. + * We need to make sure resync covers any new space. + * If the array is shrinking we should possibly wait until + * any io in the removed space completes, but it hardly seems + * worth it. + */ + md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); + if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) + return -EINVAL; + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > mddev->dev_sectors) { + mddev->recovery_cp = mddev->dev_sectors; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->dev_sectors = sectors; + mddev->resync_max_sectors = sectors; + return 0; +} + +static int raid1_reshape(struct mddev *mddev) +{ + /* We need to: + * 1/ resize the r1bio_pool + * 2/ resize conf->mirrors + * + * We allocate a new r1bio_pool if we can. + * Then raise a device barrier and wait until all IO stops. + * Then resize conf->mirrors and swap in the new r1bio pool. + * + * At the same time, we "pack" the devices so that all the missing + * devices have the higher raid_disk numbers. 
+ */ + mempool_t *newpool, *oldpool; + struct pool_info *newpoolinfo; + struct mirror_info *newmirrors; + struct r1conf *conf = mddev->private; + int cnt, raid_disks; + unsigned long flags; + int d, d2, err; + + /* Cannot change chunk_size, layout, or level */ + if (mddev->chunk_sectors != mddev->new_chunk_sectors || + mddev->layout != mddev->new_layout || + mddev->level != mddev->new_level) { + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + return -EINVAL; + } + + err = md_allow_write(mddev); + if (err) + return err; + + raid_disks = mddev->raid_disks + mddev->delta_disks; + + if (raid_disks < conf->raid_disks) { + cnt=0; + for (d= 0; d < conf->raid_disks; d++) + if (conf->mirrors[d].rdev) + cnt++; + if (cnt > raid_disks) + return -EBUSY; + } + + newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); + if (!newpoolinfo) + return -ENOMEM; + newpoolinfo->mddev = mddev; + newpoolinfo->raid_disks = raid_disks * 2; + + newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, + r1bio_pool_free, newpoolinfo); + if (!newpool) { + kfree(newpoolinfo); + return -ENOMEM; + } + newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, + GFP_KERNEL); + if (!newmirrors) { + kfree(newpoolinfo); + mempool_destroy(newpool); + return -ENOMEM; + } + + raise_barrier(conf); + + /* ok, everything is stopped */ + oldpool = conf->r1bio_pool; + conf->r1bio_pool = newpool; + + for (d = d2 = 0; d < conf->raid_disks; d++) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + if (rdev && rdev->raid_disk != d2) { + sysfs_unlink_rdev(mddev, rdev); + rdev->raid_disk = d2; + sysfs_unlink_rdev(mddev, rdev); + if (sysfs_link_rdev(mddev, rdev)) + printk(KERN_WARNING + "md/raid1:%s: cannot register rd%d\n", + mdname(mddev), rdev->raid_disk); + } + if (rdev) + newmirrors[d2++].rdev = rdev; + } + kfree(conf->mirrors); + conf->mirrors = newmirrors; + kfree(conf->poolinfo); + conf->poolinfo = newpoolinfo; + + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); + conf->raid_disks = mddev->raid_disks = raid_disks; + mddev->delta_disks = 0; + + conf->last_used = 0; /* just make sure it is in-range */ + lower_barrier(conf); + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + + mempool_destroy(oldpool); + return 0; +} + +static void raid1_quiesce(struct mddev *mddev, int state) +{ + struct r1conf *conf = mddev->private; + + switch(state) { + case 2: /* wake for suspend */ + wake_up(&conf->wait_barrier); + break; + case 1: + raise_barrier(conf); + break; + case 0: + lower_barrier(conf); + break; + } +} + +static void *raid1_takeover(struct mddev *mddev) +{ + /* raid1 can take over: + * raid5 with 2 devices, any layout or chunk size + */ + if (mddev->level == 5 && mddev->raid_disks == 2) { + struct r1conf *conf; + mddev->new_level = 1; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + conf = setup_conf(mddev); + if (!IS_ERR(conf)) + conf->barrier = 1; + return conf; + } + return ERR_PTR(-EINVAL); +} + +static struct md_personality raid1_personality = +{ + .name = "raid1", + .level = 1, + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid1_add_disk, + .hot_remove_disk= raid1_remove_disk, + .spare_active = raid1_spare_active, + .sync_request = sync_request, + .resize = raid1_resize, + .size = raid1_size, + 
.check_reshape = raid1_reshape, + .quiesce = raid1_quiesce, + .takeover = raid1_takeover, +}; + +static int __init raid_init(void) +{ + return register_md_personality(&raid1_personality); +} + +static void raid_exit(void) +{ + unregister_md_personality(&raid1_personality); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); +MODULE_ALIAS("md-personality-3"); /* RAID1 */ +MODULE_ALIAS("md-raid1"); +MODULE_ALIAS("md-level-1"); + +module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 00000000..80ded139 --- /dev/null +++ b/drivers/md/raid1.h @@ -0,0 +1,177 @@ +#ifndef _RAID1_H +#define _RAID1_H + +struct mirror_info { + struct md_rdev *rdev; + sector_t head_position; +}; + +/* + * memory pools need a pointer to the mddev, so they can force an unplug + * when memory is tight, and a count of the number of drives that the + * pool was allocated for, so they know how much to allocate and free. + * mddev->raid_disks cannot be used, as it can change while a pool is active + * These two datums are stored in a kmalloced struct. + * The 'raid_disks' here is twice the raid_disks in r1conf. + * This allows space for each 'real' device can have a replacement in the + * second half of the array. + */ + +struct pool_info { + struct mddev *mddev; + int raid_disks; +}; + +struct r1conf { + struct mddev *mddev; + struct mirror_info *mirrors; /* twice 'raid_disks' to + * allow for replacements. + */ + int raid_disks; + + /* When choose the best device for a read (read_balance()) + * we try to keep sequential reads one the same device + * using 'last_used' and 'next_seq_sect' + */ + int last_used; + sector_t next_seq_sect; + /* During resync, read_balancing is only allowed on the part + * of the array that has been resynced. 'next_resync' tells us + * where that is. + */ + sector_t next_resync; + + spinlock_t device_lock; + + /* list of 'struct r1bio' that need to be processed by raid1d, + * whether to retry a read, writeout a resync or recovery + * block, or anything else. + */ + struct list_head retry_list; + + /* queue pending writes to be submitted on unplug */ + struct bio_list pending_bio_list; + int pending_count; + + /* for use when syncing mirrors: + * We don't allow both normal IO and resync/recovery IO at + * the same time - resync/recovery can only happen when there + * is no other IO. So when either is active, the other has to wait. + * See more details description in raid1.c near raise_barrier(). + */ + wait_queue_head_t wait_barrier; + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + + /* Set to 1 if a full sync is needed, (fresh device added). + * Cleared when a sync completes. + */ + int fullsync; + + /* When the same as mddev->recovery_disabled we don't allow + * recovery to be attempted as we expect a read error. + */ + int recovery_disabled; + + + /* poolinfo contains information about the content of the + * mempools - it changes when the array grows or shrinks + */ + struct pool_info *poolinfo; + mempool_t *r1bio_pool; + mempool_t *r1buf_pool; + + /* temporary buffer to synchronous IO when attempting to repair + * a read error. + */ + struct page *tmppage; + + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct md_thread *thread; +}; + +/* + * this is our 'private' RAID1 bio. 
+ * + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct r1bio { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ + sector_t sector; + int sectors; + unsigned long state; + struct mddev *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_disk; + + struct list_head retry_list; + /* Next two are only valid when R1BIO_BehindIO is set */ + struct bio_vec *behind_bvecs; + int behind_page_count; + /* + * if the IO is in WRITE direction, then multiple bios are used. + * We choose the number when they are allocated. + */ + struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting bios[n] to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + +/* bits for r1bio.state */ +#define R1BIO_Uptodate 0 +#define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +/* Set ReadError on bios that experience a readerror so that + * raid1d knows what to do with them. + */ +#define R1BIO_ReadError 4 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 6 +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag + */ +#define R1BIO_MadeGood 7 +#define R1BIO_WriteError 8 + +extern int md_raid1_congested(struct mddev *mddev, int bits); + +#endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c new file mode 100644 index 00000000..a954c95d --- /dev/null +++ b/drivers/md/raid10.c @@ -0,0 +1,3584 @@ +/* + * raid10.c : Multiple Devices driver for Linux + * + * Copyright (C) 2000-2004 Neil Brown + * + * RAID-10 support for md. + * + * Base on code in raid1.c. See raid1.c for further copyright information. + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/ratelimit.h> +#include "md.h" +#include "raid10.h" +#include "raid0.h" +#include "bitmap.h" + +/* + * RAID10 provides a combination of RAID0 and RAID1 functionality. + * The layout of data is defined by + * chunk_size + * raid_disks + * near_copies (stored in low byte of layout) + * far_copies (stored in second byte of layout) + * far_offset (stored in bit 16 of layout ) + * + * The data to be stored is divided into chunks using chunksize. + * Each device is divided into far_copies sections. + * In each section, chunks are laid out in a style similar to raid0, but + * near_copies copies of each chunk is stored (each on a different drive). + * The starting device for each section is offset near_copies from the starting + * device of the previous section. + * Thus they are (near_copies*far_copies) of each chunk, and each is on a different + * drive. + * near_copies and far_copies must be at least one, and their product is at most + * raid_disks. + * + * If far_offset is true, then the far_copies are handled a bit differently. + * The copies are still in different stripes, but instead of be very far apart + * on disk, there are adjacent stripes. + */ + +/* + * Number of guaranteed r10bios in case of extreme VM load: + */ +#define NR_RAID10_BIOS 256 + +/* When there are this many requests queue to be written by + * the raid10 thread, we become 'congested' to provide back-pressure + * for writeback. + */ +static int max_queued_requests = 1024; + +static void allow_barrier(struct r10conf *conf); +static void lower_barrier(struct r10conf *conf); +static int enough(struct r10conf *conf, int ignore); + +static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) +{ + struct r10conf *conf = data; + int size = offsetof(struct r10bio, devs[conf->copies]); + + /* allocate a r10bio with room for raid_disks entries in the + * bios array */ + return kzalloc(size, gfp_flags); +} + +static void r10bio_pool_free(void *r10_bio, void *data) +{ + kfree(r10_bio); +} + +/* Maximum size of each resync request */ +#define RESYNC_BLOCK_SIZE (64*1024) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +/* amount of memory to reserve for resync requests */ +#define RESYNC_WINDOW (1024*1024) +/* maximum number of concurrent requests, memory permitting */ +#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) + +/* + * When performing a resync, we need to read and compare, so + * we need as many pages are there are copies. + * When performing a recovery, we need 2 bios, one for read, + * one for write (we recover only one drive per r10buf) + * + */ +static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) +{ + struct r10conf *conf = data; + struct page *page; + struct r10bio *r10_bio; + struct bio *bio; + int i, j; + int nalloc; + + r10_bio = r10bio_pool_alloc(gfp_flags, conf); + if (!r10_bio) + return NULL; + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + /* + * Allocate bios. + */ + for (j = nalloc ; j-- ; ) { + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); + if (!bio) + goto out_free_bio; + r10_bio->devs[j].bio = bio; + if (!conf->have_replacement) + continue; + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); + if (!bio) + goto out_free_bio; + r10_bio->devs[j].repl_bio = bio; + } + /* + * Allocate RESYNC_PAGES data pages and attach them + * where needed. 
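+ * (With 4K pages, RESYNC_BLOCK_SIZE of 64K works out to
+ * RESYNC_PAGES = 16 pages per bio.)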
+ */ + for (j = 0 ; j < nalloc; j++) { + struct bio *rbio = r10_bio->devs[j].repl_bio; + bio = r10_bio->devs[j].bio; + for (i = 0; i < RESYNC_PAGES; i++) { + if (j == 1 && !test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + /* we can share bv_page's during recovery */ + struct bio *rbio = r10_bio->devs[0].bio; + page = rbio->bi_io_vec[i].bv_page; + get_page(page); + } else + page = alloc_page(gfp_flags); + if (unlikely(!page)) + goto out_free_pages; + + bio->bi_io_vec[i].bv_page = page; + if (rbio) + rbio->bi_io_vec[i].bv_page = page; + } + } + + return r10_bio; + +out_free_pages: + for ( ; i > 0 ; i--) + safe_put_page(bio->bi_io_vec[i-1].bv_page); + while (j--) + for (i = 0; i < RESYNC_PAGES ; i++) + safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); + j = -1; +out_free_bio: + while (++j < nalloc) { + bio_put(r10_bio->devs[j].bio); + if (r10_bio->devs[j].repl_bio) + bio_put(r10_bio->devs[j].repl_bio); + } + r10bio_pool_free(r10_bio, conf); + return NULL; +} + +static void r10buf_pool_free(void *__r10_bio, void *data) +{ + int i; + struct r10conf *conf = data; + struct r10bio *r10bio = __r10_bio; + int j; + + for (j=0; j < conf->copies; j++) { + struct bio *bio = r10bio->devs[j].bio; + if (bio) { + for (i = 0; i < RESYNC_PAGES; i++) { + safe_put_page(bio->bi_io_vec[i].bv_page); + bio->bi_io_vec[i].bv_page = NULL; + } + bio_put(bio); + } + bio = r10bio->devs[j].repl_bio; + if (bio) + bio_put(bio); + } + r10bio_pool_free(r10bio, conf); +} + +static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) +{ + int i; + + for (i = 0; i < conf->copies; i++) { + struct bio **bio = & r10_bio->devs[i].bio; + if (!BIO_SPECIAL(*bio)) + bio_put(*bio); + *bio = NULL; + bio = &r10_bio->devs[i].repl_bio; + if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) + bio_put(*bio); + *bio = NULL; + } +} + +static void free_r10bio(struct r10bio *r10_bio) +{ + struct r10conf *conf = r10_bio->mddev->private; + + put_all_bios(conf, r10_bio); + mempool_free(r10_bio, conf->r10bio_pool); +} + +static void put_buf(struct r10bio *r10_bio) +{ + struct r10conf *conf = r10_bio->mddev->private; + + mempool_free(r10_bio, conf->r10buf_pool); + + lower_barrier(conf); +} + +static void reschedule_retry(struct r10bio *r10_bio) +{ + unsigned long flags; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&r10_bio->retry_list, &conf->retry_list); + conf->nr_queued ++; + spin_unlock_irqrestore(&conf->device_lock, flags); + + /* wake up frozen array... */ + wake_up(&conf->wait_barrier); + + md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid_end_bio_io(struct r10bio *r10_bio) +{ + struct bio *bio = r10_bio->master_bio; + int done; + struct r10conf *conf = r10_bio->mddev->private; + + if (bio->bi_phys_segments) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + bio->bi_phys_segments--; + done = (bio->bi_phys_segments == 0); + spin_unlock_irqrestore(&conf->device_lock, flags); + } else + done = 1; + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (done) { + bio_endio(bio, 0); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. 
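+ * (allow_barrier() decrements nr_pending and wakes wait_barrier,
+ * on which raise_barrier() sleeps while normal IO is pending.)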
+ */ + allow_barrier(conf); + } + free_r10bio(r10_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. + */ +static inline void update_head_pos(int slot, struct r10bio *r10_bio) +{ + struct r10conf *conf = r10_bio->mddev->private; + + conf->mirrors[r10_bio->devs[slot].devnum].head_position = + r10_bio->devs[slot].addr + (r10_bio->sectors); +} + +/* + * Find the disk number which triggered given bio + */ +static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, + struct bio *bio, int *slotp, int *replp) +{ + int slot; + int repl = 0; + + for (slot = 0; slot < conf->copies; slot++) { + if (r10_bio->devs[slot].bio == bio) + break; + if (r10_bio->devs[slot].repl_bio == bio) { + repl = 1; + break; + } + } + + BUG_ON(slot == conf->copies); + update_head_pos(slot, r10_bio); + + if (slotp) + *slotp = slot; + if (replp) + *replp = repl; + return r10_bio->devs[slot].devnum; +} + +static void raid10_end_read_request(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r10bio *r10_bio = bio->bi_private; + int slot, dev; + struct md_rdev *rdev; + struct r10conf *conf = r10_bio->mddev->private; + + + slot = r10_bio->read_slot; + dev = r10_bio->devs[slot].devnum; + rdev = r10_bio->devs[slot].rdev; + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + update_head_pos(slot, r10_bio); + + if (uptodate) { + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + set_bit(R10BIO_Uptodate, &r10_bio->state); + } else { + /* If all other devices that store this block have + * failed, we want to return the error upwards rather + * than fail the last device. 
Here we redefine + * "uptodate" to mean "Don't want to retry" + */ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (!enough(conf, rdev->raid_disk)) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + if (uptodate) { + raid_end_bio_io(r10_bio); + rdev_dec_pending(rdev, conf->mddev); + } else { + /* + * oops, read error - keep the refcount on the rdev + */ + char b[BDEVNAME_SIZE]; + printk_ratelimited(KERN_ERR + "md/raid10:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); + set_bit(R10BIO_ReadError, &r10_bio->state); + reschedule_retry(r10_bio); + } +} + +static void close_write(struct r10bio *r10_bio) +{ + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); + md_write_end(r10_bio->mddev); +} + +static void one_write_done(struct r10bio *r10_bio) +{ + if (atomic_dec_and_test(&r10_bio->remaining)) { + if (test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else { + close_write(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + reschedule_retry(r10_bio); + else + raid_end_bio_io(r10_bio); + } + } +} + +static void raid10_end_write_request(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r10bio *r10_bio = bio->bi_private; + int dev; + int dec_rdev = 1; + struct r10conf *conf = r10_bio->mddev->private; + int slot, repl; + struct md_rdev *rdev = NULL; + + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + smp_rmb(); + repl = 0; + rdev = conf->mirrors[dev].rdev; + } + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) { + if (repl) + /* Never record new bad blocks to replacement, + * just fail it. + */ + md_error(rdev->mddev, rdev); + else { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + set_bit(R10BIO_WriteError, &r10_bio->state); + dec_rdev = 0; + } + } else { + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code for to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + sector_t first_bad; + int bad_sectors; + + set_bit(R10BIO_Uptodate, &r10_bio->state); + + /* Maybe we can clear some bad blocks. */ + if (is_badblock(rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) { + bio_put(bio); + if (repl) + r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; + else + r10_bio->devs[slot].bio = IO_MADE_GOOD; + dec_rdev = 0; + set_bit(R10BIO_MadeGood, &r10_bio->state); + } + } + + /* + * + * Let's see if all mirrored write operations have finished + * already. + */ + one_write_done(r10_bio); + if (dec_rdev) + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); +} + +/* + * RAID10 layout manager + * As well as the chunksize and raid_disks count, there are two + * parameters: near_copies and far_copies. + * near_copies * far_copies must be <= raid_disks. + * Normally one of these will be 1. + * If both are 1, we get raid0. + * If near_copies == raid_disks, we get raid1. 
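+ *
+ * A small, purely illustrative example for a hypothetical 4-device
+ * array (chunks labelled A, B, C, ...; the precise rules follow below):
+ *
+ *   near_copies=2, far_copies=1:
+ *     device:   0   1   2   3
+ *     chunk 0:  A   A   B   B
+ *     chunk 1:  C   C   D   D
+ *
+ *   near_copies=1, far_copies=2 (first stripe of each section shown):
+ *     device:     0   1   2   3
+ *     section 0:  A   B   C   D
+ *     section 1:  D   A   B   C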
+ * + * Chunks are laid out in raid0 style with near_copies copies of the + * first chunk, followed by near_copies copies of the next chunk and + * so on. + * If far_copies > 1, then after 1/far_copies of the array has been assigned + * as described above, we start again with a device offset of near_copies. + * So we effectively have another copy of the whole array further down all + * the drives, but with blocks on different drives. + * With this layout, and block is never stored twice on the one device. + * + * raid10_find_phys finds the sector offset of a given virtual sector + * on each device that it is on. + * + * raid10_find_virt does the reverse mapping, from a device and a + * sector offset to a virtual address + */ + +static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) +{ + int n,f; + sector_t sector; + sector_t chunk; + sector_t stripe; + int dev; + + int slot = 0; + + /* now calculate first sector/dev */ + chunk = r10bio->sector >> conf->chunk_shift; + sector = r10bio->sector & conf->chunk_mask; + + chunk *= conf->near_copies; + stripe = chunk; + dev = sector_div(stripe, conf->raid_disks); + if (conf->far_offset) + stripe *= conf->far_copies; + + sector += stripe << conf->chunk_shift; + + /* and calculate all the others */ + for (n=0; n < conf->near_copies; n++) { + int d = dev; + sector_t s = sector; + r10bio->devs[slot].addr = sector; + r10bio->devs[slot].devnum = d; + slot++; + + for (f = 1; f < conf->far_copies; f++) { + d += conf->near_copies; + if (d >= conf->raid_disks) + d -= conf->raid_disks; + s += conf->stride; + r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; + slot++; + } + dev++; + if (dev >= conf->raid_disks) { + dev = 0; + sector += (conf->chunk_mask + 1); + } + } + BUG_ON(slot != conf->copies); +} + +static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) +{ + sector_t offset, chunk, vchunk; + + offset = sector & conf->chunk_mask; + if (conf->far_offset) { + int fc; + chunk = sector >> conf->chunk_shift; + fc = sector_div(chunk, conf->far_copies); + dev -= fc * conf->near_copies; + if (dev < 0) + dev += conf->raid_disks; + } else { + while (sector >= conf->stride) { + sector -= conf->stride; + if (dev < conf->near_copies) + dev += conf->raid_disks - conf->near_copies; + else + dev -= conf->near_copies; + } + chunk = sector >> conf->chunk_shift; + } + vchunk = chunk * conf->raid_disks + dev; + sector_div(vchunk, conf->near_copies); + return (vchunk << conf->chunk_shift) + offset; +} + +/** + * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged + * @q: request queue + * @bvm: properties of new bio + * @biovec: the request that could be merged to it. + * + * Return amount of bytes we can accept at this offset + * This requires checking for end-of-chunk if near_copies != raid_disks, + * and for subordinate merge_bvec_fns if merge_check_needed. 
+ */ +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + struct r10conf *conf = mddev->private; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int max; + unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int bio_sectors = bvm->bi_size >> 9; + + if (conf->near_copies < conf->raid_disks) { + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + + bio_sectors)) << 9; + if (max < 0) + /* bio_add cannot handle a negative return */ + max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + } else + max = biovec->bv_len; + + if (mddev->merge_check_needed) { + struct r10bio r10_bio; + int s; + r10_bio.sector = sector; + raid10_find_phys(conf, &r10_bio); + rcu_read_lock(); + for (s = 0; s < conf->copies; s++) { + int disk = r10_bio.devs[s].devnum; + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio.devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio.devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + } + rcu_read_unlock(); + } + return max; +} + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. + * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. + * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ + +/* + * FIXME: possibly should rethink readbalancing and do it differently + * depending on near_copies / far_copies geometry. + */ +static struct md_rdev *read_balance(struct r10conf *conf, + struct r10bio *r10_bio, + int *max_sectors) +{ + const sector_t this_sector = r10_bio->sector; + int disk, slot; + int sectors = r10_bio->sectors; + int best_good_sectors; + sector_t new_distance, best_dist; + struct md_rdev *rdev, *best_rdev; + int do_balance; + int best_slot; + + raid10_find_phys(conf, r10_bio); + rcu_read_lock(); +retry: + sectors = r10_bio->sectors; + best_slot = -1; + best_rdev = NULL; + best_dist = MaxSector; + best_good_sectors = 0; + do_balance = 1; + /* + * Check if we can balance. We can balance on the whole + * device if no resync is going on (recovery is ok), or below + * the resync window. We take the first readable disk when + * above the resync window. 
+ */ + if (conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) + do_balance = 0; + + for (slot = 0; slot < conf->copies ; slot++) { + sector_t first_bad; + int bad_sectors; + sector_t dev_sector; + + if (r10_bio->devs[slot].bio == IO_BLOCKED) + continue; + disk = r10_bio->devs[slot].devnum; + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev == NULL || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags) || + r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (rdev == NULL || + test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) + continue; + if (!test_bit(In_sync, &rdev->flags) && + r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + continue; + + dev_sector = r10_bio->devs[slot].addr; + if (is_badblock(rdev, dev_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* Already have a better slot */ + continue; + if (first_bad <= dev_sector) { + /* Cannot read here. If this is the + * 'primary' device, then we must not read + * beyond 'bad_sectors' from another device. + */ + bad_sectors -= (dev_sector - first_bad); + if (!do_balance && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; + } else { + sector_t good_sectors = + first_bad - dev_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_slot = slot; + best_rdev = rdev; + } + if (!do_balance) + /* Must read from here */ + break; + } + continue; + } else + best_good_sectors = sectors; + + if (!do_balance) + break; + + /* This optimisation is debatable, and completely destroys + * sequential read speed for 'far copies' arrays. So only + * keep it for 'near' arrays, and review those later. + */ + if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) + break; + + /* for far > 1 always use the lowest address */ + if (conf->far_copies > 1) + new_distance = r10_bio->devs[slot].addr; + else + new_distance = abs(r10_bio->devs[slot].addr - + conf->mirrors[disk].head_position); + if (new_distance < best_dist) { + best_dist = new_distance; + best_slot = slot; + best_rdev = rdev; + } + } + if (slot >= conf->copies) { + slot = best_slot; + rdev = best_rdev; + } + + if (slot >= 0) { + atomic_inc(&rdev->nr_pending); + if (test_bit(Faulty, &rdev->flags)) { + /* Cannot risk returning a device that failed + * before we inc'ed nr_pending + */ + rdev_dec_pending(rdev, conf->mddev); + goto retry; + } + r10_bio->read_slot = slot; + } else + rdev = NULL; + rcu_read_unlock(); + *max_sectors = best_good_sectors; + + return rdev; +} + +static int raid10_congested(void *data, int bits) +{ + struct mddev *mddev = data; + struct r10conf *conf = mddev->private; + int i, ret = 0; + + if ((bits & (1 << BDI_async_congested)) && + conf->pending_count >= max_queued_requests) + return 1; + + if (mddev_congested(mddev, bits)) + return 1; + rcu_read_lock(); + for (i = 0; i < conf->raid_disks && ret == 0; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} + +static void flush_pending_writes(struct r10conf *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. 
+ */ + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + conf->pending_count = 0; + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to disk + * before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + } else + spin_unlock_irq(&conf->device_lock); +} + +/* Barriers.... + * Sometimes we need to suspend IO while we do something else, + * either some resync/recovery, or reconfigure the array. + * To do this we raise a 'barrier'. + * The 'barrier' is a counter that can be raised multiple times + * to count how many activities are happening which preclude + * normal IO. + * We can only raise the barrier if there is no pending IO. + * i.e. if nr_pending == 0. + * We choose only to raise the barrier if no-one is waiting for the + * barrier to go down. This means that as soon as an IO request + * is ready, no other operations which require a barrier will start + * until the IO request has had a chance. + * + * So: regular IO calls 'wait_barrier'. When that returns there + * is no backgroup IO happening, It must arrange to call + * allow_barrier when it has finished its IO. + * backgroup IO calls must call raise_barrier. Once that returns + * there is no normal IO happeing. It must arrange to call + * lower_barrier when the particular background IO completes. + */ + +static void raise_barrier(struct r10conf *conf, int force) +{ + BUG_ON(force && !conf->barrier); + spin_lock_irq(&conf->resync_lock); + + /* Wait until no block IO is waiting (unless 'force') */ + wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, + conf->resync_lock, ); + + /* block any new IO from starting */ + conf->barrier++; + + /* Now wait for all pending IO to complete */ + wait_event_lock_irq(conf->wait_barrier, + !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + conf->resync_lock, ); + + spin_unlock_irq(&conf->resync_lock); +} + +static void lower_barrier(struct r10conf *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); + conf->barrier--; + spin_unlock_irqrestore(&conf->resync_lock, flags); + wake_up(&conf->wait_barrier); +} + +static void wait_barrier(struct r10conf *conf) +{ + spin_lock_irq(&conf->resync_lock); + if (conf->barrier) { + conf->nr_waiting++; + /* Wait for the barrier to drop. + * However if there are already pending + * requests (preventing the barrier from + * rising completely), and the + * pre-process bio queue isn't empty, + * then don't wait, as we need to empty + * that queue to get the nr_pending + * count down. + */ + wait_event_lock_irq(conf->wait_barrier, + !conf->barrier || + (conf->nr_pending && + current->bio_list && + !bio_list_empty(current->bio_list)), + conf->resync_lock, + ); + conf->nr_waiting--; + } + conf->nr_pending++; + spin_unlock_irq(&conf->resync_lock); +} + +static void allow_barrier(struct r10conf *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); + conf->nr_pending--; + spin_unlock_irqrestore(&conf->resync_lock, flags); + wake_up(&conf->wait_barrier); +} + +static void freeze_array(struct r10conf *conf) +{ + /* stop syncio and normal IO and wait for everything to + * go quiet. 
+ * We increment barrier and nr_waiting, and then + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. + */ + spin_lock_irq(&conf->resync_lock); + conf->barrier++; + conf->nr_waiting++; + wait_event_lock_irq(conf->wait_barrier, + conf->nr_pending == conf->nr_queued+1, + conf->resync_lock, + flush_pending_writes(conf)); + + spin_unlock_irq(&conf->resync_lock); +} + +static void unfreeze_array(struct r10conf *conf) +{ + /* reverse the effect of the freeze */ + spin_lock_irq(&conf->resync_lock); + conf->barrier--; + conf->nr_waiting--; + wake_up(&conf->wait_barrier); + spin_unlock_irq(&conf->resync_lock); +} + +static void make_request(struct mddev *mddev, struct bio * bio) +{ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + struct bio *read_bio; + int i; + int chunk_sects = conf->chunk_mask + 1; + const int rw = bio_data_dir(bio); + const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + unsigned long flags; + struct md_rdev *blocked_rdev; + int plugged; + int sectors_handled; + int max_sectors; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; + } + + /* If this request crosses a chunk boundary, we need to + * split it. This will only happen for 1 PAGE (or less) requests. + */ + if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) + > chunk_sects && + conf->near_copies < conf->raid_disks)) { + struct bio_pair *bp; + /* Sanity check -- queue functions should prevent this happening */ + if (bio->bi_vcnt != 1 || + bio->bi_idx != 0) + goto bad_map; + /* This is a one page bio that upper layers + * refuse to split for us, so we need to split it. + */ + bp = bio_split(bio, + chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); + + /* Each of these 'make_request' calls will call 'wait_barrier'. + * If the first succeeds but the second blocks due to the resync + * thread raising the barrier, we will deadlock because the + * IO to the underlying device will be queued in generic_make_request + * and will never complete, so will never reduce nr_pending. + * So increment nr_waiting here so no new raise_barriers will + * succeed, and so the second wait_barrier cannot block. + */ + spin_lock_irq(&conf->resync_lock); + conf->nr_waiting++; + spin_unlock_irq(&conf->resync_lock); + + make_request(mddev, &bp->bio1); + make_request(mddev, &bp->bio2); + + spin_lock_irq(&conf->resync_lock); + conf->nr_waiting--; + wake_up(&conf->wait_barrier); + spin_unlock_irq(&conf->resync_lock); + + bio_pair_release(bp); + return; + bad_map: + printk("md/raid10:%s: make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + + bio_io_error(bio); + return; + } + + md_write_start(mddev, bio); + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. 
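+ * (wait_barrier() blocks while conf->barrier is raised and then
+ * increments nr_pending; the matching allow_barrier() runs when the
+ * request completes.)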
+ */ + wait_barrier(conf); + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio->bi_size >> 9; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector; + r10_bio->state = 0; + + /* We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r10_bio and no locking + * will be needed when the request completes. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + + if (rw == READ) { + /* + * read balancing logic: + */ + struct md_rdev *rdev; + int slot; + +read_again: + rdev = read_balance(conf, r10_bio, &max_sectors); + if (!rdev) { + raid_end_bio_io(r10_bio); + return; + } + slot = r10_bio->read_slot; + + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, + max_sectors); + + r10_bio->devs[slot].bio = read_bio; + r10_bio->devs[slot].rdev = rdev; + + read_bio->bi_sector = r10_bio->devs[slot].addr + + rdev->data_offset; + read_bio->bi_bdev = rdev->bdev; + read_bio->bi_end_io = raid10_end_read_request; + read_bio->bi_rw = READ | do_sync; + read_bio->bi_private = r10_bio; + + if (max_sectors < r10_bio->sectors) { + /* Could not read all from this device, so we will + * need another r10_bio. + */ + sectors_handled = (r10_bio->sectors + max_sectors + - bio->bi_sector); + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock(&conf->device_lock); + /* Cannot call generic_make_request directly + * as that will be queued in __generic_make_request + * and subsequent mempool_alloc might block + * waiting for it. so hand bio over to raid10d. + */ + reschedule_retry(r10_bio); + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = ((bio->bi_size >> 9) + - sectors_handled); + r10_bio->state = 0; + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); + return; + } + + /* + * WRITE: + */ + if (conf->pending_count >= max_queued_requests) { + md_wakeup_thread(mddev->thread); + wait_event(conf->wait_barrier, + conf->pending_count < max_queued_requests); + } + /* first select target devices under rcu_lock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + * If there are known/acknowledged bad blocks on any device + * on which we have seen a write error, we want to avoid + * writing to those blocks. This potentially requires several + * writes to write around the bad blocks. Each set of writes + * gets its own r10_bio with a set of bios attached. The number + * of r10_bios is recored in bio->bi_phys_segments just as with + * the read case. 
+ */ + plugged = mddev_check_plugged(mddev); + + r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ + raid10_find_phys(conf, r10_bio); +retry_write: + blocked_rdev = NULL; + rcu_read_lock(); + max_sectors = r10_bio->sectors; + + for (i = 0; i < conf->copies; i++) { + int d = r10_bio->devs[i].devnum; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[d].replacement); + if (rdev == rrdev) + rrdev = NULL; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } + if (rrdev && (test_bit(Faulty, &rrdev->flags) + || test_bit(Unmerged, &rrdev->flags))) + rrdev = NULL; + + r10_bio->devs[i].bio = NULL; + r10_bio->devs[i].repl_bio = NULL; + if (!rdev || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) { + set_bit(R10BIO_Degraded, &r10_bio->state); + continue; + } + if (test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, dev_sector, + max_sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + if (is_bad && first_bad <= dev_sector) { + /* Cannot write here at all */ + bad_sectors -= (dev_sector - first_bad); + if (bad_sectors < max_sectors) + /* Mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; + /* We don't set R10BIO_Degraded as that + * only applies if the disk is missing, + * so it might be re-added, and we want to + * know to recover this chunk. + * In this case the device is here, and the + * fact that this chunk is not in-sync is + * recorded in the bad block log. + */ + continue; + } + if (is_bad) { + int good_sectors = first_bad - dev_sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } + } + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); + if (rrdev) { + r10_bio->devs[i].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); + } + } + rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + int j; + int d; + + for (j = 0; j < i; j++) { + if (r10_bio->devs[j].bio) { + d = r10_bio->devs[j].devnum; + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + } + if (r10_bio->devs[j].repl_bio) { + struct md_rdev *rdev; + d = r10_bio->devs[j].devnum; + rdev = conf->mirrors[d].replacement; + if (!rdev) { + /* Race with remove_disk */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + rdev_dec_pending(rdev, mddev); + } + } + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + + if (max_sectors < r10_bio->sectors) { + /* We are splitting this into multiple parts, so + * we need to prepare for allocating another r10_bio. 
+ */ + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + } + sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; + + atomic_set(&r10_bio->remaining, 1); + bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); + + for (i = 0; i < conf->copies; i++) { + struct bio *mbio; + int d = r10_bio->devs[i].devnum; + if (!r10_bio->devs[i].bio) + continue; + + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr+ + conf->mirrors[d].rdev->data_offset); + mbio->bi_bdev = conf->mirrors[d].rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_private = r10_bio; + + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + + if (!r10_bio->devs[i].repl_bio) + continue; + + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].repl_bio = mbio; + + /* We are actively writing to the original device + * so it cannot disappear, so the replacement cannot + * become NULL here + */ + mbio->bi_sector = (r10_bio->devs[i].addr+ + conf->mirrors[d].replacement->data_offset); + mbio->bi_bdev = conf->mirrors[d].replacement->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_private = r10_bio; + + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + + /* Don't remove the bias on 'remaining' (one_write_done) until + * after checking if we need to go around again. + */ + + if (sectors_handled < (bio->bi_size >> 9)) { + one_write_done(r10_bio); + /* We need another r10_bio. It has already been counted + * in bio->bi_phys_segments. + */ + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector + sectors_handled; + r10_bio->state = 0; + goto retry_write; + } + one_write_done(r10_bio); + + /* In case raid10d snuck in to freeze_array */ + wake_up(&conf->wait_barrier); + + if (do_sync || !mddev->bitmap || !plugged) + md_wakeup_thread(mddev->thread); +} + +static void status(struct seq_file *seq, struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + int i; + + if (conf->near_copies < conf->raid_disks) + seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); + if (conf->near_copies > 1) + seq_printf(seq, " %d near-copies", conf->near_copies); + if (conf->far_copies > 1) { + if (conf->far_offset) + seq_printf(seq, " %d offset-copies", conf->far_copies); + else + seq_printf(seq, " %d far-copies", conf->far_copies); + } + seq_printf(seq, " [%d/%d] [", conf->raid_disks, + conf->raid_disks - mddev->degraded); + for (i = 0; i < conf->raid_disks; i++) + seq_printf(seq, "%s", + conf->mirrors[i].rdev && + test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? 
"U" : "_"); + seq_printf(seq, "]"); +} + +/* check if there are enough drives for + * every block to appear on atleast one. + * Don't consider the device numbered 'ignore' + * as we might be about to remove it. + */ +static int enough(struct r10conf *conf, int ignore) +{ + int first = 0; + + do { + int n = conf->copies; + int cnt = 0; + while (n--) { + if (conf->mirrors[first].rdev && + first != ignore) + cnt++; + first = (first+1) % conf->raid_disks; + } + if (cnt == 0) + return 0; + } while (first != 0); + return 1; +} + +static void error(struct mddev *mddev, struct md_rdev *rdev) +{ + char b[BDEVNAME_SIZE]; + struct r10conf *conf = mddev->private; + + /* + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know. + * else mark the drive as failed + */ + if (test_bit(In_sync, &rdev->flags) + && !enough(conf, rdev->raid_disk)) + /* + * Don't fail the drive, just return an IO error. + */ + return; + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * if recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + } + set_bit(Blocked, &rdev->flags); + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT + "md/raid10:%s: Disk failure on %s, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); +} + +static void print_conf(struct r10conf *conf) +{ + int i; + struct mirror_info *tmp; + + printk(KERN_DEBUG "RAID10 conf printout:\n"); + if (!conf) { + printk(KERN_DEBUG "(!conf)\n"); + return; + } + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + conf->raid_disks); + + for (i = 0; i < conf->raid_disks; i++) { + char b[BDEVNAME_SIZE]; + tmp = conf->mirrors + i; + if (tmp->rdev) + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", + i, !test_bit(In_sync, &tmp->rdev->flags), + !test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev,b)); + } +} + +static void close_sync(struct r10conf *conf) +{ + wait_barrier(conf); + allow_barrier(conf); + + mempool_destroy(conf->r10buf_pool); + conf->r10buf_pool = NULL; +} + +static int raid10_spare_active(struct mddev *mddev) +{ + int i; + struct r10conf *conf = mddev->private; + struct mirror_info *tmp; + int count = 0; + unsigned long flags; + + /* + * Find all non-in_sync disks within the RAID10 configuration + * and mark them in_sync + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if (tmp->replacement + && tmp->replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->replacement->flags) + && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + /* Replacement has just become active */ + if (!tmp->rdev + || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + count++; + if (tmp->rdev) { + /* Replaced device not technically faulty, + * but we need to be sure it gets removed + * and never re-added. 
+ */ + set_bit(Faulty, &tmp->rdev->flags); + sysfs_notify_dirent_safe( + tmp->rdev->sysfs_state); + } + sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); + } else if (tmp->rdev + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + count++; + sysfs_notify_dirent(tmp->rdev->sysfs_state); + } + } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); + + print_conf(conf); + return count; +} + + +static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r10conf *conf = mddev->private; + int err = -EEXIST; + int mirror; + int first = 0; + int last = conf->raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is + * very different from resync + */ + return -EBUSY; + if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } + + if (rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + mirror = rdev->saved_raid_disk; + else + mirror = first; + for ( ; mirror <= last ; mirror++) { + struct mirror_info *p = &conf->mirrors[mirror]; + if (p->recovery_disabled == mddev->recovery_disabled) + continue; + if (p->rdev) { + if (!test_bit(WantReplacement, &p->rdev->flags) || + p->replacement != NULL) + continue; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = mirror; + err = 0; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + break; + } + + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + p->head_position = 0; + p->recovery_disabled = mddev->recovery_disabled - 1; + rdev->raid_disk = mirror; + err = 0; + if (rdev->saved_raid_disk != mirror) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; + } + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. + */ + synchronize_sched(); + raise_barrier(conf, 0); + lower_barrier(conf); + clear_bit(Unmerged, &rdev->flags); + } + md_integrity_add_rdev(rdev, mddev); + print_conf(conf); + return err; +} + +static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r10conf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct md_rdev **rdevp; + struct mirror_info *p = conf->mirrors + number; + + print_conf(conf); + if (rdev == p->rdev) + rdevp = &p->rdev; + else if (rdev == p->replacement) + rdevp = &p->replacement; + else + return 0; + + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + /* Only remove faulty devices if recovery + * is not possible. 
+ */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != p->recovery_disabled && + (!p->replacement || p->replacement == rdev) && + enough(conf, -1)) { + err = -EBUSY; + goto abort; + } + *rdevp = NULL; + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } else if (p->replacement) { + /* We must have just cleared 'rdev' */ + p->rdev = p->replacement; + clear_bit(Replacement, &p->replacement->flags); + smp_mb(); /* Make sure other CPUs may see both as identical + * but will never see neither -- if they are careful. + */ + p->replacement = NULL; + clear_bit(WantReplacement, &rdev->flags); + } else + /* We might have just remove the Replacement as faulty + * Clear the flag just in case + */ + clear_bit(WantReplacement, &rdev->flags); + + err = md_integrity_register(mddev); + +abort: + + print_conf(conf); + return err; +} + + +static void end_sync_read(struct bio *bio, int error) +{ + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + int d; + + d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + else + /* The write handler will notice the lack of + * R10BIO_Uptodate and record any errors etc + */ + atomic_add(r10_bio->sectors, + &conf->mirrors[d].rdev->corrected_errors); + + /* for reconstruct, we always reschedule after a read. + * for resync, only after all reads + */ + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); + if (test_bit(R10BIO_IsRecover, &r10_bio->state) || + atomic_dec_and_test(&r10_bio->remaining)) { + /* we have read all the blocks, + * do the comparison in process context in raid10d + */ + reschedule_retry(r10_bio); + } +} + +static void end_sync_request(struct r10bio *r10_bio) +{ + struct mddev *mddev = r10_bio->mddev; + + while (atomic_dec_and_test(&r10_bio->remaining)) { + if (r10_bio->master_bio == NULL) { + /* the primary of several recovery bios */ + sector_t s = r10_bio->sectors; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); + md_done_sync(mddev, s, 1); + break; + } else { + struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); + r10_bio = r10_bio2; + } + } +} + +static void end_sync_write(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r10bio *r10_bio = bio->bi_private; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + int d; + sector_t first_bad; + int bad_sectors; + int slot; + int repl; + struct md_rdev *rdev = NULL; + + d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[d].replacement; + else + rdev = conf->mirrors[d].rdev; + + if (!uptodate) { + if (repl) + md_error(mddev, rdev); + else { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + set_bit(R10BIO_WriteError, &r10_bio->state); + } + } else if (is_badblock(rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) + set_bit(R10BIO_MadeGood, &r10_bio->state); + + rdev_dec_pending(rdev, mddev); + + end_sync_request(r10_bio); +} + +/* + * Note: 
sync and recover are handled very differently for raid10
+ * This code is for resync.
+ * For resync, we read through virtual addresses and read all blocks.
+ * If there is any error, we schedule a write. The lowest numbered
+ * drive is authoritative.
+ * However requests come in for physical addresses, so we need to map.
+ * For every physical address there are raid_disks/copies virtual addresses,
+ * which is always at least one, but is not necessarily an integer.
+ * This means that a physical address can span multiple chunks, so we may
+ * have to submit multiple io requests for a single sync request.
+ */
+/*
+ * We check if all blocks are in-sync and only write to blocks that
+ * aren't in sync
+ */
+static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
+{
+ struct r10conf *conf = mddev->private;
+ int i, first;
+ struct bio *tbio, *fbio;
+ int vcnt;
+
+ atomic_set(&r10_bio->remaining, 1);
+
+ /* find the first device with a block */
+ for (i=0; i<conf->copies; i++)
+ if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
+ break;
+
+ if (i == conf->copies)
+ goto done;
+
+ first = i;
+ fbio = r10_bio->devs[i].bio;
+
+ vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
+ /* now find blocks with errors */
+ for (i=0 ; i < conf->copies ; i++) {
+ int j, d;
+
+ tbio = r10_bio->devs[i].bio;
+
+ if (tbio->bi_end_io != end_sync_read)
+ continue;
+ if (i == first)
+ continue;
+ if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
+ /* We know that the bi_io_vec layout is the same for
+ * both 'first' and 'i', so we just compare them.
+ * All vec entries are PAGE_SIZE.
+ */
+ for (j = 0; j < vcnt; j++)
+ if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
+ page_address(tbio->bi_io_vec[j].bv_page),
+ fbio->bi_io_vec[j].bv_len))
+ break;
+ if (j == vcnt)
+ continue;
+ mddev->resync_mismatches += r10_bio->sectors;
+ if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ /* Don't fix anything. */
+ continue;
+ }
+ /* Ok, we need to write this bio, either to correct an
+ * inconsistency or to correct an unreadable block.
+ * First we need to fixup bv_offset, bv_len and + * bi_vecs, as the read request might have corrupted these + */ + tbio->bi_vcnt = vcnt; + tbio->bi_size = r10_bio->sectors << 9; + tbio->bi_idx = 0; + tbio->bi_phys_segments = 0; + tbio->bi_flags &= ~(BIO_POOL_MASK - 1); + tbio->bi_flags |= 1 << BIO_UPTODATE; + tbio->bi_next = NULL; + tbio->bi_rw = WRITE; + tbio->bi_private = r10_bio; + tbio->bi_sector = r10_bio->devs[i].addr; + + for (j=0; j < vcnt ; j++) { + tbio->bi_io_vec[j].bv_offset = 0; + tbio->bi_io_vec[j].bv_len = PAGE_SIZE; + + memcpy(page_address(tbio->bi_io_vec[j].bv_page), + page_address(fbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + } + tbio->bi_end_io = end_sync_write; + + d = r10_bio->devs[i].devnum; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); + + tbio->bi_sector += conf->mirrors[d].rdev->data_offset; + tbio->bi_bdev = conf->mirrors[d].rdev->bdev; + generic_make_request(tbio); + } + + /* Now write out to any replacement devices + * that are active + */ + for (i = 0; i < conf->copies; i++) { + int j, d; + + tbio = r10_bio->devs[i].repl_bio; + if (!tbio || !tbio->bi_end_io) + continue; + if (r10_bio->devs[i].bio->bi_end_io != end_sync_write + && r10_bio->devs[i].bio != fbio) + for (j = 0; j < vcnt; j++) + memcpy(page_address(tbio->bi_io_vec[j].bv_page), + page_address(fbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + d = r10_bio->devs[i].devnum; + atomic_inc(&r10_bio->remaining); + md_sync_acct(conf->mirrors[d].replacement->bdev, + tbio->bi_size >> 9); + generic_make_request(tbio); + } + +done: + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_done_sync(mddev, r10_bio->sectors, 1); + put_buf(r10_bio); + } +} + +/* + * Now for the recovery code. + * Recovery happens across physical sectors. + * We recover all non-is_sync drives by finding the virtual address of + * each, and then choose a working drive that also has that virt address. + * There is a separate r10_bio for each non-in_sync drive. + * Only the first two slots are in use. The first for reading, + * The second for writing. + * + */ +static void fix_recovery_read_error(struct r10bio *r10_bio) +{ + /* We got a read error during recovery. + * We repeat the read in smaller page-sized sections. + * If a read succeeds, write it to the new device or record + * a bad block if we cannot. + * If a read fails, record a bad block on both old and + * new devices. 
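+	 * (Illustrative example: with 4KiB pages, a 128-sector recovery chunk
+	 * is retried as sixteen reads of PAGE_SIZE>>9 == 8 sectors each.)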
+ */ + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + struct bio *bio = r10_bio->devs[0].bio; + sector_t sect = 0; + int sectors = r10_bio->sectors; + int idx = 0; + int dr = r10_bio->devs[0].devnum; + int dw = r10_bio->devs[1].devnum; + + while (sectors) { + int s = sectors; + struct md_rdev *rdev; + sector_t addr; + int ok; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rdev = conf->mirrors[dr].rdev; + addr = r10_bio->devs[0].addr + sect, + ok = sync_page_io(rdev, + addr, + s << 9, + bio->bi_io_vec[idx].bv_page, + READ, false); + if (ok) { + rdev = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = sync_page_io(rdev, + addr, + s << 9, + bio->bi_io_vec[idx].bv_page, + WRITE, false); + if (!ok) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } + } + if (!ok) { + /* We don't worry if we cannot set a bad block - + * it really is bad so there is no loss in not + * recording it yet + */ + rdev_set_badblocks(rdev, addr, s, 0); + + if (rdev != conf->mirrors[dw].rdev) { + /* need bad block on destination too */ + struct md_rdev *rdev2 = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = rdev_set_badblocks(rdev2, addr, s, 0); + if (!ok) { + /* just abort the recovery */ + printk(KERN_NOTICE + "md/raid10:%s: recovery aborted" + " due to read error\n", + mdname(mddev)); + + conf->mirrors[dw].recovery_disabled + = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, + &mddev->recovery); + break; + } + } + } + + sectors -= s; + sect += s; + idx++; + } +} + +static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; + int d; + struct bio *wbio, *wbio2; + + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { + fix_recovery_read_error(r10_bio); + end_sync_request(r10_bio); + return; + } + + /* + * share the pages with the first bio + * and submit the write request + */ + d = r10_bio->devs[1].devnum; + wbio = r10_bio->devs[1].bio; + wbio2 = r10_bio->devs[1].repl_bio; + if (wbio->bi_end_io) { + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); + generic_make_request(wbio); + } + if (wbio2 && wbio2->bi_end_io) { + atomic_inc(&conf->mirrors[d].replacement->nr_pending); + md_sync_acct(conf->mirrors[d].replacement->bdev, + wbio2->bi_size >> 9); + generic_make_request(wbio2); + } +} + + +/* + * Used by fix_read_error() to decay the per rdev read_errors. + * We halve the read error count for every hour that has elapsed + * since the last recorded read error. + * + */ +static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) +{ + struct timespec cur_time_mon; + unsigned long hours_since_last; + unsigned int read_errors = atomic_read(&rdev->read_errors); + + ktime_get_ts(&cur_time_mon); + + if (rdev->last_read_error.tv_sec == 0 && + rdev->last_read_error.tv_nsec == 0) { + /* first time we've seen a read error */ + rdev->last_read_error = cur_time_mon; + return; + } + + hours_since_last = (cur_time_mon.tv_sec - + rdev->last_read_error.tv_sec) / 3600; + + rdev->last_read_error = cur_time_mon; + + /* + * if hours_since_last is > the number of bits in read_errors + * just set read errors to 0. We do this to avoid + * overflowing the shift of read_errors by hours_since_last. 
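+	 * For example: if read_errors was 40 and three full hours have passed
+	 * since the last recorded error, the stored count becomes 40 >> 3 == 5.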
+ */ + if (hours_since_last >= 8 * sizeof(read_errors)) + atomic_set(&rdev->read_errors, 0); + else + atomic_set(&rdev->read_errors, read_errors >> hours_since_last); +} + +static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, + int sectors, struct page *page, int rw) +{ + sector_t first_bad; + int bad_sectors; + + if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) + && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) + return -1; + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == WRITE) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array synchronising. + */ + +static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) +{ + int sect = 0; /* Offset from r10_bio->sector */ + int sectors = r10_bio->sectors; + struct md_rdev*rdev; + int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + int d = r10_bio->devs[r10_bio->read_slot].devnum; + + /* still own a reference to this rdev, so it cannot + * have been cleared recently. + */ + rdev = conf->mirrors[d].rdev; + + if (test_bit(Faulty, &rdev->flags)) + /* drive has already been failed, just ignore any + more fix_read_error() attempts */ + return; + + check_decay_read_errors(mddev, rdev); + atomic_inc(&rdev->read_errors); + if (atomic_read(&rdev->read_errors) > max_read_errors) { + char b[BDEVNAME_SIZE]; + bdevname(rdev->bdev, b); + + printk(KERN_NOTICE + "md/raid10:%s: %s: Raid device exceeded " + "read_error threshold [cur %d:max %d]\n", + mdname(mddev), b, + atomic_read(&rdev->read_errors), max_read_errors); + printk(KERN_NOTICE + "md/raid10:%s: %s: Failing raid device\n", + mdname(mddev), b); + md_error(mddev, conf->mirrors[d].rdev); + r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; + return; + } + + while(sectors) { + int s = sectors; + int sl = r10_bio->read_slot; + int success = 0; + int start; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rcu_read_lock(); + do { + sector_t first_bad; + int bad_sectors; + + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + !test_bit(Unmerged, &rdev->flags) && + test_bit(In_sync, &rdev->flags) && + is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, + &first_bad, &bad_sectors) == 0) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + success = sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, + conf->tmppage, READ, false); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + if (success) + break; + } + sl++; + if (sl == conf->copies) + sl = 0; + } while (!success && sl != r10_bio->read_slot); + rcu_read_unlock(); + + if (!success) { + /* Cannot read from anywhere, just mark the block + * as bad on the first device to discourage future + * reads. 
+ */ + int dn = r10_bio->devs[r10_bio->read_slot].devnum; + rdev = conf->mirrors[dn].rdev; + + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[r10_bio->read_slot].addr + + sect, + s, 0)) { + md_error(mddev, rdev); + r10_bio->devs[r10_bio->read_slot].bio + = IO_BLOCKED; + } + break; + } + + start = sl; + /* write it back and re-read */ + rcu_read_lock(); + while (sl != r10_bio->read_slot) { + char b[BDEVNAME_SIZE]; + + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (!rdev || + test_bit(Unmerged, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + continue; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s, conf->tmppage, WRITE) + == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: read correction " + "write failed" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + } + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + sl = start; + while (sl != r10_bio->read_slot) { + char b[BDEVNAME_SIZE]; + + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (!rdev || + !test_bit(In_sync, &rdev->flags)) + continue; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + switch (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s, conf->tmppage, + READ)) { + case 0: + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: unable to read back " + "corrected sectors" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + break; + case 1: + printk(KERN_INFO + "md/raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + atomic_add(s, &rdev->corrected_errors); + } + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + rcu_read_unlock(); + + sectors -= s; + sect += s; + } +} + +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion *)bio->bi_private); +} + +static int submit_bio_wait(int rw, struct bio *bio) +{ + struct completion event; + rw |= REQ_SYNC; + + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio->bi_flags); +} + +static int narrow_write_error(struct r10bio *r10_bio, int i) +{ + struct bio *bio = r10_bio->master_bio; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; + /* bio has the data to be written to slot 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this. + * + * We currently own a reference to the rdev. 
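+	 *
+	 * Worked example (figures illustrative): with badblocks.shift == 3 the
+	 * block size is 8 sectors, so a 16-sector write that starts 4 sectors
+	 * into a block is re-issued as three trimmed clones of 4, 8 and 4
+	 * sectors, so that no clone crosses a bad-block boundary.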
+ */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r10_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = 1 << rdev->badblocks.shift; + sector = r10_bio->sector; + sectors = ((r10_bio->sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + while (sect_to_write) { + struct bio *wbio; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors' */ + wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(wbio, sector - bio->bi_sector, sectors); + wbio->bi_sector = (r10_bio->devs[i].addr+ + rdev->data_offset+ + (sector - r10_bio->sector)); + wbio->bi_bdev = rdev->bdev; + if (submit_bio_wait(WRITE, wbio) == 0) + /* Failure! */ + ok = rdev_set_badblocks(rdev, sector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + +static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) +{ + int slot = r10_bio->read_slot; + struct bio *bio; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev = r10_bio->devs[slot].rdev; + char b[BDEVNAME_SIZE]; + unsigned long do_sync; + int max_sectors; + + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen. + */ + bio = r10_bio->devs[slot].bio; + bdevname(bio->bi_bdev, b); + bio_put(bio); + r10_bio->devs[slot].bio = NULL; + + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); + } else + r10_bio->devs[slot].bio = IO_BLOCKED; + + rdev_dec_pending(rdev, mddev); + +read_more: + rdev = read_balance(conf, r10_bio, &max_sectors); + if (rdev == NULL) { + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), b, + (unsigned long long)r10_bio->sector); + raid_end_bio_io(r10_bio); + return; + } + + do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); + slot = r10_bio->read_slot; + printk_ratelimited( + KERN_ERR + "md/raid10:%s: %s: redirecting " + "sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); + bio = bio_clone_mddev(r10_bio->master_bio, + GFP_NOIO, mddev); + md_trim_bio(bio, + r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[slot].bio = bio; + r10_bio->devs[slot].rdev = rdev; + bio->bi_sector = r10_bio->devs[slot].addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_rw = READ | do_sync; + bio->bi_private = r10_bio; + bio->bi_end_io = raid10_end_read_request; + if (max_sectors < r10_bio->sectors) { + /* Drat - have to split this up more */ + struct bio *mbio = r10_bio->master_bio; + int sectors_handled = + r10_bio->sector + max_sectors + - mbio->bi_sector; + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (mbio->bi_phys_segments == 0) + mbio->bi_phys_segments = 2; + else + mbio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + generic_make_request(bio); + + r10_bio = mempool_alloc(conf->r10bio_pool, + GFP_NOIO); + r10_bio->master_bio = mbio; + r10_bio->sectors = (mbio->bi_size >> 9) + - sectors_handled; + r10_bio->state = 0; + set_bit(R10BIO_ReadError, + &r10_bio->state); + r10_bio->mddev = mddev; + r10_bio->sector = mbio->bi_sector + + 
sectors_handled; + + goto read_more; + } else + generic_make_request(bio); +} + +static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) +{ + /* Some sort of write request has finished and it + * succeeded in writing where we thought there was a + * bad block. So forget the bad block. + * Or possibly if failed and we need to record + * a bad block. + */ + int m; + struct md_rdev *rdev; + + if (test_bit(R10BIO_IsSync, &r10_bio->state) || + test_bit(R10BIO_IsRecover, &r10_bio->state)) { + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + rdev = conf->mirrors[dev].rdev; + if (r10_bio->devs[m].bio == NULL) + continue; + if (test_bit(BIO_UPTODATE, + &r10_bio->devs[m].bio->bi_flags)) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); + } + rdev = conf->mirrors[dev].replacement; + if (r10_bio->devs[m].repl_bio == NULL) + continue; + if (test_bit(BIO_UPTODATE, + &r10_bio->devs[m].repl_bio->bi_flags)) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); + } + } + put_buf(r10_bio); + } else { + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + struct bio *bio = r10_bio->devs[m].bio; + rdev = conf->mirrors[dev].rdev; + if (bio == IO_MADE_GOOD) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + rdev_dec_pending(rdev, conf->mddev); + } else if (bio != NULL && + !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (!narrow_write_error(r10_bio, m)) { + md_error(conf->mddev, rdev); + set_bit(R10BIO_Degraded, + &r10_bio->state); + } + rdev_dec_pending(rdev, conf->mddev); + } + bio = r10_bio->devs[m].repl_bio; + rdev = conf->mirrors[dev].replacement; + if (rdev && bio == IO_MADE_GOOD) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors); + rdev_dec_pending(rdev, conf->mddev); + } + } + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } +} + +static void raid10d(struct mddev *mddev) +{ + struct r10bio *r10_bio; + unsigned long flags; + struct r10conf *conf = mddev->private; + struct list_head *head = &conf->retry_list; + struct blk_plug plug; + + md_check_recovery(mddev); + + blk_start_plug(&plug); + for (;;) { + + flush_pending_writes(conf); + + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); + break; + } + r10_bio = list_entry(head->prev, struct r10bio, retry_list); + list_del(head->prev); + conf->nr_queued--; + spin_unlock_irqrestore(&conf->device_lock, flags); + + mddev = r10_bio->mddev; + conf = mddev->private; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + handle_write_completed(conf, r10_bio); + else if (test_bit(R10BIO_IsSync, &r10_bio->state)) + sync_request_write(mddev, r10_bio); + else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) + recovery_request_write(mddev, r10_bio); + else if (test_bit(R10BIO_ReadError, &r10_bio->state)) + handle_read_error(mddev, r10_bio); + else { + /* just a partial read to be scheduled from a + * separate context + */ + int slot = r10_bio->read_slot; + generic_make_request(r10_bio->devs[slot].bio); + } + + cond_resched(); + if (mddev->flags & 
~(1<<MD_CHANGE_PENDING)) + md_check_recovery(mddev); + } + blk_finish_plug(&plug); +} + + +static int init_resync(struct r10conf *conf) +{ + int buffs; + int i; + + buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; + BUG_ON(conf->r10buf_pool); + conf->have_replacement = 0; + for (i = 0; i < conf->raid_disks; i++) + if (conf->mirrors[i].replacement) + conf->have_replacement = 1; + conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); + if (!conf->r10buf_pool) + return -ENOMEM; + conf->next_resync = 0; + return 0; +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * + * This is achieved by tracking pending requests and a 'barrier' concept + * that can be installed to exclude normal IO requests. + * + * Resync and recovery are handled very differently. + * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. + * + * For resync, we iterate over virtual addresses, read all copies, + * and update if there are differences. If only one copy is live, + * skip it. + * For recovery, we iterate over physical addresses, read a good + * value for each non-in_sync drive, and over-write. + * + * So, for recovery we may have several outstanding complex requests for a + * given address, one for each out-of-sync device. We model this by allocating + * a number of r10_bio structures, one for each out-of-sync device. + * As we setup these structures, we collect all bio's together into a list + * which we then process collectively to add pages, and then process again + * to pass to generic_make_request. + * + * The r10_bio structures are linked using a borrowed master_bio pointer. + * This link is counted in ->remaining. When the r10_bio that points to NULL + * has its remaining count decremented to 0, the whole complex operation + * is complete. + * + */ + +static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, + int *skipped, int go_faster) +{ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + struct bio *biolist = NULL, *bio; + sector_t max_sector, nr_sectors; + int i; + int max_sync; + sector_t sync_blocks; + sector_t sectors_skipped = 0; + int chunks_skipped = 0; + + if (!conf->r10buf_pool) + if (init_resync(conf)) + return 0; + + skipped: + max_sector = mddev->dev_sectors; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_sector = mddev->resync_max_sectors; + if (sector_nr >= max_sector) { + /* If we aborted, we need to abort the + * sync on the 'current' bitmap chucks (there can + * be several when recovering multiple devices). + * as we may have started syncing it but not finished. + * We can find the current address in + * mddev->curr_resync, but for recovery, + * we need to convert that to several + * virtual addresses. + */ + if (mddev->curr_resync < max_sector) { /* aborted */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else for (i=0; i<conf->raid_disks; i++) { + sector_t sect = + raid10_find_virt(conf, mddev->curr_resync, i); + bitmap_end_sync(mddev->bitmap, sect, + &sync_blocks, 1); + } + } else { + /* completed sync */ + if ((!mddev->bitmap || conf->fullsync) + && conf->have_replacement + && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* Completed a full sync so the replacements + * are now fully recovered. 
+ */ + for (i = 0; i < conf->raid_disks; i++) + if (conf->mirrors[i].replacement) + conf->mirrors[i].replacement + ->recovery_offset + = MaxSector; + } + conf->fullsync = 0; + } + bitmap_close_sync(mddev->bitmap); + close_sync(conf); + *skipped = 1; + return sectors_skipped; + } + if (chunks_skipped >= conf->raid_disks) { + /* if there has been nothing to do on any drive, + * then there is nothing to do at all.. + */ + *skipped = 1; + return (max_sector - sector_nr) + sectors_skipped; + } + + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ + + /* make sure whole request will fit in a chunk - if chunks + * are meaningful + */ + if (conf->near_copies < conf->raid_disks && + max_sector > (sector_nr | conf->chunk_mask)) + max_sector = (sector_nr | conf->chunk_mask) + 1; + /* + * If there is non-resync activity waiting for us then + * put in a delay to throttle resync. + */ + if (!go_faster && conf->nr_waiting) + msleep_interruptible(1000); + + /* Again, very different code for resync and recovery. + * Both must result in an r10bio with a list of bios that + * have bi_end_io, bi_sector, bi_bdev set, + * and bi_private set to the r10bio. + * For recovery, we may actually create several r10bios + * with 2 bios in each, that correspond to the bios in the main one. + * In this case, the subordinate r10bios link back through a + * borrowed master_bio pointer, and the counter in the master + * includes a ref from each subordinate. + */ + /* First, we decide what to do and set ->bi_end_io + * To end_sync_read if we want to read, and + * end_sync_write if we will want to write. + */ + + max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); + if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* recovery... the complicated one */ + int j; + r10_bio = NULL; + + for (i=0 ; i<conf->raid_disks; i++) { + int still_degraded; + struct r10bio *rb2; + sector_t sect; + int must_sync; + int any_working; + struct mirror_info *mirror = &conf->mirrors[i]; + + if ((mirror->rdev == NULL || + test_bit(In_sync, &mirror->rdev->flags)) + && + (mirror->replacement == NULL || + test_bit(Faulty, + &mirror->replacement->flags))) + continue; + + still_degraded = 0; + /* want to reconstruct this device */ + rb2 = r10_bio; + sect = raid10_find_virt(conf, sector_nr, i); + if (sect >= mddev->resync_max_sectors) { + /* last stripe is not complete - don't + * try to recover this sector. 
+ */ + continue; + } + /* Unless we are doing a full sync, or a replacement + * we only need to recover the block if it is set in + * the bitmap + */ + must_sync = bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, 1); + if (sync_blocks < max_sync) + max_sync = sync_blocks; + if (!must_sync && + mirror->replacement == NULL && + !conf->fullsync) { + /* yep, skip the sync_blocks here, but don't assume + * that there will never be anything to do here + */ + chunks_skipped = -1; + continue; + } + + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + raise_barrier(conf, rb2 != NULL); + atomic_set(&r10_bio->remaining, 0); + + r10_bio->master_bio = (struct bio*)rb2; + if (rb2) + atomic_inc(&rb2->remaining); + r10_bio->mddev = mddev; + set_bit(R10BIO_IsRecover, &r10_bio->state); + r10_bio->sector = sect; + + raid10_find_phys(conf, r10_bio); + + /* Need to check if the array will still be + * degraded + */ + for (j=0; j<conf->raid_disks; j++) + if (conf->mirrors[j].rdev == NULL || + test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { + still_degraded = 1; + break; + } + + must_sync = bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, still_degraded); + + any_working = 0; + for (j=0; j<conf->copies;j++) { + int k; + int d = r10_bio->devs[j].devnum; + sector_t from_addr, to_addr; + struct md_rdev *rdev; + sector_t sector, first_bad; + int bad_sectors; + if (!conf->mirrors[d].rdev || + !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) + continue; + /* This is where we read from */ + any_working = 1; + rdev = conf->mirrors[d].rdev; + sector = r10_bio->devs[j].addr; + + if (is_badblock(rdev, sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector + - first_bad); + if (max_sync > bad_sectors) + max_sync = bad_sectors; + continue; + } + } + bio = r10_bio->devs[0].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_read; + bio->bi_rw = READ; + from_addr = r10_bio->devs[j].addr; + bio->bi_sector = from_addr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&rdev->nr_pending); + /* and we write to 'i' (if not in_sync) */ + + for (k=0; k<conf->copies; k++) + if (r10_bio->devs[k].devnum == i) + break; + BUG_ON(k == conf->copies); + to_addr = r10_bio->devs[k].addr; + r10_bio->devs[0].devnum = d; + r10_bio->devs[0].addr = from_addr; + r10_bio->devs[1].devnum = i; + r10_bio->devs[1].addr = to_addr; + + rdev = mirror->rdev; + if (!test_bit(In_sync, &rdev->flags)) { + bio = r10_bio->devs[1].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_sector = to_addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&r10_bio->remaining); + } else + r10_bio->devs[1].bio->bi_end_io = NULL; + + /* and maybe write to replacement */ + bio = r10_bio->devs[1].repl_bio; + if (bio) + bio->bi_end_io = NULL; + rdev = mirror->replacement; + /* Note: if rdev != NULL, then bio + * cannot be NULL as r10buf_pool_alloc will + * have allocated it. + * So the second test here is pointless. + * But it keeps semantic-checkers happy, and + * this comment keeps human reviewers + * happy. 
+ */ + if (rdev == NULL || bio == NULL || + test_bit(Faulty, &rdev->flags)) + break; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_sector = to_addr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&r10_bio->remaining); + break; + } + if (j == conf->copies) { + /* Cannot recover, so abort the recovery or + * record a bad block */ + put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); + r10_bio = rb2; + if (any_working) { + /* problem is that there are bad blocks + * on other device(s) + */ + int k; + for (k = 0; k < conf->copies; k++) + if (r10_bio->devs[k].devnum == i) + break; + if (!test_bit(In_sync, + &mirror->rdev->flags) + && !rdev_set_badblocks( + mirror->rdev, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; + if (mirror->replacement && + !rdev_set_badblocks( + mirror->replacement, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; + } + if (!any_working) { + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) + printk(KERN_INFO "md/raid10:%s: insufficient " + "working devices for recovery.\n", + mdname(mddev)); + mirror->recovery_disabled + = mddev->recovery_disabled; + } + break; + } + } + if (biolist == NULL) { + while (r10_bio) { + struct r10bio *rb2 = r10_bio; + r10_bio = (struct r10bio*) rb2->master_bio; + rb2->master_bio = NULL; + put_buf(rb2); + } + goto giveup; + } + } else { + /* resync. Schedule a read for every block at this virt offset */ + int count = 0; + + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + + if (!bitmap_start_sync(mddev->bitmap, sector_nr, + &sync_blocks, mddev->degraded) && + !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, + &mddev->recovery)) { + /* We can skip this block */ + *skipped = 1; + return sync_blocks + sectors_skipped; + } + if (sync_blocks < max_sync) + max_sync = sync_blocks; + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + + r10_bio->mddev = mddev; + atomic_set(&r10_bio->remaining, 0); + raise_barrier(conf, 0); + conf->next_resync = sector_nr; + + r10_bio->master_bio = NULL; + r10_bio->sector = sector_nr; + set_bit(R10BIO_IsSync, &r10_bio->state); + raid10_find_phys(conf, r10_bio); + r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; + + for (i=0; i<conf->copies; i++) { + int d = r10_bio->devs[i].devnum; + sector_t first_bad, sector; + int bad_sectors; + + if (r10_bio->devs[i].repl_bio) + r10_bio->devs[i].repl_bio->bi_end_io = NULL; + + bio = r10_bio->devs[i].bio; + bio->bi_end_io = NULL; + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (conf->mirrors[d].rdev == NULL || + test_bit(Faulty, &conf->mirrors[d].rdev->flags)) + continue; + sector = r10_bio->devs[i].addr; + if (is_badblock(conf->mirrors[d].rdev, + sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector - first_bad); + if (max_sync > bad_sectors) + max_sync = max_sync; + continue; + } + } + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_read; + bio->bi_rw = READ; + bio->bi_sector = sector + + conf->mirrors[d].rdev->data_offset; + bio->bi_bdev = conf->mirrors[d].rdev->bdev; + count++; + + if (conf->mirrors[d].replacement == NULL || + test_bit(Faulty, + &conf->mirrors[d].replacement->flags)) + continue; + + /* Need to set up for writing to the replacement */ + bio = r10_bio->devs[i].repl_bio; + 
clear_bit(BIO_UPTODATE, &bio->bi_flags); + + sector = r10_bio->devs[i].addr; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_sector = sector + + conf->mirrors[d].replacement->data_offset; + bio->bi_bdev = conf->mirrors[d].replacement->bdev; + count++; + } + + if (count < 2) { + for (i=0; i<conf->copies; i++) { + int d = r10_bio->devs[i].devnum; + if (r10_bio->devs[i].bio->bi_end_io) + rdev_dec_pending(conf->mirrors[d].rdev, + mddev); + if (r10_bio->devs[i].repl_bio && + r10_bio->devs[i].repl_bio->bi_end_io) + rdev_dec_pending( + conf->mirrors[d].replacement, + mddev); + } + put_buf(r10_bio); + biolist = NULL; + goto giveup; + } + } + + for (bio = biolist; bio ; bio=bio->bi_next) { + + bio->bi_flags &= ~(BIO_POOL_MASK - 1); + if (bio->bi_end_io) + bio->bi_flags |= 1 << BIO_UPTODATE; + bio->bi_vcnt = 0; + bio->bi_idx = 0; + bio->bi_phys_segments = 0; + bio->bi_size = 0; + } + + nr_sectors = 0; + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; + do { + struct page *page; + int len = PAGE_SIZE; + if (sector_nr + (len>>9) > max_sector) + len = (max_sector - sector_nr) << 9; + if (len == 0) + break; + for (bio= biolist ; bio ; bio=bio->bi_next) { + struct bio *bio2; + page = bio->bi_io_vec[bio->bi_vcnt].bv_page; + if (bio_add_page(bio, page, len, 0)) + continue; + + /* stop here */ + bio->bi_io_vec[bio->bi_vcnt].bv_page = page; + for (bio2 = biolist; + bio2 && bio2 != bio; + bio2 = bio2->bi_next) { + /* remove last page from this bio */ + bio2->bi_vcnt--; + bio2->bi_size -= len; + bio2->bi_flags &= ~(1<< BIO_SEG_VALID); + } + goto bio_full; + } + nr_sectors += len>>9; + sector_nr += len>>9; + } while (biolist->bi_vcnt < RESYNC_PAGES); + bio_full: + r10_bio->sectors = nr_sectors; + + while (biolist) { + bio = biolist; + biolist = biolist->bi_next; + + bio->bi_next = NULL; + r10_bio = bio->bi_private; + r10_bio->sectors = nr_sectors; + + if (bio->bi_end_io == end_sync_read) { + md_sync_acct(bio->bi_bdev, nr_sectors); + generic_make_request(bio); + } + } + + if (sectors_skipped) + /* pretend they weren't skipped, it makes + * no important difference in this case + */ + md_done_sync(mddev, sectors_skipped, 1); + + return sectors_skipped + nr_sectors; + giveup: + /* There is nowhere to write, so all non-sync + * drives must be failed or in resync, all drives + * have a bad block, so try the next chunk... 
+ */ + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; + + sectors_skipped += (max_sector - sector_nr); + chunks_skipped ++; + sector_nr = max_sector; + goto skipped; +} + +static sector_t +raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + sector_t size; + struct r10conf *conf = mddev->private; + + if (!raid_disks) + raid_disks = conf->raid_disks; + if (!sectors) + sectors = conf->dev_sectors; + + size = sectors >> conf->chunk_shift; + sector_div(size, conf->far_copies); + size = size * raid_disks; + sector_div(size, conf->near_copies); + + return size << conf->chunk_shift; +} + +static void calc_sectors(struct r10conf *conf, sector_t size) +{ + /* Calculate the number of sectors-per-device that will + * actually be used, and set conf->dev_sectors and + * conf->stride + */ + + size = size >> conf->chunk_shift; + sector_div(size, conf->far_copies); + size = size * conf->raid_disks; + sector_div(size, conf->near_copies); + /* 'size' is now the number of chunks in the array */ + /* calculate "used chunks per device" */ + size = size * conf->copies; + + /* We need to round up when dividing by raid_disks to + * get the stride size. + */ + size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); + + conf->dev_sectors = size << conf->chunk_shift; + + if (conf->far_offset) + conf->stride = 1 << conf->chunk_shift; + else { + sector_div(size, conf->far_copies); + conf->stride = size << conf->chunk_shift; + } +} + +static struct r10conf *setup_conf(struct mddev *mddev) +{ + struct r10conf *conf = NULL; + int nc, fc, fo; + int err = -EINVAL; + + if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || + !is_power_of_2(mddev->new_chunk_sectors)) { + printk(KERN_ERR "md/raid10:%s: chunk size must be " + "at least PAGE_SIZE(%ld) and be a power of 2.\n", + mdname(mddev), PAGE_SIZE); + goto out; + } + + nc = mddev->new_layout & 255; + fc = (mddev->new_layout >> 8) & 255; + fo = mddev->new_layout & (1<<16); + + if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || + (mddev->new_layout >> 17)) { + printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", + mdname(mddev), mddev->new_layout); + goto out; + } + + err = -ENOMEM; + conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); + if (!conf) + goto out; + + conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, + GFP_KERNEL); + if (!conf->mirrors) + goto out; + + conf->tmppage = alloc_page(GFP_KERNEL); + if (!conf->tmppage) + goto out; + + + conf->raid_disks = mddev->raid_disks; + conf->near_copies = nc; + conf->far_copies = fc; + conf->copies = nc*fc; + conf->far_offset = fo; + conf->chunk_mask = mddev->new_chunk_sectors - 1; + conf->chunk_shift = ffz(~mddev->new_chunk_sectors); + + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, + r10bio_pool_free, conf); + if (!conf->r10bio_pool) + goto out; + + calc_sectors(conf, mddev->dev_sectors); + + spin_lock_init(&conf->device_lock); + INIT_LIST_HEAD(&conf->retry_list); + + spin_lock_init(&conf->resync_lock); + init_waitqueue_head(&conf->wait_barrier); + + conf->thread = md_register_thread(raid10d, mddev, NULL); + if (!conf->thread) + goto out; + + conf->mddev = mddev; + return conf; + + out: + printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", + mdname(mddev)); + if (conf) { + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf); + } + return ERR_PTR(err); +} + +static int run(struct mddev *mddev) +{ + struct r10conf *conf; + int i, disk_idx, 
chunk_size; + struct mirror_info *disk; + struct md_rdev *rdev; + sector_t size; + + /* + * copy the already verified devices into our private RAID10 + * bookkeeping area. [whatever we allocate in run(), + * should be freed in stop()] + */ + + if (mddev->private == NULL) { + conf = setup_conf(mddev); + if (IS_ERR(conf)) + return PTR_ERR(conf); + mddev->private = conf; + } + conf = mddev->private; + if (!conf) + goto out; + + mddev->thread = conf->thread; + conf->thread = NULL; + + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + if (conf->raid_disks % conf->near_copies) + blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); + else + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks / conf->near_copies)); + + rdev_for_each(rdev, mddev) { + struct request_queue *q; + disk_idx = rdev->raid_disk; + if (disk_idx >= conf->raid_disks + || disk_idx < 0) + continue; + disk = conf->mirrors + disk_idx; + + if (test_bit(Replacement, &rdev->flags)) { + if (disk->replacement) + goto out_free_conf; + disk->replacement = rdev; + } else { + if (disk->rdev) + goto out_free_conf; + disk->rdev = rdev; + } + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; + + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + disk->head_position = 0; + } + /* need to check that every block has at least one working mirror */ + if (!enough(conf, -1)) { + printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", + mdname(mddev)); + goto out_free_conf; + } + + mddev->degraded = 0; + for (i = 0; i < conf->raid_disks; i++) { + + disk = conf->mirrors + i; + + if (!disk->rdev && disk->replacement) { + /* The replacement is all we have - use it */ + disk->rdev = disk->replacement; + disk->replacement = NULL; + clear_bit(Replacement, &disk->rdev->flags); + } + + if (!disk->rdev || + !test_bit(In_sync, &disk->rdev->flags)) { + disk->head_position = 0; + mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; + } + disk->recovery_disabled = mddev->recovery_disabled - 1; + } + + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "md/raid10:%s: not clean" + " -- starting background reconstruction\n", + mdname(mddev)); + printk(KERN_INFO + "md/raid10:%s: active with %d out of %d devices\n", + mdname(mddev), conf->raid_disks - mddev->degraded, + conf->raid_disks); + /* + * Ok, everything is just fine now + */ + mddev->dev_sectors = conf->dev_sectors; + size = raid10_size(mddev, 0, 0); + md_set_array_sectors(mddev, size); + mddev->resync_max_sectors = size; + + mddev->queue->backing_dev_info.congested_fn = raid10_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + + /* Calculate max read-ahead size. + * We need to readahead at least twice a whole stripe.... + * maybe... 
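+	 *
+	 * Illustrative figures: 4 drives, 512KiB chunks, 4KiB pages and
+	 * near_copies == 2 give stripe = 4 * 128 / 2 == 256 pages, so
+	 * ra_pages is raised to at least 512 pages (2MiB).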
+ */ + { + int stripe = conf->raid_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); + stripe /= conf->near_copies; + if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) + mddev->queue->backing_dev_info.ra_pages = 2* stripe; + } + + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + + if (md_integrity_register(mddev)) + goto out_free_conf; + + return 0; + +out_free_conf: + md_unregister_thread(&mddev->thread); + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + safe_put_page(conf->tmppage); + kfree(conf->mirrors); + kfree(conf); + mddev->private = NULL; +out: + return -EIO; +} + +static int stop(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + + raise_barrier(conf, 0); + lower_barrier(conf); + + md_unregister_thread(&mddev->thread); + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + kfree(conf->mirrors); + kfree(conf); + mddev->private = NULL; + return 0; +} + +static void raid10_quiesce(struct mddev *mddev, int state) +{ + struct r10conf *conf = mddev->private; + + switch(state) { + case 1: + raise_barrier(conf, 0); + break; + case 0: + lower_barrier(conf); + break; + } +} + +static int raid10_resize(struct mddev *mddev, sector_t sectors) +{ + /* Resize of 'far' arrays is not supported. + * For 'near' and 'offset' arrays we can set the + * number of sectors used to be an appropriate multiple + * of the chunk size. + * For 'offset', this is far_copies*chunksize. + * For 'near' the multiplier is the LCM of + * near_copies and raid_disks. + * So if far_copies > 1 && !far_offset, fail. + * Else find LCM(raid_disks, near_copy)*far_copies and + * multiply by chunk_size. Then round to this number. + * This is mostly done by raid10_size() + */ + struct r10conf *conf = mddev->private; + sector_t oldsize, size; + + if (conf->far_copies > 1 && !conf->far_offset) + return -EINVAL; + + oldsize = raid10_size(mddev, 0, 0); + size = raid10_size(mddev, sectors, 0); + md_set_array_sectors(mddev, size); + if (mddev->array_sectors > size) + return -EINVAL; + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > oldsize) { + mddev->recovery_cp = oldsize; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + calc_sectors(conf, sectors); + mddev->dev_sectors = conf->dev_sectors; + mddev->resync_max_sectors = size; + return 0; +} + +static void *raid10_takeover_raid0(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct r10conf *conf; + + if (mddev->degraded > 0) { + printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 10; + /* new layout: far_copies = 1, near_copies = 2 */ + mddev->new_layout = (1<<8) + 2; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = mddev->raid_disks; + mddev->raid_disks *= 2; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + conf = setup_conf(mddev); + if (!IS_ERR(conf)) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + rdev->new_raid_disk = rdev->raid_disk * 2; + conf->barrier = 1; + } + + return conf; +} + +static void *raid10_takeover(struct mddev *mddev) +{ + struct r0conf *raid0_conf; + + /* raid10 can take over: + * raid0 - providing it has only two drives + */ + if (mddev->level == 0) { + /* for raid0 takeover only one zone is supported */ + raid0_conf = mddev->private; + if 
(raid0_conf->nr_strip_zones > 1) { + printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" + " with more than one zone.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + return raid10_takeover_raid0(mddev); + } + return ERR_PTR(-EINVAL); +} + +static struct md_personality raid10_personality = +{ + .name = "raid10", + .level = 10, + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid10_add_disk, + .hot_remove_disk= raid10_remove_disk, + .spare_active = raid10_spare_active, + .sync_request = sync_request, + .quiesce = raid10_quiesce, + .size = raid10_size, + .resize = raid10_resize, + .takeover = raid10_takeover, +}; + +static int __init raid_init(void) +{ + return register_md_personality(&raid10_personality); +} + +static void raid_exit(void) +{ + unregister_md_personality(&raid10_personality); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); +MODULE_ALIAS("md-personality-9"); /* RAID10 */ +MODULE_ALIAS("md-raid10"); +MODULE_ALIAS("md-level-10"); + +module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 00000000..7c615613 --- /dev/null +++ b/drivers/md/raid10.h @@ -0,0 +1,150 @@ +#ifndef _RAID10_H +#define _RAID10_H + +struct mirror_info { + struct md_rdev *rdev, *replacement; + sector_t head_position; + int recovery_disabled; /* matches + * mddev->recovery_disabled + * when we shouldn't try + * recovering this device. + */ +}; + +struct r10conf { + struct mddev *mddev; + struct mirror_info *mirrors; + int raid_disks; + spinlock_t device_lock; + + /* geometry */ + int near_copies; /* number of copies laid out + * raid0 style */ + int far_copies; /* number of copies laid out + * at large strides across drives + */ + int far_offset; /* far_copies are offset by 1 + * stripe instead of many + */ + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. + */ + + sector_t dev_sectors; /* temp copy of + * mddev->dev_sectors */ + + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + int pending_count; + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + int have_replacement; /* There is at least one + * replacement device. + */ + wait_queue_head_t wait_barrier; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; + struct page *tmppage; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct md_thread *thread; +}; + +/* + * this is our 'private' RAID10 bio. 
+ * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + struct mddev *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + * We sometimes need an extra bio to write to the replacement. + */ + struct { + struct bio *bio; + union { + struct bio *repl_bio; /* used for resync and + * writes */ + struct md_rdev *rdev; /* used for reads + * (read_slot >= 0) */ + }; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + +/* bits for r10bio.state */ +enum r10bio_state { + R10BIO_Uptodate, + R10BIO_IsSync, + R10BIO_IsRecover, + R10BIO_Degraded, +/* Set ReadError on bios that experience a read error + * so that raid10d knows what to do with them. + */ + R10BIO_ReadError, +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag. + */ + R10BIO_MadeGood, + R10BIO_WriteError, +}; +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c new file mode 100644 index 00000000..73a58007 --- /dev/null +++ b/drivers/md/raid5.c @@ -0,0 +1,6050 @@ +/* + * raid5.c : Multiple Devices driver for Linux + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1999, 2000 Ingo Molnar + * Copyright (C) 2002, 2003 H. Peter Anvin + * + * RAID-4/5/6 management functions. + * Thanks to Penguin Computing for making the RAID-6 development possible + * by donating a test server! + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * BITMAP UNPLUGGING: + * + * The sequencing for updating the bitmap reliably is a little + * subtle (and I got it wrong the first time) so it deserves some + * explanation. + * + * We group bitmap updates into batches. Each batch has a number. + * We may write out several batches at once, but that isn't very important. + * conf->seq_write is the number of the last batch successfully written. 
+ * conf->seq_flush is the number of the last batch that was closed to + * new additions. + * When we discover that we will need to write to any block in a stripe + * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq + * the number of the batch it will be in. This is seq_flush+1. + * When we are ready to do a write, if that batch hasn't been written yet, + * we plug the array and queue the stripe for later. + * When an unplug happens, we increment bm_flush, thus closing the current + * batch. + * When we notice that bm_flush > bm_write, we write out all pending updates + * to the bitmap, and advance bm_write to where bm_flush was. + * This may occasionally write a bit out twice, but is sure never to + * miss any bits. + */ + +#include <linux/blkdev.h> +#include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <linux/async_tx.h> +#include <linux/module.h> +#include <linux/async.h> +#include <linux/seq_file.h> +#include <linux/cpu.h> +#include <linux/slab.h> +#include <linux/ratelimit.h> +#include "md.h" +#include "raid5.h" +#include "raid0.h" +#include "bitmap.h" + +/* + * Stripe cache + */ + +#define NR_STRIPES 256 +#define STRIPE_SIZE PAGE_SIZE +#define STRIPE_SHIFT (PAGE_SHIFT - 9) +#define STRIPE_SECTORS (STRIPE_SIZE>>9) +#define IO_THRESHOLD 1 +#define BYPASS_THRESHOLD 1 +#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) +#define HASH_MASK (NR_HASH - 1) + +static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) +{ + int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; + return &conf->stripe_hashtbl[hash]; +} + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap. There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. 
+ * This function is used to determine the 'next' bio in the list, given the sector + * of the current stripe+device + */ +static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) +{ + int sectors = bio->bi_size >> 9; + if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) + return bio->bi_next; + else + return NULL; +} + +/* + * We maintain a biased count of active stripes in the bottom 16 bits of + * bi_phys_segments, and a count of processed stripes in the upper 16 bits + */ +static inline int raid5_bi_phys_segments(struct bio *bio) +{ + return bio->bi_phys_segments & 0xffff; +} + +static inline int raid5_bi_hw_segments(struct bio *bio) +{ + return (bio->bi_phys_segments >> 16) & 0xffff; +} + +static inline int raid5_dec_bi_phys_segments(struct bio *bio) +{ + --bio->bi_phys_segments; + return raid5_bi_phys_segments(bio); +} + +static inline int raid5_dec_bi_hw_segments(struct bio *bio) +{ + unsigned short val = raid5_bi_hw_segments(bio); + + --val; + bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); + return val; +} + +static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) +{ + bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); +} + +/* Find first data disk in a raid6 stripe */ +static inline int raid6_d0(struct stripe_head *sh) +{ + if (sh->ddf_layout) + /* ddf always start from first device */ + return 0; + /* md starts just after Q block */ + if (sh->qd_idx == sh->disks - 1) + return 0; + else + return sh->qd_idx + 1; +} +static inline int raid6_next_disk(int disk, int raid_disks) +{ + disk++; + return (disk < raid_disks) ? disk : 0; +} + +/* When walking through the disks in a raid5, starting at raid6_d0, + * We need to map each disk to a 'slot', where the data disks are slot + * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk + * is raid_disks-1. This help does that mapping. 
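+ * (Example, md layout with 6 devices: pd_idx == 4 and qd_idx == 5, so the
+ * walk starts at disk 0; data disks 0-3 map to slots 0-3, P maps to slot 4
+ * (== syndrome_disks) and Q to slot 5.)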
+ */ +static int raid6_idx_to_slot(int idx, struct stripe_head *sh, + int *count, int syndrome_disks) +{ + int slot = *count; + + if (sh->ddf_layout) + (*count)++; + if (idx == sh->pd_idx) + return syndrome_disks; + if (idx == sh->qd_idx) + return syndrome_disks + 1; + if (!sh->ddf_layout) + (*count)++; + return slot; +} + +static void return_io(struct bio *return_bi) +{ + struct bio *bi = return_bi; + while (bi) { + + return_bi = bi->bi_next; + bi->bi_next = NULL; + bi->bi_size = 0; + bio_endio(bi, 0); + bi = return_bi; + } +} + +static void print_raid5_conf (struct r5conf *conf); + +static int stripe_operations_active(struct stripe_head *sh) +{ + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); +} + +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +{ + if (atomic_dec_and_test(&sh->count)) { + BUG_ON(!list_empty(&sh->lru)); + BUG_ON(atomic_read(&conf->active_stripes)==0); + if (test_bit(STRIPE_HANDLE, &sh->state)) { + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + list_add_tail(&sh->lru, &conf->delayed_list); + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_DELAYED, &sh->state); + clear_bit(STRIPE_BIT_DELAY, &sh->state); + list_add_tail(&sh->lru, &conf->handle_list); + } + md_wakeup_thread(conf->mddev->thread); + } else { + BUG_ON(stripe_operations_active(sh)); + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + atomic_dec(&conf->active_stripes); + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + list_add_tail(&sh->lru, &conf->inactive_list); + wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); + } + } + } +} + +static void release_stripe(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + __release_stripe(conf, sh); + spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static inline void remove_hash(struct stripe_head *sh) +{ + pr_debug("remove_hash(), stripe %llu\n", + (unsigned long long)sh->sector); + + hlist_del_init(&sh->hash); +} + +static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) +{ + struct hlist_head *hp = stripe_hash(conf, sh->sector); + + pr_debug("insert_hash(), stripe %llu\n", + (unsigned long long)sh->sector); + + hlist_add_head(&sh->hash, hp); +} + + +/* find an idle stripe, make sure it is unhashed, and return it. 
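+ * Caller must hold conf->device_lock.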
*/ +static struct stripe_head *get_free_stripe(struct r5conf *conf) +{ + struct stripe_head *sh = NULL; + struct list_head *first; + + if (list_empty(&conf->inactive_list)) + goto out; + first = conf->inactive_list.next; + sh = list_entry(first, struct stripe_head, lru); + list_del_init(first); + remove_hash(sh); + atomic_inc(&conf->active_stripes); +out: + return sh; +} + +static void shrink_buffers(struct stripe_head *sh) +{ + struct page *p; + int i; + int num = sh->raid_conf->pool_size; + + for (i = 0; i < num ; i++) { + p = sh->dev[i].page; + if (!p) + continue; + sh->dev[i].page = NULL; + put_page(p); + } +} + +static int grow_buffers(struct stripe_head *sh) +{ + int i; + int num = sh->raid_conf->pool_size; + + for (i = 0; i < num; i++) { + struct page *page; + + if (!(page = alloc_page(GFP_KERNEL))) { + return 1; + } + sh->dev[i].page = page; + } + return 0; +} + +static void raid5_build_block(struct stripe_head *sh, int i, int previous); +static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, + struct stripe_head *sh); + +static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) +{ + struct r5conf *conf = sh->raid_conf; + int i; + + BUG_ON(atomic_read(&sh->count) != 0); + BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); + BUG_ON(stripe_operations_active(sh)); + + pr_debug("init_stripe called, stripe %llu\n", + (unsigned long long)sh->sector); + + remove_hash(sh); + + sh->generation = conf->generation - previous; + sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; + sh->sector = sector; + stripe_set_idx(sector, conf, previous, sh); + sh->state = 0; + + + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->toread || dev->read || dev->towrite || dev->written || + test_bit(R5_LOCKED, &dev->flags)) { + printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", + (unsigned long long)sh->sector, i, dev->toread, + dev->read, dev->towrite, dev->written, + test_bit(R5_LOCKED, &dev->flags)); + WARN_ON(1); + } + dev->flags = 0; + raid5_build_block(sh, i, previous); + } + insert_hash(conf, sh); +} + +static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, + short generation) +{ + struct stripe_head *sh; + struct hlist_node *hn; + + pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); + hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) + if (sh->sector == sector && sh->generation == generation) + return sh; + pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); + return NULL; +} + +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int calc_degraded(struct r5conf *conf) +{ + int degraded, degraded2; + int i; + + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->previous_raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. 
+ * If the reshape increases the number of devices, + * this is being recovered by the reshape, so + * this 'previous' section is not in_sync. + * If the number of devices is being reduced however, + * the device can only be part of the array if + * we are reverting a reshape, so this section will + * be in-sync. + */ + if (conf->raid_disks >= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (conf->raid_disks == conf->previous_raid_disks) + return degraded; + rcu_read_lock(); + degraded2 = 0; + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded2++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If reshape increases the number of devices, this + * section has already been recovered, else it + * almost certainly hasn't. + */ + if (conf->raid_disks <= conf->previous_raid_disks) + degraded2++; + } + rcu_read_unlock(); + if (degraded2 > degraded) + return degraded2; + return degraded; +} + +static int has_failed(struct r5conf *conf) +{ + int degraded; + + if (conf->mddev->reshape_position == MaxSector) + return conf->mddev->degraded > conf->max_degraded; + + degraded = calc_degraded(conf); + if (degraded > conf->max_degraded) + return 1; + return 0; +} + +static struct stripe_head * +get_active_stripe(struct r5conf *conf, sector_t sector, + int previous, int noblock, int noquiesce) +{ + struct stripe_head *sh; + + pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); + + spin_lock_irq(&conf->device_lock); + + do { + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0 || noquiesce, + conf->device_lock, /* nothing */); + sh = __find_stripe(conf, sector, conf->generation - previous); + if (!sh) { + if (!conf->inactive_blocked) + sh = get_free_stripe(conf); + if (noblock && sh == NULL) + break; + if (!sh) { + conf->inactive_blocked = 1; + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list) && + (atomic_read(&conf->active_stripes) + < (conf->max_nr_stripes *3/4) + || !conf->inactive_blocked), + conf->device_lock, + ); + conf->inactive_blocked = 0; + } else + init_stripe(sh, sector, previous); + } else { + if (atomic_read(&sh->count)) { + BUG_ON(!list_empty(&sh->lru) + && !test_bit(STRIPE_EXPANDING, &sh->state)); + } else { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + if (list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)) + BUG(); + list_del_init(&sh->lru); + } + } + } while (sh == NULL); + + if (sh) + atomic_inc(&sh->count); + + spin_unlock_irq(&conf->device_lock); + return sh; +} + +static void +raid5_end_read_request(struct bio *bi, int error); +static void +raid5_end_write_request(struct bio *bi, int error); + +static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) +{ + struct r5conf *conf = sh->raid_conf; + int i, disks = sh->disks; + + might_sleep(); + + for (i = disks; i--; ) { + int rw; + int replace_only = 0; + struct bio *bi, *rbi; + struct md_rdev *rdev, *rrdev = NULL; + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { + if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) + rw = WRITE_FUA; + else + rw = WRITE; + } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + rw = READ; + else if (test_and_clear_bit(R5_WantReplace, + &sh->dev[i].flags)) { + rw = WRITE; + replace_only = 1; + } else + continue; + + bi = &sh->dev[i].req; + rbi = &sh->dev[i].rreq; /* For writing to 
replacement */ + + bi->bi_rw = rw; + rbi->bi_rw = rw; + if (rw & WRITE) { + bi->bi_end_io = raid5_end_write_request; + rbi->bi_end_io = raid5_end_write_request; + } else + bi->bi_end_io = raid5_end_read_request; + + rcu_read_lock(); + rrdev = rcu_dereference(conf->disks[i].replacement); + smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ + rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev) { + rdev = rrdev; + rrdev = NULL; + } + if (rw & WRITE) { + if (replace_only) + rdev = NULL; + if (rdev == rrdev) + /* We raced and saw duplicates */ + rrdev = NULL; + } else { + if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) + rdev = rrdev; + rrdev = NULL; + } + + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) + atomic_inc(&rdev->nr_pending); + if (rrdev && test_bit(Faulty, &rrdev->flags)) + rrdev = NULL; + if (rrdev) + atomic_inc(&rrdev->nr_pending); + rcu_read_unlock(); + + /* We have already checked bad blocks for reads. Now + * need to check for writes. We never accept write errors + * on the replacement, so we don't to check rrdev. + */ + while ((rw & WRITE) && rdev && + test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors); + if (!bad) + break; + + if (bad < 0) { + set_bit(BlockedBadBlocks, &rdev->flags); + if (!conf->mddev->external && + conf->mddev->flags) { + /* It is very unlikely, but we might + * still need to write out the + * bad block log - better give it + * a chance*/ + md_check_recovery(conf->mddev); + } + /* + * Because md_wait_for_blocked_rdev + * will dec nr_pending, we must + * increment it first. + */ + atomic_inc(&rdev->nr_pending); + md_wait_for_blocked_rdev(rdev, conf->mddev); + } else { + /* Acknowledged bad block - skip the write */ + rdev_dec_pending(rdev, conf->mddev); + rdev = NULL; + } + } + + if (rdev) { + if (s->syncing || s->expanding || s->expanded + || s->replacing) + md_sync_acct(rdev->bdev, STRIPE_SECTORS); + + set_bit(STRIPE_IO_STARTED, &sh->state); + + bi->bi_bdev = rdev->bdev; + pr_debug("%s: for %llu schedule op %ld on disc %d\n", + __func__, (unsigned long long)sh->sector, + bi->bi_rw, i); + atomic_inc(&sh->count); + bi->bi_sector = sh->sector + rdev->data_offset; + bi->bi_flags = 1 << BIO_UPTODATE; + bi->bi_idx = 0; + bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_offset = 0; + bi->bi_size = STRIPE_SIZE; + bi->bi_next = NULL; + if (rrdev) + set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); + generic_make_request(bi); + } + if (rrdev) { + if (s->syncing || s->expanding || s->expanded + || s->replacing) + md_sync_acct(rrdev->bdev, STRIPE_SECTORS); + + set_bit(STRIPE_IO_STARTED, &sh->state); + + rbi->bi_bdev = rrdev->bdev; + pr_debug("%s: for %llu schedule op %ld on " + "replacement disc %d\n", + __func__, (unsigned long long)sh->sector, + rbi->bi_rw, i); + atomic_inc(&sh->count); + rbi->bi_sector = sh->sector + rrdev->data_offset; + rbi->bi_flags = 1 << BIO_UPTODATE; + rbi->bi_idx = 0; + rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; + rbi->bi_io_vec[0].bv_offset = 0; + rbi->bi_size = STRIPE_SIZE; + rbi->bi_next = NULL; + generic_make_request(rbi); + } + if (!rdev && !rrdev) { + if (rw & WRITE) + set_bit(STRIPE_DEGRADED, &sh->state); + pr_debug("skip op %ld on disc %d for sector %llu\n", + bi->bi_rw, i, (unsigned long long)sh->sector); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +} + +static struct dma_async_tx_descriptor * +async_copy_data(int frombio, 
struct bio *bio, struct page *page, + sector_t sector, struct dma_async_tx_descriptor *tx) +{ + struct bio_vec *bvl; + struct page *bio_page; + int i; + int page_offset; + struct async_submit_ctl submit; + enum async_tx_flags flags = 0; + + if (bio->bi_sector >= sector) + page_offset = (signed)(bio->bi_sector - sector) * 512; + else + page_offset = (signed)(sector - bio->bi_sector) * -512; + + if (frombio) + flags |= ASYNC_TX_FENCE; + init_async_submit(&submit, flags, tx, NULL, NULL, NULL); + + bio_for_each_segment(bvl, bio, i) { + int len = bvl->bv_len; + int clen; + int b_offset = 0; + + if (page_offset < 0) { + b_offset = -page_offset; + page_offset += b_offset; + len -= b_offset; + } + + if (len > 0 && page_offset + len > STRIPE_SIZE) + clen = STRIPE_SIZE - page_offset; + else + clen = len; + + if (clen > 0) { + b_offset += bvl->bv_offset; + bio_page = bvl->bv_page; + if (frombio) + tx = async_memcpy(page, bio_page, page_offset, + b_offset, clen, &submit); + else + tx = async_memcpy(bio_page, page, b_offset, + page_offset, clen, &submit); + } + /* chain the operations */ + submit.depend_tx = tx; + + if (clen < len) /* hit end of page */ + break; + page_offset += len; + } + + return tx; +} + +static void ops_complete_biofill(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + struct bio *return_bi = NULL; + struct r5conf *conf = sh->raid_conf; + int i; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + /* clear completed biofills */ + spin_lock_irq(&conf->device_lock); + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + /* acknowledge completion of a biofill operation */ + /* and check if we need to reply to a read request, + * new R5_Wantfill requests are held off until + * !STRIPE_BIOFILL_RUN + */ + if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { + struct bio *rbi, *rbi2; + + BUG_ON(!dev->read); + rbi = dev->read; + dev->read = NULL; + while (rbi && rbi->bi_sector < + dev->sector + STRIPE_SECTORS) { + rbi2 = r5_next_bio(rbi, dev->sector); + if (!raid5_dec_bi_phys_segments(rbi)) { + rbi->bi_next = return_bi; + return_bi = rbi; + } + rbi = rbi2; + } + } + } + spin_unlock_irq(&conf->device_lock); + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); + + return_io(return_bi); + + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void ops_run_biofill(struct stripe_head *sh) +{ + struct dma_async_tx_descriptor *tx = NULL; + struct r5conf *conf = sh->raid_conf; + struct async_submit_ctl submit; + int i; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_Wantfill, &dev->flags)) { + struct bio *rbi; + spin_lock_irq(&conf->device_lock); + dev->read = rbi = dev->toread; + dev->toread = NULL; + spin_unlock_irq(&conf->device_lock); + while (rbi && rbi->bi_sector < + dev->sector + STRIPE_SECTORS) { + tx = async_copy_data(0, rbi, dev->page, + dev->sector, tx); + rbi = r5_next_bio(rbi, dev->sector); + } + } + } + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); + async_trigger_callback(&submit); +} + +static void mark_target_uptodate(struct stripe_head *sh, int target) +{ + struct r5dev *tgt; + + if (target < 0) + return; + + tgt = &sh->dev[target]; + set_bit(R5_UPTODATE, &tgt->flags); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + clear_bit(R5_Wantcompute, &tgt->flags); +} + +static void ops_complete_compute(void *stripe_head_ref) +{ + struct 
stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + /* mark the computed target(s) as uptodate */ + mark_target_uptodate(sh, sh->ops.target); + mark_target_uptodate(sh, sh->ops.target2); + + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +/* return a pointer to the address conversion region of the scribble buffer */ +static addr_conv_t *to_addr_conv(struct stripe_head *sh, + struct raid5_percpu *percpu) +{ + return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); +} + +static struct dma_async_tx_descriptor * +ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + struct page **xor_srcs = percpu->scribble; + int target = sh->ops.target; + struct r5dev *tgt = &sh->dev[target]; + struct page *xor_dest = tgt->page; + int count = 0; + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + int i; + + pr_debug("%s: stripe %llu block: %d\n", + __func__, (unsigned long long)sh->sector, target); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + + for (i = disks; i--; ) + if (i != target) + xor_srcs[count++] = sh->dev[i].page; + + atomic_inc(&sh->count); + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, + ops_complete_compute, sh, to_addr_conv(sh, percpu)); + if (unlikely(count == 1)) + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + else + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + + return tx; +} + +/* set_syndrome_sources - populate source buffers for gen_syndrome + * @srcs - (struct page *) array of size sh->disks + * @sh - stripe_head to parse + * + * Populates srcs in proper layout order for the stripe and returns the + * 'count' of sources to be used in a call to async_gen_syndrome. The P + * destination buffer is recorded in srcs[count] and the Q destination + * is recorded in srcs[count+1]]. + */ +static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) +{ + int disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? 
disks : (disks - 2); + int d0_idx = raid6_d0(sh); + int count; + int i; + + for (i = 0; i < disks; i++) + srcs[i] = NULL; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + srcs[slot] = sh->dev[i].page; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + + return syndrome_disks; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + struct page **blocks = percpu->scribble; + int target; + int qd_idx = sh->qd_idx; + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + struct r5dev *tgt; + struct page *dest; + int i; + int count; + + if (sh->ops.target < 0) + target = sh->ops.target2; + else if (sh->ops.target2 < 0) + target = sh->ops.target; + else + /* we should only have one valid target */ + BUG(); + BUG_ON(target < 0); + pr_debug("%s: stripe %llu block: %d\n", + __func__, (unsigned long long)sh->sector, target); + + tgt = &sh->dev[target]; + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + dest = tgt->page; + + atomic_inc(&sh->count); + + if (target == qd_idx) { + count = set_syndrome_sources(blocks, sh); + blocks[count] = NULL; /* regenerating p is not necessary */ + BUG_ON(blocks[count+1] != dest); /* q should already be set */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + } else { + /* Compute any data- or p-drive using XOR */ + count = 0; + for (i = disks; i-- ; ) { + if (i == target || i == qd_idx) + continue; + blocks[count++] = sh->dev[i].page; + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); + } + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int i, count, disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + int target = sh->ops.target; + int target2 = sh->ops.target2; + struct r5dev *tgt = &sh->dev[target]; + struct r5dev *tgt2 = &sh->dev[target2]; + struct dma_async_tx_descriptor *tx; + struct page **blocks = percpu->scribble; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu block1: %d block2: %d\n", + __func__, (unsigned long long)sh->sector, target, target2); + BUG_ON(target < 0 || target2 < 0); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); + + /* we need to open-code set_syndrome_sources to handle the + * slot number conversion for 'faila' and 'failb' + */ + for (i = 0; i < disks ; i++) + blocks[i] = NULL; + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + blocks[slot] = sh->dev[i].page; + + if (i == target) + faila = slot; + if (i == target2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + + BUG_ON(faila == failb); + if (failb < faila) + swap(faila, failb); + pr_debug("%s: stripe: %llu faila: %d failb: %d\n", + __func__, (unsigned long long)sh->sector, faila, failb); + + atomic_inc(&sh->count); + + if (failb == syndrome_disks+1) { + /* Q disk is one of the missing disks */ + if (faila == syndrome_disks) { + /* Missing P+Q, just recompute */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + return async_gen_syndrome(blocks, 0, syndrome_disks+2, + STRIPE_SIZE, &submit); + } else { + struct page *dest; + int data_target; + int qd_idx = sh->qd_idx; + + /* Missing D+Q: recompute D from P, then recompute Q */ + if (target == qd_idx) + data_target = target2; + else + data_target = target; + + count = 0; + for (i = disks; i-- ; ) { + if (i == data_target || i == qd_idx) + continue; + blocks[count++] = sh->dev[i].page; + } + dest = sh->dev[data_target].page; + init_async_submit(&submit, + ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, NULL, NULL, + to_addr_conv(sh, percpu)); + tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, + &submit); + + count = set_syndrome_sources(blocks, sh); + init_async_submit(&submit, ASYNC_TX_FENCE, tx, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + return async_gen_syndrome(blocks, 0, count+2, + STRIPE_SIZE, &submit); + } + } else { + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + if (failb == syndrome_disks) { + /* We're missing D+P. */ + return async_raid6_datap_recov(syndrome_disks+2, + STRIPE_SIZE, faila, + blocks, &submit); + } else { + /* We're missing D+D. 
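+ * Recover both data blocks from the remaining data plus P and Q.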
*/ + return async_raid6_2data_recov(syndrome_disks+2, + STRIPE_SIZE, faila, failb, + blocks, &submit); + } + } +} + + +static void ops_complete_prexor(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); +} + +static struct dma_async_tx_descriptor * +ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + int disks = sh->disks; + struct page **xor_srcs = percpu->scribble; + int count = 0, pd_idx = sh->pd_idx, i; + struct async_submit_ctl submit; + + /* existing parity data subtracted */ + struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + /* Only process blocks that are known to be uptodate */ + if (test_bit(R5_Wantdrain, &dev->flags)) + xor_srcs[count++] = dev->page; + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu)); + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +{ + int disks = sh->disks; + int i; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + struct bio *chosen; + + if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { + struct bio *wbi; + + spin_lock_irq(&sh->raid_conf->device_lock); + chosen = dev->towrite; + dev->towrite = NULL; + BUG_ON(dev->written); + wbi = dev->written = chosen; + spin_unlock_irq(&sh->raid_conf->device_lock); + + while (wbi && wbi->bi_sector < + dev->sector + STRIPE_SECTORS) { + if (wbi->bi_rw & REQ_FUA) + set_bit(R5_WantFUA, &dev->flags); + tx = async_copy_data(1, wbi, dev->page, + dev->sector, tx); + wbi = r5_next_bio(wbi, dev->sector); + } + } + } + + return tx; +} + +static void ops_complete_reconstruct(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + int disks = sh->disks; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + int i; + bool fua = false; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) + fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->written || i == pd_idx || i == qd_idx) { + set_bit(R5_UPTODATE, &dev->flags); + if (fua) + set_bit(R5_WantFUA, &dev->flags); + } + } + + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } + + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void +ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + int disks = sh->disks; + struct page **xor_srcs = percpu->scribble; + struct async_submit_ctl submit; + int count = 0, pd_idx = sh->pd_idx, i; + struct page *xor_dest; + int prexor = 0; + unsigned long flags; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + /* check if prexor is active 
which means only process blocks + * that are part of a read-modify-write (written) + */ + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; + xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (dev->written) + xor_srcs[count++] = dev->page; + } + } else { + xor_dest = sh->dev[pd_idx].page; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (i != pd_idx) + xor_srcs[count++] = dev->page; + } + } + + /* 1/ if we prexor'd then the dest is reused as a source + * 2/ if we did not prexor then we are redoing the parity + * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST + * for the synchronous xor case + */ + flags = ASYNC_TX_ACK | + (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); + + atomic_inc(&sh->count); + + init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, + to_addr_conv(sh, percpu)); + if (unlikely(count == 1)) + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + else + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); +} + +static void +ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct async_submit_ctl submit; + struct page **blocks = percpu->scribble; + int count; + + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + count = set_syndrome_sources(blocks, sh); + + atomic_inc(&sh->count); + + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, + sh, to_addr_conv(sh, percpu)); + async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); +} + +static void ops_complete_check(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + sh->check_state = check_state_check_result; + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + struct page *xor_dest; + struct page **xor_srcs = percpu->scribble; + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + int count; + int i; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + count = 0; + xor_dest = sh->dev[pd_idx].page; + xor_srcs[count++] = xor_dest; + for (i = disks; i--; ) { + if (i == pd_idx || i == qd_idx) + continue; + xor_srcs[count++] = sh->dev[i].page; + } + + init_async_submit(&submit, 0, NULL, NULL, NULL, + to_addr_conv(sh, percpu)); + tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + &sh->ops.zero_sum_result, &submit); + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); + tx = async_trigger_callback(&submit); +} + +static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) +{ + struct page **srcs = percpu->scribble; + struct async_submit_ctl submit; + int count; + + pr_debug("%s: stripe %llu checkp: %d\n", __func__, + (unsigned long long)sh->sector, checkp); + + count = set_syndrome_sources(srcs, sh); + if (!checkp) + srcs[count] = NULL; + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, + sh, to_addr_conv(sh, percpu)); + async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, + &sh->ops.zero_sum_result, percpu->spare_page, &submit); +} + +static void __raid_run_ops(struct 
stripe_head *sh, unsigned long ops_request) +{ + int overlap_clear = 0, i, disks = sh->disks; + struct dma_async_tx_descriptor *tx = NULL; + struct r5conf *conf = sh->raid_conf; + int level = conf->level; + struct raid5_percpu *percpu; + unsigned long cpu; + + cpu = get_cpu(); + percpu = per_cpu_ptr(conf->percpu, cpu); + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { + ops_run_biofill(sh); + overlap_clear++; + } + + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + if (level < 6) + tx = ops_run_compute5(sh, percpu); + else { + if (sh->ops.target2 < 0 || sh->ops.target < 0) + tx = ops_run_compute6_1(sh, percpu); + else + tx = ops_run_compute6_2(sh, percpu); + } + /* terminate the chain if reconstruct is not set to be run */ + if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) + async_tx_ack(tx); + } + + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) + tx = ops_run_prexor(sh, percpu, tx); + + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx); + overlap_clear++; + } + + if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { + if (level < 6) + ops_run_reconstruct5(sh, percpu, tx); + else + ops_run_reconstruct6(sh, percpu, tx); + } + + if (test_bit(STRIPE_OP_CHECK, &ops_request)) { + if (sh->check_state == check_state_run) + ops_run_check_p(sh, percpu); + else if (sh->check_state == check_state_run_q) + ops_run_check_pq(sh, percpu, 0); + else if (sh->check_state == check_state_run_pq) + ops_run_check_pq(sh, percpu, 1); + else + BUG(); + } + + if (overlap_clear) + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&sh->raid_conf->wait_for_overlap); + } + put_cpu(); +} + +#ifdef CONFIG_MULTICORE_RAID456 +static void async_run_ops(void *param, async_cookie_t cookie) +{ + struct stripe_head *sh = param; + unsigned long ops_request = sh->ops.request; + + clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); + wake_up(&sh->ops.wait_for_ops); + + __raid_run_ops(sh, ops_request); + release_stripe(sh); +} + +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +{ + /* since handle_stripe can be called outside of raid5d context + * we need to ensure sh->ops.request is de-staged before another + * request arrives + */ + wait_event(sh->ops.wait_for_ops, + !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); + sh->ops.request = ops_request; + + atomic_inc(&sh->count); + async_schedule(async_run_ops, sh); +} +#else +#define raid_run_ops __raid_run_ops +#endif + +static int grow_one_stripe(struct r5conf *conf) +{ + struct stripe_head *sh; + sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); + if (!sh) + return 0; + + sh->raid_conf = conf; + #ifdef CONFIG_MULTICORE_RAID456 + init_waitqueue_head(&sh->ops.wait_for_ops); + #endif + + if (grow_buffers(sh)) { + shrink_buffers(sh); + kmem_cache_free(conf->slab_cache, sh); + return 0; + } + /* we just created an active stripe so... 
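+ * The initial count of 1 and the active_stripes increment below are
+ * balanced out by release_stripe(), which drops the stripe onto the
+ * inactive list ready for use.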
*/ + atomic_set(&sh->count, 1); + atomic_inc(&conf->active_stripes); + INIT_LIST_HEAD(&sh->lru); + release_stripe(sh); + return 1; +} + +static int grow_stripes(struct r5conf *conf, int num) +{ + struct kmem_cache *sc; + int devs = max(conf->raid_disks, conf->previous_raid_disks); + + if (conf->mddev->gendisk) + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + else + sprintf(conf->cache_name[0], + "raid%d-%p", conf->level, conf->mddev); + sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); + + conf->active_name = 0; + sc = kmem_cache_create(conf->cache_name[conf->active_name], + sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), + 0, 0, NULL); + if (!sc) + return 1; + conf->slab_cache = sc; + conf->pool_size = devs; + while (num--) + if (!grow_one_stripe(conf)) + return 1; + return 0; +} + +/** + * scribble_len - return the required size of the scribble region + * @num - total number of disks in the array + * + * The size must be enough to contain: + * 1/ a struct page pointer for each device in the array +2 + * 2/ room to convert each entry in (1) to its corresponding dma + * (dma_map_page()) or page (page_address()) address. + * + * Note: the +2 is for the destination buffers of the ddf/raid6 case where we + * calculate over all devices (not just the data blocks), using zeros in place + * of the P and Q blocks. + */ +static size_t scribble_len(int num) +{ + size_t len; + + len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); + + return len; +} + +static int resize_stripes(struct r5conf *conf, int newsize) +{ + /* Make all the stripes able to hold 'newsize' devices. + * New slots in each stripe get 'page' set to a new page. + * + * This happens in stages: + * 1/ create a new kmem_cache and allocate the required number of + * stripe_heads. + * 2/ gather all the old stripe_heads and tranfer the pages across + * to the new stripe_heads. This will have the side effect of + * freezing the array as once all stripe_heads have been collected, + * no IO will be possible. Old stripe heads are freed once their + * pages have been transferred over, and the old kmem_cache is + * freed when all stripes are done. + * 3/ reallocate conf->disks to be suitable bigger. If this fails, + * we simple return a failre status - no need to clean anything up. + * 4/ allocate new pages for the new slots in the new stripe_heads. + * If this fails, we don't bother trying the shrink the + * stripe_heads down again, we just leave them as they are. + * As each stripe_head is processed the new one is released into + * active service. + * + * Once step2 is started, we cannot afford to wait for a write, + * so we use GFP_NOIO allocations. 
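+ * (Waiting on writeback at that point could deadlock: a write to this
+ * array would itself need a stripe, and every stripe is being held.)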
+ */ + struct stripe_head *osh, *nsh; + LIST_HEAD(newstripes); + struct disk_info *ndisks; + unsigned long cpu; + int err; + struct kmem_cache *sc; + int i; + + if (newsize <= conf->pool_size) + return 0; /* never bother to shrink */ + + err = md_allow_write(conf->mddev); + if (err) + return err; + + /* Step 1 */ + sc = kmem_cache_create(conf->cache_name[1-conf->active_name], + sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), + 0, 0, NULL); + if (!sc) + return -ENOMEM; + + for (i = conf->max_nr_stripes; i; i--) { + nsh = kmem_cache_zalloc(sc, GFP_KERNEL); + if (!nsh) + break; + + nsh->raid_conf = conf; + #ifdef CONFIG_MULTICORE_RAID456 + init_waitqueue_head(&nsh->ops.wait_for_ops); + #endif + + list_add(&nsh->lru, &newstripes); + } + if (i) { + /* didn't get enough, give up */ + while (!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del(&nsh->lru); + kmem_cache_free(sc, nsh); + } + kmem_cache_destroy(sc); + return -ENOMEM; + } + /* Step 2 - Must use GFP_NOIO now. + * OK, we have enough stripes, start collecting inactive + * stripes and copying them over + */ + list_for_each_entry(nsh, &newstripes, lru) { + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list), + conf->device_lock, + ); + osh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); + atomic_set(&nsh->count, 1); + for(i=0; i<conf->pool_size; i++) + nsh->dev[i].page = osh->dev[i].page; + for( ; i<newsize; i++) + nsh->dev[i].page = NULL; + kmem_cache_free(conf->slab_cache, osh); + } + kmem_cache_destroy(conf->slab_cache); + + /* Step 3. + * At this point, we are holding all the stripes so the array + * is completely stalled, so now is a good time to resize + * conf->disks and the scribble region + */ + ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); + if (ndisks) { + for (i=0; i<conf->raid_disks; i++) + ndisks[i] = conf->disks[i]; + kfree(conf->disks); + conf->disks = ndisks; + } else + err = -ENOMEM; + + get_online_cpus(); + conf->scribble_len = scribble_len(newsize); + for_each_present_cpu(cpu) { + struct raid5_percpu *percpu; + void *scribble; + + percpu = per_cpu_ptr(conf->percpu, cpu); + scribble = kmalloc(conf->scribble_len, GFP_NOIO); + + if (scribble) { + kfree(percpu->scribble); + percpu->scribble = scribble; + } else { + err = -ENOMEM; + break; + } + } + put_online_cpus(); + + /* Step 4, return new stripes to service */ + while(!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del_init(&nsh->lru); + + for (i=conf->raid_disks; i < newsize; i++) + if (nsh->dev[i].page == NULL) { + struct page *p = alloc_page(GFP_NOIO); + nsh->dev[i].page = p; + if (!p) + err = -ENOMEM; + } + release_stripe(nsh); + } + /* critical section pass, GFP_NOIO no longer needed */ + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + conf->pool_size = newsize; + return err; +} + +static int drop_one_stripe(struct r5conf *conf) +{ + struct stripe_head *sh; + + spin_lock_irq(&conf->device_lock); + sh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); + if (!sh) + return 0; + BUG_ON(atomic_read(&sh->count)); + shrink_buffers(sh); + kmem_cache_free(conf->slab_cache, sh); + atomic_dec(&conf->active_stripes); + return 1; +} + +static void shrink_stripes(struct r5conf *conf) +{ + while (drop_one_stripe(conf)) + ; + + if (conf->slab_cache) + kmem_cache_destroy(conf->slab_cache); + conf->slab_cache = NULL; +} + +static void 
raid5_end_read_request(struct bio * bi, int error) +{ + struct stripe_head *sh = bi->bi_private; + struct r5conf *conf = sh->raid_conf; + int disks = sh->disks, i; + int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + char b[BDEVNAME_SIZE]; + struct md_rdev *rdev = NULL; + + + for (i=0 ; i<disks; i++) + if (bi == &sh->dev[i].req) + break; + + pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", + (unsigned long long)sh->sector, i, atomic_read(&sh->count), + uptodate); + if (i == disks) { + BUG(); + return; + } + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + /* If replacement finished while this request was outstanding, + * 'replacement' might be NULL already. + * In that case it moved down to 'rdev'. + * rdev is not removed until all requests are finished. + */ + rdev = conf->disks[i].replacement; + if (!rdev) + rdev = conf->disks[i].rdev; + + if (uptodate) { + set_bit(R5_UPTODATE, &sh->dev[i].flags); + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + /* Note that this cannot happen on a + * replacement device. We just fail those on + * any error + */ + printk_ratelimited( + KERN_INFO + "md/raid:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + } + if (atomic_read(&rdev->read_errors)) + atomic_set(&rdev->read_errors, 0); + } else { + const char *bdn = bdevname(rdev->bdev, b); + int retry = 0; + + clear_bit(R5_UPTODATE, &sh->dev[i].flags); + atomic_inc(&rdev->read_errors); + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error on replacement device " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); + else if (conf->mddev->degraded >= conf->max_degraded) + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); + else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) + /* Oh, no!!! */ + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error NOT corrected!! 
" + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); + else if (atomic_read(&rdev->read_errors) + > conf->max_nr_stripes) + printk(KERN_WARNING + "md/raid:%s: Too many read errors, failing device %s.\n", + mdname(conf->mddev), bdn); + else + retry = 1; + if (retry) + set_bit(R5_ReadError, &sh->dev[i].flags); + else { + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + md_error(conf->mddev, rdev); + } + } + rdev_dec_pending(rdev, conf->mddev); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void raid5_end_write_request(struct bio *bi, int error) +{ + struct stripe_head *sh = bi->bi_private; + struct r5conf *conf = sh->raid_conf; + int disks = sh->disks, i; + struct md_rdev *uninitialized_var(rdev); + int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + sector_t first_bad; + int bad_sectors; + int replacement = 0; + + for (i = 0 ; i < disks; i++) { + if (bi == &sh->dev[i].req) { + rdev = conf->disks[i].rdev; + break; + } + if (bi == &sh->dev[i].rreq) { + rdev = conf->disks[i].replacement; + if (rdev) + replacement = 1; + else + /* rdev was removed and 'replacement' + * replaced it. rdev is not removed + * until all requests are finished. + */ + rdev = conf->disks[i].rdev; + break; + } + } + pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", + (unsigned long long)sh->sector, i, atomic_read(&sh->count), + uptodate); + if (i == disks) { + BUG(); + return; + } + + if (replacement) { + if (!uptodate) + md_error(conf->mddev, rdev); + else if (is_badblock(rdev, sh->sector, + STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); + } else { + if (!uptodate) { + set_bit(WriteErrorSeen, &rdev->flags); + set_bit(R5_WriteError, &sh->dev[i].flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } else if (is_badblock(rdev, sh->sector, + STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_MadeGood, &sh->dev[i].flags); + } + rdev_dec_pending(rdev, conf->mddev); + + if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); + +static void raid5_build_block(struct stripe_head *sh, int i, int previous) +{ + struct r5dev *dev = &sh->dev[i]; + + bio_init(&dev->req); + dev->req.bi_io_vec = &dev->vec; + dev->req.bi_vcnt++; + dev->req.bi_max_vecs++; + dev->req.bi_private = sh; + dev->vec.bv_page = dev->page; + + bio_init(&dev->rreq); + dev->rreq.bi_io_vec = &dev->rvec; + dev->rreq.bi_vcnt++; + dev->rreq.bi_max_vecs++; + dev->rreq.bi_private = sh; + dev->rvec.bv_page = dev->page; + + dev->flags = 0; + dev->sector = compute_blocknr(sh, i, previous); +} + +static void error(struct mddev *mddev, struct md_rdev *rdev) +{ + char b[BDEVNAME_SIZE]; + struct r5conf *conf = mddev->private; + unsigned long flags; + pr_debug("raid456: error called\n"); + + spin_lock_irqsave(&conf->device_lock, flags); + clear_bit(In_sync, &rdev->flags); + mddev->degraded = calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + + set_bit(Blocked, &rdev->flags); + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT + "md/raid:%s: Disk failure on %s, 
disabling device.\n" + "md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), + bdevname(rdev->bdev, b), + mdname(mddev), + conf->raid_disks - mddev->degraded); +} + +/* + * Input: a 'big' sector number, + * Output: index of the data and parity disk, and the sector # in them. + */ +static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) +{ + sector_t stripe, stripe2; + sector_t chunk_number; + unsigned int chunk_offset; + int pd_idx, qd_idx; + int ddf_layout = 0; + sector_t new_sector; + int algorithm = previous ? conf->prev_algo + : conf->algorithm; + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; + int raid_disks = previous ? conf->previous_raid_disks + : conf->raid_disks; + int data_disks = raid_disks - conf->max_degraded; + + /* First compute the information on this sector */ + + /* + * Compute the chunk number and the sector offset inside the chunk + */ + chunk_offset = sector_div(r_sector, sectors_per_chunk); + chunk_number = r_sector; + + /* + * Compute the stripe number + */ + stripe = chunk_number; + *dd_idx = sector_div(stripe, data_disks); + stripe2 = stripe; + /* + * Select the parity disk based on the user selected algorithm. + */ + pd_idx = qd_idx = -1; + switch(conf->level) { + case 4: + pd_idx = data_disks; + break; + case 5: + switch (algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + pd_idx = data_disks - sector_div(stripe2, raid_disks); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + pd_idx = sector_div(stripe2, raid_disks); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_LEFT_SYMMETRIC: + pd_idx = data_disks - sector_div(stripe2, raid_disks); + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + pd_idx = sector_div(stripe2, raid_disks); + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_PARITY_0: + pd_idx = 0; + (*dd_idx)++; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + break; + default: + BUG(); + } + break; + case 6: + + switch (algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + break; + case ALGORITHM_LEFT_SYMMETRIC: + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; + break; + + case ALGORITHM_PARITY_0: + pd_idx = 0; + qd_idx = 1; + (*dd_idx) += 2; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + qd_idx = data_disks + 1; + break; + + case ALGORITHM_ROTATING_ZERO_RESTART: + /* Exactly the same as RIGHT_ASYMMETRIC, but or + * of blocks for computing Q is different. 
+ */ + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + stripe2 += 1; + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + raid_disks - 1) % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + ddf_layout = 1; + break; + + case ALGORITHM_LEFT_ASYMMETRIC_6: + /* RAID5 left_asymmetric, with Q on last device */ + pd_idx = data_disks - sector_div(stripe2, raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_ASYMMETRIC_6: + pd_idx = sector_div(stripe2, raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_LEFT_SYMMETRIC_6: + pd_idx = data_disks - sector_div(stripe2, raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_SYMMETRIC_6: + pd_idx = sector_div(stripe2, raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_PARITY_0_6: + pd_idx = 0; + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + default: + BUG(); + } + break; + } + + if (sh) { + sh->pd_idx = pd_idx; + sh->qd_idx = qd_idx; + sh->ddf_layout = ddf_layout; + } + /* + * Finally, compute the new sector number + */ + new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; + return new_sector; +} + + +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) +{ + struct r5conf *conf = sh->raid_conf; + int raid_disks = sh->disks; + int data_disks = raid_disks - conf->max_degraded; + sector_t new_sector = sh->sector, check; + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; + int algorithm = previous ? 
conf->prev_algo + : conf->algorithm; + sector_t stripe; + int chunk_offset; + sector_t chunk_number; + int dummy1, dd_idx = i; + sector_t r_sector; + struct stripe_head sh2; + + + chunk_offset = sector_div(new_sector, sectors_per_chunk); + stripe = new_sector; + + if (i == sh->pd_idx) + return 0; + switch(conf->level) { + case 4: break; + case 5: + switch (algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC: + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0: + i -= 1; + break; + case ALGORITHM_PARITY_N: + break; + default: + BUG(); + } + break; + case 6: + if (i == sh->qd_idx) + return 0; /* It is the Q disk */ + switch (algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC: + case ALGORITHM_ROTATING_ZERO_RESTART: + case ALGORITHM_ROTATING_N_RESTART: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ + else if (i > sh->pd_idx) + i -= 2; /* D D P Q D */ + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ + else { + /* D D P Q D */ + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 2); + } + break; + case ALGORITHM_PARITY_0: + i -= 2; + break; + case ALGORITHM_PARITY_N: + break; + case ALGORITHM_ROTATING_N_CONTINUE: + /* Like left_symmetric, but P is before Q */ + if (sh->pd_idx == 0) + i--; /* P D D D Q */ + else { + /* D D Q P D */ + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 1); + } + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (i < sh->pd_idx) + i += data_disks + 1; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0_6: + i -= 1; + break; + default: + BUG(); + } + break; + } + + chunk_number = stripe * data_disks + i; + r_sector = chunk_number * sectors_per_chunk + chunk_offset; + + check = raid5_compute_sector(conf, r_sector, + previous, &dummy1, &sh2); + if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx + || sh2.qd_idx != sh->qd_idx) { + printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", + mdname(conf->mddev)); + return 0; + } + return r_sector; +} + + +static void +schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + struct r5conf *conf = sh->raid_conf; + int level = conf->level; + + if (rcw) { + /* if we are not expanding this is a proper write request, and + * there will be bios with new data to be drained into the + * stripe cache + */ + if (!expand) { + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; + + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); + if (!expand) + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } + } + if (s->locked + conf->max_degraded == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&conf->pending_full_writes); + } else { + BUG_ON(level == 6); + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_Wantcompute, 
&sh->dev[pd_idx].flags))); + + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (i == pd_idx) + continue; + + if (dev->towrite && + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } + } + } + + /* keep the parity disk(s) locked while asynchronous operations + * are in flight + */ + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + s->locked++; + + if (level == 6) { + int qd_idx = sh->qd_idx; + struct r5dev *dev = &sh->dev[qd_idx]; + + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } + + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", + __func__, (unsigned long long)sh->sector, + s->locked, s->ops_request); +} + +/* + * Each stripe/dev can have one or more bion attached. + * toread/towrite point to the first in a chain. + * The bi_next chain must be in order. + */ +static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) +{ + struct bio **bip; + struct r5conf *conf = sh->raid_conf; + int firstwrite=0; + + pr_debug("adding bi b#%llu to stripe s#%llu\n", + (unsigned long long)bi->bi_sector, + (unsigned long long)sh->sector); + + + spin_lock_irq(&conf->device_lock); + if (forwrite) { + bip = &sh->dev[dd_idx].towrite; + if (*bip == NULL && sh->dev[dd_idx].written == NULL) + firstwrite = 1; + } else + bip = &sh->dev[dd_idx].toread; + while (*bip && (*bip)->bi_sector < bi->bi_sector) { + if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) + goto overlap; + bip = & (*bip)->bi_next; + } + if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) + goto overlap; + + BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); + if (*bip) + bi->bi_next = *bip; + *bip = bi; + bi->bi_phys_segments++; + + if (forwrite) { + /* check if page is covered */ + sector_t sector = sh->dev[dd_idx].sector; + for (bi=sh->dev[dd_idx].towrite; + sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && + bi && bi->bi_sector <= sector; + bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { + if (bi->bi_sector + (bi->bi_size>>9) >= sector) + sector = bi->bi_sector + (bi->bi_size>>9); + } + if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) + set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); + } + spin_unlock_irq(&conf->device_lock); + + pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", + (unsigned long long)(*bip)->bi_sector, + (unsigned long long)sh->sector, dd_idx); + + if (conf->mddev->bitmap && firstwrite) { + bitmap_startwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0); + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + return 1; + + overlap: + set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + spin_unlock_irq(&conf->device_lock); + return 0; +} + +static void end_reshape(struct r5conf *conf); + +static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, + struct stripe_head *sh) +{ + int sectors_per_chunk = + previous ? conf->prev_chunk_sectors : conf->chunk_sectors; + int dd_idx; + int chunk_offset = sector_div(stripe, sectors_per_chunk); + int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; + + raid5_compute_sector(conf, + stripe * (disks - conf->max_degraded) + *sectors_per_chunk + chunk_offset, + previous, + &dd_idx, sh); +} + +static void +handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks, + struct bio **return_bi) +{ + int i; + for (i = disks; i--; ) { + struct bio *bi; + int bitmap_end = 0; + + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + struct md_rdev *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(In_sync, &rdev->flags)) + atomic_inc(&rdev->nr_pending); + else + rdev = NULL; + rcu_read_unlock(); + if (rdev) { + if (!rdev_set_badblocks( + rdev, + sh->sector, + STRIPE_SECTORS, 0)) + md_error(conf->mddev, rdev); + rdev_dec_pending(rdev, conf->mddev); + } + } + spin_lock_irq(&conf->device_lock); + /* fail all writes first */ + bi = sh->dev[i].towrite; + sh->dev[i].towrite = NULL; + if (bi) { + s->to_write--; + bitmap_end = 1; + } + + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + + while (bi && bi->bi_sector < + sh->dev[i].sector + STRIPE_SECTORS) { + struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (!raid5_dec_bi_phys_segments(bi)) { + md_write_end(conf->mddev); + bi->bi_next = *return_bi; + *return_bi = bi; + } + bi = nextbi; + } + /* and fail all 'written' */ + bi = sh->dev[i].written; + sh->dev[i].written = NULL; + if (bi) bitmap_end = 1; + while (bi && bi->bi_sector < + sh->dev[i].sector + STRIPE_SECTORS) { + struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (!raid5_dec_bi_phys_segments(bi)) { + md_write_end(conf->mddev); + bi->bi_next = *return_bi; + *return_bi = bi; + } + bi = bi2; + } + + /* fail any reads if this device is non-operational and + * the data has not reached the cache yet. + */ + if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && + (!test_bit(R5_Insync, &sh->dev[i].flags) || + test_bit(R5_ReadError, &sh->dev[i].flags))) { + bi = sh->dev[i].toread; + sh->dev[i].toread = NULL; + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + if (bi) s->to_read--; + while (bi && bi->bi_sector < + sh->dev[i].sector + STRIPE_SECTORS) { + struct bio *nextbi = + r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (!raid5_dec_bi_phys_segments(bi)) { + bi->bi_next = *return_bi; + *return_bi = bi; + } + bi = nextbi; + } + } + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); + /* If we were in the middle of a write the parity block might + * still be locked - so just clear all R5_LOCKED flags + */ + clear_bit(R5_LOCKED, &sh->dev[i].flags); + } + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); +} + +static void +handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s) +{ + int abort = 0; + int i; + + clear_bit(STRIPE_SYNCING, &sh->state); + s->syncing = 0; + s->replacing = 0; + /* There is nothing more to do for sync/check/repair. + * Don't even need to abort as that is handled elsewhere + * if needed, and not always wanted e.g. if there is a known + * bad block here. 
+ * For recover/replace we need to record a bad block on all + * non-sync devices, or abort the recovery + */ + if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { + /* During recovery devices cannot be removed, so + * locking and refcounting of rdevs is not needed + */ + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = conf->disks[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + rdev = conf->disks[i].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + } + if (abort) + conf->recovery_disabled = + conf->mddev->recovery_disabled; + } + md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); +} + +static int want_replace(struct stripe_head *sh, int disk_idx) +{ + struct md_rdev *rdev; + int rv = 0; + /* Doing recovery so rcu locking not required */ + rdev = sh->raid_conf->disks[disk_idx].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && (rdev->recovery_offset <= sh->sector + || rdev->mddev->recovery_cp <= sh->sector)) + rv = 1; + + return rv; +} + +/* fetch_block - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill to continue + */ +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], + &sh->dev[s->failed_num[1]] }; + + /* is the data in this block needed, and can we get it? */ + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx)) || + (s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread) || + (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && + !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || + (sh->raid_conf->level == 6 && s->failed && s->to_write))) { + /* we would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync + */ + BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); + BUG_ON(test_bit(R5_Wantread, &dev->flags)); + if ((s->uptodate == disks - 1) && + (s->failed && (disk_idx == s->failed_num[0] || + disk_idx == s->failed_num[1]))) { + /* have disk failed, and we're requested to fetch it; + * do compute it + */ + pr_debug("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, disk_idx); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + sh->ops.target2 = -1; /* no 2nd target */ + s->req_compute = 1; + /* Careful: from this point on 'uptodate' is in the eye + * of raid_run_ops which services 'compute' operations + * before writes. R5_Wantcompute flags a block that will + * be R5_UPTODATE by the time it is needed for a + * subsequent operation. 
+ */ + s->uptodate++; + return 1; + } else if (s->uptodate == disks-2 && s->failed >= 2) { + /* Computing 2-failure is *very* expensive; only + * do it if failed >= 2 + */ + int other; + for (other = disks; other--; ) { + if (other == disk_idx) + continue; + if (!test_bit(R5_UPTODATE, + &sh->dev[other].flags)) + break; + } + BUG_ON(other < 0); + pr_debug("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, + disk_idx, other); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); + set_bit(R5_Wantcompute, &sh->dev[other].flags); + sh->ops.target = disk_idx; + sh->ops.target2 = other; + s->uptodate += 2; + s->req_compute = 1; + return 1; + } else if (test_bit(R5_Insync, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", + disk_idx, s->syncing); + } + } + + return 0; +} + +/** + * handle_stripe_fill - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill(struct stripe_head *sh, + struct stripe_head_state *s, + int disks) +{ + int i; + + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write + */ + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) + for (i = disks; i--; ) + if (fetch_block(sh, s, i, disks)) + break; + set_bit(STRIPE_HANDLE, &sh->state); +} + + +/* handle_stripe_clean_event + * any written block on an uptodate or failed drive can be returned. + * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but + * never LOCKED, so we don't need to test 'failed' directly. 
+ */ +static void handle_stripe_clean_event(struct r5conf *conf, + struct stripe_head *sh, int disks, struct bio **return_bi) +{ + int i; + struct r5dev *dev; + + for (i = disks; i--; ) + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) { + /* We can return any write requests */ + struct bio *wbi, *wbi2; + int bitmap_end = 0; + pr_debug("Return write for disc %d\n", i); + spin_lock_irq(&conf->device_lock); + wbi = dev->written; + dev->written = NULL; + while (wbi && wbi->bi_sector < + dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (!raid5_dec_bi_phys_segments(wbi)) { + md_write_end(conf->mddev); + wbi->bi_next = *return_bi; + *return_bi = wbi; + } + wbi = wbi2; + } + if (dev->towrite == NULL) + bitmap_end = 1; + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), + 0); + } + } + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); +} + +static void handle_stripe_dirtying(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s, + int disks) +{ + int rmw = 0, rcw = 0, i; + if (conf->max_degraded == 2) { + /* RAID6 requires 'rcw' in current implementation + * Calculate the real rcw later - for now fake it + * look like rcw is cheaper + */ + rcw = 1; rmw = 2; + } else for (i = disks; i--; ) { + /* would I have to read this buffer for read_modify_write */ + struct r5dev *dev = &sh->dev[i]; + if ((dev->towrite || i == sh->pd_idx) && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + if (test_bit(R5_Insync, &dev->flags)) + rmw++; + else + rmw += 2*disks; /* cannot read it */ + } + /* Would I have to read this buffer for reconstruct_write */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + if (test_bit(R5_Insync, &dev->flags)) rcw++; + else + rcw += 2*disks; + } + } + pr_debug("for sector %llu, rmw=%d rcw=%d\n", + (unsigned long long)sh->sector, rmw, rcw); + set_bit(STRIPE_HANDLE, &sh->state); + if (rmw < rcw && rmw > 0) + /* prefer read-modify-write, but need to get some data */ + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if ((dev->towrite || i == sh->pd_idx) && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags)) && + test_bit(R5_Insync, &dev->flags)) { + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + pr_debug("Read_old block " + "%d for r-m-w\n", i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + if (rcw <= rmw && rcw > 0) { + /* want reconstruct write, but need to get some data */ + rcw = 0; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != sh->pd_idx && i != sh->qd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + rcw++; + if (!test_bit(R5_Insync, &dev->flags)) + continue; /* it's a failed drive */ + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + 
pr_debug("Read_old block " + "%d for Reconstruct\n", i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + } + /* now if nothing is locked, and if we have enough data, + * we can start a write request + */ + /* since handle_stripe can be called at any time we need to handle the + * case where a compute block operation has been submitted and then a + * subsequent call wants to start a write request. raid_run_ops only + * handles the case where compute block and reconstruct are requested + * simultaneously. If this is not the case then new writes need to be + * held off until the compute completes. + */ + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + schedule_reconstruction(sh, s, rcw == 0, 0); +} + +static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks) +{ + struct r5dev *dev = NULL; + + set_bit(STRIPE_HANDLE, &sh->state); + + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ + if (s->failed == 0) { + BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + s->uptodate--; + break; + } + dev = &sh->dev[s->failed_num[0]]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; + + /* either failed parity check, or recovery is happening */ + BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); + BUG_ON(s->uptodate != disks); + + set_bit(R5_LOCKED, &dev->flags); + s->locked++; + set_bit(R5_Wantwrite, &dev->flags); + + clear_bit(STRIPE_DEGRADED, &sh->state); + set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! 
*/ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + sh->ops.target2 = -1; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); + } +} + + +static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, + int disks) +{ + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + struct r5dev *dev; + + set_bit(STRIPE_HANDLE, &sh->state); + + BUG_ON(s->failed > 2); + + /* Want to check and possibly repair P and Q. + * However there could be one 'failed' device, in which + * case we can only check one of them, possibly using the + * other to generate missing data + */ + + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are < 2 failures */ + if (s->failed == s->q_failed) { + /* The only possible failed device holds Q, so it + * makes sense to check P (If anything else were failed, + * we would have used P to recreate it). + */ + sh->check_state = check_state_run; + } + if (!s->q_failed && s->failed < 2) { + /* Q is not failed, and we didn't use it to generate + * anything, so it makes sense to check it + */ + if (sh->check_state == check_state_run) + sh->check_state = check_state_run_pq; + else + sh->check_state = check_state_run_q; + } + + /* discard potentially stale zero_sum_result */ + sh->ops.zero_sum_result = 0; + + if (sh->check_state == check_state_run) { + /* async_xor_zero_sum destroys the contents of P */ + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + s->uptodate--; + } + if (sh->check_state >= check_state_run && + sh->check_state <= check_state_run_pq) { + /* async_syndrome_zero_sum preserves P and Q, so + * no need to mark them !uptodate here + */ + set_bit(STRIPE_OP_CHECK, &s->ops_request); + break; + } + + /* we have 2-disk failure */ + BUG_ON(s->failed != 2); + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; + + /* now write out any block on a failed drive, + * or P or Q if they were recomputed + */ + BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ + if (s->failed == 2) { + dev = &sh->dev[s->failed_num[1]]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (s->failed >= 1) { + dev = &sh->dev[s->failed_num[0]]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + dev = &sh->dev[pd_idx]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + dev = &sh->dev[qd_idx]; + s->locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + clear_bit(STRIPE_DEGRADED, &sh->state); + + set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + case check_state_run_q: + case check_state_run_pq: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* handle a successful check operation, if parity 
is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) { + /* both parities are correct */ + if (!s->failed) + set_bit(STRIPE_INSYNC, &sh->state); + else { + /* in contrast to the raid5 case we can validate + * parity, but still have a failure to write + * back + */ + sh->check_state = check_state_compute_result; + /* Returning at this point means that we may go + * off and bring p and/or q uptodate again so + * we make sure to check zero_sum_result again + * to verify if p or q need writeback + */ + } + } else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + int *target = &sh->ops.target; + + sh->ops.target = -1; + sh->ops.target2 = -1; + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[pd_idx].flags); + *target = pd_idx; + target = &sh->ops.target2; + s->uptodate++; + } + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[qd_idx].flags); + *target = qd_idx; + s->uptodate++; + } + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); + } +} + +static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) +{ + int i; + + /* We have read all the blocks in this stripe and now we need to + * copy some of them into a target stripe for expand. + */ + struct dma_async_tx_descriptor *tx = NULL; + clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); + for (i = 0; i < sh->disks; i++) + if (i != sh->pd_idx && i != sh->qd_idx) { + int dd_idx, j; + struct stripe_head *sh2; + struct async_submit_ctl submit; + + sector_t bn = compute_blocknr(sh, i, 1); + sector_t s = raid5_compute_sector(conf, bn, 0, + &dd_idx, NULL); + sh2 = get_active_stripe(conf, s, 0, 1, 1); + if (sh2 == NULL) + /* so far only the early blocks of this stripe + * have been requested. When later blocks + * get requested, we will try again + */ + continue; + if (!test_bit(STRIPE_EXPANDING, &sh2->state) || + test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { + /* must have already done this block */ + release_stripe(sh2); + continue; + } + + /* place all the copies on one channel */ + init_async_submit(&submit, 0, tx, NULL, NULL, NULL); + tx = async_memcpy(sh2->dev[dd_idx].page, + sh->dev[i].page, 0, 0, STRIPE_SIZE, + &submit); + + set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); + set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); + for (j = 0; j < conf->raid_disks; j++) + if (j != sh2->pd_idx && + j != sh2->qd_idx && + !test_bit(R5_Expanded, &sh2->dev[j].flags)) + break; + if (j == conf->raid_disks) { + set_bit(STRIPE_EXPAND_READY, &sh2->state); + set_bit(STRIPE_HANDLE, &sh2->state); + } + release_stripe(sh2); + + } + /* done submitting copies, wait for them to complete */ + if (tx) { + async_tx_ack(tx); + dma_wait_for_async_tx(tx); + } +} + +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. 
+ * Possible results: + * return some read requests which now have data + * return some write requests which are safely on storage + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + * + */ + +static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) +{ + struct r5conf *conf = sh->raid_conf; + int disks = sh->disks; + struct r5dev *dev; + int i; + int do_recovery = 0; + + memset(s, 0, sizeof(*s)); + + s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s->failed_num[0] = -1; + s->failed_num[1] = -1; + + /* Now to look around and see what can be done */ + rcu_read_lock(); + spin_lock_irq(&conf->device_lock); + for (i=disks; i--; ) { + struct md_rdev *rdev; + sector_t first_bad; + int bad_sectors; + int is_bad = 0; + + dev = &sh->dev[i]; + + pr_debug("check %d: state 0x%lx read %p write %p written %p\n", + i, dev->flags, + dev->toread, dev->towrite, dev->written); + /* maybe we can reply to a read + * + * new wantfill requests are only permitted while + * ops_complete_biofill is guaranteed to be inactive + */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) + set_bit(R5_Wantfill, &dev->flags); + + /* now count some things */ + if (test_bit(R5_LOCKED, &dev->flags)) + s->locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) + s->uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) { + s->compute++; + BUG_ON(s->compute > 2); + } + + if (test_bit(R5_Wantfill, &dev->flags)) + s->to_fill++; + else if (dev->toread) + s->to_read++; + if (dev->towrite) { + s->to_write++; + if (!test_bit(R5_OVERWRITE, &dev->flags)) + s->non_overwrite++; + } + if (dev->written) + s->written++; + /* Prefer to use the replacement for reads, but only + * if it is recovered enough and has no bad blocks. + */ + rdev = rcu_dereference(conf->disks[i].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && + !is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_ReadRepl, &dev->flags); + else { + if (rdev) + set_bit(R5_NeedReplace, &dev->flags); + rdev = rcu_dereference(conf->disks[i].rdev); + clear_bit(R5_ReadRepl, &dev->flags); + } + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) { + is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors); + if (s->blocked_rdev == NULL + && (test_bit(Blocked, &rdev->flags) + || is_bad < 0)) { + if (is_bad < 0) + set_bit(BlockedBadBlocks, + &rdev->flags); + s->blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } + } + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (is_bad) { + /* also not in-sync */ + if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) { + /* treat as in-sync, but with a read error + * which we can now try to correct + */ + set_bit(R5_Insync, &dev->flags); + set_bit(R5_ReadError, &dev->flags); + } + } else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + /* in sync if before recovery_offset */ + set_bit(R5_Insync, &dev->flags); + else if (test_bit(R5_UPTODATE, &dev->flags) && + test_bit(R5_Expanded, &dev->flags)) + /* If we've reshaped into here, we assume it is Insync. + * We will shortly update recovery_offset to make + * it official. 
+ */ + set_bit(R5_Insync, &dev->flags); + + if (rdev && test_bit(R5_WriteError, &dev->flags)) { + /* This flag does not apply to '.replacement' + * only to .rdev, so make sure to check that*/ + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].rdev); + if (rdev2 == rdev) + clear_bit(R5_Insync, &dev->flags); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_WriteError, &dev->flags); + } + if (rdev && test_bit(R5_MadeGood, &dev->flags)) { + /* This flag does not apply to '.replacement' + * only to .rdev, so make sure to check that*/ + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].rdev); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_MadeGood, &dev->flags); + } + if (test_bit(R5_MadeGoodRepl, &dev->flags)) { + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].replacement); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_MadeGoodRepl, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { + /* The ReadError flag will just be confusing now */ + clear_bit(R5_ReadError, &dev->flags); + clear_bit(R5_ReWrite, &dev->flags); + } + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { + if (s->failed < 2) + s->failed_num[s->failed] = i; + s->failed++; + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; + } + } + spin_unlock_irq(&conf->device_lock); + if (test_bit(STRIPE_SYNCING, &sh->state)) { + /* If there is a failed device being replaced, + * we must be recovering. + * else if we are after recovery_cp, we must be syncing + * else if MD_RECOVERY_REQUESTED is set, we also are syncing. + * else we can only be replacing + * sync and recovery both need to read all devices, and so + * use the same flag. 
+ */ + if (do_recovery || + sh->sector >= conf->mddev->recovery_cp || + test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) + s->syncing = 1; + else + s->replacing = 1; + } + rcu_read_unlock(); +} + +static void handle_stripe(struct stripe_head *sh) +{ + struct stripe_head_state s; + struct r5conf *conf = sh->raid_conf; + int i; + int prexor; + int disks = sh->disks; + struct r5dev *pdev, *qdev; + + clear_bit(STRIPE_HANDLE, &sh->state); + if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { + /* already being handled, ensure it gets handled + * again when current action finishes */ + set_bit(STRIPE_HANDLE, &sh->state); + return; + } + + if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + } + clear_bit(STRIPE_DELAYED, &sh->state); + + pr_debug("handling stripe %llu, state=%#lx cnt=%d, " + "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", + (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, + sh->check_state, sh->reconstruct_state); + + analyse_stripe(sh, &s); + + if (s.handle_bad_blocks) { + set_bit(STRIPE_HANDLE, &sh->state); + goto finish; + } + + if (unlikely(s.blocked_rdev)) { + if (s.syncing || s.expanding || s.expanded || + s.replacing || s.to_write || s.written) { + set_bit(STRIPE_HANDLE, &sh->state); + goto finish; + } + /* There is nothing for the blocked_rdev to block */ + rdev_dec_pending(s.blocked_rdev, conf->mddev); + s.blocked_rdev = NULL; + } + + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } + + pr_debug("locked=%d uptodate=%d to_read=%d" + " to_write=%d failed=%d failed_num=%d,%d\n", + s.locked, s.uptodate, s.to_read, s.to_write, s.failed, + s.failed_num[0], s.failed_num[1]); + /* check if the array has lost more than max_degraded devices and, + * if so, some requests might need to be failed. + */ + if (s.failed > conf->max_degraded) { + sh->check_state = 0; + sh->reconstruct_state = 0; + if (s.to_read+s.to_write+s.written) + handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); + if (s.syncing + s.replacing) + handle_failed_sync(conf, sh, &s); + } + + /* + * might be able to return some write requests if the parity blocks + * are safe, or on a failed drive + */ + pdev = &sh->dev[sh->pd_idx]; + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); + qdev = &sh->dev[sh->qd_idx]; + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) + || conf->level < 6; + + if (s.written && + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + && !test_bit(R5_LOCKED, &pdev->flags) + && test_bit(R5_UPTODATE, &pdev->flags)))) && + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + && !test_bit(R5_LOCKED, &qdev->flags) + && test_bit(R5_UPTODATE, &qdev->flags))))) + handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + * or to load a block that is being partially written. 
+ */ + if (s.to_read || s.non_overwrite + || (conf->level == 6 && s.to_write && s.failed) + || (s.syncing && (s.uptodate + s.compute < disks)) + || s.replacing + || s.expanding) + handle_stripe_fill(sh, &s, disks); + + /* Now we check to see if any write operations have recently + * completed + */ + prexor = 0; + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) + prexor = 1; + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { + sh->reconstruct_state = reconstruct_state_idle; + + /* All the 'written' buffers and the parity block are ready to + * be written back to disk + */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(sh->qd_idx >= 0 && + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_LOCKED, &dev->flags) && + (i == sh->pd_idx || i == sh->qd_idx || + dev->written)) { + pr_debug("Writing block %d\n", i); + set_bit(R5_Wantwrite, &dev->flags); + if (prexor) + continue; + if (!test_bit(R5_Insync, &dev->flags) || + ((i == sh->pd_idx || i == sh->qd_idx) && + s.failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + s.dec_preread_active = 1; + } + + /* Now to consider new write requests and what else, if anything + * should be read. We do not handle new writes when: + * 1/ A 'write' operation (copy+xor) is already in flight. + * 2/ A 'check' operation is in flight, as it may clobber the parity + * block. + */ + if (s.to_write && !sh->reconstruct_state && !sh->check_state) + handle_stripe_dirtying(conf, sh, &s, disks); + + /* maybe we need to check and possibly fix the parity for this stripe + * Any reads will already have been scheduled, so we just see if enough + * data is available. The parity check is held off while parity + * dependent operations are in flight. 
+ */ + if (sh->check_state || + (s.syncing && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + !test_bit(STRIPE_INSYNC, &sh->state))) { + if (conf->level == 6) + handle_parity_checks6(conf, sh, &s, disks); + else + handle_parity_checks5(conf, sh, &s, disks); + } + + if (s.replacing && s.locked == 0 + && !test_bit(STRIPE_INSYNC, &sh->state)) { + /* Write out to replacement devices where possible */ + for (i = 0; i < conf->raid_disks; i++) + if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && + test_bit(R5_NeedReplace, &sh->dev[i].flags)) { + set_bit(R5_WantReplace, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + set_bit(STRIPE_INSYNC, &sh->state); + } + if ((s.syncing || s.replacing) && s.locked == 0 && + test_bit(STRIPE_INSYNC, &sh->state)) { + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + clear_bit(STRIPE_SYNCING, &sh->state); + } + + /* If the failed drives are just a ReadError, then we might need + * to progress the repair/check process + */ + if (s.failed <= conf->max_degraded && !conf->mddev->ro) + for (i = 0; i < s.failed; i++) { + struct r5dev *dev = &sh->dev[s.failed_num[i]]; + if (test_bit(R5_ReadError, &dev->flags) + && !test_bit(R5_LOCKED, &dev->flags) + && test_bit(R5_UPTODATE, &dev->flags) + ) { + if (!test_bit(R5_ReWrite, &dev->flags)) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_ReWrite, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; + } else { + /* let's read it back */ + set_bit(R5_Wantread, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; + } + } + } + + + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh_src + = get_active_stripe(conf, sh->sector, 1, 1, 1); + if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { + /* sh cannot be written until sh_src has been read. 
+ * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh_src->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh_src); + goto finish; + } + if (sh_src) + release_stripe(sh_src); + + sh->reconstruct_state = reconstruct_state_idle; + clear_bit(STRIPE_EXPANDING, &sh->state); + for (i = conf->raid_disks; i--; ) { + set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + } + + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !sh->reconstruct_state) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + stripe_set_idx(sh->sector, conf, 0, sh); + schedule_reconstruction(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { + clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); + wake_up(&conf->wait_for_overlap); + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + } + + if (s.expanding && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) + handle_stripe_expansion(conf, sh); + +finish: + /* wait for this device to become unblocked */ + if (conf->mddev->external && unlikely(s.blocked_rdev)) + md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); + + if (s.handle_bad_blocks) + for (i = disks; i--; ) { + struct md_rdev *rdev; + struct r5dev *dev = &sh->dev[i]; + if (test_and_clear_bit(R5_WriteError, &dev->flags)) { + /* We own a safe reference to the rdev */ + rdev = conf->disks[i].rdev; + if (!rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + md_error(conf->mddev, rdev); + rdev_dec_pending(rdev, conf->mddev); + } + if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { + rdev = conf->disks[i].rdev; + rdev_clear_badblocks(rdev, sh->sector, + STRIPE_SECTORS); + rdev_dec_pending(rdev, conf->mddev); + } + if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { + rdev = conf->disks[i].replacement; + if (!rdev) + /* rdev have been moved down */ + rdev = conf->disks[i].rdev; + rdev_clear_badblocks(rdev, sh->sector, + STRIPE_SECTORS); + rdev_dec_pending(rdev, conf->mddev); + } + } + + if (s.ops_request) + raid_run_ops(sh, s.ops_request); + + ops_run_io(sh, &s); + + if (s.dec_preread_active) { + /* We delay this until after ops_run_io so that if make_request + * is waiting on a flush, it won't continue until the writes + * have actually been submitted. 
+ */ + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + + return_io(s.return_bi); + + clear_bit_unlock(STRIPE_ACTIVE, &sh->state); +} + +static void raid5_activate_delayed(struct r5conf *conf) +{ + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { + while (!list_empty(&conf->delayed_list)) { + struct list_head *l = conf->delayed_list.next; + struct stripe_head *sh; + sh = list_entry(l, struct stripe_head, lru); + list_del_init(l); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + list_add_tail(&sh->lru, &conf->hold_list); + } + } +} + +static void activate_bit_delay(struct r5conf *conf) +{ + /* device_lock is held */ + struct list_head head; + list_add(&head, &conf->bitmap_list); + list_del_init(&conf->bitmap_list); + while (!list_empty(&head)) { + struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + list_del_init(&sh->lru); + atomic_inc(&sh->count); + __release_stripe(conf, sh); + } +} + +int md_raid5_congested(struct mddev *mddev, int bits) +{ + struct r5conf *conf = mddev->private; + + /* No difference between reads and writes. Just check + * how busy the stripe_cache is + */ + + if (conf->inactive_blocked) + return 1; + if (conf->quiesce) + return 1; + if (list_empty_careful(&conf->inactive_list)) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(md_raid5_congested); + +static int raid5_congested(void *data, int bits) +{ + struct mddev *mddev = data; + + return mddev_congested(mddev, bits) || + md_raid5_congested(mddev, bits); +} + +/* We want read requests to align with chunks where possible, + * but write requests don't need to. + */ +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int max; + unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int bio_sectors = bvm->bi_size >> 9; + + if ((bvm->bi_rw & 1) == WRITE) + return biovec->bv_len; /* always allow writes to be mergeable */ + + if (mddev->new_chunk_sectors < mddev->chunk_sectors) + chunk_sectors = mddev->new_chunk_sectors; + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (max < 0) max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + else + return max; +} + + +static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) +{ + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int bio_sectors = bio->bi_size >> 9; + + if (mddev->new_chunk_sectors < mddev->chunk_sectors) + chunk_sectors = mddev->new_chunk_sectors; + return chunk_sectors >= + ((sector & (chunk_sectors - 1)) + bio_sectors); +} + +/* + * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) + * later sampled by raid5d. 
+ */ +static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) +{ + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + bi->bi_next = conf->retry_read_aligned_list; + conf->retry_read_aligned_list = bi; + + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(conf->mddev->thread); +} + + +static struct bio *remove_bio_from_retry(struct r5conf *conf) +{ + struct bio *bi; + + bi = conf->retry_read_aligned; + if (bi) { + conf->retry_read_aligned = NULL; + return bi; + } + bi = conf->retry_read_aligned_list; + if(bi) { + conf->retry_read_aligned_list = bi->bi_next; + bi->bi_next = NULL; + /* + * this sets the active strip count to 1 and the processed + * strip count to zero (upper 8 bits) + */ + bi->bi_phys_segments = 1; /* biased count of active stripes */ + } + + return bi; +} + + +/* + * The "raid5_align_endio" should check if the read succeeded and if it + * did, call bio_endio on the original bio (having bio_put the new bio + * first). + * If the read failed.. + */ +static void raid5_align_endio(struct bio *bi, int error) +{ + struct bio* raid_bi = bi->bi_private; + struct mddev *mddev; + struct r5conf *conf; + int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + struct md_rdev *rdev; + + bio_put(bi); + + rdev = (void*)raid_bi->bi_next; + raid_bi->bi_next = NULL; + mddev = rdev->mddev; + conf = mddev->private; + + rdev_dec_pending(rdev, conf->mddev); + + if (!error && uptodate) { + bio_endio(raid_bi, 0); + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_stripe); + return; + } + + + pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); + + add_bio_to_retry(raid_bi, conf); +} + +static int bio_fits_rdev(struct bio *bi) +{ + struct request_queue *q = bdev_get_queue(bi->bi_bdev); + + if ((bi->bi_size>>9) > queue_max_sectors(q)) + return 0; + blk_recount_segments(q, bi); + if (bi->bi_phys_segments > queue_max_segments(q)) + return 0; + + if (q->merge_bvec_fn) + /* it's too hard to apply the merge_bvec_fn at this stage, + * just just give up + */ + return 0; + + return 1; +} + + +static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) +{ + struct r5conf *conf = mddev->private; + int dd_idx; + struct bio* align_bi; + struct md_rdev *rdev; + sector_t end_sector; + + if (!in_chunk_boundary(mddev, raid_bio)) { + pr_debug("chunk_aligned_read : non aligned\n"); + return 0; + } + /* + * use bio_clone_mddev to make a copy of the bio + */ + align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); + if (!align_bi) + return 0; + /* + * set bi_end_io to a new function, and set bi_private to the + * original bio. 
+ */ + align_bi->bi_end_io = raid5_align_endio; + align_bi->bi_private = raid_bio; + /* + * compute position + */ + align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, + 0, + &dd_idx, NULL); + + end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[dd_idx].replacement); + if (!rdev || test_bit(Faulty, &rdev->flags) || + rdev->recovery_offset < end_sector) { + rdev = rcu_dereference(conf->disks[dd_idx].rdev); + if (rdev && + (test_bit(Faulty, &rdev->flags) || + !(test_bit(In_sync, &rdev->flags) || + rdev->recovery_offset >= end_sector))) + rdev = NULL; + } + if (rdev) { + sector_t first_bad; + int bad_sectors; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + raid_bio->bi_next = (void*)rdev; + align_bi->bi_bdev = rdev->bdev; + align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); + + if (!bio_fits_rdev(align_bi) || + is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, + &first_bad, &bad_sectors)) { + /* too big in some way, or has a known bad block */ + bio_put(align_bi); + rdev_dec_pending(rdev, mddev); + return 0; + } + + /* No reshape active, so we can trust rdev->data_offset */ + align_bi->bi_sector += rdev->data_offset; + + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); + + generic_make_request(align_bi); + return 1; + } else { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } +} + +/* __get_priority_stripe - get the next stripe to process + * + * Full stripe writes are allowed to pass preread active stripes up until + * the bypass_threshold is exceeded. In general the bypass_count + * increments when the handle_list is handled before the hold_list; however, it + * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a + * stripe with in flight i/o. The bypass_count will be reset when the + * head of the hold_list has changed, i.e. the head was promoted to the + * handle_list. + */ +static struct stripe_head *__get_priority_stripe(struct r5conf *conf) +{ + struct stripe_head *sh; + + pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", + __func__, + list_empty(&conf->handle_list) ? "empty" : "busy", + list_empty(&conf->hold_list) ? 
"empty" : "busy", + atomic_read(&conf->pending_full_writes), conf->bypass_count); + + if (!list_empty(&conf->handle_list)) { + sh = list_entry(conf->handle_list.next, typeof(*sh), lru); + + if (list_empty(&conf->hold_list)) + conf->bypass_count = 0; + else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { + if (conf->hold_list.next == conf->last_hold) + conf->bypass_count++; + else { + conf->last_hold = conf->hold_list.next; + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + } + } else if (!list_empty(&conf->hold_list) && + ((conf->bypass_threshold && + conf->bypass_count > conf->bypass_threshold) || + atomic_read(&conf->pending_full_writes) == 0)) { + sh = list_entry(conf->hold_list.next, + typeof(*sh), lru); + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } else + return NULL; + + list_del_init(&sh->lru); + atomic_inc(&sh->count); + BUG_ON(atomic_read(&sh->count) != 1); + return sh; +} + +static void make_request(struct mddev *mddev, struct bio * bi) +{ + struct r5conf *conf = mddev->private; + int dd_idx; + sector_t new_sector; + sector_t logical_sector, last_sector; + struct stripe_head *sh; + const int rw = bio_data_dir(bi); + int remaining; + int plugged; + + if (unlikely(bi->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bi); + return; + } + + md_write_start(mddev, bi); + + if (rw == READ && + mddev->reshape_position == MaxSector && + chunk_aligned_read(mddev,bi)) + return; + + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + + plugged = mddev_check_plugged(mddev); + for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { + DEFINE_WAIT(w); + int disks, data_disks; + int previous; + + retry: + previous = 0; + disks = conf->raid_disks; + prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); + if (unlikely(conf->reshape_progress != MaxSector)) { + /* spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ + spin_lock_irq(&conf->device_lock); + if (mddev->delta_disks < 0 + ? logical_sector < conf->reshape_progress + : logical_sector >= conf->reshape_progress) { + disks = conf->previous_raid_disks; + previous = 1; + } else { + if (mddev->delta_disks < 0 + ? logical_sector < conf->reshape_safe + : logical_sector >= conf->reshape_safe) { + spin_unlock_irq(&conf->device_lock); + schedule(); + goto retry; + } + } + spin_unlock_irq(&conf->device_lock); + } + data_disks = disks - conf->max_degraded; + + new_sector = raid5_compute_sector(conf, logical_sector, + previous, + &dd_idx, NULL); + pr_debug("raid456: make_request, sector %llu logical %llu\n", + (unsigned long long)new_sector, + (unsigned long long)logical_sector); + + sh = get_active_stripe(conf, new_sector, previous, + (bi->bi_rw&RWA_MASK), 0); + if (sh) { + if (unlikely(previous)) { + /* expansion might have moved on while waiting for a + * stripe, so we must do the range check again. 
+ * Expansion could still move past after this + * test, but as we are holding a reference to + * 'sh', we know that if that happens, + * STRIPE_EXPANDING will get set and the expansion + * won't proceed until we finish with the stripe. + */ + int must_retry = 0; + spin_lock_irq(&conf->device_lock); + if (mddev->delta_disks < 0 + ? logical_sector >= conf->reshape_progress + : logical_sector < conf->reshape_progress) + /* mismatch, need to try again */ + must_retry = 1; + spin_unlock_irq(&conf->device_lock); + if (must_retry) { + release_stripe(sh); + schedule(); + goto retry; + } + } + + if (rw == WRITE && + logical_sector >= mddev->suspend_lo && + logical_sector < mddev->suspend_hi) { + release_stripe(sh); + /* As the suspend_* range is controlled by + * userspace, we want an interruptible + * wait. + */ + flush_signals(current); + prepare_to_wait(&conf->wait_for_overlap, + &w, TASK_INTERRUPTIBLE); + if (logical_sector >= mddev->suspend_lo && + logical_sector < mddev->suspend_hi) + schedule(); + goto retry; + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_stripe_bio(sh, bi, dd_idx, rw)) { + /* Stripe is busy expanding or + * add failed due to overlap. Flush everything + * and wait a while + */ + md_wakeup_thread(mddev->thread); + release_stripe(sh); + schedule(); + goto retry; + } + finish_wait(&conf->wait_for_overlap, &w); + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if ((bi->bi_rw & REQ_SYNC) && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh); + } else { + /* cannot get stripe for read-ahead, just give-up */ + clear_bit(BIO_UPTODATE, &bi->bi_flags); + finish_wait(&conf->wait_for_overlap, &w); + break; + } + + } + if (!plugged) + md_wakeup_thread(mddev->thread); + + spin_lock_irq(&conf->device_lock); + remaining = raid5_dec_bi_phys_segments(bi); + spin_unlock_irq(&conf->device_lock); + if (remaining == 0) { + + if ( rw == WRITE ) + md_write_end(mddev); + + bio_endio(bi, 0); + } +} + +static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); + +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) +{ + /* reshaping is quite different to recovery/resync so it is + * handled quite separately ... here. + * + * On each call to sync_request, we gather one chunk worth of + * destination stripes and flag them as expanding. + * Then we find all the source stripes and request reads. + * As the reads complete, handle_stripe will copy the data + * into the destination stripe and release that stripe. 
+ */ + struct r5conf *conf = mddev->private; + struct stripe_head *sh; + sector_t first_sector, last_sector; + int raid_disks = conf->previous_raid_disks; + int data_disks = raid_disks - conf->max_degraded; + int new_data_disks = conf->raid_disks - conf->max_degraded; + int i; + int dd_idx; + sector_t writepos, readpos, safepos; + sector_t stripe_addr; + int reshape_sectors; + struct list_head stripes; + + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->delta_disks < 0 && + conf->reshape_progress < raid5_size(mddev, 0, 0)) { + sector_nr = raid5_size(mddev, 0, 0) + - conf->reshape_progress; + } else if (mddev->delta_disks >= 0 && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; + sector_div(sector_nr, new_data_disks); + if (sector_nr) { + mddev->curr_resync_completed = sector_nr; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + *skipped = 1; + return sector_nr; + } + } + + /* We need to process a full chunk at a time. + * If old and new chunk sizes differ, we need to process the + * largest of these + */ + if (mddev->new_chunk_sectors > mddev->chunk_sectors) + reshape_sectors = mddev->new_chunk_sectors; + else + reshape_sectors = mddev->chunk_sectors; + + /* we update the metadata when there is more than 3Meg + * in the block range (that is rather arbitrary, should + * probably be time based) or when the data about to be + * copied would over-write the source of the data at + * the front of the range. + * i.e. one new_stripe along from reshape_progress new_maps + * to after where reshape_safe old_maps to + */ + writepos = conf->reshape_progress; + sector_div(writepos, new_data_disks); + readpos = conf->reshape_progress; + sector_div(readpos, data_disks); + safepos = conf->reshape_safe; + sector_div(safepos, data_disks); + if (mddev->delta_disks < 0) { + writepos -= min_t(sector_t, reshape_sectors, writepos); + readpos += reshape_sectors; + safepos += reshape_sectors; + } else { + writepos += reshape_sectors; + readpos -= min_t(sector_t, reshape_sectors, readpos); + safepos -= min_t(sector_t, reshape_sectors, safepos); + } + + /* 'writepos' is the most advanced device address we might write. + * 'readpos' is the least advanced device address we might read. + * 'safepos' is the least address recorded in the metadata as having + * been reshaped. + * If 'readpos' is behind 'writepos', then there is no way that we can + * ensure safety in the face of a crash - that must be done by userspace + * making a backup of the data. So in that case there is no particular + * rush to update metadata. + * Otherwise if 'safepos' is behind 'writepos', then we really need to + * update the metadata to advance 'safepos' to match 'readpos' so that + * we can be safe in the event of a crash. + * So we insist on updating metadata if safepos is behind writepos and + * readpos is beyond writepos. + * In any case, update the metadata every 10 seconds. + * Maybe that number should be configurable, but I'm not sure it is + * worth it.... maybe it could be a multiple of safemode_delay??? + */ + if ((mddev->delta_disks < 0 + ? (safepos > writepos && readpos < writepos) + : (safepos < writepos && readpos > writepos)) || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { + /* Cannot proceed until we've updated the superblock... 
*/ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0); + mddev->reshape_position = conf->reshape_progress; + mddev->curr_resync_completed = sector_nr; + conf->reshape_checkpoint = jiffies; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->flags == 0 || + kthread_should_stop()); + spin_lock_irq(&conf->device_lock); + conf->reshape_safe = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + } + + if (mddev->delta_disks < 0) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + reshape_sectors); + stripe_addr = sector_nr; + } + INIT_LIST_HEAD(&stripes); + for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { + int j; + int skipped_disk = 0; + sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); + set_bit(STRIPE_EXPANDING, &sh->state); + atomic_inc(&conf->reshape_stripes); + /* If any of this stripe is beyond the end of the old + * array, then we need to zero those blocks + */ + for (j=sh->disks; j--;) { + sector_t s; + if (j == sh->pd_idx) + continue; + if (conf->level == 6 && + j == sh->qd_idx) + continue; + s = compute_blocknr(sh, j, 0); + if (s < raid5_size(mddev, 0, 0)) { + skipped_disk = 1; + continue; + } + memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + set_bit(R5_Expanded, &sh->dev[j].flags); + set_bit(R5_UPTODATE, &sh->dev[j].flags); + } + if (!skipped_disk) { + set_bit(STRIPE_EXPAND_READY, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + list_add(&sh->lru, &stripes); + } + spin_lock_irq(&conf->device_lock); + if (mddev->delta_disks < 0) + conf->reshape_progress -= reshape_sectors * new_data_disks; + else + conf->reshape_progress += reshape_sectors * new_data_disks; + spin_unlock_irq(&conf->device_lock); + /* Ok, those stripe are ready. We can start scheduling + * reads on the source stripes. + * The source stripes are determined by mapping the first and last + * block on the destination stripes. + */ + first_sector = + raid5_compute_sector(conf, stripe_addr*(new_data_disks), + 1, &dd_idx, NULL); + last_sector = + raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) + * new_data_disks - 1), + 1, &dd_idx, NULL); + if (last_sector >= mddev->dev_sectors) + last_sector = mddev->dev_sectors - 1; + while (first_sector <= last_sector) { + sh = get_active_stripe(conf, first_sector, 1, 0, 1); + set_bit(STRIPE_EXPAND_SOURCE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + first_sector += STRIPE_SECTORS; + } + /* Now that the sources are clearly marked, we can release + * the destination stripes + */ + while (!list_empty(&stripes)) { + sh = list_entry(stripes.next, struct stripe_head, lru); + list_del_init(&sh->lru); + release_stripe(sh); + } + /* If this takes us to the resync_max point where we have to pause, + * then we need to write out the superblock. + */ + sector_nr += reshape_sectors; + if ((sector_nr - mddev->curr_resync_completed) * 2 + >= mddev->resync_max - mddev->curr_resync_completed) { + /* Cannot proceed until we've updated the superblock... 
*/ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes) == 0); + mddev->reshape_position = conf->reshape_progress; + mddev->curr_resync_completed = sector_nr; + conf->reshape_checkpoint = jiffies; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) + || kthread_should_stop()); + spin_lock_irq(&conf->device_lock); + conf->reshape_safe = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + } + return reshape_sectors; +} + +/* FIXME go_faster isn't used */ +static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) +{ + struct r5conf *conf = mddev->private; + struct stripe_head *sh; + sector_t max_sector = mddev->dev_sectors; + sector_t sync_blocks; + int still_degraded = 0; + int i; + + if (sector_nr >= max_sector) { + /* just being told to finish up .. nothing much to do */ + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); + return 0; + } + + if (mddev->curr_resync < max_sector) /* aborted */ + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* completed sync */ + conf->fullsync = 0; + bitmap_close_sync(mddev->bitmap); + + return 0; + } + + /* Allow raid5_quiesce to complete */ + wait_event(conf->wait_for_overlap, conf->quiesce != 2); + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return reshape_request(mddev, sector_nr, skipped); + + /* No need to check resync_max as we never do more than one + * stripe, and as resync_max will always be on a chunk boundary, + * if the check in md_do_sync didn't fire, there is no chance + * of overstepping resync_max here + */ + + /* if there is too many failed drives and we are trying + * to resync, then assert that we are finished, because there is + * nothing we can do. + */ + if (mddev->degraded >= conf->max_degraded && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + sector_t rv = mddev->dev_sectors - sector_nr; + *skipped = 1; + return rv; + } + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + /* we can skip this block, and probably more */ + sync_blocks /= STRIPE_SECTORS; + *skipped = 1; + return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + } + + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + + sh = get_active_stripe(conf, sector_nr, 0, 1, 0); + if (sh == NULL) { + sh = get_active_stripe(conf, sector_nr, 0, 0, 0); + /* make sure we don't swamp the stripe cache if someone else + * is trying to get access + */ + schedule_timeout_uninterruptible(1); + } + /* Need to check if array will still be degraded after recovery/resync + * We don't need to check the 'failed' flag as when that gets set, + * recovery aborts. + */ + for (i = 0; i < conf->raid_disks; i++) + if (conf->disks[i].rdev == NULL) + still_degraded = 1; + + bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); + + set_bit(STRIPE_SYNC_REQUESTED, &sh->state); + + handle_stripe(sh); + release_stripe(sh); + + return STRIPE_SECTORS; +} + +static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) +{ + /* We may not be able to submit a whole bio at once as there + * may not be enough stripe_heads available. 
+ * We cannot pre-allocate enough stripe_heads as we may need + * more than exist in the cache (if we allow ever large chunks). + * So we do one stripe head at a time and record in + * ->bi_hw_segments how many have been done. + * + * We *know* that this entire raid_bio is in one chunk, so + * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. + */ + struct stripe_head *sh; + int dd_idx; + sector_t sector, logical_sector, last_sector; + int scnt = 0; + int remaining; + int handled = 0; + + logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + sector = raid5_compute_sector(conf, logical_sector, + 0, &dd_idx, NULL); + last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS, + sector += STRIPE_SECTORS, + scnt++) { + + if (scnt < raid5_bi_hw_segments(raid_bio)) + /* already done this stripe */ + continue; + + sh = get_active_stripe(conf, sector, 0, 1, 0); + + if (!sh) { + /* failed to get a stripe - must wait */ + raid5_set_bi_hw_segments(raid_bio, scnt); + conf->retry_read_aligned = raid_bio; + return handled; + } + + if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { + release_stripe(sh); + raid5_set_bi_hw_segments(raid_bio, scnt); + conf->retry_read_aligned = raid_bio; + return handled; + } + + handle_stripe(sh); + release_stripe(sh); + handled++; + } + spin_lock_irq(&conf->device_lock); + remaining = raid5_dec_bi_phys_segments(raid_bio); + spin_unlock_irq(&conf->device_lock); + if (remaining == 0) + bio_endio(raid_bio, 0); + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_stripe); + return handled; +} + + +/* + * This is our raid5 kernel thread. + * + * We scan the hash table for stripes which can be handled now. + * During the scan, completed stripes are saved for us by the interrupt + * handler, so that they will not have to wait for our next wakeup. 
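+ *
+ * The main loop below runs under conf->device_lock: it flushes any
+ * delayed bitmap updates, retries aligned reads that previously failed
+ * to get a stripe_head, and then repeatedly takes the next stripe from
+ * __get_priority_stripe() and runs handle_stripe() on it.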
+ */ +static void raid5d(struct mddev *mddev) +{ + struct stripe_head *sh; + struct r5conf *conf = mddev->private; + int handled; + struct blk_plug plug; + + pr_debug("+++ raid5d active\n"); + + md_check_recovery(mddev); + + blk_start_plug(&plug); + handled = 0; + spin_lock_irq(&conf->device_lock); + while (1) { + struct bio *bio; + + if (atomic_read(&mddev->plug_cnt) == 0 && + !list_empty(&conf->bitmap_list)) { + /* Now is a good time to flush some bitmap updates */ + conf->seq_flush++; + spin_unlock_irq(&conf->device_lock); + bitmap_unplug(mddev->bitmap); + spin_lock_irq(&conf->device_lock); + conf->seq_write = conf->seq_flush; + activate_bit_delay(conf); + } + if (atomic_read(&mddev->plug_cnt) == 0) + raid5_activate_delayed(conf); + + while ((bio = remove_bio_from_retry(conf))) { + int ok; + spin_unlock_irq(&conf->device_lock); + ok = retry_aligned_read(conf, bio); + spin_lock_irq(&conf->device_lock); + if (!ok) + break; + handled++; + } + + sh = __get_priority_stripe(conf); + + if (!sh) + break; + spin_unlock_irq(&conf->device_lock); + + handled++; + handle_stripe(sh); + release_stripe(sh); + cond_resched(); + + if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + md_check_recovery(mddev); + + spin_lock_irq(&conf->device_lock); + } + pr_debug("%d stripes handled\n", handled); + + spin_unlock_irq(&conf->device_lock); + + async_tx_issue_pending_all(); + blk_finish_plug(&plug); + + pr_debug("--- raid5d inactive\n"); +} + +static ssize_t +raid5_show_stripe_cache_size(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->max_nr_stripes); + else + return 0; +} + +int +raid5_set_cache_size(struct mddev *mddev, int size) +{ + struct r5conf *conf = mddev->private; + int err; + + if (size <= 16 || size > 32768) + return -EINVAL; + while (size < conf->max_nr_stripes) { + if (drop_one_stripe(conf)) + conf->max_nr_stripes--; + else + break; + } + err = md_allow_write(mddev); + if (err) + return err; + while (size > conf->max_nr_stripes) { + if (grow_one_stripe(conf)) + conf->max_nr_stripes++; + else break; + } + return 0; +} +EXPORT_SYMBOL(raid5_set_cache_size); + +static ssize_t +raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + int err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (strict_strtoul(page, 10, &new)) + return -EINVAL; + err = raid5_set_cache_size(mddev, new); + if (err) + return err; + return len; +} + +static struct md_sysfs_entry +raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, + raid5_show_stripe_cache_size, + raid5_store_stripe_cache_size); + +static ssize_t +raid5_show_preread_threshold(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->bypass_threshold); + else + return 0; +} + +static ssize_t +raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (strict_strtoul(page, 10, &new)) + return -EINVAL; + if (new > conf->max_nr_stripes) + return -EINVAL; + conf->bypass_threshold = new; + return len; +} + +static struct md_sysfs_entry +raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, + S_IRUGO | S_IWUSR, + raid5_show_preread_threshold, + raid5_store_preread_threshold); + +static ssize_t 
+stripe_cache_active_show(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); + else + return 0; +} + +static struct md_sysfs_entry +raid5_stripecache_active = __ATTR_RO(stripe_cache_active); + +static struct attribute *raid5_attrs[] = { + &raid5_stripecache_size.attr, + &raid5_stripecache_active.attr, + &raid5_preread_bypass_threshold.attr, + NULL, +}; +static struct attribute_group raid5_attrs_group = { + .name = NULL, + .attrs = raid5_attrs, +}; + +static sector_t +raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct r5conf *conf = mddev->private; + + if (!sectors) + sectors = mddev->dev_sectors; + if (!raid_disks) + /* size is defined by the smallest of previous and new size */ + raid_disks = min(conf->raid_disks, conf->previous_raid_disks); + + sectors &= ~((sector_t)mddev->chunk_sectors - 1); + sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); + return sectors * (raid_disks - conf->max_degraded); +} + +static void raid5_free_percpu(struct r5conf *conf) +{ + struct raid5_percpu *percpu; + unsigned long cpu; + + if (!conf->percpu) + return; + + get_online_cpus(); + for_each_possible_cpu(cpu) { + percpu = per_cpu_ptr(conf->percpu, cpu); + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + } +#ifdef CONFIG_HOTPLUG_CPU + unregister_cpu_notifier(&conf->cpu_notify); +#endif + put_online_cpus(); + + free_percpu(conf->percpu); +} + +static void free_conf(struct r5conf *conf) +{ + shrink_stripes(conf); + raid5_free_percpu(conf); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); + long cpu = (long)hcpu; + struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (conf->level == 6 && !percpu->spare_page) + percpu->spare_page = alloc_page(GFP_KERNEL); + if (!percpu->scribble) + percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); + + if (!percpu->scribble || + (conf->level == 6 && !percpu->spare_page)) { + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + pr_err("%s: failed memory allocation for cpu%ld\n", + __func__, cpu); + return notifier_from_errno(-ENOMEM); + } + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + percpu->spare_page = NULL; + percpu->scribble = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} +#endif + +static int raid5_alloc_percpu(struct r5conf *conf) +{ + unsigned long cpu; + struct page *spare_page; + struct raid5_percpu __percpu *allcpus; + void *scribble; + int err; + + allcpus = alloc_percpu(struct raid5_percpu); + if (!allcpus) + return -ENOMEM; + conf->percpu = allcpus; + + get_online_cpus(); + err = 0; + for_each_present_cpu(cpu) { + if (conf->level == 6) { + spare_page = alloc_page(GFP_KERNEL); + if (!spare_page) { + err = -ENOMEM; + break; + } + per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; + } + scribble = kmalloc(conf->scribble_len, GFP_KERNEL); + if (!scribble) { + err = -ENOMEM; + break; + } + per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; + } +#ifdef CONFIG_HOTPLUG_CPU + conf->cpu_notify.notifier_call = raid456_cpu_notify; + conf->cpu_notify.priority = 0; + if (err == 0) + err = 
register_cpu_notifier(&conf->cpu_notify); +#endif + put_online_cpus(); + + return err; +} + +static struct r5conf *setup_conf(struct mddev *mddev) +{ + struct r5conf *conf; + int raid_disk, memory, max_disks; + struct md_rdev *rdev; + struct disk_info *disk; + + if (mddev->new_level != 5 + && mddev->new_level != 4 + && mddev->new_level != 6) { + printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", + mdname(mddev), mddev->new_level); + return ERR_PTR(-EIO); + } + if ((mddev->new_level == 5 + && !algorithm_valid_raid5(mddev->new_layout)) || + (mddev->new_level == 6 + && !algorithm_valid_raid6(mddev->new_layout))) { + printk(KERN_ERR "md/raid:%s: layout %d not supported\n", + mdname(mddev), mddev->new_layout); + return ERR_PTR(-EIO); + } + if (mddev->new_level == 6 && mddev->raid_disks < 4) { + printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); + return ERR_PTR(-EINVAL); + } + + if (!mddev->new_chunk_sectors || + (mddev->new_chunk_sectors << 9) % PAGE_SIZE || + !is_power_of_2(mddev->new_chunk_sectors)) { + printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", + mdname(mddev), mddev->new_chunk_sectors << 9); + return ERR_PTR(-EINVAL); + } + + conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); + if (conf == NULL) + goto abort; + spin_lock_init(&conf->device_lock); + init_waitqueue_head(&conf->wait_for_stripe); + init_waitqueue_head(&conf->wait_for_overlap); + INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->hold_list); + INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); + INIT_LIST_HEAD(&conf->inactive_list); + atomic_set(&conf->active_stripes, 0); + atomic_set(&conf->preread_active_stripes, 0); + atomic_set(&conf->active_aligned_reads, 0); + conf->bypass_threshold = BYPASS_THRESHOLD; + conf->recovery_disabled = mddev->recovery_disabled - 1; + + conf->raid_disks = mddev->raid_disks; + if (mddev->reshape_position == MaxSector) + conf->previous_raid_disks = mddev->raid_disks; + else + conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + max_disks = max(conf->raid_disks, conf->previous_raid_disks); + conf->scribble_len = scribble_len(max_disks); + + conf->disks = kzalloc(max_disks * sizeof(struct disk_info), + GFP_KERNEL); + if (!conf->disks) + goto abort; + + conf->mddev = mddev; + + if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) + goto abort; + + conf->level = mddev->new_level; + if (raid5_alloc_percpu(conf) != 0) + goto abort; + + pr_debug("raid456: run(%s) called.\n", mdname(mddev)); + + rdev_for_each(rdev, mddev) { + raid_disk = rdev->raid_disk; + if (raid_disk >= max_disks + || raid_disk < 0) + continue; + disk = conf->disks + raid_disk; + + if (test_bit(Replacement, &rdev->flags)) { + if (disk->replacement) + goto abort; + disk->replacement = rdev; + } else { + if (disk->rdev) + goto abort; + disk->rdev = rdev; + } + + if (test_bit(In_sync, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + printk(KERN_INFO "md/raid:%s: device %s operational as raid" + " disk %d\n", + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); + } else if (rdev->saved_raid_disk != raid_disk) + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; + } + + conf->chunk_sectors = mddev->new_chunk_sectors; + conf->level = mddev->new_level; + if (conf->level == 6) + conf->max_degraded = 2; + else + conf->max_degraded = 1; + conf->algorithm = mddev->new_layout; + conf->max_nr_stripes = NR_STRIPES; + conf->reshape_progress = mddev->reshape_position; + if 
(conf->reshape_progress != MaxSector) { + conf->prev_chunk_sectors = mddev->chunk_sectors; + conf->prev_algo = mddev->layout; + } + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes)) { + printk(KERN_ERR + "md/raid:%s: couldn't allocate %dkB for buffers\n", + mdname(mddev), memory); + goto abort; + } else + printk(KERN_INFO "md/raid:%s: allocated %dkB\n", + mdname(mddev), memory); + + conf->thread = md_register_thread(raid5d, mddev, NULL); + if (!conf->thread) { + printk(KERN_ERR + "md/raid:%s: couldn't allocate thread.\n", + mdname(mddev)); + goto abort; + } + + return conf; + + abort: + if (conf) { + free_conf(conf); + return ERR_PTR(-EIO); + } else + return ERR_PTR(-ENOMEM); +} + + +static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) +{ + switch (algo) { + case ALGORITHM_PARITY_0: + if (raid_disk < max_degraded) + return 1; + break; + case ALGORITHM_PARITY_N: + if (raid_disk >= raid_disks - max_degraded) + return 1; + break; + case ALGORITHM_PARITY_0_6: + if (raid_disk == 0 || + raid_disk == raid_disks - 1) + return 1; + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (raid_disk == raid_disks - 1) + return 1; + } + return 0; +} + +static int run(struct mddev *mddev) +{ + struct r5conf *conf; + int working_disks = 0; + int dirty_parity_disks = 0; + struct md_rdev *rdev; + sector_t reshape_offset = 0; + int i; + + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "md/raid:%s: not clean" + " -- starting background reconstruction\n", + mdname(mddev)); + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + int max_degraded = (mddev->level == 6 ? 2 : 1); + + if (mddev->new_level != mddev->level) { + printk(KERN_ERR "md/raid:%s: unsupported reshape " + "required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old + * geometry. + */ + here_new = mddev->reshape_position; + if (sector_div(here_new, mddev->new_chunk_sectors * + (mddev->raid_disks - max_degraded))) { + printk(KERN_ERR "md/raid:%s: reshape_position not " + "on a stripe boundary\n", mdname(mddev)); + return -EINVAL; + } + reshape_offset = here_new * mddev->new_chunk_sectors; + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, mddev->chunk_sectors * + (old_disks-max_degraded)); + /* here_old is the first stripe that we might need to read + * from */ + if (mddev->delta_disks == 0) { + /* We cannot be sure it is safe to start an in-place + * reshape. It is only safe if user-space if monitoring + * and taking constant backups. + * mdadm always starts a situation like this in + * readonly mode so it can take control before + * allowing any writes. So just check for that. 
+ */ + if ((here_new * mddev->new_chunk_sectors != + here_old * mddev->chunk_sectors) || + mddev->ro == 0) { + printk(KERN_ERR "md/raid:%s: in-place reshape must be started" + " in read-only mode - aborting\n", + mdname(mddev)); + return -EINVAL; + } + } else if (mddev->delta_disks < 0 + ? (here_new * mddev->new_chunk_sectors <= + here_old * mddev->chunk_sectors) + : (here_new * mddev->new_chunk_sectors >= + here_old * mddev->chunk_sectors)) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "md/raid:%s: reshape_position too early for " + "auto-recovery - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + printk(KERN_INFO "md/raid:%s: reshape will continue\n", + mdname(mddev)); + /* OK, we should be able to continue; */ + } else { + BUG_ON(mddev->level != mddev->new_level); + BUG_ON(mddev->layout != mddev->new_layout); + BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); + BUG_ON(mddev->delta_disks != 0); + } + + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + + if (IS_ERR(conf)) + return PTR_ERR(conf); + + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + + for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; + i++) { + rdev = conf->disks[i].rdev; + if (!rdev && conf->disks[i].replacement) { + /* The replacement is all we have yet */ + rdev = conf->disks[i].replacement; + conf->disks[i].replacement = NULL; + clear_bit(Replacement, &rdev->flags); + conf->disks[i].rdev = rdev; + } + if (!rdev) + continue; + if (conf->disks[i].replacement && + conf->reshape_progress != MaxSector) { + /* replacements and reshape simply do not mix. */ + printk(KERN_ERR "md: cannot handle concurrent " + "replacement and reshape.\n"); + goto abort; + } + if (test_bit(In_sync, &rdev->flags)) { + working_disks++; + continue; + } + /* This disc is not fully in-sync. However if it + * just stored parity (beyond the recovery_offset), + * when we don't need to be concerned about the + * array being dirty. + * When reshape goes 'backwards', we never have + * partially completed devices, so we only need + * to worry about reshape going forwards. + */ + /* Hack because v0.91 doesn't store recovery_offset properly. */ + if (mddev->major_version == 0 && + mddev->minor_version > 90) + rdev->recovery_offset = reshape_offset; + + if (rdev->recovery_offset < reshape_offset) { + /* We need to check old and new layout */ + if (!only_parity(rdev->raid_disk, + conf->algorithm, + conf->raid_disks, + conf->max_degraded)) + continue; + } + if (!only_parity(rdev->raid_disk, + conf->prev_algo, + conf->previous_raid_disks, + conf->max_degraded)) + continue; + dirty_parity_disks++; + } + + /* + * 0 for a fully functional array, 1 or 2 for a degraded array. 
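+ * While a reshape is changing the number of devices, this is measured
+ * against the larger of the old and new device counts.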
+ */ + mddev->degraded = calc_degraded(conf); + + if (has_failed(conf)) { + printk(KERN_ERR "md/raid:%s: not enough operational devices" + " (%d/%d failed)\n", + mdname(mddev), mddev->degraded, conf->raid_disks); + goto abort; + } + + /* device size must be a multiple of chunk size */ + mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); + mddev->resync_max_sectors = mddev->dev_sectors; + + if (mddev->degraded > dirty_parity_disks && + mddev->recovery_cp != MaxSector) { + if (mddev->ok_start_degraded) + printk(KERN_WARNING + "md/raid:%s: starting dirty degraded array" + " - data corruption possible.\n", + mdname(mddev)); + else { + printk(KERN_ERR + "md/raid:%s: cannot start dirty degraded array.\n", + mdname(mddev)); + goto abort; + } + } + + if (mddev->degraded == 0) + printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" + " devices, algorithm %d\n", mdname(mddev), conf->level, + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); + else + printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" + " out of %d devices, algorithm %d\n", + mdname(mddev), conf->level, + mddev->raid_disks - mddev->degraded, + mddev->raid_disks, mddev->new_layout); + + print_raid5_conf(conf); + + if (conf->reshape_progress != MaxSector) { + conf->reshape_safe = conf->reshape_progress; + atomic_set(&conf->reshape_stripes, 0); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + } + + + /* Ok, everything is just fine now */ + if (mddev->to_remove == &raid5_attrs_group) + mddev->to_remove = NULL; + else if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) + printk(KERN_WARNING + "raid5: failed to create sysfs attributes for %s\n", + mdname(mddev)); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + + if (mddev->queue) { + int chunk_size; + /* read-ahead size must cover two whole stripes, which + * is 2 * (datadisks) * chunksize where 'n' is the + * number of raid devices + */ + int data_disks = conf->previous_raid_disks - conf->max_degraded; + int stripe = data_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); + if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + + blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = raid5_congested; + + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks - conf->max_degraded)); + + rdev_for_each(rdev, mddev) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } + + return 0; +abort: + md_unregister_thread(&mddev->thread); + print_raid5_conf(conf); + free_conf(conf); + mddev->private = NULL; + printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); + return -EIO; +} + +static int stop(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + md_unregister_thread(&mddev->thread); + if (mddev->queue) + mddev->queue->backing_dev_info.congested_fn = NULL; + free_conf(conf); + mddev->private = NULL; + mddev->to_remove = &raid5_attrs_group; + return 0; +} + +static void status(struct seq_file *seq, struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + 
int i; + + seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, + mddev->chunk_sectors / 2, mddev->layout); + seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); + for (i = 0; i < conf->raid_disks; i++) + seq_printf (seq, "%s", + conf->disks[i].rdev && + test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); + seq_printf (seq, "]"); +} + +static void print_raid5_conf (struct r5conf *conf) +{ + int i; + struct disk_info *tmp; + + printk(KERN_DEBUG "RAID conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, + conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); + + for (i = 0; i < conf->raid_disks; i++) { + char b[BDEVNAME_SIZE]; + tmp = conf->disks + i; + if (tmp->rdev) + printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", + i, !test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev, b)); + } +} + +static int raid5_spare_active(struct mddev *mddev) +{ + int i; + struct r5conf *conf = mddev->private; + struct disk_info *tmp; + int count = 0; + unsigned long flags; + + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->disks + i; + if (tmp->replacement + && tmp->replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->replacement->flags) + && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + /* Replacement has just become active. */ + if (!tmp->rdev + || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + count++; + if (tmp->rdev) { + /* Replaced device not technically faulty, + * but we need to be sure it gets removed + * and never re-added. + */ + set_bit(Faulty, &tmp->rdev->flags); + sysfs_notify_dirent_safe( + tmp->rdev->sysfs_state); + } + sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); + } else if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + count++; + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); + } + } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); + print_raid5_conf(conf); + return count; +} + +static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r5conf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct md_rdev **rdevp; + struct disk_info *p = conf->disks + number; + + print_raid5_conf(conf); + if (rdev == p->rdev) + rdevp = &p->rdev; + else if (rdev == p->replacement) + rdevp = &p->replacement; + else + return 0; + + if (number >= conf->raid_disks && + conf->reshape_progress == MaxSector) + clear_bit(In_sync, &rdev->flags); + + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + /* Only remove non-faulty devices if recovery + * isn't possible. 
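+ * i.e. if the device is still healthy, recovery has not been disabled,
+ * the array has not already failed and no replacement is standing in
+ * for it, refuse the removal with -EBUSY.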
+ */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != conf->recovery_disabled && + !has_failed(conf) && + (!p->replacement || p->replacement == rdev) && + number < conf->raid_disks) { + err = -EBUSY; + goto abort; + } + *rdevp = NULL; + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + } else if (p->replacement) { + /* We must have just cleared 'rdev' */ + p->rdev = p->replacement; + clear_bit(Replacement, &p->replacement->flags); + smp_mb(); /* Make sure other CPUs may see both as identical + * but will never see neither - if they are careful + */ + p->replacement = NULL; + clear_bit(WantReplacement, &rdev->flags); + } else + /* We might have just removed the Replacement as faulty- + * clear the bit just in case + */ + clear_bit(WantReplacement, &rdev->flags); +abort: + + print_raid5_conf(conf); + return err; +} + +static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct r5conf *conf = mddev->private; + int err = -EEXIST; + int disk; + struct disk_info *p; + int first = 0; + int last = conf->raid_disks - 1; + + if (mddev->recovery_disabled == conf->recovery_disabled) + return -EBUSY; + + if (rdev->saved_raid_disk < 0 && has_failed(conf)) + /* no point adding a device */ + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && + conf->disks[rdev->saved_raid_disk].rdev == NULL) + disk = rdev->saved_raid_disk; + else + disk = first; + for ( ; disk <= last ; disk++) { + p = conf->disks + disk; + if (p->rdev == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->raid_disk = disk; + err = 0; + if (rdev->saved_raid_disk != disk) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; + } + if (test_bit(WantReplacement, &p->rdev->flags) && + p->replacement == NULL) { + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = disk; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + break; + } + } + print_raid5_conf(conf); + return err; +} + +static int raid5_resize(struct mddev *mddev, sector_t sectors) +{ + /* no resync is happening, and there is enough space + * on all devices, so we can resize. + * We need to make sure resync covers any new space. + * If the array is shrinking we should possibly wait until + * any io in the removed space completes, but it hardly seems + * worth it. + */ + sectors &= ~((sector_t)mddev->chunk_sectors - 1); + md_set_array_sectors(mddev, raid5_size(mddev, sectors, + mddev->raid_disks)); + if (mddev->array_sectors > + raid5_size(mddev, sectors, mddev->raid_disks)) + return -EINVAL; + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > mddev->dev_sectors) { + mddev->recovery_cp = mddev->dev_sectors; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->dev_sectors = sectors; + mddev->resync_max_sectors = sectors; + return 0; +} + +static int check_stripe_cache(struct mddev *mddev) +{ + /* Can only proceed if there are plenty of stripe_heads. + * We need a minimum of one full stripe,, and for sensible progress + * it is best to have about 4 times that. + * If we require 4 times, then the default 256 4K stripe_heads will + * allow for chunk sizes up to 256K, which is probably OK. 
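+ * For example, with the default 256 stripe_heads a 512K chunk spans
+ * 128 stripe_heads, so 4 times that is 512 and the reshape would be
+ * refused until stripe_cache_size is raised.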
+ * If the chunk size is greater, user-space should request more + * stripe_heads first. + */ + struct r5conf *conf = mddev->private; + if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes || + ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes) { + printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", + mdname(mddev), + ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) + / STRIPE_SIZE)*4); + return 0; + } + return 1; +} + +static int check_reshape(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + if (mddev->delta_disks == 0 && + mddev->new_layout == mddev->layout && + mddev->new_chunk_sectors == mddev->chunk_sectors) + return 0; /* nothing to do */ + if (mddev->bitmap) + /* Cannot grow a bitmap yet */ + return -EBUSY; + if (has_failed(conf)) + return -EINVAL; + if (mddev->delta_disks < 0) { + /* We might be able to shrink, but the devices must + * be made bigger first. + * For raid6, 4 is the minimum size. + * Otherwise 2 is the minimum + */ + int min = 2; + if (mddev->level == 6) + min = 4; + if (mddev->raid_disks + mddev->delta_disks < min) + return -EINVAL; + } + + if (!check_stripe_cache(mddev)) + return -ENOSPC; + + return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); +} + +static int raid5_start_reshape(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + struct md_rdev *rdev; + int spares = 0; + unsigned long flags; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + if (!check_stripe_cache(mddev)) + return -ENOSPC; + + rdev_for_each(rdev, mddev) + if (!test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) + spares++; + + if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) + /* Not enough devices even to make a degraded array + * of that size + */ + return -EINVAL; + + /* Refuse to reduce size of the array. Any reductions in + * array size must be through explicit setting of array_size + * attribute. + */ + if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) + < mddev->array_sectors) { + printk(KERN_ERR "md/raid:%s: array size must be reduced " + "before number of disks\n", mdname(mddev)); + return -EINVAL; + } + + atomic_set(&conf->reshape_stripes, 0); + spin_lock_irq(&conf->device_lock); + conf->previous_raid_disks = conf->raid_disks; + conf->raid_disks += mddev->delta_disks; + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->chunk_sectors = mddev->new_chunk_sectors; + conf->prev_algo = conf->algorithm; + conf->algorithm = mddev->new_layout; + if (mddev->delta_disks < 0) + conf->reshape_progress = raid5_size(mddev, 0, 0); + else + conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; + conf->generation++; + spin_unlock_irq(&conf->device_lock); + + /* Add some new drives, as many as will fit. + * We know there are enough to make the newly sized array work. + * Don't add devices if we are reducing the number of + * devices in the array. This is because it is not possible + * to correctly record the "partially reconstructed" state of + * such devices during the reshape and confusion could result. 
+ */ + if (mddev->delta_disks >= 0) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid5_add_disk(mddev, rdev) == 0) { + if (rdev->raid_disk + >= conf->previous_raid_disks) + set_bit(In_sync, &rdev->flags); + else + rdev->recovery_offset = 0; + + if (sysfs_link_rdev(mddev, rdev)) + /* Failure here is OK */; + } + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + } + + /* When a reshape changes the number of devices, + * ->degraded is measured against the larger of the + * pre and post number of devices. + */ + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); + } + mddev->raid_disks = conf->raid_disks; + mddev->reshape_position = conf->reshape_progress; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + if (!mddev->sync_thread) { + mddev->recovery = 0; + spin_lock_irq(&conf->device_lock); + mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + conf->reshape_progress = MaxSector; + mddev->reshape_position = MaxSector; + spin_unlock_irq(&conf->device_lock); + return -EAGAIN; + } + conf->reshape_checkpoint = jiffies; + md_wakeup_thread(mddev->sync_thread); + md_new_event(mddev); + return 0; +} + +/* This is called from the reshape thread and should make any + * changes needed in 'conf' + */ +static void end_reshape(struct r5conf *conf) +{ + + if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + + spin_lock_irq(&conf->device_lock); + conf->previous_raid_disks = conf->raid_disks; + conf->reshape_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + + /* read-ahead size must cover two whole stripes, which is + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices + */ + if (conf->mddev->queue) { + int data_disks = conf->raid_disks - conf->max_degraded; + int stripe = data_disks * ((conf->chunk_sectors << 9) + / PAGE_SIZE); + if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + } + } +} + +/* This is called from the raid5d thread with mddev_lock held. + * It makes config changes to the device. 
+ */ +static void raid5_finish_reshape(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + + if (mddev->delta_disks > 0) { + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } else { + int d; + spin_lock_irq(&conf->device_lock); + mddev->degraded = calc_degraded(conf); + spin_unlock_irq(&conf->device_lock); + for (d = conf->raid_disks ; + d < conf->raid_disks - mddev->delta_disks; + d++) { + struct md_rdev *rdev = conf->disks[d].rdev; + if (rdev && + raid5_remove_disk(mddev, rdev) == 0) { + sysfs_unlink_rdev(mddev, rdev); + rdev->raid_disk = -1; + } + } + } + mddev->layout = conf->algorithm; + mddev->chunk_sectors = conf->chunk_sectors; + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + } +} + +static void raid5_quiesce(struct mddev *mddev, int state) +{ + struct r5conf *conf = mddev->private; + + switch(state) { + case 2: /* resume for a suspend */ + wake_up(&conf->wait_for_overlap); + break; + + case 1: /* stop all writes */ + spin_lock_irq(&conf->device_lock); + /* '2' tells resync/reshape to pause so that all + * active stripes can drain + */ + conf->quiesce = 2; + wait_event_lock_irq(conf->wait_for_stripe, + atomic_read(&conf->active_stripes) == 0 && + atomic_read(&conf->active_aligned_reads) == 0, + conf->device_lock, /* nothing */); + conf->quiesce = 1; + spin_unlock_irq(&conf->device_lock); + /* allow reshape to continue */ + wake_up(&conf->wait_for_overlap); + break; + + case 0: /* re-enable writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 0; + wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_overlap); + spin_unlock_irq(&conf->device_lock); + break; + } +} + + +static void *raid45_takeover_raid0(struct mddev *mddev, int level) +{ + struct r0conf *raid0_conf = mddev->private; + sector_t sectors; + + /* for raid0 takeover only one zone is supported */ + if (raid0_conf->nr_strip_zones > 1) { + printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + sectors = raid0_conf->strip_zone[0].zone_end; + sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); + mddev->dev_sectors = sectors; + mddev->new_level = level; + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks += 1; + mddev->delta_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + return setup_conf(mddev); +} + + +static void *raid5_takeover_raid1(struct mddev *mddev) +{ + int chunksect; + + if (mddev->raid_disks != 2 || + mddev->degraded > 1) + return ERR_PTR(-EINVAL); + + /* Should check if there are write-behind devices? 
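+ *
+ * The chunk size chosen below starts at 64K and is halved until it
+ * divides the array size exactly (e.g. a 160K array would end up with
+ * a 32K chunk); if nothing of at least STRIPE_SIZE fits, the takeover
+ * is rejected.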
*/ + + chunksect = 64*2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect-1))) + chunksect >>= 1; + + if ((chunksect<<9) < STRIPE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + mddev->new_level = 5; + mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; + mddev->new_chunk_sectors = chunksect; + + return setup_conf(mddev); +} + +static void *raid5_takeover_raid6(struct mddev *mddev) +{ + int new_layout; + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC_6: + new_layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case ALGORITHM_RIGHT_ASYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + new_layout = ALGORITHM_LEFT_SYMMETRIC; + break; + case ALGORITHM_RIGHT_SYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_SYMMETRIC; + break; + case ALGORITHM_PARITY_0_6: + new_layout = ALGORITHM_PARITY_0; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 5; + mddev->new_layout = new_layout; + mddev->delta_disks = -1; + mddev->raid_disks -= 1; + return setup_conf(mddev); +} + + +static int raid5_check_reshape(struct mddev *mddev) +{ + /* For a 2-drive array, the layout and chunk size can be changed + * immediately as not restriping is needed. + * For larger arrays we record the new value - after validation + * to be used by a reshape pass. + */ + struct r5conf *conf = mddev->private; + int new_chunk = mddev->new_chunk_sectors; + + if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (!is_power_of_2(new_chunk)) + return -EINVAL; + if (new_chunk < (PAGE_SIZE>>9)) + return -EINVAL; + if (mddev->array_sectors & (new_chunk-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (mddev->raid_disks == 2) { + /* can make the change immediately */ + if (mddev->new_layout >= 0) { + conf->algorithm = mddev->new_layout; + mddev->layout = mddev->new_layout; + } + if (new_chunk > 0) { + conf->chunk_sectors = new_chunk ; + mddev->chunk_sectors = new_chunk; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } + return check_reshape(mddev); +} + +static int raid6_check_reshape(struct mddev *mddev) +{ + int new_chunk = mddev->new_chunk_sectors; + + if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (!is_power_of_2(new_chunk)) + return -EINVAL; + if (new_chunk < (PAGE_SIZE >> 9)) + return -EINVAL; + if (mddev->array_sectors & (new_chunk-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + return check_reshape(mddev); +} + +static void *raid5_takeover(struct mddev *mddev) +{ + /* raid5 can take over: + * raid0 - if there is only one strip zone - make it a raid4 layout + * raid1 - if there are two drives. We need to know the chunk size + * raid4 - trivial - just use a raid4 layout. 
+ * raid6 - Providing it is a *_6 layout + */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 5); + if (mddev->level == 1) + return raid5_takeover_raid1(mddev); + if (mddev->level == 4) { + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_level = 5; + return setup_conf(mddev); + } + if (mddev->level == 6) + return raid5_takeover_raid6(mddev); + + return ERR_PTR(-EINVAL); +} + +static void *raid4_takeover(struct mddev *mddev) +{ + /* raid4 can take over: + * raid0 - if there is only one strip zone + * raid5 - if layout is right + */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 4); + if (mddev->level == 5 && + mddev->layout == ALGORITHM_PARITY_N) { + mddev->new_layout = 0; + mddev->new_level = 4; + return setup_conf(mddev); + } + return ERR_PTR(-EINVAL); +} + +static struct md_personality raid5_personality; + +static void *raid6_takeover(struct mddev *mddev) +{ + /* Currently can only take over a raid5. We map the + * personality to an equivalent raid6 personality + * with the Q block at the end. + */ + int new_layout; + + if (mddev->pers != &raid5_personality) + return ERR_PTR(-EINVAL); + if (mddev->degraded > 1) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks > 253) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks < 3) + return ERR_PTR(-EINVAL); + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + new_layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + new_layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 6; + mddev->new_layout = new_layout; + mddev->delta_disks = 1; + mddev->raid_disks += 1; + return setup_conf(mddev); +} + + +static struct md_personality raid6_personality = +{ + .name = "raid6", + .level = 6, + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid5_add_disk, + .hot_remove_disk= raid5_remove_disk, + .spare_active = raid5_spare_active, + .sync_request = sync_request, + .resize = raid5_resize, + .size = raid5_size, + .check_reshape = raid6_check_reshape, + .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, + .quiesce = raid5_quiesce, + .takeover = raid6_takeover, +}; +static struct md_personality raid5_personality = +{ + .name = "raid5", + .level = 5, + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid5_add_disk, + .hot_remove_disk= raid5_remove_disk, + .spare_active = raid5_spare_active, + .sync_request = sync_request, + .resize = raid5_resize, + .size = raid5_size, + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, + .quiesce = raid5_quiesce, + .takeover = raid5_takeover, +}; + +static struct md_personality raid4_personality = +{ + .name = "raid4", + .level = 4, + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid5_add_disk, + .hot_remove_disk= raid5_remove_disk, + .spare_active = raid5_spare_active, + .sync_request = 
sync_request, + .resize = raid5_resize, + .size = raid5_size, + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, + .finish_reshape = raid5_finish_reshape, + .quiesce = raid5_quiesce, + .takeover = raid4_takeover, +}; + +static int __init raid5_init(void) +{ + register_md_personality(&raid6_personality); + register_md_personality(&raid5_personality); + register_md_personality(&raid4_personality); + return 0; +} + +static void raid5_exit(void) +{ + unregister_md_personality(&raid6_personality); + unregister_md_personality(&raid5_personality); + unregister_md_personality(&raid4_personality); +} + +module_init(raid5_init); +module_exit(raid5_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); +MODULE_ALIAS("md-personality-4"); /* RAID5 */ +MODULE_ALIAS("md-raid5"); +MODULE_ALIAS("md-raid4"); +MODULE_ALIAS("md-level-5"); +MODULE_ALIAS("md-level-4"); +MODULE_ALIAS("md-personality-8"); /* RAID6 */ +MODULE_ALIAS("md-raid6"); +MODULE_ALIAS("md-level-6"); + +/* This used to be two separate modules, they were: */ +MODULE_ALIAS("raid5"); +MODULE_ALIAS("raid6"); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 00000000..8d8e1393 --- /dev/null +++ b/drivers/md/raid5.h @@ -0,0 +1,519 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include <linux/raid/xor.h> +#include <linux/dmaengine.h> + +/* + * + * Each stripe contains one buffer per device. Each buffer can be in + * one of a number of states stored in "flags". Changes between + * these states happen *almost* exclusively under the protection of the + * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and + * these are not protected by STRIPE_ACTIVE. + * + * The flag bits that are used to represent these states are: + * R5_UPTODATE and R5_LOCKED + * + * State Empty == !UPTODATE, !LOCK + * We have no data, and there is no active request + * State Want == !UPTODATE, LOCK + * A read request is being submitted for this block + * State Dirty == UPTODATE, LOCK + * Some new data is in this buffer, and it is being written out + * State Clean == UPTODATE, !LOCK + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request. + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. + * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). 
+ * To distinguish these states we have a stripe bit STRIPE_INSYNC that
+ * is set whenever a write is scheduled to the spare, or to the parity
+ * disc if there is no spare. A sync request clears this bit, and
+ * when we find it set with no buffers locked, we know the sync is
+ * complete.
+ *
+ * Buffers for the md device that arrive via make_request are attached
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
+ * One list (bh_read) for read requests, one (bh_write) for write.
+ * There should never be more than one buffer on the two lists
+ * together, but that is not guaranteed, so we allow for more.
+ *
+ * If a buffer is on the read list when the associated cache buffer is
+ * Uptodate, the data is copied into the read buffer and its b_end_io
+ * routine is called. This may happen in the end_request routine only
+ * if the buffer has just successfully been read. end_request should
+ * remove the buffers from the list and then set the Uptodate bit on
+ * the buffer. Other threads may do this only if they first check
+ * that the Uptodate bit is set. Once they have checked that, they may
+ * take buffers off the read queue.
+ *
+ * When a buffer on the write list is committed for write it is copied
+ * into the cache buffer, which is then marked dirty, and moved onto a
+ * third list, the written list (bh_written). Once both the parity
+ * block and the cached buffer are successfully written, any buffer on
+ * a written list can be returned with b_end_io.
+ *
+ * The write list and read list both act as fifos. The read list,
+ * write list and written list are protected by the device_lock.
+ * The device_lock is only for list manipulations and will only be
+ * held for a very short time. It can be claimed from interrupts.
+ *
+ *
+ * Stripes in the stripe cache can be on one of two lists (or on
+ * neither). The "inactive_list" contains stripes which are not
+ * currently being used for any request. They can freely be reused
+ * for another stripe. The "handle_list" contains stripes that need
+ * to be handled in some way. Both of these are fifo queues. Each
+ * stripe is also (potentially) linked to a hash bucket in the hash
+ * table so that it can be found by sector number. Stripes that are
+ * not hashed must be on the inactive_list, and will normally be at
+ * the front. All stripes start life this way.
+ *
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
+ * device_lock.
+ * - stripes have a reference counter. If count==0, they are on a list.
+ * - If a stripe might need handling, STRIPE_HANDLE is set.
+ * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
+ * handle_list else inactive_list
+ *
+ * This, combined with the fact that STRIPE_HANDLE is only ever
+ * cleared while a stripe has a non-zero count, means that if the
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
+ * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
+ * the stripe is on inactive_list.
+ *
+ * The possible transitions are:
+ * activate an unhashed/inactive stripe (get_active_stripe())
+ * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
+ * activate a hashed, possibly active stripe (get_active_stripe())
+ * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
+ * attach a request to an active stripe (add_stripe_bh())
+ * lockdev attach-buffer unlockdev
+ * handle a stripe (handle_stripe())
+ * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
+ * (lockdev check-buffers unlockdev) ..
+ * change-state .. + * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that have activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * The stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero. + * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. This causes new read requests + * to be held off, as well as parity checks and compute block operations. + * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guaruntees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. + */ + +/* + * Operations state - intermediate states that are visible outside of + * STRIPE_ACTIVE. + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. 
+ * For simple operations like biofill and compute, which only have an _idle
+ * and a _run state, the running state is indicated with sh->state flags
+ * (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN).
+ */
+/**
+ * enum check_states - handles syncing / repairing a stripe
+ * @check_state_idle - check operations are quiesced
+ * @check_state_run - xor parity check operation is running
+ * @check_state_run_q - q-parity check operation is running
+ * @check_state_run_pq - pq dual parity check operation is running
+ * @check_state_check_result - set outside lock when check result is valid
+ * @check_state_compute_run - check failed and we are repairing
+ * @check_state_compute_result - set outside lock when compute result is valid
+ */
+enum check_states {
+ check_state_idle = 0,
+ check_state_run, /* xor parity check */
+ check_state_run_q, /* q-parity check */
+ check_state_run_pq, /* pq dual parity check */
+ check_state_check_result,
+ check_state_compute_run, /* parity repair */
+ check_state_compute_result,
+};
+
+/**
+ * enum reconstruct_states - handles writing or expanding a stripe
+ */
+enum reconstruct_states {
+ reconstruct_state_idle = 0,
+ reconstruct_state_prexor_drain_run, /* prexor-write */
+ reconstruct_state_drain_run, /* write */
+ reconstruct_state_run, /* expand */
+ reconstruct_state_prexor_drain_result,
+ reconstruct_state_drain_result,
+ reconstruct_state_result,
+};
+
+struct stripe_head {
+ struct hlist_node hash;
+ struct list_head lru; /* inactive_list or handle_list */
+ struct r5conf *raid_conf;
+ short generation; /* increments with every
+ * reshape */
+ sector_t sector; /* sector of this row */
+ short pd_idx; /* parity disk index */
+ short qd_idx; /* 'Q' disk index for raid6 */
+ short ddf_layout;/* use DDF ordering to calculate Q */
+ unsigned long state; /* state flags */
+ atomic_t count; /* nr of active threads/requests */
+ int bm_seq; /* sequence number for bitmap flushes */
+ int disks; /* disks in stripe */
+ enum check_states check_state;
+ enum reconstruct_states reconstruct_state;
+ /**
+ * struct stripe_operations
+ * @target - STRIPE_OP_COMPUTE_BLK target
+ * @target2 - 2nd compute target in the raid6 case
+ * @zero_sum_result - P and Q verification flags
+ * @request - async service request flags for raid_run_ops
+ */
+ struct stripe_operations {
+ int target, target2;
+ enum sum_check_flags zero_sum_result;
+ #ifdef CONFIG_MULTICORE_RAID456
+ unsigned long request;
+ wait_queue_head_t wait_for_ops;
+ #endif
+ } ops;
+ struct r5dev {
+ /* rreq and rvec are used for the replacement device when
+ * writing data to both devices.
+ */
+ struct bio req, rreq;
+ struct bio_vec vec, rvec;
+ struct page *page;
+ struct bio *toread, *read, *towrite, *written;
+ sector_t sector; /* sector of this page */
+ unsigned long flags;
+ } dev[1]; /* allocated with extra space depending on RAID geometry */
+};
+
+/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
+ * for handle_stripe.
+ */
+struct stripe_head_state {
+ /* 'syncing' means that we need to read all devices, either
+ * to check/correct parity, or to reconstruct a missing device.
+ * 'replacing' means we are replacing one or more drives and
+ * the source is valid at this point, so we don't need to
+ * read all devices, just the replacement targets.
+ */
+ int syncing, expanding, expanded, replacing;
+ int locked, uptodate, to_read, to_write, failed, written;
+ int to_fill, compute, req_compute, non_overwrite;
+ int failed_num[2];
+ int p_failed, q_failed;
+ int dec_preread_active;
+ unsigned long ops_request;
+
+ struct bio *return_bi;
+ struct md_rdev *blocked_rdev;
+ int handle_bad_blocks;
+};
+
+/* Flags for struct r5dev.flags */
+enum r5dev_flags {
+ R5_UPTODATE, /* page contains current data */
+ R5_LOCKED, /* IO has been submitted on "req" */
+ R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
+ R5_OVERWRITE, /* towrite covers whole page */
+/* and some that are internal to handle_stripe */
+ R5_Insync, /* rdev && rdev->in_sync at start */
+ R5_Wantread, /* want to schedule a read */
+ R5_Wantwrite,
+ R5_Overlap, /* There is a pending overlapping request
+ * on this block */
+ R5_ReadError, /* seen a read error here recently */
+ R5_ReWrite, /* have tried to over-write the read error */
+
+ R5_Expanded, /* This block now has post-expand data */
+ R5_Wantcompute, /* compute_block in progress, treat as
+ * uptodate
+ */
+ R5_Wantfill, /* dev->toread contains a bio that needs
+ * filling
+ */
+ R5_Wantdrain, /* dev->towrite needs to be drained */
+ R5_WantFUA, /* Write should be FUA */
+ R5_WriteError, /* got a write error - need to record it */
+ R5_MadeGood, /* A bad block has been fixed by writing to it */
+ R5_ReadRepl, /* Will/did read from replacement rather than orig */
+ R5_MadeGoodRepl,/* A bad block on the replacement device has been
+ * fixed by writing to it */
+ R5_NeedReplace, /* This device has a replacement which is not
+ * up-to-date at this stripe. */
+ R5_WantReplace, /* We need to update the replacement: we have read
+ * the data in, and now is a good time to write it out.
+ */
+};
+
+/*
+ * Stripe state
+ */
+enum {
+ STRIPE_ACTIVE,
+ STRIPE_HANDLE,
+ STRIPE_SYNC_REQUESTED,
+ STRIPE_SYNCING,
+ STRIPE_INSYNC,
+ STRIPE_PREREAD_ACTIVE,
+ STRIPE_DELAYED,
+ STRIPE_DEGRADED,
+ STRIPE_BIT_DELAY,
+ STRIPE_EXPANDING,
+ STRIPE_EXPAND_SOURCE,
+ STRIPE_EXPAND_READY,
+ STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */
+ STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
+ STRIPE_BIOFILL_RUN,
+ STRIPE_COMPUTE_RUN,
+ STRIPE_OPS_REQ_PENDING,
+};
+
+/*
+ * Operation request flags
+ */
+enum {
+ STRIPE_OP_BIOFILL,
+ STRIPE_OP_COMPUTE_BLK,
+ STRIPE_OP_PREXOR,
+ STRIPE_OP_BIODRAIN,
+ STRIPE_OP_RECONSTRUCT,
+ STRIPE_OP_CHECK,
+};
+/*
+ * Plugging:
+ *
+ * To improve write throughput, we need to delay the handling of some
+ * stripes until there has been a chance that several write requests
+ * for the one stripe have all been collected.
+ * In particular, any write request that would require pre-reading
+ * is put on a "delayed" queue until there are no stripes currently
+ * in a pre-read phase. Further, if the "delayed" queue is empty when
+ * a stripe is put on it, then we "plug" the queue and do not process it
+ * until an unplug call is made (i.e. until the unplug_io_fn() is called).
+ *
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
+ * it to the count of prereading stripes.
+ * When a write is initiated, or the stripe refcnt == 0 (just in case), we
+ * clear the PREREAD_ACTIVE flag and decrement the count.
+ * Whenever the 'handle' queue is empty and the device is not plugged, we
+ * move any stripes from delayed to handle, clear the DELAYED flag and set
+ * PREREAD_ACTIVE.
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
+ * PREREAD_ACTIVE is set; else we set DELAYED, which will send the stripe
+ * to the delayed queue.
+ * HANDLE gets cleared if stripe_handle leaves nothing locked.
+ * (A standalone sketch of this delayed-queue policy is shown after this
+ * header.)
+ */
+
+
+struct disk_info {
+ struct md_rdev *rdev, *replacement;
+};
+
+struct r5conf {
+ struct hlist_head *stripe_hashtbl;
+ struct mddev *mddev;
+ int chunk_sectors;
+ int level, algorithm;
+ int max_degraded;
+ int raid_disks;
+ int max_nr_stripes;
+
+ /* reshape_progress is the leading edge of a 'reshape'.
+ * It has the value MaxSector when no reshape is happening.
+ * If delta_disks < 0, it is the last sector we started work on,
+ * else it is the next sector to work on.
+ */
+ sector_t reshape_progress;
+ /* reshape_safe is the trailing edge of a reshape. We know that
+ * before (or after) this address, all reshape has completed.
+ */
+ sector_t reshape_safe;
+ int previous_raid_disks;
+ int prev_chunk_sectors;
+ int prev_algo;
+ short generation; /* increments with every reshape */
+ unsigned long reshape_checkpoint; /* Time we last updated
+ * metadata */
+
+ struct list_head handle_list; /* stripes needing handling */
+ struct list_head hold_list; /* preread ready stripes */
+ struct list_head delayed_list; /* stripes that have plugged requests */
+ struct list_head bitmap_list; /* stripes delayed awaiting bitmap update */
+ struct bio *retry_read_aligned; /* currently retrying aligned bios */
+ struct bio *retry_read_aligned_list; /* aligned bios retry list */
+ atomic_t preread_active_stripes; /* stripes with scheduled io */
+ atomic_t active_aligned_reads;
+ atomic_t pending_full_writes; /* full write backlog */
+ int bypass_count; /* bypassed prereads */
+ int bypass_threshold; /* preread nice */
+ struct list_head *last_hold; /* detect hold_list promotions */
+
+ atomic_t reshape_stripes; /* stripes with pending writes for reshape */
+ /* unfortunately we need two cache names as we temporarily have
+ * two caches.
+ */
+ int active_name;
+ char cache_name[2][32];
+ struct kmem_cache *slab_cache; /* for allocating stripes */
+
+ int seq_flush, seq_write;
+ int quiesce;
+
+ int fullsync; /* set to 1 if a full sync is needed
+ * (fresh device added).
+ * Cleared when a sync completes.
+ */
+ int recovery_disabled;
+ /* per cpu variables */
+ struct raid5_percpu {
+ struct page *spare_page; /* Used when checking P/Q in raid6 */
+ void *scribble; /* space for constructing buffer
+ * lists and performing address
+ * conversions
+ */
+ } __percpu *percpu;
+ size_t scribble_len; /* size of the scribble region; it must be
+ * associated with conf to handle
+ * cpu hotplug while reshaping
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+ struct notifier_block cpu_notify;
+#endif
+
+ /*
+ * Free stripes pool
+ */
+ atomic_t active_stripes;
+ struct list_head inactive_list;
+ wait_queue_head_t wait_for_stripe;
+ wait_queue_head_t wait_for_overlap;
+ int inactive_blocked; /* release of inactive stripes blocked,
+ * waiting for 25% to be free
+ */
+ int pool_size; /* number of disks in stripeheads in pool */
+ spinlock_t device_lock;
+ struct disk_info *disks;
+
+ /* When taking over an array from a different personality, we store
+ * the new thread here until we fully activate the array.
+ */
+ struct md_thread *thread;
+};
+
+/*
+ * Our supported algorithms
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */
+#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */
+#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */
+#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */
+
+/* Define non-rotating (raid4) algorithms. These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the '_N_*' modes
+ * of DDF.
+ * Secondly, the order of data blocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly, DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'.
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE 10 /* DDF PRL=6 RLQ=3 */
+
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6.
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6 16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
+#define ALGORITHM_LEFT_SYMMETRIC_6 18
+#define ALGORITHM_RIGHT_SYMMETRIC_6 19
+#define ALGORITHM_PARITY_0_6 20
+#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
+
+static inline int algorithm_valid_raid5(int layout)
+{
+ return (layout >= 0) &&
+ (layout <= 5);
+}
+static inline int algorithm_valid_raid6(int layout)
+{
+ return (layout >= 0 && layout <= 5)
+ ||
+ (layout >= 8 && layout <= 10)
+ ||
+ (layout >= 16 && layout <= 20);
+}
+
+static inline int algorithm_is_DDF(int layout)
+{
+ return layout >= 8 && layout <= 10;
+}
+
+extern int md_raid5_congested(struct mddev *mddev, int bits);
+extern void md_raid5_kick_device(struct r5conf *conf);
+extern int raid5_set_cache_size(struct mddev *mddev, int size);
+#endif
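Editor's note: the following standalone, user-space sketch illustrates the stripe-cache refcount/list invariant described in the comment block above (count==0 implies membership on exactly one list, chosen by the STRIPE_HANDLE-style flag). It is not driver code; all names (toy_stripe, toy_release_stripe, and the toy list type) are hypothetical, and the real driver uses struct stripe_head, atomic counts and conf->device_lock instead.

#include <pthread.h>
#include <stdio.h>

struct toy_list { struct toy_list *next; };

struct toy_stripe {
	struct toy_list lru;   /* link onto handle_list or inactive_list */
	int count;             /* active references (threads, requests, ...) */
	int handle;            /* models the STRIPE_HANDLE bit */
};

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_list handle_list   = { &handle_list };   /* empty circular list */
static struct toy_list inactive_list = { &inactive_list };

static void toy_list_add(struct toy_list *head, struct toy_list *item)
{
	item->next = head->next;
	head->next = item;
}

/*
 * release_stripe()-style logic: drop one reference; once the count hits
 * zero the stripe must land on exactly one list, and the 'handle' flag
 * decides whether that list is handle_list or inactive_list.
 */
static void toy_release_stripe(struct toy_stripe *sh)
{
	pthread_mutex_lock(&device_lock);
	if (--sh->count == 0) {
		if (sh->handle)
			toy_list_add(&handle_list, &sh->lru);
		else
			toy_list_add(&inactive_list, &sh->lru);
	}
	pthread_mutex_unlock(&device_lock);
}

int main(void)
{
	struct toy_stripe sh = { .count = 1, .handle = 1 };

	toy_release_stripe(&sh);
	printf("count=%d on_handle_list=%d\n",
	       sh.count, handle_list.next == &sh.lru);
	return 0;
}

Because STRIPE_HANDLE is only cleared while the count is non-zero, a reader that finds count==0 can infer which list the stripe is on from that flag alone, which is exactly what the sketch's final check demonstrates.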
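Editor's note: a second standalone sketch, this time of the delayed-queue ("plugging") policy described before struct disk_info. Again the names (queue_write_needing_preread, promote_delayed, the toy arrays) are hypothetical and the bookkeeping is simplified to two flags and a counter; the real driver keys this off STRIPE_DELAYED, STRIPE_PREREAD_ACTIVE and conf->preread_active_stripes.

#include <stdio.h>

#define NQ 8

struct toy_stripe {
	int delayed;          /* models STRIPE_DELAYED */
	int preread_active;   /* models STRIPE_PREREAD_ACTIVE */
};

static struct toy_stripe *delayed_q[NQ];
static int delayed_n;
static struct toy_stripe *handle_q[NQ];
static int handle_n;
static int preread_active_stripes;    /* models conf->preread_active_stripes */

/* A write that would need a pre-read is parked on the delayed queue while
 * any pre-read is in flight; otherwise it may start pre-reading at once. */
static void queue_write_needing_preread(struct toy_stripe *sh)
{
	if (preread_active_stripes > 0) {
		sh->delayed = 1;
		delayed_q[delayed_n++] = sh;
	} else {
		sh->preread_active = 1;
		preread_active_stripes++;
		handle_q[handle_n++] = sh;
	}
}

/* When the handle queue drains, promote delayed stripes: clear DELAYED,
 * set PREREAD_ACTIVE, and move them onto the handle queue. */
static void promote_delayed(void)
{
	int i;

	if (handle_n != 0)
		return;
	for (i = 0; i < delayed_n; i++) {
		struct toy_stripe *sh = delayed_q[i];

		sh->delayed = 0;
		sh->preread_active = 1;
		preread_active_stripes++;
		handle_q[handle_n++] = sh;
	}
	delayed_n = 0;
}

int main(void)
{
	struct toy_stripe a = { 0 }, b = { 0 };

	queue_write_needing_preread(&a);   /* no pre-read active: starts now */
	queue_write_needing_preread(&b);   /* pre-read active: gets delayed  */
	printf("a: preread=%d  b: delayed=%d\n", a.preread_active, b.delayed);

	handle_n = 0;                      /* pretend the handle queue drained */
	promote_delayed();
	printf("b: delayed=%d preread=%d\n", b.delayed, b.preread_active);
	return 0;
}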
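Editor's note: finally, a small usage illustration for the layout validity helpers declared at the end of the header. The two functions below merely re-state the accepted ranges so the example compiles on its own; in real code you would include raid5.h and call algorithm_valid_raid5()/algorithm_valid_raid6() directly.

#include <stdio.h>

/* Mirrors of the ranges accepted by algorithm_valid_raid5()/raid6() above. */
static int valid_raid5(int layout)
{
	return layout >= 0 && layout <= 5;
}

static int valid_raid6(int layout)
{
	return (layout >= 0 && layout <= 5) ||    /* shared raid4/raid5 layouts */
	       (layout >= 8 && layout <= 10) ||   /* DDF layouts */
	       (layout >= 16 && layout <= 20);    /* RAID5-compatible *_6 layouts */
}

int main(void)
{
	int layout;

	/* Print how each ALGORITHM_* value 0..20 is classified. */
	for (layout = 0; layout <= 20; layout++)
		printf("layout %2d: raid5=%d raid6=%d ddf=%d\n",
		       layout, valid_raid5(layout), valid_raid6(layout),
		       layout >= 8 && layout <= 10);
	return 0;
}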