Diffstat (limited to 'ANDROID_3.4.5/drivers/md')
82 files changed, 0 insertions, 63982 deletions
diff --git a/ANDROID_3.4.5/drivers/md/Kconfig b/ANDROID_3.4.5/drivers/md/Kconfig deleted file mode 100644 index 10f122a3..00000000 --- a/ANDROID_3.4.5/drivers/md/Kconfig +++ /dev/null @@ -1,393 +0,0 @@ -# -# Block device driver configuration -# - -menuconfig MD - bool "Multiple devices driver support (RAID and LVM)" - depends on BLOCK - help - Support multiple physical spindles through a single logical device. - Required for RAID and logical volume management. - -if MD - -config BLK_DEV_MD - tristate "RAID support" - ---help--- - This driver lets you combine several hard disk partitions into one - logical block device. This can be used to simply append one - partition to another one or to combine several redundant hard disks - into a RAID1/4/5 device so as to provide protection against hard - disk failures. This is called "Software RAID" since the combining of - the partitions is done by the kernel. "Hardware RAID" means that the - combining is done by a dedicated controller; if you have such a - controller, you do not need to say Y here. - - More information about Software RAID on Linux is contained in the - Software RAID mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. There you will also learn - where to get the supporting user space utilities raidtools. - - If unsure, say N. - -config MD_AUTODETECT - bool "Autodetect RAID arrays during kernel boot" - depends on BLK_DEV_MD=y - default y - ---help--- - If you say Y here, then the kernel will try to autodetect raid - arrays as part of its boot process. - - If you don't use raid and say Y, this autodetection can cause - a several-second delay in the boot time due to various - synchronisation steps that are part of this step. - - If unsure, say Y. - -config MD_LINEAR - tristate "Linear (append) mode" - depends on BLK_DEV_MD - ---help--- - If you say Y here, then your multiple devices driver will be able to - use the so-called linear mode, i.e. it will combine the hard disk - partitions by simply appending one to the other. - - To compile this as a module, choose M here: the module - will be called linear. - - If unsure, say Y. - -config MD_RAID0 - tristate "RAID-0 (striping) mode" - depends on BLK_DEV_MD - ---help--- - If you say Y here, then your multiple devices driver will be able to - use the so-called raid0 mode, i.e. it will combine the hard disk - partitions into one logical device in such a fashion as to fill them - up evenly, one chunk here and one chunk there. This will increase - the throughput rate if the partitions reside on distinct disks. - - Information about Software RAID on Linux is contained in the - Software-RAID mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. There you will also - learn where to get the supporting user space utilities raidtools. - - To compile this as a module, choose M here: the module - will be called raid0. - - If unsure, say Y. - -config MD_RAID1 - tristate "RAID-1 (mirroring) mode" - depends on BLK_DEV_MD - ---help--- - A RAID-1 set consists of several disk drives which are exact copies - of each other. In the event of a mirror failure, the RAID driver - will continue to use the operational mirrors in the set, providing - an error free MD (multiple device) to the higher levels of the - kernel. In a set with N drives, the available space is the capacity - of a single drive, and the set protects against a failure of (N - 1) - drives. 
- - Information about Software RAID on Linux is contained in the - Software-RAID mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. There you will also - learn where to get the supporting user space utilities raidtools. - - If you want to use such a RAID-1 set, say Y. To compile this code - as a module, choose M here: the module will be called raid1. - - If unsure, say Y. - -config MD_RAID10 - tristate "RAID-10 (mirrored striping) mode" - depends on BLK_DEV_MD - ---help--- - RAID-10 provides a combination of striping (RAID-0) and - mirroring (RAID-1) with easier configuration and more flexible - layout. - Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to - be the same size (or at least, only as much as the smallest device - will be used). - RAID-10 provides a variety of layouts that provide different levels - of redundancy and performance. - - RAID-10 requires mdadm-1.7.0 or later, available at: - - ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ - - If unsure, say Y. - -config MD_RAID456 - tristate "RAID-4/RAID-5/RAID-6 mode" - depends on BLK_DEV_MD - select RAID6_PQ - select ASYNC_MEMCPY - select ASYNC_XOR - select ASYNC_PQ - select ASYNC_RAID6_RECOV - ---help--- - A RAID-5 set of N drives with a capacity of C MB per drive provides - the capacity of C * (N - 1) MB, and protects against a failure - of a single drive. For a given sector (row) number, (N - 1) drives - contain data sectors, and one drive contains the parity protection. - For a RAID-4 set, the parity blocks are present on a single drive, - while a RAID-5 set distributes the parity across the drives in one - of the available parity distribution methods. - - A RAID-6 set of N drives with a capacity of C MB per drive - provides the capacity of C * (N - 2) MB, and protects - against a failure of any two drives. For a given sector - (row) number, (N - 2) drives contain data sectors, and two - drives contains two independent redundancy syndromes. Like - RAID-5, RAID-6 distributes the syndromes across the drives - in one of the available parity distribution methods. - - Information about Software RAID on Linux is contained in the - Software-RAID mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. There you will also - learn where to get the supporting user space utilities raidtools. - - If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To - compile this code as a module, choose M here: the module - will be called raid456. - - If unsure, say Y. - -config MULTICORE_RAID456 - bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" - depends on MD_RAID456 - depends on SMP - depends on EXPERIMENTAL - ---help--- - Enable the raid456 module to dispatch per-stripe raid operations to a - thread pool. - - If unsure, say N. - -config MD_MULTIPATH - tristate "Multipath I/O support" - depends on BLK_DEV_MD - help - MD_MULTIPATH provides a simple multi-path personality for use - the MD framework. It is not under active development. New - projects should consider using DM_MULTIPATH which has more - features and more testing. - - If unsure, say N. - -config MD_FAULTY - tristate "Faulty test module for MD" - depends on BLK_DEV_MD - help - The "faulty" module allows for a block device that occasionally returns - read or write errors. It is useful for testing. - - In unsure, say N. - -config BLK_DEV_DM - tristate "Device mapper support" - ---help--- - Device-mapper is a low level volume manager. It works by allowing - people to specify mappings for ranges of logical sectors. 
Various - mapping types are available, in addition people may write their own - modules containing custom mappings if they wish. - - Higher level volume managers such as LVM2 use this driver. - - To compile this as a module, choose M here: the module will be - called dm-mod. - - If unsure, say N. - -config DM_DEBUG - boolean "Device mapper debugging support" - depends on BLK_DEV_DM - ---help--- - Enable this for messages that may help debug device-mapper problems. - - If unsure, say N. - -config DM_BUFIO - tristate - depends on BLK_DEV_DM && EXPERIMENTAL - ---help--- - This interface allows you to do buffered I/O on a device and acts - as a cache, holding recently-read blocks in memory and performing - delayed writes. - -source "drivers/md/persistent-data/Kconfig" - -config DM_CRYPT - tristate "Crypt target support" - depends on BLK_DEV_DM - select CRYPTO - select CRYPTO_CBC - ---help--- - This device-mapper target allows you to create a device that - transparently encrypts the data on it. You'll need to activate - the ciphers you're going to use in the cryptoapi configuration. - - Information on how to use dm-crypt can be found on - - <http://www.saout.de/misc/dm-crypt/> - - To compile this code as a module, choose M here: the module will - be called dm-crypt. - - If unsure, say N. - -config DM_SNAPSHOT - tristate "Snapshot target" - depends on BLK_DEV_DM - ---help--- - Allow volume managers to take writable snapshots of a device. - -config DM_THIN_PROVISIONING - tristate "Thin provisioning target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL - select DM_PERSISTENT_DATA - ---help--- - Provides thin provisioning and snapshots that share a data store. - -config DM_DEBUG_BLOCK_STACK_TRACING - boolean "Keep stack trace of thin provisioning block lock holders" - depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING - select STACKTRACE - ---help--- - Enable this for messages that may help debug problems with the - block manager locking used by thin provisioning. - - If unsure, say N. - -config DM_DEBUG_SPACE_MAPS - boolean "Extra validation for thin provisioning space maps" - depends on DM_THIN_PROVISIONING - ---help--- - Enable this for messages that may help debug problems with the - space maps used by thin provisioning. - - If unsure, say N. - -config DM_MIRROR - tristate "Mirror target" - depends on BLK_DEV_DM - ---help--- - Allow volume managers to mirror logical volumes, also - needed for live data migration tools such as 'pvmove'. - -config DM_RAID - tristate "RAID 1/4/5/6 target" - depends on BLK_DEV_DM - select MD_RAID1 - select MD_RAID456 - select BLK_DEV_MD - ---help--- - A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings - - A RAID-5 set of N drives with a capacity of C MB per drive provides - the capacity of C * (N - 1) MB, and protects against a failure - of a single drive. For a given sector (row) number, (N - 1) drives - contain data sectors, and one drive contains the parity protection. - For a RAID-4 set, the parity blocks are present on a single drive, - while a RAID-5 set distributes the parity across the drives in one - of the available parity distribution methods. - - A RAID-6 set of N drives with a capacity of C MB per drive - provides the capacity of C * (N - 2) MB, and protects - against a failure of any two drives. For a given sector - (row) number, (N - 2) drives contain data sectors, and two - drives contains two independent redundancy syndromes. 
Like - RAID-5, RAID-6 distributes the syndromes across the drives - in one of the available parity distribution methods. - -config DM_LOG_USERSPACE - tristate "Mirror userspace logging (EXPERIMENTAL)" - depends on DM_MIRROR && EXPERIMENTAL && NET - select CONNECTOR - ---help--- - The userspace logging module provides a mechanism for - relaying the dm-dirty-log API to userspace. Log designs - which are more suited to userspace implementation (e.g. - shared storage logs) or experimental logs can be implemented - by leveraging this framework. - -config DM_ZERO - tristate "Zero target" - depends on BLK_DEV_DM - ---help--- - A target that discards writes, and returns all zeroes for - reads. Useful in some recovery situations. - -config DM_MULTIPATH - tristate "Multipath target" - depends on BLK_DEV_DM - # nasty syntax but means make DM_MULTIPATH independent - # of SCSI_DH if the latter isn't defined but if - # it is, DM_MULTIPATH must depend on it. We get a build - # error if SCSI_DH=m and DM_MULTIPATH=y - depends on SCSI_DH || !SCSI_DH - ---help--- - Allow volume managers to support multipath hardware. - -config DM_MULTIPATH_QL - tristate "I/O Path Selector based on the number of in-flight I/Os" - depends on DM_MULTIPATH - ---help--- - This path selector is a dynamic load balancer which selects - the path with the least number of in-flight I/Os. - - If unsure, say N. - -config DM_MULTIPATH_ST - tristate "I/O Path Selector based on the service time" - depends on DM_MULTIPATH - ---help--- - This path selector is a dynamic load balancer which selects - the path expected to complete the incoming I/O in the shortest - time. - - If unsure, say N. - -config DM_DELAY - tristate "I/O delaying target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL - ---help--- - A target that delays reads and/or writes and can send - them to different devices. Useful for testing. - - If unsure, say N. - -config DM_UEVENT - bool "DM uevents" - depends on BLK_DEV_DM - ---help--- - Generate udev events for DM events. - -config DM_FLAKEY - tristate "Flakey target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL - ---help--- - A target that intermittently fails I/O for debugging purposes. - -config DM_VERITY - tristate "Verity target support (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL - select CRYPTO - select CRYPTO_HASH - select DM_BUFIO - ---help--- - This device-mapper target creates a read-only device that - transparently validates the data on one underlying device against - a pre-generated tree of cryptographic checksums stored on a second - device. - - You'll need to activate the digests you're going to use in the - cryptoapi configuration. - - To compile this code as a module, choose M here: the module will - be called dm-verity. - - If unsure, say N. - -endif # MD diff --git a/ANDROID_3.4.5/drivers/md/Makefile b/ANDROID_3.4.5/drivers/md/Makefile deleted file mode 100644 index 8b2e0dff..00000000 --- a/ANDROID_3.4.5/drivers/md/Makefile +++ /dev/null @@ -1,49 +0,0 @@ -# -# Makefile for the kernel software RAID and LVM drivers. 
-# - -dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ - dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o -dm-multipath-y += dm-path-selector.o dm-mpath.o -dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ - dm-snap-persistent.o -dm-mirror-y += dm-raid1.o -dm-log-userspace-y \ - += dm-log-userspace-base.o dm-log-userspace-transfer.o -dm-thin-pool-y += dm-thin.o dm-thin-metadata.o -md-mod-y += md.o bitmap.o -raid456-y += raid5.o - -# Note: link order is important. All raid personalities -# and must come before md.o, as they each initialise -# themselves, and md.o may use the personalities when it -# auto-initialised. - -obj-$(CONFIG_MD_LINEAR) += linear.o -obj-$(CONFIG_MD_RAID0) += raid0.o -obj-$(CONFIG_MD_RAID1) += raid1.o -obj-$(CONFIG_MD_RAID10) += raid10.o -obj-$(CONFIG_MD_RAID456) += raid456.o -obj-$(CONFIG_MD_MULTIPATH) += multipath.o -obj-$(CONFIG_MD_FAULTY) += faulty.o -obj-$(CONFIG_BLK_DEV_MD) += md-mod.o -obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o -obj-$(CONFIG_DM_BUFIO) += dm-bufio.o -obj-$(CONFIG_DM_CRYPT) += dm-crypt.o -obj-$(CONFIG_DM_DELAY) += dm-delay.o -obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o -obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o -obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o -obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o -obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o -obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ -obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o -obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o -obj-$(CONFIG_DM_ZERO) += dm-zero.o -obj-$(CONFIG_DM_RAID) += dm-raid.o -obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o -obj-$(CONFIG_DM_VERITY) += dm-verity.o - -ifeq ($(CONFIG_DM_UEVENT),y) -dm-mod-objs += dm-uevent.o -endif diff --git a/ANDROID_3.4.5/drivers/md/bitmap.c b/ANDROID_3.4.5/drivers/md/bitmap.c deleted file mode 100644 index 17e2b472..00000000 --- a/ANDROID_3.4.5/drivers/md/bitmap.c +++ /dev/null @@ -1,2113 +0,0 @@ -/* - * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 - * - * bitmap_create - sets up the bitmap structure - * bitmap_destroy - destroys the bitmap structure - * - * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.: - * - added disk storage for bitmap - * - changes to allow various bitmap chunk sizes - */ - -/* - * Still to do: - * - * flush after percent set rather than just time based. (maybe both). - */ - -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/timer.h> -#include <linux/sched.h> -#include <linux/list.h> -#include <linux/file.h> -#include <linux/mount.h> -#include <linux/buffer_head.h> -#include <linux/seq_file.h> -#include "md.h" -#include "bitmap.h" - -static inline char *bmname(struct bitmap *bitmap) -{ - return bitmap->mddev ? 
mdname(bitmap->mddev) : "mdX"; -} - -/* - * check a page and, if necessary, allocate it (or hijack it if the alloc fails) - * - * 1) check to see if this page is allocated, if it's not then try to alloc - * 2) if the alloc fails, set the page's hijacked flag so we'll use the - * page pointer directly as a counter - * - * if we find our page, we increment the page's refcount so that it stays - * allocated while we're using it - */ -static int bitmap_checkpage(struct bitmap *bitmap, - unsigned long page, int create) -__releases(bitmap->lock) -__acquires(bitmap->lock) -{ - unsigned char *mappage; - - if (page >= bitmap->pages) { - /* This can happen if bitmap_start_sync goes beyond - * End-of-device while looking for a whole page. - * It is harmless. - */ - return -EINVAL; - } - - if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ - return 0; - - if (bitmap->bp[page].map) /* page is already allocated, just return */ - return 0; - - if (!create) - return -ENOENT; - - /* this page has not been allocated yet */ - - spin_unlock_irq(&bitmap->lock); - mappage = kzalloc(PAGE_SIZE, GFP_NOIO); - spin_lock_irq(&bitmap->lock); - - if (mappage == NULL) { - pr_debug("%s: bitmap map page allocation failed, hijacking\n", - bmname(bitmap)); - /* failed - set the hijacked flag so that we can use the - * pointer as a counter */ - if (!bitmap->bp[page].map) - bitmap->bp[page].hijacked = 1; - } else if (bitmap->bp[page].map || - bitmap->bp[page].hijacked) { - /* somebody beat us to getting the page */ - kfree(mappage); - return 0; - } else { - - /* no page was in place and we have one, so install it */ - - bitmap->bp[page].map = mappage; - bitmap->missing_pages--; - } - return 0; -} - -/* if page is completely empty, put it back on the free list, or dealloc it */ -/* if page was hijacked, unmark the flag so it might get alloced next time */ -/* Note: lock should be held when calling this */ -static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) -{ - char *ptr; - - if (bitmap->bp[page].count) /* page is still busy */ - return; - - /* page is no longer in use, it can be released */ - - if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ - bitmap->bp[page].hijacked = 0; - bitmap->bp[page].map = NULL; - } else { - /* normal case, free the page */ - ptr = bitmap->bp[page].map; - bitmap->bp[page].map = NULL; - bitmap->missing_pages++; - kfree(ptr); - } -} - -/* - * bitmap file handling - read and write the bitmap file and its superblock - */ - -/* - * basic page I/O operations - */ - -/* IO operations when bitmap is stored near all superblocks */ -static struct page *read_sb_page(struct mddev *mddev, loff_t offset, - struct page *page, - unsigned long index, int size) -{ - /* choose a good rdev and read the page from there */ - - struct md_rdev *rdev; - sector_t target; - int did_alloc = 0; - - if (!page) { - page = alloc_page(GFP_KERNEL); - if (!page) - return ERR_PTR(-ENOMEM); - did_alloc = 1; - } - - rdev_for_each(rdev, mddev) { - if (! 
test_bit(In_sync, &rdev->flags) - || test_bit(Faulty, &rdev->flags)) - continue; - - target = offset + index * (PAGE_SIZE/512); - - if (sync_page_io(rdev, target, - roundup(size, bdev_logical_block_size(rdev->bdev)), - page, READ, true)) { - page->index = index; - attach_page_buffers(page, NULL); /* so that free_buffer will - * quietly no-op */ - return page; - } - } - if (did_alloc) - put_page(page); - return ERR_PTR(-EIO); - -} - -static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - /* Iterate the disks of an mddev, using rcu to protect access to the - * linked list, and raising the refcount of devices we return to ensure - * they don't disappear while in use. - * As devices are only added or removed when raid_disk is < 0 and - * nr_pending is 0 and In_sync is clear, the entries we return will - * still be in the same position on the list when we re-enter - * list_for_each_continue_rcu. - */ - struct list_head *pos; - rcu_read_lock(); - if (rdev == NULL) - /* start at the beginning */ - pos = &mddev->disks; - else { - /* release the previous rdev and start from there. */ - rdev_dec_pending(rdev, mddev); - pos = &rdev->same_set; - } - list_for_each_continue_rcu(pos, &mddev->disks) { - rdev = list_entry(pos, struct md_rdev, same_set); - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags)) { - /* this is a usable devices */ - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - return rdev; - } - } - rcu_read_unlock(); - return NULL; -} - -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) -{ - struct md_rdev *rdev = NULL; - struct block_device *bdev; - struct mddev *mddev = bitmap->mddev; - - while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - int size = PAGE_SIZE; - loff_t offset = mddev->bitmap_info.offset; - - bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - - if (page->index == bitmap->file_pages-1) - size = roundup(bitmap->last_page_size, - bdev_logical_block_size(bdev)); - /* Just make sure we aren't corrupting data or - * metadata - */ - if (mddev->external) { - /* Bitmap could be anywhere. 
*/ - if (rdev->sb_start + offset + (page->index - * (PAGE_SIZE/512)) - > rdev->data_offset - && - rdev->sb_start + offset - < (rdev->data_offset + mddev->dev_sectors - + (PAGE_SIZE/512))) - goto bad_alignment; - } else if (offset < 0) { - /* DATA BITMAP METADATA */ - if (offset - + (long)(page->index * (PAGE_SIZE/512)) - + size/512 > 0) - /* bitmap runs in to metadata */ - goto bad_alignment; - if (rdev->data_offset + mddev->dev_sectors - > rdev->sb_start + offset) - /* data runs in to bitmap */ - goto bad_alignment; - } else if (rdev->sb_start < rdev->data_offset) { - /* METADATA BITMAP DATA */ - if (rdev->sb_start - + offset - + page->index*(PAGE_SIZE/512) + size/512 - > rdev->data_offset) - /* bitmap runs in to data */ - goto bad_alignment; - } else { - /* DATA METADATA BITMAP - no problems */ - } - md_super_write(mddev, rdev, - rdev->sb_start + offset - + page->index * (PAGE_SIZE/512), - size, - page); - } - - if (wait) - md_super_wait(mddev); - return 0; - - bad_alignment: - return -EINVAL; -} - -static void bitmap_file_kick(struct bitmap *bitmap); -/* - * write out a page to a file - */ -static void write_page(struct bitmap *bitmap, struct page *page, int wait) -{ - struct buffer_head *bh; - - if (bitmap->file == NULL) { - switch (write_sb_page(bitmap, page, wait)) { - case -EINVAL: - bitmap->flags |= BITMAP_WRITE_ERROR; - } - } else { - - bh = page_buffers(page); - - while (bh && bh->b_blocknr) { - atomic_inc(&bitmap->pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(WRITE | REQ_SYNC, bh); - bh = bh->b_this_page; - } - - if (wait) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - } - if (bitmap->flags & BITMAP_WRITE_ERROR) - bitmap_file_kick(bitmap); -} - -static void end_bitmap_write(struct buffer_head *bh, int uptodate) -{ - struct bitmap *bitmap = bh->b_private; - unsigned long flags; - - if (!uptodate) { - spin_lock_irqsave(&bitmap->lock, flags); - bitmap->flags |= BITMAP_WRITE_ERROR; - spin_unlock_irqrestore(&bitmap->lock, flags); - } - if (atomic_dec_and_test(&bitmap->pending_writes)) - wake_up(&bitmap->write_wait); -} - -/* copied from buffer.c */ -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); -} -static void free_buffers(struct page *page) -{ - struct buffer_head *bh = page_buffers(page); - - while (bh) { - struct buffer_head *next = bh->b_this_page; - free_buffer_head(bh); - bh = next; - } - __clear_page_buffers(page); - put_page(page); -} - -/* read a page from a file. - * We both read the page, and attach buffers to the page to record the - * address of each block (using bmap). These addresses will be used - * to write the block later, completely bypassing the filesystem. - * This usage is similar to how swap files are handled, and allows us - * to write to a file with no concerns of memory allocation failing. 
- */ -static struct page *read_page(struct file *file, unsigned long index, - struct bitmap *bitmap, - unsigned long count) -{ - struct page *page = NULL; - struct inode *inode = file->f_path.dentry->d_inode; - struct buffer_head *bh; - sector_t block; - - pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, - (unsigned long long)index << PAGE_SHIFT); - - page = alloc_page(GFP_KERNEL); - if (!page) - page = ERR_PTR(-ENOMEM); - if (IS_ERR(page)) - goto out; - - bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); - if (!bh) { - put_page(page); - page = ERR_PTR(-ENOMEM); - goto out; - } - attach_page_buffers(page, bh); - block = index << (PAGE_SHIFT - inode->i_blkbits); - while (bh) { - if (count == 0) - bh->b_blocknr = 0; - else { - bh->b_blocknr = bmap(inode, block); - if (bh->b_blocknr == 0) { - /* Cannot use this file! */ - free_buffers(page); - page = ERR_PTR(-EINVAL); - goto out; - } - bh->b_bdev = inode->i_sb->s_bdev; - if (count < (1<<inode->i_blkbits)) - count = 0; - else - count -= (1<<inode->i_blkbits); - - bh->b_end_io = end_bitmap_write; - bh->b_private = bitmap; - atomic_inc(&bitmap->pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(READ, bh); - } - block++; - bh = bh->b_this_page; - } - page->index = index; - - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - if (bitmap->flags & BITMAP_WRITE_ERROR) { - free_buffers(page); - page = ERR_PTR(-EIO); - } -out: - if (IS_ERR(page)) - printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", - (int)PAGE_SIZE, - (unsigned long long)index << PAGE_SHIFT, - PTR_ERR(page)); - return page; -} - -/* - * bitmap file superblock operations - */ - -/* update the event counter and sync the superblock to disk */ -void bitmap_update_sb(struct bitmap *bitmap) -{ - bitmap_super_t *sb; - - if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ - return; - if (bitmap->mddev->bitmap_info.external) - return; - if (!bitmap->sb_page) /* no superblock */ - return; - sb = kmap_atomic(bitmap->sb_page); - sb->events = cpu_to_le64(bitmap->mddev->events); - if (bitmap->mddev->events < bitmap->events_cleared) - /* rocking back to read-only */ - bitmap->events_cleared = bitmap->mddev->events; - sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - sb->state = cpu_to_le32(bitmap->flags); - /* Just in case these have been changed via sysfs: */ - sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); - sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); - kunmap_atomic(sb); - write_page(bitmap, bitmap->sb_page, 1); -} - -/* print out the bitmap file superblock */ -void bitmap_print_sb(struct bitmap *bitmap) -{ - bitmap_super_t *sb; - - if (!bitmap || !bitmap->sb_page) - return; - sb = kmap_atomic(bitmap->sb_page); - printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); - printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); - printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); - printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n", - *(__u32 *)(sb->uuid+0), - *(__u32 *)(sb->uuid+4), - *(__u32 *)(sb->uuid+8), - *(__u32 *)(sb->uuid+12)); - printk(KERN_DEBUG " events: %llu\n", - (unsigned long long) le64_to_cpu(sb->events)); - printk(KERN_DEBUG "events cleared: %llu\n", - (unsigned long long) le64_to_cpu(sb->events_cleared)); - printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state)); - printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize)); - printk(KERN_DEBUG " daemon sleep: %ds\n", 
le32_to_cpu(sb->daemon_sleep)); - printk(KERN_DEBUG " sync size: %llu KB\n", - (unsigned long long)le64_to_cpu(sb->sync_size)/2); - printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb); -} - -/* - * bitmap_new_disk_sb - * @bitmap - * - * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb - * reads and verifies the on-disk bitmap superblock and populates bitmap_info. - * This function verifies 'bitmap_info' and populates the on-disk bitmap - * structure, which is to be written to disk. - * - * Returns: 0 on success, -Exxx on error - */ -static int bitmap_new_disk_sb(struct bitmap *bitmap) -{ - bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep, write_behind; - int err = -EINVAL; - - bitmap->sb_page = alloc_page(GFP_KERNEL); - if (IS_ERR(bitmap->sb_page)) { - err = PTR_ERR(bitmap->sb_page); - bitmap->sb_page = NULL; - return err; - } - bitmap->sb_page->index = 0; - - sb = kmap_atomic(bitmap->sb_page); - - sb->magic = cpu_to_le32(BITMAP_MAGIC); - sb->version = cpu_to_le32(BITMAP_MAJOR_HI); - - chunksize = bitmap->mddev->bitmap_info.chunksize; - BUG_ON(!chunksize); - if (!is_power_of_2(chunksize)) { - kunmap_atomic(sb); - printk(KERN_ERR "bitmap chunksize not a power of 2\n"); - return -EINVAL; - } - sb->chunksize = cpu_to_le32(chunksize); - - daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; - if (!daemon_sleep || - (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { - printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); - daemon_sleep = 5 * HZ; - } - sb->daemon_sleep = cpu_to_le32(daemon_sleep); - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; - - /* - * FIXME: write_behind for RAID1. If not specified, what - * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. - */ - write_behind = bitmap->mddev->bitmap_info.max_write_behind; - if (write_behind > COUNTER_MAX) - write_behind = COUNTER_MAX / 2; - sb->write_behind = cpu_to_le32(write_behind); - bitmap->mddev->bitmap_info.max_write_behind = write_behind; - - /* keep the array size field of the bitmap superblock up to date */ - sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); - - memcpy(sb->uuid, bitmap->mddev->uuid, 16); - - bitmap->flags |= BITMAP_STALE; - sb->state |= cpu_to_le32(BITMAP_STALE); - bitmap->events_cleared = bitmap->mddev->events; - sb->events_cleared = cpu_to_le64(bitmap->mddev->events); - - kunmap_atomic(sb); - - return 0; -} - -/* read the superblock from the bitmap file and initialize some bitmap fields */ -static int bitmap_read_sb(struct bitmap *bitmap) -{ - char *reason = NULL; - bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep, write_behind; - unsigned long long events; - int err = -EINVAL; - - /* page 0 is the superblock, read it... */ - if (bitmap->file) { - loff_t isize = i_size_read(bitmap->file->f_mapping->host); - int bytes = isize > PAGE_SIZE ? 
PAGE_SIZE : isize; - - bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); - } else { - bitmap->sb_page = read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - NULL, - 0, sizeof(bitmap_super_t)); - } - if (IS_ERR(bitmap->sb_page)) { - err = PTR_ERR(bitmap->sb_page); - bitmap->sb_page = NULL; - return err; - } - - sb = kmap_atomic(bitmap->sb_page); - - chunksize = le32_to_cpu(sb->chunksize); - daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; - write_behind = le32_to_cpu(sb->write_behind); - - /* verify that the bitmap-specific fields are valid */ - if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) - reason = "bad magic"; - else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || - le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) - reason = "unrecognized superblock version"; - else if (chunksize < 512) - reason = "bitmap chunksize too small"; - else if (!is_power_of_2(chunksize)) - reason = "bitmap chunksize not a power of 2"; - else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) - reason = "daemon sleep period out of range"; - else if (write_behind > COUNTER_MAX) - reason = "write-behind limit out of range (0 - 16383)"; - if (reason) { - printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", - bmname(bitmap), reason); - goto out; - } - - /* keep the array size field of the bitmap superblock up to date */ - sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); - - if (bitmap->mddev->persistent) { - /* - * We have a persistent array superblock, so compare the - * bitmap's UUID and event counter to the mddev's - */ - if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { - printk(KERN_INFO - "%s: bitmap superblock UUID mismatch\n", - bmname(bitmap)); - goto out; - } - events = le64_to_cpu(sb->events); - if (events < bitmap->mddev->events) { - printk(KERN_INFO - "%s: bitmap file is out of date (%llu < %llu) " - "-- forcing full recovery\n", - bmname(bitmap), events, - (unsigned long long) bitmap->mddev->events); - sb->state |= cpu_to_le32(BITMAP_STALE); - } - } - - /* assign fields using values from superblock */ - bitmap->mddev->bitmap_info.chunksize = chunksize; - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; - bitmap->mddev->bitmap_info.max_write_behind = write_behind; - bitmap->flags |= le32_to_cpu(sb->state); - if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) - bitmap->flags |= BITMAP_HOSTENDIAN; - bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - if (bitmap->flags & BITMAP_STALE) - bitmap->events_cleared = bitmap->mddev->events; - err = 0; -out: - kunmap_atomic(sb); - if (err) - bitmap_print_sb(bitmap); - return err; -} - -enum bitmap_mask_op { - MASK_SET, - MASK_UNSET -}; - -/* record the state of the bitmap in the superblock. Return the old value */ -static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, - enum bitmap_mask_op op) -{ - bitmap_super_t *sb; - int old; - - if (!bitmap->sb_page) /* can't set the state */ - return 0; - sb = kmap_atomic(bitmap->sb_page); - old = le32_to_cpu(sb->state) & bits; - switch (op) { - case MASK_SET: - sb->state |= cpu_to_le32(bits); - bitmap->flags |= bits; - break; - case MASK_UNSET: - sb->state &= cpu_to_le32(~bits); - bitmap->flags &= ~bits; - break; - default: - BUG(); - } - kunmap_atomic(sb); - return old; -} - -/* - * general bitmap file operations - */ - -/* - * on-disk bitmap: - * - * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap - * file a page at a time. There's a superblock at the start of the file. 
- */ -/* calculate the index of the page that contains this bit */ -static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) -{ - if (!bitmap->mddev->bitmap_info.external) - chunk += sizeof(bitmap_super_t) << 3; - return chunk >> PAGE_BIT_SHIFT; -} - -/* calculate the (bit) offset of this bit within a page */ -static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) -{ - if (!bitmap->mddev->bitmap_info.external) - chunk += sizeof(bitmap_super_t) << 3; - return chunk & (PAGE_BITS - 1); -} - -/* - * return a pointer to the page in the filemap that contains the given bit - * - * this lookup is complicated by the fact that the bitmap sb might be exactly - * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page - * 0 or page 1 - */ -static inline struct page *filemap_get_page(struct bitmap *bitmap, - unsigned long chunk) -{ - if (file_page_index(bitmap, chunk) >= bitmap->file_pages) - return NULL; - return bitmap->filemap[file_page_index(bitmap, chunk) - - file_page_index(bitmap, 0)]; -} - -static void bitmap_file_unmap(struct bitmap *bitmap) -{ - struct page **map, *sb_page; - unsigned long *attr; - int pages; - unsigned long flags; - - spin_lock_irqsave(&bitmap->lock, flags); - map = bitmap->filemap; - bitmap->filemap = NULL; - attr = bitmap->filemap_attr; - bitmap->filemap_attr = NULL; - pages = bitmap->file_pages; - bitmap->file_pages = 0; - sb_page = bitmap->sb_page; - bitmap->sb_page = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); - - while (pages--) - if (map[pages] != sb_page) /* 0 is sb_page, release it below */ - free_buffers(map[pages]); - kfree(map); - kfree(attr); - - if (sb_page) - free_buffers(sb_page); -} - -static void bitmap_file_put(struct bitmap *bitmap) -{ - struct file *file; - unsigned long flags; - - spin_lock_irqsave(&bitmap->lock, flags); - file = bitmap->file; - bitmap->file = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); - - if (file) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - bitmap_file_unmap(bitmap); - - if (file) { - struct inode *inode = file->f_path.dentry->d_inode; - invalidate_mapping_pages(inode->i_mapping, 0, -1); - fput(file); - } -} - -/* - * bitmap_file_kick - if an error occurs while manipulating the bitmap file - * then it is no longer reliable, so we stop using it and we mark the file - * as failed in the superblock - */ -static void bitmap_file_kick(struct bitmap *bitmap) -{ - char *path, *ptr = NULL; - - if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { - bitmap_update_sb(bitmap); - - if (bitmap->file) { - path = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (path) - ptr = d_path(&bitmap->file->f_path, path, - PAGE_SIZE); - - printk(KERN_ALERT - "%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), IS_ERR(ptr) ? "" : ptr); - - kfree(path); - } else - printk(KERN_ALERT - "%s: disabling internal bitmap due to errors\n", - bmname(bitmap)); - } - - bitmap_file_put(bitmap); - - return; -} - -enum bitmap_page_attr { - BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ - BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned. - * i.e. counter is 1 or 2. 
*/ - BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ -}; - -static inline void set_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) -{ - __set_bit((page->index<<2) + attr, bitmap->filemap_attr); -} - -static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) -{ - __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); -} - -static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) -{ - return test_bit((page->index<<2) + attr, bitmap->filemap_attr); -} - -/* - * bitmap_file_set_bit -- called before performing a write to the md device - * to set (and eventually sync) a particular bit in the bitmap file - * - * we set the bit immediately, then we record the page number so that - * when an unplug occurs, we can flush the dirty pages out to disk - */ -static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) -{ - unsigned long bit; - struct page *page; - void *kaddr; - unsigned long chunk = block >> bitmap->chunkshift; - - if (!bitmap->filemap) - return; - - page = filemap_get_page(bitmap, chunk); - if (!page) - return; - bit = file_page_offset(bitmap, chunk); - - /* set the bit */ - kaddr = kmap_atomic(page); - if (bitmap->flags & BITMAP_HOSTENDIAN) - set_bit(bit, kaddr); - else - __set_bit_le(bit, kaddr); - kunmap_atomic(kaddr); - pr_debug("set file bit %lu page %lu\n", bit, page->index); - /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); -} - -/* this gets called when the md device is ready to unplug its underlying - * (slave) device queues -- before we let any writes go down, we need to - * sync the dirty pages of the bitmap file to disk */ -void bitmap_unplug(struct bitmap *bitmap) -{ - unsigned long i, flags; - int dirty, need_write; - struct page *page; - int wait = 0; - - if (!bitmap) - return; - - /* look at each page to see if there are any set bits that need to be - * flushed out to disk */ - for (i = 0; i < bitmap->file_pages; i++) { - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->filemap) { - spin_unlock_irqrestore(&bitmap->lock, flags); - return; - } - page = bitmap->filemap[i]; - dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); - need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); - clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - if (dirty) - wait = 1; - spin_unlock_irqrestore(&bitmap->lock, flags); - - if (dirty || need_write) - write_page(bitmap, page, 0); - } - if (wait) { /* if any writes were performed, we need to wait on them */ - if (bitmap->file) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - else - md_super_wait(bitmap->mddev); - } - if (bitmap->flags & BITMAP_WRITE_ERROR) - bitmap_file_kick(bitmap); -} -EXPORT_SYMBOL(bitmap_unplug); - -static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); -/* * bitmap_init_from_disk -- called at bitmap_create time to initialize - * the in-memory bitmap from the on-disk bitmap -- also, sets up the - * memory mapping of the bitmap file - * Special cases: - * if there's no bitmap file, or if the bitmap file had been - * previously kicked from the array, we mark all the bits as - * 1's in order to cause a full resync. - * - * We ignore all bits for sectors that end earlier than 'start'. 
- * This is used when reading an out-of-date bitmap... - */ -static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) -{ - unsigned long i, chunks, index, oldindex, bit; - struct page *page = NULL, *oldpage = NULL; - unsigned long num_pages, bit_cnt = 0; - struct file *file; - unsigned long bytes, offset; - int outofdate; - int ret = -ENOSPC; - void *paddr; - - chunks = bitmap->chunks; - file = bitmap->file; - - BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); - - outofdate = bitmap->flags & BITMAP_STALE; - if (outofdate) - printk(KERN_INFO "%s: bitmap file is out of date, doing full " - "recovery\n", bmname(bitmap)); - - bytes = DIV_ROUND_UP(bitmap->chunks, 8); - if (!bitmap->mddev->bitmap_info.external) - bytes += sizeof(bitmap_super_t); - - num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); - - if (file && i_size_read(file->f_mapping->host) < bytes) { - printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", - bmname(bitmap), - (unsigned long) i_size_read(file->f_mapping->host), - bytes); - goto err; - } - - ret = -ENOMEM; - - bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); - if (!bitmap->filemap) - goto err; - - /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ - bitmap->filemap_attr = kzalloc( - roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), - GFP_KERNEL); - if (!bitmap->filemap_attr) - goto err; - - oldindex = ~0L; - - for (i = 0; i < chunks; i++) { - int b; - index = file_page_index(bitmap, i); - bit = file_page_offset(bitmap, i); - if (index != oldindex) { /* this is a new page, read it in */ - int count; - /* unmap the old page, we're done with it */ - if (index == num_pages-1) - count = bytes - index * PAGE_SIZE; - else - count = PAGE_SIZE; - if (index == 0 && bitmap->sb_page) { - /* - * if we're here then the superblock page - * contains some bits (PAGE_SIZE != sizeof sb) - * we've already read it in, so just use it - */ - page = bitmap->sb_page; - offset = sizeof(bitmap_super_t); - if (!file) - page = read_sb_page( - bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index, count); - } else if (file) { - page = read_page(file, index, bitmap, count); - offset = 0; - } else { - page = read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - NULL, - index, count); - offset = 0; - } - if (IS_ERR(page)) { /* read error */ - ret = PTR_ERR(page); - goto err; - } - - oldindex = index; - oldpage = page; - - bitmap->filemap[bitmap->file_pages++] = page; - bitmap->last_page_size = count; - - if (outofdate) { - /* - * if bitmap is out of date, dirty the - * whole page and write it out - */ - paddr = kmap_atomic(page); - memset(paddr + offset, 0xff, - PAGE_SIZE - offset); - kunmap_atomic(paddr); - write_page(bitmap, page, 1); - - ret = -EIO; - if (bitmap->flags & BITMAP_WRITE_ERROR) - goto err; - } - } - paddr = kmap_atomic(page); - if (bitmap->flags & BITMAP_HOSTENDIAN) - b = test_bit(bit, paddr); - else - b = test_bit_le(bit, paddr); - kunmap_atomic(paddr); - if (b) { - /* if the disk bit is set, set the memory bit */ - int needed = ((sector_t)(i+1) << bitmap->chunkshift - >= start); - bitmap_set_memory_bits(bitmap, - (sector_t)i << bitmap->chunkshift, - needed); - bit_cnt++; - } - } - - /* everything went OK */ - ret = 0; - bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); - - if (bit_cnt) { /* Kick recovery if any bits were set */ - set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - md_wakeup_thread(bitmap->mddev->thread); - } - - printk(KERN_INFO "%s: bitmap initialized 
from disk: " - "read %lu/%lu pages, set %lu of %lu bits\n", - bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); - - return 0; - - err: - printk(KERN_INFO "%s: bitmap initialisation failed: %d\n", - bmname(bitmap), ret); - return ret; -} - -void bitmap_write_all(struct bitmap *bitmap) -{ - /* We don't actually write all bitmap blocks here, - * just flag them as needing to be written - */ - int i; - - spin_lock_irq(&bitmap->lock); - for (i = 0; i < bitmap->file_pages; i++) - set_page_attr(bitmap, bitmap->filemap[i], - BITMAP_PAGE_NEEDWRITE); - bitmap->allclean = 0; - spin_unlock_irq(&bitmap->lock); -} - -static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) -{ - sector_t chunk = offset >> bitmap->chunkshift; - unsigned long page = chunk >> PAGE_COUNTER_SHIFT; - bitmap->bp[page].count += inc; - bitmap_checkfree(bitmap, page); -} -static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, - sector_t offset, sector_t *blocks, - int create); - -/* - * bitmap daemon -- periodically wakes up to clean bits and flush pages - * out to disk - */ - -void bitmap_daemon_work(struct mddev *mddev) -{ - struct bitmap *bitmap; - unsigned long j; - unsigned long flags; - struct page *page = NULL, *lastpage = NULL; - sector_t blocks; - void *paddr; - - /* Use a mutex to guard daemon_work against - * bitmap_destroy. - */ - mutex_lock(&mddev->bitmap_info.mutex); - bitmap = mddev->bitmap; - if (bitmap == NULL) { - mutex_unlock(&mddev->bitmap_info.mutex); - return; - } - if (time_before(jiffies, bitmap->daemon_lastrun - + mddev->bitmap_info.daemon_sleep)) - goto done; - - bitmap->daemon_lastrun = jiffies; - if (bitmap->allclean) { - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - goto done; - } - bitmap->allclean = 1; - - spin_lock_irqsave(&bitmap->lock, flags); - for (j = 0; j < bitmap->chunks; j++) { - bitmap_counter_t *bmc; - if (!bitmap->filemap) - /* error or shutdown */ - break; - - page = filemap_get_page(bitmap, j); - - if (page != lastpage) { - /* skip this page unless it's marked as needing cleaning */ - if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) { - int need_write = test_page_attr(bitmap, page, - BITMAP_PAGE_NEEDWRITE); - if (need_write) - clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - - spin_unlock_irqrestore(&bitmap->lock, flags); - if (need_write) - write_page(bitmap, page, 0); - spin_lock_irqsave(&bitmap->lock, flags); - j |= (PAGE_BITS - 1); - continue; - } - - /* grab the new page, sync and release the old */ - if (lastpage != NULL) { - if (test_page_attr(bitmap, lastpage, - BITMAP_PAGE_NEEDWRITE)) { - clear_page_attr(bitmap, lastpage, - BITMAP_PAGE_NEEDWRITE); - spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); - } else { - set_page_attr(bitmap, lastpage, - BITMAP_PAGE_NEEDWRITE); - bitmap->allclean = 0; - spin_unlock_irqrestore(&bitmap->lock, flags); - } - } else - spin_unlock_irqrestore(&bitmap->lock, flags); - lastpage = page; - - /* We are possibly going to clear some bits, so make - * sure that events_cleared is up-to-date. 
- */ - if (bitmap->need_sync && - mddev->bitmap_info.external == 0) { - bitmap_super_t *sb; - bitmap->need_sync = 0; - sb = kmap_atomic(bitmap->sb_page); - sb->events_cleared = - cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb); - write_page(bitmap, bitmap->sb_page, 1); - } - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->need_sync) - clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING); - else - bitmap->allclean = 0; - } - bmc = bitmap_get_counter(bitmap, - (sector_t)j << bitmap->chunkshift, - &blocks, 0); - if (!bmc) - j |= PAGE_COUNTER_MASK; - else if (*bmc) { - if (*bmc == 1 && !bitmap->need_sync) { - /* we can clear the bit */ - *bmc = 0; - bitmap_count_page(bitmap, - (sector_t)j << bitmap->chunkshift, - -1); - - /* clear the bit */ - paddr = kmap_atomic(page); - if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(bitmap, j), - paddr); - else - __clear_bit_le( - file_page_offset(bitmap, - j), - paddr); - kunmap_atomic(paddr); - } else if (*bmc <= 2) { - *bmc = 1; /* maybe clear the bit next time */ - set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); - bitmap->allclean = 0; - } - } - } - spin_unlock_irqrestore(&bitmap->lock, flags); - - /* now sync the final page */ - if (lastpage != NULL) { - spin_lock_irqsave(&bitmap->lock, flags); - if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { - clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); - } else { - set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - bitmap->allclean = 0; - spin_unlock_irqrestore(&bitmap->lock, flags); - } - } - - done: - if (bitmap->allclean == 0) - mddev->thread->timeout = - mddev->bitmap_info.daemon_sleep; - mutex_unlock(&mddev->bitmap_info.mutex); -} - -static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, - sector_t offset, sector_t *blocks, - int create) -__releases(bitmap->lock) -__acquires(bitmap->lock) -{ - /* If 'create', we might release the lock and reclaim it. - * The lock must have been taken with interrupts enabled. - * If !create, we don't release the lock. - */ - sector_t chunk = offset >> bitmap->chunkshift; - unsigned long page = chunk >> PAGE_COUNTER_SHIFT; - unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; - sector_t csize; - int err; - - err = bitmap_checkpage(bitmap, page, create); - - if (bitmap->bp[page].hijacked || - bitmap->bp[page].map == NULL) - csize = ((sector_t)1) << (bitmap->chunkshift + - PAGE_COUNTER_SHIFT - 1); - else - csize = ((sector_t)1) << bitmap->chunkshift; - *blocks = csize - (offset & (csize - 1)); - - if (err < 0) - return NULL; - - /* now locked ... */ - - if (bitmap->bp[page].hijacked) { /* hijacked pointer */ - /* should we use the first or second counter field - * of the hijacked pointer? 
*/ - int hi = (pageoff > PAGE_COUNTER_MASK); - return &((bitmap_counter_t *) - &bitmap->bp[page].map)[hi]; - } else /* page is allocated */ - return (bitmap_counter_t *) - &(bitmap->bp[page].map[pageoff]); -} - -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) -{ - if (!bitmap) - return 0; - - if (behind) { - int bw; - atomic_inc(&bitmap->behind_writes); - bw = atomic_read(&bitmap->behind_writes); - if (bw > bitmap->behind_writes_used) - bitmap->behind_writes_used = bw; - - pr_debug("inc write-behind count %d/%lu\n", - bw, bitmap->mddev->bitmap_info.max_write_behind); - } - - while (sectors) { - sector_t blocks; - bitmap_counter_t *bmc; - - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); - if (!bmc) { - spin_unlock_irq(&bitmap->lock); - return 0; - } - - if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { - DEFINE_WAIT(__wait); - /* note that it is safe to do the prepare_to_wait - * after the test as long as we do it before dropping - * the spinlock. - */ - prepare_to_wait(&bitmap->overflow_wait, &__wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&bitmap->lock); - io_schedule(); - finish_wait(&bitmap->overflow_wait, &__wait); - continue; - } - - switch (*bmc) { - case 0: - bitmap_file_set_bit(bitmap, offset); - bitmap_count_page(bitmap, offset, 1); - /* fall through */ - case 1: - *bmc = 2; - } - - (*bmc)++; - - spin_unlock_irq(&bitmap->lock); - - offset += blocks; - if (sectors > blocks) - sectors -= blocks; - else - sectors = 0; - } - return 0; -} -EXPORT_SYMBOL(bitmap_startwrite); - -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success, int behind) -{ - if (!bitmap) - return; - if (behind) { - if (atomic_dec_and_test(&bitmap->behind_writes)) - wake_up(&bitmap->behind_wait); - pr_debug("dec write-behind count %d/%lu\n", - atomic_read(&bitmap->behind_writes), - bitmap->mddev->bitmap_info.max_write_behind); - } - - while (sectors) { - sector_t blocks; - unsigned long flags; - bitmap_counter_t *bmc; - - spin_lock_irqsave(&bitmap->lock, flags); - bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); - if (!bmc) { - spin_unlock_irqrestore(&bitmap->lock, flags); - return; - } - - if (success && !bitmap->mddev->degraded && - bitmap->events_cleared < bitmap->mddev->events) { - bitmap->events_cleared = bitmap->mddev->events; - bitmap->need_sync = 1; - sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); - } - - if (!success && !NEEDED(*bmc)) - *bmc |= NEEDED_MASK; - - if (COUNTER(*bmc) == COUNTER_MAX) - wake_up(&bitmap->overflow_wait); - - (*bmc)--; - if (*bmc <= 2) { - set_page_attr(bitmap, - filemap_get_page( - bitmap, - offset >> bitmap->chunkshift), - BITMAP_PAGE_PENDING); - bitmap->allclean = 0; - } - spin_unlock_irqrestore(&bitmap->lock, flags); - offset += blocks; - if (sectors > blocks) - sectors -= blocks; - else - sectors = 0; - } -} -EXPORT_SYMBOL(bitmap_endwrite); - -static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) -{ - bitmap_counter_t *bmc; - int rv; - if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ - *blocks = 1024; - return 1; /* always resync if no bitmap */ - } - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, blocks, 0); - rv = 0; - if (bmc) { - /* locked */ - if (RESYNC(*bmc)) - rv = 1; - else if (NEEDED(*bmc)) { - rv = 1; - if (!degraded) { /* don't set/clear bits if degraded */ - *bmc |= RESYNC_MASK; - *bmc &= ~NEEDED_MASK; - } - } - } - 
spin_unlock_irq(&bitmap->lock); - return rv; -} - -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) -{ - /* bitmap_start_sync must always report on multiples of whole - * pages, otherwise resync (which is very PAGE_SIZE based) will - * get confused. - * So call __bitmap_start_sync repeatedly (if needed) until - * At least PAGE_SIZE>>9 blocks are covered. - * Return the 'or' of the result. - */ - int rv = 0; - sector_t blocks1; - - *blocks = 0; - while (*blocks < (PAGE_SIZE>>9)) { - rv |= __bitmap_start_sync(bitmap, offset, - &blocks1, degraded); - offset += blocks1; - *blocks += blocks1; - } - return rv; -} -EXPORT_SYMBOL(bitmap_start_sync); - -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) -{ - bitmap_counter_t *bmc; - unsigned long flags; - - if (bitmap == NULL) { - *blocks = 1024; - return; - } - spin_lock_irqsave(&bitmap->lock, flags); - bmc = bitmap_get_counter(bitmap, offset, blocks, 0); - if (bmc == NULL) - goto unlock; - /* locked */ - if (RESYNC(*bmc)) { - *bmc &= ~RESYNC_MASK; - - if (!NEEDED(*bmc) && aborted) - *bmc |= NEEDED_MASK; - else { - if (*bmc <= 2) { - set_page_attr(bitmap, - filemap_get_page(bitmap, offset >> bitmap->chunkshift), - BITMAP_PAGE_PENDING); - bitmap->allclean = 0; - } - } - } - unlock: - spin_unlock_irqrestore(&bitmap->lock, flags); -} -EXPORT_SYMBOL(bitmap_end_sync); - -void bitmap_close_sync(struct bitmap *bitmap) -{ - /* Sync has finished, and any bitmap chunks that weren't synced - * properly have been aborted. It remains to us to clear the - * RESYNC bit wherever it is still on - */ - sector_t sector = 0; - sector_t blocks; - if (!bitmap) - return; - while (sector < bitmap->mddev->resync_max_sectors) { - bitmap_end_sync(bitmap, sector, &blocks, 0); - sector += blocks; - } -} -EXPORT_SYMBOL(bitmap_close_sync); - -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) -{ - sector_t s = 0; - sector_t blocks; - - if (!bitmap) - return; - if (sector == 0) { - bitmap->last_end_sync = jiffies; - return; - } - if (time_before(jiffies, (bitmap->last_end_sync - + bitmap->mddev->bitmap_info.daemon_sleep))) - return; - wait_event(bitmap->mddev->recovery_wait, - atomic_read(&bitmap->mddev->recovery_active) == 0); - - bitmap->mddev->curr_resync_completed = sector; - set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); - sector &= ~((1ULL << bitmap->chunkshift) - 1); - s = 0; - while (s < sector && s < bitmap->mddev->resync_max_sectors) { - bitmap_end_sync(bitmap, s, &blocks, 0); - s += blocks; - } - bitmap->last_end_sync = jiffies; - sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); -} -EXPORT_SYMBOL(bitmap_cond_end_sync); - -static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) -{ - /* For each chunk covered by any of these sectors, set the - * counter to 1 and set resync_needed. They should all - * be 0 at this point - */ - - sector_t secs; - bitmap_counter_t *bmc; - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, &secs, 1); - if (!bmc) { - spin_unlock_irq(&bitmap->lock); - return; - } - if (!*bmc) { - struct page *page; - *bmc = 2 | (needed ? 
NEEDED_MASK : 0); - bitmap_count_page(bitmap, offset, 1); - page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); - set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); - bitmap->allclean = 0; - } - spin_unlock_irq(&bitmap->lock); -} - -/* dirty the memory and file bits for bitmap chunks "s" to "e" */ -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) -{ - unsigned long chunk; - - for (chunk = s; chunk <= e; chunk++) { - sector_t sec = (sector_t)chunk << bitmap->chunkshift; - bitmap_set_memory_bits(bitmap, sec, 1); - spin_lock_irq(&bitmap->lock); - bitmap_file_set_bit(bitmap, sec); - spin_unlock_irq(&bitmap->lock); - if (sec < bitmap->mddev->recovery_cp) - /* We are asserting that the array is dirty, - * so move the recovery_cp address back so - * that it is obvious that it is dirty - */ - bitmap->mddev->recovery_cp = sec; - } -} - -/* - * flush out any pending updates - */ -void bitmap_flush(struct mddev *mddev) -{ - struct bitmap *bitmap = mddev->bitmap; - long sleep; - - if (!bitmap) /* there was no bitmap */ - return; - - /* run the daemon_work three time to ensure everything is flushed - * that can be - */ - sleep = mddev->bitmap_info.daemon_sleep * 2; - bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); - bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); - bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); - bitmap_update_sb(bitmap); -} - -/* - * free memory that was allocated - */ -static void bitmap_free(struct bitmap *bitmap) -{ - unsigned long k, pages; - struct bitmap_page *bp; - - if (!bitmap) /* there was no bitmap */ - return; - - /* release the bitmap file and kill the daemon */ - bitmap_file_put(bitmap); - - bp = bitmap->bp; - pages = bitmap->pages; - - /* free all allocated memory */ - - if (bp) /* deallocate the page memory */ - for (k = 0; k < pages; k++) - if (bp[k].map && !bp[k].hijacked) - kfree(bp[k].map); - kfree(bp); - kfree(bitmap); -} - -void bitmap_destroy(struct mddev *mddev) -{ - struct bitmap *bitmap = mddev->bitmap; - - if (!bitmap) /* there was no bitmap */ - return; - - mutex_lock(&mddev->bitmap_info.mutex); - mddev->bitmap = NULL; /* disconnect from the md device */ - mutex_unlock(&mddev->bitmap_info.mutex); - if (mddev->thread) - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - - if (bitmap->sysfs_can_clear) - sysfs_put(bitmap->sysfs_can_clear); - - bitmap_free(bitmap); -} - -/* - * initialize the bitmap structure - * if this returns an error, bitmap_destroy must be called to do clean up - */ -int bitmap_create(struct mddev *mddev) -{ - struct bitmap *bitmap; - sector_t blocks = mddev->resync_max_sectors; - unsigned long chunks; - unsigned long pages; - struct file *file = mddev->bitmap_info.file; - int err; - struct sysfs_dirent *bm = NULL; - - BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); - - if (!file - && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ - return 0; - - BUG_ON(file && mddev->bitmap_info.offset); - - bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - spin_lock_init(&bitmap->lock); - atomic_set(&bitmap->pending_writes, 0); - init_waitqueue_head(&bitmap->write_wait); - init_waitqueue_head(&bitmap->overflow_wait); - init_waitqueue_head(&bitmap->behind_wait); - - bitmap->mddev = mddev; - - if (mddev->kobj.sd) - bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); - if (bm) { - bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear"); - sysfs_put(bm); - } else - bitmap->sysfs_can_clear = NULL; - - bitmap->file = 
file; - if (file) { - get_file(file); - /* As future accesses to this file will use bmap, - * and bypass the page cache, we must sync the file - * first. - */ - vfs_fsync(file, 1); - } - /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ - if (!mddev->bitmap_info.external) { - /* - * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is - * instructing us to create a new on-disk bitmap instance. - */ - if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) - err = bitmap_new_disk_sb(bitmap); - else - err = bitmap_read_sb(bitmap); - } else { - err = 0; - if (mddev->bitmap_info.chunksize == 0 || - mddev->bitmap_info.daemon_sleep == 0) - /* chunksize and time_base need to be - * set first. */ - err = -EINVAL; - } - if (err) - goto error; - - bitmap->daemon_lastrun = jiffies; - bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) - - BITMAP_BLOCK_SHIFT); - - chunks = (blocks + (1 << bitmap->chunkshift) - 1) >> - bitmap->chunkshift; - pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; - - BUG_ON(!pages); - - bitmap->chunks = chunks; - bitmap->pages = pages; - bitmap->missing_pages = pages; - - bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); - - err = -ENOMEM; - if (!bitmap->bp) - goto error; - - printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", - pages, bmname(bitmap)); - - mddev->bitmap = bitmap; - - - return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; - - error: - bitmap_free(bitmap); - return err; -} - -int bitmap_load(struct mddev *mddev) -{ - int err = 0; - sector_t start = 0; - sector_t sector = 0; - struct bitmap *bitmap = mddev->bitmap; - - if (!bitmap) - goto out; - - /* Clear out old bitmap info first: Either there is none, or we - * are resuming after someone else has possibly changed things, - * so we should forget old cached info. - * All chunks should be clean, but some might need_sync. - */ - while (sector < mddev->resync_max_sectors) { - sector_t blocks; - bitmap_start_sync(bitmap, sector, &blocks, 0); - sector += blocks; - } - bitmap_close_sync(bitmap); - - if (mddev->degraded == 0 - || bitmap->events_cleared == mddev->events) - /* no need to keep dirty bits to optimise a - * re-add of a missing device */ - start = mddev->recovery_cp; - - mutex_lock(&mddev->bitmap_info.mutex); - err = bitmap_init_from_disk(bitmap, start); - mutex_unlock(&mddev->bitmap_info.mutex); - - if (err) - goto out; - - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; - md_wakeup_thread(mddev->thread); - - bitmap_update_sb(bitmap); - - if (bitmap->flags & BITMAP_WRITE_ERROR) - err = -EIO; -out: - return err; -} -EXPORT_SYMBOL_GPL(bitmap_load); - -void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) -{ - unsigned long chunk_kb; - unsigned long flags; - - if (!bitmap) - return; - - spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - bitmap->pages - bitmap->missing_pages, - bitmap->pages, - (bitmap->pages - bitmap->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, - chunk_kb ? 
"KB" : "B"); - if (bitmap->file) { - seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->file->f_path, " \t\n"); - } - - seq_printf(seq, "\n"); - spin_unlock_irqrestore(&bitmap->lock, flags); -} - -static ssize_t -location_show(struct mddev *mddev, char *page) -{ - ssize_t len; - if (mddev->bitmap_info.file) - len = sprintf(page, "file"); - else if (mddev->bitmap_info.offset) - len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); - else - len = sprintf(page, "none"); - len += sprintf(page+len, "\n"); - return len; -} - -static ssize_t -location_store(struct mddev *mddev, const char *buf, size_t len) -{ - - if (mddev->pers) { - if (!mddev->pers->quiesce) - return -EBUSY; - if (mddev->recovery || mddev->sync_thread) - return -EBUSY; - } - - if (mddev->bitmap || mddev->bitmap_info.file || - mddev->bitmap_info.offset) { - /* bitmap already configured. Only option is to clear it */ - if (strncmp(buf, "none", 4) != 0) - return -EBUSY; - if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); - bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); - } - mddev->bitmap_info.offset = 0; - if (mddev->bitmap_info.file) { - struct file *f = mddev->bitmap_info.file; - mddev->bitmap_info.file = NULL; - restore_bitmap_write_access(f); - fput(f); - } - } else { - /* No bitmap, OK to set a location */ - long long offset; - if (strncmp(buf, "none", 4) == 0) - /* nothing to be done */; - else if (strncmp(buf, "file:", 5) == 0) { - /* Not supported yet */ - return -EINVAL; - } else { - int rv; - if (buf[0] == '+') - rv = strict_strtoll(buf+1, 10, &offset); - else - rv = strict_strtoll(buf, 10, &offset); - if (rv) - return rv; - if (offset == 0) - return -EINVAL; - if (mddev->bitmap_info.external == 0 && - mddev->major_version == 0 && - offset != mddev->bitmap_info.default_offset) - return -EINVAL; - mddev->bitmap_info.offset = offset; - if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); - rv = bitmap_create(mddev); - if (!rv) - rv = bitmap_load(mddev); - if (rv) { - bitmap_destroy(mddev); - mddev->bitmap_info.offset = 0; - } - mddev->pers->quiesce(mddev, 0); - if (rv) - return rv; - } - } - } - if (!mddev->external) { - /* Ensure new bitmap info is stored in - * metadata promptly. - */ - set_bit(MD_CHANGE_DEVS, &mddev->flags); - md_wakeup_thread(mddev->thread); - } - return len; -} - -static struct md_sysfs_entry bitmap_location = -__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); - -static ssize_t -timeout_show(struct mddev *mddev, char *page) -{ - ssize_t len; - unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; - unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; - - len = sprintf(page, "%lu", secs); - if (jifs) - len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); - len += sprintf(page+len, "\n"); - return len; -} - -static ssize_t -timeout_store(struct mddev *mddev, const char *buf, size_t len) -{ - /* timeout can be set at any time */ - unsigned long timeout; - int rv = strict_strtoul_scaled(buf, &timeout, 4); - if (rv) - return rv; - - /* just to make sure we don't overflow... 
*/ - if (timeout >= LONG_MAX / HZ) - return -EINVAL; - - timeout = timeout * HZ / 10000; - - if (timeout >= MAX_SCHEDULE_TIMEOUT) - timeout = MAX_SCHEDULE_TIMEOUT-1; - if (timeout < 1) - timeout = 1; - mddev->bitmap_info.daemon_sleep = timeout; - if (mddev->thread) { - /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then - * the bitmap is all clean and we don't need to - * adjust the timeout right now - */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { - mddev->thread->timeout = timeout; - md_wakeup_thread(mddev->thread); - } - } - return len; -} - -static struct md_sysfs_entry bitmap_timeout = -__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); - -static ssize_t -backlog_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); -} - -static ssize_t -backlog_store(struct mddev *mddev, const char *buf, size_t len) -{ - unsigned long backlog; - int rv = strict_strtoul(buf, 10, &backlog); - if (rv) - return rv; - if (backlog > COUNTER_MAX) - return -EINVAL; - mddev->bitmap_info.max_write_behind = backlog; - return len; -} - -static struct md_sysfs_entry bitmap_backlog = -__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); - -static ssize_t -chunksize_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); -} - -static ssize_t -chunksize_store(struct mddev *mddev, const char *buf, size_t len) -{ - /* Can only be changed when no bitmap is active */ - int rv; - unsigned long csize; - if (mddev->bitmap) - return -EBUSY; - rv = strict_strtoul(buf, 10, &csize); - if (rv) - return rv; - if (csize < 512 || - !is_power_of_2(csize)) - return -EINVAL; - mddev->bitmap_info.chunksize = csize; - return len; -} - -static struct md_sysfs_entry bitmap_chunksize = -__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); - -static ssize_t metadata_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%s\n", (mddev->bitmap_info.external - ? "external" : "internal")); -} - -static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) -{ - if (mddev->bitmap || - mddev->bitmap_info.file || - mddev->bitmap_info.offset) - return -EBUSY; - if (strncmp(buf, "external", 8) == 0) - mddev->bitmap_info.external = 1; - else if (strncmp(buf, "internal", 8) == 0) - mddev->bitmap_info.external = 0; - else - return -EINVAL; - return len; -} - -static struct md_sysfs_entry bitmap_metadata = -__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); - -static ssize_t can_clear_show(struct mddev *mddev, char *page) -{ - int len; - if (mddev->bitmap) - len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? 
- "false" : "true")); - else - len = sprintf(page, "\n"); - return len; -} - -static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) -{ - if (mddev->bitmap == NULL) - return -ENOENT; - if (strncmp(buf, "false", 5) == 0) - mddev->bitmap->need_sync = 1; - else if (strncmp(buf, "true", 4) == 0) { - if (mddev->degraded) - return -EBUSY; - mddev->bitmap->need_sync = 0; - } else - return -EINVAL; - return len; -} - -static struct md_sysfs_entry bitmap_can_clear = -__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); - -static ssize_t -behind_writes_used_show(struct mddev *mddev, char *page) -{ - if (mddev->bitmap == NULL) - return sprintf(page, "0\n"); - return sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); -} - -static ssize_t -behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) -{ - if (mddev->bitmap) - mddev->bitmap->behind_writes_used = 0; - return len; -} - -static struct md_sysfs_entry max_backlog_used = -__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, - behind_writes_used_show, behind_writes_used_reset); - -static struct attribute *md_bitmap_attrs[] = { - &bitmap_location.attr, - &bitmap_timeout.attr, - &bitmap_backlog.attr, - &bitmap_chunksize.attr, - &bitmap_metadata.attr, - &bitmap_can_clear.attr, - &max_backlog_used.attr, - NULL -}; -struct attribute_group md_bitmap_group = { - .name = "bitmap", - .attrs = md_bitmap_attrs, -}; - diff --git a/ANDROID_3.4.5/drivers/md/bitmap.h b/ANDROID_3.4.5/drivers/md/bitmap.h deleted file mode 100644 index b44b0aba..00000000 --- a/ANDROID_3.4.5/drivers/md/bitmap.h +++ /dev/null @@ -1,247 +0,0 @@ -/* - * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 - * - * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - */ -#ifndef BITMAP_H -#define BITMAP_H 1 - -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". - * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. - * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared as well, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. 
- * - * If the counter is 0, the on-disk bit is clear and the stripe is clean - * Anything that dirties the stripe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep finds the counter at 2, it is decremented to 1. - * If the sweep finds the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. - * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) - -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SHIFT 9 - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 - -/* use these for bitmap->flags and bitmap->sb->state bit-fields */ -enum bitmap_state { - BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ - BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ - BITMAP_HOSTENDIAN = 0x8000, -}; - -/* the superblock at the front of the bitmap file -- little endian */ -typedef struct bitmap_super_s { - __le32 magic; /* 0 BITMAP_MAGIC */ - __le32 version; /* 4 the bitmap major for now, could change... */ - __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ - __le64 events; /* 24 event counter for the bitmap (1)*/ - __le64 events_cleared;/*32 event counter when last bit cleared (2) */ - __le64 sync_size; /* 40 the size of the md device's sync range(3) */ - __le32 state; /* 48 bitmap state information */ - __le32 chunksize; /* 52 the bitmap chunk size in bytes */ - __le32 daemon_sleep; /* 56 seconds between disk flushes */ - __le32 write_behind; /* 60 number of outstanding write-behind writes */ - - __u8 pad[256 - 64]; /* set to zero */ -} bitmap_super_t; - -/* notes: - * (1) This event counter is updated before the event counter in the md superblock - * When a bitmap is loaded, it is only accepted if this event counter is equal - * to, or one greater than, the event counter in the superblock. - * (2) This event counter is updated when the other one is *if*and*only*if* the - * array is not degraded. As bits are not cleared when the array is degraded, - * this represents the last time that any bits were cleared.
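Editorial aside (not part of the original bitmap.h; the superblock notes continue below): the 16-bit counter layout described above is easy to misread, so here is a minimal stand-alone sketch of how one counter word splits into the "resync needed" bit, the "resync active" bit and the 14-bit pending-write count. The mask definitions are repeated from the header; the user-space printf() harness is purely illustrative.

/* Illustrative only; mirrors the NEEDED/RESYNC/COUNTER macros above. */
#include <stdio.h>

typedef unsigned short bitmap_counter_t;	/* __u16 in the kernel */
#define COUNTER_BITS 16
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)

int main(void)
{
	/* "resync needed" set, "resync active" clear, count of 3 */
	bitmap_counter_t c = NEEDED_MASK | 3;

	printf("needed=%d resync=%d count=%u of max %u\n",
	       NEEDED(c) ? 1 : 0, RESYNC(c) ? 1 : 0,
	       (unsigned)COUNTER(c), (unsigned)COUNTER_MAX);
	/* prints: needed=1 resync=0 count=3 of max 16383 */
	return 0;
}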
- * If a device is being added that has an event count with this value or - * higher, it is accepted as conforming to the bitmap. - * (3)This is the number of sectors represented by the bitmap, and is the range that - * resync happens across. For raid1 and raid5/6 it is the size of individual - * devices. For raid10 it is the size of the array. - */ - -#ifdef __KERNEL__ - -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * count of dirty bits on the page - */ - unsigned int count:31; -}; - -/* the main bitmap structure - one per mddev */ -struct bitmap { - struct bitmap_page *bp; - unsigned long pages; /* total number of pages in the bitmap */ - unsigned long missing_pages; /* number of pages not yet allocated */ - - struct mddev *mddev; /* the md device that the bitmap is for */ - - /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ - unsigned long chunks; /* total number of data chunks for the array */ - - __u64 events_cleared; - int need_sync; - - /* bitmap spinlock */ - spinlock_t lock; - - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap file superblock */ - struct page **filemap; /* list of cache pages for the file */ - unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file */ - int last_page_size; /* bytes in the last page */ - - unsigned long flags; - - int allclean; - - atomic_t behind_writes; - unsigned long behind_writes_used; /* highest actual value at runtime */ - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - wait_queue_head_t behind_wait; - - struct sysfs_dirent *sysfs_can_clear; -}; - -/* the bitmap API */ - -/* these are used only by md/bitmap */ -int bitmap_create(struct mddev *mddev); -int bitmap_load(struct mddev *mddev); -void bitmap_flush(struct mddev *mddev); -void bitmap_destroy(struct mddev *mddev); - -void bitmap_print_sb(struct bitmap *bitmap); -void bitmap_update_sb(struct bitmap *bitmap); -void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); - -int bitmap_setallbits(struct bitmap *bitmap); -void bitmap_write_all(struct bitmap *bitmap); - -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - -/* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); -void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); - -void bitmap_unplug(struct bitmap 
*bitmap); -void bitmap_daemon_work(struct mddev *mddev); -#endif - -#endif diff --git a/ANDROID_3.4.5/drivers/md/dm-bio-record.h b/ANDROID_3.4.5/drivers/md/dm-bio-record.h deleted file mode 100644 index 3a8cfa26..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-bio-record.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ - -#ifndef DM_BIO_RECORD_H -#define DM_BIO_RECORD_H - -#include <linux/bio.h> - -/* - * There are lots of mutable fields in the bio struct that get - * changed by the lower levels of the block layer. Some targets, - * such as multipath, may wish to resubmit a bio on error. The - * functions in this file help the target record and restore the - * original bio state. - */ - -struct dm_bio_vec_details { -#if PAGE_SIZE < 65536 - __u16 bv_len; - __u16 bv_offset; -#else - unsigned bv_len; - unsigned bv_offset; -#endif -}; - -struct dm_bio_details { - sector_t bi_sector; - struct block_device *bi_bdev; - unsigned int bi_size; - unsigned short bi_idx; - unsigned long bi_flags; - struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; -}; - -static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) -{ - unsigned i; - - bd->bi_sector = bio->bi_sector; - bd->bi_bdev = bio->bi_bdev; - bd->bi_size = bio->bi_size; - bd->bi_idx = bio->bi_idx; - bd->bi_flags = bio->bi_flags; - - for (i = 0; i < bio->bi_vcnt; i++) { - bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len; - bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset; - } -} - -static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) -{ - unsigned i; - - bio->bi_sector = bd->bi_sector; - bio->bi_bdev = bd->bi_bdev; - bio->bi_size = bd->bi_size; - bio->bi_idx = bd->bi_idx; - bio->bi_flags = bd->bi_flags; - - for (i = 0; i < bio->bi_vcnt; i++) { - bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len; - bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset; - } -} - -#endif diff --git a/ANDROID_3.4.5/drivers/md/dm-bufio.c b/ANDROID_3.4.5/drivers/md/dm-bufio.c deleted file mode 100644 index cc06a1e5..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-bufio.c +++ /dev/null @@ -1,1755 +0,0 @@ -/* - * Copyright (C) 2009-2011 Red Hat, Inc. - * - * Author: Mikulas Patocka <mpatocka@redhat.com> - * - * This file is released under the GPL. - */ - -#include "dm-bufio.h" - -#include <linux/device-mapper.h> -#include <linux/dm-io.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/shrinker.h> -#include <linux/module.h> - -#define DM_MSG_PREFIX "bufio" - -/* - * Memory management policy: - * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory - * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower). - * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers. - * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT - * dirty buffers. - */ -#define DM_BUFIO_MIN_BUFFERS 8 - -#define DM_BUFIO_MEMORY_PERCENT 2 -#define DM_BUFIO_VMALLOC_PERCENT 25 -#define DM_BUFIO_WRITEBACK_PERCENT 75 - -/* - * Check buffer ages in this interval (seconds) - */ -#define DM_BUFIO_WORK_TIMER_SECS 10 - -/* - * Free buffers when they are older than this (seconds) - */ -#define DM_BUFIO_DEFAULT_AGE_SECS 60 - -/* - * The number of bvec entries that are embedded directly in the buffer. - * If the chunk size is larger, dm-io is used to do the io. 
- */ -#define DM_BUFIO_INLINE_VECS 16 - -/* - * Buffer hash - */ -#define DM_BUFIO_HASH_BITS 20 -#define DM_BUFIO_HASH(block) \ - ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \ - ((1 << DM_BUFIO_HASH_BITS) - 1)) - -/* - * Don't try to use kmem_cache_alloc for blocks larger than this. - * For explanation, see alloc_buffer_data below. - */ -#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1) -#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1)) - -/* - * dm_buffer->list_mode - */ -#define LIST_CLEAN 0 -#define LIST_DIRTY 1 -#define LIST_SIZE 2 - -/* - * Linking of buffers: - * All buffers are linked to cache_hash with their hash_list field. - * - * Clean buffers that are not being written (B_WRITING not set) - * are linked to lru[LIST_CLEAN] with their lru_list field. - * - * Dirty and clean buffers that are being written are linked to - * lru[LIST_DIRTY] with their lru_list field. When the write - * finishes, the buffer cannot be relinked immediately (because we - * are in an interrupt context and relinking requires process - * context), so some clean-not-writing buffers can be held on - * dirty_lru too. They are later added to lru in the process - * context. - */ -struct dm_bufio_client { - struct mutex lock; - - struct list_head lru[LIST_SIZE]; - unsigned long n_buffers[LIST_SIZE]; - - struct block_device *bdev; - unsigned block_size; - unsigned char sectors_per_block_bits; - unsigned char pages_per_block_bits; - unsigned char blocks_per_page_bits; - unsigned aux_size; - void (*alloc_callback)(struct dm_buffer *); - void (*write_callback)(struct dm_buffer *); - - struct dm_io_client *dm_io; - - struct list_head reserved_buffers; - unsigned need_reserved_buffers; - - struct hlist_head *cache_hash; - wait_queue_head_t free_buffer_wait; - - int async_write_error; - - struct list_head client_list; - struct shrinker shrinker; -}; - -/* - * Buffer state bits. - */ -#define B_READING 0 -#define B_WRITING 1 -#define B_DIRTY 2 - -/* - * Describes how the block was allocated: - * kmem_cache_alloc(), __get_free_pages() or vmalloc(). - * See the comment at alloc_buffer_data. 
- */ -enum data_mode { - DATA_MODE_SLAB = 0, - DATA_MODE_GET_FREE_PAGES = 1, - DATA_MODE_VMALLOC = 2, - DATA_MODE_LIMIT = 3 -}; - -struct dm_buffer { - struct hlist_node hash_list; - struct list_head lru_list; - sector_t block; - void *data; - enum data_mode data_mode; - unsigned char list_mode; /* LIST_* */ - unsigned hold_count; - int read_error; - int write_error; - unsigned long state; - unsigned long last_accessed; - struct dm_bufio_client *c; - struct bio bio; - struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; -}; - -/*----------------------------------------------------------------*/ - -static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT]; -static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT]; - -static inline int dm_bufio_cache_index(struct dm_bufio_client *c) -{ - unsigned ret = c->blocks_per_page_bits - 1; - - BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches)); - - return ret; -} - -#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)]) -#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)]) - -#define dm_bufio_in_request() (!!current->bio_list) - -static void dm_bufio_lock(struct dm_bufio_client *c) -{ - mutex_lock_nested(&c->lock, dm_bufio_in_request()); -} - -static int dm_bufio_trylock(struct dm_bufio_client *c) -{ - return mutex_trylock(&c->lock); -} - -static void dm_bufio_unlock(struct dm_bufio_client *c) -{ - mutex_unlock(&c->lock); -} - -/* - * FIXME Move to sched.h? - */ -#ifdef CONFIG_PREEMPT_VOLUNTARY -# define dm_bufio_cond_resched() \ -do { \ - if (unlikely(need_resched())) \ - _cond_resched(); \ -} while (0) -#else -# define dm_bufio_cond_resched() do { } while (0) -#endif - -/*----------------------------------------------------------------*/ - -/* - * Default cache size: available memory divided by the ratio. - */ -static unsigned long dm_bufio_default_cache_size; - -/* - * Total cache size set by the user. - */ -static unsigned long dm_bufio_cache_size; - -/* - * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change - * at any time. If it disagrees, the user has changed cache size. - */ -static unsigned long dm_bufio_cache_size_latch; - -static DEFINE_SPINLOCK(param_spinlock); - -/* - * Buffers are freed after this timeout - */ -static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; - -static unsigned long dm_bufio_peak_allocated; -static unsigned long dm_bufio_allocated_kmem_cache; -static unsigned long dm_bufio_allocated_get_free_pages; -static unsigned long dm_bufio_allocated_vmalloc; -static unsigned long dm_bufio_current_allocated; - -/*----------------------------------------------------------------*/ - -/* - * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count - */ -static unsigned long dm_bufio_cache_size_per_client; - -/* - * The current number of clients. - */ -static int dm_bufio_client_count; - -/* - * The list of all clients. 
- */ -static LIST_HEAD(dm_bufio_all_clients); - -/* - * This mutex protects dm_bufio_cache_size_latch, - * dm_bufio_cache_size_per_client and dm_bufio_client_count - */ -static DEFINE_MUTEX(dm_bufio_clients_lock); - -/*----------------------------------------------------------------*/ - -static void adjust_total_allocated(enum data_mode data_mode, long diff) -{ - static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { - &dm_bufio_allocated_kmem_cache, - &dm_bufio_allocated_get_free_pages, - &dm_bufio_allocated_vmalloc, - }; - - spin_lock(¶m_spinlock); - - *class_ptr[data_mode] += diff; - - dm_bufio_current_allocated += diff; - - if (dm_bufio_current_allocated > dm_bufio_peak_allocated) - dm_bufio_peak_allocated = dm_bufio_current_allocated; - - spin_unlock(¶m_spinlock); -} - -/* - * Change the number of clients and recalculate per-client limit. - */ -static void __cache_size_refresh(void) -{ - BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); - BUG_ON(dm_bufio_client_count < 0); - - dm_bufio_cache_size_latch = dm_bufio_cache_size; - - barrier(); - - /* - * Use default if set to 0 and report the actual cache size used. - */ - if (!dm_bufio_cache_size_latch) { - (void)cmpxchg(&dm_bufio_cache_size, 0, - dm_bufio_default_cache_size); - dm_bufio_cache_size_latch = dm_bufio_default_cache_size; - } - - dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch / - (dm_bufio_client_count ? : 1); -} - -/* - * Allocating buffer data. - * - * Small buffers are allocated with kmem_cache, to use space optimally. - * - * For large buffers, we choose between get_free_pages and vmalloc. - * Each has advantages and disadvantages. - * - * __get_free_pages can randomly fail if the memory is fragmented. - * __vmalloc won't randomly fail, but vmalloc space is limited (it may be - * as low as 128M) so using it for caching is not appropriate. - * - * If the allocation may fail we use __get_free_pages. Memory fragmentation - * won't have a fatal effect here, but it just causes flushes of some other - * buffers and more I/O will be performed. Don't use __get_free_pages if it - * always fails (i.e. order >= MAX_ORDER). - * - * If the allocation shouldn't fail we use __vmalloc. This is only for the - * initial reserve allocation, so there's no risk of wasting all vmalloc - * space. - */ -static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, - enum data_mode *data_mode) -{ - if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { - *data_mode = DATA_MODE_SLAB; - return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); - } - - if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT && - gfp_mask & __GFP_NORETRY) { - *data_mode = DATA_MODE_GET_FREE_PAGES; - return (void *)__get_free_pages(gfp_mask, - c->pages_per_block_bits); - } - - *data_mode = DATA_MODE_VMALLOC; - return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); -} - -/* - * Free buffer's data. - */ -static void free_buffer_data(struct dm_bufio_client *c, - void *data, enum data_mode data_mode) -{ - switch (data_mode) { - case DATA_MODE_SLAB: - kmem_cache_free(DM_BUFIO_CACHE(c), data); - break; - - case DATA_MODE_GET_FREE_PAGES: - free_pages((unsigned long)data, c->pages_per_block_bits); - break; - - case DATA_MODE_VMALLOC: - vfree(data); - break; - - default: - DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d", - data_mode); - BUG(); - } -} - -/* - * Allocate buffer and its data. 
- */ -static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) -{ - struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size, - gfp_mask); - - if (!b) - return NULL; - - b->c = c; - - b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode); - if (!b->data) { - kfree(b); - return NULL; - } - - adjust_total_allocated(b->data_mode, (long)c->block_size); - - return b; -} - -/* - * Free buffer and its data. - */ -static void free_buffer(struct dm_buffer *b) -{ - struct dm_bufio_client *c = b->c; - - adjust_total_allocated(b->data_mode, -(long)c->block_size); - - free_buffer_data(c, b->data, b->data_mode); - kfree(b); -} - -/* - * Link buffer to the hash list and clean or dirty queue. - */ -static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) -{ - struct dm_bufio_client *c = b->c; - - c->n_buffers[dirty]++; - b->block = block; - b->list_mode = dirty; - list_add(&b->lru_list, &c->lru[dirty]); - hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); - b->last_accessed = jiffies; -} - -/* - * Unlink buffer from the hash list and dirty or clean queue. - */ -static void __unlink_buffer(struct dm_buffer *b) -{ - struct dm_bufio_client *c = b->c; - - BUG_ON(!c->n_buffers[b->list_mode]); - - c->n_buffers[b->list_mode]--; - hlist_del(&b->hash_list); - list_del(&b->lru_list); -} - -/* - * Place the buffer to the head of dirty or clean LRU queue. - */ -static void __relink_lru(struct dm_buffer *b, int dirty) -{ - struct dm_bufio_client *c = b->c; - - BUG_ON(!c->n_buffers[b->list_mode]); - - c->n_buffers[b->list_mode]--; - c->n_buffers[dirty]++; - b->list_mode = dirty; - list_del(&b->lru_list); - list_add(&b->lru_list, &c->lru[dirty]); -} - -/*---------------------------------------------------------------- - * Submit I/O on the buffer. - * - * Bio interface is faster but it has some problems: - * the vector list is limited (increasing this limit increases - * memory-consumption per buffer, so it is not viable); - * - * the memory must be direct-mapped, not vmalloced; - * - * the I/O driver can reject requests spuriously if it thinks that - * the requests are too big for the device or if they cross a - * controller-defined memory boundary. - * - * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and - * it is not vmalloced, try using the bio interface. - * - * If the buffer is big, if it is vmalloced or if the underlying device - * rejects the bio because it is too large, use dm-io layer to do the I/O. - * The dm-io layer splits the I/O into multiple requests, avoiding the above - * shortcomings. - *--------------------------------------------------------------*/ - -/* - * dm-io completion routine. It just calls b->bio.bi_end_io, pretending - * that the request was handled directly with bio interface. - */ -static void dmio_complete(unsigned long error, void *context) -{ - struct dm_buffer *b = context; - - b->bio.bi_end_io(&b->bio, error ? 
-EIO : 0); -} - -static void use_dmio(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) -{ - int r; - struct dm_io_request io_req = { - .bi_rw = rw, - .notify.fn = dmio_complete, - .notify.context = b, - .client = b->c->dm_io, - }; - struct dm_io_region region = { - .bdev = b->c->bdev, - .sector = block << b->c->sectors_per_block_bits, - .count = b->c->block_size >> SECTOR_SHIFT, - }; - - if (b->data_mode != DATA_MODE_VMALLOC) { - io_req.mem.type = DM_IO_KMEM; - io_req.mem.ptr.addr = b->data; - } else { - io_req.mem.type = DM_IO_VMA; - io_req.mem.ptr.vma = b->data; - } - - b->bio.bi_end_io = end_io; - - r = dm_io(&io_req, 1, ®ion, NULL); - if (r) - end_io(&b->bio, r); -} - -static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) -{ - char *ptr; - int len; - - bio_init(&b->bio); - b->bio.bi_io_vec = b->bio_vec; - b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; - b->bio.bi_sector = block << b->c->sectors_per_block_bits; - b->bio.bi_bdev = b->c->bdev; - b->bio.bi_end_io = end_io; - - /* - * We assume that if len >= PAGE_SIZE ptr is page-aligned. - * If len < PAGE_SIZE the buffer doesn't cross page boundary. - */ - ptr = b->data; - len = b->c->block_size; - - if (len >= PAGE_SIZE) - BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); - else - BUG_ON((unsigned long)ptr & (len - 1)); - - do { - if (!bio_add_page(&b->bio, virt_to_page(ptr), - len < PAGE_SIZE ? len : PAGE_SIZE, - virt_to_phys(ptr) & (PAGE_SIZE - 1))) { - BUG_ON(b->c->block_size <= PAGE_SIZE); - use_dmio(b, rw, block, end_io); - return; - } - - len -= PAGE_SIZE; - ptr += PAGE_SIZE; - } while (len > 0); - - submit_bio(rw, &b->bio); -} - -static void submit_io(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) -{ - if (rw == WRITE && b->c->write_callback) - b->c->write_callback(b); - - if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && - b->data_mode != DATA_MODE_VMALLOC) - use_inline_bio(b, rw, block, end_io); - else - use_dmio(b, rw, block, end_io); -} - -/*---------------------------------------------------------------- - * Writing dirty buffers - *--------------------------------------------------------------*/ - -/* - * The endio routine for write. - * - * Set the error, clear B_WRITING bit and wake anyone who was waiting on - * it. - */ -static void write_endio(struct bio *bio, int error) -{ - struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - - b->write_error = error; - if (unlikely(error)) { - struct dm_bufio_client *c = b->c; - (void)cmpxchg(&c->async_write_error, 0, error); - } - - BUG_ON(!test_bit(B_WRITING, &b->state)); - - smp_mb__before_clear_bit(); - clear_bit(B_WRITING, &b->state); - smp_mb__after_clear_bit(); - - wake_up_bit(&b->state, B_WRITING); -} - -/* - * This function is called when wait_on_bit is actually waiting. - */ -static int do_io_schedule(void *word) -{ - io_schedule(); - - return 0; -} - -/* - * Initiate a write on a dirty buffer, but don't wait for it. - * - * - If the buffer is not dirty, exit. - * - If there some previous write going on, wait for it to finish (we can't - * have two writes on the same buffer simultaneously). - * - Submit our write and don't wait on it. We set B_WRITING indicating - * that there is a write in progress. 
- */ -static void __write_dirty_buffer(struct dm_buffer *b) -{ - if (!test_bit(B_DIRTY, &b->state)) - return; - - clear_bit(B_DIRTY, &b->state); - wait_on_bit_lock(&b->state, B_WRITING, - do_io_schedule, TASK_UNINTERRUPTIBLE); - - submit_io(b, WRITE, b->block, write_endio); -} - -/* - * Wait until any activity on the buffer finishes. Possibly write the - * buffer if it is dirty. When this function finishes, there is no I/O - * running on the buffer and the buffer is not dirty. - */ -static void __make_buffer_clean(struct dm_buffer *b) -{ - BUG_ON(b->hold_count); - - if (!b->state) /* fast case */ - return; - - wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); - __write_dirty_buffer(b); - wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); -} - -/* - * Find some buffer that is not held by anybody, clean it, unlink it and - * return it. - */ -static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) -{ - struct dm_buffer *b; - - list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) { - BUG_ON(test_bit(B_WRITING, &b->state)); - BUG_ON(test_bit(B_DIRTY, &b->state)); - - if (!b->hold_count) { - __make_buffer_clean(b); - __unlink_buffer(b); - return b; - } - dm_bufio_cond_resched(); - } - - list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { - BUG_ON(test_bit(B_READING, &b->state)); - - if (!b->hold_count) { - __make_buffer_clean(b); - __unlink_buffer(b); - return b; - } - dm_bufio_cond_resched(); - } - - return NULL; -} - -/* - * Wait until some other threads free some buffer or release hold count on - * some buffer. - * - * This function is entered with c->lock held, drops it and regains it - * before exiting. - */ -static void __wait_for_free_buffer(struct dm_bufio_client *c) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&c->free_buffer_wait, &wait); - set_task_state(current, TASK_UNINTERRUPTIBLE); - dm_bufio_unlock(c); - - io_schedule(); - - set_task_state(current, TASK_RUNNING); - remove_wait_queue(&c->free_buffer_wait, &wait); - - dm_bufio_lock(c); -} - -enum new_flag { - NF_FRESH = 0, - NF_READ = 1, - NF_GET = 2, - NF_PREFETCH = 3 -}; - -/* - * Allocate a new buffer. If the allocation is not possible, wait until - * some other thread frees a buffer. - * - * May drop the lock and regain it. - */ -static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) -{ - struct dm_buffer *b; - - /* - * dm-bufio is resistant to allocation failures (it just keeps - * one buffer reserved in cases all the allocations fail). - * So set flags to not try too hard: - * GFP_NOIO: don't recurse into the I/O layer - * __GFP_NORETRY: don't retry and rather return failure - * __GFP_NOMEMALLOC: don't use emergency reserves - * __GFP_NOWARN: don't print a warning in case of failure - * - * For debugging, if we set the cache size to 1, no new buffers will - * be allocated. 
- */ - while (1) { - if (dm_bufio_cache_size_latch != 1) { - b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (b) - return b; - } - - if (nf == NF_PREFETCH) - return NULL; - - if (!list_empty(&c->reserved_buffers)) { - b = list_entry(c->reserved_buffers.next, - struct dm_buffer, lru_list); - list_del(&b->lru_list); - c->need_reserved_buffers++; - - return b; - } - - b = __get_unclaimed_buffer(c); - if (b) - return b; - - __wait_for_free_buffer(c); - } -} - -static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) -{ - struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); - - if (!b) - return NULL; - - if (c->alloc_callback) - c->alloc_callback(b); - - return b; -} - -/* - * Free a buffer and wake other threads waiting for free buffers. - */ -static void __free_buffer_wake(struct dm_buffer *b) -{ - struct dm_bufio_client *c = b->c; - - if (!c->need_reserved_buffers) - free_buffer(b); - else { - list_add(&b->lru_list, &c->reserved_buffers); - c->need_reserved_buffers--; - } - - wake_up(&c->free_buffer_wait); -} - -static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait) -{ - struct dm_buffer *b, *tmp; - - list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { - BUG_ON(test_bit(B_READING, &b->state)); - - if (!test_bit(B_DIRTY, &b->state) && - !test_bit(B_WRITING, &b->state)) { - __relink_lru(b, LIST_CLEAN); - continue; - } - - if (no_wait && test_bit(B_WRITING, &b->state)) - return; - - __write_dirty_buffer(b); - dm_bufio_cond_resched(); - } -} - -/* - * Get writeback threshold and buffer limit for a given client. - */ -static void __get_memory_limit(struct dm_bufio_client *c, - unsigned long *threshold_buffers, - unsigned long *limit_buffers) -{ - unsigned long buffers; - - if (dm_bufio_cache_size != dm_bufio_cache_size_latch) { - mutex_lock(&dm_bufio_clients_lock); - __cache_size_refresh(); - mutex_unlock(&dm_bufio_clients_lock); - } - - buffers = dm_bufio_cache_size_per_client >> - (c->sectors_per_block_bits + SECTOR_SHIFT); - - if (buffers < DM_BUFIO_MIN_BUFFERS) - buffers = DM_BUFIO_MIN_BUFFERS; - - *limit_buffers = buffers; - *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; -} - -/* - * Check if we're over watermark. - * If we are over threshold_buffers, start freeing buffers. - * If we're over "limit_buffers", block until we get under the limit. - */ -static void __check_watermark(struct dm_bufio_client *c) -{ - unsigned long threshold_buffers, limit_buffers; - - __get_memory_limit(c, &threshold_buffers, &limit_buffers); - - while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] > - limit_buffers) { - - struct dm_buffer *b = __get_unclaimed_buffer(c); - - if (!b) - return; - - __free_buffer_wake(b); - dm_bufio_cond_resched(); - } - - if (c->n_buffers[LIST_DIRTY] > threshold_buffers) - __write_dirty_buffers_async(c, 1); -} - -/* - * Find a buffer in the hash. 
- */ -static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) -{ - struct dm_buffer *b; - struct hlist_node *hn; - - hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)], - hash_list) { - dm_bufio_cond_resched(); - if (b->block == block) - return b; - } - - return NULL; -} - -/*---------------------------------------------------------------- - * Getting a buffer - *--------------------------------------------------------------*/ - -static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, - enum new_flag nf, int *need_submit) -{ - struct dm_buffer *b, *new_b = NULL; - - *need_submit = 0; - - b = __find(c, block); - if (b) - goto found_buffer; - - if (nf == NF_GET) - return NULL; - - new_b = __alloc_buffer_wait(c, nf); - if (!new_b) - return NULL; - - /* - * We've had a period where the mutex was unlocked, so need to - * recheck the hash table. - */ - b = __find(c, block); - if (b) { - __free_buffer_wake(new_b); - goto found_buffer; - } - - __check_watermark(c); - - b = new_b; - b->hold_count = 1; - b->read_error = 0; - b->write_error = 0; - __link_buffer(b, block, LIST_CLEAN); - - if (nf == NF_FRESH) { - b->state = 0; - return b; - } - - b->state = 1 << B_READING; - *need_submit = 1; - - return b; - -found_buffer: - if (nf == NF_PREFETCH) - return NULL; - /* - * Note: it is essential that we don't wait for the buffer to be - * read if dm_bufio_get function is used. Both dm_bufio_get and - * dm_bufio_prefetch can be used in the driver request routine. - * If the user called both dm_bufio_prefetch and dm_bufio_get on - * the same buffer, it would deadlock if we waited. - */ - if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state))) - return NULL; - - b->hold_count++; - __relink_lru(b, test_bit(B_DIRTY, &b->state) || - test_bit(B_WRITING, &b->state)); - return b; -} - -/* - * The endio routine for reading: set the error, clear the bit and wake up - * anyone waiting on the buffer. - */ -static void read_endio(struct bio *bio, int error) -{ - struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - - b->read_error = error; - - BUG_ON(!test_bit(B_READING, &b->state)); - - smp_mb__before_clear_bit(); - clear_bit(B_READING, &b->state); - smp_mb__after_clear_bit(); - - wake_up_bit(&b->state, B_READING); -} - -/* - * A common routine for dm_bufio_new and dm_bufio_read. Operation of these - * functions is similar except that dm_bufio_new doesn't read the - * buffer from the disk (assuming that the caller overwrites all the data - * and uses dm_bufio_mark_buffer_dirty to write new data back). 
- */ -static void *new_read(struct dm_bufio_client *c, sector_t block, - enum new_flag nf, struct dm_buffer **bp) -{ - int need_submit; - struct dm_buffer *b; - - dm_bufio_lock(c); - b = __bufio_new(c, block, nf, &need_submit); - dm_bufio_unlock(c); - - if (!b) - return b; - - if (need_submit) - submit_io(b, READ, b->block, read_endio); - - wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); - - if (b->read_error) { - int error = b->read_error; - - dm_bufio_release(b); - - return ERR_PTR(error); - } - - *bp = b; - - return b->data; -} - -void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp) -{ - return new_read(c, block, NF_GET, bp); -} -EXPORT_SYMBOL_GPL(dm_bufio_get); - -void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp) -{ - BUG_ON(dm_bufio_in_request()); - - return new_read(c, block, NF_READ, bp); -} -EXPORT_SYMBOL_GPL(dm_bufio_read); - -void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp) -{ - BUG_ON(dm_bufio_in_request()); - - return new_read(c, block, NF_FRESH, bp); -} -EXPORT_SYMBOL_GPL(dm_bufio_new); - -void dm_bufio_prefetch(struct dm_bufio_client *c, - sector_t block, unsigned n_blocks) -{ - struct blk_plug plug; - - blk_start_plug(&plug); - dm_bufio_lock(c); - - for (; n_blocks--; block++) { - int need_submit; - struct dm_buffer *b; - b = __bufio_new(c, block, NF_PREFETCH, &need_submit); - if (unlikely(b != NULL)) { - dm_bufio_unlock(c); - - if (need_submit) - submit_io(b, READ, b->block, read_endio); - dm_bufio_release(b); - - dm_bufio_cond_resched(); - - if (!n_blocks) - goto flush_plug; - dm_bufio_lock(c); - } - - } - - dm_bufio_unlock(c); - -flush_plug: - blk_finish_plug(&plug); -} -EXPORT_SYMBOL_GPL(dm_bufio_prefetch); - -void dm_bufio_release(struct dm_buffer *b) -{ - struct dm_bufio_client *c = b->c; - - dm_bufio_lock(c); - - BUG_ON(!b->hold_count); - - b->hold_count--; - if (!b->hold_count) { - wake_up(&c->free_buffer_wait); - - /* - * If there were errors on the buffer, and the buffer is not - * to be written, free the buffer. There is no point in caching - * invalid buffer. - */ - if ((b->read_error || b->write_error) && - !test_bit(B_READING, &b->state) && - !test_bit(B_WRITING, &b->state) && - !test_bit(B_DIRTY, &b->state)) { - __unlink_buffer(b); - __free_buffer_wake(b); - } - } - - dm_bufio_unlock(c); -} -EXPORT_SYMBOL_GPL(dm_bufio_release); - -void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) -{ - struct dm_bufio_client *c = b->c; - - dm_bufio_lock(c); - - BUG_ON(test_bit(B_READING, &b->state)); - - if (!test_and_set_bit(B_DIRTY, &b->state)) - __relink_lru(b, LIST_DIRTY); - - dm_bufio_unlock(c); -} -EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); - -void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) -{ - BUG_ON(dm_bufio_in_request()); - - dm_bufio_lock(c); - __write_dirty_buffers_async(c, 0); - dm_bufio_unlock(c); -} -EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); - -/* - * For performance, it is essential that the buffers are written asynchronously - * and simultaneously (so that the block layer can merge the writes) and then - * waited upon. - * - * Finally, we flush hardware disk cache. 
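Editorial aside, not part of the original dm-bufio.c: a sketch of how a hypothetical caller would use the buffer API defined above, dirtying a batch of blocks and then issuing a single dm_bufio_write_dirty_buffers() call so the block layer can merge the writes, as the preceding comment describes. The helper name update_blocks() and its parameters are invented for illustration; the dm_bufio_* calls and the IS_ERR() convention follow the definitions shown in this file.

#include <linux/err.h>
#include <linux/string.h>
#include <linux/types.h>
#include "dm-bufio.h"	/* declarations of the dm_bufio_* calls used below */

static int update_blocks(struct dm_bufio_client *c, sector_t first,
			 unsigned count, u8 fill)
{
	unsigned i;

	for (i = 0; i < count; i++) {
		struct dm_buffer *b;
		void *data = dm_bufio_read(c, first + i, &b);

		if (IS_ERR(data))
			return PTR_ERR(data);

		memset(data, fill, dm_bufio_get_block_size(c));
		dm_bufio_mark_buffer_dirty(b);	/* moves it to the dirty LRU */
		dm_bufio_release(b);		/* drop the hold; data stays cached */
	}

	/* async submission of the whole batch, wait, then flush the disk cache */
	return dm_bufio_write_dirty_buffers(c);
}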
- */ -int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) -{ - int a, f; - unsigned long buffers_processed = 0; - struct dm_buffer *b, *tmp; - - dm_bufio_lock(c); - __write_dirty_buffers_async(c, 0); - -again: - list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { - int dropped_lock = 0; - - if (buffers_processed < c->n_buffers[LIST_DIRTY]) - buffers_processed++; - - BUG_ON(test_bit(B_READING, &b->state)); - - if (test_bit(B_WRITING, &b->state)) { - if (buffers_processed < c->n_buffers[LIST_DIRTY]) { - dropped_lock = 1; - b->hold_count++; - dm_bufio_unlock(c); - wait_on_bit(&b->state, B_WRITING, - do_io_schedule, - TASK_UNINTERRUPTIBLE); - dm_bufio_lock(c); - b->hold_count--; - } else - wait_on_bit(&b->state, B_WRITING, - do_io_schedule, - TASK_UNINTERRUPTIBLE); - } - - if (!test_bit(B_DIRTY, &b->state) && - !test_bit(B_WRITING, &b->state)) - __relink_lru(b, LIST_CLEAN); - - dm_bufio_cond_resched(); - - /* - * If we dropped the lock, the list is no longer consistent, - * so we must restart the search. - * - * In the most common case, the buffer just processed is - * relinked to the clean list, so we won't loop scanning the - * same buffer again and again. - * - * This may livelock if there is another thread simultaneously - * dirtying buffers, so we count the number of buffers walked - * and if it exceeds the total number of buffers, it means that - * someone is doing some writes simultaneously with us. In - * this case, stop, dropping the lock. - */ - if (dropped_lock) - goto again; - } - wake_up(&c->free_buffer_wait); - dm_bufio_unlock(c); - - a = xchg(&c->async_write_error, 0); - f = dm_bufio_issue_flush(c); - if (a) - return a; - - return f; -} -EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers); - -/* - * Use dm-io to send and empty barrier flush the device. - */ -int dm_bufio_issue_flush(struct dm_bufio_client *c) -{ - struct dm_io_request io_req = { - .bi_rw = REQ_FLUSH, - .mem.type = DM_IO_KMEM, - .mem.ptr.addr = NULL, - .client = c->dm_io, - }; - struct dm_io_region io_reg = { - .bdev = c->bdev, - .sector = 0, - .count = 0, - }; - - BUG_ON(dm_bufio_in_request()); - - return dm_io(&io_req, 1, &io_reg, NULL); -} -EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); - -/* - * We first delete any other buffer that may be at that new location. - * - * Then, we write the buffer to the original location if it was dirty. - * - * Then, if we are the only one who is holding the buffer, relink the buffer - * in the hash queue for the new location. - * - * If there was someone else holding the buffer, we write it to the new - * location but not relink it, because that other user needs to have the buffer - * at the same place. - */ -void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block) -{ - struct dm_bufio_client *c = b->c; - struct dm_buffer *new; - - BUG_ON(dm_bufio_in_request()); - - dm_bufio_lock(c); - -retry: - new = __find(c, new_block); - if (new) { - if (new->hold_count) { - __wait_for_free_buffer(c); - goto retry; - } - - /* - * FIXME: Is there any point waiting for a write that's going - * to be overwritten in a bit? 
- */ - __make_buffer_clean(new); - __unlink_buffer(new); - __free_buffer_wake(new); - } - - BUG_ON(!b->hold_count); - BUG_ON(test_bit(B_READING, &b->state)); - - __write_dirty_buffer(b); - if (b->hold_count == 1) { - wait_on_bit(&b->state, B_WRITING, - do_io_schedule, TASK_UNINTERRUPTIBLE); - set_bit(B_DIRTY, &b->state); - __unlink_buffer(b); - __link_buffer(b, new_block, LIST_DIRTY); - } else { - sector_t old_block; - wait_on_bit_lock(&b->state, B_WRITING, - do_io_schedule, TASK_UNINTERRUPTIBLE); - /* - * Relink buffer to "new_block" so that write_callback - * sees "new_block" as a block number. - * After the write, link the buffer back to old_block. - * All this must be done in bufio lock, so that block number - * change isn't visible to other threads. - */ - old_block = b->block; - __unlink_buffer(b); - __link_buffer(b, new_block, b->list_mode); - submit_io(b, WRITE, new_block, write_endio); - wait_on_bit(&b->state, B_WRITING, - do_io_schedule, TASK_UNINTERRUPTIBLE); - __unlink_buffer(b); - __link_buffer(b, old_block, b->list_mode); - } - - dm_bufio_unlock(c); - dm_bufio_release(b); -} -EXPORT_SYMBOL_GPL(dm_bufio_release_move); - -unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) -{ - return c->block_size; -} -EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); - -sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) -{ - return i_size_read(c->bdev->bd_inode) >> - (SECTOR_SHIFT + c->sectors_per_block_bits); -} -EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); - -sector_t dm_bufio_get_block_number(struct dm_buffer *b) -{ - return b->block; -} -EXPORT_SYMBOL_GPL(dm_bufio_get_block_number); - -void *dm_bufio_get_block_data(struct dm_buffer *b) -{ - return b->data; -} -EXPORT_SYMBOL_GPL(dm_bufio_get_block_data); - -void *dm_bufio_get_aux_data(struct dm_buffer *b) -{ - return b + 1; -} -EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data); - -struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b) -{ - return b->c; -} -EXPORT_SYMBOL_GPL(dm_bufio_get_client); - -static void drop_buffers(struct dm_bufio_client *c) -{ - struct dm_buffer *b; - int i; - - BUG_ON(dm_bufio_in_request()); - - /* - * An optimization so that the buffers are not written one-by-one. - */ - dm_bufio_write_dirty_buffers_async(c); - - dm_bufio_lock(c); - - while ((b = __get_unclaimed_buffer(c))) - __free_buffer_wake(b); - - for (i = 0; i < LIST_SIZE; i++) - list_for_each_entry(b, &c->lru[i], lru_list) - DMERR("leaked buffer %llx, hold count %u, list %d", - (unsigned long long)b->block, b->hold_count, i); - - for (i = 0; i < LIST_SIZE; i++) - BUG_ON(!list_empty(&c->lru[i])); - - dm_bufio_unlock(c); -} - -/* - * Test if the buffer is unused and too old, and commit it. - * At if noio is set, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to - * different bufio client. 
- */ -static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, - unsigned long max_jiffies) -{ - if (jiffies - b->last_accessed < max_jiffies) - return 1; - - if (!(gfp & __GFP_IO)) { - if (test_bit(B_READING, &b->state) || - test_bit(B_WRITING, &b->state) || - test_bit(B_DIRTY, &b->state)) - return 1; - } - - if (b->hold_count) - return 1; - - __make_buffer_clean(b); - __unlink_buffer(b); - __free_buffer_wake(b); - - return 0; -} - -static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, - struct shrink_control *sc) -{ - int l; - struct dm_buffer *b, *tmp; - - for (l = 0; l < LIST_SIZE; l++) { - list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) - if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) && - !--nr_to_scan) - return; - dm_bufio_cond_resched(); - } -} - -static int shrink(struct shrinker *shrinker, struct shrink_control *sc) -{ - struct dm_bufio_client *c = - container_of(shrinker, struct dm_bufio_client, shrinker); - unsigned long r; - unsigned long nr_to_scan = sc->nr_to_scan; - - if (sc->gfp_mask & __GFP_IO) - dm_bufio_lock(c); - else if (!dm_bufio_trylock(c)) - return !nr_to_scan ? 0 : -1; - - if (nr_to_scan) - __scan(c, nr_to_scan, sc); - - r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; - if (r > INT_MAX) - r = INT_MAX; - - dm_bufio_unlock(c); - - return r; -} - -/* - * Create the buffering interface - */ -struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size, - unsigned reserved_buffers, unsigned aux_size, - void (*alloc_callback)(struct dm_buffer *), - void (*write_callback)(struct dm_buffer *)) -{ - int r; - struct dm_bufio_client *c; - unsigned i; - - BUG_ON(block_size < 1 << SECTOR_SHIFT || - (block_size & (block_size - 1))); - - c = kmalloc(sizeof(*c), GFP_KERNEL); - if (!c) { - r = -ENOMEM; - goto bad_client; - } - c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS); - if (!c->cache_hash) { - r = -ENOMEM; - goto bad_hash; - } - - c->bdev = bdev; - c->block_size = block_size; - c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT; - c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ? - ffs(block_size) - 1 - PAGE_SHIFT : 0; - c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ? 
- PAGE_SHIFT - (ffs(block_size) - 1) : 0); - - c->aux_size = aux_size; - c->alloc_callback = alloc_callback; - c->write_callback = write_callback; - - for (i = 0; i < LIST_SIZE; i++) { - INIT_LIST_HEAD(&c->lru[i]); - c->n_buffers[i] = 0; - } - - for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) - INIT_HLIST_HEAD(&c->cache_hash[i]); - - mutex_init(&c->lock); - INIT_LIST_HEAD(&c->reserved_buffers); - c->need_reserved_buffers = reserved_buffers; - - init_waitqueue_head(&c->free_buffer_wait); - c->async_write_error = 0; - - c->dm_io = dm_io_client_create(); - if (IS_ERR(c->dm_io)) { - r = PTR_ERR(c->dm_io); - goto bad_dm_io; - } - - mutex_lock(&dm_bufio_clients_lock); - if (c->blocks_per_page_bits) { - if (!DM_BUFIO_CACHE_NAME(c)) { - DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size); - if (!DM_BUFIO_CACHE_NAME(c)) { - r = -ENOMEM; - mutex_unlock(&dm_bufio_clients_lock); - goto bad_cache; - } - } - - if (!DM_BUFIO_CACHE(c)) { - DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c), - c->block_size, - c->block_size, 0, NULL); - if (!DM_BUFIO_CACHE(c)) { - r = -ENOMEM; - mutex_unlock(&dm_bufio_clients_lock); - goto bad_cache; - } - } - } - mutex_unlock(&dm_bufio_clients_lock); - - while (c->need_reserved_buffers) { - struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); - - if (!b) { - r = -ENOMEM; - goto bad_buffer; - } - __free_buffer_wake(b); - } - - mutex_lock(&dm_bufio_clients_lock); - dm_bufio_client_count++; - list_add(&c->client_list, &dm_bufio_all_clients); - __cache_size_refresh(); - mutex_unlock(&dm_bufio_clients_lock); - - c->shrinker.shrink = shrink; - c->shrinker.seeks = 1; - c->shrinker.batch = 0; - register_shrinker(&c->shrinker); - - return c; - -bad_buffer: -bad_cache: - while (!list_empty(&c->reserved_buffers)) { - struct dm_buffer *b = list_entry(c->reserved_buffers.next, - struct dm_buffer, lru_list); - list_del(&b->lru_list); - free_buffer(b); - } - dm_io_client_destroy(c->dm_io); -bad_dm_io: - vfree(c->cache_hash); -bad_hash: - kfree(c); -bad_client: - return ERR_PTR(r); -} -EXPORT_SYMBOL_GPL(dm_bufio_client_create); - -/* - * Free the buffering interface. - * It is required that there are no references on any buffers. 
- */ -void dm_bufio_client_destroy(struct dm_bufio_client *c) -{ - unsigned i; - - drop_buffers(c); - - unregister_shrinker(&c->shrinker); - - mutex_lock(&dm_bufio_clients_lock); - - list_del(&c->client_list); - dm_bufio_client_count--; - __cache_size_refresh(); - - mutex_unlock(&dm_bufio_clients_lock); - - for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) - BUG_ON(!hlist_empty(&c->cache_hash[i])); - - BUG_ON(c->need_reserved_buffers); - - while (!list_empty(&c->reserved_buffers)) { - struct dm_buffer *b = list_entry(c->reserved_buffers.next, - struct dm_buffer, lru_list); - list_del(&b->lru_list); - free_buffer(b); - } - - for (i = 0; i < LIST_SIZE; i++) - if (c->n_buffers[i]) - DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]); - - for (i = 0; i < LIST_SIZE; i++) - BUG_ON(c->n_buffers[i]); - - dm_io_client_destroy(c->dm_io); - vfree(c->cache_hash); - kfree(c); -} -EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); - -static void cleanup_old_buffers(void) -{ - unsigned long max_age = dm_bufio_max_age; - struct dm_bufio_client *c; - - barrier(); - - if (max_age > ULONG_MAX / HZ) - max_age = ULONG_MAX / HZ; - - mutex_lock(&dm_bufio_clients_lock); - list_for_each_entry(c, &dm_bufio_all_clients, client_list) { - if (!dm_bufio_trylock(c)) - continue; - - while (!list_empty(&c->lru[LIST_CLEAN])) { - struct dm_buffer *b; - b = list_entry(c->lru[LIST_CLEAN].prev, - struct dm_buffer, lru_list); - if (__cleanup_old_buffer(b, 0, max_age * HZ)) - break; - dm_bufio_cond_resched(); - } - - dm_bufio_unlock(c); - dm_bufio_cond_resched(); - } - mutex_unlock(&dm_bufio_clients_lock); -} - -static struct workqueue_struct *dm_bufio_wq; -static struct delayed_work dm_bufio_work; - -static void work_fn(struct work_struct *w) -{ - cleanup_old_buffers(); - - queue_delayed_work(dm_bufio_wq, &dm_bufio_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); -} - -/*---------------------------------------------------------------- - * Module setup - *--------------------------------------------------------------*/ - -/* - * This is called only once for the whole dm_bufio module. - * It initializes memory limit. - */ -static int __init dm_bufio_init(void) -{ - __u64 mem; - - memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); - memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); - - mem = (__u64)((totalram_pages - totalhigh_pages) * - DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT; - - if (mem > ULONG_MAX) - mem = ULONG_MAX; - -#ifdef CONFIG_MMU - /* - * Get the size of vmalloc space the same way as VMALLOC_TOTAL - * in fs/proc/internal.h - */ - if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100) - mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100; -#endif - - dm_bufio_default_cache_size = mem; - - mutex_lock(&dm_bufio_clients_lock); - __cache_size_refresh(); - mutex_unlock(&dm_bufio_clients_lock); - - dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache"); - if (!dm_bufio_wq) - return -ENOMEM; - - INIT_DELAYED_WORK(&dm_bufio_work, work_fn); - queue_delayed_work(dm_bufio_wq, &dm_bufio_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); - - return 0; -} - -/* - * This is called once when unloading the dm_bufio module. 
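 * (Both the default cache size chosen at load time and the age limit used by
 *  the periodic cleanup above can be overridden at runtime through the
 *  writable module parameters declared below: max_cache_size_bytes and
 *  max_age_seconds.)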
- */ -static void __exit dm_bufio_exit(void) -{ - int bug = 0; - int i; - - cancel_delayed_work_sync(&dm_bufio_work); - destroy_workqueue(dm_bufio_wq); - - for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) { - struct kmem_cache *kc = dm_bufio_caches[i]; - - if (kc) - kmem_cache_destroy(kc); - } - - for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++) - kfree(dm_bufio_cache_names[i]); - - if (dm_bufio_client_count) { - DMCRIT("%s: dm_bufio_client_count leaked: %d", - __func__, dm_bufio_client_count); - bug = 1; - } - - if (dm_bufio_current_allocated) { - DMCRIT("%s: dm_bufio_current_allocated leaked: %lu", - __func__, dm_bufio_current_allocated); - bug = 1; - } - - if (dm_bufio_allocated_get_free_pages) { - DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu", - __func__, dm_bufio_allocated_get_free_pages); - bug = 1; - } - - if (dm_bufio_allocated_vmalloc) { - DMCRIT("%s: dm_bufio_vmalloc leaked: %lu", - __func__, dm_bufio_allocated_vmalloc); - bug = 1; - } - - if (bug) - BUG(); -} - -module_init(dm_bufio_init) -module_exit(dm_bufio_exit) - -module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); - -module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); - -module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); - -module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO); -MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc"); - -module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO); -MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages"); - -module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO); -MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc"); - -module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO); -MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache"); - -MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); -MODULE_DESCRIPTION(DM_NAME " buffered I/O library"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-bufio.h b/ANDROID_3.4.5/drivers/md/dm-bufio.h deleted file mode 100644 index b142946a..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-bufio.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (C) 2009-2011 Red Hat, Inc. - * - * Author: Mikulas Patocka <mpatocka@redhat.com> - * - * This file is released under the GPL. - */ - -#ifndef DM_BUFIO_H -#define DM_BUFIO_H - -#include <linux/blkdev.h> -#include <linux/types.h> - -/*----------------------------------------------------------------*/ - -struct dm_bufio_client; -struct dm_buffer; - -/* - * Create a buffered IO cache on a given device - */ -struct dm_bufio_client * -dm_bufio_client_create(struct block_device *bdev, unsigned block_size, - unsigned reserved_buffers, unsigned aux_size, - void (*alloc_callback)(struct dm_buffer *), - void (*write_callback)(struct dm_buffer *)); - -/* - * Release a buffered IO cache. - */ -void dm_bufio_client_destroy(struct dm_bufio_client *c); - -/* - * WARNING: to avoid deadlocks, these conditions are observed: - * - * - At most one thread can hold at most "reserved_buffers" simultaneously. 
- * - Each other threads can hold at most one buffer. - * - Threads which call only dm_bufio_get can hold unlimited number of - * buffers. - */ - -/* - * Read a given block from disk. Returns pointer to data. Returns a - * pointer to dm_buffer that can be used to release the buffer or to make - * it dirty. - */ -void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp); - -/* - * Like dm_bufio_read, but return buffer from cache, don't read - * it. If the buffer is not in the cache, return NULL. - */ -void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp); - -/* - * Like dm_bufio_read, but don't read anything from the disk. It is - * expected that the caller initializes the buffer and marks it dirty. - */ -void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp); - -/* - * Prefetch the specified blocks to the cache. - * The function starts to read the blocks and returns without waiting for - * I/O to finish. - */ -void dm_bufio_prefetch(struct dm_bufio_client *c, - sector_t block, unsigned n_blocks); - -/* - * Release a reference obtained with dm_bufio_{read,get,new}. The data - * pointer and dm_buffer pointer is no longer valid after this call. - */ -void dm_bufio_release(struct dm_buffer *b); - -/* - * Mark a buffer dirty. It should be called after the buffer is modified. - * - * In case of memory pressure, the buffer may be written after - * dm_bufio_mark_buffer_dirty, but before dm_bufio_write_dirty_buffers. So - * dm_bufio_write_dirty_buffers guarantees that the buffer is on-disk but - * the actual writing may occur earlier. - */ -void dm_bufio_mark_buffer_dirty(struct dm_buffer *b); - -/* - * Initiate writing of dirty buffers, without waiting for completion. - */ -void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c); - -/* - * Write all dirty buffers. Guarantees that all dirty buffers created prior - * to this call are on disk when this call exits. - */ -int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c); - -/* - * Send an empty write barrier to the device to flush hardware disk cache. - */ -int dm_bufio_issue_flush(struct dm_bufio_client *c); - -/* - * Like dm_bufio_release but also move the buffer to the new - * block. dm_bufio_write_dirty_buffers is needed to commit the new block. - */ -void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); - -unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); -sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); -sector_t dm_bufio_get_block_number(struct dm_buffer *b); -void *dm_bufio_get_block_data(struct dm_buffer *b); -void *dm_bufio_get_aux_data(struct dm_buffer *b); -struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/ANDROID_3.4.5/drivers/md/dm-crypt.c b/ANDROID_3.4.5/drivers/md/dm-crypt.c deleted file mode 100644 index 3f06df59..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-crypt.c +++ /dev/null @@ -1,1914 +0,0 @@ -/* - * Copyright (C) 2003 Christophe Saout <christophe@saout.de> - * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> - * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. 
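A minimal usage sketch of the dm-bufio interface declared above, for illustration only: it assumes a caller that already holds an opened block_device and wants to update a single 4 KiB metadata block. The names example_update_block, my_bdev and my_block are placeholders, and the error handling simply follows the ERR_PTR convention used by the implementation above.

	#include <linux/err.h>
	#include "dm-bufio.h"

	/* Illustrative: read one 4 KiB block, modify it, and commit it to disk. */
	static int example_update_block(struct block_device *my_bdev, sector_t my_block)
	{
		struct dm_bufio_client *c;
		struct dm_buffer *b;
		u8 *data;
		int r;

		/* 4 KiB blocks, one reserved buffer, no aux data, no callbacks */
		c = dm_bufio_client_create(my_bdev, 4096, 1, 0, NULL, NULL);
		if (IS_ERR(c))
			return PTR_ERR(c);

		data = dm_bufio_read(c, my_block, &b);	/* read errors come back as ERR_PTR */
		if (IS_ERR(data)) {
			r = PTR_ERR(data);
			goto out;
		}

		data[0] ^= 0xff;			/* modify the cached block */
		dm_bufio_mark_buffer_dirty(b);		/* schedule it for writeback */
		dm_bufio_release(b);			/* drop the reference */

		r = dm_bufio_write_dirty_buffers(c);	/* wait until it is on disk */
	out:
		dm_bufio_client_destroy(c);		/* all references must be gone */
		return r;
	}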
- */ - -#include <linux/completion.h> -#include <linux/err.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/mempool.h> -#include <linux/slab.h> -#include <linux/crypto.h> -#include <linux/workqueue.h> -#include <linux/backing-dev.h> -#include <linux/percpu.h> -#include <linux/atomic.h> -#include <linux/scatterlist.h> -#include <asm/page.h> -#include <asm/unaligned.h> -#include <crypto/hash.h> -#include <crypto/md5.h> -#include <crypto/algapi.h> - -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "crypt" - -/* - * context holding the current state of a multi-part conversion - */ -struct convert_context { - struct completion restart; - struct bio *bio_in; - struct bio *bio_out; - unsigned int offset_in; - unsigned int offset_out; - unsigned int idx_in; - unsigned int idx_out; - sector_t sector; - atomic_t pending; -}; - -/* - * per bio private data - */ -struct dm_crypt_io { - struct dm_target *target; - struct bio *base_bio; - struct work_struct work; - - struct convert_context ctx; - - atomic_t pending; - int error; - sector_t sector; - struct dm_crypt_io *base_io; -}; - -struct dm_crypt_request { - struct convert_context *ctx; - struct scatterlist sg_in; - struct scatterlist sg_out; - sector_t iv_sector; -}; - -struct crypt_config; - -struct crypt_iv_operations { - int (*ctr)(struct crypt_config *cc, struct dm_target *ti, - const char *opts); - void (*dtr)(struct crypt_config *cc); - int (*init)(struct crypt_config *cc); - int (*wipe)(struct crypt_config *cc); - int (*generator)(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq); - int (*post)(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq); -}; - -struct iv_essiv_private { - struct crypto_hash *hash_tfm; - u8 *salt; -}; - -struct iv_benbi_private { - int shift; -}; - -#define LMK_SEED_SIZE 64 /* hash + 0 */ -struct iv_lmk_private { - struct crypto_shash *hash_tfm; - u8 *seed; -}; - -/* - * Crypt: maps a linear range of a block device - * and encrypts / decrypts at the same time. - */ -enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; - -/* - * Duplicated per-CPU state for cipher. - */ -struct crypt_cpu { - struct ablkcipher_request *req; - /* ESSIV: struct crypto_cipher *essiv_tfm */ - void *iv_private; - struct crypto_ablkcipher *tfms[0]; -}; - -/* - * The fields in here must be read only after initialization, - * changing state should be in crypt_cpu. - */ -struct crypt_config { - struct dm_dev *dev; - sector_t start; - - /* - * pool for per bio private data, crypto requests and - * encryption requeusts/buffer pages - */ - mempool_t *io_pool; - mempool_t *req_pool; - mempool_t *page_pool; - struct bio_set *bs; - - struct workqueue_struct *io_queue; - struct workqueue_struct *crypt_queue; - - char *cipher; - char *cipher_string; - - struct crypt_iv_operations *iv_gen_ops; - union { - struct iv_essiv_private essiv; - struct iv_benbi_private benbi; - struct iv_lmk_private lmk; - } iv_gen_private; - sector_t iv_offset; - unsigned int iv_size; - - /* - * Duplicated per cpu state. Access through - * per_cpu_ptr() only. - */ - struct crypt_cpu __percpu *cpu; - unsigned tfms_count; - - /* - * Layout of each crypto request: - * - * struct ablkcipher_request - * context - * padding - * struct dm_crypt_request - * padding - * IV - * - * The padding is added so that dm_crypt_request and the IV are - * correctly aligned. 
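 * The helpers dmreq_of_req() and iv_of_dmreq() further down implement this
 * layout: the dm_crypt_request lives dmreq_start bytes past the
 * ablkcipher_request, and the IV begins at the first address after it that
 * satisfies the cipher's alignment mask.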
- */ - unsigned int dmreq_start; - - unsigned long flags; - unsigned int key_size; - unsigned int key_parts; - u8 key[0]; -}; - -#define MIN_IOS 16 -#define MIN_POOL_PAGES 32 - -static struct kmem_cache *_crypt_io_pool; - -static void clone_init(struct dm_crypt_io *, struct bio *); -static void kcryptd_queue_crypt(struct dm_crypt_io *io); -static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); - -static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) -{ - return this_cpu_ptr(cc->cpu); -} - -/* - * Use this to access cipher attributes that are the same for each CPU. - */ -static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) -{ - return __this_cpu_ptr(cc->cpu)->tfms[0]; -} - -/* - * Different IV generation algorithms: - * - * plain: the initial vector is the 32-bit little-endian version of the sector - * number, padded with zeros if necessary. - * - * plain64: the initial vector is the 64-bit little-endian version of the sector - * number, padded with zeros if necessary. - * - * essiv: "encrypted sector|salt initial vector", the sector number is - * encrypted with the bulk cipher using a salt as key. The salt - * should be derived from the bulk cipher's key via hashing. - * - * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1 - * (needed for LRW-32-AES and possible other narrow block modes) - * - * null: the initial vector is always zero. Provides compatibility with - * obsolete loop_fish2 devices. Do not use for new devices. - * - * lmk: Compatible implementation of the block chaining mode used - * by the Loop-AES block device encryption system - * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ - * It operates on full 512 byte sectors and uses CBC - * with an IV derived from the sector number, the data and - * optionally extra IV seed. - * This means that after decryption the first block - * of sector must be tweaked according to decrypted data. 
- * Loop-AES can use three encryption schemes: - * version 1: is plain aes-cbc mode - * version 2: uses 64 multikey scheme with lmk IV generator - * version 3: the same as version 2 with additional IV seed - * (it uses 65 keys, last key is used as IV seed) - * - * plumb: unimplemented, see: - * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 - */ - -static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) -{ - memset(iv, 0, cc->iv_size); - *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); - - return 0; -} - -static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) -{ - memset(iv, 0, cc->iv_size); - *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); - - return 0; -} - -/* Initialise ESSIV - compute salt but no local memory allocations */ -static int crypt_iv_essiv_init(struct crypt_config *cc) -{ - struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - struct hash_desc desc; - struct scatterlist sg; - struct crypto_cipher *essiv_tfm; - int err, cpu; - - sg_init_one(&sg, cc->key, cc->key_size); - desc.tfm = essiv->hash_tfm; - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); - if (err) - return err; - - for_each_possible_cpu(cpu) { - essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, - - err = crypto_cipher_setkey(essiv_tfm, essiv->salt, - crypto_hash_digestsize(essiv->hash_tfm)); - if (err) - return err; - } - - return 0; -} - -/* Wipe salt and reset key derived from volume key */ -static int crypt_iv_essiv_wipe(struct crypt_config *cc) -{ - struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); - struct crypto_cipher *essiv_tfm; - int cpu, r, err = 0; - - memset(essiv->salt, 0, salt_size); - - for_each_possible_cpu(cpu) { - essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; - r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); - if (r) - err = r; - } - - return err; -} - -/* Set up per cpu cipher state */ -static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, - struct dm_target *ti, - u8 *salt, unsigned saltsize) -{ - struct crypto_cipher *essiv_tfm; - int err; - - /* Setup the essiv_tfm with the given salt */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(essiv_tfm)) { - ti->error = "Error allocating crypto tfm for ESSIV"; - return essiv_tfm; - } - - if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(any_tfm(cc))) { - ti->error = "Block size of ESSIV cipher does " - "not match IV size of block cipher"; - crypto_free_cipher(essiv_tfm); - return ERR_PTR(-EINVAL); - } - - err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); - if (err) { - ti->error = "Failed to set key for ESSIV cipher"; - crypto_free_cipher(essiv_tfm); - return ERR_PTR(err); - } - - return essiv_tfm; -} - -static void crypt_iv_essiv_dtr(struct crypt_config *cc) -{ - int cpu; - struct crypt_cpu *cpu_cc; - struct crypto_cipher *essiv_tfm; - struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - - crypto_free_hash(essiv->hash_tfm); - essiv->hash_tfm = NULL; - - kzfree(essiv->salt); - essiv->salt = NULL; - - for_each_possible_cpu(cpu) { - cpu_cc = per_cpu_ptr(cc->cpu, cpu); - essiv_tfm = cpu_cc->iv_private; - - if (essiv_tfm) - crypto_free_cipher(essiv_tfm); - - cpu_cc->iv_private = NULL; - } -} - -static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, - const char *opts) -{ - 
struct crypto_cipher *essiv_tfm = NULL; - struct crypto_hash *hash_tfm = NULL; - u8 *salt = NULL; - int err, cpu; - - if (!opts) { - ti->error = "Digest algorithm missing for ESSIV mode"; - return -EINVAL; - } - - /* Allocate hash algorithm */ - hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(hash_tfm)) { - ti->error = "Error initializing ESSIV hash"; - err = PTR_ERR(hash_tfm); - goto bad; - } - - salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); - if (!salt) { - ti->error = "Error kmallocing salt storage in ESSIV"; - err = -ENOMEM; - goto bad; - } - - cc->iv_gen_private.essiv.salt = salt; - cc->iv_gen_private.essiv.hash_tfm = hash_tfm; - - for_each_possible_cpu(cpu) { - essiv_tfm = setup_essiv_cpu(cc, ti, salt, - crypto_hash_digestsize(hash_tfm)); - if (IS_ERR(essiv_tfm)) { - crypt_iv_essiv_dtr(cc); - return PTR_ERR(essiv_tfm); - } - per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; - } - - return 0; - -bad: - if (hash_tfm && !IS_ERR(hash_tfm)) - crypto_free_hash(hash_tfm); - kfree(salt); - return err; -} - -static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) -{ - struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; - - memset(iv, 0, cc->iv_size); - *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); - crypto_cipher_encrypt_one(essiv_tfm, iv, iv); - - return 0; -} - -static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, - const char *opts) -{ - unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); - int log = ilog2(bs); - - /* we need to calculate how far we must shift the sector count - * to get the cipher block count, we use this shift in _gen */ - - if (1 << log != bs) { - ti->error = "cypher blocksize is not a power of 2"; - return -EINVAL; - } - - if (log > 9) { - ti->error = "cypher blocksize is > 512"; - return -EINVAL; - } - - cc->iv_gen_private.benbi.shift = 9 - log; - - return 0; -} - -static void crypt_iv_benbi_dtr(struct crypt_config *cc) -{ -} - -static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) -{ - __be64 val; - - memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - - val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); - put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); - - return 0; -} - -static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) -{ - memset(iv, 0, cc->iv_size); - - return 0; -} - -static void crypt_iv_lmk_dtr(struct crypt_config *cc) -{ - struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - - if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) - crypto_free_shash(lmk->hash_tfm); - lmk->hash_tfm = NULL; - - kzfree(lmk->seed); - lmk->seed = NULL; -} - -static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, - const char *opts) -{ - struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - - lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(lmk->hash_tfm)) { - ti->error = "Error initializing LMK hash"; - return PTR_ERR(lmk->hash_tfm); - } - - /* No seed in LMK version 2 */ - if (cc->key_parts == cc->tfms_count) { - lmk->seed = NULL; - return 0; - } - - lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); - if (!lmk->seed) { - crypt_iv_lmk_dtr(cc); - ti->error = "Error kmallocing seed storage in LMK"; - return -ENOMEM; - } - - return 0; -} - -static int crypt_iv_lmk_init(struct crypt_config *cc) -{ - struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - int subkey_size = 
cc->key_size / cc->key_parts; - - /* LMK seed is on the position of LMK_KEYS + 1 key */ - if (lmk->seed) - memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), - crypto_shash_digestsize(lmk->hash_tfm)); - - return 0; -} - -static int crypt_iv_lmk_wipe(struct crypt_config *cc) -{ - struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - - if (lmk->seed) - memset(lmk->seed, 0, LMK_SEED_SIZE); - - return 0; -} - -static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq, - u8 *data) -{ - struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - struct { - struct shash_desc desc; - char ctx[crypto_shash_descsize(lmk->hash_tfm)]; - } sdesc; - struct md5_state md5state; - u32 buf[4]; - int i, r; - - sdesc.desc.tfm = lmk->hash_tfm; - sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - r = crypto_shash_init(&sdesc.desc); - if (r) - return r; - - if (lmk->seed) { - r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); - if (r) - return r; - } - - /* Sector is always 512B, block size 16, add data of blocks 1-31 */ - r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); - if (r) - return r; - - /* Sector is cropped to 56 bits here */ - buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); - buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); - buf[2] = cpu_to_le32(4024); - buf[3] = 0; - r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); - if (r) - return r; - - /* No MD5 padding here */ - r = crypto_shash_export(&sdesc.desc, &md5state); - if (r) - return r; - - for (i = 0; i < MD5_HASH_WORDS; i++) - __cpu_to_le32s(&md5state.hash[i]); - memcpy(iv, &md5state.hash, cc->iv_size); - - return 0; -} - -static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) -{ - u8 *src; - int r = 0; - - if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { - src = kmap_atomic(sg_page(&dmreq->sg_in)); - r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); - kunmap_atomic(src); - } else - memset(iv, 0, cc->iv_size); - - return r; -} - -static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) -{ - u8 *dst; - int r; - - if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) - return 0; - - dst = kmap_atomic(sg_page(&dmreq->sg_out)); - r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); - - /* Tweak the first block of plaintext sector */ - if (!r) - crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); - - kunmap_atomic(dst); - return r; -} - -static struct crypt_iv_operations crypt_iv_plain_ops = { - .generator = crypt_iv_plain_gen -}; - -static struct crypt_iv_operations crypt_iv_plain64_ops = { - .generator = crypt_iv_plain64_gen -}; - -static struct crypt_iv_operations crypt_iv_essiv_ops = { - .ctr = crypt_iv_essiv_ctr, - .dtr = crypt_iv_essiv_dtr, - .init = crypt_iv_essiv_init, - .wipe = crypt_iv_essiv_wipe, - .generator = crypt_iv_essiv_gen -}; - -static struct crypt_iv_operations crypt_iv_benbi_ops = { - .ctr = crypt_iv_benbi_ctr, - .dtr = crypt_iv_benbi_dtr, - .generator = crypt_iv_benbi_gen -}; - -static struct crypt_iv_operations crypt_iv_null_ops = { - .generator = crypt_iv_null_gen -}; - -static struct crypt_iv_operations crypt_iv_lmk_ops = { - .ctr = crypt_iv_lmk_ctr, - .dtr = crypt_iv_lmk_dtr, - .init = crypt_iv_lmk_init, - .wipe = crypt_iv_lmk_wipe, - .generator = crypt_iv_lmk_gen, - .post = crypt_iv_lmk_post -}; - -static void crypt_convert_init(struct crypt_config *cc, - struct convert_context *ctx, - struct bio *bio_out, struct 
bio *bio_in, - sector_t sector) -{ - ctx->bio_in = bio_in; - ctx->bio_out = bio_out; - ctx->offset_in = 0; - ctx->offset_out = 0; - ctx->idx_in = bio_in ? bio_in->bi_idx : 0; - ctx->idx_out = bio_out ? bio_out->bi_idx : 0; - ctx->sector = sector + cc->iv_offset; - init_completion(&ctx->restart); -} - -static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, - struct ablkcipher_request *req) -{ - return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); -} - -static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, - struct dm_crypt_request *dmreq) -{ - return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); -} - -static u8 *iv_of_dmreq(struct crypt_config *cc, - struct dm_crypt_request *dmreq) -{ - return (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); -} - -static int crypt_convert_block(struct crypt_config *cc, - struct convert_context *ctx, - struct ablkcipher_request *req) -{ - struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); - struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); - struct dm_crypt_request *dmreq; - u8 *iv; - int r = 0; - - dmreq = dmreq_of_req(cc, req); - iv = iv_of_dmreq(cc, dmreq); - - dmreq->iv_sector = ctx->sector; - dmreq->ctx = ctx; - sg_init_table(&dmreq->sg_in, 1); - sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, - bv_in->bv_offset + ctx->offset_in); - - sg_init_table(&dmreq->sg_out, 1); - sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, - bv_out->bv_offset + ctx->offset_out); - - ctx->offset_in += 1 << SECTOR_SHIFT; - if (ctx->offset_in >= bv_in->bv_len) { - ctx->offset_in = 0; - ctx->idx_in++; - } - - ctx->offset_out += 1 << SECTOR_SHIFT; - if (ctx->offset_out >= bv_out->bv_len) { - ctx->offset_out = 0; - ctx->idx_out++; - } - - if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, dmreq); - if (r < 0) - return r; - } - - ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, - 1 << SECTOR_SHIFT, iv); - - if (bio_data_dir(ctx->bio_in) == WRITE) - r = crypto_ablkcipher_encrypt(req); - else - r = crypto_ablkcipher_decrypt(req); - - if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, iv, dmreq); - - return r; -} - -static void kcryptd_async_done(struct crypto_async_request *async_req, - int error); - -static void crypt_alloc_req(struct crypt_config *cc, - struct convert_context *ctx) -{ - struct crypt_cpu *this_cc = this_crypt_config(cc); - unsigned key_index = ctx->sector & (cc->tfms_count - 1); - - if (!this_cc->req) - this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - - ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); - ablkcipher_request_set_callback(this_cc->req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); -} - -/* - * Encrypt / decrypt data from one bio to another one (can be the same one) - */ -static int crypt_convert(struct crypt_config *cc, - struct convert_context *ctx) -{ - struct crypt_cpu *this_cc = this_crypt_config(cc); - int r; - - atomic_set(&ctx->pending, 1); - - while(ctx->idx_in < ctx->bio_in->bi_vcnt && - ctx->idx_out < ctx->bio_out->bi_vcnt) { - - crypt_alloc_req(cc, ctx); - - atomic_inc(&ctx->pending); - - r = crypt_convert_block(cc, ctx, this_cc->req); - - switch (r) { - /* async */ - case -EBUSY: - wait_for_completion(&ctx->restart); - INIT_COMPLETION(ctx->restart); - /* fall through*/ - case -EINPROGRESS: - this_cc->req = NULL; - ctx->sector++; - 
continue; - - /* sync */ - case 0: - atomic_dec(&ctx->pending); - ctx->sector++; - cond_resched(); - continue; - - /* error */ - default: - atomic_dec(&ctx->pending); - return r; - } - } - - return 0; -} - -static void dm_crypt_bio_destructor(struct bio *bio) -{ - struct dm_crypt_io *io = bio->bi_private; - struct crypt_config *cc = io->target->private; - - bio_free(bio, cc->bs); -} - -/* - * Generate a new unfragmented bio with the given size - * This should never violate the device limitations - * May return a smaller bio when running out of pages, indicated by - * *out_of_pages set to 1. - */ -static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, - unsigned *out_of_pages) -{ - struct crypt_config *cc = io->target->private; - struct bio *clone; - unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; - unsigned i, len; - struct page *page; - - clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); - if (!clone) - return NULL; - - clone_init(io, clone); - *out_of_pages = 0; - - for (i = 0; i < nr_iovecs; i++) { - page = mempool_alloc(cc->page_pool, gfp_mask); - if (!page) { - *out_of_pages = 1; - break; - } - - /* - * If additional pages cannot be allocated without waiting, - * return a partially-allocated bio. The caller will then try - * to allocate more bios while submitting this partial bio. - */ - gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; - - len = (size > PAGE_SIZE) ? PAGE_SIZE : size; - - if (!bio_add_page(clone, page, len, 0)) { - mempool_free(page, cc->page_pool); - break; - } - - size -= len; - } - - if (!clone->bi_size) { - bio_put(clone); - return NULL; - } - - return clone; -} - -static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) -{ - unsigned int i; - struct bio_vec *bv; - - for (i = 0; i < clone->bi_vcnt; i++) { - bv = bio_iovec_idx(clone, i); - BUG_ON(!bv->bv_page); - mempool_free(bv->bv_page, cc->page_pool); - bv->bv_page = NULL; - } -} - -static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti, - struct bio *bio, sector_t sector) -{ - struct crypt_config *cc = ti->private; - struct dm_crypt_io *io; - - io = mempool_alloc(cc->io_pool, GFP_NOIO); - io->target = ti; - io->base_bio = bio; - io->sector = sector; - io->error = 0; - io->base_io = NULL; - atomic_set(&io->pending, 0); - - return io; -} - -static void crypt_inc_pending(struct dm_crypt_io *io) -{ - atomic_inc(&io->pending); -} - -/* - * One of the bios was finished. Check for completion of - * the whole request and correctly clean up the buffer. - * If base_io is set, wait for the last fragment to complete. - */ -static void crypt_dec_pending(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - struct bio *base_bio = io->base_bio; - struct dm_crypt_io *base_io = io->base_io; - int error = io->error; - - if (!atomic_dec_and_test(&io->pending)) - return; - - mempool_free(io, cc->io_pool); - - if (likely(!base_io)) - bio_endio(base_bio, error); - else { - if (error && !base_io->error) - base_io->error = error; - crypt_dec_pending(base_io); - } -} - -/* - * kcryptd/kcryptd_io: - * - * Needed because it would be very unwise to do decryption in an - * interrupt context. - * - * kcryptd performs the actual encryption or decryption. - * - * kcryptd_io performs the IO submission. - * - * They must be separated as otherwise the final stages could be - * starved by new requests which can block in the first stages due - * to memory allocation. 
- * - * The work is done per CPU global for all dm-crypt instances. - * They should not depend on each other and do not block. - */ -static void crypt_endio(struct bio *clone, int error) -{ - struct dm_crypt_io *io = clone->bi_private; - struct crypt_config *cc = io->target->private; - unsigned rw = bio_data_dir(clone); - - if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) - error = -EIO; - - /* - * free the processed pages - */ - if (rw == WRITE) - crypt_free_buffer_pages(cc, clone); - - bio_put(clone); - - if (rw == READ && !error) { - kcryptd_queue_crypt(io); - return; - } - - if (unlikely(error)) - io->error = error; - - crypt_dec_pending(io); -} - -static void clone_init(struct dm_crypt_io *io, struct bio *clone) -{ - struct crypt_config *cc = io->target->private; - - clone->bi_private = io; - clone->bi_end_io = crypt_endio; - clone->bi_bdev = cc->dev->bdev; - clone->bi_rw = io->base_bio->bi_rw; - clone->bi_destructor = dm_crypt_bio_destructor; -} - -static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) -{ - struct crypt_config *cc = io->target->private; - struct bio *base_bio = io->base_bio; - struct bio *clone; - - /* - * The block layer might modify the bvec array, so always - * copy the required bvecs because we need the original - * one in order to decrypt the whole bio data *afterwards*. - */ - clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); - if (!clone) - return 1; - - crypt_inc_pending(io); - - clone_init(io, clone); - clone->bi_idx = 0; - clone->bi_vcnt = bio_segments(base_bio); - clone->bi_size = base_bio->bi_size; - clone->bi_sector = cc->start + io->sector; - memcpy(clone->bi_io_vec, bio_iovec(base_bio), - sizeof(struct bio_vec) * clone->bi_vcnt); - - generic_make_request(clone); - return 0; -} - -static void kcryptd_io_write(struct dm_crypt_io *io) -{ - struct bio *clone = io->ctx.bio_out; - generic_make_request(clone); -} - -static void kcryptd_io(struct work_struct *work) -{ - struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - - if (bio_data_dir(io->base_bio) == READ) { - crypt_inc_pending(io); - if (kcryptd_io_read(io, GFP_NOIO)) - io->error = -ENOMEM; - crypt_dec_pending(io); - } else - kcryptd_io_write(io); -} - -static void kcryptd_queue_io(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - - INIT_WORK(&io->work, kcryptd_io); - queue_work(cc->io_queue, &io->work); -} - -static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) -{ - struct bio *clone = io->ctx.bio_out; - struct crypt_config *cc = io->target->private; - - if (unlikely(io->error < 0)) { - crypt_free_buffer_pages(cc, clone); - bio_put(clone); - crypt_dec_pending(io); - return; - } - - /* crypt_convert should have filled the clone bio */ - BUG_ON(io->ctx.idx_out < clone->bi_vcnt); - - clone->bi_sector = cc->start + io->sector; - - if (async) - kcryptd_queue_io(io); - else - generic_make_request(clone); -} - -static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - struct bio *clone; - struct dm_crypt_io *new_io; - int crypt_finished; - unsigned out_of_pages = 0; - unsigned remaining = io->base_bio->bi_size; - sector_t sector = io->sector; - int r; - - /* - * Prevent io from disappearing until this function completes. - */ - crypt_inc_pending(io); - crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector); - - /* - * The allocated buffers can be smaller than the whole bio, - * so repeat the whole process until all the data can be handled. 
- */ - while (remaining) { - clone = crypt_alloc_buffer(io, remaining, &out_of_pages); - if (unlikely(!clone)) { - io->error = -ENOMEM; - break; - } - - io->ctx.bio_out = clone; - io->ctx.idx_out = 0; - - remaining -= clone->bi_size; - sector += bio_sectors(clone); - - crypt_inc_pending(io); - - r = crypt_convert(cc, &io->ctx); - if (r < 0) - io->error = -EIO; - - crypt_finished = atomic_dec_and_test(&io->ctx.pending); - - /* Encryption was already finished, submit io now */ - if (crypt_finished) { - kcryptd_crypt_write_io_submit(io, 0); - - /* - * If there was an error, do not try next fragments. - * For async, error is processed in async handler. - */ - if (unlikely(r < 0)) - break; - - io->sector = sector; - } - - /* - * Out of memory -> run queues - * But don't wait if split was due to the io size restriction - */ - if (unlikely(out_of_pages)) - congestion_wait(BLK_RW_ASYNC, HZ/100); - - /* - * With async crypto it is unsafe to share the crypto context - * between fragments, so switch to a new dm_crypt_io structure. - */ - if (unlikely(!crypt_finished && remaining)) { - new_io = crypt_io_alloc(io->target, io->base_bio, - sector); - crypt_inc_pending(new_io); - crypt_convert_init(cc, &new_io->ctx, NULL, - io->base_bio, sector); - new_io->ctx.idx_in = io->ctx.idx_in; - new_io->ctx.offset_in = io->ctx.offset_in; - - /* - * Fragments after the first use the base_io - * pending count. - */ - if (!io->base_io) - new_io->base_io = io; - else { - new_io->base_io = io->base_io; - crypt_inc_pending(io->base_io); - crypt_dec_pending(io); - } - - io = new_io; - } - } - - crypt_dec_pending(io); -} - -static void kcryptd_crypt_read_done(struct dm_crypt_io *io) -{ - crypt_dec_pending(io); -} - -static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - int r = 0; - - crypt_inc_pending(io); - - crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, - io->sector); - - r = crypt_convert(cc, &io->ctx); - if (r < 0) - io->error = -EIO; - - if (atomic_dec_and_test(&io->ctx.pending)) - kcryptd_crypt_read_done(io); - - crypt_dec_pending(io); -} - -static void kcryptd_async_done(struct crypto_async_request *async_req, - int error) -{ - struct dm_crypt_request *dmreq = async_req->data; - struct convert_context *ctx = dmreq->ctx; - struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); - struct crypt_config *cc = io->target->private; - - if (error == -EINPROGRESS) { - complete(&ctx->restart); - return; - } - - if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) - error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); - - if (error < 0) - io->error = -EIO; - - mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); - - if (!atomic_dec_and_test(&ctx->pending)) - return; - - if (bio_data_dir(io->base_bio) == READ) - kcryptd_crypt_read_done(io); - else - kcryptd_crypt_write_io_submit(io, 1); -} - -static void kcryptd_crypt(struct work_struct *work) -{ - struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - - if (bio_data_dir(io->base_bio) == READ) - kcryptd_crypt_read_convert(io); - else - kcryptd_crypt_write_convert(io); -} - -static void kcryptd_queue_crypt(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - - INIT_WORK(&io->work, kcryptd_crypt); - queue_work(cc->crypt_queue, &io->work); -} - -/* - * Decode key from its hex representation - */ -static int crypt_decode_key(u8 *key, char *hex, unsigned int size) -{ - char buffer[3]; - char *endp; - unsigned int i; - - buffer[2] = '\0'; 
- - for (i = 0; i < size; i++) { - buffer[0] = *hex++; - buffer[1] = *hex++; - - key[i] = (u8)simple_strtoul(buffer, &endp, 16); - - if (endp != &buffer[2]) - return -EINVAL; - } - - if (*hex != '\0') - return -EINVAL; - - return 0; -} - -/* - * Encode key into its hex representation - */ -static void crypt_encode_key(char *hex, u8 *key, unsigned int size) -{ - unsigned int i; - - for (i = 0; i < size; i++) { - sprintf(hex, "%02x", *key); - hex += 2; - key++; - } -} - -static void crypt_free_tfms(struct crypt_config *cc, int cpu) -{ - struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); - unsigned i; - - for (i = 0; i < cc->tfms_count; i++) - if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { - crypto_free_ablkcipher(cpu_cc->tfms[i]); - cpu_cc->tfms[i] = NULL; - } -} - -static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) -{ - struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); - unsigned i; - int err; - - for (i = 0; i < cc->tfms_count; i++) { - cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); - if (IS_ERR(cpu_cc->tfms[i])) { - err = PTR_ERR(cpu_cc->tfms[i]); - crypt_free_tfms(cc, cpu); - return err; - } - } - - return 0; -} - -static int crypt_setkey_allcpus(struct crypt_config *cc) -{ - unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); - int cpu, err = 0, i, r; - - for_each_possible_cpu(cpu) { - for (i = 0; i < cc->tfms_count; i++) { - r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], - cc->key + (i * subkey_size), subkey_size); - if (r) - err = r; - } - } - - return err; -} - -static int crypt_set_key(struct crypt_config *cc, char *key) -{ - int r = -EINVAL; - int key_string_len = strlen(key); - - /* The key size may not be changed. */ - if (cc->key_size != (key_string_len >> 1)) - goto out; - - /* Hyphen (which gives a key_size of zero) means there is no key. */ - if (!cc->key_size && strcmp(key, "-")) - goto out; - - if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) - goto out; - - set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - - r = crypt_setkey_allcpus(cc); - -out: - /* Hex key string not needed after here, so wipe it. 
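 * (The key string itself must be exactly 2 * key_size hexadecimal digits, or
 *  a single '-' for an empty key, as checked above; a 256-bit key is
 *  therefore supplied as 64 hex characters in the mapping table.)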
*/ - memset(key, '0', key_string_len); - - return r; -} - -static int crypt_wipe_key(struct crypt_config *cc) -{ - clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); - memset(&cc->key, 0, cc->key_size * sizeof(u8)); - - return crypt_setkey_allcpus(cc); -} - -static void crypt_dtr(struct dm_target *ti) -{ - struct crypt_config *cc = ti->private; - struct crypt_cpu *cpu_cc; - int cpu; - - ti->private = NULL; - - if (!cc) - return; - - if (cc->io_queue) - destroy_workqueue(cc->io_queue); - if (cc->crypt_queue) - destroy_workqueue(cc->crypt_queue); - - if (cc->cpu) - for_each_possible_cpu(cpu) { - cpu_cc = per_cpu_ptr(cc->cpu, cpu); - if (cpu_cc->req) - mempool_free(cpu_cc->req, cc->req_pool); - crypt_free_tfms(cc, cpu); - } - - if (cc->bs) - bioset_free(cc->bs); - - if (cc->page_pool) - mempool_destroy(cc->page_pool); - if (cc->req_pool) - mempool_destroy(cc->req_pool); - if (cc->io_pool) - mempool_destroy(cc->io_pool); - - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - - if (cc->dev) - dm_put_device(ti, cc->dev); - - if (cc->cpu) - free_percpu(cc->cpu); - - kzfree(cc->cipher); - kzfree(cc->cipher_string); - - /* Must zero key material before freeing */ - kzfree(cc); -} - -static int crypt_ctr_cipher(struct dm_target *ti, - char *cipher_in, char *key) -{ - struct crypt_config *cc = ti->private; - char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; - char *cipher_api = NULL; - int cpu, ret = -EINVAL; - char dummy; - - /* Convert to crypto api definition? */ - if (strchr(cipher_in, '(')) { - ti->error = "Bad cipher specification"; - return -EINVAL; - } - - cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); - if (!cc->cipher_string) - goto bad_mem; - - /* - * Legacy dm-crypt cipher specification - * cipher[:keycount]-mode-iv:ivopts - */ - tmp = cipher_in; - keycount = strsep(&tmp, "-"); - cipher = strsep(&keycount, ":"); - - if (!keycount) - cc->tfms_count = 1; - else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || - !is_power_of_2(cc->tfms_count)) { - ti->error = "Bad cipher key count specification"; - return -EINVAL; - } - cc->key_parts = cc->tfms_count; - - cc->cipher = kstrdup(cipher, GFP_KERNEL); - if (!cc->cipher) - goto bad_mem; - - chainmode = strsep(&tmp, "-"); - ivopts = strsep(&tmp, "-"); - ivmode = strsep(&ivopts, ":"); - - if (tmp) - DMWARN("Ignoring unexpected additional cipher options"); - - cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + - cc->tfms_count * sizeof(*(cc->cpu->tfms)), - __alignof__(struct crypt_cpu)); - if (!cc->cpu) { - ti->error = "Cannot allocate per cpu state"; - goto bad_mem; - } - - /* - * For compatibility with the original dm-crypt mapping format, if - * only the cipher name is supplied, use cbc-plain. 
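 * For example, a bare "aes" is treated as "aes-cbc-plain": chainmode becomes
 * "cbc", ivmode becomes "plain", and the cipher requested from the crypto API
 * below is "cbc(aes)".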
- */ - if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { - chainmode = "cbc"; - ivmode = "plain"; - } - - if (strcmp(chainmode, "ecb") && !ivmode) { - ti->error = "IV mechanism required"; - return -EINVAL; - } - - cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); - if (!cipher_api) - goto bad_mem; - - ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, - "%s(%s)", chainmode, cipher); - if (ret < 0) { - kfree(cipher_api); - goto bad_mem; - } - - /* Allocate cipher */ - for_each_possible_cpu(cpu) { - ret = crypt_alloc_tfms(cc, cpu, cipher_api); - if (ret < 0) { - ti->error = "Error allocating crypto tfm"; - goto bad; - } - } - - /* Initialize and set key */ - ret = crypt_set_key(cc, key); - if (ret < 0) { - ti->error = "Error decoding and setting key"; - goto bad; - } - - /* Initialize IV */ - cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); - if (cc->iv_size) - /* at least a 64 bit sector number should fit in our buffer */ - cc->iv_size = max(cc->iv_size, - (unsigned int)(sizeof(u64) / sizeof(u8))); - else if (ivmode) { - DMWARN("Selected cipher does not support IVs"); - ivmode = NULL; - } - - /* Choose ivmode, see comments at iv code. */ - if (ivmode == NULL) - cc->iv_gen_ops = NULL; - else if (strcmp(ivmode, "plain") == 0) - cc->iv_gen_ops = &crypt_iv_plain_ops; - else if (strcmp(ivmode, "plain64") == 0) - cc->iv_gen_ops = &crypt_iv_plain64_ops; - else if (strcmp(ivmode, "essiv") == 0) - cc->iv_gen_ops = &crypt_iv_essiv_ops; - else if (strcmp(ivmode, "benbi") == 0) - cc->iv_gen_ops = &crypt_iv_benbi_ops; - else if (strcmp(ivmode, "null") == 0) - cc->iv_gen_ops = &crypt_iv_null_ops; - else if (strcmp(ivmode, "lmk") == 0) { - cc->iv_gen_ops = &crypt_iv_lmk_ops; - /* Version 2 and 3 is recognised according - * to length of provided multi-key string. - * If present (version 3), last key is used as IV seed. 
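 * For instance, "aes:64-cbc-lmk" with 65 concatenated AES keys in the key
 * string yields key_parts = 65 (one more than tfms_count), which
 * crypt_iv_lmk_ctr() treats as version 3 and uses the extra key as the IV
 * seed; with exactly 64 keys it behaves as version 2.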
- */ - if (cc->key_size % cc->key_parts) - cc->key_parts++; - } else { - ret = -EINVAL; - ti->error = "Invalid IV mode"; - goto bad; - } - - /* Allocate IV */ - if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { - ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); - if (ret < 0) { - ti->error = "Error creating IV"; - goto bad; - } - } - - /* Initialize IV (set keys for ESSIV etc) */ - if (cc->iv_gen_ops && cc->iv_gen_ops->init) { - ret = cc->iv_gen_ops->init(cc); - if (ret < 0) { - ti->error = "Error initialising IV"; - goto bad; - } - } - - ret = 0; -bad: - kfree(cipher_api); - return ret; - -bad_mem: - ti->error = "Cannot allocate cipher strings"; - return -ENOMEM; -} - -/* - * Construct an encryption mapping: - * <cipher> <key> <iv_offset> <dev_path> <start> - */ -static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - struct crypt_config *cc; - unsigned int key_size, opt_params; - unsigned long long tmpll; - int ret; - struct dm_arg_set as; - const char *opt_string; - char dummy; - - static struct dm_arg _args[] = { - {0, 1, "Invalid number of feature args"}, - }; - - if (argc < 5) { - ti->error = "Not enough arguments"; - return -EINVAL; - } - - key_size = strlen(argv[1]) >> 1; - - cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); - if (!cc) { - ti->error = "Cannot allocate encryption context"; - return -ENOMEM; - } - cc->key_size = key_size; - - ti->private = cc; - ret = crypt_ctr_cipher(ti, argv[0], argv[1]); - if (ret < 0) - goto bad; - - ret = -ENOMEM; - cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); - if (!cc->io_pool) { - ti->error = "Cannot allocate crypt io mempool"; - goto bad; - } - - cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); - cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & - ~(crypto_tfm_ctx_alignment() - 1); - - cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + - sizeof(struct dm_crypt_request) + cc->iv_size); - if (!cc->req_pool) { - ti->error = "Cannot allocate crypt request mempool"; - goto bad; - } - - cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); - if (!cc->page_pool) { - ti->error = "Cannot allocate page mempool"; - goto bad; - } - - cc->bs = bioset_create(MIN_IOS, 0); - if (!cc->bs) { - ti->error = "Cannot allocate crypt bioset"; - goto bad; - } - - ret = -EINVAL; - if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid iv_offset sector"; - goto bad; - } - cc->iv_offset = tmpll; - - if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { - ti->error = "Device lookup failed"; - goto bad; - } - - if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid device sector"; - goto bad; - } - cc->start = tmpll; - - argv += 5; - argc -= 5; - - /* Optional parameters */ - if (argc) { - as.argc = argc; - as.argv = argv; - - ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); - if (ret) - goto bad; - - opt_string = dm_shift_arg(&as); - - if (opt_params == 1 && opt_string && - !strcasecmp(opt_string, "allow_discards")) - ti->num_discard_requests = 1; - else if (opt_params) { - ret = -EINVAL; - ti->error = "Invalid feature arguments"; - goto bad; - } - } - - ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io", - WQ_NON_REENTRANT| - WQ_MEM_RECLAIM, - 1); - if (!cc->io_queue) { - ti->error = "Couldn't create kcryptd io queue"; - goto bad; - } - - cc->crypt_queue = 
alloc_workqueue("kcryptd", - WQ_NON_REENTRANT| - WQ_CPU_INTENSIVE| - WQ_MEM_RECLAIM, - 1); - if (!cc->crypt_queue) { - ti->error = "Couldn't create kcryptd queue"; - goto bad; - } - - ti->num_flush_requests = 1; - ti->discard_zeroes_data_unsupported = 1; - - return 0; - -bad: - crypt_dtr(ti); - return ret; -} - -static int crypt_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - struct dm_crypt_io *io; - struct crypt_config *cc; - - /* - * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. - * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight - * - for REQ_DISCARD caller must use flush if IO ordering matters - */ - if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { - cc = ti->private; - bio->bi_bdev = cc->dev->bdev; - if (bio_sectors(bio)) - bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); - return DM_MAPIO_REMAPPED; - } - - io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); - - if (bio_data_dir(io->base_bio) == READ) { - if (kcryptd_io_read(io, GFP_NOWAIT)) - kcryptd_queue_io(io); - } else - kcryptd_queue_crypt(io); - - return DM_MAPIO_SUBMITTED; -} - -static int crypt_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) -{ - struct crypt_config *cc = ti->private; - unsigned int sz = 0; - - switch (type) { - case STATUSTYPE_INFO: - result[0] = '\0'; - break; - - case STATUSTYPE_TABLE: - DMEMIT("%s ", cc->cipher_string); - - if (cc->key_size > 0) { - if ((maxlen - sz) < ((cc->key_size << 1) + 1)) - return -ENOMEM; - - crypt_encode_key(result + sz, cc->key, cc->key_size); - sz += cc->key_size << 1; - } else { - if (sz >= maxlen) - return -ENOMEM; - result[sz++] = '-'; - } - - DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, - cc->dev->name, (unsigned long long)cc->start); - - if (ti->num_discard_requests) - DMEMIT(" 1 allow_discards"); - - break; - } - return 0; -} - -static void crypt_postsuspend(struct dm_target *ti) -{ - struct crypt_config *cc = ti->private; - - set_bit(DM_CRYPT_SUSPENDED, &cc->flags); -} - -static int crypt_preresume(struct dm_target *ti) -{ - struct crypt_config *cc = ti->private; - - if (!test_bit(DM_CRYPT_KEY_VALID, &cc->flags)) { - DMERR("aborting resume - crypt key is not set."); - return -EAGAIN; - } - - return 0; -} - -static void crypt_resume(struct dm_target *ti) -{ - struct crypt_config *cc = ti->private; - - clear_bit(DM_CRYPT_SUSPENDED, &cc->flags); -} - -/* Message interface - * key set <key> - * key wipe - */ -static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) -{ - struct crypt_config *cc = ti->private; - int ret = -EINVAL; - - if (argc < 2) - goto error; - - if (!strcasecmp(argv[0], "key")) { - if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { - DMWARN("not suspended during key manipulation."); - return -EINVAL; - } - if (argc == 3 && !strcasecmp(argv[1], "set")) { - ret = crypt_set_key(cc, argv[2]); - if (ret) - return ret; - if (cc->iv_gen_ops && cc->iv_gen_ops->init) - ret = cc->iv_gen_ops->init(cc); - return ret; - } - if (argc == 2 && !strcasecmp(argv[1], "wipe")) { - if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { - ret = cc->iv_gen_ops->wipe(cc); - if (ret) - return ret; - } - return crypt_wipe_key(cc); - } - } - -error: - DMWARN("unrecognised message received."); - return -EINVAL; -} - -static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct crypt_config *cc = ti->private; - struct request_queue *q = 
bdev_get_queue(cc->dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = cc->dev->bdev; - bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - -static int crypt_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct crypt_config *cc = ti->private; - - return fn(ti, cc->dev, cc->start, ti->len, data); -} - -static struct target_type crypt_target = { - .name = "crypt", - .version = {1, 11, 0}, - .module = THIS_MODULE, - .ctr = crypt_ctr, - .dtr = crypt_dtr, - .map = crypt_map, - .status = crypt_status, - .postsuspend = crypt_postsuspend, - .preresume = crypt_preresume, - .resume = crypt_resume, - .message = crypt_message, - .merge = crypt_merge, - .iterate_devices = crypt_iterate_devices, -}; - -static int __init dm_crypt_init(void) -{ - int r; - - _crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0); - if (!_crypt_io_pool) - return -ENOMEM; - - r = dm_register_target(&crypt_target); - if (r < 0) { - DMERR("register failed %d", r); - kmem_cache_destroy(_crypt_io_pool); - } - - return r; -} - -static void __exit dm_crypt_exit(void) -{ - dm_unregister_target(&crypt_target); - kmem_cache_destroy(_crypt_io_pool); -} - -module_init(dm_crypt_init); -module_exit(dm_crypt_exit); - -MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); -MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-delay.c b/ANDROID_3.4.5/drivers/md/dm-delay.c deleted file mode 100644 index 2dc22ddd..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-delay.c +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (C) 2005-2007 Red Hat GmbH - * - * A target that delays reads and/or writes and can send - * them to different devices. - * - * This file is released under the GPL. 
- */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/bio.h> -#include <linux/slab.h> - -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "delay" - -struct delay_c { - struct timer_list delay_timer; - struct mutex timer_lock; - struct work_struct flush_expired_bios; - struct list_head delayed_bios; - atomic_t may_delay; - mempool_t *delayed_pool; - - struct dm_dev *dev_read; - sector_t start_read; - unsigned read_delay; - unsigned reads; - - struct dm_dev *dev_write; - sector_t start_write; - unsigned write_delay; - unsigned writes; -}; - -struct dm_delay_info { - struct delay_c *context; - struct list_head list; - struct bio *bio; - unsigned long expires; -}; - -static DEFINE_MUTEX(delayed_bios_lock); - -static struct workqueue_struct *kdelayd_wq; -static struct kmem_cache *delayed_cache; - -static void handle_delayed_timer(unsigned long data) -{ - struct delay_c *dc = (struct delay_c *)data; - - queue_work(kdelayd_wq, &dc->flush_expired_bios); -} - -static void queue_timeout(struct delay_c *dc, unsigned long expires) -{ - mutex_lock(&dc->timer_lock); - - if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires) - mod_timer(&dc->delay_timer, expires); - - mutex_unlock(&dc->timer_lock); -} - -static void flush_bios(struct bio *bio) -{ - struct bio *n; - - while (bio) { - n = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = n; - } -} - -static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) -{ - struct dm_delay_info *delayed, *next; - unsigned long next_expires = 0; - int start_timer = 0; - struct bio_list flush_bios = { }; - - mutex_lock(&delayed_bios_lock); - list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { - if (flush_all || time_after_eq(jiffies, delayed->expires)) { - list_del(&delayed->list); - bio_list_add(&flush_bios, delayed->bio); - if ((bio_data_dir(delayed->bio) == WRITE)) - delayed->context->writes--; - else - delayed->context->reads--; - mempool_free(delayed, dc->delayed_pool); - continue; - } - - if (!start_timer) { - start_timer = 1; - next_expires = delayed->expires; - } else - next_expires = min(next_expires, delayed->expires); - } - - mutex_unlock(&delayed_bios_lock); - - if (start_timer) - queue_timeout(dc, next_expires); - - return bio_list_get(&flush_bios); -} - -static void flush_expired_bios(struct work_struct *work) -{ - struct delay_c *dc; - - dc = container_of(work, struct delay_c, flush_expired_bios); - flush_bios(flush_delayed_bios(dc, 0)); -} - -/* - * Mapping parameters: - * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>] - * - * With separate write parameters, the first set is only used for reads. - * Delays are specified in milliseconds. 
- */ -static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - struct delay_c *dc; - unsigned long long tmpll; - char dummy; - - if (argc != 3 && argc != 6) { - ti->error = "requires exactly 3 or 6 arguments"; - return -EINVAL; - } - - dc = kmalloc(sizeof(*dc), GFP_KERNEL); - if (!dc) { - ti->error = "Cannot allocate context"; - return -ENOMEM; - } - - dc->reads = dc->writes = 0; - - if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid device sector"; - goto bad; - } - dc->start_read = tmpll; - - if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { - ti->error = "Invalid delay"; - goto bad; - } - - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), - &dc->dev_read)) { - ti->error = "Device lookup failed"; - goto bad; - } - - dc->dev_write = NULL; - if (argc == 3) - goto out; - - if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid write device sector"; - goto bad_dev_read; - } - dc->start_write = tmpll; - - if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { - ti->error = "Invalid write delay"; - goto bad_dev_read; - } - - if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), - &dc->dev_write)) { - ti->error = "Write device lookup failed"; - goto bad_dev_read; - } - -out: - dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache); - if (!dc->delayed_pool) { - DMERR("Couldn't create delayed bio pool."); - goto bad_dev_write; - } - - setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); - - INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); - INIT_LIST_HEAD(&dc->delayed_bios); - mutex_init(&dc->timer_lock); - atomic_set(&dc->may_delay, 1); - - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; - ti->private = dc; - return 0; - -bad_dev_write: - if (dc->dev_write) - dm_put_device(ti, dc->dev_write); -bad_dev_read: - dm_put_device(ti, dc->dev_read); -bad: - kfree(dc); - return -EINVAL; -} - -static void delay_dtr(struct dm_target *ti) -{ - struct delay_c *dc = ti->private; - - flush_workqueue(kdelayd_wq); - - dm_put_device(ti, dc->dev_read); - - if (dc->dev_write) - dm_put_device(ti, dc->dev_write); - - mempool_destroy(dc->delayed_pool); - kfree(dc); -} - -static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) -{ - struct dm_delay_info *delayed; - unsigned long expires = 0; - - if (!delay || !atomic_read(&dc->may_delay)) - return 1; - - delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); - - delayed->context = dc; - delayed->bio = bio; - delayed->expires = expires = jiffies + (delay * HZ / 1000); - - mutex_lock(&delayed_bios_lock); - - if (bio_data_dir(bio) == WRITE) - dc->writes++; - else - dc->reads++; - - list_add_tail(&delayed->list, &dc->delayed_bios); - - mutex_unlock(&delayed_bios_lock); - - queue_timeout(dc, expires); - - return 0; -} - -static void delay_presuspend(struct dm_target *ti) -{ - struct delay_c *dc = ti->private; - - atomic_set(&dc->may_delay, 0); - del_timer_sync(&dc->delay_timer); - flush_bios(flush_delayed_bios(dc, 1)); -} - -static void delay_resume(struct dm_target *ti) -{ - struct delay_c *dc = ti->private; - - atomic_set(&dc->may_delay, 1); -} - -static int delay_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - struct delay_c *dc = ti->private; - - if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { - bio->bi_bdev = dc->dev_write->bdev; - if (bio_sectors(bio)) - bio->bi_sector = dc->start_write + - dm_target_offset(ti, bio->bi_sector); - - return delay_bio(dc, dc->write_delay, 
bio); - } - - bio->bi_bdev = dc->dev_read->bdev; - bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); - - return delay_bio(dc, dc->read_delay, bio); -} - -static int delay_status(struct dm_target *ti, status_type_t type, - char *result, unsigned maxlen) -{ - struct delay_c *dc = ti->private; - int sz = 0; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%u %u", dc->reads, dc->writes); - break; - - case STATUSTYPE_TABLE: - DMEMIT("%s %llu %u", dc->dev_read->name, - (unsigned long long) dc->start_read, - dc->read_delay); - if (dc->dev_write) - DMEMIT(" %s %llu %u", dc->dev_write->name, - (unsigned long long) dc->start_write, - dc->write_delay); - break; - } - - return 0; -} - -static int delay_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct delay_c *dc = ti->private; - int ret = 0; - - ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data); - if (ret) - goto out; - - if (dc->dev_write) - ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data); - -out: - return ret; -} - -static struct target_type delay_target = { - .name = "delay", - .version = {1, 1, 0}, - .module = THIS_MODULE, - .ctr = delay_ctr, - .dtr = delay_dtr, - .map = delay_map, - .presuspend = delay_presuspend, - .resume = delay_resume, - .status = delay_status, - .iterate_devices = delay_iterate_devices, -}; - -static int __init dm_delay_init(void) -{ - int r = -ENOMEM; - - kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); - if (!kdelayd_wq) { - DMERR("Couldn't start kdelayd"); - goto bad_queue; - } - - delayed_cache = KMEM_CACHE(dm_delay_info, 0); - if (!delayed_cache) { - DMERR("Couldn't create delayed bio cache."); - goto bad_memcache; - } - - r = dm_register_target(&delay_target); - if (r < 0) { - DMERR("register failed %d", r); - goto bad_register; - } - - return 0; - -bad_register: - kmem_cache_destroy(delayed_cache); -bad_memcache: - destroy_workqueue(kdelayd_wq); -bad_queue: - return r; -} - -static void __exit dm_delay_exit(void) -{ - dm_unregister_target(&delay_target); - kmem_cache_destroy(delayed_cache); - destroy_workqueue(kdelayd_wq); -} - -/* Module hooks */ -module_init(dm_delay_init); -module_exit(dm_delay_exit); - -MODULE_DESCRIPTION(DM_NAME " delay target"); -MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-exception-store.c b/ANDROID_3.4.5/drivers/md/dm-exception-store.c deleted file mode 100644 index aa70f7d4..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-exception-store.c +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * Copyright (C) 2006-2008 Red Hat GmbH - * - * This file is released under the GPL. 
- */ - -#include "dm-exception-store.h" - -#include <linux/ctype.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/slab.h> - -#define DM_MSG_PREFIX "snapshot exception stores" - -static LIST_HEAD(_exception_store_types); -static DEFINE_SPINLOCK(_lock); - -static struct dm_exception_store_type *__find_exception_store_type(const char *name) -{ - struct dm_exception_store_type *type; - - list_for_each_entry(type, &_exception_store_types, list) - if (!strcmp(name, type->name)) - return type; - - return NULL; -} - -static struct dm_exception_store_type *_get_exception_store_type(const char *name) -{ - struct dm_exception_store_type *type; - - spin_lock(&_lock); - - type = __find_exception_store_type(name); - - if (type && !try_module_get(type->module)) - type = NULL; - - spin_unlock(&_lock); - - return type; -} - -/* - * get_type - * @type_name - * - * Attempt to retrieve the dm_exception_store_type by name. If not already - * available, attempt to load the appropriate module. - * - * Exstore modules are named "dm-exstore-" followed by the 'type_name'. - * Modules may contain multiple types. - * This function will first try the module "dm-exstore-<type_name>", - * then truncate 'type_name' on the last '-' and try again. - * - * For example, if type_name was "clustered-shared", it would search - * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'. - * - * 'dm-exception-store-<type_name>' is too long of a name in my - * opinion, which is why I've chosen to have the files - * containing exception store implementations be 'dm-exstore-<type_name>'. - * If you want your module to be autoloaded, you will follow this - * naming convention. - * - * Returns: dm_exception_store_type* on success, NULL on failure - */ -static struct dm_exception_store_type *get_type(const char *type_name) -{ - char *p, *type_name_dup; - struct dm_exception_store_type *type; - - type = _get_exception_store_type(type_name); - if (type) - return type; - - type_name_dup = kstrdup(type_name, GFP_KERNEL); - if (!type_name_dup) { - DMERR("No memory left to attempt load for \"%s\"", type_name); - return NULL; - } - - while (request_module("dm-exstore-%s", type_name_dup) || - !(type = _get_exception_store_type(type_name))) { - p = strrchr(type_name_dup, '-'); - if (!p) - break; - p[0] = '\0'; - } - - if (!type) - DMWARN("Module for exstore type \"%s\" not found.", type_name); - - kfree(type_name_dup); - - return type; -} - -static void put_type(struct dm_exception_store_type *type) -{ - spin_lock(&_lock); - module_put(type->module); - spin_unlock(&_lock); -} - -int dm_exception_store_type_register(struct dm_exception_store_type *type) -{ - int r = 0; - - spin_lock(&_lock); - if (!__find_exception_store_type(type->name)) - list_add(&type->list, &_exception_store_types); - else - r = -EEXIST; - spin_unlock(&_lock); - - return r; -} -EXPORT_SYMBOL(dm_exception_store_type_register); - -int dm_exception_store_type_unregister(struct dm_exception_store_type *type) -{ - spin_lock(&_lock); - - if (!__find_exception_store_type(type->name)) { - spin_unlock(&_lock); - return -EINVAL; - } - - list_del(&type->list); - - spin_unlock(&_lock); - - return 0; -} -EXPORT_SYMBOL(dm_exception_store_type_unregister); - -static int set_chunk_size(struct dm_exception_store *store, - const char *chunk_size_arg, char **error) -{ - unsigned long chunk_size_ulong; - char *value; - - chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); - if (*chunk_size_arg == '\0' || 
*value != '\0' || - chunk_size_ulong > UINT_MAX) { - *error = "Invalid chunk size"; - return -EINVAL; - } - - if (!chunk_size_ulong) { - store->chunk_size = store->chunk_mask = store->chunk_shift = 0; - return 0; - } - - return dm_exception_store_set_chunk_size(store, - (unsigned) chunk_size_ulong, - error); -} - -int dm_exception_store_set_chunk_size(struct dm_exception_store *store, - unsigned chunk_size, - char **error) -{ - /* Check chunk_size is a power of 2 */ - if (!is_power_of_2(chunk_size)) { - *error = "Chunk size is not a power of 2"; - return -EINVAL; - } - - /* Validate the chunk size against the device block size */ - if (chunk_size % - (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) || - chunk_size % - (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) { - *error = "Chunk size is not a multiple of device blocksize"; - return -EINVAL; - } - - if (chunk_size > INT_MAX >> SECTOR_SHIFT) { - *error = "Chunk size is too high"; - return -EINVAL; - } - - store->chunk_size = chunk_size; - store->chunk_mask = chunk_size - 1; - store->chunk_shift = ffs(chunk_size) - 1; - - return 0; -} - -int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, - struct dm_snapshot *snap, - unsigned *args_used, - struct dm_exception_store **store) -{ - int r = 0; - struct dm_exception_store_type *type = NULL; - struct dm_exception_store *tmp_store; - char persistent; - - if (argc < 2) { - ti->error = "Insufficient exception store arguments"; - return -EINVAL; - } - - tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL); - if (!tmp_store) { - ti->error = "Exception store allocation failed"; - return -ENOMEM; - } - - persistent = toupper(*argv[0]); - if (persistent == 'P') - type = get_type("P"); - else if (persistent == 'N') - type = get_type("N"); - else { - ti->error = "Persistent flag is not P or N"; - r = -EINVAL; - goto bad_type; - } - - if (!type) { - ti->error = "Exception store type not recognised"; - r = -EINVAL; - goto bad_type; - } - - tmp_store->type = type; - tmp_store->snap = snap; - - r = set_chunk_size(tmp_store, argv[1], &ti->error); - if (r) - goto bad; - - r = type->ctr(tmp_store, 0, NULL); - if (r) { - ti->error = "Exception store type constructor failed"; - goto bad; - } - - *args_used = 2; - *store = tmp_store; - return 0; - -bad: - put_type(type); -bad_type: - kfree(tmp_store); - return r; -} -EXPORT_SYMBOL(dm_exception_store_create); - -void dm_exception_store_destroy(struct dm_exception_store *store) -{ - store->type->dtr(store); - put_type(store->type); - kfree(store); -} -EXPORT_SYMBOL(dm_exception_store_destroy); - -int dm_exception_store_init(void) -{ - int r; - - r = dm_transient_snapshot_init(); - if (r) { - DMERR("Unable to register transient exception store type."); - goto transient_fail; - } - - r = dm_persistent_snapshot_init(); - if (r) { - DMERR("Unable to register persistent exception store type"); - goto persistent_fail; - } - - return 0; - -persistent_fail: - dm_transient_snapshot_exit(); -transient_fail: - return r; -} - -void dm_exception_store_exit(void) -{ - dm_persistent_snapshot_exit(); - dm_transient_snapshot_exit(); -} diff --git a/ANDROID_3.4.5/drivers/md/dm-exception-store.h b/ANDROID_3.4.5/drivers/md/dm-exception-store.h deleted file mode 100644 index 0b253624..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-exception-store.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * Copyright (C) 2008 Red Hat, Inc. All rights reserved. 
- * - * Device-mapper snapshot exception store. - * - * This file is released under the GPL. - */ - -#ifndef _LINUX_DM_EXCEPTION_STORE -#define _LINUX_DM_EXCEPTION_STORE - -#include <linux/blkdev.h> -#include <linux/device-mapper.h> - -/* - * The snapshot code deals with largish chunks of the disk at a - * time. Typically 32k - 512k. - */ -typedef sector_t chunk_t; - -/* - * An exception is used where an old chunk of data has been - * replaced by a new one. - * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number - * of chunks that follow contiguously. Remaining bits hold the number of the - * chunk within the device. - */ -struct dm_exception { - struct list_head hash_list; - - chunk_t old_chunk; - chunk_t new_chunk; -}; - -/* - * Abstraction to handle the meta/layout of exception stores (the - * COW device). - */ -struct dm_exception_store; -struct dm_exception_store_type { - const char *name; - struct module *module; - - int (*ctr) (struct dm_exception_store *store, - unsigned argc, char **argv); - - /* - * Destroys this object when you've finished with it. - */ - void (*dtr) (struct dm_exception_store *store); - - /* - * The target shouldn't read the COW device until this is - * called. As exceptions are read from the COW, they are - * reported back via the callback. - */ - int (*read_metadata) (struct dm_exception_store *store, - int (*callback)(void *callback_context, - chunk_t old, chunk_t new), - void *callback_context); - - /* - * Find somewhere to store the next exception. - */ - int (*prepare_exception) (struct dm_exception_store *store, - struct dm_exception *e); - - /* - * Update the metadata with this exception. - */ - void (*commit_exception) (struct dm_exception_store *store, - struct dm_exception *e, - void (*callback) (void *, int success), - void *callback_context); - - /* - * Returns 0 if the exception store is empty. - * - * If there are exceptions still to be merged, sets - * *last_old_chunk and *last_new_chunk to the most recent - * still-to-be-merged chunk and returns the number of - * consecutive previous ones. - */ - int (*prepare_merge) (struct dm_exception_store *store, - chunk_t *last_old_chunk, chunk_t *last_new_chunk); - - /* - * Clear the last n exceptions. - * nr_merged must be <= the value returned by prepare_merge. - */ - int (*commit_merge) (struct dm_exception_store *store, int nr_merged); - - /* - * The snapshot is invalid, note this in the metadata. - */ - void (*drop_snapshot) (struct dm_exception_store *store); - - unsigned (*status) (struct dm_exception_store *store, - status_type_t status, char *result, - unsigned maxlen); - - /* - * Return how full the snapshot is. - */ - void (*usage) (struct dm_exception_store *store, - sector_t *total_sectors, sector_t *sectors_allocated, - sector_t *metadata_sectors); - - /* For internal device-mapper use only. */ - struct list_head list; -}; - -struct dm_snapshot; - -struct dm_exception_store { - struct dm_exception_store_type *type; - struct dm_snapshot *snap; - - /* Size of data blocks saved - must be a power of 2 */ - unsigned chunk_size; - unsigned chunk_mask; - unsigned chunk_shift; - - void *context; -}; - -/* - * Obtain the origin or cow device used by a given snapshot. 
- */ -struct dm_dev *dm_snap_origin(struct dm_snapshot *snap); -struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); - -/* - * Funtions to manipulate consecutive chunks - */ -# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) -# define DM_CHUNK_CONSECUTIVE_BITS 8 -# define DM_CHUNK_NUMBER_BITS 56 - -static inline chunk_t dm_chunk_number(chunk_t chunk) -{ - return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); -} - -static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) -{ - return e->new_chunk >> DM_CHUNK_NUMBER_BITS; -} - -static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) -{ - e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); - - BUG_ON(!dm_consecutive_chunk_count(e)); -} - -static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) -{ - BUG_ON(!dm_consecutive_chunk_count(e)); - - e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); -} - -# else -# define DM_CHUNK_CONSECUTIVE_BITS 0 - -static inline chunk_t dm_chunk_number(chunk_t chunk) -{ - return chunk; -} - -static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) -{ - return 0; -} - -static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) -{ -} - -static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) -{ -} - -# endif - -/* - * Return the number of sectors in the device. - */ -static inline sector_t get_dev_size(struct block_device *bdev) -{ - return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; -} - -static inline chunk_t sector_to_chunk(struct dm_exception_store *store, - sector_t sector) -{ - return sector >> store->chunk_shift; -} - -int dm_exception_store_type_register(struct dm_exception_store_type *type); -int dm_exception_store_type_unregister(struct dm_exception_store_type *type); - -int dm_exception_store_set_chunk_size(struct dm_exception_store *store, - unsigned chunk_size, - char **error); - -int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, - struct dm_snapshot *snap, - unsigned *args_used, - struct dm_exception_store **store); -void dm_exception_store_destroy(struct dm_exception_store *store); - -int dm_exception_store_init(void); -void dm_exception_store_exit(void); - -/* - * Two exception store implementations. - */ -int dm_persistent_snapshot_init(void); -void dm_persistent_snapshot_exit(void); - -int dm_transient_snapshot_init(void); -void dm_transient_snapshot_exit(void); - -#endif /* _LINUX_DM_EXCEPTION_STORE */ diff --git a/ANDROID_3.4.5/drivers/md/dm-flakey.c b/ANDROID_3.4.5/drivers/md/dm-flakey.c deleted file mode 100644 index ac49c01f..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-flakey.c +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software (UK) Limited. - * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ - -#include <linux/device-mapper.h> - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/bio.h> -#include <linux/slab.h> - -#define DM_MSG_PREFIX "flakey" - -#define all_corrupt_bio_flags_match(bio, fc) \ - (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) - -/* - * Flakey: Used for testing only, simulates intermittent, - * catastrophic device failure. 
- */ -struct flakey_c { - struct dm_dev *dev; - unsigned long start_time; - sector_t start; - unsigned up_interval; - unsigned down_interval; - unsigned long flags; - unsigned corrupt_bio_byte; - unsigned corrupt_bio_rw; - unsigned corrupt_bio_value; - unsigned corrupt_bio_flags; -}; - -enum feature_flag_bits { - DROP_WRITES -}; - -static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, - struct dm_target *ti) -{ - int r; - unsigned argc; - const char *arg_name; - - static struct dm_arg _args[] = { - {0, 6, "Invalid number of feature args"}, - {1, UINT_MAX, "Invalid corrupt bio byte"}, - {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, - {0, UINT_MAX, "Invalid corrupt bio flags mask"}, - }; - - /* No feature arguments supplied. */ - if (!as->argc) - return 0; - - r = dm_read_arg_group(_args, as, &argc, &ti->error); - if (r) - return r; - - while (argc) { - arg_name = dm_shift_arg(as); - argc--; - - /* - * drop_writes - */ - if (!strcasecmp(arg_name, "drop_writes")) { - if (test_and_set_bit(DROP_WRITES, &fc->flags)) { - ti->error = "Feature drop_writes duplicated"; - return -EINVAL; - } - - continue; - } - - /* - * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> - */ - if (!strcasecmp(arg_name, "corrupt_bio_byte")) { - if (!argc) { - ti->error = "Feature corrupt_bio_byte requires parameters"; - return -EINVAL; - } - - r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error); - if (r) - return r; - argc--; - - /* - * Direction r or w? - */ - arg_name = dm_shift_arg(as); - if (!strcasecmp(arg_name, "w")) - fc->corrupt_bio_rw = WRITE; - else if (!strcasecmp(arg_name, "r")) - fc->corrupt_bio_rw = READ; - else { - ti->error = "Invalid corrupt bio direction (r or w)"; - return -EINVAL; - } - argc--; - - /* - * Value of byte (0-255) to write in place of correct one. - */ - r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error); - if (r) - return r; - argc--; - - /* - * Only corrupt bios with these flags set. - */ - r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error); - if (r) - return r; - argc--; - - continue; - } - - ti->error = "Unrecognised flakey feature requested"; - return -EINVAL; - } - - if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { - ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; - return -EINVAL; - } - - return 0; -} - -/* - * Construct a flakey mapping: - * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*] - * - * Feature args: - * [drop_writes] - * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>] - * - * Nth_byte starts from 1 for the first byte. - * Direction is r for READ or w for WRITE. - * bio_flags is ignored if 0. 
- */ -static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - static struct dm_arg _args[] = { - {0, UINT_MAX, "Invalid up interval"}, - {0, UINT_MAX, "Invalid down interval"}, - }; - - int r; - struct flakey_c *fc; - unsigned long long tmpll; - struct dm_arg_set as; - const char *devname; - char dummy; - - as.argc = argc; - as.argv = argv; - - if (argc < 4) { - ti->error = "Invalid argument count"; - return -EINVAL; - } - - fc = kzalloc(sizeof(*fc), GFP_KERNEL); - if (!fc) { - ti->error = "Cannot allocate linear context"; - return -ENOMEM; - } - fc->start_time = jiffies; - - devname = dm_shift_arg(&as); - - if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid device sector"; - goto bad; - } - fc->start = tmpll; - - r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error); - if (r) - goto bad; - - r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); - if (r) - goto bad; - - if (!(fc->up_interval + fc->down_interval)) { - ti->error = "Total (up + down) interval is zero"; - goto bad; - } - - if (fc->up_interval + fc->down_interval < fc->up_interval) { - ti->error = "Interval overflow"; - goto bad; - } - - r = parse_features(&as, fc, ti); - if (r) - goto bad; - - if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) { - ti->error = "Device lookup failed"; - goto bad; - } - - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; - ti->private = fc; - return 0; - -bad: - kfree(fc); - return -EINVAL; -} - -static void flakey_dtr(struct dm_target *ti) -{ - struct flakey_c *fc = ti->private; - - dm_put_device(ti, fc->dev); - kfree(fc); -} - -static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) -{ - struct flakey_c *fc = ti->private; - - return fc->start + dm_target_offset(ti, bi_sector); -} - -static void flakey_map_bio(struct dm_target *ti, struct bio *bio) -{ - struct flakey_c *fc = ti->private; - - bio->bi_bdev = fc->dev->bdev; - if (bio_sectors(bio)) - bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); -} - -static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) -{ - unsigned bio_bytes = bio_cur_bytes(bio); - char *data = bio_data(bio); - - /* - * Overwrite the Nth byte of the data returned. - */ - if (data && bio_bytes >= fc->corrupt_bio_byte) { - data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; - - DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " - "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", - bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, - (bio_data_dir(bio) == WRITE) ? 'w' : 'r', - bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); - } -} - -static int flakey_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - struct flakey_c *fc = ti->private; - unsigned elapsed; - - /* Are we alive ? */ - elapsed = (jiffies - fc->start_time) / HZ; - if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { - /* - * Flag this bio as submitted while down. - */ - map_context->ll = 1; - - /* - * Map reads as normal. - */ - if (bio_data_dir(bio) == READ) - goto map_bio; - - /* - * Drop writes? - */ - if (test_bit(DROP_WRITES, &fc->flags)) { - bio_endio(bio, 0); - return DM_MAPIO_SUBMITTED; - } - - /* - * Corrupt matching writes. - */ - if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) { - if (all_corrupt_bio_flags_match(bio, fc)) - corrupt_bio_data(bio, fc); - goto map_bio; - } - - /* - * By default, error all I/O. 
- */ - return -EIO; - } - -map_bio: - flakey_map_bio(ti, bio); - - return DM_MAPIO_REMAPPED; -} - -static int flakey_end_io(struct dm_target *ti, struct bio *bio, - int error, union map_info *map_context) -{ - struct flakey_c *fc = ti->private; - unsigned bio_submitted_while_down = map_context->ll; - - /* - * Corrupt successful READs while in down state. - * If flags were specified, only corrupt those that match. - */ - if (fc->corrupt_bio_byte && !error && bio_submitted_while_down && - (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && - all_corrupt_bio_flags_match(bio, fc)) - corrupt_bio_data(bio, fc); - - return error; -} - -static int flakey_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) -{ - unsigned sz = 0; - struct flakey_c *fc = ti->private; - unsigned drop_writes; - - switch (type) { - case STATUSTYPE_INFO: - result[0] = '\0'; - break; - - case STATUSTYPE_TABLE: - DMEMIT("%s %llu %u %u ", fc->dev->name, - (unsigned long long)fc->start, fc->up_interval, - fc->down_interval); - - drop_writes = test_bit(DROP_WRITES, &fc->flags); - DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5); - - if (drop_writes) - DMEMIT("drop_writes "); - - if (fc->corrupt_bio_byte) - DMEMIT("corrupt_bio_byte %u %c %u %u ", - fc->corrupt_bio_byte, - (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', - fc->corrupt_bio_value, fc->corrupt_bio_flags); - - break; - } - return 0; -} - -static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) -{ - struct flakey_c *fc = ti->private; - struct dm_dev *dev = fc->dev; - int r = 0; - - /* - * Only pass ioctls through if the device sizes match exactly. - */ - if (fc->start || - ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? 
: __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); -} - -static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct flakey_c *fc = ti->private; - struct request_queue *q = bdev_get_queue(fc->dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = fc->dev->bdev; - bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - -static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) -{ - struct flakey_c *fc = ti->private; - - return fn(ti, fc->dev, fc->start, ti->len, data); -} - -static struct target_type flakey_target = { - .name = "flakey", - .version = {1, 2, 0}, - .module = THIS_MODULE, - .ctr = flakey_ctr, - .dtr = flakey_dtr, - .map = flakey_map, - .end_io = flakey_end_io, - .status = flakey_status, - .ioctl = flakey_ioctl, - .merge = flakey_merge, - .iterate_devices = flakey_iterate_devices, -}; - -static int __init dm_flakey_init(void) -{ - int r = dm_register_target(&flakey_target); - - if (r < 0) - DMERR("register failed %d", r); - - return r; -} - -static void __exit dm_flakey_exit(void) -{ - dm_unregister_target(&flakey_target); -} - -/* Module hooks */ -module_init(dm_flakey_init); -module_exit(dm_flakey_exit); - -MODULE_DESCRIPTION(DM_NAME " flakey target"); -MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-io.c b/ANDROID_3.4.5/drivers/md/dm-io.c deleted file mode 100644 index ea5dd289..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-io.c +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software - * Copyright (C) 2006 Red Hat GmbH - * - * This file is released under the GPL. - */ - -#include "dm.h" - -#include <linux/device-mapper.h> - -#include <linux/bio.h> -#include <linux/mempool.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/dm-io.h> - -#define DM_MSG_PREFIX "io" - -#define DM_IO_MAX_REGIONS BITS_PER_LONG -#define MIN_IOS 16 -#define MIN_BIOS 16 - -struct dm_io_client { - mempool_t *pool; - struct bio_set *bios; -}; - -/* - * Aligning 'struct io' reduces the number of bits required to store - * its address. Refer to store_io_and_region_in_bio() below. - */ -struct io { - unsigned long error_bits; - atomic_t count; - struct task_struct *sleeper; - struct dm_io_client *client; - io_notify_fn callback; - void *context; - void *vma_invalidate_address; - unsigned long vma_invalidate_size; -} __attribute__((aligned(DM_IO_MAX_REGIONS))); - -static struct kmem_cache *_dm_io_cache; - -/* - * Create a client with mempool and bioset. - */ -struct dm_io_client *dm_io_client_create(void) -{ - struct dm_io_client *client; - - client = kmalloc(sizeof(*client), GFP_KERNEL); - if (!client) - return ERR_PTR(-ENOMEM); - - client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache); - if (!client->pool) - goto bad; - - client->bios = bioset_create(MIN_BIOS, 0); - if (!client->bios) - goto bad; - - return client; - - bad: - if (client->pool) - mempool_destroy(client->pool); - kfree(client); - return ERR_PTR(-ENOMEM); -} -EXPORT_SYMBOL(dm_io_client_create); - -void dm_io_client_destroy(struct dm_io_client *client) -{ - mempool_destroy(client->pool); - bioset_free(client->bios); - kfree(client); -} -EXPORT_SYMBOL(dm_io_client_destroy); - -/*----------------------------------------------------------------- - * We need to keep track of which region a bio is doing io for. 
- * To avoid a memory allocation to store just 5 or 6 bits, we - * ensure the 'struct io' pointer is aligned so enough low bits are - * always zero and then combine it with the region number directly in - * bi_private. - *---------------------------------------------------------------*/ -static void store_io_and_region_in_bio(struct bio *bio, struct io *io, - unsigned region) -{ - if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { - DMCRIT("Unaligned struct io pointer %p", io); - BUG(); - } - - bio->bi_private = (void *)((unsigned long)io | region); -} - -static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, - unsigned *region) -{ - unsigned long val = (unsigned long)bio->bi_private; - - *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); - *region = val & (DM_IO_MAX_REGIONS - 1); -} - -/*----------------------------------------------------------------- - * We need an io object to keep track of the number of bios that - * have been dispatched for a particular io. - *---------------------------------------------------------------*/ -static void dec_count(struct io *io, unsigned int region, int error) -{ - if (error) - set_bit(region, &io->error_bits); - - if (atomic_dec_and_test(&io->count)) { - if (io->vma_invalidate_size) - invalidate_kernel_vmap_range(io->vma_invalidate_address, - io->vma_invalidate_size); - - if (io->sleeper) - wake_up_process(io->sleeper); - - else { - unsigned long r = io->error_bits; - io_notify_fn fn = io->callback; - void *context = io->context; - - mempool_free(io, io->client->pool); - fn(r, context); - } - } -} - -static void endio(struct bio *bio, int error) -{ - struct io *io; - unsigned region; - - if (error && bio_data_dir(bio) == READ) - zero_fill_bio(bio); - - /* - * The bio destructor in bio_put() may use the io object. - */ - retrieve_io_and_region_from_bio(bio, &io, ®ion); - - bio_put(bio); - - dec_count(io, region, error); -} - -/*----------------------------------------------------------------- - * These little objects provide an abstraction for getting a new - * destination page for io. - *---------------------------------------------------------------*/ -struct dpages { - void (*get_page)(struct dpages *dp, - struct page **p, unsigned long *len, unsigned *offset); - void (*next_page)(struct dpages *dp); - - unsigned context_u; - void *context_ptr; - - void *vma_invalidate_address; - unsigned long vma_invalidate_size; -}; - -/* - * Functions for getting the pages from a list. - */ -static void list_get_page(struct dpages *dp, - struct page **p, unsigned long *len, unsigned *offset) -{ - unsigned o = dp->context_u; - struct page_list *pl = (struct page_list *) dp->context_ptr; - - *p = pl->page; - *len = PAGE_SIZE - o; - *offset = o; -} - -static void list_next_page(struct dpages *dp) -{ - struct page_list *pl = (struct page_list *) dp->context_ptr; - dp->context_ptr = pl->next; - dp->context_u = 0; -} - -static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset) -{ - dp->get_page = list_get_page; - dp->next_page = list_next_page; - dp->context_u = offset; - dp->context_ptr = pl; -} - -/* - * Functions for getting the pages from a bvec. 
- */ -static void bvec_get_page(struct dpages *dp, - struct page **p, unsigned long *len, unsigned *offset) -{ - struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; - *p = bvec->bv_page; - *len = bvec->bv_len; - *offset = bvec->bv_offset; -} - -static void bvec_next_page(struct dpages *dp) -{ - struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; - dp->context_ptr = bvec + 1; -} - -static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) -{ - dp->get_page = bvec_get_page; - dp->next_page = bvec_next_page; - dp->context_ptr = bvec; -} - -/* - * Functions for getting the pages from a VMA. - */ -static void vm_get_page(struct dpages *dp, - struct page **p, unsigned long *len, unsigned *offset) -{ - *p = vmalloc_to_page(dp->context_ptr); - *offset = dp->context_u; - *len = PAGE_SIZE - dp->context_u; -} - -static void vm_next_page(struct dpages *dp) -{ - dp->context_ptr += PAGE_SIZE - dp->context_u; - dp->context_u = 0; -} - -static void vm_dp_init(struct dpages *dp, void *data) -{ - dp->get_page = vm_get_page; - dp->next_page = vm_next_page; - dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); - dp->context_ptr = data; -} - -static void dm_bio_destructor(struct bio *bio) -{ - unsigned region; - struct io *io; - - retrieve_io_and_region_from_bio(bio, &io, ®ion); - - bio_free(bio, io->client->bios); -} - -/* - * Functions for getting the pages from kernel memory. - */ -static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len, - unsigned *offset) -{ - *p = virt_to_page(dp->context_ptr); - *offset = dp->context_u; - *len = PAGE_SIZE - dp->context_u; -} - -static void km_next_page(struct dpages *dp) -{ - dp->context_ptr += PAGE_SIZE - dp->context_u; - dp->context_u = 0; -} - -static void km_dp_init(struct dpages *dp, void *data) -{ - dp->get_page = km_get_page; - dp->next_page = km_next_page; - dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); - dp->context_ptr = data; -} - -/*----------------------------------------------------------------- - * IO routines that accept a list of pages. - *---------------------------------------------------------------*/ -static void do_region(int rw, unsigned region, struct dm_io_region *where, - struct dpages *dp, struct io *io) -{ - struct bio *bio; - struct page *page; - unsigned long len; - unsigned offset; - unsigned num_bvecs; - sector_t remaining = where->count; - struct request_queue *q = bdev_get_queue(where->bdev); - sector_t discard_sectors; - - /* - * where->count may be zero if rw holds a flush and we need to - * send a zero-sized flush. - */ - do { - /* - * Allocate a suitably sized-bio. - */ - if (rw & REQ_DISCARD) - num_bvecs = 1; - else - num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), - dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); - - bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); - bio->bi_sector = where->sector + (where->count - remaining); - bio->bi_bdev = where->bdev; - bio->bi_end_io = endio; - bio->bi_destructor = dm_bio_destructor; - store_io_and_region_in_bio(bio, io, region); - - if (rw & REQ_DISCARD) { - discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); - bio->bi_size = discard_sectors << SECTOR_SHIFT; - remaining -= discard_sectors; - } else while (remaining) { - /* - * Try and add as many pages as possible. 
- */ - dp->get_page(dp, &page, &len, &offset); - len = min(len, to_bytes(remaining)); - if (!bio_add_page(bio, page, len, offset)) - break; - - offset = 0; - remaining -= to_sector(len); - dp->next_page(dp); - } - - atomic_inc(&io->count); - submit_bio(rw, bio); - } while (remaining); -} - -static void dispatch_io(int rw, unsigned int num_regions, - struct dm_io_region *where, struct dpages *dp, - struct io *io, int sync) -{ - int i; - struct dpages old_pages = *dp; - - BUG_ON(num_regions > DM_IO_MAX_REGIONS); - - if (sync) - rw |= REQ_SYNC; - - /* - * For multiple regions we need to be careful to rewind - * the dp object for each call to do_region. - */ - for (i = 0; i < num_regions; i++) { - *dp = old_pages; - if (where[i].count || (rw & REQ_FLUSH)) - do_region(rw, i, where + i, dp, io); - } - - /* - * Drop the extra reference that we were holding to avoid - * the io being completed too early. - */ - dec_count(io, 0, 0); -} - -static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int rw, struct dpages *dp, - unsigned long *error_bits) -{ - /* - * gcc <= 4.3 can't do the alignment for stack variables, so we must - * align it on our own. - * volatile prevents the optimizer from removing or reusing - * "io_" field from the stack frame (allowed in ANSI C). - */ - volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; - struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); - - if (num_regions > 1 && (rw & RW_MASK) != WRITE) { - WARN_ON(1); - return -EIO; - } - - io->error_bits = 0; - atomic_set(&io->count, 1); /* see dispatch_io() */ - io->sleeper = current; - io->client = client; - - io->vma_invalidate_address = dp->vma_invalidate_address; - io->vma_invalidate_size = dp->vma_invalidate_size; - - dispatch_io(rw, num_regions, where, dp, io, 1); - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - - if (!atomic_read(&io->count)) - break; - - io_schedule(); - } - set_current_state(TASK_RUNNING); - - if (error_bits) - *error_bits = io->error_bits; - - return io->error_bits ? 
-EIO : 0; -} - -static int async_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int rw, struct dpages *dp, - io_notify_fn fn, void *context) -{ - struct io *io; - - if (num_regions > 1 && (rw & RW_MASK) != WRITE) { - WARN_ON(1); - fn(1, context); - return -EIO; - } - - io = mempool_alloc(client->pool, GFP_NOIO); - io->error_bits = 0; - atomic_set(&io->count, 1); /* see dispatch_io() */ - io->sleeper = NULL; - io->client = client; - io->callback = fn; - io->context = context; - - io->vma_invalidate_address = dp->vma_invalidate_address; - io->vma_invalidate_size = dp->vma_invalidate_size; - - dispatch_io(rw, num_regions, where, dp, io, 0); - return 0; -} - -static int dp_init(struct dm_io_request *io_req, struct dpages *dp, - unsigned long size) -{ - /* Set up dpages based on memory type */ - - dp->vma_invalidate_address = NULL; - dp->vma_invalidate_size = 0; - - switch (io_req->mem.type) { - case DM_IO_PAGE_LIST: - list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); - break; - - case DM_IO_BVEC: - bvec_dp_init(dp, io_req->mem.ptr.bvec); - break; - - case DM_IO_VMA: - flush_kernel_vmap_range(io_req->mem.ptr.vma, size); - if ((io_req->bi_rw & RW_MASK) == READ) { - dp->vma_invalidate_address = io_req->mem.ptr.vma; - dp->vma_invalidate_size = size; - } - vm_dp_init(dp, io_req->mem.ptr.vma); - break; - - case DM_IO_KMEM: - km_dp_init(dp, io_req->mem.ptr.addr); - break; - - default: - return -EINVAL; - } - - return 0; -} - -/* - * New collapsed (a)synchronous interface. - * - * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set REQ_SYNC in -io_req->bi_rw. If you fail to do one of these, the IO will be submitted to - * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. - */ -int dm_io(struct dm_io_request *io_req, unsigned num_regions, - struct dm_io_region *where, unsigned long *sync_error_bits) -{ - int r; - struct dpages dp; - - r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); - if (r) - return r; - - if (!io_req->notify.fn) - return sync_io(io_req->client, num_regions, where, - io_req->bi_rw, &dp, sync_error_bits); - - return async_io(io_req->client, num_regions, where, io_req->bi_rw, - &dp, io_req->notify.fn, io_req->notify.context); -} -EXPORT_SYMBOL(dm_io); - -int __init dm_io_init(void) -{ - _dm_io_cache = KMEM_CACHE(io, 0); - if (!_dm_io_cache) - return -ENOMEM; - - return 0; -} - -void dm_io_exit(void) -{ - kmem_cache_destroy(_dm_io_cache); - _dm_io_cache = NULL; -} diff --git a/ANDROID_3.4.5/drivers/md/dm-ioctl.c b/ANDROID_3.4.5/drivers/md/dm-ioctl.c deleted file mode 100644 index a1a3e6df..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-ioctl.c +++ /dev/null @@ -1,1782 +0,0 @@ -/* - * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. - * Copyright (C) 2004 - 2006 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ - -#include "dm.h" - -#include <linux/module.h> -#include <linux/vmalloc.h> -#include <linux/miscdevice.h> -#include <linux/init.h> -#include <linux/wait.h> -#include <linux/slab.h> -#include <linux/dm-ioctl.h> -#include <linux/hdreg.h> -#include <linux/compat.h> - -#include <asm/uaccess.h> - -#define DM_MSG_PREFIX "ioctl" -#define DM_DRIVER_EMAIL "dm-devel@redhat.com" - -/*----------------------------------------------------------------- - * The ioctl interface needs to be able to look up devices by - * name or uuid. 
- *---------------------------------------------------------------*/ -struct hash_cell { - struct list_head name_list; - struct list_head uuid_list; - - char *name; - char *uuid; - struct mapped_device *md; - struct dm_table *new_map; -}; - -struct vers_iter { - size_t param_size; - struct dm_target_versions *vers, *old_vers; - char *end; - uint32_t flags; -}; - - -#define NUM_BUCKETS 64 -#define MASK_BUCKETS (NUM_BUCKETS - 1) -static struct list_head _name_buckets[NUM_BUCKETS]; -static struct list_head _uuid_buckets[NUM_BUCKETS]; - -static void dm_hash_remove_all(int keep_open_devices); - -/* - * Guards access to both hash tables. - */ -static DECLARE_RWSEM(_hash_lock); - -/* - * Protects use of mdptr to obtain hash cell name and uuid from mapped device. - */ -static DEFINE_MUTEX(dm_hash_cells_mutex); - -static void init_buckets(struct list_head *buckets) -{ - unsigned int i; - - for (i = 0; i < NUM_BUCKETS; i++) - INIT_LIST_HEAD(buckets + i); -} - -static int dm_hash_init(void) -{ - init_buckets(_name_buckets); - init_buckets(_uuid_buckets); - return 0; -} - -static void dm_hash_exit(void) -{ - dm_hash_remove_all(0); -} - -/*----------------------------------------------------------------- - * Hash function: - * We're not really concerned with the str hash function being - * fast since it's only used by the ioctl interface. - *---------------------------------------------------------------*/ -static unsigned int hash_str(const char *str) -{ - const unsigned int hash_mult = 2654435387U; - unsigned int h = 0; - - while (*str) - h = (h + (unsigned int) *str++) * hash_mult; - - return h & MASK_BUCKETS; -} - -/*----------------------------------------------------------------- - * Code for looking up a device by name - *---------------------------------------------------------------*/ -static struct hash_cell *__get_name_cell(const char *str) -{ - struct hash_cell *hc; - unsigned int h = hash_str(str); - - list_for_each_entry (hc, _name_buckets + h, name_list) - if (!strcmp(hc->name, str)) { - dm_get(hc->md); - return hc; - } - - return NULL; -} - -static struct hash_cell *__get_uuid_cell(const char *str) -{ - struct hash_cell *hc; - unsigned int h = hash_str(str); - - list_for_each_entry (hc, _uuid_buckets + h, uuid_list) - if (!strcmp(hc->uuid, str)) { - dm_get(hc->md); - return hc; - } - - return NULL; -} - -static struct hash_cell *__get_dev_cell(uint64_t dev) -{ - struct mapped_device *md; - struct hash_cell *hc; - - md = dm_get_md(huge_decode_dev(dev)); - if (!md) - return NULL; - - hc = dm_get_mdptr(md); - if (!hc) { - dm_put(md); - return NULL; - } - - return hc; -} - -/*----------------------------------------------------------------- - * Inserting, removing and renaming a device. 
- *---------------------------------------------------------------*/ -static struct hash_cell *alloc_cell(const char *name, const char *uuid, - struct mapped_device *md) -{ - struct hash_cell *hc; - - hc = kmalloc(sizeof(*hc), GFP_KERNEL); - if (!hc) - return NULL; - - hc->name = kstrdup(name, GFP_KERNEL); - if (!hc->name) { - kfree(hc); - return NULL; - } - - if (!uuid) - hc->uuid = NULL; - - else { - hc->uuid = kstrdup(uuid, GFP_KERNEL); - if (!hc->uuid) { - kfree(hc->name); - kfree(hc); - return NULL; - } - } - - INIT_LIST_HEAD(&hc->name_list); - INIT_LIST_HEAD(&hc->uuid_list); - hc->md = md; - hc->new_map = NULL; - return hc; -} - -static void free_cell(struct hash_cell *hc) -{ - if (hc) { - kfree(hc->name); - kfree(hc->uuid); - kfree(hc); - } -} - -/* - * The kdev_t and uuid of a device can never change once it is - * initially inserted. - */ -static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) -{ - struct hash_cell *cell, *hc; - - /* - * Allocate the new cells. - */ - cell = alloc_cell(name, uuid, md); - if (!cell) - return -ENOMEM; - - /* - * Insert the cell into both hash tables. - */ - down_write(&_hash_lock); - hc = __get_name_cell(name); - if (hc) { - dm_put(hc->md); - goto bad; - } - - list_add(&cell->name_list, _name_buckets + hash_str(name)); - - if (uuid) { - hc = __get_uuid_cell(uuid); - if (hc) { - list_del(&cell->name_list); - dm_put(hc->md); - goto bad; - } - list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); - } - dm_get(md); - mutex_lock(&dm_hash_cells_mutex); - dm_set_mdptr(md, cell); - mutex_unlock(&dm_hash_cells_mutex); - up_write(&_hash_lock); - - return 0; - - bad: - up_write(&_hash_lock); - free_cell(cell); - return -EBUSY; -} - -static void __hash_remove(struct hash_cell *hc) -{ - struct dm_table *table; - - /* remove from the dev hash */ - list_del(&hc->uuid_list); - list_del(&hc->name_list); - mutex_lock(&dm_hash_cells_mutex); - dm_set_mdptr(hc->md, NULL); - mutex_unlock(&dm_hash_cells_mutex); - - table = dm_get_live_table(hc->md); - if (table) { - dm_table_event(table); - dm_table_put(table); - } - - if (hc->new_map) - dm_table_destroy(hc->new_map); - dm_put(hc->md); - free_cell(hc); -} - -static void dm_hash_remove_all(int keep_open_devices) -{ - int i, dev_skipped; - struct hash_cell *hc; - struct mapped_device *md; - -retry: - dev_skipped = 0; - - down_write(&_hash_lock); - - for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_entry(hc, _name_buckets + i, name_list) { - md = hc->md; - dm_get(md); - - if (keep_open_devices && dm_lock_for_deletion(md)) { - dm_put(md); - dev_skipped++; - continue; - } - - __hash_remove(hc); - - up_write(&_hash_lock); - - dm_put(md); - if (likely(keep_open_devices)) - dm_destroy(md); - else - dm_destroy_immediate(md); - - /* - * Some mapped devices may be using other mapped - * devices, so repeat until we make no further - * progress. If a new mapped device is created - * here it will also get removed. - */ - goto retry; - } - } - - up_write(&_hash_lock); - - if (dev_skipped) - DMWARN("remove_all left %d open device(s)", dev_skipped); -} - -/* - * Set the uuid of a hash_cell that isn't already set. - */ -static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) -{ - mutex_lock(&dm_hash_cells_mutex); - hc->uuid = new_uuid; - mutex_unlock(&dm_hash_cells_mutex); - - list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); -} - -/* - * Changes the name of a hash_cell and returns the old name for - * the caller to free. 
- */ -static char *__change_cell_name(struct hash_cell *hc, char *new_name) -{ - char *old_name; - - /* - * Rename and move the name cell. - */ - list_del(&hc->name_list); - old_name = hc->name; - - mutex_lock(&dm_hash_cells_mutex); - hc->name = new_name; - mutex_unlock(&dm_hash_cells_mutex); - - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); - - return old_name; -} - -static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, - const char *new) -{ - char *new_data, *old_name = NULL; - struct hash_cell *hc; - struct dm_table *table; - struct mapped_device *md; - unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - - /* - * duplicate new. - */ - new_data = kstrdup(new, GFP_KERNEL); - if (!new_data) - return ERR_PTR(-ENOMEM); - - down_write(&_hash_lock); - - /* - * Is new free ? - */ - if (change_uuid) - hc = __get_uuid_cell(new); - else - hc = __get_name_cell(new); - - if (hc) { - DMWARN("Unable to change %s on mapped device %s to one that " - "already exists: %s", - change_uuid ? "uuid" : "name", - param->name, new); - dm_put(hc->md); - up_write(&_hash_lock); - kfree(new_data); - return ERR_PTR(-EBUSY); - } - - /* - * Is there such a device as 'old' ? - */ - hc = __get_name_cell(param->name); - if (!hc) { - DMWARN("Unable to rename non-existent device, %s to %s%s", - param->name, change_uuid ? "uuid " : "", new); - up_write(&_hash_lock); - kfree(new_data); - return ERR_PTR(-ENXIO); - } - - /* - * Does this device already have a uuid? - */ - if (change_uuid && hc->uuid) { - DMWARN("Unable to change uuid of mapped device %s to %s " - "because uuid is already set to %s", - param->name, new, hc->uuid); - dm_put(hc->md); - up_write(&_hash_lock); - kfree(new_data); - return ERR_PTR(-EINVAL); - } - - if (change_uuid) - __set_cell_uuid(hc, new_data); - else - old_name = __change_cell_name(hc, new_data); - - /* - * Wake up any dm event waiters. - */ - table = dm_get_live_table(hc->md); - if (table) { - dm_table_event(table); - dm_table_put(table); - } - - if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) - param->flags |= DM_UEVENT_GENERATED_FLAG; - - md = hc->md; - up_write(&_hash_lock); - kfree(old_name); - - return md; -} - -/*----------------------------------------------------------------- - * Implementation of the ioctl commands - *---------------------------------------------------------------*/ -/* - * All the ioctl commands get dispatched to functions with this - * prototype. - */ -typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); - -static int remove_all(struct dm_ioctl *param, size_t param_size) -{ - dm_hash_remove_all(1); - param->data_size = 0; - return 0; -} - -/* - * Round up the ptr to an 8-byte boundary. - */ -#define ALIGN_MASK 7 -static inline void *align_ptr(void *ptr) -{ - return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); -} - -/* - * Retrieves the data payload buffer from an already allocated - * struct dm_ioctl. 
- */ -static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, - size_t *len) -{ - param->data_start = align_ptr(param + 1) - (void *) param; - - if (param->data_start < param_size) - *len = param_size - param->data_start; - else - *len = 0; - - return ((void *) param) + param->data_start; -} - -static int list_devices(struct dm_ioctl *param, size_t param_size) -{ - unsigned int i; - struct hash_cell *hc; - size_t len, needed = 0; - struct gendisk *disk; - struct dm_name_list *nl, *old_nl = NULL; - - down_write(&_hash_lock); - - /* - * Loop through all the devices working out how much - * space we need. - */ - for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_entry (hc, _name_buckets + i, name_list) { - needed += sizeof(struct dm_name_list); - needed += strlen(hc->name) + 1; - needed += ALIGN_MASK; - } - } - - /* - * Grab our output buffer. - */ - nl = get_result_buffer(param, param_size, &len); - if (len < needed) { - param->flags |= DM_BUFFER_FULL_FLAG; - goto out; - } - param->data_size = param->data_start + needed; - - nl->dev = 0; /* Flags no data */ - - /* - * Now loop through filling out the names. - */ - for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_entry (hc, _name_buckets + i, name_list) { - if (old_nl) - old_nl->next = (uint32_t) ((void *) nl - - (void *) old_nl); - disk = dm_disk(hc->md); - nl->dev = huge_encode_dev(disk_devt(disk)); - nl->next = 0; - strcpy(nl->name, hc->name); - - old_nl = nl; - nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1); - } - } - - out: - up_write(&_hash_lock); - return 0; -} - -static void list_version_get_needed(struct target_type *tt, void *needed_param) -{ - size_t *needed = needed_param; - - *needed += sizeof(struct dm_target_versions); - *needed += strlen(tt->name); - *needed += ALIGN_MASK; -} - -static void list_version_get_info(struct target_type *tt, void *param) -{ - struct vers_iter *info = param; - - /* Check space - it might have changed since the first iteration */ - if ((char *)info->vers + sizeof(tt->version) + strlen(tt->name) + 1 > - info->end) { - - info->flags = DM_BUFFER_FULL_FLAG; - return; - } - - if (info->old_vers) - info->old_vers->next = (uint32_t) ((void *)info->vers - - (void *)info->old_vers); - info->vers->version[0] = tt->version[0]; - info->vers->version[1] = tt->version[1]; - info->vers->version[2] = tt->version[2]; - info->vers->next = 0; - strcpy(info->vers->name, tt->name); - - info->old_vers = info->vers; - info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); -} - -static int list_versions(struct dm_ioctl *param, size_t param_size) -{ - size_t len, needed = 0; - struct dm_target_versions *vers; - struct vers_iter iter_info; - - /* - * Loop through all the devices working out how much - * space we need. - */ - dm_target_iterate(list_version_get_needed, &needed); - - /* - * Grab our output buffer. - */ - vers = get_result_buffer(param, param_size, &len); - if (len < needed) { - param->flags |= DM_BUFFER_FULL_FLAG; - goto out; - } - param->data_size = param->data_start + needed; - - iter_info.param_size = param_size; - iter_info.old_vers = NULL; - iter_info.vers = vers; - iter_info.flags = 0; - iter_info.end = (char *)vers+len; - - /* - * Now loop through filling out the names & versions. 
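
list_devices() above packs variable-length records back to back in the result buffer: each record carries the encoded device number, a NUL-terminated name, and a next field holding the byte offset from this record to the one after it (0 marks the last record), with every record started on an 8-byte boundary by align_ptr(). The sketch below shows how a consumer might walk such a buffer; struct name_rec is a simplified stand-in for the packed layout, used only to make the offset arithmetic concrete.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for the packed records produced by list_devices(). */
struct name_rec {
    uint64_t dev;    /* encoded device number; 0 in the first slot means "no devices" */
    uint32_t next;   /* byte offset from this record to the next one, 0 == last */
    char name[];     /* NUL-terminated device name */
};

static void walk_names(void *buf)
{
    struct name_rec *r = buf;

    if (!r->dev)
        return;      /* list_devices() leaves dev == 0 when there is nothing to report */

    for (;;) {
        printf("%s (dev %llu)\n", r->name, (unsigned long long)r->dev);
        if (!r->next)
            break;
        r = (struct name_rec *)((char *)r + r->next);
    }
}

int main(void)
{
    uint64_t storage[16] = { 0 };            /* 8-byte aligned scratch buffer */
    char *buf = (char *)storage;
    struct name_rec *a = (struct name_rec *)buf;
    struct name_rec *b;
    size_t used;

    a->dev = 0x10300;                        /* arbitrary encoded dev_t */
    strcpy(a->name, "vg0-root");
    used = sizeof(*a) + strlen(a->name) + 1;
    a->next = (uint32_t)((used + 7) & ~(size_t)7);   /* round up, like align_ptr() */

    b = (struct name_rec *)(buf + a->next);
    b->dev = 0x10301;
    strcpy(b->name, "vg0-swap");
    b->next = 0;

    walk_names(buf);
    return 0;
}
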
- */ - dm_target_iterate(list_version_get_info, &iter_info); - param->flags |= iter_info.flags; - - out: - return 0; -} - -static int check_name(const char *name) -{ - if (strchr(name, '/')) { - DMWARN("invalid device name"); - return -EINVAL; - } - - return 0; -} - -/* - * On successful return, the caller must not attempt to acquire - * _hash_lock without first calling dm_table_put, because dm_table_destroy - * waits for this dm_table_put and could be called under this lock. - */ -static struct dm_table *dm_get_inactive_table(struct mapped_device *md) -{ - struct hash_cell *hc; - struct dm_table *table = NULL; - - down_read(&_hash_lock); - hc = dm_get_mdptr(md); - if (!hc || hc->md != md) { - DMWARN("device has been removed from the dev hash table."); - goto out; - } - - table = hc->new_map; - if (table) - dm_table_get(table); - -out: - up_read(&_hash_lock); - - return table; -} - -static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, - struct dm_ioctl *param) -{ - return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? - dm_get_inactive_table(md) : dm_get_live_table(md); -} - -/* - * Fills in a dm_ioctl structure, ready for sending back to - * userland. - */ -static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) -{ - struct gendisk *disk = dm_disk(md); - struct dm_table *table; - - param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | - DM_ACTIVE_PRESENT_FLAG); - - if (dm_suspended_md(md)) - param->flags |= DM_SUSPEND_FLAG; - - param->dev = huge_encode_dev(disk_devt(disk)); - - /* - * Yes, this will be out of date by the time it gets back - * to userland, but it is still very useful for - * debugging. - */ - param->open_count = dm_open_count(md); - - param->event_nr = dm_get_event_nr(md); - param->target_count = 0; - - table = dm_get_live_table(md); - if (table) { - if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { - if (get_disk_ro(disk)) - param->flags |= DM_READONLY_FLAG; - param->target_count = dm_table_get_num_targets(table); - } - dm_table_put(table); - - param->flags |= DM_ACTIVE_PRESENT_FLAG; - } - - if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { - table = dm_get_inactive_table(md); - if (table) { - if (!(dm_table_get_mode(table) & FMODE_WRITE)) - param->flags |= DM_READONLY_FLAG; - param->target_count = dm_table_get_num_targets(table); - dm_table_put(table); - } - } -} - -static int dev_create(struct dm_ioctl *param, size_t param_size) -{ - int r, m = DM_ANY_MINOR; - struct mapped_device *md; - - r = check_name(param->name); - if (r) - return r; - - if (param->flags & DM_PERSISTENT_DEV_FLAG) - m = MINOR(huge_decode_dev(param->dev)); - - r = dm_create(m, &md); - if (r) - return r; - - r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); - if (r) { - dm_put(md); - dm_destroy(md); - return r; - } - - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - - __dev_status(md, param); - - dm_put(md); - - return 0; -} - -/* - * Always use UUID for lookups if it's present, otherwise use name or dev. 
- */ -static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) -{ - struct hash_cell *hc = NULL; - - if (*param->uuid) { - if (*param->name || param->dev) - return NULL; - - hc = __get_uuid_cell(param->uuid); - if (!hc) - return NULL; - } else if (*param->name) { - if (param->dev) - return NULL; - - hc = __get_name_cell(param->name); - if (!hc) - return NULL; - } else if (param->dev) { - hc = __get_dev_cell(param->dev); - if (!hc) - return NULL; - } else - return NULL; - - /* - * Sneakily write in both the name and the uuid - * while we have the cell. - */ - strlcpy(param->name, hc->name, sizeof(param->name)); - if (hc->uuid) - strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); - else - param->uuid[0] = '\0'; - - if (hc->new_map) - param->flags |= DM_INACTIVE_PRESENT_FLAG; - else - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - - return hc; -} - -static struct mapped_device *find_device(struct dm_ioctl *param) -{ - struct hash_cell *hc; - struct mapped_device *md = NULL; - - down_read(&_hash_lock); - hc = __find_device_hash_cell(param); - if (hc) - md = hc->md; - up_read(&_hash_lock); - - return md; -} - -static int dev_remove(struct dm_ioctl *param, size_t param_size) -{ - struct hash_cell *hc; - struct mapped_device *md; - int r; - - down_write(&_hash_lock); - hc = __find_device_hash_cell(param); - - if (!hc) { - DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); - up_write(&_hash_lock); - return -ENXIO; - } - - md = hc->md; - - /* - * Ensure the device is not open and nothing further can open it. - */ - r = dm_lock_for_deletion(md); - if (r) { - DMDEBUG_LIMIT("unable to remove open device %s", hc->name); - up_write(&_hash_lock); - dm_put(md); - return r; - } - - __hash_remove(hc); - up_write(&_hash_lock); - - if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) - param->flags |= DM_UEVENT_GENERATED_FLAG; - - dm_put(md); - dm_destroy(md); - return 0; -} - -/* - * Check a string doesn't overrun the chunk of - * memory we copied from userland. - */ -static int invalid_str(char *str, void *end) -{ - while ((void *) str < end) - if (!*str++) - return 0; - - return -EINVAL; -} - -static int dev_rename(struct dm_ioctl *param, size_t param_size) -{ - int r; - char *new_data = (char *) param + param->data_start; - struct mapped_device *md; - unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - - if (new_data < param->data || - invalid_str(new_data, (void *) param + param_size) || - strlen(new_data) > (change_uuid ? 
DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { - DMWARN("Invalid new mapped device name or uuid string supplied."); - return -EINVAL; - } - - if (!change_uuid) { - r = check_name(new_data); - if (r) - return r; - } - - md = dm_hash_rename(param, new_data); - if (IS_ERR(md)) - return PTR_ERR(md); - - __dev_status(md, param); - dm_put(md); - - return 0; -} - -static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) -{ - int r = -EINVAL, x; - struct mapped_device *md; - struct hd_geometry geometry; - unsigned long indata[4]; - char *geostr = (char *) param + param->data_start; - char dummy; - - md = find_device(param); - if (!md) - return -ENXIO; - - if (geostr < param->data || - invalid_str(geostr, (void *) param + param_size)) { - DMWARN("Invalid geometry supplied."); - goto out; - } - - x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, - indata + 1, indata + 2, indata + 3, &dummy); - - if (x != 4) { - DMWARN("Unable to interpret geometry settings."); - goto out; - } - - if (indata[0] > 65535 || indata[1] > 255 || - indata[2] > 255 || indata[3] > ULONG_MAX) { - DMWARN("Geometry exceeds range limits."); - goto out; - } - - geometry.cylinders = indata[0]; - geometry.heads = indata[1]; - geometry.sectors = indata[2]; - geometry.start = indata[3]; - - r = dm_set_geometry(md, &geometry); - - param->data_size = 0; - -out: - dm_put(md); - return r; -} - -static int do_suspend(struct dm_ioctl *param) -{ - int r = 0; - unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; - struct mapped_device *md; - - md = find_device(param); - if (!md) - return -ENXIO; - - if (param->flags & DM_SKIP_LOCKFS_FLAG) - suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; - if (param->flags & DM_NOFLUSH_FLAG) - suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - - if (!dm_suspended_md(md)) { - r = dm_suspend(md, suspend_flags); - if (r) - goto out; - } - - __dev_status(md, param); - -out: - dm_put(md); - - return r; -} - -static int do_resume(struct dm_ioctl *param) -{ - int r = 0; - unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; - struct hash_cell *hc; - struct mapped_device *md; - struct dm_table *new_map, *old_map = NULL; - - down_write(&_hash_lock); - - hc = __find_device_hash_cell(param); - if (!hc) { - DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); - up_write(&_hash_lock); - return -ENXIO; - } - - md = hc->md; - - new_map = hc->new_map; - hc->new_map = NULL; - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - - up_write(&_hash_lock); - - /* Do we need to load a new map ? */ - if (new_map) { - /* Suspend if it isn't already suspended */ - if (param->flags & DM_SKIP_LOCKFS_FLAG) - suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; - if (param->flags & DM_NOFLUSH_FLAG) - suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended_md(md)) - dm_suspend(md, suspend_flags); - - old_map = dm_swap_table(md, new_map); - if (IS_ERR(old_map)) { - dm_table_destroy(new_map); - dm_put(md); - return PTR_ERR(old_map); - } - - if (dm_table_get_mode(new_map) & FMODE_WRITE) - set_disk_ro(dm_disk(md), 0); - else - set_disk_ro(dm_disk(md), 1); - } - - if (dm_suspended_md(md)) { - r = dm_resume(md); - if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr)) - param->flags |= DM_UEVENT_GENERATED_FLAG; - } - - if (old_map) - dm_table_destroy(old_map); - - if (!r) - __dev_status(md, param); - - dm_put(md); - return r; -} - -/* - * Set or unset the suspension state of a device. - * If the device already is in the requested state we just return its status. 
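
dev_set_geometry() above relies on a small sscanf() idiom: the format string asks for four numbers plus one extra character, and the result is accepted only when exactly four conversions succeed, so any trailing text after the fourth number makes the count five and the string is rejected. The standalone sketch below demonstrates just that parsing rule; the function name is illustrative and the range checks mirror the first three limits enforced by the ioctl.

#include <stdio.h>

/* Parse "cylinders heads sectors start"; reject anything extra on the line. */
static int parse_geometry(const char *geostr, unsigned long out[4])
{
    char dummy;
    int n;

    n = sscanf(geostr, "%lu %lu %lu %lu%c",
               &out[0], &out[1], &out[2], &out[3], &dummy);
    if (n != 4)
        return -1;    /* too few fields, or %c matched trailing junk */

    if (out[0] > 65535 || out[1] > 255 || out[2] > 255)
        return -1;    /* same limits dev_set_geometry() enforces */

    return 0;
}

int main(void)
{
    unsigned long g[4];

    printf("%d\n", parse_geometry("1024 255 63 0", g));        /* 0: accepted */
    printf("%d\n", parse_geometry("1024 255 63 0 junk", g));   /* -1: rejected */
    return 0;
}
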
- */ -static int dev_suspend(struct dm_ioctl *param, size_t param_size) -{ - if (param->flags & DM_SUSPEND_FLAG) - return do_suspend(param); - - return do_resume(param); -} - -/* - * Copies device info back to user space, used by - * the create and info ioctls. - */ -static int dev_status(struct dm_ioctl *param, size_t param_size) -{ - struct mapped_device *md; - - md = find_device(param); - if (!md) - return -ENXIO; - - __dev_status(md, param); - dm_put(md); - - return 0; -} - -/* - * Build up the status struct for each target - */ -static void retrieve_status(struct dm_table *table, - struct dm_ioctl *param, size_t param_size) -{ - unsigned int i, num_targets; - struct dm_target_spec *spec; - char *outbuf, *outptr; - status_type_t type; - size_t remaining, len, used = 0; - - outptr = outbuf = get_result_buffer(param, param_size, &len); - - if (param->flags & DM_STATUS_TABLE_FLAG) - type = STATUSTYPE_TABLE; - else - type = STATUSTYPE_INFO; - - /* Get all the target info */ - num_targets = dm_table_get_num_targets(table); - for (i = 0; i < num_targets; i++) { - struct dm_target *ti = dm_table_get_target(table, i); - - remaining = len - (outptr - outbuf); - if (remaining <= sizeof(struct dm_target_spec)) { - param->flags |= DM_BUFFER_FULL_FLAG; - break; - } - - spec = (struct dm_target_spec *) outptr; - - spec->status = 0; - spec->sector_start = ti->begin; - spec->length = ti->len; - strncpy(spec->target_type, ti->type->name, - sizeof(spec->target_type)); - - outptr += sizeof(struct dm_target_spec); - remaining = len - (outptr - outbuf); - if (remaining <= 0) { - param->flags |= DM_BUFFER_FULL_FLAG; - break; - } - - /* Get the status/table string from the target driver */ - if (ti->type->status) { - if (ti->type->status(ti, type, outptr, remaining)) { - param->flags |= DM_BUFFER_FULL_FLAG; - break; - } - } else - outptr[0] = '\0'; - - outptr += strlen(outptr) + 1; - used = param->data_start + (outptr - outbuf); - - outptr = align_ptr(outptr); - spec->next = outptr - outbuf; - } - - if (used) - param->data_size = used; - - param->target_count = num_targets; -} - -/* - * Wait for a device to report an event - */ -static int dev_wait(struct dm_ioctl *param, size_t param_size) -{ - int r = 0; - struct mapped_device *md; - struct dm_table *table; - - md = find_device(param); - if (!md) - return -ENXIO; - - /* - * Wait for a notification event - */ - if (dm_wait_event(md, param->event_nr)) { - r = -ERESTARTSYS; - goto out; - } - - /* - * The userland program is going to want to know what - * changed to trigger the event, so we may as well tell - * him and save an ioctl. 
- */ - __dev_status(md, param); - - table = dm_get_live_or_inactive_table(md, param); - if (table) { - retrieve_status(table, param, param_size); - dm_table_put(table); - } - -out: - dm_put(md); - - return r; -} - -static inline fmode_t get_mode(struct dm_ioctl *param) -{ - fmode_t mode = FMODE_READ | FMODE_WRITE; - - if (param->flags & DM_READONLY_FLAG) - mode = FMODE_READ; - - return mode; -} - -static int next_target(struct dm_target_spec *last, uint32_t next, void *end, - struct dm_target_spec **spec, char **target_params) -{ - *spec = (struct dm_target_spec *) ((unsigned char *) last + next); - *target_params = (char *) (*spec + 1); - - if (*spec < (last + 1)) - return -EINVAL; - - return invalid_str(*target_params, end); -} - -static int populate_table(struct dm_table *table, - struct dm_ioctl *param, size_t param_size) -{ - int r; - unsigned int i = 0; - struct dm_target_spec *spec = (struct dm_target_spec *) param; - uint32_t next = param->data_start; - void *end = (void *) param + param_size; - char *target_params; - - if (!param->target_count) { - DMWARN("populate_table: no targets specified"); - return -EINVAL; - } - - for (i = 0; i < param->target_count; i++) { - - r = next_target(spec, next, end, &spec, &target_params); - if (r) { - DMWARN("unable to find target"); - return r; - } - - r = dm_table_add_target(table, spec->target_type, - (sector_t) spec->sector_start, - (sector_t) spec->length, - target_params); - if (r) { - DMWARN("error adding target to table"); - return r; - } - - next = spec->next; - } - - return dm_table_complete(table); -} - -static int table_load(struct dm_ioctl *param, size_t param_size) -{ - int r; - struct hash_cell *hc; - struct dm_table *t; - struct mapped_device *md; - struct target_type *immutable_target_type; - - md = find_device(param); - if (!md) - return -ENXIO; - - r = dm_table_create(&t, get_mode(param), param->target_count, md); - if (r) - goto out; - - r = populate_table(t, param, param_size); - if (r) { - dm_table_destroy(t); - goto out; - } - - immutable_target_type = dm_get_immutable_target_type(md); - if (immutable_target_type && - (immutable_target_type != dm_table_get_immutable_target_type(t))) { - DMWARN("can't replace immutable target type %s", - immutable_target_type->name); - dm_table_destroy(t); - r = -EINVAL; - goto out; - } - - /* Protect md->type and md->queue against concurrent table loads. */ - dm_lock_md_type(md); - if (dm_get_md_type(md) == DM_TYPE_NONE) - /* Initial table load: acquire type of table. 
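
populate_table() and next_target() above define how the kernel walks the DM_TABLE_LOAD payload: the first dm_target_spec sits data_start bytes from the beginning of the dm_ioctl header, its parameter string follows immediately after the spec, and each spec's next field is the byte offset from that spec to the following one. The fragment below is a hedged sketch of how a caller might lay out one such target using the uapi <linux/dm-ioctl.h> definitions; the 8-byte rounding is a safety choice for the embedded 64-bit fields rather than something next_target() itself demands, and add_target() is an illustrative helper, not a libdevmapper function.

#include <stdint.h>
#include <string.h>
#include <linux/dm-ioctl.h>

#define ALIGN8(x) (((x) + 7) & ~(size_t)7)

/*
 * Append one target line to the data area of a DM_TABLE_LOAD request.
 * 'offset' is the current end of the data area, measured from the start of
 * the dm_ioctl header in 'buf'; the updated end offset is returned.  The
 * layout matches what next_target()/populate_table() expect: the spec comes
 * first, the parameter string follows it, and 'next' is the byte offset from
 * this spec to the next one.
 */
size_t add_target(char *buf, size_t offset, uint64_t start, uint64_t len,
                  const char *type, const char *params)
{
    struct dm_target_spec *spec = (struct dm_target_spec *)(buf + offset);
    char *args = (char *)(spec + 1);          /* params live right after the spec */
    size_t used;

    memset(spec, 0, sizeof(*spec));
    spec->sector_start = start;
    spec->length = len;
    strncpy(spec->target_type, type, sizeof(spec->target_type) - 1);
    strcpy(args, params);

    used = sizeof(*spec) + strlen(params) + 1;
    spec->next = (uint32_t)ALIGN8(used);      /* where the next spec would begin */

    return offset + spec->next;
}

/*
 * Usage (not a complete ioctl sequence): with a struct dm_ioctl 'dmi' at the
 * start of 'buf' and dmi->data_start = sizeof(*dmi):
 *
 *     size_t end = dmi->data_start;
 *     end = add_target(buf, end, 0, 2048, "linear", "/dev/sdb 0");
 *     dmi->target_count = 1;
 *     dmi->data_size = end;
 */
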
*/ - dm_set_md_type(md, dm_table_get_type(t)); - else if (dm_get_md_type(md) != dm_table_get_type(t)) { - DMWARN("can't change device type after initial table load."); - dm_table_destroy(t); - dm_unlock_md_type(md); - r = -EINVAL; - goto out; - } - - /* setup md->queue to reflect md's type (may block) */ - r = dm_setup_md_queue(md); - if (r) { - DMWARN("unable to set up device queue for new table."); - dm_table_destroy(t); - dm_unlock_md_type(md); - goto out; - } - dm_unlock_md_type(md); - - /* stage inactive table */ - down_write(&_hash_lock); - hc = dm_get_mdptr(md); - if (!hc || hc->md != md) { - DMWARN("device has been removed from the dev hash table."); - dm_table_destroy(t); - up_write(&_hash_lock); - r = -ENXIO; - goto out; - } - - if (hc->new_map) - dm_table_destroy(hc->new_map); - hc->new_map = t; - up_write(&_hash_lock); - - param->flags |= DM_INACTIVE_PRESENT_FLAG; - __dev_status(md, param); - -out: - dm_put(md); - - return r; -} - -static int table_clear(struct dm_ioctl *param, size_t param_size) -{ - struct hash_cell *hc; - struct mapped_device *md; - - down_write(&_hash_lock); - - hc = __find_device_hash_cell(param); - if (!hc) { - DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); - up_write(&_hash_lock); - return -ENXIO; - } - - if (hc->new_map) { - dm_table_destroy(hc->new_map); - hc->new_map = NULL; - } - - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - - __dev_status(hc->md, param); - md = hc->md; - up_write(&_hash_lock); - dm_put(md); - - return 0; -} - -/* - * Retrieves a list of devices used by a particular dm device. - */ -static void retrieve_deps(struct dm_table *table, - struct dm_ioctl *param, size_t param_size) -{ - unsigned int count = 0; - struct list_head *tmp; - size_t len, needed; - struct dm_dev_internal *dd; - struct dm_target_deps *deps; - - deps = get_result_buffer(param, param_size, &len); - - /* - * Count the devices. - */ - list_for_each (tmp, dm_table_get_devices(table)) - count++; - - /* - * Check we have enough space. - */ - needed = sizeof(*deps) + (sizeof(*deps->dev) * count); - if (len < needed) { - param->flags |= DM_BUFFER_FULL_FLAG; - return; - } - - /* - * Fill in the devices. - */ - deps->count = count; - count = 0; - list_for_each_entry (dd, dm_table_get_devices(table), list) - deps->dev[count++] = huge_encode_dev(dd->dm_dev.bdev->bd_dev); - - param->data_size = param->data_start + needed; -} - -static int table_deps(struct dm_ioctl *param, size_t param_size) -{ - struct mapped_device *md; - struct dm_table *table; - - md = find_device(param); - if (!md) - return -ENXIO; - - __dev_status(md, param); - - table = dm_get_live_or_inactive_table(md, param); - if (table) { - retrieve_deps(table, param, param_size); - dm_table_put(table); - } - - dm_put(md); - - return 0; -} - -/* - * Return the status of a device as a text string for each - * target. - */ -static int table_status(struct dm_ioctl *param, size_t param_size) -{ - struct mapped_device *md; - struct dm_table *table; - - md = find_device(param); - if (!md) - return -ENXIO; - - __dev_status(md, param); - - table = dm_get_live_or_inactive_table(md, param); - if (table) { - retrieve_status(table, param, param_size); - dm_table_put(table); - } - - dm_put(md); - - return 0; -} - -/* - * Pass a message to the target that's at the supplied device offset. 
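
table_load() above never touches the live table: it only stages the new one as hc->new_map, and do_resume() later takes that staged table, swaps it in, and destroys the old one outside the hash lock. The little userspace model below restates that two-step (stage, then promote) with a pthread mutex standing in for the kernel locking; it is a conceptual sketch only, and the struct and function names are invented for the example.

#include <pthread.h>
#include <stdlib.h>

/* Conceptual model only: 'live' serves I/O, 'inactive' is the staged table. */
struct dev_model {
    pthread_mutex_t lock;
    void *live;
    void *inactive;
};

/* "Load": stage a new table, dropping any previously staged one, the way
 * table_load() destroys an unused hc->new_map before replacing it. */
static void stage_table(struct dev_model *d, void *new_table)
{
    pthread_mutex_lock(&d->lock);
    free(d->inactive);
    d->inactive = new_table;
    pthread_mutex_unlock(&d->lock);
}

/* "Resume": promote the staged table and hand the old live one back so the
 * caller can tear it down outside the lock, as do_resume() does. */
static void *promote_table(struct dev_model *d)
{
    void *old = NULL;

    pthread_mutex_lock(&d->lock);
    if (d->inactive) {          /* nothing staged: keep the current table */
        old = d->live;
        d->live = d->inactive;
        d->inactive = NULL;
    }
    pthread_mutex_unlock(&d->lock);
    return old;
}

int main(void)
{
    struct dev_model d = { PTHREAD_MUTEX_INITIALIZER, NULL, NULL };

    stage_table(&d, malloc(16));
    free(promote_table(&d));    /* old live table (NULL here) */
    free(d.live);
    return 0;
}
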
- */ -static int target_message(struct dm_ioctl *param, size_t param_size) -{ - int r, argc; - char **argv; - struct mapped_device *md; - struct dm_table *table; - struct dm_target *ti; - struct dm_target_msg *tmsg = (void *) param + param->data_start; - - md = find_device(param); - if (!md) - return -ENXIO; - - if (tmsg < (struct dm_target_msg *) param->data || - invalid_str(tmsg->message, (void *) param + param_size)) { - DMWARN("Invalid target message parameters."); - r = -EINVAL; - goto out; - } - - r = dm_split_args(&argc, &argv, tmsg->message); - if (r) { - DMWARN("Failed to split target message parameters"); - goto out; - } - - if (!argc) { - DMWARN("Empty message received."); - goto out_argv; - } - - table = dm_get_live_table(md); - if (!table) - goto out_argv; - - if (dm_deleting_md(md)) { - r = -ENXIO; - goto out_table; - } - - ti = dm_table_find_target(table, tmsg->sector); - if (!dm_target_is_valid(ti)) { - DMWARN("Target message sector outside device."); - r = -EINVAL; - } else if (ti->type->message) - r = ti->type->message(ti, argc, argv); - else { - DMWARN("Target type does not support messages"); - r = -EINVAL; - } - - out_table: - dm_table_put(table); - out_argv: - kfree(argv); - out: - param->data_size = 0; - dm_put(md); - return r; -} - -/*----------------------------------------------------------------- - * Implementation of open/close/ioctl on the special char - * device. - *---------------------------------------------------------------*/ -static ioctl_fn lookup_ioctl(unsigned int cmd) -{ - static struct { - int cmd; - ioctl_fn fn; - } _ioctls[] = { - {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ - {DM_REMOVE_ALL_CMD, remove_all}, - {DM_LIST_DEVICES_CMD, list_devices}, - - {DM_DEV_CREATE_CMD, dev_create}, - {DM_DEV_REMOVE_CMD, dev_remove}, - {DM_DEV_RENAME_CMD, dev_rename}, - {DM_DEV_SUSPEND_CMD, dev_suspend}, - {DM_DEV_STATUS_CMD, dev_status}, - {DM_DEV_WAIT_CMD, dev_wait}, - - {DM_TABLE_LOAD_CMD, table_load}, - {DM_TABLE_CLEAR_CMD, table_clear}, - {DM_TABLE_DEPS_CMD, table_deps}, - {DM_TABLE_STATUS_CMD, table_status}, - - {DM_LIST_VERSIONS_CMD, list_versions}, - - {DM_TARGET_MSG_CMD, target_message}, - {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry} - }; - - return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; -} - -/* - * As well as checking the version compatibility this always - * copies the kernel interface version out. - */ -static int check_version(unsigned int cmd, struct dm_ioctl __user *user) -{ - uint32_t version[3]; - int r = 0; - - if (copy_from_user(version, user->version, sizeof(version))) - return -EFAULT; - - if ((DM_VERSION_MAJOR != version[0]) || - (DM_VERSION_MINOR < version[1])) { - DMWARN("ioctl interface mismatch: " - "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", - DM_VERSION_MAJOR, DM_VERSION_MINOR, - DM_VERSION_PATCHLEVEL, - version[0], version[1], version[2], cmd); - r = -EINVAL; - } - - /* - * Fill in the kernel version. 
- */ - version[0] = DM_VERSION_MAJOR; - version[1] = DM_VERSION_MINOR; - version[2] = DM_VERSION_PATCHLEVEL; - if (copy_to_user(user->version, version, sizeof(version))) - return -EFAULT; - - return r; -} - -static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) -{ - struct dm_ioctl tmp, *dmi; - int secure_data; - - if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) - return -EFAULT; - - if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) - return -EINVAL; - - secure_data = tmp.flags & DM_SECURE_DATA_FLAG; - - dmi = vmalloc(tmp.data_size); - if (!dmi) { - if (secure_data && clear_user(user, tmp.data_size)) - return -EFAULT; - return -ENOMEM; - } - - if (copy_from_user(dmi, user, tmp.data_size)) - goto bad; - - /* Wipe the user buffer so we do not return it to userspace */ - if (secure_data && clear_user(user, tmp.data_size)) - goto bad; - - *param = dmi; - return 0; - -bad: - if (secure_data) - memset(dmi, 0, tmp.data_size); - vfree(dmi); - return -EFAULT; -} - -static int validate_params(uint cmd, struct dm_ioctl *param) -{ - /* Always clear this flag */ - param->flags &= ~DM_BUFFER_FULL_FLAG; - param->flags &= ~DM_UEVENT_GENERATED_FLAG; - param->flags &= ~DM_SECURE_DATA_FLAG; - - /* Ignores parameters */ - if (cmd == DM_REMOVE_ALL_CMD || - cmd == DM_LIST_DEVICES_CMD || - cmd == DM_LIST_VERSIONS_CMD) - return 0; - - if ((cmd == DM_DEV_CREATE_CMD)) { - if (!*param->name) { - DMWARN("name not supplied when creating device"); - return -EINVAL; - } - } else if ((*param->uuid && *param->name)) { - DMWARN("only supply one of name or uuid, cmd(%u)", cmd); - return -EINVAL; - } - - /* Ensure strings are terminated */ - param->name[DM_NAME_LEN - 1] = '\0'; - param->uuid[DM_UUID_LEN - 1] = '\0'; - - return 0; -} - -static int ctl_ioctl(uint command, struct dm_ioctl __user *user) -{ - int r = 0; - int wipe_buffer; - unsigned int cmd; - struct dm_ioctl *uninitialized_var(param); - ioctl_fn fn = NULL; - size_t input_param_size; - - /* only root can play with this */ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (_IOC_TYPE(command) != DM_IOCTL) - return -ENOTTY; - - cmd = _IOC_NR(command); - - /* - * Check the interface version passed in. This also - * writes out the kernel's interface version. - */ - r = check_version(cmd, user); - if (r) - return r; - - /* - * Nothing more to do for the version command. - */ - if (cmd == DM_VERSION_CMD) - return 0; - - fn = lookup_ioctl(cmd); - if (!fn) { - DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); - return -ENOTTY; - } - - /* - * Trying to avoid low memory issues when a device is - * suspended. - */ - current->flags |= PF_MEMALLOC; - - /* - * Copy the parameters into kernel space. - */ - r = copy_params(user, ¶m); - - current->flags &= ~PF_MEMALLOC; - - if (r) - return r; - - input_param_size = param->data_size; - wipe_buffer = param->flags & DM_SECURE_DATA_FLAG; - - r = validate_params(cmd, param); - if (r) - goto out; - - param->data_size = sizeof(*param); - r = fn(param, input_param_size); - - /* - * Copy the results back to userland. 
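
check_version() above requires the caller's major number to match and its minor to be no newer than the kernel's, and it always writes the kernel's own version back, even for a failed match. The userspace counterpart is the DM_VERSION round trip sketched below, built on the uapi <linux/dm-ioctl.h> definitions; /dev/mapper/control is the misc device that dm_interface_init() registers further down, and the ioctl itself is only honoured for a caller with CAP_SYS_ADMIN.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
    struct dm_ioctl dmi;
    int fd = open("/dev/mapper/control", O_RDWR);   /* needs CAP_SYS_ADMIN to be useful */

    if (fd < 0) {
        perror("open /dev/mapper/control");
        return 1;
    }

    memset(&dmi, 0, sizeof(dmi));
    dmi.version[0] = DM_VERSION_MAJOR;     /* the interface we were built against */
    dmi.version[1] = DM_VERSION_MINOR;
    dmi.version[2] = DM_VERSION_PATCHLEVEL;
    dmi.data_size = sizeof(dmi);           /* header only, no extra payload */
    dmi.data_start = sizeof(dmi);

    if (ioctl(fd, DM_VERSION, &dmi) < 0) {
        perror("DM_VERSION");
        close(fd);
        return 1;
    }

    /* check_version() filled in the kernel's own numbers on the way out. */
    printf("kernel device-mapper interface %u.%u.%u\n",
           dmi.version[0], dmi.version[1], dmi.version[2]);
    close(fd);
    return 0;
}
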
- */ - if (!r && copy_to_user(user, param, param->data_size)) - r = -EFAULT; - -out: - if (wipe_buffer) - memset(param, 0, input_param_size); - - vfree(param); - return r; -} - -static long dm_ctl_ioctl(struct file *file, uint command, ulong u) -{ - return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u); -} - -#ifdef CONFIG_COMPAT -static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) -{ - return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u)); -} -#else -#define dm_compat_ctl_ioctl NULL -#endif - -static const struct file_operations _ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = dm_ctl_ioctl, - .compat_ioctl = dm_compat_ctl_ioctl, - .owner = THIS_MODULE, - .llseek = noop_llseek, -}; - -static struct miscdevice _dm_misc = { - .minor = MAPPER_CTRL_MINOR, - .name = DM_NAME, - .nodename = DM_DIR "/" DM_CONTROL_NODE, - .fops = &_ctl_fops -}; - -MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR); -MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE); - -/* - * Create misc character device and link to DM_DIR/control. - */ -int __init dm_interface_init(void) -{ - int r; - - r = dm_hash_init(); - if (r) - return r; - - r = misc_register(&_dm_misc); - if (r) { - DMERR("misc_register failed for control device"); - dm_hash_exit(); - return r; - } - - DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, - DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, - DM_DRIVER_EMAIL); - return 0; -} - -void dm_interface_exit(void) -{ - if (misc_deregister(&_dm_misc) < 0) - DMERR("misc_deregister failed for control device"); - - dm_hash_exit(); -} - -/** - * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers - * @md: Pointer to mapped_device - * @name: Buffer (size DM_NAME_LEN) for name - * @uuid: Buffer (size DM_UUID_LEN) for uuid or empty string if uuid not defined - */ -int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) -{ - int r = 0; - struct hash_cell *hc; - - if (!md) - return -ENXIO; - - mutex_lock(&dm_hash_cells_mutex); - hc = dm_get_mdptr(md); - if (!hc || hc->md != md) { - r = -ENXIO; - goto out; - } - - if (name) - strcpy(name, hc->name); - if (uuid) - strcpy(uuid, hc->uuid ? : ""); - -out: - mutex_unlock(&dm_hash_cells_mutex); - - return r; -} diff --git a/ANDROID_3.4.5/drivers/md/dm-kcopyd.c b/ANDROID_3.4.5/drivers/md/dm-kcopyd.c deleted file mode 100644 index bed444c9..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-kcopyd.c +++ /dev/null @@ -1,756 +0,0 @@ -/* - * Copyright (C) 2002 Sistina Software (UK) Limited. - * Copyright (C) 2006 Red Hat GmbH - * - * This file is released under the GPL. - * - * Kcopyd provides a simple interface for copying an area of one - * block-device to one or more other block-devices, with an asynchronous - * completion notification. - */ - -#include <linux/types.h> -#include <linux/atomic.h> -#include <linux/blkdev.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/mempool.h> -#include <linux/module.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/workqueue.h> -#include <linux/mutex.h> -#include <linux/device-mapper.h> -#include <linux/dm-kcopyd.h> - -#include "dm.h" - -#define SUB_JOB_SIZE 128 -#define SPLIT_COUNT 8 -#define MIN_JOBS 8 -#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE)) - -/*----------------------------------------------------------------- - * Each kcopyd client has its own little pool of preallocated - * pages for kcopyd io. 
- *---------------------------------------------------------------*/ -struct dm_kcopyd_client { - struct page_list *pages; - unsigned nr_reserved_pages; - unsigned nr_free_pages; - - struct dm_io_client *io_client; - - wait_queue_head_t destroyq; - atomic_t nr_jobs; - - mempool_t *job_pool; - - struct workqueue_struct *kcopyd_wq; - struct work_struct kcopyd_work; - -/* - * We maintain three lists of jobs: - * - * i) jobs waiting for pages - * ii) jobs that have pages, and are waiting for the io to be issued. - * iii) jobs that have completed. - * - * All three of these are protected by job_lock. - */ - spinlock_t job_lock; - struct list_head complete_jobs; - struct list_head io_jobs; - struct list_head pages_jobs; -}; - -static struct page_list zero_page_list; - -static void wake(struct dm_kcopyd_client *kc) -{ - queue_work(kc->kcopyd_wq, &kc->kcopyd_work); -} - -/* - * Obtain one page for the use of kcopyd. - */ -static struct page_list *alloc_pl(gfp_t gfp) -{ - struct page_list *pl; - - pl = kmalloc(sizeof(*pl), gfp); - if (!pl) - return NULL; - - pl->page = alloc_page(gfp); - if (!pl->page) { - kfree(pl); - return NULL; - } - - return pl; -} - -static void free_pl(struct page_list *pl) -{ - __free_page(pl->page); - kfree(pl); -} - -/* - * Add the provided pages to a client's free page list, releasing - * back to the system any beyond the reserved_pages limit. - */ -static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) -{ - struct page_list *next; - - do { - next = pl->next; - - if (kc->nr_free_pages >= kc->nr_reserved_pages) - free_pl(pl); - else { - pl->next = kc->pages; - kc->pages = pl; - kc->nr_free_pages++; - } - - pl = next; - } while (pl); -} - -static int kcopyd_get_pages(struct dm_kcopyd_client *kc, - unsigned int nr, struct page_list **pages) -{ - struct page_list *pl; - - *pages = NULL; - - do { - pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); - if (unlikely(!pl)) { - /* Use reserved pages */ - pl = kc->pages; - if (unlikely(!pl)) - goto out_of_memory; - kc->pages = pl->next; - kc->nr_free_pages--; - } - pl->next = *pages; - *pages = pl; - } while (--nr); - - return 0; - -out_of_memory: - if (*pages) - kcopyd_put_pages(kc, *pages); - return -ENOMEM; -} - -/* - * These three functions resize the page pool. - */ -static void drop_pages(struct page_list *pl) -{ - struct page_list *next; - - while (pl) { - next = pl->next; - free_pl(pl); - pl = next; - } -} - -/* - * Allocate and reserve nr_pages for the use of a specific client. - */ -static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages) -{ - unsigned i; - struct page_list *pl = NULL, *next; - - for (i = 0; i < nr_pages; i++) { - next = alloc_pl(GFP_KERNEL); - if (!next) { - if (pl) - drop_pages(pl); - return -ENOMEM; - } - next->next = pl; - pl = next; - } - - kc->nr_reserved_pages += nr_pages; - kcopyd_put_pages(kc, pl); - - return 0; -} - -static void client_free_pages(struct dm_kcopyd_client *kc) -{ - BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); - drop_pages(kc->pages); - kc->pages = NULL; - kc->nr_free_pages = kc->nr_reserved_pages = 0; -} - -/*----------------------------------------------------------------- - * kcopyd_jobs need to be allocated by the *clients* of kcopyd, - * for this reason we use a mempool to prevent the client from - * ever having to do io (which could cause a deadlock). 
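
The page helpers above encode kcopyd's allocation policy: try an opportunistic alloc_page() with __GFP_NOWARN | __GFP_NORETRY, fall back to the reserve that client_reserve_pages() filled at client-create time, and let freed pages refill the reserve before anything is handed back to the system. The userspace sketch below models the same bookkeeping with malloc()/free() standing in for page allocation; the names are illustrative, not the kernel's.

#include <stdlib.h>

struct page_node {
    struct page_node *next;
};

struct page_pool {
    struct page_node *reserve;   /* singly linked free list, like kc->pages */
    unsigned nr_free;
    unsigned nr_reserved;        /* size the reserve is allowed to grow to */
};

/* Try a fresh allocation first; only dip into the reserve when that fails. */
static struct page_node *pool_get(struct page_pool *p)
{
    struct page_node *n = malloc(sizeof(*n));   /* the "opportunistic" attempt */

    if (n)
        return n;

    n = p->reserve;
    if (!n)
        return NULL;             /* reserve exhausted as well */
    p->reserve = n->next;
    p->nr_free--;
    return n;
}

/* Refill the reserve first; release to the system only once it is full. */
static void pool_put(struct page_pool *p, struct page_node *n)
{
    if (p->nr_free >= p->nr_reserved) {
        free(n);
    } else {
        n->next = p->reserve;
        p->reserve = n;
        p->nr_free++;
    }
}

int main(void)
{
    struct page_pool pool = { NULL, 0, 4 };
    struct page_node *n = pool_get(&pool);

    pool_put(&pool, n);
    while ((n = pool.reserve)) {     /* drain, like client_free_pages() */
        pool.reserve = n->next;
        free(n);
    }
    return 0;
}
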
- *---------------------------------------------------------------*/ -struct kcopyd_job { - struct dm_kcopyd_client *kc; - struct list_head list; - unsigned long flags; - - /* - * Error state of the job. - */ - int read_err; - unsigned long write_err; - - /* - * Either READ or WRITE - */ - int rw; - struct dm_io_region source; - - /* - * The destinations for the transfer. - */ - unsigned int num_dests; - struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; - - struct page_list *pages; - - /* - * Set this to ensure you are notified when the job has - * completed. 'context' is for callback to use. - */ - dm_kcopyd_notify_fn fn; - void *context; - - /* - * These fields are only used if the job has been split - * into more manageable parts. - */ - struct mutex lock; - atomic_t sub_jobs; - sector_t progress; - - struct kcopyd_job *master_job; -}; - -static struct kmem_cache *_job_cache; - -int __init dm_kcopyd_init(void) -{ - _job_cache = kmem_cache_create("kcopyd_job", - sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), - __alignof__(struct kcopyd_job), 0, NULL); - if (!_job_cache) - return -ENOMEM; - - zero_page_list.next = &zero_page_list; - zero_page_list.page = ZERO_PAGE(0); - - return 0; -} - -void dm_kcopyd_exit(void) -{ - kmem_cache_destroy(_job_cache); - _job_cache = NULL; -} - -/* - * Functions to push and pop a job onto the head of a given job - * list. - */ -static struct kcopyd_job *pop(struct list_head *jobs, - struct dm_kcopyd_client *kc) -{ - struct kcopyd_job *job = NULL; - unsigned long flags; - - spin_lock_irqsave(&kc->job_lock, flags); - - if (!list_empty(jobs)) { - job = list_entry(jobs->next, struct kcopyd_job, list); - list_del(&job->list); - } - spin_unlock_irqrestore(&kc->job_lock, flags); - - return job; -} - -static void push(struct list_head *jobs, struct kcopyd_job *job) -{ - unsigned long flags; - struct dm_kcopyd_client *kc = job->kc; - - spin_lock_irqsave(&kc->job_lock, flags); - list_add_tail(&job->list, jobs); - spin_unlock_irqrestore(&kc->job_lock, flags); -} - - -static void push_head(struct list_head *jobs, struct kcopyd_job *job) -{ - unsigned long flags; - struct dm_kcopyd_client *kc = job->kc; - - spin_lock_irqsave(&kc->job_lock, flags); - list_add(&job->list, jobs); - spin_unlock_irqrestore(&kc->job_lock, flags); -} - -/* - * These three functions process 1 item from the corresponding - * job list. - * - * They return: - * < 0: error - * 0: success - * > 0: can't process yet. - */ -static int run_complete_job(struct kcopyd_job *job) -{ - void *context = job->context; - int read_err = job->read_err; - unsigned long write_err = job->write_err; - dm_kcopyd_notify_fn fn = job->fn; - struct dm_kcopyd_client *kc = job->kc; - - if (job->pages && job->pages != &zero_page_list) - kcopyd_put_pages(kc, job->pages); - /* - * If this is the master job, the sub jobs have already - * completed so we can free everything. 
- */ - if (job->master_job == job) - mempool_free(job, kc->job_pool); - fn(read_err, write_err, context); - - if (atomic_dec_and_test(&kc->nr_jobs)) - wake_up(&kc->destroyq); - - return 0; -} - -static void complete_io(unsigned long error, void *context) -{ - struct kcopyd_job *job = (struct kcopyd_job *) context; - struct dm_kcopyd_client *kc = job->kc; - - if (error) { - if (job->rw == WRITE) - job->write_err |= error; - else - job->read_err = 1; - - if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { - push(&kc->complete_jobs, job); - wake(kc); - return; - } - } - - if (job->rw == WRITE) - push(&kc->complete_jobs, job); - - else { - job->rw = WRITE; - push(&kc->io_jobs, job); - } - - wake(kc); -} - -/* - * Request io on as many buffer heads as we can currently get for - * a particular job. - */ -static int run_io_job(struct kcopyd_job *job) -{ - int r; - struct dm_io_request io_req = { - .bi_rw = job->rw, - .mem.type = DM_IO_PAGE_LIST, - .mem.ptr.pl = job->pages, - .mem.offset = 0, - .notify.fn = complete_io, - .notify.context = job, - .client = job->kc->io_client, - }; - - if (job->rw == READ) - r = dm_io(&io_req, 1, &job->source, NULL); - else - r = dm_io(&io_req, job->num_dests, job->dests, NULL); - - return r; -} - -static int run_pages_job(struct kcopyd_job *job) -{ - int r; - unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); - - r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); - if (!r) { - /* this job is ready for io */ - push(&job->kc->io_jobs, job); - return 0; - } - - if (r == -ENOMEM) - /* can't complete now */ - return 1; - - return r; -} - -/* - * Run through a list for as long as possible. Returns the count - * of successful jobs. - */ -static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, - int (*fn) (struct kcopyd_job *)) -{ - struct kcopyd_job *job; - int r, count = 0; - - while ((job = pop(jobs, kc))) { - - r = fn(job); - - if (r < 0) { - /* error this rogue job */ - if (job->rw == WRITE) - job->write_err = (unsigned long) -1L; - else - job->read_err = 1; - push(&kc->complete_jobs, job); - break; - } - - if (r > 0) { - /* - * We couldn't service this job ATM, so - * push this job back onto the list. - */ - push_head(jobs, job); - break; - } - - count++; - } - - return count; -} - -/* - * kcopyd does this every time it's woken up. - */ -static void do_work(struct work_struct *work) -{ - struct dm_kcopyd_client *kc = container_of(work, - struct dm_kcopyd_client, kcopyd_work); - struct blk_plug plug; - - /* - * The order that these are called is *very* important. - * complete jobs can free some pages for pages jobs. - * Pages jobs when successful will jump onto the io jobs - * list. io jobs call wake when they complete and it all - * starts again. - */ - blk_start_plug(&plug); - process_jobs(&kc->complete_jobs, kc, run_complete_job); - process_jobs(&kc->pages_jobs, kc, run_pages_job); - process_jobs(&kc->io_jobs, kc, run_io_job); - blk_finish_plug(&plug); -} - -/* - * If we are copying a small region we just dispatch a single job - * to do the copy, otherwise the io has to be split up into many - * jobs. 
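
process_jobs() and do_work() above rely on a three-way return convention from the per-list handlers: a negative value is a hard error, zero means the job was serviced, and a positive value means "can't make progress yet", in which case the job goes back on the head of its list and the worker stops until the next wake-up. The ordering in do_work() matters too, since completing jobs releases pages that the pages list is waiting for. The self-contained sketch below restates just the list-draining convention; the job structure and helpers are invented for the example.

#include <stddef.h>

struct qjob {
    struct qjob *next;
    int ready;    /* 0: resources missing for now, 1: can be serviced */
    int failed;   /* simulate a hard error */
};

static struct qjob *pop(struct qjob **head)
{
    struct qjob *j = *head;

    if (j)
        *head = j->next;
    return j;
}

static void push_head(struct qjob **head, struct qjob *j)
{
    j->next = *head;
    *head = j;
}

/* Same convention as the kcopyd handlers: <0 error, 0 success, >0 not yet. */
static int run_one(struct qjob *j)
{
    if (j->failed)
        return -1;
    return j->ready ? 0 : 1;
}

static int process_list(struct qjob **head)
{
    struct qjob *j;
    int count = 0;

    while ((j = pop(head))) {
        int r = run_one(j);

        if (r < 0)
            break;                 /* hard error: the job is completed as failed */
        if (r > 0) {
            push_head(head, j);    /* can't service it yet: retry on the next wake-up */
            break;
        }
        count++;                   /* success: keep draining the list */
    }
    return count;
}

int main(void)
{
    struct qjob a = { NULL, 1, 0 }, b = { NULL, 0, 0 };
    struct qjob *head = NULL;

    push_head(&head, &b);          /* b has to wait for resources */
    push_head(&head, &a);          /* a can run straight away */
    return process_list(&head) == 1 ? 0 : 1;
}
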
- */ -static void dispatch_job(struct kcopyd_job *job) -{ - struct dm_kcopyd_client *kc = job->kc; - atomic_inc(&kc->nr_jobs); - if (unlikely(!job->source.count)) - push(&kc->complete_jobs, job); - else if (job->pages == &zero_page_list) - push(&kc->io_jobs, job); - else - push(&kc->pages_jobs, job); - wake(kc); -} - -static void segment_complete(int read_err, unsigned long write_err, - void *context) -{ - /* FIXME: tidy this function */ - sector_t progress = 0; - sector_t count = 0; - struct kcopyd_job *sub_job = (struct kcopyd_job *) context; - struct kcopyd_job *job = sub_job->master_job; - struct dm_kcopyd_client *kc = job->kc; - - mutex_lock(&job->lock); - - /* update the error */ - if (read_err) - job->read_err = 1; - - if (write_err) - job->write_err |= write_err; - - /* - * Only dispatch more work if there hasn't been an error. - */ - if ((!job->read_err && !job->write_err) || - test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { - /* get the next chunk of work */ - progress = job->progress; - count = job->source.count - progress; - if (count) { - if (count > SUB_JOB_SIZE) - count = SUB_JOB_SIZE; - - job->progress += count; - } - } - mutex_unlock(&job->lock); - - if (count) { - int i; - - *sub_job = *job; - sub_job->source.sector += progress; - sub_job->source.count = count; - - for (i = 0; i < job->num_dests; i++) { - sub_job->dests[i].sector += progress; - sub_job->dests[i].count = count; - } - - sub_job->fn = segment_complete; - sub_job->context = sub_job; - dispatch_job(sub_job); - - } else if (atomic_dec_and_test(&job->sub_jobs)) { - - /* - * Queue the completion callback to the kcopyd thread. - * - * Some callers assume that all the completions are called - * from a single thread and don't race with each other. - * - * We must not call the callback directly here because this - * code may not be executing in the thread. - */ - push(&kc->complete_jobs, job); - wake(kc); - } -} - -/* - * Create some sub jobs to share the work between them. - */ -static void split_job(struct kcopyd_job *master_job) -{ - int i; - - atomic_inc(&master_job->kc->nr_jobs); - - atomic_set(&master_job->sub_jobs, SPLIT_COUNT); - for (i = 0; i < SPLIT_COUNT; i++) { - master_job[i + 1].master_job = master_job; - segment_complete(0, 0u, &master_job[i + 1]); - } -} - -int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, - unsigned int num_dests, struct dm_io_region *dests, - unsigned int flags, dm_kcopyd_notify_fn fn, void *context) -{ - struct kcopyd_job *job; - - /* - * Allocate an array of jobs consisting of one master job - * followed by SPLIT_COUNT sub jobs. - */ - job = mempool_alloc(kc->job_pool, GFP_NOIO); - - /* - * set up for the read. 
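
segment_complete() above hands out the remaining work in SUB_JOB_SIZE chunks: each finishing sub-job claims the next chunk by advancing job->progress (under job->lock, because SPLIT_COUNT sub-jobs race for it) until the whole source range has been dispatched. The claim itself reduces to the small helper below, a userspace restatement with sector counts as plain integers.

#include <stdio.h>

#define SUB_JOB_SIZE 128    /* sectors per sub-job, as in dm-kcopyd.c */

/*
 * Claim the next chunk of a copy that is 'total' sectors long.  '*progress'
 * counts the sectors already handed out; the return value is the size of the
 * chunk just claimed, or 0 once the whole range has been dispatched.
 */
static unsigned next_chunk(unsigned total, unsigned *progress)
{
    unsigned count = total - *progress;

    if (count > SUB_JOB_SIZE)
        count = SUB_JOB_SIZE;
    *progress += count;
    return count;
}

int main(void)
{
    unsigned progress = 0, count;

    while ((count = next_chunk(1000, &progress)))
        printf("claimed %u sectors, progress now %u\n", count, progress);
    return 0;
}
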
- */ - job->kc = kc; - job->flags = flags; - job->read_err = 0; - job->write_err = 0; - - job->num_dests = num_dests; - memcpy(&job->dests, dests, sizeof(*dests) * num_dests); - - if (from) { - job->source = *from; - job->pages = NULL; - job->rw = READ; - } else { - memset(&job->source, 0, sizeof job->source); - job->source.count = job->dests[0].count; - job->pages = &zero_page_list; - job->rw = WRITE; - } - - job->fn = fn; - job->context = context; - job->master_job = job; - - if (job->source.count <= SUB_JOB_SIZE) - dispatch_job(job); - else { - mutex_init(&job->lock); - job->progress = 0; - split_job(job); - } - - return 0; -} -EXPORT_SYMBOL(dm_kcopyd_copy); - -int dm_kcopyd_zero(struct dm_kcopyd_client *kc, - unsigned num_dests, struct dm_io_region *dests, - unsigned flags, dm_kcopyd_notify_fn fn, void *context) -{ - return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); -} -EXPORT_SYMBOL(dm_kcopyd_zero); - -void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, - dm_kcopyd_notify_fn fn, void *context) -{ - struct kcopyd_job *job; - - job = mempool_alloc(kc->job_pool, GFP_NOIO); - - memset(job, 0, sizeof(struct kcopyd_job)); - job->kc = kc; - job->fn = fn; - job->context = context; - job->master_job = job; - - atomic_inc(&kc->nr_jobs); - - return job; -} -EXPORT_SYMBOL(dm_kcopyd_prepare_callback); - -void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) -{ - struct kcopyd_job *job = j; - struct dm_kcopyd_client *kc = job->kc; - - job->read_err = read_err; - job->write_err = write_err; - - push(&kc->complete_jobs, job); - wake(kc); -} -EXPORT_SYMBOL(dm_kcopyd_do_callback); - -/* - * Cancels a kcopyd job, eg. someone might be deactivating a - * mirror. - */ -#if 0 -int kcopyd_cancel(struct kcopyd_job *job, int block) -{ - /* FIXME: finish */ - return -1; -} -#endif /* 0 */ - -/*----------------------------------------------------------------- - * Client setup - *---------------------------------------------------------------*/ -struct dm_kcopyd_client *dm_kcopyd_client_create(void) -{ - int r = -ENOMEM; - struct dm_kcopyd_client *kc; - - kc = kmalloc(sizeof(*kc), GFP_KERNEL); - if (!kc) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&kc->job_lock); - INIT_LIST_HEAD(&kc->complete_jobs); - INIT_LIST_HEAD(&kc->io_jobs); - INIT_LIST_HEAD(&kc->pages_jobs); - - kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); - if (!kc->job_pool) - goto bad_slab; - - INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = alloc_workqueue("kcopyd", - WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); - if (!kc->kcopyd_wq) - goto bad_workqueue; - - kc->pages = NULL; - kc->nr_reserved_pages = kc->nr_free_pages = 0; - r = client_reserve_pages(kc, RESERVE_PAGES); - if (r) - goto bad_client_pages; - - kc->io_client = dm_io_client_create(); - if (IS_ERR(kc->io_client)) { - r = PTR_ERR(kc->io_client); - goto bad_io_client; - } - - init_waitqueue_head(&kc->destroyq); - atomic_set(&kc->nr_jobs, 0); - - return kc; - -bad_io_client: - client_free_pages(kc); -bad_client_pages: - destroy_workqueue(kc->kcopyd_wq); -bad_workqueue: - mempool_destroy(kc->job_pool); -bad_slab: - kfree(kc); - - return ERR_PTR(r); -} -EXPORT_SYMBOL(dm_kcopyd_client_create); - -void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) -{ - /* Wait for completion of all jobs submitted by this client. 
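
Taken together, the exported entry points above give a target a very small copy API: create a client once, describe a source and up to DM_KCOPYD_MAX_REGIONS destinations as dm_io_region ranges, and receive a callback carrying the accumulated read and write error bits when all the I/O has landed. The fragment below is a sketch of that call sequence as a target built against this tree might use it; it is kernel code, not a standalone program, copy_one_extent() is an invented helper, and the block devices are assumed to be ones the caller already holds references on.

#define DM_MSG_PREFIX "kcopyd-example"

#include <linux/completion.h>
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>

/* Completion callback: read_err is a flag, write_err is a bitmask of failed
 * destinations, context is whatever was handed to dm_kcopyd_copy(). */
static void copy_done(int read_err, unsigned long write_err, void *context)
{
    if (read_err || write_err)
        DMERR("copy failed (read_err=%d write_err=0x%lx)",
              read_err, write_err);
    complete(context);    /* wake up whoever queued the copy */
}

static int copy_one_extent(struct dm_kcopyd_client *kc,
                           struct block_device *src, struct block_device *dst,
                           sector_t src_sector, sector_t dst_sector,
                           sector_t nr_sectors, struct completion *done)
{
    struct dm_io_region from = {
        .bdev = src,
        .sector = src_sector,
        .count = nr_sectors,
    };
    struct dm_io_region to = {
        .bdev = dst,
        .sector = dst_sector,
        .count = nr_sectors,
    };

    /* One destination, no flags; copy_done() runs when the I/O has landed. */
    return dm_kcopyd_copy(kc, &from, 1, &to, 0, copy_done, done);
}

The client itself would come from dm_kcopyd_client_create() at target construction time and be torn down with dm_kcopyd_client_destroy() in the destructor, which waits for any outstanding jobs first.
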
*/ - wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); - - BUG_ON(!list_empty(&kc->complete_jobs)); - BUG_ON(!list_empty(&kc->io_jobs)); - BUG_ON(!list_empty(&kc->pages_jobs)); - destroy_workqueue(kc->kcopyd_wq); - dm_io_client_destroy(kc->io_client); - client_free_pages(kc); - mempool_destroy(kc->job_pool); - kfree(kc); -} -EXPORT_SYMBOL(dm_kcopyd_client_destroy); diff --git a/ANDROID_3.4.5/drivers/md/dm-linear.c b/ANDROID_3.4.5/drivers/md/dm-linear.c deleted file mode 100644 index 3639eeab..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-linear.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (C) 2001-2003 Sistina Software (UK) Limited. - * - * This file is released under the GPL. - */ - -#include "dm.h" -#include <linux/module.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/bio.h> -#include <linux/slab.h> -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "linear" - -/* - * Linear: maps a linear range of a device. - */ -struct linear_c { - struct dm_dev *dev; - sector_t start; -}; - -/* - * Construct a linear mapping: <dev_path> <offset> - */ -static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - struct linear_c *lc; - unsigned long long tmp; - char dummy; - - if (argc != 2) { - ti->error = "Invalid argument count"; - return -EINVAL; - } - - lc = kmalloc(sizeof(*lc), GFP_KERNEL); - if (lc == NULL) { - ti->error = "dm-linear: Cannot allocate linear context"; - return -ENOMEM; - } - - if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { - ti->error = "dm-linear: Invalid device sector"; - goto bad; - } - lc->start = tmp; - - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) { - ti->error = "dm-linear: Device lookup failed"; - goto bad; - } - - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; - ti->private = lc; - return 0; - - bad: - kfree(lc); - return -EINVAL; -} - -static void linear_dtr(struct dm_target *ti) -{ - struct linear_c *lc = (struct linear_c *) ti->private; - - dm_put_device(ti, lc->dev); - kfree(lc); -} - -static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) -{ - struct linear_c *lc = ti->private; - - return lc->start + dm_target_offset(ti, bi_sector); -} - -static void linear_map_bio(struct dm_target *ti, struct bio *bio) -{ - struct linear_c *lc = ti->private; - - bio->bi_bdev = lc->dev->bdev; - if (bio_sectors(bio)) - bio->bi_sector = linear_map_sector(ti, bio->bi_sector); -} - -static int linear_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - linear_map_bio(ti, bio); - - return DM_MAPIO_REMAPPED; -} - -static int linear_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) -{ - struct linear_c *lc = (struct linear_c *) ti->private; - - switch (type) { - case STATUSTYPE_INFO: - result[0] = '\0'; - break; - - case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s %llu", lc->dev->name, - (unsigned long long)lc->start); - break; - } - return 0; -} - -static int linear_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg) -{ - struct linear_c *lc = (struct linear_c *) ti->private; - struct dm_dev *dev = lc->dev; - int r = 0; - - /* - * Only pass ioctls through if the device sizes match exactly. - */ - if (lc->start || - ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? 
: __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); -} - -static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct linear_c *lc = ti->private; - struct request_queue *q = bdev_get_queue(lc->dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = lc->dev->bdev; - bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - -static int linear_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct linear_c *lc = ti->private; - - return fn(ti, lc->dev, lc->start, ti->len, data); -} - -static struct target_type linear_target = { - .name = "linear", - .version = {1, 1, 0}, - .module = THIS_MODULE, - .ctr = linear_ctr, - .dtr = linear_dtr, - .map = linear_map, - .status = linear_status, - .ioctl = linear_ioctl, - .merge = linear_merge, - .iterate_devices = linear_iterate_devices, -}; - -int __init dm_linear_init(void) -{ - int r = dm_register_target(&linear_target); - - if (r < 0) - DMERR("register failed %d", r); - - return r; -} - -void dm_linear_exit(void) -{ - dm_unregister_target(&linear_target); -} diff --git a/ANDROID_3.4.5/drivers/md/dm-log-userspace-base.c b/ANDROID_3.4.5/drivers/md/dm-log-userspace-base.c deleted file mode 100644 index 9429159d..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-log-userspace-base.c +++ /dev/null @@ -1,818 +0,0 @@ -/* - * Copyright (C) 2006-2009 Red Hat, Inc. - * - * This file is released under the LGPL. - */ - -#include <linux/bio.h> -#include <linux/slab.h> -#include <linux/dm-dirty-log.h> -#include <linux/device-mapper.h> -#include <linux/dm-log-userspace.h> -#include <linux/module.h> - -#include "dm-log-userspace-transfer.h" - -#define DM_LOG_USERSPACE_VSN "1.1.0" - -struct flush_entry { - int type; - region_t region; - struct list_head list; -}; - -/* - * This limit on the number of mark and clear request is, to a degree, - * arbitrary. However, there is some basis for the choice in the limits - * imposed on the size of data payload by dm-log-userspace-transfer.c: - * dm_consult_userspace(). - */ -#define MAX_FLUSH_GROUP_COUNT 32 - -struct log_c { - struct dm_target *ti; - struct dm_dev *log_dev; - uint32_t region_size; - region_t region_count; - uint64_t luid; - char uuid[DM_UUID_LEN]; - - char *usr_argv_str; - uint32_t usr_argc; - - /* - * in_sync_hint gets set when doing is_remote_recovering. It - * represents the first region that needs recovery. IOW, the - * first zero bit of sync_bits. This can be useful for to limit - * traffic for calls like is_remote_recovering and get_resync_work, - * but be take care in its use for anything else. - */ - uint64_t in_sync_hint; - - /* - * Mark and clear requests are held until a flush is issued - * so that we can group, and thereby limit, the amount of - * network traffic between kernel and userspace. The 'flush_lock' - * is used to protect these lists. 
- */ - spinlock_t flush_lock; - struct list_head mark_list; - struct list_head clear_list; -}; - -static mempool_t *flush_entry_pool; - -static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) -{ - return kmalloc(sizeof(struct flush_entry), gfp_mask); -} - -static void flush_entry_free(void *element, void *pool_data) -{ - kfree(element); -} - -static int userspace_do_request(struct log_c *lc, const char *uuid, - int request_type, char *data, size_t data_size, - char *rdata, size_t *rdata_size) -{ - int r; - - /* - * If the server isn't there, -ESRCH is returned, - * and we must keep trying until the server is - * restored. - */ -retry: - r = dm_consult_userspace(uuid, lc->luid, request_type, data, - data_size, rdata, rdata_size); - - if (r != -ESRCH) - return r; - - DMERR(" Userspace log server not found."); - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(2*HZ); - DMWARN("Attempting to contact userspace log server..."); - r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, - lc->usr_argv_str, - strlen(lc->usr_argv_str) + 1, - NULL, NULL); - if (!r) - break; - } - DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); - r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, - 0, NULL, NULL); - if (!r) - goto retry; - - DMERR("Error trying to resume userspace log: %d", r); - - return -ESRCH; -} - -static int build_constructor_string(struct dm_target *ti, - unsigned argc, char **argv, - char **ctr_str) -{ - int i, str_size; - char *str = NULL; - - *ctr_str = NULL; - - for (i = 0, str_size = 0; i < argc; i++) - str_size += strlen(argv[i]) + 1; /* +1 for space between args */ - - str_size += 20; /* Max number of chars in a printed u64 number */ - - str = kzalloc(str_size, GFP_KERNEL); - if (!str) { - DMWARN("Unable to allocate memory for constructor string"); - return -ENOMEM; - } - - str_size = sprintf(str, "%llu", (unsigned long long)ti->len); - for (i = 0; i < argc; i++) - str_size += sprintf(str + str_size, " %s", argv[i]); - - *ctr_str = str; - return str_size; -} - -/* - * userspace_ctr - * - * argv contains: - * <UUID> <other args> - * Where 'other args' is the userspace implementation specific log - * arguments. An example might be: - * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] - * - * So, this module will strip off the <UUID> for identification purposes - * when communicating with userspace about a log; but will pass on everything - * else. 
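
userspace_do_request() above wraps every round trip to the userspace log server in a reconnect loop: -ESRCH means the server is gone, so the code keeps re-sending the saved DM_ULOG_CTR constructor string until a server answers, issues DM_ULOG_RESUME, and only then replays the request that originally failed. Stripped of the dm_consult_userspace() plumbing, the control flow is the loop sketched below; send_request() is a stand-in transport invented for the example (it pretends the server is missing exactly once), and the kernel version also sleeps about two seconds between reconnect attempts.

#include <errno.h>
#include <stdio.h>

enum { CTR_REQUEST = 1, RESUME_REQUEST = 2, FLUSH_REQUEST = 3 };

/* Stand-in transport: pretends the server is absent for the very first call. */
static int send_request(int request_type, const void *data, void *reply)
{
    static int calls;

    (void)request_type;
    (void)data;
    (void)reply;
    return calls++ == 0 ? -ESRCH : 0;
}

static int do_request(int request_type, const void *data, void *reply,
                      const char *ctr_string)
{
    int r;

retry:
    r = send_request(request_type, data, reply);
    if (r != -ESRCH)
        return r;                /* success, or an error the caller deals with */

    /* The server vanished: keep re-registering until one answers... */
    while (send_request(CTR_REQUEST, ctr_string, NULL))
        ;                        /* the kernel sleeps ~2s between attempts */

    /* ...then resume the log and replay the request that failed. */
    if (!send_request(RESUME_REQUEST, NULL, NULL))
        goto retry;

    return -ESRCH;
}

int main(void)
{
    return do_request(FLUSH_REQUEST, NULL, NULL, "example ctr string");
}
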
- */ -static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, - unsigned argc, char **argv) -{ - int r = 0; - int str_size; - char *ctr_str = NULL; - struct log_c *lc = NULL; - uint64_t rdata; - size_t rdata_size = sizeof(rdata); - char *devices_rdata = NULL; - size_t devices_rdata_size = DM_NAME_LEN; - - if (argc < 3) { - DMWARN("Too few arguments to userspace dirty log"); - return -EINVAL; - } - - lc = kzalloc(sizeof(*lc), GFP_KERNEL); - if (!lc) { - DMWARN("Unable to allocate userspace log context."); - return -ENOMEM; - } - - /* The ptr value is sufficient for local unique id */ - lc->luid = (unsigned long)lc; - - lc->ti = ti; - - if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { - DMWARN("UUID argument too long."); - kfree(lc); - return -EINVAL; - } - - strncpy(lc->uuid, argv[0], DM_UUID_LEN); - spin_lock_init(&lc->flush_lock); - INIT_LIST_HEAD(&lc->mark_list); - INIT_LIST_HEAD(&lc->clear_list); - - str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); - if (str_size < 0) { - kfree(lc); - return str_size; - } - - devices_rdata = kzalloc(devices_rdata_size, GFP_KERNEL); - if (!devices_rdata) { - DMERR("Failed to allocate memory for device information"); - r = -ENOMEM; - goto out; - } - - /* - * Send table string and get back any opened device. - */ - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, - ctr_str, str_size, - devices_rdata, &devices_rdata_size); - - if (r < 0) { - if (r == -ESRCH) - DMERR("Userspace log server not found"); - else - DMERR("Userspace log server failed to create log"); - goto out; - } - - /* Since the region size does not change, get it now */ - rdata_size = sizeof(rdata); - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, - NULL, 0, (char *)&rdata, &rdata_size); - - if (r) { - DMERR("Failed to get region size of dirty log"); - goto out; - } - - lc->region_size = (uint32_t)rdata; - lc->region_count = dm_sector_div_up(ti->len, lc->region_size); - - if (devices_rdata_size) { - if (devices_rdata[devices_rdata_size - 1] != '\0') { - DMERR("DM_ULOG_CTR device return string not properly terminated"); - r = -EINVAL; - goto out; - } - r = dm_get_device(ti, devices_rdata, - dm_table_get_mode(ti->table), &lc->log_dev); - if (r) - DMERR("Failed to register %s with device-mapper", - devices_rdata); - } -out: - kfree(devices_rdata); - if (r) { - kfree(lc); - kfree(ctr_str); - } else { - lc->usr_argv_str = ctr_str; - lc->usr_argc = argc; - log->context = lc; - } - - return r; -} - -static void userspace_dtr(struct dm_dirty_log *log) -{ - struct log_c *lc = log->context; - - (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, - NULL, 0, - NULL, NULL); - - if (lc->log_dev) - dm_put_device(lc->ti, lc->log_dev); - - kfree(lc->usr_argv_str); - kfree(lc); - - return; -} - -static int userspace_presuspend(struct dm_dirty_log *log) -{ - int r; - struct log_c *lc = log->context; - - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, - NULL, 0, - NULL, NULL); - - return r; -} - -static int userspace_postsuspend(struct dm_dirty_log *log) -{ - int r; - struct log_c *lc = log->context; - - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, - NULL, 0, - NULL, NULL); - - return r; -} - -static int userspace_resume(struct dm_dirty_log *log) -{ - int r; - struct log_c *lc = log->context; - - lc->in_sync_hint = 0; - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, - NULL, 0, - NULL, NULL); - - return r; -} - -static uint32_t userspace_get_region_size(struct dm_dirty_log *log) -{ - 
struct log_c *lc = log->context; - - return lc->region_size; -} - -/* - * userspace_is_clean - * - * Check whether a region is clean. If there is any sort of - * failure when consulting the server, we return not clean. - * - * Returns: 1 if clean, 0 otherwise - */ -static int userspace_is_clean(struct dm_dirty_log *log, region_t region) -{ - int r; - uint64_t region64 = (uint64_t)region; - int64_t is_clean; - size_t rdata_size; - struct log_c *lc = log->context; - - rdata_size = sizeof(is_clean); - r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, - (char *)®ion64, sizeof(region64), - (char *)&is_clean, &rdata_size); - - return (r) ? 0 : (int)is_clean; -} - -/* - * userspace_in_sync - * - * Check if the region is in-sync. If there is any sort - * of failure when consulting the server, we assume that - * the region is not in sync. - * - * If 'can_block' is set, return immediately - * - * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK - */ -static int userspace_in_sync(struct dm_dirty_log *log, region_t region, - int can_block) -{ - int r; - uint64_t region64 = region; - int64_t in_sync; - size_t rdata_size; - struct log_c *lc = log->context; - - /* - * We can never respond directly - even if in_sync_hint is - * set. This is because another machine could see a device - * failure and mark the region out-of-sync. If we don't go - * to userspace to ask, we might think the region is in-sync - * and allow a read to pick up data that is stale. (This is - * very unlikely if a device actually fails; but it is very - * likely if a connection to one device from one machine fails.) - * - * There still might be a problem if the mirror caches the region - * state as in-sync... but then this call would not be made. So, - * that is a mirror problem. - */ - if (!can_block) - return -EWOULDBLOCK; - - rdata_size = sizeof(in_sync); - r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, - (char *)®ion64, sizeof(region64), - (char *)&in_sync, &rdata_size); - return (r) ? 0 : (int)in_sync; -} - -static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) -{ - int r = 0; - struct flush_entry *fe; - - list_for_each_entry(fe, flush_list, list) { - r = userspace_do_request(lc, lc->uuid, fe->type, - (char *)&fe->region, - sizeof(fe->region), - NULL, NULL); - if (r) - break; - } - - return r; -} - -static int flush_by_group(struct log_c *lc, struct list_head *flush_list) -{ - int r = 0; - int count; - uint32_t type = 0; - struct flush_entry *fe, *tmp_fe; - LIST_HEAD(tmp_list); - uint64_t group[MAX_FLUSH_GROUP_COUNT]; - - /* - * Group process the requests - */ - while (!list_empty(flush_list)) { - count = 0; - - list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { - group[count] = fe->region; - count++; - - list_move(&fe->list, &tmp_list); - - type = fe->type; - if (count >= MAX_FLUSH_GROUP_COUNT) - break; - } - - r = userspace_do_request(lc, lc->uuid, type, - (char *)(group), - count * sizeof(uint64_t), - NULL, NULL); - if (r) { - /* Group send failed. Attempt one-by-one. */ - list_splice_init(&tmp_list, flush_list); - r = flush_one_by_one(lc, flush_list); - break; - } - } - - /* - * Must collect flush_entrys that were successfully processed - * as a group so that they will be free'd by the caller. - */ - list_splice_init(&tmp_list, flush_list); - - return r; -} - -/* - * userspace_flush - * - * This function is ok to block. - * The flush happens in two stages. First, it sends all - * clear/mark requests that are on the list. Then it - * tells the server to commit them. 
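
flush_by_group() above packs the queued mark/clear entries into fixed-size batches so the server sees one request per group rather than one per region. A user-space sketch of that batching loop, assuming a group size of 32 and made-up region numbers; a non-zero send result would trigger the same one-by-one fallback.

#include <stdio.h>

#define GROUP_COUNT 32                  /* stands in for MAX_FLUSH_GROUP_COUNT */

/* One request covering a whole batch of regions. */
static int send_group(const unsigned long long *group, int count)
{
        printf("one request for %d region(s), first=%llu\n", count, group[0]);
        return 0;                       /* non-zero would mean: retry one-by-one */
}

int main(void)
{
        unsigned long long pending[100], group[GROUP_COUNT];
        int n = 100, i = 0, count;

        for (count = 0; count < n; count++)
                pending[count] = count; /* fake region numbers */

        while (i < n) {
                for (count = 0; count < GROUP_COUNT && i < n; count++, i++)
                        group[count] = pending[i];
                if (send_group(group, count))
                        return 1;
        }
        return 0;
}
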
This gives the - * server a chance to optimise the commit, instead of - * doing it for every request. - * - * Additionally, we could implement another thread that - * sends the requests up to the server - reducing the - * load on flush. Then the flush would have less in - * the list and be responsible for the finishing commit. - * - * Returns: 0 on success, < 0 on failure - */ -static int userspace_flush(struct dm_dirty_log *log) -{ - int r = 0; - unsigned long flags; - struct log_c *lc = log->context; - LIST_HEAD(mark_list); - LIST_HEAD(clear_list); - struct flush_entry *fe, *tmp_fe; - - spin_lock_irqsave(&lc->flush_lock, flags); - list_splice_init(&lc->mark_list, &mark_list); - list_splice_init(&lc->clear_list, &clear_list); - spin_unlock_irqrestore(&lc->flush_lock, flags); - - if (list_empty(&mark_list) && list_empty(&clear_list)) - return 0; - - r = flush_by_group(lc, &mark_list); - if (r) - goto fail; - - r = flush_by_group(lc, &clear_list); - if (r) - goto fail; - - r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, - NULL, 0, NULL, NULL); - -fail: - /* - * We can safely remove these entries, even if failure. - * Calling code will receive an error and will know that - * the log facility has failed. - */ - list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { - list_del(&fe->list); - mempool_free(fe, flush_entry_pool); - } - list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { - list_del(&fe->list); - mempool_free(fe, flush_entry_pool); - } - - if (r) - dm_table_event(lc->ti->table); - - return r; -} - -/* - * userspace_mark_region - * - * This function should avoid blocking unless absolutely required. - * (Memory allocation is valid for blocking.) - */ -static void userspace_mark_region(struct dm_dirty_log *log, region_t region) -{ - unsigned long flags; - struct log_c *lc = log->context; - struct flush_entry *fe; - - /* Wait for an allocation, but _never_ fail */ - fe = mempool_alloc(flush_entry_pool, GFP_NOIO); - BUG_ON(!fe); - - spin_lock_irqsave(&lc->flush_lock, flags); - fe->type = DM_ULOG_MARK_REGION; - fe->region = region; - list_add(&fe->list, &lc->mark_list); - spin_unlock_irqrestore(&lc->flush_lock, flags); - - return; -} - -/* - * userspace_clear_region - * - * This function must not block. - * So, the alloc can't block. In the worst case, it is ok to - * fail. It would simply mean we can't clear the region. - * Does nothing to current sync context, but does mean - * the region will be re-sync'ed on a reload of the mirror - * even though it is in-sync. - */ -static void userspace_clear_region(struct dm_dirty_log *log, region_t region) -{ - unsigned long flags; - struct log_c *lc = log->context; - struct flush_entry *fe; - - /* - * If we fail to allocate, we skip the clearing of - * the region. This doesn't hurt us in any way, except - * to cause the region to be resync'ed when the - * device is activated next time. - */ - fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); - if (!fe) { - DMERR("Failed to allocate memory to clear region."); - return; - } - - spin_lock_irqsave(&lc->flush_lock, flags); - fe->type = DM_ULOG_CLEAR_REGION; - fe->region = region; - list_add(&fe->list, &lc->clear_list); - spin_unlock_irqrestore(&lc->flush_lock, flags); - - return; -} - -/* - * userspace_get_resync_work - * - * Get a region that needs recovery. It is valid to return - * an error for this function. 
- * - * Returns: 1 if region filled, 0 if no work, <0 on error - */ -static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) -{ - int r; - size_t rdata_size; - struct log_c *lc = log->context; - struct { - int64_t i; /* 64-bit for mix arch compatibility */ - region_t r; - } pkg; - - if (lc->in_sync_hint >= lc->region_count) - return 0; - - rdata_size = sizeof(pkg); - r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, - NULL, 0, - (char *)&pkg, &rdata_size); - - *region = pkg.r; - return (r) ? r : (int)pkg.i; -} - -/* - * userspace_set_region_sync - * - * Set the sync status of a given region. This function - * must not fail. - */ -static void userspace_set_region_sync(struct dm_dirty_log *log, - region_t region, int in_sync) -{ - int r; - struct log_c *lc = log->context; - struct { - region_t r; - int64_t i; - } pkg; - - pkg.r = region; - pkg.i = (int64_t)in_sync; - - r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, - (char *)&pkg, sizeof(pkg), - NULL, NULL); - - /* - * It would be nice to be able to report failures. - * However, it is easy emough to detect and resolve. - */ - return; -} - -/* - * userspace_get_sync_count - * - * If there is any sort of failure when consulting the server, - * we assume that the sync count is zero. - * - * Returns: sync count on success, 0 on failure - */ -static region_t userspace_get_sync_count(struct dm_dirty_log *log) -{ - int r; - size_t rdata_size; - uint64_t sync_count; - struct log_c *lc = log->context; - - rdata_size = sizeof(sync_count); - r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, - NULL, 0, - (char *)&sync_count, &rdata_size); - - if (r) - return 0; - - if (sync_count >= lc->region_count) - lc->in_sync_hint = lc->region_count; - - return (region_t)sync_count; -} - -/* - * userspace_status - * - * Returns: amount of space consumed - */ -static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, - char *result, unsigned maxlen) -{ - int r = 0; - char *table_args; - size_t sz = (size_t)maxlen; - struct log_c *lc = log->context; - - switch (status_type) { - case STATUSTYPE_INFO: - r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, - NULL, 0, - result, &sz); - - if (r) { - sz = 0; - DMEMIT("%s 1 COM_FAILURE", log->type->name); - } - break; - case STATUSTYPE_TABLE: - sz = 0; - table_args = strchr(lc->usr_argv_str, ' '); - BUG_ON(!table_args); /* There will always be a ' ' */ - table_args++; - - DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, - lc->uuid, table_args); - break; - } - return (r) ? 0 : (int)sz; -} - -/* - * userspace_is_remote_recovering - * - * Returns: 1 if region recovering, 0 otherwise - */ -static int userspace_is_remote_recovering(struct dm_dirty_log *log, - region_t region) -{ - int r; - uint64_t region64 = region; - struct log_c *lc = log->context; - static unsigned long long limit; - struct { - int64_t is_recovering; - uint64_t in_sync_hint; - } pkg; - size_t rdata_size = sizeof(pkg); - - /* - * Once the mirror has been reported to be in-sync, - * it will never again ask for recovery work. So, - * we can safely say there is not a remote machine - * recovering if the device is in-sync. (in_sync_hint - * must be reset at resume time.) 
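
The body just below throttles the server round trip: between real DM_ULOG_IS_REMOTE_RECOVERING queries it keeps answering "recovering" for up to HZ/4 jiffies. A user-space analogue of that throttle, using a 250 ms window and CLOCK_MONOTONIC in place of jiffies.

#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Ask the server at most once per 250 ms; otherwise give the
 * conservative answer ("still recovering"), as the kernel code does. */
static bool remote_recovering(void)
{
        static struct timespec limit;   /* earliest time we may ask again */
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        if (now.tv_sec < limit.tv_sec ||
            (now.tv_sec == limit.tv_sec && now.tv_nsec < limit.tv_nsec))
                return true;            /* too soon: assume the worst */

        limit = now;
        limit.tv_nsec += 250L * 1000 * 1000;
        if (limit.tv_nsec >= 1000000000L) {
                limit.tv_sec++;
                limit.tv_nsec -= 1000000000L;
        }

        /* ...the real query would be sent here... */
        return false;
}

int main(void)
{
        bool a = remote_recovering();
        bool b = remote_recovering();

        printf("%d %d\n", a, b);        /* 0 1: first call asks, second is throttled */
        return 0;
}
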
- */ - if (region < lc->in_sync_hint) - return 0; - else if (jiffies < limit) - return 1; - - limit = jiffies + (HZ / 4); - r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, - (char *)®ion64, sizeof(region64), - (char *)&pkg, &rdata_size); - if (r) - return 1; - - lc->in_sync_hint = pkg.in_sync_hint; - - return (int)pkg.is_recovering; -} - -static struct dm_dirty_log_type _userspace_type = { - .name = "userspace", - .module = THIS_MODULE, - .ctr = userspace_ctr, - .dtr = userspace_dtr, - .presuspend = userspace_presuspend, - .postsuspend = userspace_postsuspend, - .resume = userspace_resume, - .get_region_size = userspace_get_region_size, - .is_clean = userspace_is_clean, - .in_sync = userspace_in_sync, - .flush = userspace_flush, - .mark_region = userspace_mark_region, - .clear_region = userspace_clear_region, - .get_resync_work = userspace_get_resync_work, - .set_region_sync = userspace_set_region_sync, - .get_sync_count = userspace_get_sync_count, - .status = userspace_status, - .is_remote_recovering = userspace_is_remote_recovering, -}; - -static int __init userspace_dirty_log_init(void) -{ - int r = 0; - - flush_entry_pool = mempool_create(100, flush_entry_alloc, - flush_entry_free, NULL); - - if (!flush_entry_pool) { - DMWARN("Unable to create flush_entry_pool: No memory."); - return -ENOMEM; - } - - r = dm_ulog_tfr_init(); - if (r) { - DMWARN("Unable to initialize userspace log communications"); - mempool_destroy(flush_entry_pool); - return r; - } - - r = dm_dirty_log_type_register(&_userspace_type); - if (r) { - DMWARN("Couldn't register userspace dirty log type"); - dm_ulog_tfr_exit(); - mempool_destroy(flush_entry_pool); - return r; - } - - DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); - return 0; -} - -static void __exit userspace_dirty_log_exit(void) -{ - dm_dirty_log_type_unregister(&_userspace_type); - dm_ulog_tfr_exit(); - mempool_destroy(flush_entry_pool); - - DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); - return; -} - -module_init(userspace_dirty_log_init); -module_exit(userspace_dirty_log_exit); - -MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); -MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-log-userspace-transfer.c b/ANDROID_3.4.5/drivers/md/dm-log-userspace-transfer.c deleted file mode 100644 index 08d9a207..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-log-userspace-transfer.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Copyright (C) 2006-2009 Red Hat, Inc. - * - * This file is released under the LGPL. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <net/sock.h> -#include <linux/workqueue.h> -#include <linux/connector.h> -#include <linux/device-mapper.h> -#include <linux/dm-log-userspace.h> - -#include "dm-log-userspace-transfer.h" - -static uint32_t dm_ulog_seq; - -/* - * Netlink/Connector is an unreliable protocol. How long should - * we wait for a response before assuming it was lost and retrying? - * (If we do receive a response after this time, it will be discarded - * and the response to the resent request will be waited for. 
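
Because the transport can drop messages, dm_consult_userspace() below waits a bounded time, then resends under a fresh sequence number; a late reply to the old number no longer matches any waiter and is dropped. A compact user-space sketch of that retry shape, with the lost first send simulated.

#include <stdio.h>

/* Pretend transport: the first request is "lost" (no reply in time),
 * every later one is answered.  Returns 1 if a matching reply arrived
 * within the timeout, 0 otherwise. */
static int send_and_wait(unsigned seq)
{
        static int sends;

        (void)seq;                      /* a real implementation would match on this */
        return sends++ != 0;
}

int main(void)
{
        unsigned next_seq = 0;

        for (;;) {
                unsigned seq = next_seq++;      /* fresh sequence number per attempt */

                if (send_and_wait(seq)) {
                        printf("reply matched seq %u\n", seq);
                        break;
                }
                /* Timed out: fall through and resend.  A reply to the old
                 * seq would find no waiter and simply be discarded. */
        }
        return 0;
}
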
- */ -#define DM_ULOG_RETRY_TIMEOUT (15 * HZ) - -/* - * Pre-allocated space for speed - */ -#define DM_ULOG_PREALLOCED_SIZE 512 -static struct cn_msg *prealloced_cn_msg; -static struct dm_ulog_request *prealloced_ulog_tfr; - -static struct cb_id ulog_cn_id = { - .idx = CN_IDX_DM, - .val = CN_VAL_DM_USERSPACE_LOG -}; - -static DEFINE_MUTEX(dm_ulog_lock); - -struct receiving_pkg { - struct list_head list; - struct completion complete; - - uint32_t seq; - - int error; - size_t *data_size; - char *data; -}; - -static DEFINE_SPINLOCK(receiving_list_lock); -static struct list_head receiving_list; - -static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) -{ - int r; - struct cn_msg *msg = prealloced_cn_msg; - - memset(msg, 0, sizeof(struct cn_msg)); - - msg->id.idx = ulog_cn_id.idx; - msg->id.val = ulog_cn_id.val; - msg->ack = 0; - msg->seq = tfr->seq; - msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; - - r = cn_netlink_send(msg, 0, gfp_any()); - - return r; -} - -/* - * Parameters for this function can be either msg or tfr, but not - * both. This function fills in the reply for a waiting request. - * If just msg is given, then the reply is simply an ACK from userspace - * that the request was received. - * - * Returns: 0 on success, -ENOENT on failure - */ -static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) -{ - uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; - struct receiving_pkg *pkg; - - /* - * The 'receiving_pkg' entries in this list are statically - * allocated on the stack in 'dm_consult_userspace'. - * Each process that is waiting for a reply from the user - * space server will have an entry in this list. - * - * We are safe to do it this way because the stack space - * is unique to each process, but still addressable by - * other processes. - */ - list_for_each_entry(pkg, &receiving_list, list) { - if (rtn_seq != pkg->seq) - continue; - - if (msg) { - pkg->error = -msg->ack; - /* - * If we are trying again, we will need to know our - * storage capacity. Otherwise, along with the - * error code, we make explicit that we have no data. - */ - if (pkg->error != -EAGAIN) - *(pkg->data_size) = 0; - } else if (tfr->data_size > *(pkg->data_size)) { - DMERR("Insufficient space to receive package [%u] " - "(%u vs %zu)", tfr->request_type, - tfr->data_size, *(pkg->data_size)); - - *(pkg->data_size) = 0; - pkg->error = -ENOSPC; - } else { - pkg->error = tfr->error; - memcpy(pkg->data, tfr->data, tfr->data_size); - *(pkg->data_size) = tfr->data_size; - } - complete(&pkg->complete); - return 0; - } - - return -ENOENT; -} - -/* - * This is the connector callback that delivers data - * that was sent from userspace. 
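
fill_pkg() above pairs each reply with the waiter that posted the matching sequence number, copying the payload only if the waiter's buffer is large enough. A user-space sketch of that matching step, with an array standing in for the kernel's receiving_list and the completion replaced by a plain return.

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct waiting_pkg {
        unsigned seq;
        int error;
        size_t *data_size;      /* in: capacity, out: bytes actually copied */
        char *data;
};

/* Find the waiter whose sequence number matches the reply and either copy
 * the payload or report that the caller's buffer is too small. */
static int deliver(struct waiting_pkg *pkgs, int npkgs,
                   unsigned seq, const char *payload, size_t len)
{
        int i;

        for (i = 0; i < npkgs; i++) {
                struct waiting_pkg *pkg = &pkgs[i];

                if (pkg->seq != seq)
                        continue;
                if (len > *pkg->data_size) {
                        *pkg->data_size = 0;
                        pkg->error = -ENOSPC;   /* reply larger than the buffer */
                } else {
                        memcpy(pkg->data, payload, len);
                        *pkg->data_size = len;
                        pkg->error = 0;
                }
                return 0;               /* in the kernel: complete(&pkg->complete) */
        }
        return -ENOENT;                 /* no waiter for this sequence number */
}

int main(void)
{
        char buf[16];
        size_t cap = sizeof(buf);
        struct waiting_pkg pkg = { .seq = 7, .data_size = &cap, .data = buf };

        deliver(&pkg, 1, 7, "ok", 3);
        printf("err=%d size=%zu data=%s\n", pkg.error, cap, buf);
        return 0;
}
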
- */ -static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) -{ - struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); - - if (!capable(CAP_SYS_ADMIN)) - return; - - spin_lock(&receiving_list_lock); - if (msg->len == 0) - fill_pkg(msg, NULL); - else if (msg->len < sizeof(*tfr)) - DMERR("Incomplete message received (expected %u, got %u): [%u]", - (unsigned)sizeof(*tfr), msg->len, msg->seq); - else - fill_pkg(NULL, tfr); - spin_unlock(&receiving_list_lock); -} - -/** - * dm_consult_userspace - * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size) - * @luid: log's local unique identifier - * @request_type: found in include/linux/dm-log-userspace.h - * @data: data to tx to the server - * @data_size: size of data in bytes - * @rdata: place to put return data from server - * @rdata_size: value-result (amount of space given/amount of space used) - * - * rdata_size is undefined on failure. - * - * Memory used to communicate with userspace is zero'ed - * before populating to ensure that no unwanted bits leak - * from kernel space to user-space. All userspace log communications - * between kernel and user space go through this function. - * - * Returns: 0 on success, -EXXX on failure - **/ -int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, - char *data, size_t data_size, - char *rdata, size_t *rdata_size) -{ - int r = 0; - size_t dummy = 0; - int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); - struct dm_ulog_request *tfr = prealloced_ulog_tfr; - struct receiving_pkg pkg; - - /* - * Given the space needed to hold the 'struct cn_msg' and - * 'struct dm_ulog_request' - do we have enough payload - * space remaining? - */ - if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { - DMINFO("Size of tfr exceeds preallocated size"); - return -EINVAL; - } - - if (!rdata_size) - rdata_size = &dummy; -resend: - /* - * We serialize the sending of requests so we can - * use the preallocated space. - */ - mutex_lock(&dm_ulog_lock); - - memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); - memcpy(tfr->uuid, uuid, DM_UUID_LEN); - tfr->version = DM_ULOG_REQUEST_VERSION; - tfr->luid = luid; - tfr->seq = dm_ulog_seq++; - - /* - * Must be valid request type (all other bits set to - * zero). This reserves other bits for possible future - * use. - */ - tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; - - tfr->data_size = data_size; - if (data && data_size) - memcpy(tfr->data, data, data_size); - - memset(&pkg, 0, sizeof(pkg)); - init_completion(&pkg.complete); - pkg.seq = tfr->seq; - pkg.data_size = rdata_size; - pkg.data = rdata; - spin_lock(&receiving_list_lock); - list_add(&(pkg.list), &receiving_list); - spin_unlock(&receiving_list_lock); - - r = dm_ulog_sendto_server(tfr); - - mutex_unlock(&dm_ulog_lock); - - if (r) { - DMERR("Unable to send log request [%u] to userspace: %d", - request_type, r); - spin_lock(&receiving_list_lock); - list_del_init(&(pkg.list)); - spin_unlock(&receiving_list_lock); - - goto out; - } - - r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); - spin_lock(&receiving_list_lock); - list_del_init(&(pkg.list)); - spin_unlock(&receiving_list_lock); - if (!r) { - DMWARN("[%s] Request timed out: [%u/%u] - retrying", - (strlen(uuid) > 8) ? 
- (uuid + (strlen(uuid) - 8)) : (uuid), - request_type, pkg.seq); - goto resend; - } - - r = pkg.error; - if (r == -EAGAIN) - goto resend; - -out: - return r; -} - -int dm_ulog_tfr_init(void) -{ - int r; - void *prealloced; - - INIT_LIST_HEAD(&receiving_list); - - prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); - if (!prealloced) - return -ENOMEM; - - prealloced_cn_msg = prealloced; - prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); - - r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); - if (r) { - cn_del_callback(&ulog_cn_id); - return r; - } - - return 0; -} - -void dm_ulog_tfr_exit(void) -{ - cn_del_callback(&ulog_cn_id); - kfree(prealloced_cn_msg); -} diff --git a/ANDROID_3.4.5/drivers/md/dm-log-userspace-transfer.h b/ANDROID_3.4.5/drivers/md/dm-log-userspace-transfer.h deleted file mode 100644 index 04ee874f..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-log-userspace-transfer.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (C) 2006-2009 Red Hat, Inc. - * - * This file is released under the LGPL. - */ - -#ifndef __DM_LOG_USERSPACE_TRANSFER_H__ -#define __DM_LOG_USERSPACE_TRANSFER_H__ - -#define DM_MSG_PREFIX "dm-log-userspace" - -int dm_ulog_tfr_init(void); -void dm_ulog_tfr_exit(void); -int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, - char *data, size_t data_size, - char *rdata, size_t *rdata_size); - -#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ diff --git a/ANDROID_3.4.5/drivers/md/dm-log.c b/ANDROID_3.4.5/drivers/md/dm-log.c deleted file mode 100644 index 65ebaebf..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-log.c +++ /dev/null @@ -1,897 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This file is released under the LGPL. - */ - -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/vmalloc.h> -#include <linux/dm-io.h> -#include <linux/dm-dirty-log.h> - -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "dirty region log" - -static LIST_HEAD(_log_types); -static DEFINE_SPINLOCK(_lock); - -static struct dm_dirty_log_type *__find_dirty_log_type(const char *name) -{ - struct dm_dirty_log_type *log_type; - - list_for_each_entry(log_type, &_log_types, list) - if (!strcmp(name, log_type->name)) - return log_type; - - return NULL; -} - -static struct dm_dirty_log_type *_get_dirty_log_type(const char *name) -{ - struct dm_dirty_log_type *log_type; - - spin_lock(&_lock); - - log_type = __find_dirty_log_type(name); - if (log_type && !try_module_get(log_type->module)) - log_type = NULL; - - spin_unlock(&_lock); - - return log_type; -} - -/* - * get_type - * @type_name - * - * Attempt to retrieve the dm_dirty_log_type by name. If not already - * available, attempt to load the appropriate module. - * - * Log modules are named "dm-log-" followed by the 'type_name'. - * Modules may contain multiple types. - * This function will first try the module "dm-log-<type_name>", - * then truncate 'type_name' on the last '-' and try again. - * - * For example, if type_name was "clustered-disk", it would search - * 'dm-log-clustered-disk' then 'dm-log-clustered'. 
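
A user-space sketch of the name-truncation fallback described above: copy the type name, try the full module name, then keep chopping at the last '-' until a module loads or nothing is left to chop. Here every load attempt is assumed to fail, so the full fallback chain is printed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void try_modules(const char *type_name)
{
        char *dup = malloc(strlen(type_name) + 1);
        char *p;

        if (!dup)
                return;
        strcpy(dup, type_name);

        for (;;) {
                printf("request_module(\"dm-log-%s\")\n", dup);
                p = strrchr(dup, '-');
                if (!p)
                        break;          /* nothing left to strip: give up */
                *p = '\0';              /* assume the load failed; truncate and retry */
        }
        free(dup);
}

int main(void)
{
        try_modules("clustered-disk");
        /* prints: dm-log-clustered-disk, then dm-log-clustered */
        return 0;
}
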
- * - * Returns: dirty_log_type* on success, NULL on failure - */ -static struct dm_dirty_log_type *get_type(const char *type_name) -{ - char *p, *type_name_dup; - struct dm_dirty_log_type *log_type; - - if (!type_name) - return NULL; - - log_type = _get_dirty_log_type(type_name); - if (log_type) - return log_type; - - type_name_dup = kstrdup(type_name, GFP_KERNEL); - if (!type_name_dup) { - DMWARN("No memory left to attempt log module load for \"%s\"", - type_name); - return NULL; - } - - while (request_module("dm-log-%s", type_name_dup) || - !(log_type = _get_dirty_log_type(type_name))) { - p = strrchr(type_name_dup, '-'); - if (!p) - break; - p[0] = '\0'; - } - - if (!log_type) - DMWARN("Module for logging type \"%s\" not found.", type_name); - - kfree(type_name_dup); - - return log_type; -} - -static void put_type(struct dm_dirty_log_type *type) -{ - if (!type) - return; - - spin_lock(&_lock); - if (!__find_dirty_log_type(type->name)) - goto out; - - module_put(type->module); - -out: - spin_unlock(&_lock); -} - -int dm_dirty_log_type_register(struct dm_dirty_log_type *type) -{ - int r = 0; - - spin_lock(&_lock); - if (!__find_dirty_log_type(type->name)) - list_add(&type->list, &_log_types); - else - r = -EEXIST; - spin_unlock(&_lock); - - return r; -} -EXPORT_SYMBOL(dm_dirty_log_type_register); - -int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) -{ - spin_lock(&_lock); - - if (!__find_dirty_log_type(type->name)) { - spin_unlock(&_lock); - return -EINVAL; - } - - list_del(&type->list); - - spin_unlock(&_lock); - - return 0; -} -EXPORT_SYMBOL(dm_dirty_log_type_unregister); - -struct dm_dirty_log *dm_dirty_log_create(const char *type_name, - struct dm_target *ti, - int (*flush_callback_fn)(struct dm_target *ti), - unsigned int argc, char **argv) -{ - struct dm_dirty_log_type *type; - struct dm_dirty_log *log; - - log = kmalloc(sizeof(*log), GFP_KERNEL); - if (!log) - return NULL; - - type = get_type(type_name); - if (!type) { - kfree(log); - return NULL; - } - - log->flush_callback_fn = flush_callback_fn; - log->type = type; - if (type->ctr(log, ti, argc, argv)) { - kfree(log); - put_type(type); - return NULL; - } - - return log; -} -EXPORT_SYMBOL(dm_dirty_log_create); - -void dm_dirty_log_destroy(struct dm_dirty_log *log) -{ - log->type->dtr(log); - put_type(log->type); - kfree(log); -} -EXPORT_SYMBOL(dm_dirty_log_destroy); - -/*----------------------------------------------------------------- - * Persistent and core logs share a lot of their implementation. - * FIXME: need a reload method to be called from a resume - *---------------------------------------------------------------*/ -/* - * Magic for persistent mirrors: "MiRr" - */ -#define MIRROR_MAGIC 0x4D695272 - -/* - * The on-disk version of the metadata. - */ -#define MIRROR_DISK_VERSION 2 -#define LOG_OFFSET 2 - -struct log_header_disk { - __le32 magic; - - /* - * Simple, incrementing version. no backward - * compatibility. 
- */ - __le32 version; - __le64 nr_regions; -} __packed; - -struct log_header_core { - uint32_t magic; - uint32_t version; - uint64_t nr_regions; -}; - -struct log_c { - struct dm_target *ti; - int touched_dirtied; - int touched_cleaned; - int flush_failed; - uint32_t region_size; - unsigned int region_count; - region_t sync_count; - - unsigned bitset_uint32_count; - uint32_t *clean_bits; - uint32_t *sync_bits; - uint32_t *recovering_bits; /* FIXME: this seems excessive */ - - int sync_search; - - /* Resync flag */ - enum sync { - DEFAULTSYNC, /* Synchronize if necessary */ - NOSYNC, /* Devices known to be already in sync */ - FORCESYNC, /* Force a sync to happen */ - } sync; - - struct dm_io_request io_req; - - /* - * Disk log fields - */ - int log_dev_failed; - int log_dev_flush_failed; - struct dm_dev *log_dev; - struct log_header_core header; - - struct dm_io_region header_location; - struct log_header_disk *disk_header; -}; - -/* - * The touched member needs to be updated every time we access - * one of the bitsets. - */ -static inline int log_test_bit(uint32_t *bs, unsigned bit) -{ - return test_bit_le(bit, bs) ? 1 : 0; -} - -static inline void log_set_bit(struct log_c *l, - uint32_t *bs, unsigned bit) -{ - __set_bit_le(bit, bs); - l->touched_cleaned = 1; -} - -static inline void log_clear_bit(struct log_c *l, - uint32_t *bs, unsigned bit) -{ - __clear_bit_le(bit, bs); - l->touched_dirtied = 1; -} - -/*---------------------------------------------------------------- - * Header IO - *--------------------------------------------------------------*/ -static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk) -{ - disk->magic = cpu_to_le32(core->magic); - disk->version = cpu_to_le32(core->version); - disk->nr_regions = cpu_to_le64(core->nr_regions); -} - -static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk) -{ - core->magic = le32_to_cpu(disk->magic); - core->version = le32_to_cpu(disk->version); - core->nr_regions = le64_to_cpu(disk->nr_regions); -} - -static int rw_header(struct log_c *lc, int rw) -{ - lc->io_req.bi_rw = rw; - - return dm_io(&lc->io_req, 1, &lc->header_location, NULL); -} - -static int flush_header(struct log_c *lc) -{ - struct dm_io_region null_location = { - .bdev = lc->header_location.bdev, - .sector = 0, - .count = 0, - }; - - lc->io_req.bi_rw = WRITE_FLUSH; - - return dm_io(&lc->io_req, 1, &null_location, NULL); -} - -static int read_header(struct log_c *log) -{ - int r; - - r = rw_header(log, READ); - if (r) - return r; - - header_from_disk(&log->header, log->disk_header); - - /* New log required? 
*/ - if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { - log->header.magic = MIRROR_MAGIC; - log->header.version = MIRROR_DISK_VERSION; - log->header.nr_regions = 0; - } - -#ifdef __LITTLE_ENDIAN - if (log->header.version == 1) - log->header.version = 2; -#endif - - if (log->header.version != MIRROR_DISK_VERSION) { - DMWARN("incompatible disk log version"); - return -EINVAL; - } - - return 0; -} - -static int _check_region_size(struct dm_target *ti, uint32_t region_size) -{ - if (region_size < 2 || region_size > ti->len) - return 0; - - if (!is_power_of_2(region_size)) - return 0; - - return 1; -} - -/*---------------------------------------------------------------- - * core log constructor/destructor - * - * argv contains region_size followed optionally by [no]sync - *--------------------------------------------------------------*/ -#define BYTE_SHIFT 3 -static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv, - struct dm_dev *dev) -{ - enum sync sync = DEFAULTSYNC; - - struct log_c *lc; - uint32_t region_size; - unsigned int region_count; - size_t bitset_size, buf_size; - int r; - char dummy; - - if (argc < 1 || argc > 2) { - DMWARN("wrong number of arguments to dirty region log"); - return -EINVAL; - } - - if (argc > 1) { - if (!strcmp(argv[1], "sync")) - sync = FORCESYNC; - else if (!strcmp(argv[1], "nosync")) - sync = NOSYNC; - else { - DMWARN("unrecognised sync argument to " - "dirty region log: %s", argv[1]); - return -EINVAL; - } - } - - if (sscanf(argv[0], "%u%c", ®ion_size, &dummy) != 1 || - !_check_region_size(ti, region_size)) { - DMWARN("invalid region size %s", argv[0]); - return -EINVAL; - } - - region_count = dm_sector_div_up(ti->len, region_size); - - lc = kmalloc(sizeof(*lc), GFP_KERNEL); - if (!lc) { - DMWARN("couldn't allocate core log"); - return -ENOMEM; - } - - lc->ti = ti; - lc->touched_dirtied = 0; - lc->touched_cleaned = 0; - lc->flush_failed = 0; - lc->region_size = region_size; - lc->region_count = region_count; - lc->sync = sync; - - /* - * Work out how many "unsigned long"s we need to hold the bitset. - */ - bitset_size = dm_round_up(region_count, - sizeof(*lc->clean_bits) << BYTE_SHIFT); - bitset_size >>= BYTE_SHIFT; - - lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits); - - /* - * Disk log? - */ - if (!dev) { - lc->clean_bits = vmalloc(bitset_size); - if (!lc->clean_bits) { - DMWARN("couldn't allocate clean bitset"); - kfree(lc); - return -ENOMEM; - } - lc->disk_header = NULL; - } else { - lc->log_dev = dev; - lc->log_dev_failed = 0; - lc->log_dev_flush_failed = 0; - lc->header_location.bdev = lc->log_dev->bdev; - lc->header_location.sector = 0; - - /* - * Buffer holds both header and bitset. - */ - buf_size = - dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, - bdev_logical_block_size(lc->header_location. 
- bdev)); - - if (buf_size > i_size_read(dev->bdev->bd_inode)) { - DMWARN("log device %s too small: need %llu bytes", - dev->name, (unsigned long long)buf_size); - kfree(lc); - return -EINVAL; - } - - lc->header_location.count = buf_size >> SECTOR_SHIFT; - - lc->io_req.mem.type = DM_IO_VMA; - lc->io_req.notify.fn = NULL; - lc->io_req.client = dm_io_client_create(); - if (IS_ERR(lc->io_req.client)) { - r = PTR_ERR(lc->io_req.client); - DMWARN("couldn't allocate disk io client"); - kfree(lc); - return r; - } - - lc->disk_header = vmalloc(buf_size); - if (!lc->disk_header) { - DMWARN("couldn't allocate disk log buffer"); - dm_io_client_destroy(lc->io_req.client); - kfree(lc); - return -ENOMEM; - } - - lc->io_req.mem.ptr.vma = lc->disk_header; - lc->clean_bits = (void *)lc->disk_header + - (LOG_OFFSET << SECTOR_SHIFT); - } - - memset(lc->clean_bits, -1, bitset_size); - - lc->sync_bits = vmalloc(bitset_size); - if (!lc->sync_bits) { - DMWARN("couldn't allocate sync bitset"); - if (!dev) - vfree(lc->clean_bits); - else - dm_io_client_destroy(lc->io_req.client); - vfree(lc->disk_header); - kfree(lc); - return -ENOMEM; - } - memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); - lc->sync_count = (sync == NOSYNC) ? region_count : 0; - - lc->recovering_bits = vzalloc(bitset_size); - if (!lc->recovering_bits) { - DMWARN("couldn't allocate sync bitset"); - vfree(lc->sync_bits); - if (!dev) - vfree(lc->clean_bits); - else - dm_io_client_destroy(lc->io_req.client); - vfree(lc->disk_header); - kfree(lc); - return -ENOMEM; - } - lc->sync_search = 0; - log->context = lc; - - return 0; -} - -static int core_ctr(struct dm_dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv) -{ - return create_log_context(log, ti, argc, argv, NULL); -} - -static void destroy_log_context(struct log_c *lc) -{ - vfree(lc->sync_bits); - vfree(lc->recovering_bits); - kfree(lc); -} - -static void core_dtr(struct dm_dirty_log *log) -{ - struct log_c *lc = (struct log_c *) log->context; - - vfree(lc->clean_bits); - destroy_log_context(lc); -} - -/*---------------------------------------------------------------- - * disk log constructor/destructor - * - * argv contains log_device region_size followed optionally by [no]sync - *--------------------------------------------------------------*/ -static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv) -{ - int r; - struct dm_dev *dev; - - if (argc < 2 || argc > 3) { - DMWARN("wrong number of arguments to disk dirty region log"); - return -EINVAL; - } - - r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); - if (r) - return r; - - r = create_log_context(log, ti, argc - 1, argv + 1, dev); - if (r) { - dm_put_device(ti, dev); - return r; - } - - return 0; -} - -static void disk_dtr(struct dm_dirty_log *log) -{ - struct log_c *lc = (struct log_c *) log->context; - - dm_put_device(lc->ti, lc->log_dev); - vfree(lc->disk_header); - dm_io_client_destroy(lc->io_req.client); - destroy_log_context(lc); -} - -static int count_bits32(uint32_t *addr, unsigned size) -{ - int count = 0, i; - - for (i = 0; i < size; i++) { - count += hweight32(*(addr+i)); - } - return count; -} - -static void fail_log_device(struct log_c *lc) -{ - if (lc->log_dev_failed) - return; - - lc->log_dev_failed = 1; - dm_table_event(lc->ti->table); -} - -static int disk_resume(struct dm_dirty_log *log) -{ - int r; - unsigned i; - struct log_c *lc = (struct log_c *) log->context; - size_t size = lc->bitset_uint32_count * sizeof(uint32_t); - - 
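
The bitset sizing in create_log_context() above rounds the region count up to a whole number of 32-bit words before converting to bytes. A worked example with made-up numbers (dm_round_up(n, m) rounds n up to the next multiple of m):

#include <stdio.h>

int main(void)
{
        unsigned long long ti_len = 409600;     /* target length in sectors (made up) */
        unsigned region_size = 1024;            /* sectors per region (made up) */
        unsigned region_count = (ti_len + region_size - 1) / region_size;  /* 400 */

        unsigned bits_per_word = 32;            /* sizeof(uint32_t) << BYTE_SHIFT */
        unsigned bitset_bits = ((region_count + bits_per_word - 1) / bits_per_word)
                               * bits_per_word; /* 416 bits */
        unsigned bitset_bytes = bitset_bits >> 3;       /* 52 bytes */

        printf("%u regions -> %u-bit bitset -> %u bytes (%u uint32_t words)\n",
               region_count, bitset_bits, bitset_bytes, bitset_bytes / 4);
        return 0;
}
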
/* read the disk header */ - r = read_header(lc); - if (r) { - DMWARN("%s: Failed to read header on dirty region log device", - lc->log_dev->name); - fail_log_device(lc); - /* - * If the log device cannot be read, we must assume - * all regions are out-of-sync. If we simply return - * here, the state will be uninitialized and could - * lead us to return 'in-sync' status for regions - * that are actually 'out-of-sync'. - */ - lc->header.nr_regions = 0; - } - - /* set or clear any new bits -- device has grown */ - if (lc->sync == NOSYNC) - for (i = lc->header.nr_regions; i < lc->region_count; i++) - /* FIXME: amazingly inefficient */ - log_set_bit(lc, lc->clean_bits, i); - else - for (i = lc->header.nr_regions; i < lc->region_count; i++) - /* FIXME: amazingly inefficient */ - log_clear_bit(lc, lc->clean_bits, i); - - /* clear any old bits -- device has shrunk */ - for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++) - log_clear_bit(lc, lc->clean_bits, i); - - /* copy clean across to sync */ - memcpy(lc->sync_bits, lc->clean_bits, size); - lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); - lc->sync_search = 0; - - /* set the correct number of regions in the header */ - lc->header.nr_regions = lc->region_count; - - header_to_disk(&lc->header, lc->disk_header); - - /* write the new header */ - r = rw_header(lc, WRITE); - if (!r) { - r = flush_header(lc); - if (r) - lc->log_dev_flush_failed = 1; - } - if (r) { - DMWARN("%s: Failed to write header on dirty region log device", - lc->log_dev->name); - fail_log_device(lc); - } - - return r; -} - -static uint32_t core_get_region_size(struct dm_dirty_log *log) -{ - struct log_c *lc = (struct log_c *) log->context; - return lc->region_size; -} - -static int core_resume(struct dm_dirty_log *log) -{ - struct log_c *lc = (struct log_c *) log->context; - lc->sync_search = 0; - return 0; -} - -static int core_is_clean(struct dm_dirty_log *log, region_t region) -{ - struct log_c *lc = (struct log_c *) log->context; - return log_test_bit(lc->clean_bits, region); -} - -static int core_in_sync(struct dm_dirty_log *log, region_t region, int block) -{ - struct log_c *lc = (struct log_c *) log->context; - return log_test_bit(lc->sync_bits, region); -} - -static int core_flush(struct dm_dirty_log *log) -{ - /* no op */ - return 0; -} - -static int disk_flush(struct dm_dirty_log *log) -{ - int r, i; - struct log_c *lc = log->context; - - /* only write if the log has changed */ - if (!lc->touched_cleaned && !lc->touched_dirtied) - return 0; - - if (lc->touched_cleaned && log->flush_callback_fn && - log->flush_callback_fn(lc->ti)) { - /* - * At this point it is impossible to determine which - * regions are clean and which are dirty (without - * re-reading the log off disk). So mark all of them - * dirty. 
- */ - lc->flush_failed = 1; - for (i = 0; i < lc->region_count; i++) - log_clear_bit(lc, lc->clean_bits, i); - } - - r = rw_header(lc, WRITE); - if (r) - fail_log_device(lc); - else { - if (lc->touched_dirtied) { - r = flush_header(lc); - if (r) { - lc->log_dev_flush_failed = 1; - fail_log_device(lc); - } else - lc->touched_dirtied = 0; - } - lc->touched_cleaned = 0; - } - - return r; -} - -static void core_mark_region(struct dm_dirty_log *log, region_t region) -{ - struct log_c *lc = (struct log_c *) log->context; - log_clear_bit(lc, lc->clean_bits, region); -} - -static void core_clear_region(struct dm_dirty_log *log, region_t region) -{ - struct log_c *lc = (struct log_c *) log->context; - if (likely(!lc->flush_failed)) - log_set_bit(lc, lc->clean_bits, region); -} - -static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) -{ - struct log_c *lc = (struct log_c *) log->context; - - if (lc->sync_search >= lc->region_count) - return 0; - - do { - *region = find_next_zero_bit_le(lc->sync_bits, - lc->region_count, - lc->sync_search); - lc->sync_search = *region + 1; - - if (*region >= lc->region_count) - return 0; - - } while (log_test_bit(lc->recovering_bits, *region)); - - log_set_bit(lc, lc->recovering_bits, *region); - return 1; -} - -static void core_set_region_sync(struct dm_dirty_log *log, region_t region, - int in_sync) -{ - struct log_c *lc = (struct log_c *) log->context; - - log_clear_bit(lc, lc->recovering_bits, region); - if (in_sync) { - log_set_bit(lc, lc->sync_bits, region); - lc->sync_count++; - } else if (log_test_bit(lc->sync_bits, region)) { - lc->sync_count--; - log_clear_bit(lc, lc->sync_bits, region); - } -} - -static region_t core_get_sync_count(struct dm_dirty_log *log) -{ - struct log_c *lc = (struct log_c *) log->context; - - return lc->sync_count; -} - -#define DMEMIT_SYNC \ - if (lc->sync != DEFAULTSYNC) \ - DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") - -static int core_status(struct dm_dirty_log *log, status_type_t status, - char *result, unsigned int maxlen) -{ - int sz = 0; - struct log_c *lc = log->context; - - switch(status) { - case STATUSTYPE_INFO: - DMEMIT("1 %s", log->type->name); - break; - - case STATUSTYPE_TABLE: - DMEMIT("%s %u %u ", log->type->name, - lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); - DMEMIT_SYNC; - } - - return sz; -} - -static int disk_status(struct dm_dirty_log *log, status_type_t status, - char *result, unsigned int maxlen) -{ - int sz = 0; - struct log_c *lc = log->context; - - switch(status) { - case STATUSTYPE_INFO: - DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, - lc->log_dev_flush_failed ? 'F' : - lc->log_dev_failed ? 'D' : - 'A'); - break; - - case STATUSTYPE_TABLE: - DMEMIT("%s %u %s %u ", log->type->name, - lc->sync == DEFAULTSYNC ? 
2 : 3, lc->log_dev->name, - lc->region_size); - DMEMIT_SYNC; - } - - return sz; -} - -static struct dm_dirty_log_type _core_type = { - .name = "core", - .module = THIS_MODULE, - .ctr = core_ctr, - .dtr = core_dtr, - .resume = core_resume, - .get_region_size = core_get_region_size, - .is_clean = core_is_clean, - .in_sync = core_in_sync, - .flush = core_flush, - .mark_region = core_mark_region, - .clear_region = core_clear_region, - .get_resync_work = core_get_resync_work, - .set_region_sync = core_set_region_sync, - .get_sync_count = core_get_sync_count, - .status = core_status, -}; - -static struct dm_dirty_log_type _disk_type = { - .name = "disk", - .module = THIS_MODULE, - .ctr = disk_ctr, - .dtr = disk_dtr, - .postsuspend = disk_flush, - .resume = disk_resume, - .get_region_size = core_get_region_size, - .is_clean = core_is_clean, - .in_sync = core_in_sync, - .flush = disk_flush, - .mark_region = core_mark_region, - .clear_region = core_clear_region, - .get_resync_work = core_get_resync_work, - .set_region_sync = core_set_region_sync, - .get_sync_count = core_get_sync_count, - .status = disk_status, -}; - -static int __init dm_dirty_log_init(void) -{ - int r; - - r = dm_dirty_log_type_register(&_core_type); - if (r) - DMWARN("couldn't register core log"); - - r = dm_dirty_log_type_register(&_disk_type); - if (r) { - DMWARN("couldn't register disk type"); - dm_dirty_log_type_unregister(&_core_type); - } - - return r; -} - -static void __exit dm_dirty_log_exit(void) -{ - dm_dirty_log_type_unregister(&_disk_type); - dm_dirty_log_type_unregister(&_core_type); -} - -module_init(dm_dirty_log_init); -module_exit(dm_dirty_log_exit); - -MODULE_DESCRIPTION(DM_NAME " dirty region log"); -MODULE_AUTHOR("Joe Thornber, Heinz Mauelshagen <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-mpath.c b/ANDROID_3.4.5/drivers/md/dm-mpath.c deleted file mode 100644 index 754f38f8..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-mpath.c +++ /dev/null @@ -1,1723 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software Limited. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ - -#include <linux/device-mapper.h> - -#include "dm-path-selector.h" -#include "dm-uevent.h" - -#include <linux/ctype.h> -#include <linux/init.h> -#include <linux/mempool.h> -#include <linux/module.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/workqueue.h> -#include <scsi/scsi_dh.h> -#include <linux/atomic.h> - -#define DM_MSG_PREFIX "multipath" -#define DM_PG_INIT_DELAY_MSECS 2000 -#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) - -/* Path properties */ -struct pgpath { - struct list_head list; - - struct priority_group *pg; /* Owning PG */ - unsigned is_active; /* Path status */ - unsigned fail_count; /* Cumulative failure count */ - - struct dm_path path; - struct delayed_work activate_path; -}; - -#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) - -/* - * Paths are grouped into Priority Groups and numbered from 1 upwards. - * Each has a path selector which controls which path gets used. - */ -struct priority_group { - struct list_head list; - - struct multipath *m; /* Owning multipath instance */ - struct path_selector ps; - - unsigned pg_num; /* Reference number */ - unsigned bypassed; /* Temporarily bypass this PG? 
*/ - - unsigned nr_pgpaths; /* Number of paths in PG */ - struct list_head pgpaths; -}; - -/* Multipath context */ -struct multipath { - struct list_head list; - struct dm_target *ti; - - spinlock_t lock; - - const char *hw_handler_name; - char *hw_handler_params; - - unsigned nr_priority_groups; - struct list_head priority_groups; - - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ - - unsigned pg_init_required; /* pg_init needs calling? */ - unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - unsigned pg_init_delay_retry; /* Delay pg_init retry? */ - - unsigned nr_valid_paths; /* Total number of usable paths */ - struct pgpath *current_pgpath; - struct priority_group *current_pg; - struct priority_group *next_pg; /* Switch to this PG if set */ - unsigned repeat_count; /* I/Os left before calling PS again */ - - unsigned queue_io; /* Must we queue all I/O? */ - unsigned queue_if_no_path; /* Queue I/O if last path fails? */ - unsigned saved_queue_if_no_path;/* Saved state during suspension */ - unsigned pg_init_retries; /* Number of times to retry pg_init */ - unsigned pg_init_count; /* Number of times pg_init called */ - unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ - - struct work_struct process_queued_ios; - struct list_head queued_ios; - unsigned queue_size; - - struct work_struct trigger_event; - - /* - * We must use a mempool of dm_mpath_io structs so that we - * can resubmit bios on error. - */ - mempool_t *mpio_pool; - - struct mutex work_mutex; -}; - -/* - * Context information attached to each bio we process. - */ -struct dm_mpath_io { - struct pgpath *pgpath; - size_t nr_bytes; -}; - -typedef int (*action_fn) (struct pgpath *pgpath); - -#define MIN_IOS 256 /* Mempool size */ - -static struct kmem_cache *_mpio_cache; - -static struct workqueue_struct *kmultipathd, *kmpath_handlerd; -static void process_queued_ios(struct work_struct *work); -static void trigger_event(struct work_struct *work); -static void activate_path(struct work_struct *work); - - -/*----------------------------------------------- - * Allocation routines - *-----------------------------------------------*/ - -static struct pgpath *alloc_pgpath(void) -{ - struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - - if (pgpath) { - pgpath->is_active = 1; - INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); - } - - return pgpath; -} - -static void free_pgpath(struct pgpath *pgpath) -{ - kfree(pgpath); -} - -static struct priority_group *alloc_priority_group(void) -{ - struct priority_group *pg; - - pg = kzalloc(sizeof(*pg), GFP_KERNEL); - - if (pg) - INIT_LIST_HEAD(&pg->pgpaths); - - return pg; -} - -static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) -{ - struct pgpath *pgpath, *tmp; - struct multipath *m = ti->private; - - list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { - list_del(&pgpath->list); - if (m->hw_handler_name) - scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); - dm_put_device(ti, pgpath->path.dev); - free_pgpath(pgpath); - } -} - -static void free_priority_group(struct priority_group *pg, - struct dm_target *ti) -{ - struct path_selector *ps = &pg->ps; - - if (ps->type) { - ps->type->destroy(ps); - dm_put_path_selector(ps->type); - } - - free_pgpaths(&pg->pgpaths, ti); - kfree(pg); -} - -static struct multipath *alloc_multipath(struct dm_target *ti) -{ - struct multipath *m; - - m = kzalloc(sizeof(*m), GFP_KERNEL); - if (m) { - INIT_LIST_HEAD(&m->priority_groups); - 
INIT_LIST_HEAD(&m->queued_ios); - spin_lock_init(&m->lock); - m->queue_io = 1; - m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; - INIT_WORK(&m->process_queued_ios, process_queued_ios); - INIT_WORK(&m->trigger_event, trigger_event); - init_waitqueue_head(&m->pg_init_wait); - mutex_init(&m->work_mutex); - m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); - if (!m->mpio_pool) { - kfree(m); - return NULL; - } - m->ti = ti; - ti->private = m; - } - - return m; -} - -static void free_multipath(struct multipath *m) -{ - struct priority_group *pg, *tmp; - - list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { - list_del(&pg->list); - free_priority_group(pg, m->ti); - } - - kfree(m->hw_handler_name); - kfree(m->hw_handler_params); - mempool_destroy(m->mpio_pool); - kfree(m); -} - -static int set_mapinfo(struct multipath *m, union map_info *info) -{ - struct dm_mpath_io *mpio; - - mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); - if (!mpio) - return -ENOMEM; - - memset(mpio, 0, sizeof(*mpio)); - info->ptr = mpio; - - return 0; -} - -static void clear_mapinfo(struct multipath *m, union map_info *info) -{ - struct dm_mpath_io *mpio = info->ptr; - - info->ptr = NULL; - mempool_free(mpio, m->mpio_pool); -} - -/*----------------------------------------------- - * Path selection - *-----------------------------------------------*/ - -static void __pg_init_all_paths(struct multipath *m) -{ - struct pgpath *pgpath; - unsigned long pg_init_delay = 0; - - m->pg_init_count++; - m->pg_init_required = 0; - if (m->pg_init_delay_retry) - pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? - m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); - list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { - /* Skip failed paths */ - if (!pgpath->is_active) - continue; - if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, - pg_init_delay)) - m->pg_init_in_progress++; - } -} - -static void __switch_pg(struct multipath *m, struct pgpath *pgpath) -{ - m->current_pg = pgpath->pg; - - /* Must we initialise the PG first, and queue I/O till it's ready? */ - if (m->hw_handler_name) { - m->pg_init_required = 1; - m->queue_io = 1; - } else { - m->pg_init_required = 0; - m->queue_io = 0; - } - - m->pg_init_count = 0; -} - -static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, - size_t nr_bytes) -{ - struct dm_path *path; - - path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); - if (!path) - return -ENXIO; - - m->current_pgpath = path_to_pgpath(path); - - if (m->current_pg != pg) - __switch_pg(m, m->current_pgpath); - - return 0; -} - -static void __choose_pgpath(struct multipath *m, size_t nr_bytes) -{ - struct priority_group *pg; - unsigned bypassed = 1; - - if (!m->nr_valid_paths) - goto failed; - - /* Were we instructed to switch PG? */ - if (m->next_pg) { - pg = m->next_pg; - m->next_pg = NULL; - if (!__choose_path_in_pg(m, pg, nr_bytes)) - return; - } - - /* Don't change PG until it has no remaining paths */ - if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) - return; - - /* - * Loop through priority groups until we find a valid path. - * First time we skip PGs marked 'bypassed'. - * Second time we only try the ones we skipped. 
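
A user-space sketch of the two-pass scan in the loop just below: the first pass skips bypassed groups, the second considers only the groups the first pass skipped. The group layout here is made up.

#include <stdio.h>

struct pg {
        int bypassed;
        int has_valid_path;
};

static int choose_pg(struct pg *pgs, int n)
{
        int bypassed = 1;
        int i;

        do {
                for (i = 0; i < n; i++) {
                        if (pgs[i].bypassed == bypassed)
                                continue;       /* wrong pass for this group */
                        if (pgs[i].has_valid_path)
                                return i;
                }
        } while (bypassed--);

        return -1;                              /* no usable priority group */
}

int main(void)
{
        /* Only the bypassed group has a working path, so pass two picks it. */
        struct pg pgs[] = { { 0, 0 }, { 1, 1 } };

        printf("chose PG index %d\n", choose_pg(pgs, 2));       /* 1 */
        return 0;
}
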
- */ - do { - list_for_each_entry(pg, &m->priority_groups, list) { - if (pg->bypassed == bypassed) - continue; - if (!__choose_path_in_pg(m, pg, nr_bytes)) - return; - } - } while (bypassed--); - -failed: - m->current_pgpath = NULL; - m->current_pg = NULL; -} - -/* - * Check whether bios must be queued in the device-mapper core rather - * than here in the target. - * - * m->lock must be held on entry. - * - * If m->queue_if_no_path and m->saved_queue_if_no_path hold the - * same value then we are not between multipath_presuspend() - * and multipath_resume() calls and we have no need to check - * for the DMF_NOFLUSH_SUSPENDING flag. - */ -static int __must_push_back(struct multipath *m) -{ - return (m->queue_if_no_path != m->saved_queue_if_no_path && - dm_noflush_suspending(m->ti)); -} - -static int map_io(struct multipath *m, struct request *clone, - union map_info *map_context, unsigned was_queued) -{ - int r = DM_MAPIO_REMAPPED; - size_t nr_bytes = blk_rq_bytes(clone); - unsigned long flags; - struct pgpath *pgpath; - struct block_device *bdev; - struct dm_mpath_io *mpio = map_context->ptr; - - spin_lock_irqsave(&m->lock, flags); - - /* Do we need to select a new pgpath? */ - if (!m->current_pgpath || - (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) - __choose_pgpath(m, nr_bytes); - - pgpath = m->current_pgpath; - - if (was_queued) - m->queue_size--; - - if ((pgpath && m->queue_io) || - (!pgpath && m->queue_if_no_path)) { - /* Queue for the daemon to resubmit */ - list_add_tail(&clone->queuelist, &m->queued_ios); - m->queue_size++; - if ((m->pg_init_required && !m->pg_init_in_progress) || - !m->queue_io) - queue_work(kmultipathd, &m->process_queued_ios); - pgpath = NULL; - r = DM_MAPIO_SUBMITTED; - } else if (pgpath) { - bdev = pgpath->path.dev->bdev; - clone->q = bdev_get_queue(bdev); - clone->rq_disk = bdev->bd_disk; - } else if (__must_push_back(m)) - r = DM_MAPIO_REQUEUE; - else - r = -EIO; /* Failed */ - - mpio->pgpath = pgpath; - mpio->nr_bytes = nr_bytes; - - if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) - pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, - nr_bytes); - - spin_unlock_irqrestore(&m->lock, flags); - - return r; -} - -/* - * If we run out of usable paths, should we queue I/O or error it? - */ -static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, - unsigned save_old_value) -{ - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - - if (save_old_value) - m->saved_queue_if_no_path = m->queue_if_no_path; - else - m->saved_queue_if_no_path = queue_if_no_path; - m->queue_if_no_path = queue_if_no_path; - if (!m->queue_if_no_path && m->queue_size) - queue_work(kmultipathd, &m->process_queued_ios); - - spin_unlock_irqrestore(&m->lock, flags); - - return 0; -} - -/*----------------------------------------------------------------- - * The multipath daemon is responsible for resubmitting queued ios. 
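
dispatch_queued_ios() just below uses a common pattern: take the whole queued list while holding the lock (list_splice_init), then remap and dispatch each request after dropping it, so submitters never wait behind the dispatch work. A small pthread sketch of the same splice-then-drain shape, with an array standing in for the kernel list.

#include <pthread.h>
#include <stdio.h>

#define MAX_PENDING 8

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int pending[MAX_PENDING];
static int npending;

static void dispatch(void)
{
        int local[MAX_PENDING];
        int n, i;

        pthread_mutex_lock(&lock);
        n = npending;
        for (i = 0; i < n; i++)
                local[i] = pending[i];  /* list_splice_init() in the kernel code */
        npending = 0;
        pthread_mutex_unlock(&lock);

        for (i = 0; i < n; i++)         /* slow per-item work happens unlocked */
                printf("resubmitting queued io %d\n", local[i]);
}

int main(void)
{
        pthread_mutex_lock(&lock);
        pending[npending++] = 1;
        pending[npending++] = 2;
        pthread_mutex_unlock(&lock);

        dispatch();
        return 0;
}
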
- *---------------------------------------------------------------*/ - -static void dispatch_queued_ios(struct multipath *m) -{ - int r; - unsigned long flags; - union map_info *info; - struct request *clone, *n; - LIST_HEAD(cl); - - spin_lock_irqsave(&m->lock, flags); - list_splice_init(&m->queued_ios, &cl); - spin_unlock_irqrestore(&m->lock, flags); - - list_for_each_entry_safe(clone, n, &cl, queuelist) { - list_del_init(&clone->queuelist); - - info = dm_get_rq_mapinfo(clone); - - r = map_io(m, clone, info, 1); - if (r < 0) { - clear_mapinfo(m, info); - dm_kill_unmapped_request(clone, r); - } else if (r == DM_MAPIO_REMAPPED) - dm_dispatch_request(clone); - else if (r == DM_MAPIO_REQUEUE) { - clear_mapinfo(m, info); - dm_requeue_unmapped_request(clone); - } - } -} - -static void process_queued_ios(struct work_struct *work) -{ - struct multipath *m = - container_of(work, struct multipath, process_queued_ios); - struct pgpath *pgpath = NULL; - unsigned must_queue = 1; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - - if (!m->queue_size) - goto out; - - if (!m->current_pgpath) - __choose_pgpath(m, 0); - - pgpath = m->current_pgpath; - - if ((pgpath && !m->queue_io) || - (!pgpath && !m->queue_if_no_path)) - must_queue = 0; - - if (m->pg_init_required && !m->pg_init_in_progress && pgpath) - __pg_init_all_paths(m); - -out: - spin_unlock_irqrestore(&m->lock, flags); - if (!must_queue) - dispatch_queued_ios(m); -} - -/* - * An event is triggered whenever a path is taken out of use. - * Includes path failure and PG bypass. - */ -static void trigger_event(struct work_struct *work) -{ - struct multipath *m = - container_of(work, struct multipath, trigger_event); - - dm_table_event(m->ti->table); -} - -/*----------------------------------------------------------------- - * Constructor/argument parsing: - * <#multipath feature args> [<arg>]* - * <#hw_handler args> [hw_handler [<arg>]*] - * <#priority groups> - * <initial priority group> - * [<selector> <#selector args> [<arg>]* - * <#paths> <#per-path selector args> - * [<path> [<arg>]* ]+ ]+ - *---------------------------------------------------------------*/ -static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, - struct dm_target *ti) -{ - int r; - struct path_selector_type *pst; - unsigned ps_argc; - - static struct dm_arg _args[] = { - {0, 1024, "invalid number of path selector args"}, - }; - - pst = dm_get_path_selector(dm_shift_arg(as)); - if (!pst) { - ti->error = "unknown path selector type"; - return -EINVAL; - } - - r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); - if (r) { - dm_put_path_selector(pst); - return -EINVAL; - } - - r = pst->create(&pg->ps, ps_argc, as->argv); - if (r) { - dm_put_path_selector(pst); - ti->error = "path selector constructor failed"; - return r; - } - - pg->ps.type = pst; - dm_consume_args(as, ps_argc); - - return 0; -} - -static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, - struct dm_target *ti) -{ - int r; - struct pgpath *p; - struct multipath *m = ti->private; - - /* we need at least a path arg */ - if (as->argc < 1) { - ti->error = "no device given"; - return ERR_PTR(-EINVAL); - } - - p = alloc_pgpath(); - if (!p) - return ERR_PTR(-ENOMEM); - - r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), - &p->path.dev); - if (r) { - ti->error = "error getting device"; - goto bad; - } - - if (m->hw_handler_name) { - struct request_queue *q = bdev_get_queue(p->path.dev->bdev); - - r = scsi_dh_attach(q, 
m->hw_handler_name); - if (r == -EBUSY) { - /* - * Already attached to different hw_handler, - * try to reattach with correct one. - */ - scsi_dh_detach(q); - r = scsi_dh_attach(q, m->hw_handler_name); - } - - if (r < 0) { - ti->error = "error attaching hardware handler"; - dm_put_device(ti, p->path.dev); - goto bad; - } - - if (m->hw_handler_params) { - r = scsi_dh_set_params(q, m->hw_handler_params); - if (r < 0) { - ti->error = "unable to set hardware " - "handler parameters"; - scsi_dh_detach(q); - dm_put_device(ti, p->path.dev); - goto bad; - } - } - } - - r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); - if (r) { - dm_put_device(ti, p->path.dev); - goto bad; - } - - return p; - - bad: - free_pgpath(p); - return ERR_PTR(r); -} - -static struct priority_group *parse_priority_group(struct dm_arg_set *as, - struct multipath *m) -{ - static struct dm_arg _args[] = { - {1, 1024, "invalid number of paths"}, - {0, 1024, "invalid number of selector args"} - }; - - int r; - unsigned i, nr_selector_args, nr_args; - struct priority_group *pg; - struct dm_target *ti = m->ti; - - if (as->argc < 2) { - as->argc = 0; - ti->error = "not enough priority group arguments"; - return ERR_PTR(-EINVAL); - } - - pg = alloc_priority_group(); - if (!pg) { - ti->error = "couldn't allocate priority group"; - return ERR_PTR(-ENOMEM); - } - pg->m = m; - - r = parse_path_selector(as, pg, ti); - if (r) - goto bad; - - /* - * read the paths - */ - r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); - if (r) - goto bad; - - r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); - if (r) - goto bad; - - nr_args = 1 + nr_selector_args; - for (i = 0; i < pg->nr_pgpaths; i++) { - struct pgpath *pgpath; - struct dm_arg_set path_args; - - if (as->argc < nr_args) { - ti->error = "not enough path parameters"; - r = -EINVAL; - goto bad; - } - - path_args.argc = nr_args; - path_args.argv = as->argv; - - pgpath = parse_path(&path_args, &pg->ps, ti); - if (IS_ERR(pgpath)) { - r = PTR_ERR(pgpath); - goto bad; - } - - pgpath->pg = pg; - list_add_tail(&pgpath->list, &pg->pgpaths); - dm_consume_args(as, nr_args); - } - - return pg; - - bad: - free_priority_group(pg, ti); - return ERR_PTR(r); -} - -static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) -{ - unsigned hw_argc; - int ret; - struct dm_target *ti = m->ti; - - static struct dm_arg _args[] = { - {0, 1024, "invalid number of hardware handler args"}, - }; - - if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) - return -EINVAL; - - if (!hw_argc) - return 0; - - m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); - if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name), - "scsi_dh_%s", m->hw_handler_name)) { - ti->error = "unknown hardware handler type"; - ret = -EINVAL; - goto fail; - } - - if (hw_argc > 1) { - char *p; - int i, j, len = 4; - - for (i = 0; i <= hw_argc - 2; i++) - len += strlen(as->argv[i]) + 1; - p = m->hw_handler_params = kzalloc(len, GFP_KERNEL); - if (!p) { - ti->error = "memory allocation failed"; - ret = -ENOMEM; - goto fail; - } - j = sprintf(p, "%d", hw_argc - 1); - for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) - j = sprintf(p, "%s", as->argv[i]); - } - dm_consume_args(as, hw_argc - 1); - - return 0; -fail: - kfree(m->hw_handler_name); - m->hw_handler_name = NULL; - return ret; -} - -static int parse_features(struct dm_arg_set *as, struct multipath *m) -{ - int r; - unsigned argc; - struct dm_target *ti = m->ti; - const char *arg_name; - - static struct dm_arg _args[] 
= { - {0, 5, "invalid number of feature args"}, - {1, 50, "pg_init_retries must be between 1 and 50"}, - {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, - }; - - r = dm_read_arg_group(_args, as, &argc, &ti->error); - if (r) - return -EINVAL; - - if (!argc) - return 0; - - do { - arg_name = dm_shift_arg(as); - argc--; - - if (!strcasecmp(arg_name, "queue_if_no_path")) { - r = queue_if_no_path(m, 1, 0); - continue; - } - - if (!strcasecmp(arg_name, "pg_init_retries") && - (argc >= 1)) { - r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); - argc--; - continue; - } - - if (!strcasecmp(arg_name, "pg_init_delay_msecs") && - (argc >= 1)) { - r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); - argc--; - continue; - } - - ti->error = "Unrecognised multipath feature request"; - r = -EINVAL; - } while (argc && !r); - - return r; -} - -static int multipath_ctr(struct dm_target *ti, unsigned int argc, - char **argv) -{ - /* target arguments */ - static struct dm_arg _args[] = { - {0, 1024, "invalid number of priority groups"}, - {0, 1024, "invalid initial priority group number"}, - }; - - int r; - struct multipath *m; - struct dm_arg_set as; - unsigned pg_count = 0; - unsigned next_pg_num; - - as.argc = argc; - as.argv = argv; - - m = alloc_multipath(ti); - if (!m) { - ti->error = "can't allocate multipath"; - return -EINVAL; - } - - r = parse_features(&as, m); - if (r) - goto bad; - - r = parse_hw_handler(&as, m); - if (r) - goto bad; - - r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); - if (r) - goto bad; - - r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); - if (r) - goto bad; - - if ((!m->nr_priority_groups && next_pg_num) || - (m->nr_priority_groups && !next_pg_num)) { - ti->error = "invalid initial priority group"; - r = -EINVAL; - goto bad; - } - - /* parse the priority groups */ - while (as.argc) { - struct priority_group *pg; - - pg = parse_priority_group(&as, m); - if (IS_ERR(pg)) { - r = PTR_ERR(pg); - goto bad; - } - - m->nr_valid_paths += pg->nr_pgpaths; - list_add_tail(&pg->list, &m->priority_groups); - pg_count++; - pg->pg_num = pg_count; - if (!--next_pg_num) - m->next_pg = pg; - } - - if (pg_count != m->nr_priority_groups) { - ti->error = "priority group count mismatch"; - r = -EINVAL; - goto bad; - } - - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; - - return 0; - - bad: - free_multipath(m); - return r; -} - -static void multipath_wait_for_pg_init_completion(struct multipath *m) -{ - DECLARE_WAITQUEUE(wait, current); - unsigned long flags; - - add_wait_queue(&m->pg_init_wait, &wait); - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - - spin_lock_irqsave(&m->lock, flags); - if (!m->pg_init_in_progress) { - spin_unlock_irqrestore(&m->lock, flags); - break; - } - spin_unlock_irqrestore(&m->lock, flags); - - io_schedule(); - } - set_current_state(TASK_RUNNING); - - remove_wait_queue(&m->pg_init_wait, &wait); -} - -static void flush_multipath_work(struct multipath *m) -{ - flush_workqueue(kmpath_handlerd); - multipath_wait_for_pg_init_completion(m); - flush_workqueue(kmultipathd); - flush_work_sync(&m->trigger_event); -} - -static void multipath_dtr(struct dm_target *ti) -{ - struct multipath *m = ti->private; - - flush_multipath_work(m); - free_multipath(m); -} - -/* - * Map cloned requests - */ -static int multipath_map(struct dm_target *ti, struct request *clone, - union map_info *map_context) -{ - int r; - struct multipath *m = (struct multipath *) ti->private; - - if (set_mapinfo(m, 
map_context) < 0) - /* ENOMEM, requeue */ - return DM_MAPIO_REQUEUE; - - clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; - r = map_io(m, clone, map_context, 0); - if (r < 0 || r == DM_MAPIO_REQUEUE) - clear_mapinfo(m, map_context); - - return r; -} - -/* - * Take a path out of use. - */ -static int fail_path(struct pgpath *pgpath) -{ - unsigned long flags; - struct multipath *m = pgpath->pg->m; - - spin_lock_irqsave(&m->lock, flags); - - if (!pgpath->is_active) - goto out; - - DMWARN("Failing path %s.", pgpath->path.dev->name); - - pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); - pgpath->is_active = 0; - pgpath->fail_count++; - - m->nr_valid_paths--; - - if (pgpath == m->current_pgpath) - m->current_pgpath = NULL; - - dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti, - pgpath->path.dev->name, m->nr_valid_paths); - - schedule_work(&m->trigger_event); - -out: - spin_unlock_irqrestore(&m->lock, flags); - - return 0; -} - -/* - * Reinstate a previously-failed path - */ -static int reinstate_path(struct pgpath *pgpath) -{ - int r = 0; - unsigned long flags; - struct multipath *m = pgpath->pg->m; - - spin_lock_irqsave(&m->lock, flags); - - if (pgpath->is_active) - goto out; - - if (!pgpath->pg->ps.type->reinstate_path) { - DMWARN("Reinstate path not supported by path selector %s", - pgpath->pg->ps.type->name); - r = -EINVAL; - goto out; - } - - r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); - if (r) - goto out; - - pgpath->is_active = 1; - - if (!m->nr_valid_paths++ && m->queue_size) { - m->current_pgpath = NULL; - queue_work(kmultipathd, &m->process_queued_ios); - } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { - if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) - m->pg_init_in_progress++; - } - - dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, - pgpath->path.dev->name, m->nr_valid_paths); - - schedule_work(&m->trigger_event); - -out: - spin_unlock_irqrestore(&m->lock, flags); - - return r; -} - -/* - * Fail or reinstate all paths that match the provided struct dm_dev. 
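 *
 * (Editorial illustration, not part of the original source: fail_path()
 *  and reinstate_path() above are reachable from user space through the
 *  message interface handled in multipath_message() below, e.g.
 *
 *      dmsetup message mpatha 0 fail_path 8:32
 *      dmsetup message mpatha 0 reinstate_path 8:32
 *
 *  "mpatha" and "8:32" are assumed names; the path argument is resolved
 *  with dm_get_device() and then passed to action_dev().)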
- */ -static int action_dev(struct multipath *m, struct dm_dev *dev, - action_fn action) -{ - int r = -EINVAL; - struct pgpath *pgpath; - struct priority_group *pg; - - list_for_each_entry(pg, &m->priority_groups, list) { - list_for_each_entry(pgpath, &pg->pgpaths, list) { - if (pgpath->path.dev == dev) - r = action(pgpath); - } - } - - return r; -} - -/* - * Temporarily try to avoid having to use the specified PG - */ -static void bypass_pg(struct multipath *m, struct priority_group *pg, - int bypassed) -{ - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - - pg->bypassed = bypassed; - m->current_pgpath = NULL; - m->current_pg = NULL; - - spin_unlock_irqrestore(&m->lock, flags); - - schedule_work(&m->trigger_event); -} - -/* - * Switch to using the specified PG from the next I/O that gets mapped - */ -static int switch_pg_num(struct multipath *m, const char *pgstr) -{ - struct priority_group *pg; - unsigned pgnum; - unsigned long flags; - char dummy; - - if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || - (pgnum > m->nr_priority_groups)) { - DMWARN("invalid PG number supplied to switch_pg_num"); - return -EINVAL; - } - - spin_lock_irqsave(&m->lock, flags); - list_for_each_entry(pg, &m->priority_groups, list) { - pg->bypassed = 0; - if (--pgnum) - continue; - - m->current_pgpath = NULL; - m->current_pg = NULL; - m->next_pg = pg; - } - spin_unlock_irqrestore(&m->lock, flags); - - schedule_work(&m->trigger_event); - return 0; -} - -/* - * Set/clear bypassed status of a PG. - * PGs are numbered upwards from 1 in the order they were declared. - */ -static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) -{ - struct priority_group *pg; - unsigned pgnum; - char dummy; - - if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || - (pgnum > m->nr_priority_groups)) { - DMWARN("invalid PG number supplied to bypass_pg"); - return -EINVAL; - } - - list_for_each_entry(pg, &m->priority_groups, list) { - if (!--pgnum) - break; - } - - bypass_pg(m, pg, bypassed); - return 0; -} - -/* - * Should we retry pg_init immediately? - */ -static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) -{ - unsigned long flags; - int limit_reached = 0; - - spin_lock_irqsave(&m->lock, flags); - - if (m->pg_init_count <= m->pg_init_retries) - m->pg_init_required = 1; - else - limit_reached = 1; - - spin_unlock_irqrestore(&m->lock, flags); - - return limit_reached; -} - -static void pg_init_done(void *data, int errors) -{ - struct pgpath *pgpath = data; - struct priority_group *pg = pgpath->pg; - struct multipath *m = pg->m; - unsigned long flags; - unsigned delay_retry = 0; - - /* device or driver problems */ - switch (errors) { - case SCSI_DH_OK: - break; - case SCSI_DH_NOSYS: - if (!m->hw_handler_name) { - errors = 0; - break; - } - DMERR("Could not failover the device: Handler scsi_dh_%s " - "Error %d.", m->hw_handler_name, errors); - /* - * Fail path for now, so we do not ping pong - */ - fail_path(pgpath); - break; - case SCSI_DH_DEV_TEMP_BUSY: - /* - * Probably doing something like FW upgrade on the - * controller so try the other pg. - */ - bypass_pg(m, pg, 1); - break; - case SCSI_DH_RETRY: - /* Wait before retrying. */ - delay_retry = 1; - case SCSI_DH_IMM_RETRY: - case SCSI_DH_RES_TEMP_UNAVAIL: - if (pg_init_limit_reached(m, pgpath)) - fail_path(pgpath); - errors = 0; - break; - default: - /* - * We probably do not want to fail the path for a device - * error, but this is what the old dm did. 
In future - * patches we can do more advanced handling. - */ - fail_path(pgpath); - } - - spin_lock_irqsave(&m->lock, flags); - if (errors) { - if (pgpath == m->current_pgpath) { - DMERR("Could not failover device. Error %d.", errors); - m->current_pgpath = NULL; - m->current_pg = NULL; - } - } else if (!m->pg_init_required) - pg->bypassed = 0; - - if (--m->pg_init_in_progress) - /* Activations of other paths are still on going */ - goto out; - - if (!m->pg_init_required) - m->queue_io = 0; - - m->pg_init_delay_retry = delay_retry; - queue_work(kmultipathd, &m->process_queued_ios); - - /* - * Wake up any thread waiting to suspend. - */ - wake_up(&m->pg_init_wait); - -out: - spin_unlock_irqrestore(&m->lock, flags); -} - -static void activate_path(struct work_struct *work) -{ - struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path.work); - - scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), - pg_init_done, pgpath); -} - -/* - * end_io handling - */ -static int do_end_io(struct multipath *m, struct request *clone, - int error, struct dm_mpath_io *mpio) -{ - /* - * We don't queue any clone request inside the multipath target - * during end I/O handling, since those clone requests don't have - * bio clones. If we queue them inside the multipath target, - * we need to make bio clones, that requires memory allocation. - * (See drivers/md/dm.c:end_clone_bio() about why the clone requests - * don't have bio clones.) - * Instead of queueing the clone request here, we queue the original - * request into dm core, which will remake a clone request and - * clone bios for it and resubmit it later. - */ - int r = DM_ENDIO_REQUEUE; - unsigned long flags; - - if (!error && !clone->errors) - return 0; /* I/O complete */ - - if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ) - return error; - - if (mpio->pgpath) - fail_path(mpio->pgpath); - - spin_lock_irqsave(&m->lock, flags); - if (!m->nr_valid_paths) { - if (!m->queue_if_no_path) { - if (!__must_push_back(m)) - r = -EIO; - } else { - if (error == -EBADE) - r = error; - } - } - spin_unlock_irqrestore(&m->lock, flags); - - return r; -} - -static int multipath_end_io(struct dm_target *ti, struct request *clone, - int error, union map_info *map_context) -{ - struct multipath *m = ti->private; - struct dm_mpath_io *mpio = map_context->ptr; - struct pgpath *pgpath = mpio->pgpath; - struct path_selector *ps; - int r; - - BUG_ON(!mpio); - - r = do_end_io(m, clone, error, mpio); - if (pgpath) { - ps = &pgpath->pg->ps; - if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); - } - clear_mapinfo(m, map_context); - - return r; -} - -/* - * Suspend can't complete until all the I/O is processed so if - * the last path fails we must error any remaining I/O. - * Note that if the freeze_bdev fails while suspending, the - * queue_if_no_path state is lost - userspace should reset it. - */ -static void multipath_presuspend(struct dm_target *ti) -{ - struct multipath *m = (struct multipath *) ti->private; - - queue_if_no_path(m, 0, 1); -} - -static void multipath_postsuspend(struct dm_target *ti) -{ - struct multipath *m = ti->private; - - mutex_lock(&m->work_mutex); - flush_multipath_work(m); - mutex_unlock(&m->work_mutex); -} - -/* - * Restore the queue_if_no_path setting. 
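 *
 * (Editorial note, a sketch of the suspend/resume sequence rather than
 *  part of the original source:
 *
 *      multipath_presuspend()  -> queue_if_no_path(m, 0, 1)   save + clear
 *      multipath_postsuspend() -> flush_multipath_work(m)
 *      multipath_resume()      -> restore saved_queue_if_no_path
 *
 *  While the current and saved flags differ and a no-flush suspend is in
 *  progress, __must_push_back() returns true so requests are requeued to
 *  the dm core instead of being errored.)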
- */ -static void multipath_resume(struct dm_target *ti) -{ - struct multipath *m = (struct multipath *) ti->private; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - m->queue_if_no_path = m->saved_queue_if_no_path; - spin_unlock_irqrestore(&m->lock, flags); -} - -/* - * Info output has the following format: - * num_multipath_feature_args [multipath_feature_args]* - * num_handler_status_args [handler_status_args]* - * num_groups init_group_number - * [A|D|E num_ps_status_args [ps_status_args]* - * num_paths num_selector_args - * [path_dev A|F fail_count [selector_args]* ]+ ]+ - * - * Table output has the following format (identical to the constructor string): - * num_feature_args [features_args]* - * num_handler_args hw_handler [hw_handler_args]* - * num_groups init_group_number - * [priority selector-name num_ps_args [ps_args]* - * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ - */ -static int multipath_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) -{ - int sz = 0; - unsigned long flags; - struct multipath *m = (struct multipath *) ti->private; - struct priority_group *pg; - struct pgpath *p; - unsigned pg_num; - char state; - - spin_lock_irqsave(&m->lock, flags); - - /* Features */ - if (type == STATUSTYPE_INFO) - DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); - else { - DMEMIT("%u ", m->queue_if_no_path + - (m->pg_init_retries > 0) * 2 + - (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); - if (m->queue_if_no_path) - DMEMIT("queue_if_no_path "); - if (m->pg_init_retries) - DMEMIT("pg_init_retries %u ", m->pg_init_retries); - if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) - DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); - } - - if (!m->hw_handler_name || type == STATUSTYPE_INFO) - DMEMIT("0 "); - else - DMEMIT("1 %s ", m->hw_handler_name); - - DMEMIT("%u ", m->nr_priority_groups); - - if (m->next_pg) - pg_num = m->next_pg->pg_num; - else if (m->current_pg) - pg_num = m->current_pg->pg_num; - else - pg_num = (m->nr_priority_groups ? 1 : 0); - - DMEMIT("%u ", pg_num); - - switch (type) { - case STATUSTYPE_INFO: - list_for_each_entry(pg, &m->priority_groups, list) { - if (pg->bypassed) - state = 'D'; /* Disabled */ - else if (pg == m->current_pg) - state = 'A'; /* Currently Active */ - else - state = 'E'; /* Enabled */ - - DMEMIT("%c ", state); - - if (pg->ps.type->status) - sz += pg->ps.type->status(&pg->ps, NULL, type, - result + sz, - maxlen - sz); - else - DMEMIT("0 "); - - DMEMIT("%u %u ", pg->nr_pgpaths, - pg->ps.type->info_args); - - list_for_each_entry(p, &pg->pgpaths, list) { - DMEMIT("%s %s %u ", p->path.dev->name, - p->is_active ? 
"A" : "F", - p->fail_count); - if (pg->ps.type->status) - sz += pg->ps.type->status(&pg->ps, - &p->path, type, result + sz, - maxlen - sz); - } - } - break; - - case STATUSTYPE_TABLE: - list_for_each_entry(pg, &m->priority_groups, list) { - DMEMIT("%s ", pg->ps.type->name); - - if (pg->ps.type->status) - sz += pg->ps.type->status(&pg->ps, NULL, type, - result + sz, - maxlen - sz); - else - DMEMIT("0 "); - - DMEMIT("%u %u ", pg->nr_pgpaths, - pg->ps.type->table_args); - - list_for_each_entry(p, &pg->pgpaths, list) { - DMEMIT("%s ", p->path.dev->name); - if (pg->ps.type->status) - sz += pg->ps.type->status(&pg->ps, - &p->path, type, result + sz, - maxlen - sz); - } - } - break; - } - - spin_unlock_irqrestore(&m->lock, flags); - - return 0; -} - -static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) -{ - int r = -EINVAL; - struct dm_dev *dev; - struct multipath *m = (struct multipath *) ti->private; - action_fn action; - - mutex_lock(&m->work_mutex); - - if (dm_suspended(ti)) { - r = -EBUSY; - goto out; - } - - if (argc == 1) { - if (!strcasecmp(argv[0], "queue_if_no_path")) { - r = queue_if_no_path(m, 1, 0); - goto out; - } else if (!strcasecmp(argv[0], "fail_if_no_path")) { - r = queue_if_no_path(m, 0, 0); - goto out; - } - } - - if (argc != 2) { - DMWARN("Unrecognised multipath message received."); - goto out; - } - - if (!strcasecmp(argv[0], "disable_group")) { - r = bypass_pg_num(m, argv[1], 1); - goto out; - } else if (!strcasecmp(argv[0], "enable_group")) { - r = bypass_pg_num(m, argv[1], 0); - goto out; - } else if (!strcasecmp(argv[0], "switch_group")) { - r = switch_pg_num(m, argv[1]); - goto out; - } else if (!strcasecmp(argv[0], "reinstate_path")) - action = reinstate_path; - else if (!strcasecmp(argv[0], "fail_path")) - action = fail_path; - else { - DMWARN("Unrecognised multipath message received."); - goto out; - } - - r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); - if (r) { - DMWARN("message: error getting device %s", - argv[1]); - goto out; - } - - r = action_dev(m, dev, action); - - dm_put_device(ti, dev); - -out: - mutex_unlock(&m->work_mutex); - return r; -} - -static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg) -{ - struct multipath *m = (struct multipath *) ti->private; - struct block_device *bdev = NULL; - fmode_t mode = 0; - unsigned long flags; - int r = 0; - - spin_lock_irqsave(&m->lock, flags); - - if (!m->current_pgpath) - __choose_pgpath(m, 0); - - if (m->current_pgpath) { - bdev = m->current_pgpath->path.dev->bdev; - mode = m->current_pgpath->path.dev->mode; - } - - if (m->queue_io) - r = -EAGAIN; - else if (!bdev) - r = -EIO; - - spin_unlock_irqrestore(&m->lock, flags); - - /* - * Only pass ioctls through if the device sizes match exactly. - */ - if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? 
: __blkdev_driver_ioctl(bdev, mode, cmd, arg); -} - -static int multipath_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct multipath *m = ti->private; - struct priority_group *pg; - struct pgpath *p; - int ret = 0; - - list_for_each_entry(pg, &m->priority_groups, list) { - list_for_each_entry(p, &pg->pgpaths, list) { - ret = fn(ti, p->path.dev, ti->begin, ti->len, data); - if (ret) - goto out; - } - } - -out: - return ret; -} - -static int __pgpath_busy(struct pgpath *pgpath) -{ - struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); - - return dm_underlying_device_busy(q); -} - -/* - * We return "busy", only when we can map I/Os but underlying devices - * are busy (so even if we map I/Os now, the I/Os will wait on - * the underlying queue). - * In other words, if we want to kill I/Os or queue them inside us - * due to map unavailability, we don't return "busy". Otherwise, - * dm core won't give us the I/Os and we can't do what we want. - */ -static int multipath_busy(struct dm_target *ti) -{ - int busy = 0, has_active = 0; - struct multipath *m = ti->private; - struct priority_group *pg; - struct pgpath *pgpath; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - - /* Guess which priority_group will be used at next mapping time */ - if (unlikely(!m->current_pgpath && m->next_pg)) - pg = m->next_pg; - else if (likely(m->current_pg)) - pg = m->current_pg; - else - /* - * We don't know which pg will be used at next mapping time. - * We don't call __choose_pgpath() here to avoid to trigger - * pg_init just by busy checking. - * So we don't know whether underlying devices we will be using - * at next mapping time are busy or not. Just try mapping. - */ - goto out; - - /* - * If there is one non-busy active path at least, the path selector - * will be able to select it. So we consider such a pg as not busy. - */ - busy = 1; - list_for_each_entry(pgpath, &pg->pgpaths, list) - if (pgpath->is_active) { - has_active = 1; - - if (!__pgpath_busy(pgpath)) { - busy = 0; - break; - } - } - - if (!has_active) - /* - * No active path in this pg, so this pg won't be used and - * the current_pg will be changed at next mapping time. - * We need to try mapping to determine it. 
- */ - busy = 0; - -out: - spin_unlock_irqrestore(&m->lock, flags); - - return busy; -} - -/*----------------------------------------------------------------- - * Module setup - *---------------------------------------------------------------*/ -static struct target_type multipath_target = { - .name = "multipath", - .version = {1, 3, 0}, - .module = THIS_MODULE, - .ctr = multipath_ctr, - .dtr = multipath_dtr, - .map_rq = multipath_map, - .rq_end_io = multipath_end_io, - .presuspend = multipath_presuspend, - .postsuspend = multipath_postsuspend, - .resume = multipath_resume, - .status = multipath_status, - .message = multipath_message, - .ioctl = multipath_ioctl, - .iterate_devices = multipath_iterate_devices, - .busy = multipath_busy, -}; - -static int __init dm_multipath_init(void) -{ - int r; - - /* allocate a slab for the dm_ios */ - _mpio_cache = KMEM_CACHE(dm_mpath_io, 0); - if (!_mpio_cache) - return -ENOMEM; - - r = dm_register_target(&multipath_target); - if (r < 0) { - DMERR("register failed %d", r); - kmem_cache_destroy(_mpio_cache); - return -EINVAL; - } - - kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); - if (!kmultipathd) { - DMERR("failed to create workqueue kmpathd"); - dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); - return -ENOMEM; - } - - /* - * A separate workqueue is used to handle the device handlers - * to avoid overloading existing workqueue. Overloading the - * old workqueue would also create a bottleneck in the - * path of the storage hardware device activation. - */ - kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", - WQ_MEM_RECLAIM); - if (!kmpath_handlerd) { - DMERR("failed to create workqueue kmpath_handlerd"); - destroy_workqueue(kmultipathd); - dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); - return -ENOMEM; - } - - DMINFO("version %u.%u.%u loaded", - multipath_target.version[0], multipath_target.version[1], - multipath_target.version[2]); - - return r; -} - -static void __exit dm_multipath_exit(void) -{ - destroy_workqueue(kmpath_handlerd); - destroy_workqueue(kmultipathd); - - dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); -} - -module_init(dm_multipath_init); -module_exit(dm_multipath_exit); - -MODULE_DESCRIPTION(DM_NAME " multipath target"); -MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-mpath.h b/ANDROID_3.4.5/drivers/md/dm-mpath.h deleted file mode 100644 index e230f719..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-mpath.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - * - * Multipath. - */ - -#ifndef DM_MPATH_H -#define DM_MPATH_H - -struct dm_dev; - -struct dm_path { - struct dm_dev *dev; /* Read-only */ - void *pscontext; /* For path-selector use */ -}; - -/* Callback for hwh_pg_init_fn to use when complete */ -void dm_pg_init_complete(struct dm_path *path, unsigned err_flags); - -#endif diff --git a/ANDROID_3.4.5/drivers/md/dm-path-selector.c b/ANDROID_3.4.5/drivers/md/dm-path-selector.c deleted file mode 100644 index fa0ccc58..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-path-selector.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software. - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. - * - * Module Author: Heinz Mauelshagen - * - * This file is released under the GPL. - * - * Path selector registration. 
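 *
 * (Editorial sketch, not part of the original source: a path selector
 *  module typically registers its path_selector_type from module init,
 *  e.g.
 *
 *      static int __init dm_example_ps_init(void)
 *      {
 *              return dm_register_path_selector(&example_ps);
 *      }
 *
 *  where "example_ps" is a hypothetical selector type; dm_ql_init() in
 *  dm-queue-length.c below is a real in-tree instance of this pattern.)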
- */ - -#include <linux/device-mapper.h> -#include <linux/module.h> - -#include "dm-path-selector.h" - -#include <linux/slab.h> - -struct ps_internal { - struct path_selector_type pst; - struct list_head list; -}; - -#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) - -static LIST_HEAD(_path_selectors); -static DECLARE_RWSEM(_ps_lock); - -static struct ps_internal *__find_path_selector_type(const char *name) -{ - struct ps_internal *psi; - - list_for_each_entry(psi, &_path_selectors, list) { - if (!strcmp(name, psi->pst.name)) - return psi; - } - - return NULL; -} - -static struct ps_internal *get_path_selector(const char *name) -{ - struct ps_internal *psi; - - down_read(&_ps_lock); - psi = __find_path_selector_type(name); - if (psi && !try_module_get(psi->pst.module)) - psi = NULL; - up_read(&_ps_lock); - - return psi; -} - -struct path_selector_type *dm_get_path_selector(const char *name) -{ - struct ps_internal *psi; - - if (!name) - return NULL; - - psi = get_path_selector(name); - if (!psi) { - request_module("dm-%s", name); - psi = get_path_selector(name); - } - - return psi ? &psi->pst : NULL; -} - -void dm_put_path_selector(struct path_selector_type *pst) -{ - struct ps_internal *psi; - - if (!pst) - return; - - down_read(&_ps_lock); - psi = __find_path_selector_type(pst->name); - if (!psi) - goto out; - - module_put(psi->pst.module); -out: - up_read(&_ps_lock); -} - -static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) -{ - struct ps_internal *psi = kzalloc(sizeof(*psi), GFP_KERNEL); - - if (psi) - psi->pst = *pst; - - return psi; -} - -int dm_register_path_selector(struct path_selector_type *pst) -{ - int r = 0; - struct ps_internal *psi = _alloc_path_selector(pst); - - if (!psi) - return -ENOMEM; - - down_write(&_ps_lock); - - if (__find_path_selector_type(pst->name)) { - kfree(psi); - r = -EEXIST; - } else - list_add(&psi->list, &_path_selectors); - - up_write(&_ps_lock); - - return r; -} - -int dm_unregister_path_selector(struct path_selector_type *pst) -{ - struct ps_internal *psi; - - down_write(&_ps_lock); - - psi = __find_path_selector_type(pst->name); - if (!psi) { - up_write(&_ps_lock); - return -EINVAL; - } - - list_del(&psi->list); - - up_write(&_ps_lock); - - kfree(psi); - - return 0; -} - -EXPORT_SYMBOL_GPL(dm_register_path_selector); -EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff --git a/ANDROID_3.4.5/drivers/md/dm-path-selector.h b/ANDROID_3.4.5/drivers/md/dm-path-selector.h deleted file mode 100644 index e7d1fa8b..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-path-selector.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software. - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. - * - * Module Author: Heinz Mauelshagen - * - * This file is released under the GPL. - * - * Path-Selector registration. - */ - -#ifndef DM_PATH_SELECTOR_H -#define DM_PATH_SELECTOR_H - -#include <linux/device-mapper.h> - -#include "dm-mpath.h" - -/* - * We provide an abstraction for the code that chooses which path - * to send some io down. 
- */ -struct path_selector_type; -struct path_selector { - struct path_selector_type *type; - void *context; -}; - -/* Information about a path selector type */ -struct path_selector_type { - char *name; - struct module *module; - - unsigned int table_args; - unsigned int info_args; - - /* - * Constructs a path selector object, takes custom arguments - */ - int (*create) (struct path_selector *ps, unsigned argc, char **argv); - void (*destroy) (struct path_selector *ps); - - /* - * Add an opaque path object, along with some selector specific - * path args (eg, path priority). - */ - int (*add_path) (struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error); - - /* - * Chooses a path for this io, if no paths are available then - * NULL will be returned. - * - * repeat_count is the number of times to use the path before - * calling the function again. 0 means don't call it again unless - * the path fails. - */ - struct dm_path *(*select_path) (struct path_selector *ps, - unsigned *repeat_count, - size_t nr_bytes); - - /* - * Notify the selector that a path has failed. - */ - void (*fail_path) (struct path_selector *ps, struct dm_path *p); - - /* - * Ask selector to reinstate a path. - */ - int (*reinstate_path) (struct path_selector *ps, struct dm_path *p); - - /* - * Table content based on parameters added in ps_add_path_fn - * or path selector status - */ - int (*status) (struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned int maxlen); - - int (*start_io) (struct path_selector *ps, struct dm_path *path, - size_t nr_bytes); - int (*end_io) (struct path_selector *ps, struct dm_path *path, - size_t nr_bytes); -}; - -/* Register a path selector */ -int dm_register_path_selector(struct path_selector_type *type); - -/* Unregister a path selector */ -int dm_unregister_path_selector(struct path_selector_type *type); - -/* Returns a registered path selector type */ -struct path_selector_type *dm_get_path_selector(const char *name); - -/* Releases a path selector */ -void dm_put_path_selector(struct path_selector_type *pst); - -#endif diff --git a/ANDROID_3.4.5/drivers/md/dm-queue-length.c b/ANDROID_3.4.5/drivers/md/dm-queue-length.c deleted file mode 100644 index 3941fae0..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-queue-length.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. - * Copyright (C) 2006-2009 NEC Corporation. - * - * dm-queue-length.c - * - * Module Author: Stefan Bader, IBM - * Modified by: Kiyoshi Ueda, NEC - * - * This file is released under the GPL. - * - * queue-length path selector - choose a path with the least number of - * in-flight I/Os. 
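 *
 * (Editorial example, not part of the original source: a multipath table
 *  line using this policy for one priority group of two paths might read
 *
 *      0 2097152 multipath 0 0 1 1 queue-length 0 2 1 8:16 128 8:32 128
 *
 *  i.e. no feature or hardware-handler arguments, one priority group,
 *  zero selector arguments, two paths with one per-path argument each
 *  (the repeat_count, 128). Device numbers and length are illustrative.)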
- */ - -#include "dm.h" -#include "dm-path-selector.h" - -#include <linux/slab.h> -#include <linux/ctype.h> -#include <linux/errno.h> -#include <linux/module.h> -#include <linux/atomic.h> - -#define DM_MSG_PREFIX "multipath queue-length" -#define QL_MIN_IO 128 -#define QL_VERSION "0.1.0" - -struct selector { - struct list_head valid_paths; - struct list_head failed_paths; -}; - -struct path_info { - struct list_head list; - struct dm_path *path; - unsigned repeat_count; - atomic_t qlen; /* the number of in-flight I/Os */ -}; - -static struct selector *alloc_selector(void) -{ - struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->failed_paths); - } - - return s; -} - -static int ql_create(struct path_selector *ps, unsigned argc, char **argv) -{ - struct selector *s = alloc_selector(); - - if (!s) - return -ENOMEM; - - ps->context = s; - return 0; -} - -static void ql_free_paths(struct list_head *paths) -{ - struct path_info *pi, *next; - - list_for_each_entry_safe(pi, next, paths, list) { - list_del(&pi->list); - kfree(pi); - } -} - -static void ql_destroy(struct path_selector *ps) -{ - struct selector *s = ps->context; - - ql_free_paths(&s->valid_paths); - ql_free_paths(&s->failed_paths); - kfree(s); - ps->context = NULL; -} - -static int ql_status(struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned maxlen) -{ - unsigned sz = 0; - struct path_info *pi; - - /* When called with NULL path, return selector status/args. */ - if (!path) - DMEMIT("0 "); - else { - pi = path->pscontext; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%d ", atomic_read(&pi->qlen)); - break; - case STATUSTYPE_TABLE: - DMEMIT("%u ", pi->repeat_count); - break; - } - } - - return sz; -} - -static int ql_add_path(struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error) -{ - struct selector *s = ps->context; - struct path_info *pi; - unsigned repeat_count = QL_MIN_IO; - char dummy; - - /* - * Arguments: [<repeat_count>] - * <repeat_count>: The number of I/Os before switching path. - * If not given, default (QL_MIN_IO) is used. 
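 *
 * (Editorial note: in the multipath table this argument follows the path
 *  device, so "... 2 1 8:16 10 8:32 10" gives each path a repeat_count of
 *  10, while "... 2 0 8:16 8:32" leaves it at QL_MIN_IO, i.e. 128.)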
- */ - if (argc > 1) { - *error = "queue-length ps: incorrect number of arguments"; - return -EINVAL; - } - - if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { - *error = "queue-length ps: invalid repeat count"; - return -EINVAL; - } - - /* Allocate the path information structure */ - pi = kmalloc(sizeof(*pi), GFP_KERNEL); - if (!pi) { - *error = "queue-length ps: Error allocating path information"; - return -ENOMEM; - } - - pi->path = path; - pi->repeat_count = repeat_count; - atomic_set(&pi->qlen, 0); - - path->pscontext = pi; - - list_add_tail(&pi->list, &s->valid_paths); - - return 0; -} - -static void ql_fail_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - - list_move(&pi->list, &s->failed_paths); -} - -static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - - list_move_tail(&pi->list, &s->valid_paths); - - return 0; -} - -/* - * Select a path having the minimum number of in-flight I/Os - */ -static struct dm_path *ql_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) -{ - struct selector *s = ps->context; - struct path_info *pi = NULL, *best = NULL; - - if (list_empty(&s->valid_paths)) - return NULL; - - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - - list_for_each_entry(pi, &s->valid_paths, list) { - if (!best || - (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) - best = pi; - - if (!atomic_read(&best->qlen)) - break; - } - - if (!best) - return NULL; - - *repeat_count = best->repeat_count; - - return best->path; -} - -static int ql_start_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) -{ - struct path_info *pi = path->pscontext; - - atomic_inc(&pi->qlen); - - return 0; -} - -static int ql_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) -{ - struct path_info *pi = path->pscontext; - - atomic_dec(&pi->qlen); - - return 0; -} - -static struct path_selector_type ql_ps = { - .name = "queue-length", - .module = THIS_MODULE, - .table_args = 1, - .info_args = 1, - .create = ql_create, - .destroy = ql_destroy, - .status = ql_status, - .add_path = ql_add_path, - .fail_path = ql_fail_path, - .reinstate_path = ql_reinstate_path, - .select_path = ql_select_path, - .start_io = ql_start_io, - .end_io = ql_end_io, -}; - -static int __init dm_ql_init(void) -{ - int r = dm_register_path_selector(&ql_ps); - - if (r < 0) - DMERR("register failed %d", r); - - DMINFO("version " QL_VERSION " loaded"); - - return r; -} - -static void __exit dm_ql_exit(void) -{ - int r = dm_unregister_path_selector(&ql_ps); - - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(dm_ql_init); -module_exit(dm_ql_exit); - -MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>"); -MODULE_DESCRIPTION( - "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" - DM_NAME " path selector to balance the number of in-flight I/Os" -); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-raid.c b/ANDROID_3.4.5/drivers/md/dm-raid.c deleted file mode 100644 index 68965e66..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-raid.c +++ /dev/null @@ -1,1297 +0,0 @@ -/* - * Copyright (C) 2010-2011 Neil Brown - * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. 
- */ - -#include <linux/slab.h> -#include <linux/module.h> - -#include "md.h" -#include "raid1.h" -#include "raid5.h" -#include "bitmap.h" - -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "raid" - -/* - * The following flags are used by dm-raid.c to set up the array state. - * They must be cleared before md_run is called. - */ -#define FirstUse 10 /* rdev flag */ - -struct raid_dev { - /* - * Two DM devices, one to hold metadata and one to hold the - * actual data/parity. The reason for this is to not confuse - * ti->len and give more flexibility in altering size and - * characteristics. - * - * While it is possible for this device to be associated - * with a different physical device than the data_dev, it - * is intended for it to be the same. - * |--------- Physical Device ---------| - * |- meta_dev -|------ data_dev ------| - */ - struct dm_dev *meta_dev; - struct dm_dev *data_dev; - struct md_rdev rdev; -}; - -/* - * Flags for rs->print_flags field. - */ -#define DMPF_SYNC 0x1 -#define DMPF_NOSYNC 0x2 -#define DMPF_REBUILD 0x4 -#define DMPF_DAEMON_SLEEP 0x8 -#define DMPF_MIN_RECOVERY_RATE 0x10 -#define DMPF_MAX_RECOVERY_RATE 0x20 -#define DMPF_MAX_WRITE_BEHIND 0x40 -#define DMPF_STRIPE_CACHE 0x80 -#define DMPF_REGION_SIZE 0X100 -struct raid_set { - struct dm_target *ti; - - uint32_t bitmap_loaded; - uint32_t print_flags; - - struct mddev md; - struct raid_type *raid_type; - struct dm_target_callbacks callbacks; - - struct raid_dev dev[0]; -}; - -/* Supported raid types and properties. */ -static struct raid_type { - const char *name; /* RAID algorithm. */ - const char *descr; /* Descriptor text for logging. */ - const unsigned parity_devs; /* # of parity devices. */ - const unsigned minimal_devs; /* minimal # of devices in set. */ - const unsigned level; /* RAID level. */ - const unsigned algorithm; /* RAID algorithm. 
*/ -} raid_types[] = { - {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, - {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, - {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, - {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, - {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, - {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, - {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, - {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, - {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} -}; - -static struct raid_type *get_raid_type(char *name) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(raid_types); i++) - if (!strcmp(raid_types[i].name, name)) - return &raid_types[i]; - - return NULL; -} - -static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) -{ - unsigned i; - struct raid_set *rs; - sector_t sectors_per_dev; - - if (raid_devs <= raid_type->parity_devs) { - ti->error = "Insufficient number of devices"; - return ERR_PTR(-EINVAL); - } - - sectors_per_dev = ti->len; - if ((raid_type->level > 1) && - sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { - ti->error = "Target length not divisible by number of data devices"; - return ERR_PTR(-EINVAL); - } - - rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); - if (!rs) { - ti->error = "Cannot allocate raid context"; - return ERR_PTR(-ENOMEM); - } - - mddev_init(&rs->md); - - rs->ti = ti; - rs->raid_type = raid_type; - rs->md.raid_disks = raid_devs; - rs->md.level = raid_type->level; - rs->md.new_level = rs->md.level; - rs->md.dev_sectors = sectors_per_dev; - rs->md.layout = raid_type->algorithm; - rs->md.new_layout = rs->md.layout; - rs->md.delta_disks = 0; - rs->md.recovery_cp = 0; - - for (i = 0; i < raid_devs; i++) - md_rdev_init(&rs->dev[i].rdev); - - /* - * Remaining items to be initialized by further RAID params: - * rs->md.persistent - * rs->md.external - * rs->md.chunk_sectors - * rs->md.new_chunk_sectors - */ - - return rs; -} - -static void context_free(struct raid_set *rs) -{ - int i; - - for (i = 0; i < rs->md.raid_disks; i++) { - if (rs->dev[i].meta_dev) - dm_put_device(rs->ti, rs->dev[i].meta_dev); - if (rs->dev[i].rdev.sb_page) - put_page(rs->dev[i].rdev.sb_page); - rs->dev[i].rdev.sb_page = NULL; - rs->dev[i].rdev.sb_loaded = 0; - if (rs->dev[i].data_dev) - dm_put_device(rs->ti, rs->dev[i].data_dev); - } - - kfree(rs); -} - -/* - * For every device we have two words - * <meta_dev>: meta device name or '-' if missing - * <data_dev>: data device name or '-' if missing - * - * The following are permitted: - * - - - * - <data_dev> - * <meta_dev> <data_dev> - * - * The following is not allowed: - * <meta_dev> - - * - * This code parses those words. If there is a failure, - * the caller must use context_free to unwind the operations. - */ -static int dev_parms(struct raid_set *rs, char **argv) -{ - int i; - int rebuild = 0; - int metadata_available = 0; - int ret = 0; - - for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { - rs->dev[i].rdev.raid_disk = i; - - rs->dev[i].meta_dev = NULL; - rs->dev[i].data_dev = NULL; - - /* - * There are no offsets, since there is a separate device - * for data and metadata. 
- */ - rs->dev[i].rdev.data_offset = 0; - rs->dev[i].rdev.mddev = &rs->md; - - if (strcmp(argv[0], "-")) { - ret = dm_get_device(rs->ti, argv[0], - dm_table_get_mode(rs->ti->table), - &rs->dev[i].meta_dev); - rs->ti->error = "RAID metadata device lookup failure"; - if (ret) - return ret; - - rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); - if (!rs->dev[i].rdev.sb_page) - return -ENOMEM; - } - - if (!strcmp(argv[1], "-")) { - if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && - (!rs->dev[i].rdev.recovery_offset)) { - rs->ti->error = "Drive designated for rebuild not specified"; - return -EINVAL; - } - - rs->ti->error = "No data device supplied with metadata device"; - if (rs->dev[i].meta_dev) - return -EINVAL; - - continue; - } - - ret = dm_get_device(rs->ti, argv[1], - dm_table_get_mode(rs->ti->table), - &rs->dev[i].data_dev); - if (ret) { - rs->ti->error = "RAID device lookup failure"; - return ret; - } - - if (rs->dev[i].meta_dev) { - metadata_available = 1; - rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; - } - rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; - list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); - if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) - rebuild++; - } - - if (metadata_available) { - rs->md.external = 0; - rs->md.persistent = 1; - rs->md.major_version = 2; - } else if (rebuild && !rs->md.recovery_cp) { - /* - * Without metadata, we will not be able to tell if the array - * is in-sync or not - we must assume it is not. Therefore, - * it is impossible to rebuild a drive. - * - * Even if there is metadata, the on-disk information may - * indicate that the array is not in-sync and it will then - * fail at that time. - * - * User could specify 'nosync' option if desperate. - */ - DMERR("Unable to rebuild drive while array is not in-sync"); - rs->ti->error = "RAID device lookup failure"; - return -EINVAL; - } - - return 0; -} - -/* - * validate_region_size - * @rs - * @region_size: region size in sectors. If 0, pick a size (4MiB default). - * - * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). - * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. - * - * Returns: 0 on success, -EINVAL on failure. - */ -static int validate_region_size(struct raid_set *rs, unsigned long region_size) -{ - unsigned long min_region_size = rs->ti->len / (1 << 21); - - if (!region_size) { - /* - * Choose a reasonable default. All figures in sectors. - */ - if (min_region_size > (1 << 13)) { - DMINFO("Choosing default region size of %lu sectors", - region_size); - region_size = min_region_size; - } else { - DMINFO("Choosing default region size of 4MiB"); - region_size = 1 << 13; /* sectors */ - } - } else { - /* - * Validate user-supplied value. - */ - if (region_size > rs->ti->len) { - rs->ti->error = "Supplied region size is too large"; - return -EINVAL; - } - - if (region_size < min_region_size) { - DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", - region_size, min_region_size); - rs->ti->error = "Supplied region size is too small"; - return -EINVAL; - } - - if (!is_power_of_2(region_size)) { - rs->ti->error = "Region size is not a power of 2"; - return -EINVAL; - } - - if (region_size < rs->md.chunk_sectors) { - rs->ti->error = "Region size is smaller than the chunk size"; - return -EINVAL; - } - } - - /* - * Convert sectors to bytes. - */ - rs->md.bitmap_info.chunksize = (region_size << 9); - - return 0; -} - -/* - * Possible arguments are... 
- * <chunk_size> [optional_args] - * - * Argument definitions - * <chunk_size> The number of sectors per disk that - * will form the "stripe" - * [[no]sync] Force or prevent recovery of the - * entire array - * [rebuild <idx>] Rebuild the drive indicated by the index - * [daemon_sleep <ms>] Time between bitmap daemon work to - * clear bits - * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization - * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization - * [write_mostly <idx>] Indicate a write mostly drive via index - * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) - * [stripe_cache <sectors>] Stripe cache size for higher RAIDs - * [region_size <sectors>] Defines granularity of bitmap - */ -static int parse_raid_params(struct raid_set *rs, char **argv, - unsigned num_raid_params) -{ - unsigned i, rebuild_cnt = 0; - unsigned long value, region_size = 0; - char *key; - - /* - * First, parse the in-order required arguments - * "chunk_size" is the only argument of this type. - */ - if ((strict_strtoul(argv[0], 10, &value) < 0)) { - rs->ti->error = "Bad chunk size"; - return -EINVAL; - } else if (rs->raid_type->level == 1) { - if (value) - DMERR("Ignoring chunk size parameter for RAID 1"); - value = 0; - } else if (!is_power_of_2(value)) { - rs->ti->error = "Chunk size must be a power of 2"; - return -EINVAL; - } else if (value < 8) { - rs->ti->error = "Chunk size value is too small"; - return -EINVAL; - } - - rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; - argv++; - num_raid_params--; - - /* - * We set each individual device as In_sync with a completed - * 'recovery_offset'. If there has been a device failure or - * replacement then one of the following cases applies: - * - * 1) User specifies 'rebuild'. - * - Device is reset when param is read. - * 2) A new device is supplied. - * - No matching superblock found, resets device. - * 3) Device failure was transient and returns on reload. - * - Failure noticed, resets device for bitmap replay. - * 4) Device hadn't completed recovery after previous failure. - * - Superblock is read and overrides recovery_offset. - * - * What is found in the superblocks of the devices is always - * authoritative, unless 'rebuild' or '[no]sync' was specified. 
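 *
 * (Editorial example, not part of the original source: a RAID1 table that
 *  marks the second mirror for rebuild could look like
 *
 *      0 3907584 raid raid1 3 0 rebuild 1 2 8:16 8:17 8:32 8:33
 *
 *  i.e. three raid parameters (the mandatory chunk_size, forced to 0 for
 *  RAID1, plus "rebuild 1") followed by two <meta_dev> <data_dev> pairs.
 *  Device numbers and length are illustrative; note that dev_parms()
 *  above rejects a rebuild without metadata devices unless 'nosync' is
 *  also given.)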
- */ - for (i = 0; i < rs->md.raid_disks; i++) { - set_bit(In_sync, &rs->dev[i].rdev.flags); - rs->dev[i].rdev.recovery_offset = MaxSector; - } - - /* - * Second, parse the unordered optional arguments - */ - for (i = 0; i < num_raid_params; i++) { - if (!strcasecmp(argv[i], "nosync")) { - rs->md.recovery_cp = MaxSector; - rs->print_flags |= DMPF_NOSYNC; - continue; - } - if (!strcasecmp(argv[i], "sync")) { - rs->md.recovery_cp = 0; - rs->print_flags |= DMPF_SYNC; - continue; - } - - /* The rest of the optional arguments come in key/value pairs */ - if ((i + 1) >= num_raid_params) { - rs->ti->error = "Wrong number of raid parameters given"; - return -EINVAL; - } - - key = argv[i++]; - if (strict_strtoul(argv[i], 10, &value) < 0) { - rs->ti->error = "Bad numerical argument given in raid params"; - return -EINVAL; - } - - if (!strcasecmp(key, "rebuild")) { - rebuild_cnt++; - if (((rs->raid_type->level != 1) && - (rebuild_cnt > rs->raid_type->parity_devs)) || - ((rs->raid_type->level == 1) && - (rebuild_cnt > (rs->md.raid_disks - 1)))) { - rs->ti->error = "Too many rebuild devices specified for given RAID type"; - return -EINVAL; - } - if (value > rs->md.raid_disks) { - rs->ti->error = "Invalid rebuild index given"; - return -EINVAL; - } - clear_bit(In_sync, &rs->dev[value].rdev.flags); - rs->dev[value].rdev.recovery_offset = 0; - rs->print_flags |= DMPF_REBUILD; - } else if (!strcasecmp(key, "write_mostly")) { - if (rs->raid_type->level != 1) { - rs->ti->error = "write_mostly option is only valid for RAID1"; - return -EINVAL; - } - if (value >= rs->md.raid_disks) { - rs->ti->error = "Invalid write_mostly drive index given"; - return -EINVAL; - } - set_bit(WriteMostly, &rs->dev[value].rdev.flags); - } else if (!strcasecmp(key, "max_write_behind")) { - if (rs->raid_type->level != 1) { - rs->ti->error = "max_write_behind option is only valid for RAID1"; - return -EINVAL; - } - rs->print_flags |= DMPF_MAX_WRITE_BEHIND; - - /* - * In device-mapper, we specify things in sectors, but - * MD records this value in kB - */ - value /= 2; - if (value > COUNTER_MAX) { - rs->ti->error = "Max write-behind limit out of range"; - return -EINVAL; - } - rs->md.bitmap_info.max_write_behind = value; - } else if (!strcasecmp(key, "daemon_sleep")) { - rs->print_flags |= DMPF_DAEMON_SLEEP; - if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { - rs->ti->error = "daemon sleep period out of range"; - return -EINVAL; - } - rs->md.bitmap_info.daemon_sleep = value; - } else if (!strcasecmp(key, "stripe_cache")) { - rs->print_flags |= DMPF_STRIPE_CACHE; - - /* - * In device-mapper, we specify things in sectors, but - * MD records this value in kB - */ - value /= 2; - - if (rs->raid_type->level < 5) { - rs->ti->error = "Inappropriate argument: stripe_cache"; - return -EINVAL; - } - if (raid5_set_cache_size(&rs->md, (int)value)) { - rs->ti->error = "Bad stripe_cache size"; - return -EINVAL; - } - } else if (!strcasecmp(key, "min_recovery_rate")) { - rs->print_flags |= DMPF_MIN_RECOVERY_RATE; - if (value > INT_MAX) { - rs->ti->error = "min_recovery_rate out of range"; - return -EINVAL; - } - rs->md.sync_speed_min = (int)value; - } else if (!strcasecmp(key, "max_recovery_rate")) { - rs->print_flags |= DMPF_MAX_RECOVERY_RATE; - if (value > INT_MAX) { - rs->ti->error = "max_recovery_rate out of range"; - return -EINVAL; - } - rs->md.sync_speed_max = (int)value; - } else if (!strcasecmp(key, "region_size")) { - rs->print_flags |= DMPF_REGION_SIZE; - region_size = value; - } else { - DMERR("Unable to parse RAID parameter: %s", key); 
- rs->ti->error = "Unable to parse RAID parameters"; - return -EINVAL; - } - } - - if (validate_region_size(rs, region_size)) - return -EINVAL; - - if (rs->md.chunk_sectors) - rs->ti->split_io = rs->md.chunk_sectors; - else - rs->ti->split_io = region_size; - - if (rs->md.chunk_sectors) - rs->ti->split_io = rs->md.chunk_sectors; - else - rs->ti->split_io = region_size; - - /* Assume there are no metadata devices until the drives are parsed */ - rs->md.persistent = 0; - rs->md.external = 1; - - return 0; -} - -static void do_table_event(struct work_struct *ws) -{ - struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); - - dm_table_event(rs->ti->table); -} - -static int raid_is_congested(struct dm_target_callbacks *cb, int bits) -{ - struct raid_set *rs = container_of(cb, struct raid_set, callbacks); - - if (rs->raid_type->level == 1) - return md_raid1_congested(&rs->md, bits); - - return md_raid5_congested(&rs->md, bits); -} - -/* - * This structure is never routinely used by userspace, unlike md superblocks. - * Devices with this superblock should only ever be accessed via device-mapper. - */ -#define DM_RAID_MAGIC 0x64526D44 -struct dm_raid_superblock { - __le32 magic; /* "DmRd" */ - __le32 features; /* Used to indicate possible future changes */ - - __le32 num_devices; /* Number of devices in this array. (Max 64) */ - __le32 array_position; /* The position of this drive in the array */ - - __le64 events; /* Incremented by md when superblock updated */ - __le64 failed_devices; /* Bit field of devices to indicate failures */ - - /* - * This offset tracks the progress of the repair or replacement of - * an individual drive. - */ - __le64 disk_recovery_offset; - - /* - * This offset tracks the progress of the initial array - * synchronisation/parity calculation. - */ - __le64 array_resync_offset; - - /* - * RAID characteristics - */ - __le32 level; - __le32 layout; - __le32 stripe_sectors; - - __u8 pad[452]; /* Round struct to 512 bytes. */ - /* Always set to 0 when writing. 
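 *
 * (Editorial note: the fixed fields above occupy 4*4 + 2*8 + 2*8 + 3*4 =
 *  60 bytes, so the 452-byte pad rounds the structure to exactly one
 *  512-byte sector, matching rdev->sb_size = sizeof(*sb) in super_load()
 *  below.)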
*/ -} __packed; - -static int read_disk_sb(struct md_rdev *rdev, int size) -{ - BUG_ON(!rdev->sb_page); - - if (rdev->sb_loaded) - return 0; - - if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { - DMERR("Failed to read superblock of device at position %d", - rdev->raid_disk); - set_bit(Faulty, &rdev->flags); - return -EINVAL; - } - - rdev->sb_loaded = 1; - - return 0; -} - -static void super_sync(struct mddev *mddev, struct md_rdev *rdev) -{ - struct md_rdev *r; - uint64_t failed_devices; - struct dm_raid_superblock *sb; - - sb = page_address(rdev->sb_page); - failed_devices = le64_to_cpu(sb->failed_devices); - - rdev_for_each(r, mddev) - if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) - failed_devices |= (1ULL << r->raid_disk); - - memset(sb, 0, sizeof(*sb)); - - sb->magic = cpu_to_le32(DM_RAID_MAGIC); - sb->features = cpu_to_le32(0); /* No features yet */ - - sb->num_devices = cpu_to_le32(mddev->raid_disks); - sb->array_position = cpu_to_le32(rdev->raid_disk); - - sb->events = cpu_to_le64(mddev->events); - sb->failed_devices = cpu_to_le64(failed_devices); - - sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); - sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); - - sb->level = cpu_to_le32(mddev->level); - sb->layout = cpu_to_le32(mddev->layout); - sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); -} - -/* - * super_load - * - * This function creates a superblock if one is not found on the device - * and will decide which superblock to use if there's a choice. - * - * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise - */ -static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) -{ - int ret; - struct dm_raid_superblock *sb; - struct dm_raid_superblock *refsb; - uint64_t events_sb, events_refsb; - - rdev->sb_start = 0; - rdev->sb_size = sizeof(*sb); - - ret = read_disk_sb(rdev, rdev->sb_size); - if (ret) - return ret; - - sb = page_address(rdev->sb_page); - - /* - * Two cases that we want to write new superblocks and rebuild: - * 1) New device (no matching magic number) - * 2) Device specified for rebuild (!In_sync w/ offset == 0) - */ - if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) || - (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) { - super_sync(rdev->mddev, rdev); - - set_bit(FirstUse, &rdev->flags); - - /* Force writing of superblocks to disk */ - set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); - - /* Any superblock is better than none, choose that if given */ - return refdev ? 0 : 1; - } - - if (!refdev) - return 1; - - events_sb = le64_to_cpu(sb->events); - - refsb = page_address(refdev->sb_page); - events_refsb = le64_to_cpu(refsb->events); - - return (events_sb > events_refsb) ? 1 : 0; -} - -static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) -{ - int role; - struct raid_set *rs = container_of(mddev, struct raid_set, md); - uint64_t events_sb; - uint64_t failed_devices; - struct dm_raid_superblock *sb; - uint32_t new_devs = 0; - uint32_t rebuilds = 0; - struct md_rdev *r; - struct dm_raid_superblock *sb2; - - sb = page_address(rdev->sb_page); - events_sb = le64_to_cpu(sb->events); - failed_devices = le64_to_cpu(sb->failed_devices); - - /* - * Initialise to 1 if this is a new superblock. - */ - mddev->events = events_sb ? 
: 1; - - /* - * Reshaping is not currently allowed - */ - if ((le32_to_cpu(sb->level) != mddev->level) || - (le32_to_cpu(sb->layout) != mddev->layout) || - (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { - DMERR("Reshaping arrays not yet supported."); - return -EINVAL; - } - - /* We can only change the number of devices in RAID1 right now */ - if ((rs->raid_type->level != 1) && - (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { - DMERR("Reshaping arrays not yet supported."); - return -EINVAL; - } - - if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) - mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); - - /* - * During load, we set FirstUse if a new superblock was written. - * There are two reasons we might not have a superblock: - * 1) The array is brand new - in which case, all of the - * devices must have their In_sync bit set. Also, - * recovery_cp must be 0, unless forced. - * 2) This is a new device being added to an old array - * and the new device needs to be rebuilt - in which - * case the In_sync bit will /not/ be set and - * recovery_cp must be MaxSector. - */ - rdev_for_each(r, mddev) { - if (!test_bit(In_sync, &r->flags)) { - DMINFO("Device %d specified for rebuild: " - "Clearing superblock", r->raid_disk); - rebuilds++; - } else if (test_bit(FirstUse, &r->flags)) - new_devs++; - } - - if (!rebuilds) { - if (new_devs == mddev->raid_disks) { - DMINFO("Superblocks created for new array"); - set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); - } else if (new_devs) { - DMERR("New device injected " - "into existing array without 'rebuild' " - "parameter specified"); - return -EINVAL; - } - } else if (new_devs) { - DMERR("'rebuild' devices cannot be " - "injected into an array with other first-time devices"); - return -EINVAL; - } else if (mddev->recovery_cp != MaxSector) { - DMERR("'rebuild' specified while array is not in-sync"); - return -EINVAL; - } - - /* - * Now we set the Faulty bit for those devices that are - * recorded in the superblock as failed. - */ - rdev_for_each(r, mddev) { - if (!r->sb_page) - continue; - sb2 = page_address(r->sb_page); - sb2->failed_devices = 0; - - /* - * Check for any device re-ordering. - */ - if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { - role = le32_to_cpu(sb2->array_position); - if (role != r->raid_disk) { - if (rs->raid_type->level != 1) { - rs->ti->error = "Cannot change device " - "positions in RAID array"; - return -EINVAL; - } - DMINFO("RAID1 device #%d now at position #%d", - role, r->raid_disk); - } - - /* - * Partial recovery is performed on - * returning failed devices. - */ - if (failed_devices & (1 << role)) - set_bit(Faulty, &r->flags); - } - } - - return 0; -} - -static int super_validate(struct mddev *mddev, struct md_rdev *rdev) -{ - struct dm_raid_superblock *sb = page_address(rdev->sb_page); - - /* - * If mddev->events is not set, we know we have not yet initialized - * the array. - */ - if (!mddev->events && super_init_validation(mddev, rdev)) - return -EINVAL; - - mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ - rdev->mddev->bitmap_info.default_offset = 4096 >> 9; - if (!test_bit(FirstUse, &rdev->flags)) { - rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); - if (rdev->recovery_offset != MaxSector) - clear_bit(In_sync, &rdev->flags); - } - - /* - * If a device comes back, set it as not In_sync and no longer faulty. 
- */ - if (test_bit(Faulty, &rdev->flags)) { - clear_bit(Faulty, &rdev->flags); - clear_bit(In_sync, &rdev->flags); - rdev->saved_raid_disk = rdev->raid_disk; - rdev->recovery_offset = 0; - } - - clear_bit(FirstUse, &rdev->flags); - - return 0; -} - -/* - * Analyse superblocks and select the freshest. - */ -static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) -{ - int ret; - unsigned redundancy = 0; - struct raid_dev *dev; - struct md_rdev *rdev, *tmp, *freshest; - struct mddev *mddev = &rs->md; - - switch (rs->raid_type->level) { - case 1: - redundancy = rs->md.raid_disks - 1; - break; - case 4: - case 5: - case 6: - redundancy = rs->raid_type->parity_devs; - break; - default: - ti->error = "Unknown RAID type"; - return -EINVAL; - } - - freshest = NULL; - rdev_for_each_safe(rdev, tmp, mddev) { - if (!rdev->meta_bdev) - continue; - - ret = super_load(rdev, freshest); - - switch (ret) { - case 1: - freshest = rdev; - break; - case 0: - break; - default: - dev = container_of(rdev, struct raid_dev, rdev); - if (redundancy--) { - if (dev->meta_dev) - dm_put_device(ti, dev->meta_dev); - - dev->meta_dev = NULL; - rdev->meta_bdev = NULL; - - if (rdev->sb_page) - put_page(rdev->sb_page); - - rdev->sb_page = NULL; - - rdev->sb_loaded = 0; - - /* - * We might be able to salvage the data device - * even though the meta device has failed. For - * now, we behave as though '- -' had been - * set for this device in the table. - */ - if (dev->data_dev) - dm_put_device(ti, dev->data_dev); - - dev->data_dev = NULL; - rdev->bdev = NULL; - - list_del(&rdev->same_set); - - continue; - } - ti->error = "Failed to load superblock"; - return ret; - } - } - - if (!freshest) - return 0; - - /* - * Validation of the freshest device provides the source of - * validation for the remaining devices. - */ - ti->error = "Unable to assemble array: Invalid superblocks"; - if (super_validate(mddev, freshest)) - return -EINVAL; - - rdev_for_each(rdev, mddev) - if ((rdev != freshest) && super_validate(mddev, rdev)) - return -EINVAL; - - return 0; -} - -/* - * Construct a RAID4/5/6 mapping: - * Args: - * <raid_type> <#raid_params> <raid_params> \ - * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } - * - * <raid_params> varies by <raid_type>. See 'parse_raid_params' for - * details on possible <raid_params>. 
- */ -static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) -{ - int ret; - struct raid_type *rt; - unsigned long num_raid_params, num_raid_devs; - struct raid_set *rs = NULL; - - /* Must have at least <raid_type> <#raid_params> */ - if (argc < 2) { - ti->error = "Too few arguments"; - return -EINVAL; - } - - /* raid type */ - rt = get_raid_type(argv[0]); - if (!rt) { - ti->error = "Unrecognised raid_type"; - return -EINVAL; - } - argc--; - argv++; - - /* number of RAID parameters */ - if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { - ti->error = "Cannot understand number of RAID parameters"; - return -EINVAL; - } - argc--; - argv++; - - /* Skip over RAID params for now and find out # of devices */ - if (num_raid_params + 1 > argc) { - ti->error = "Arguments do not agree with counts given"; - return -EINVAL; - } - - if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || - (num_raid_devs >= INT_MAX)) { - ti->error = "Cannot understand number of raid devices"; - return -EINVAL; - } - - rs = context_alloc(ti, rt, (unsigned)num_raid_devs); - if (IS_ERR(rs)) - return PTR_ERR(rs); - - ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); - if (ret) - goto bad; - - ret = -EINVAL; - - argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ - argv += num_raid_params + 1; - - if (argc != (num_raid_devs * 2)) { - ti->error = "Supplied RAID devices does not match the count given"; - goto bad; - } - - ret = dev_parms(rs, argv); - if (ret) - goto bad; - - rs->md.sync_super = super_sync; - ret = analyse_superblocks(ti, rs); - if (ret) - goto bad; - - INIT_WORK(&rs->md.event_work, do_table_event); - ti->private = rs; - ti->num_flush_requests = 1; - - mutex_lock(&rs->md.reconfig_mutex); - ret = md_run(&rs->md); - rs->md.in_sync = 0; /* Assume already marked dirty */ - mutex_unlock(&rs->md.reconfig_mutex); - - if (ret) { - ti->error = "Fail to run raid array"; - goto bad; - } - - rs->callbacks.congested_fn = raid_is_congested; - dm_table_add_target_callbacks(ti->table, &rs->callbacks); - - mddev_suspend(&rs->md); - return 0; - -bad: - context_free(rs); - - return ret; -} - -static void raid_dtr(struct dm_target *ti) -{ - struct raid_set *rs = ti->private; - - list_del_init(&rs->callbacks.list); - md_stop(&rs->md); - context_free(rs); -} - -static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) -{ - struct raid_set *rs = ti->private; - struct mddev *mddev = &rs->md; - - mddev->pers->make_request(mddev, bio); - - return DM_MAPIO_SUBMITTED; -} - -static int raid_status(struct dm_target *ti, status_type_t type, - char *result, unsigned maxlen) -{ - struct raid_set *rs = ti->private; - unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ - unsigned sz = 0; - int i, array_in_sync = 0; - sector_t sync; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); - - if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) - sync = rs->md.curr_resync_completed; - else - sync = rs->md.recovery_cp; - - if (sync >= rs->md.resync_max_sectors) { - array_in_sync = 1; - sync = rs->md.resync_max_sectors; - } else { - /* - * The array may be doing an initial sync, or it may - * be rebuilding individual components. If all the - * devices are In_sync, then it is the array that is - * being initialized. 
- */ - for (i = 0; i < rs->md.raid_disks; i++) - if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) - array_in_sync = 1; - } - /* - * Status characters: - * 'D' = Dead/Failed device - * 'a' = Alive but not in-sync - * 'A' = Alive and in-sync - */ - for (i = 0; i < rs->md.raid_disks; i++) { - if (test_bit(Faulty, &rs->dev[i].rdev.flags)) - DMEMIT("D"); - else if (!array_in_sync || - !test_bit(In_sync, &rs->dev[i].rdev.flags)) - DMEMIT("a"); - else - DMEMIT("A"); - } - - /* - * In-sync ratio: - * The in-sync ratio shows the progress of: - * - Initializing the array - * - Rebuilding a subset of devices of the array - * The user can distinguish between the two by referring - * to the status characters. - */ - DMEMIT(" %llu/%llu", - (unsigned long long) sync, - (unsigned long long) rs->md.resync_max_sectors); - - break; - case STATUSTYPE_TABLE: - /* The string you would use to construct this array */ - for (i = 0; i < rs->md.raid_disks; i++) { - if ((rs->print_flags & DMPF_REBUILD) && - rs->dev[i].data_dev && - !test_bit(In_sync, &rs->dev[i].rdev.flags)) - raid_param_cnt += 2; /* for rebuilds */ - if (rs->dev[i].data_dev && - test_bit(WriteMostly, &rs->dev[i].rdev.flags)) - raid_param_cnt += 2; - } - - raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2); - if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) - raid_param_cnt--; - - DMEMIT("%s %u %u", rs->raid_type->name, - raid_param_cnt, rs->md.chunk_sectors); - - if ((rs->print_flags & DMPF_SYNC) && - (rs->md.recovery_cp == MaxSector)) - DMEMIT(" sync"); - if (rs->print_flags & DMPF_NOSYNC) - DMEMIT(" nosync"); - - for (i = 0; i < rs->md.raid_disks; i++) - if ((rs->print_flags & DMPF_REBUILD) && - rs->dev[i].data_dev && - !test_bit(In_sync, &rs->dev[i].rdev.flags)) - DMEMIT(" rebuild %u", i); - - if (rs->print_flags & DMPF_DAEMON_SLEEP) - DMEMIT(" daemon_sleep %lu", - rs->md.bitmap_info.daemon_sleep); - - if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) - DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); - - if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) - DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); - - for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && - test_bit(WriteMostly, &rs->dev[i].rdev.flags)) - DMEMIT(" write_mostly %u", i); - - if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) - DMEMIT(" max_write_behind %lu", - rs->md.bitmap_info.max_write_behind); - - if (rs->print_flags & DMPF_STRIPE_CACHE) { - struct r5conf *conf = rs->md.private; - - /* convert from kiB to sectors */ - DMEMIT(" stripe_cache %d", - conf ? 
conf->max_nr_stripes * 2 : 0); - } - - if (rs->print_flags & DMPF_REGION_SIZE) - DMEMIT(" region_size %lu", - rs->md.bitmap_info.chunksize >> 9); - - DMEMIT(" %d", rs->md.raid_disks); - for (i = 0; i < rs->md.raid_disks; i++) { - if (rs->dev[i].meta_dev) - DMEMIT(" %s", rs->dev[i].meta_dev->name); - else - DMEMIT(" -"); - - if (rs->dev[i].data_dev) - DMEMIT(" %s", rs->dev[i].data_dev->name); - else - DMEMIT(" -"); - } - } - - return 0; -} - -static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) -{ - struct raid_set *rs = ti->private; - unsigned i; - int ret = 0; - - for (i = 0; !ret && i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev) - ret = fn(ti, - rs->dev[i].data_dev, - 0, /* No offset on data devs */ - rs->md.dev_sectors, - data); - - return ret; -} - -static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) -{ - struct raid_set *rs = ti->private; - unsigned chunk_size = rs->md.chunk_sectors << 9; - struct r5conf *conf = rs->md.private; - - blk_limits_io_min(limits, chunk_size); - blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); -} - -static void raid_presuspend(struct dm_target *ti) -{ - struct raid_set *rs = ti->private; - - md_stop_writes(&rs->md); -} - -static void raid_postsuspend(struct dm_target *ti) -{ - struct raid_set *rs = ti->private; - - mddev_suspend(&rs->md); -} - -static void raid_resume(struct dm_target *ti) -{ - struct raid_set *rs = ti->private; - - if (!rs->bitmap_loaded) { - bitmap_load(&rs->md); - rs->bitmap_loaded = 1; - } else - md_wakeup_thread(rs->md.thread); - - mddev_resume(&rs->md); -} - -static struct target_type raid_target = { - .name = "raid", - .version = {1, 2, 0}, - .module = THIS_MODULE, - .ctr = raid_ctr, - .dtr = raid_dtr, - .map = raid_map, - .status = raid_status, - .iterate_devices = raid_iterate_devices, - .io_hints = raid_io_hints, - .presuspend = raid_presuspend, - .postsuspend = raid_postsuspend, - .resume = raid_resume, -}; - -static int __init dm_raid_init(void) -{ - return dm_register_target(&raid_target); -} - -static void __exit dm_raid_exit(void) -{ - dm_unregister_target(&raid_target); -} - -module_init(dm_raid_init); -module_exit(dm_raid_exit); - -MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); -MODULE_ALIAS("dm-raid4"); -MODULE_ALIAS("dm-raid5"); -MODULE_ALIAS("dm-raid6"); -MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-raid1.c b/ANDROID_3.4.5/drivers/md/dm-raid1.c deleted file mode 100644 index d039de83..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-raid1.c +++ /dev/null @@ -1,1470 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software Limited. - * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ - -#include "dm-bio-record.h" - -#include <linux/init.h> -#include <linux/mempool.h> -#include <linux/module.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/workqueue.h> -#include <linux/device-mapper.h> -#include <linux/dm-io.h> -#include <linux/dm-dirty-log.h> -#include <linux/dm-kcopyd.h> -#include <linux/dm-region-hash.h> - -#define DM_MSG_PREFIX "raid1" - -#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. 
*/ - -#define DM_RAID1_HANDLE_ERRORS 0x01 -#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) - -static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); - -/*----------------------------------------------------------------- - * Mirror set structures. - *---------------------------------------------------------------*/ -enum dm_raid1_error { - DM_RAID1_WRITE_ERROR, - DM_RAID1_FLUSH_ERROR, - DM_RAID1_SYNC_ERROR, - DM_RAID1_READ_ERROR -}; - -struct mirror { - struct mirror_set *ms; - atomic_t error_count; - unsigned long error_type; - struct dm_dev *dev; - sector_t offset; -}; - -struct mirror_set { - struct dm_target *ti; - struct list_head list; - - uint64_t features; - - spinlock_t lock; /* protects the lists */ - struct bio_list reads; - struct bio_list writes; - struct bio_list failures; - struct bio_list holds; /* bios are waiting until suspend */ - - struct dm_region_hash *rh; - struct dm_kcopyd_client *kcopyd_client; - struct dm_io_client *io_client; - mempool_t *read_record_pool; - - /* recovery */ - region_t nr_regions; - int in_sync; - int log_failure; - int leg_failure; - atomic_t suspend; - - atomic_t default_mirror; /* Default mirror */ - - struct workqueue_struct *kmirrord_wq; - struct work_struct kmirrord_work; - struct timer_list timer; - unsigned long timer_pending; - - struct work_struct trigger_event; - - unsigned nr_mirrors; - struct mirror mirror[0]; -}; - -static void wakeup_mirrord(void *context) -{ - struct mirror_set *ms = context; - - queue_work(ms->kmirrord_wq, &ms->kmirrord_work); -} - -static void delayed_wake_fn(unsigned long data) -{ - struct mirror_set *ms = (struct mirror_set *) data; - - clear_bit(0, &ms->timer_pending); - wakeup_mirrord(ms); -} - -static void delayed_wake(struct mirror_set *ms) -{ - if (test_and_set_bit(0, &ms->timer_pending)) - return; - - ms->timer.expires = jiffies + HZ / 5; - ms->timer.data = (unsigned long) ms; - ms->timer.function = delayed_wake_fn; - add_timer(&ms->timer); -} - -static void wakeup_all_recovery_waiters(void *context) -{ - wake_up_all(&_kmirrord_recovery_stopped); -} - -static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) -{ - unsigned long flags; - int should_wake = 0; - struct bio_list *bl; - - bl = (rw == WRITE) ? &ms->writes : &ms->reads; - spin_lock_irqsave(&ms->lock, flags); - should_wake = !(bl->head); - bio_list_add(bl, bio); - spin_unlock_irqrestore(&ms->lock, flags); - - if (should_wake) - wakeup_mirrord(ms); -} - -static void dispatch_bios(void *context, struct bio_list *bio_list) -{ - struct mirror_set *ms = context; - struct bio *bio; - - while ((bio = bio_list_pop(bio_list))) - queue_bio(ms, bio, WRITE); -} - -#define MIN_READ_RECORDS 20 -struct dm_raid1_read_record { - struct mirror *m; - struct dm_bio_details details; -}; - -static struct kmem_cache *_dm_raid1_read_record_cache; - -/* - * Every mirror should look like this one. - */ -#define DEFAULT_MIRROR 0 - -/* - * This is yucky. We squirrel the mirror struct away inside - * bi_next for read/write buffers. This is safe since the bh - * doesn't get submitted to the lower levels of block layer. 
- */ -static struct mirror *bio_get_m(struct bio *bio) -{ - return (struct mirror *) bio->bi_next; -} - -static void bio_set_m(struct bio *bio, struct mirror *m) -{ - bio->bi_next = (struct bio *) m; -} - -static struct mirror *get_default_mirror(struct mirror_set *ms) -{ - return &ms->mirror[atomic_read(&ms->default_mirror)]; -} - -static void set_default_mirror(struct mirror *m) -{ - struct mirror_set *ms = m->ms; - struct mirror *m0 = &(ms->mirror[0]); - - atomic_set(&ms->default_mirror, m - m0); -} - -static struct mirror *get_valid_mirror(struct mirror_set *ms) -{ - struct mirror *m; - - for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) - if (!atomic_read(&m->error_count)) - return m; - - return NULL; -} - -/* fail_mirror - * @m: mirror device to fail - * @error_type: one of the enum's, DM_RAID1_*_ERROR - * - * If errors are being handled, record the type of - * error encountered for this device. If this type - * of error has already been recorded, we can return; - * otherwise, we must signal userspace by triggering - * an event. Additionally, if the device is the - * primary device, we must choose a new primary, but - * only if the mirror is in-sync. - * - * This function must not block. - */ -static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) -{ - struct mirror_set *ms = m->ms; - struct mirror *new; - - ms->leg_failure = 1; - - /* - * error_count is used for nothing more than a - * simple way to tell if a device has encountered - * errors. - */ - atomic_inc(&m->error_count); - - if (test_and_set_bit(error_type, &m->error_type)) - return; - - if (!errors_handled(ms)) - return; - - if (m != get_default_mirror(ms)) - goto out; - - if (!ms->in_sync) { - /* - * Better to issue requests to same failing device - * than to risk returning corrupt data. - */ - DMERR("Primary mirror (%s) failed while out-of-sync: " - "Reads may fail.", m->dev->name); - goto out; - } - - new = get_valid_mirror(ms); - if (new) - set_default_mirror(new); - else - DMWARN("All sides of mirror have failed."); - -out: - schedule_work(&ms->trigger_event); -} - -static int mirror_flush(struct dm_target *ti) -{ - struct mirror_set *ms = ti->private; - unsigned long error_bits; - - unsigned int i; - struct dm_io_region io[ms->nr_mirrors]; - struct mirror *m; - struct dm_io_request io_req = { - .bi_rw = WRITE_FLUSH, - .mem.type = DM_IO_KMEM, - .mem.ptr.addr = NULL, - .client = ms->io_client, - }; - - for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) { - io[i].bdev = m->dev->bdev; - io[i].sector = 0; - io[i].count = 0; - } - - error_bits = -1; - dm_io(&io_req, ms->nr_mirrors, io, &error_bits); - if (unlikely(error_bits != 0)) { - for (i = 0; i < ms->nr_mirrors; i++) - if (test_bit(i, &error_bits)) - fail_mirror(ms->mirror + i, - DM_RAID1_FLUSH_ERROR); - return -EIO; - } - - return 0; -} - -/*----------------------------------------------------------------- - * Recovery. - * - * When a mirror is first activated we may find that some regions - * are in the no-sync state. We have to recover these by - * recopying from the default mirror to all the others. - *---------------------------------------------------------------*/ -static void recovery_complete(int read_err, unsigned long write_err, - void *context) -{ - struct dm_region *reg = context; - struct mirror_set *ms = dm_rh_region_context(reg); - int m, bit = 0; - - if (read_err) { - /* Read error means the failure of default mirror. 
*/ - DMERR_LIMIT("Unable to read primary mirror during recovery"); - fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); - } - - if (write_err) { - DMERR_LIMIT("Write error during recovery (error = 0x%lx)", - write_err); - /* - * Bits correspond to devices (excluding default mirror). - * The default mirror cannot change during recovery. - */ - for (m = 0; m < ms->nr_mirrors; m++) { - if (&ms->mirror[m] == get_default_mirror(ms)) - continue; - if (test_bit(bit, &write_err)) - fail_mirror(ms->mirror + m, - DM_RAID1_SYNC_ERROR); - bit++; - } - } - - dm_rh_recovery_end(reg, !(read_err || write_err)); -} - -static int recover(struct mirror_set *ms, struct dm_region *reg) -{ - int r; - unsigned i; - struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; - struct mirror *m; - unsigned long flags = 0; - region_t key = dm_rh_get_region_key(reg); - sector_t region_size = dm_rh_get_region_size(ms->rh); - - /* fill in the source */ - m = get_default_mirror(ms); - from.bdev = m->dev->bdev; - from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); - if (key == (ms->nr_regions - 1)) { - /* - * The final region may be smaller than - * region_size. - */ - from.count = ms->ti->len & (region_size - 1); - if (!from.count) - from.count = region_size; - } else - from.count = region_size; - - /* fill in the destinations */ - for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (&ms->mirror[i] == get_default_mirror(ms)) - continue; - - m = ms->mirror + i; - dest->bdev = m->dev->bdev; - dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); - dest->count = from.count; - dest++; - } - - /* hand to kcopyd */ - if (!errors_handled(ms)) - set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); - - r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, - flags, recovery_complete, reg); - - return r; -} - -static void do_recovery(struct mirror_set *ms) -{ - struct dm_region *reg; - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - int r; - - /* - * Start quiescing some regions. - */ - dm_rh_recovery_prepare(ms->rh); - - /* - * Copy any already quiesced regions. - */ - while ((reg = dm_rh_recovery_start(ms->rh))) { - r = recover(ms, reg); - if (r) - dm_rh_recovery_end(reg, 0); - } - - /* - * Update the in sync flag. - */ - if (!ms->in_sync && - (log->type->get_sync_count(log) == ms->nr_regions)) { - /* the sync is complete */ - dm_table_event(ms->ti->table); - ms->in_sync = 1; - } -} - -/*----------------------------------------------------------------- - * Reads - *---------------------------------------------------------------*/ -static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) -{ - struct mirror *m = get_default_mirror(ms); - - do { - if (likely(!atomic_read(&m->error_count))) - return m; - - if (m-- == ms->mirror) - m += ms->nr_mirrors; - } while (m != get_default_mirror(ms)); - - return NULL; -} - -static int default_ok(struct mirror *m) -{ - struct mirror *default_mirror = get_default_mirror(m->ms); - - return !atomic_read(&default_mirror->error_count); -} - -static int mirror_available(struct mirror_set *ms, struct bio *bio) -{ - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - region_t region = dm_rh_bio_to_region(ms->rh, bio); - - if (log->type->in_sync(log, region, 0)) - return choose_mirror(ms, bio->bi_sector) ? 1 : 0; - - return 0; -} - -/* - * remap a buffer to a particular mirror. 
- */ -static sector_t map_sector(struct mirror *m, struct bio *bio) -{ - if (unlikely(!bio->bi_size)) - return 0; - return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); -} - -static void map_bio(struct mirror *m, struct bio *bio) -{ - bio->bi_bdev = m->dev->bdev; - bio->bi_sector = map_sector(m, bio); -} - -static void map_region(struct dm_io_region *io, struct mirror *m, - struct bio *bio) -{ - io->bdev = m->dev->bdev; - io->sector = map_sector(m, bio); - io->count = bio->bi_size >> 9; -} - -static void hold_bio(struct mirror_set *ms, struct bio *bio) -{ - /* - * Lock is required to avoid race condition during suspend - * process. - */ - spin_lock_irq(&ms->lock); - - if (atomic_read(&ms->suspend)) { - spin_unlock_irq(&ms->lock); - - /* - * If device is suspended, complete the bio. - */ - if (dm_noflush_suspending(ms->ti)) - bio_endio(bio, DM_ENDIO_REQUEUE); - else - bio_endio(bio, -EIO); - return; - } - - /* - * Hold bio until the suspend is complete. - */ - bio_list_add(&ms->holds, bio); - spin_unlock_irq(&ms->lock); -} - -/*----------------------------------------------------------------- - * Reads - *---------------------------------------------------------------*/ -static void read_callback(unsigned long error, void *context) -{ - struct bio *bio = context; - struct mirror *m; - - m = bio_get_m(bio); - bio_set_m(bio, NULL); - - if (likely(!error)) { - bio_endio(bio, 0); - return; - } - - fail_mirror(m, DM_RAID1_READ_ERROR); - - if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { - DMWARN_LIMIT("Read failure on mirror device %s. " - "Trying alternative device.", - m->dev->name); - queue_bio(m->ms, bio, bio_rw(bio)); - return; - } - - DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", - m->dev->name); - bio_endio(bio, -EIO); -} - -/* Asynchronous read. */ -static void read_async_bio(struct mirror *m, struct bio *bio) -{ - struct dm_io_region io; - struct dm_io_request io_req = { - .bi_rw = READ, - .mem.type = DM_IO_BVEC, - .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, - .notify.fn = read_callback, - .notify.context = bio, - .client = m->ms->io_client, - }; - - map_region(&io, m, bio); - bio_set_m(bio, m); - BUG_ON(dm_io(&io_req, 1, &io, NULL)); -} - -static inline int region_in_sync(struct mirror_set *ms, region_t region, - int may_block) -{ - int state = dm_rh_get_state(ms->rh, region, may_block); - return state == DM_RH_CLEAN || state == DM_RH_DIRTY; -} - -static void do_reads(struct mirror_set *ms, struct bio_list *reads) -{ - region_t region; - struct bio *bio; - struct mirror *m; - - while ((bio = bio_list_pop(reads))) { - region = dm_rh_bio_to_region(ms->rh, bio); - m = get_default_mirror(ms); - - /* - * We can only read balance if the region is in sync. - */ - if (likely(region_in_sync(ms, region, 1))) - m = choose_mirror(ms, bio->bi_sector); - else if (m && atomic_read(&m->error_count)) - m = NULL; - - if (likely(m)) - read_async_bio(m, bio); - else - bio_endio(bio, -EIO); - } -} - -/*----------------------------------------------------------------- - * Writes. 
- * - * We do different things with the write io depending on the - * state of the region that it's in: - * - * SYNC: increment pending, use kcopyd to write to *all* mirrors - * RECOVERING: delay the io until recovery completes - * NOSYNC: increment pending, just write to the default mirror - *---------------------------------------------------------------*/ - - -static void write_callback(unsigned long error, void *context) -{ - unsigned i, ret = 0; - struct bio *bio = (struct bio *) context; - struct mirror_set *ms; - int should_wake = 0; - unsigned long flags; - - ms = bio_get_m(bio)->ms; - bio_set_m(bio, NULL); - - /* - * NOTE: We don't decrement the pending count here, - * instead it is done by the targets endio function. - * This way we handle both writes to SYNC and NOSYNC - * regions with the same code. - */ - if (likely(!error)) { - bio_endio(bio, ret); - return; - } - - for (i = 0; i < ms->nr_mirrors; i++) - if (test_bit(i, &error)) - fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); - - /* - * Need to raise event. Since raising - * events can block, we need to do it in - * the main thread. - */ - spin_lock_irqsave(&ms->lock, flags); - if (!ms->failures.head) - should_wake = 1; - bio_list_add(&ms->failures, bio); - spin_unlock_irqrestore(&ms->lock, flags); - if (should_wake) - wakeup_mirrord(ms); -} - -static void do_write(struct mirror_set *ms, struct bio *bio) -{ - unsigned int i; - struct dm_io_region io[ms->nr_mirrors], *dest = io; - struct mirror *m; - struct dm_io_request io_req = { - .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), - .mem.type = DM_IO_BVEC, - .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, - .notify.fn = write_callback, - .notify.context = bio, - .client = ms->io_client, - }; - - if (bio->bi_rw & REQ_DISCARD) { - io_req.bi_rw |= REQ_DISCARD; - io_req.mem.type = DM_IO_KMEM; - io_req.mem.ptr.addr = NULL; - } - - for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) - map_region(dest++, m, bio); - - /* - * Use default mirror because we only need it to retrieve the reference - * to the mirror set in write_callback(). - */ - bio_set_m(bio, get_default_mirror(ms)); - - BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); -} - -static void do_writes(struct mirror_set *ms, struct bio_list *writes) -{ - int state; - struct bio *bio; - struct bio_list sync, nosync, recover, *this_list = NULL; - struct bio_list requeue; - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - region_t region; - - if (!writes->head) - return; - - /* - * Classify each write. 
- */ - bio_list_init(&sync); - bio_list_init(&nosync); - bio_list_init(&recover); - bio_list_init(&requeue); - - while ((bio = bio_list_pop(writes))) { - if ((bio->bi_rw & REQ_FLUSH) || - (bio->bi_rw & REQ_DISCARD)) { - bio_list_add(&sync, bio); - continue; - } - - region = dm_rh_bio_to_region(ms->rh, bio); - - if (log->type->is_remote_recovering && - log->type->is_remote_recovering(log, region)) { - bio_list_add(&requeue, bio); - continue; - } - - state = dm_rh_get_state(ms->rh, region, 1); - switch (state) { - case DM_RH_CLEAN: - case DM_RH_DIRTY: - this_list = &sync; - break; - - case DM_RH_NOSYNC: - this_list = &nosync; - break; - - case DM_RH_RECOVERING: - this_list = &recover; - break; - } - - bio_list_add(this_list, bio); - } - - /* - * Add bios that are delayed due to remote recovery - * back on to the write queue - */ - if (unlikely(requeue.head)) { - spin_lock_irq(&ms->lock); - bio_list_merge(&ms->writes, &requeue); - spin_unlock_irq(&ms->lock); - delayed_wake(ms); - } - - /* - * Increment the pending counts for any regions that will - * be written to (writes to recover regions are going to - * be delayed). - */ - dm_rh_inc_pending(ms->rh, &sync); - dm_rh_inc_pending(ms->rh, &nosync); - - /* - * If the flush fails on a previous call and succeeds here, - * we must not reset the log_failure variable. We need - * userspace interaction to do that. - */ - ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure; - - /* - * Dispatch io. - */ - if (unlikely(ms->log_failure) && errors_handled(ms)) { - spin_lock_irq(&ms->lock); - bio_list_merge(&ms->failures, &sync); - spin_unlock_irq(&ms->lock); - wakeup_mirrord(ms); - } else - while ((bio = bio_list_pop(&sync))) - do_write(ms, bio); - - while ((bio = bio_list_pop(&recover))) - dm_rh_delay(ms->rh, bio); - - while ((bio = bio_list_pop(&nosync))) { - if (unlikely(ms->leg_failure) && errors_handled(ms)) { - spin_lock_irq(&ms->lock); - bio_list_add(&ms->failures, bio); - spin_unlock_irq(&ms->lock); - wakeup_mirrord(ms); - } else { - map_bio(get_default_mirror(ms), bio); - generic_make_request(bio); - } - } -} - -static void do_failures(struct mirror_set *ms, struct bio_list *failures) -{ - struct bio *bio; - - if (likely(!failures->head)) - return; - - /* - * If the log has failed, unattempted writes are being - * put on the holds list. We can't issue those writes - * until a log has been marked, so we must store them. - * - * If a 'noflush' suspend is in progress, we can requeue - * the I/O's to the core. This give userspace a chance - * to reconfigure the mirror, at which point the core - * will reissue the writes. If the 'noflush' flag is - * not set, we have no choice but to return errors. - * - * Some writes on the failures list may have been - * submitted before the log failure and represent a - * failure to write to one of the devices. It is ok - * for us to treat them the same and requeue them - * as well. - */ - while ((bio = bio_list_pop(failures))) { - if (!ms->log_failure) { - ms->in_sync = 0; - dm_rh_mark_nosync(ms->rh, bio); - } - - /* - * If all the legs are dead, fail the I/O. - * If we have been told to handle errors, hold the bio - * and wait for userspace to deal with the problem. - * Otherwise pretend that the I/O succeeded. (This would - * be wrong if the failed leg returned after reboot and - * got replicated back to the good legs.) 
- */ - if (!get_valid_mirror(ms)) - bio_endio(bio, -EIO); - else if (errors_handled(ms)) - hold_bio(ms, bio); - else - bio_endio(bio, 0); - } -} - -static void trigger_event(struct work_struct *work) -{ - struct mirror_set *ms = - container_of(work, struct mirror_set, trigger_event); - - dm_table_event(ms->ti->table); -} - -/*----------------------------------------------------------------- - * kmirrord - *---------------------------------------------------------------*/ -static void do_mirror(struct work_struct *work) -{ - struct mirror_set *ms = container_of(work, struct mirror_set, - kmirrord_work); - struct bio_list reads, writes, failures; - unsigned long flags; - - spin_lock_irqsave(&ms->lock, flags); - reads = ms->reads; - writes = ms->writes; - failures = ms->failures; - bio_list_init(&ms->reads); - bio_list_init(&ms->writes); - bio_list_init(&ms->failures); - spin_unlock_irqrestore(&ms->lock, flags); - - dm_rh_update_states(ms->rh, errors_handled(ms)); - do_recovery(ms); - do_reads(ms, &reads); - do_writes(ms, &writes); - do_failures(ms, &failures); -} - -/*----------------------------------------------------------------- - * Target functions - *---------------------------------------------------------------*/ -static struct mirror_set *alloc_context(unsigned int nr_mirrors, - uint32_t region_size, - struct dm_target *ti, - struct dm_dirty_log *dl) -{ - size_t len; - struct mirror_set *ms = NULL; - - len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); - - ms = kzalloc(len, GFP_KERNEL); - if (!ms) { - ti->error = "Cannot allocate mirror context"; - return NULL; - } - - spin_lock_init(&ms->lock); - bio_list_init(&ms->reads); - bio_list_init(&ms->writes); - bio_list_init(&ms->failures); - bio_list_init(&ms->holds); - - ms->ti = ti; - ms->nr_mirrors = nr_mirrors; - ms->nr_regions = dm_sector_div_up(ti->len, region_size); - ms->in_sync = 0; - ms->log_failure = 0; - ms->leg_failure = 0; - atomic_set(&ms->suspend, 0); - atomic_set(&ms->default_mirror, DEFAULT_MIRROR); - - ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS, - _dm_raid1_read_record_cache); - - if (!ms->read_record_pool) { - ti->error = "Error creating mirror read_record_pool"; - kfree(ms); - return NULL; - } - - ms->io_client = dm_io_client_create(); - if (IS_ERR(ms->io_client)) { - ti->error = "Error creating dm_io client"; - mempool_destroy(ms->read_record_pool); - kfree(ms); - return NULL; - } - - ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, - wakeup_all_recovery_waiters, - ms->ti->begin, MAX_RECOVERY, - dl, region_size, ms->nr_regions); - if (IS_ERR(ms->rh)) { - ti->error = "Error creating dirty region hash"; - dm_io_client_destroy(ms->io_client); - mempool_destroy(ms->read_record_pool); - kfree(ms); - return NULL; - } - - return ms; -} - -static void free_context(struct mirror_set *ms, struct dm_target *ti, - unsigned int m) -{ - while (m--) - dm_put_device(ti, ms->mirror[m].dev); - - dm_io_client_destroy(ms->io_client); - dm_region_hash_destroy(ms->rh); - mempool_destroy(ms->read_record_pool); - kfree(ms); -} - -static int get_mirror(struct mirror_set *ms, struct dm_target *ti, - unsigned int mirror, char **argv) -{ - unsigned long long offset; - char dummy; - - if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { - ti->error = "Invalid offset"; - return -EINVAL; - } - - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), - &ms->mirror[mirror].dev)) { - ti->error = "Device lookup failure"; - return -ENXIO; - } - - ms->mirror[mirror].ms = ms; - 
atomic_set(&(ms->mirror[mirror].error_count), 0); - ms->mirror[mirror].error_type = 0; - ms->mirror[mirror].offset = offset; - - return 0; -} - -/* - * Create dirty log: log_type #log_params <log_params> - */ -static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, - unsigned argc, char **argv, - unsigned *args_used) -{ - unsigned param_count; - struct dm_dirty_log *dl; - char dummy; - - if (argc < 2) { - ti->error = "Insufficient mirror log arguments"; - return NULL; - } - - if (sscanf(argv[1], "%u%c", ¶m_count, &dummy) != 1) { - ti->error = "Invalid mirror log argument count"; - return NULL; - } - - *args_used = 2 + param_count; - - if (argc < *args_used) { - ti->error = "Insufficient mirror log arguments"; - return NULL; - } - - dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, - argv + 2); - if (!dl) { - ti->error = "Error creating mirror dirty log"; - return NULL; - } - - return dl; -} - -static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, - unsigned *args_used) -{ - unsigned num_features; - struct dm_target *ti = ms->ti; - char dummy; - - *args_used = 0; - - if (!argc) - return 0; - - if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { - ti->error = "Invalid number of features"; - return -EINVAL; - } - - argc--; - argv++; - (*args_used)++; - - if (num_features > argc) { - ti->error = "Not enough arguments to support feature count"; - return -EINVAL; - } - - if (!strcmp("handle_errors", argv[0])) - ms->features |= DM_RAID1_HANDLE_ERRORS; - else { - ti->error = "Unrecognised feature requested"; - return -EINVAL; - } - - (*args_used)++; - - return 0; -} - -/* - * Construct a mirror mapping: - * - * log_type #log_params <log_params> - * #mirrors [mirror_path offset]{2,} - * [#features <features>] - * - * log_type is "core" or "disk" - * #log_params is between 1 and 3 - * - * If present, features must be "handle_errors". 
- */ -static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - int r; - unsigned int nr_mirrors, m, args_used; - struct mirror_set *ms; - struct dm_dirty_log *dl; - char dummy; - - dl = create_dirty_log(ti, argc, argv, &args_used); - if (!dl) - return -EINVAL; - - argv += args_used; - argc -= args_used; - - if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || - nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { - ti->error = "Invalid number of mirrors"; - dm_dirty_log_destroy(dl); - return -EINVAL; - } - - argv++, argc--; - - if (argc < nr_mirrors * 2) { - ti->error = "Too few mirror arguments"; - dm_dirty_log_destroy(dl); - return -EINVAL; - } - - ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); - if (!ms) { - dm_dirty_log_destroy(dl); - return -ENOMEM; - } - - /* Get the mirror parameter sets */ - for (m = 0; m < nr_mirrors; m++) { - r = get_mirror(ms, ti, m, argv); - if (r) { - free_context(ms, ti, m); - return r; - } - argv += 2; - argc -= 2; - } - - ti->private = ms; - ti->split_io = dm_rh_get_region_size(ms->rh); - ti->num_flush_requests = 1; - ti->num_discard_requests = 1; - - ms->kmirrord_wq = alloc_workqueue("kmirrord", - WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); - if (!ms->kmirrord_wq) { - DMERR("couldn't start kmirrord"); - r = -ENOMEM; - goto err_free_context; - } - INIT_WORK(&ms->kmirrord_work, do_mirror); - init_timer(&ms->timer); - ms->timer_pending = 0; - INIT_WORK(&ms->trigger_event, trigger_event); - - r = parse_features(ms, argc, argv, &args_used); - if (r) - goto err_destroy_wq; - - argv += args_used; - argc -= args_used; - - /* - * Any read-balancing addition depends on the - * DM_RAID1_HANDLE_ERRORS flag being present. - * This is because the decision to balance depends - * on the sync state of a region. If the above - * flag is not present, we ignore errors; and - * the sync state may be inaccurate. - */ - - if (argc) { - ti->error = "Too many mirror arguments"; - r = -EINVAL; - goto err_destroy_wq; - } - - ms->kcopyd_client = dm_kcopyd_client_create(); - if (IS_ERR(ms->kcopyd_client)) { - r = PTR_ERR(ms->kcopyd_client); - goto err_destroy_wq; - } - - wakeup_mirrord(ms); - return 0; - -err_destroy_wq: - destroy_workqueue(ms->kmirrord_wq); -err_free_context: - free_context(ms, ti, ms->nr_mirrors); - return r; -} - -static void mirror_dtr(struct dm_target *ti) -{ - struct mirror_set *ms = (struct mirror_set *) ti->private; - - del_timer_sync(&ms->timer); - flush_workqueue(ms->kmirrord_wq); - flush_work_sync(&ms->trigger_event); - dm_kcopyd_client_destroy(ms->kcopyd_client); - destroy_workqueue(ms->kmirrord_wq); - free_context(ms, ti, ms->nr_mirrors); -} - -/* - * Mirror mapping function - */ -static int mirror_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - int r, rw = bio_rw(bio); - struct mirror *m; - struct mirror_set *ms = ti->private; - struct dm_raid1_read_record *read_record = NULL; - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - - if (rw == WRITE) { - /* Save region for mirror_end_io() handler */ - map_context->ll = dm_rh_bio_to_region(ms->rh, bio); - queue_bio(ms, bio, rw); - return DM_MAPIO_SUBMITTED; - } - - r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); - if (r < 0 && r != -EWOULDBLOCK) - return r; - - /* - * If region is not in-sync queue the bio. 
- */ - if (!r || (r == -EWOULDBLOCK)) { - if (rw == READA) - return -EWOULDBLOCK; - - queue_bio(ms, bio, rw); - return DM_MAPIO_SUBMITTED; - } - - /* - * The region is in-sync and we can perform reads directly. - * Store enough information so we can retry if it fails. - */ - m = choose_mirror(ms, bio->bi_sector); - if (unlikely(!m)) - return -EIO; - - read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); - if (likely(read_record)) { - dm_bio_record(&read_record->details, bio); - map_context->ptr = read_record; - read_record->m = m; - } - - map_bio(m, bio); - - return DM_MAPIO_REMAPPED; -} - -static int mirror_end_io(struct dm_target *ti, struct bio *bio, - int error, union map_info *map_context) -{ - int rw = bio_rw(bio); - struct mirror_set *ms = (struct mirror_set *) ti->private; - struct mirror *m = NULL; - struct dm_bio_details *bd = NULL; - struct dm_raid1_read_record *read_record = map_context->ptr; - - /* - * We need to dec pending if this was a write. - */ - if (rw == WRITE) { - if (!(bio->bi_rw & REQ_FLUSH)) - dm_rh_dec(ms->rh, map_context->ll); - return error; - } - - if (error == -EOPNOTSUPP) - goto out; - - if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) - goto out; - - if (unlikely(error)) { - if (!read_record) { - /* - * There wasn't enough memory to record necessary - * information for a retry or there was no other - * mirror in-sync. - */ - DMERR_LIMIT("Mirror read failed."); - return -EIO; - } - - m = read_record->m; - - DMERR("Mirror read failed from %s. Trying alternative device.", - m->dev->name); - - fail_mirror(m, DM_RAID1_READ_ERROR); - - /* - * A failed read is requeued for another attempt using an intact - * mirror. - */ - if (default_ok(m) || mirror_available(ms, bio)) { - bd = &read_record->details; - - dm_bio_restore(bd, bio); - mempool_free(read_record, ms->read_record_pool); - map_context->ptr = NULL; - queue_bio(ms, bio, rw); - return 1; - } - DMERR("All replicated volumes dead, failing I/O"); - } - -out: - if (read_record) { - mempool_free(read_record, ms->read_record_pool); - map_context->ptr = NULL; - } - - return error; -} - -static void mirror_presuspend(struct dm_target *ti) -{ - struct mirror_set *ms = (struct mirror_set *) ti->private; - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - - struct bio_list holds; - struct bio *bio; - - atomic_set(&ms->suspend, 1); - - /* - * Process bios in the hold list to start recovery waiting - * for bios in the hold list. After the process, no bio has - * a chance to be added in the hold list because ms->suspend - * is set. - */ - spin_lock_irq(&ms->lock); - holds = ms->holds; - bio_list_init(&ms->holds); - spin_unlock_irq(&ms->lock); - - while ((bio = bio_list_pop(&holds))) - hold_bio(ms, bio); - - /* - * We must finish up all the work that we've - * generated (i.e. recovery work). - */ - dm_rh_stop_recovery(ms->rh); - - wait_event(_kmirrord_recovery_stopped, - !dm_rh_recovery_in_flight(ms->rh)); - - if (log->type->presuspend && log->type->presuspend(log)) - /* FIXME: need better error handling */ - DMWARN("log presuspend failed"); - - /* - * Now that recovery is complete/stopped and the - * delayed bios are queued, we need to wait for - * the worker thread to complete. This way, - * we know that all of our I/O has been pushed. 
- */ - flush_workqueue(ms->kmirrord_wq); -} - -static void mirror_postsuspend(struct dm_target *ti) -{ - struct mirror_set *ms = ti->private; - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - - if (log->type->postsuspend && log->type->postsuspend(log)) - /* FIXME: need better error handling */ - DMWARN("log postsuspend failed"); -} - -static void mirror_resume(struct dm_target *ti) -{ - struct mirror_set *ms = ti->private; - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - - atomic_set(&ms->suspend, 0); - if (log->type->resume && log->type->resume(log)) - /* FIXME: need better error handling */ - DMWARN("log resume failed"); - dm_rh_start_recovery(ms->rh); -} - -/* - * device_status_char - * @m: mirror device/leg we want the status of - * - * We return one character representing the most severe error - * we have encountered. - * A => Alive - No failures - * D => Dead - A write failure occurred leaving mirror out-of-sync - * S => Sync - A sychronization failure occurred, mirror out-of-sync - * R => Read - A read failure occurred, mirror data unaffected - * - * Returns: <char> - */ -static char device_status_char(struct mirror *m) -{ - if (!atomic_read(&(m->error_count))) - return 'A'; - - return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' : - (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : - (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : - (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; -} - - -static int mirror_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) -{ - unsigned int m, sz = 0; - struct mirror_set *ms = (struct mirror_set *) ti->private; - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - char buffer[ms->nr_mirrors + 1]; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%d ", ms->nr_mirrors); - for (m = 0; m < ms->nr_mirrors; m++) { - DMEMIT("%s ", ms->mirror[m].dev->name); - buffer[m] = device_status_char(&(ms->mirror[m])); - } - buffer[m] = '\0'; - - DMEMIT("%llu/%llu 1 %s ", - (unsigned long long)log->type->get_sync_count(log), - (unsigned long long)ms->nr_regions, buffer); - - sz += log->type->status(log, type, result+sz, maxlen-sz); - - break; - - case STATUSTYPE_TABLE: - sz = log->type->status(log, type, result, maxlen); - - DMEMIT("%d", ms->nr_mirrors); - for (m = 0; m < ms->nr_mirrors; m++) - DMEMIT(" %s %llu", ms->mirror[m].dev->name, - (unsigned long long)ms->mirror[m].offset); - - if (ms->features & DM_RAID1_HANDLE_ERRORS) - DMEMIT(" 1 handle_errors"); - } - - return 0; -} - -static int mirror_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct mirror_set *ms = ti->private; - int ret = 0; - unsigned i; - - for (i = 0; !ret && i < ms->nr_mirrors; i++) - ret = fn(ti, ms->mirror[i].dev, - ms->mirror[i].offset, ti->len, data); - - return ret; -} - -static struct target_type mirror_target = { - .name = "mirror", - .version = {1, 12, 1}, - .module = THIS_MODULE, - .ctr = mirror_ctr, - .dtr = mirror_dtr, - .map = mirror_map, - .end_io = mirror_end_io, - .presuspend = mirror_presuspend, - .postsuspend = mirror_postsuspend, - .resume = mirror_resume, - .status = mirror_status, - .iterate_devices = mirror_iterate_devices, -}; - -static int __init dm_mirror_init(void) -{ - int r; - - _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0); - if (!_dm_raid1_read_record_cache) { - DMERR("Can't allocate dm_raid1_read_record cache"); - r = -ENOMEM; - goto bad_cache; - } - - r = dm_register_target(&mirror_target); - if (r < 0) { 
- DMERR("Failed to register mirror target"); - goto bad_target; - } - - return 0; - -bad_target: - kmem_cache_destroy(_dm_raid1_read_record_cache); -bad_cache: - return r; -} - -static void __exit dm_mirror_exit(void) -{ - dm_unregister_target(&mirror_target); - kmem_cache_destroy(_dm_raid1_read_record_cache); -} - -/* Module hooks */ -module_init(dm_mirror_init); -module_exit(dm_mirror_exit); - -MODULE_DESCRIPTION(DM_NAME " mirror target"); -MODULE_AUTHOR("Joe Thornber"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-region-hash.c b/ANDROID_3.4.5/drivers/md/dm-region-hash.c deleted file mode 100644 index 7771ed21..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-region-hash.c +++ /dev/null @@ -1,720 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software Limited. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ - -#include <linux/dm-dirty-log.h> -#include <linux/dm-region-hash.h> - -#include <linux/ctype.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> - -#include "dm.h" - -#define DM_MSG_PREFIX "region hash" - -/*----------------------------------------------------------------- - * Region hash - * - * The mirror splits itself up into discrete regions. Each - * region can be in one of three states: clean, dirty, - * nosync. There is no need to put clean regions in the hash. - * - * In addition to being present in the hash table a region _may_ - * be present on one of three lists. - * - * clean_regions: Regions on this list have no io pending to - * them, they are in sync, we are no longer interested in them, - * they are dull. dm_rh_update_states() will remove them from the - * hash table. - * - * quiesced_regions: These regions have been spun down, ready - * for recovery. rh_recovery_start() will remove regions from - * this list and hand them to kmirrord, which will schedule the - * recovery io with kcopyd. - * - * recovered_regions: Regions that kcopyd has successfully - * recovered. dm_rh_update_states() will now schedule any delayed - * io, up the recovery_count, and remove the region from the - * hash. - * - * There are 2 locks: - * A rw spin lock 'hash_lock' protects just the hash table, - * this is never held in write mode from interrupt context, - * which I believe means that we only have to disable irqs when - * doing a write lock. - * - * An ordinary spin lock 'region_lock' that protects the three - * lists in the region_hash, with the 'state', 'list' and - * 'delayed_bios' fields of the regions. This is used from irq - * context, so all other uses will have to suspend local irqs. - *---------------------------------------------------------------*/ -struct dm_region_hash { - uint32_t region_size; - unsigned region_shift; - - /* holds persistent region state */ - struct dm_dirty_log *log; - - /* hash table */ - rwlock_t hash_lock; - mempool_t *region_pool; - unsigned mask; - unsigned nr_buckets; - unsigned prime; - unsigned shift; - struct list_head *buckets; - - unsigned max_recovery; /* Max # of regions to recover in parallel */ - - spinlock_t region_lock; - atomic_t recovery_in_flight; - struct semaphore recovery_count; - struct list_head clean_regions; - struct list_head quiesced_regions; - struct list_head recovered_regions; - struct list_head failed_recovered_regions; - - /* - * If there was a flush failure no regions can be marked clean. 
- */ - int flush_failure; - - void *context; - sector_t target_begin; - - /* Callback function to schedule bios writes */ - void (*dispatch_bios)(void *context, struct bio_list *bios); - - /* Callback function to wakeup callers worker thread. */ - void (*wakeup_workers)(void *context); - - /* Callback function to wakeup callers recovery waiters. */ - void (*wakeup_all_recovery_waiters)(void *context); -}; - -struct dm_region { - struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ - region_t key; - int state; - - struct list_head hash_list; - struct list_head list; - - atomic_t pending; - struct bio_list delayed_bios; -}; - -/* - * Conversion fns - */ -static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) -{ - return sector >> rh->region_shift; -} - -sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) -{ - return region << rh->region_shift; -} -EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); - -region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) -{ - return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); -} -EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); - -void *dm_rh_region_context(struct dm_region *reg) -{ - return reg->rh->context; -} -EXPORT_SYMBOL_GPL(dm_rh_region_context); - -region_t dm_rh_get_region_key(struct dm_region *reg) -{ - return reg->key; -} -EXPORT_SYMBOL_GPL(dm_rh_get_region_key); - -sector_t dm_rh_get_region_size(struct dm_region_hash *rh) -{ - return rh->region_size; -} -EXPORT_SYMBOL_GPL(dm_rh_get_region_size); - -/* - * FIXME: shall we pass in a structure instead of all these args to - * dm_region_hash_create()???? - */ -#define RH_HASH_MULT 2654435387U -#define RH_HASH_SHIFT 12 - -#define MIN_REGIONS 64 -struct dm_region_hash *dm_region_hash_create( - void *context, void (*dispatch_bios)(void *context, - struct bio_list *bios), - void (*wakeup_workers)(void *context), - void (*wakeup_all_recovery_waiters)(void *context), - sector_t target_begin, unsigned max_recovery, - struct dm_dirty_log *log, uint32_t region_size, - region_t nr_regions) -{ - struct dm_region_hash *rh; - unsigned nr_buckets, max_buckets; - size_t i; - - /* - * Calculate a suitable number of buckets for our hash - * table. 
- */ - max_buckets = nr_regions >> 6; - for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) - ; - nr_buckets >>= 1; - - rh = kmalloc(sizeof(*rh), GFP_KERNEL); - if (!rh) { - DMERR("unable to allocate region hash memory"); - return ERR_PTR(-ENOMEM); - } - - rh->context = context; - rh->dispatch_bios = dispatch_bios; - rh->wakeup_workers = wakeup_workers; - rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; - rh->target_begin = target_begin; - rh->max_recovery = max_recovery; - rh->log = log; - rh->region_size = region_size; - rh->region_shift = ffs(region_size) - 1; - rwlock_init(&rh->hash_lock); - rh->mask = nr_buckets - 1; - rh->nr_buckets = nr_buckets; - - rh->shift = RH_HASH_SHIFT; - rh->prime = RH_HASH_MULT; - - rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); - if (!rh->buckets) { - DMERR("unable to allocate region hash bucket memory"); - kfree(rh); - return ERR_PTR(-ENOMEM); - } - - for (i = 0; i < nr_buckets; i++) - INIT_LIST_HEAD(rh->buckets + i); - - spin_lock_init(&rh->region_lock); - sema_init(&rh->recovery_count, 0); - atomic_set(&rh->recovery_in_flight, 0); - INIT_LIST_HEAD(&rh->clean_regions); - INIT_LIST_HEAD(&rh->quiesced_regions); - INIT_LIST_HEAD(&rh->recovered_regions); - INIT_LIST_HEAD(&rh->failed_recovered_regions); - rh->flush_failure = 0; - - rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, - sizeof(struct dm_region)); - if (!rh->region_pool) { - vfree(rh->buckets); - kfree(rh); - rh = ERR_PTR(-ENOMEM); - } - - return rh; -} -EXPORT_SYMBOL_GPL(dm_region_hash_create); - -void dm_region_hash_destroy(struct dm_region_hash *rh) -{ - unsigned h; - struct dm_region *reg, *nreg; - - BUG_ON(!list_empty(&rh->quiesced_regions)); - for (h = 0; h < rh->nr_buckets; h++) { - list_for_each_entry_safe(reg, nreg, rh->buckets + h, - hash_list) { - BUG_ON(atomic_read(®->pending)); - mempool_free(reg, rh->region_pool); - } - } - - if (rh->log) - dm_dirty_log_destroy(rh->log); - - if (rh->region_pool) - mempool_destroy(rh->region_pool); - - vfree(rh->buckets); - kfree(rh); -} -EXPORT_SYMBOL_GPL(dm_region_hash_destroy); - -struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) -{ - return rh->log; -} -EXPORT_SYMBOL_GPL(dm_rh_dirty_log); - -static unsigned rh_hash(struct dm_region_hash *rh, region_t region) -{ - return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; -} - -static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) -{ - struct dm_region *reg; - struct list_head *bucket = rh->buckets + rh_hash(rh, region); - - list_for_each_entry(reg, bucket, hash_list) - if (reg->key == region) - return reg; - - return NULL; -} - -static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) -{ - list_add(®->hash_list, rh->buckets + rh_hash(rh, reg->key)); -} - -static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) -{ - struct dm_region *reg, *nreg; - - nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); - if (unlikely(!nreg)) - nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); - - nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? - DM_RH_CLEAN : DM_RH_NOSYNC; - nreg->rh = rh; - nreg->key = region; - INIT_LIST_HEAD(&nreg->list); - atomic_set(&nreg->pending, 0); - bio_list_init(&nreg->delayed_bios); - - write_lock_irq(&rh->hash_lock); - reg = __rh_lookup(rh, region); - if (reg) - /* We lost the race. 
*/ - mempool_free(nreg, rh->region_pool); - else { - __rh_insert(rh, nreg); - if (nreg->state == DM_RH_CLEAN) { - spin_lock(&rh->region_lock); - list_add(&nreg->list, &rh->clean_regions); - spin_unlock(&rh->region_lock); - } - - reg = nreg; - } - write_unlock_irq(&rh->hash_lock); - - return reg; -} - -static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) -{ - struct dm_region *reg; - - reg = __rh_lookup(rh, region); - if (!reg) { - read_unlock(&rh->hash_lock); - reg = __rh_alloc(rh, region); - read_lock(&rh->hash_lock); - } - - return reg; -} - -int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) -{ - int r; - struct dm_region *reg; - - read_lock(&rh->hash_lock); - reg = __rh_lookup(rh, region); - read_unlock(&rh->hash_lock); - - if (reg) - return reg->state; - - /* - * The region wasn't in the hash, so we fall back to the - * dirty log. - */ - r = rh->log->type->in_sync(rh->log, region, may_block); - - /* - * Any error from the dirty log (eg. -EWOULDBLOCK) gets - * taken as a DM_RH_NOSYNC - */ - return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; -} -EXPORT_SYMBOL_GPL(dm_rh_get_state); - -static void complete_resync_work(struct dm_region *reg, int success) -{ - struct dm_region_hash *rh = reg->rh; - - rh->log->type->set_region_sync(rh->log, reg->key, success); - - /* - * Dispatch the bios before we call 'wake_up_all'. - * This is important because if we are suspending, - * we want to know that recovery is complete and - * the work queue is flushed. If we wake_up_all - * before we dispatch_bios (queue bios and call wake()), - * then we risk suspending before the work queue - * has been properly flushed. - */ - rh->dispatch_bios(rh->context, ®->delayed_bios); - if (atomic_dec_and_test(&rh->recovery_in_flight)) - rh->wakeup_all_recovery_waiters(rh->context); - up(&rh->recovery_count); -} - -/* dm_rh_mark_nosync - * @ms - * @bio - * - * The bio was written on some mirror(s) but failed on other mirror(s). - * We can successfully endio the bio but should avoid the region being - * marked clean by setting the state DM_RH_NOSYNC. - * - * This function is _not_ safe in interrupt context! - */ -void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) -{ - unsigned long flags; - struct dm_dirty_log *log = rh->log; - struct dm_region *reg; - region_t region = dm_rh_bio_to_region(rh, bio); - int recovering = 0; - - if (bio->bi_rw & REQ_FLUSH) { - rh->flush_failure = 1; - return; - } - - /* We must inform the log that the sync count has changed. */ - log->type->set_region_sync(log, region, 0); - - read_lock(&rh->hash_lock); - reg = __rh_find(rh, region); - read_unlock(&rh->hash_lock); - - /* region hash entry should exist because write was in-flight */ - BUG_ON(!reg); - BUG_ON(!list_empty(®->list)); - - spin_lock_irqsave(&rh->region_lock, flags); - /* - * Possible cases: - * 1) DM_RH_DIRTY - * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed - * 3) DM_RH_RECOVERING: flushing pending writes - * Either case, the region should have not been connected to list. - */ - recovering = (reg->state == DM_RH_RECOVERING); - reg->state = DM_RH_NOSYNC; - BUG_ON(!list_empty(®->list)); - spin_unlock_irqrestore(&rh->region_lock, flags); - - if (recovering) - complete_resync_work(reg, 0); -} -EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); - -void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) -{ - struct dm_region *reg, *next; - - LIST_HEAD(clean); - LIST_HEAD(recovered); - LIST_HEAD(failed_recovered); - - /* - * Quickly grab the lists. 
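__rh_find() and __rh_alloc() above use a classic pattern: drop the read lock, allocate with no lock held, then retake the lock as a writer and re-check in case another CPU inserted the same region first ("We lost the race"). A compact userspace analogue using a pthread rwlock; the list and the names are hypothetical:

#include <pthread.h>
#include <stdlib.h>

struct node {
	unsigned long key;
	struct node *next;
};

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static struct node *head;

static struct node *lookup(unsigned long key)
{
	struct node *n;

	for (n = head; n; n = n->next)
		if (n->key == key)
			return n;
	return NULL;
}

/* Called with the read lock held, like __rh_find(); returns with it held. */
static struct node *find_or_create(unsigned long key)
{
	struct node *n = lookup(key);

	if (!n) {
		/* Drop the lock so the (possibly sleeping) allocation is not
		 * done under it and the write lock can be taken afterwards. */
		pthread_rwlock_unlock(&lock);
		n = malloc(sizeof(*n));		/* failure handling omitted */
		n->key = key;

		/* Retake as a writer and re-check: someone may have won the race. */
		pthread_rwlock_wrlock(&lock);
		if (lookup(key)) {
			free(n);
			n = lookup(key);
		} else {
			n->next = head;
			head = n;
		}
		pthread_rwlock_unlock(&lock);
		pthread_rwlock_rdlock(&lock);
	}
	return n;
}

int main(void)
{
	pthread_rwlock_rdlock(&lock);
	struct node *n = find_or_create(42);
	pthread_rwlock_unlock(&lock);
	return n ? 0 : 1;
}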
- */ - write_lock_irq(&rh->hash_lock); - spin_lock(&rh->region_lock); - if (!list_empty(&rh->clean_regions)) { - list_splice_init(&rh->clean_regions, &clean); - - list_for_each_entry(reg, &clean, list) - list_del(®->hash_list); - } - - if (!list_empty(&rh->recovered_regions)) { - list_splice_init(&rh->recovered_regions, &recovered); - - list_for_each_entry(reg, &recovered, list) - list_del(®->hash_list); - } - - if (!list_empty(&rh->failed_recovered_regions)) { - list_splice_init(&rh->failed_recovered_regions, - &failed_recovered); - - list_for_each_entry(reg, &failed_recovered, list) - list_del(®->hash_list); - } - - spin_unlock(&rh->region_lock); - write_unlock_irq(&rh->hash_lock); - - /* - * All the regions on the recovered and clean lists have - * now been pulled out of the system, so no need to do - * any more locking. - */ - list_for_each_entry_safe(reg, next, &recovered, list) { - rh->log->type->clear_region(rh->log, reg->key); - complete_resync_work(reg, 1); - mempool_free(reg, rh->region_pool); - } - - list_for_each_entry_safe(reg, next, &failed_recovered, list) { - complete_resync_work(reg, errors_handled ? 0 : 1); - mempool_free(reg, rh->region_pool); - } - - list_for_each_entry_safe(reg, next, &clean, list) { - rh->log->type->clear_region(rh->log, reg->key); - mempool_free(reg, rh->region_pool); - } - - rh->log->type->flush(rh->log); -} -EXPORT_SYMBOL_GPL(dm_rh_update_states); - -static void rh_inc(struct dm_region_hash *rh, region_t region) -{ - struct dm_region *reg; - - read_lock(&rh->hash_lock); - reg = __rh_find(rh, region); - - spin_lock_irq(&rh->region_lock); - atomic_inc(®->pending); - - if (reg->state == DM_RH_CLEAN) { - reg->state = DM_RH_DIRTY; - list_del_init(®->list); /* take off the clean list */ - spin_unlock_irq(&rh->region_lock); - - rh->log->type->mark_region(rh->log, reg->key); - } else - spin_unlock_irq(&rh->region_lock); - - - read_unlock(&rh->hash_lock); -} - -void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) -{ - struct bio *bio; - - for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio->bi_rw & REQ_FLUSH) - continue; - rh_inc(rh, dm_rh_bio_to_region(rh, bio)); - } -} -EXPORT_SYMBOL_GPL(dm_rh_inc_pending); - -void dm_rh_dec(struct dm_region_hash *rh, region_t region) -{ - unsigned long flags; - struct dm_region *reg; - int should_wake = 0; - - read_lock(&rh->hash_lock); - reg = __rh_lookup(rh, region); - read_unlock(&rh->hash_lock); - - spin_lock_irqsave(&rh->region_lock, flags); - if (atomic_dec_and_test(®->pending)) { - /* - * There is no pending I/O for this region. - * We can move the region to corresponding list for next action. - * At this point, the region is not yet connected to any list. - * - * If the state is DM_RH_NOSYNC, the region should be kept off - * from clean list. - * The hash entry for DM_RH_NOSYNC will remain in memory - * until the region is recovered or the map is reloaded. - */ - - /* do nothing for DM_RH_NOSYNC */ - if (unlikely(rh->flush_failure)) { - /* - * If a write flush failed some time ago, we - * don't know whether or not this write made it - * to the disk, so we must resync the device. 
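Together with dm_rh_dec(), which completes just below, the helpers above move each region through a small state machine. The summary below restates those transitions using the state names from this file; the enum values themselves are illustrative:

/* Region states as used by the surrounding code; numeric values illustrative. */
enum rh_region_state {
	DM_RH_CLEAN,		/* in sync, no writes pending                  */
	DM_RH_DIRTY,		/* writes in flight, marked in the dirty log   */
	DM_RH_NOSYNC,		/* a write failed; the region must be resynced */
	DM_RH_RECOVERING,	/* resync in progress, new writes are delayed  */
};

/*
 * Transitions driven by the surrounding code:
 *   rh_inc():             CLEAN      -> DIRTY   (mark_region() in the log)
 *   dm_rh_dec():          DIRTY      -> CLEAN   (last pending write done)
 *                         RECOVERING -> quiesced list (ready to resync)
 *                         any        -> NOSYNC  (if an earlier flush failed)
 *   dm_rh_mark_nosync():  DIRTY/RECOVERING -> NOSYNC (write failed on a mirror)
 *   dm_rh_recovery_end(): RECOVERING -> recovered or failed_recovered list
 */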
- */ - reg->state = DM_RH_NOSYNC; - } else if (reg->state == DM_RH_RECOVERING) { - list_add_tail(®->list, &rh->quiesced_regions); - } else if (reg->state == DM_RH_DIRTY) { - reg->state = DM_RH_CLEAN; - list_add(®->list, &rh->clean_regions); - } - should_wake = 1; - } - spin_unlock_irqrestore(&rh->region_lock, flags); - - if (should_wake) - rh->wakeup_workers(rh->context); -} -EXPORT_SYMBOL_GPL(dm_rh_dec); - -/* - * Starts quiescing a region in preparation for recovery. - */ -static int __rh_recovery_prepare(struct dm_region_hash *rh) -{ - int r; - region_t region; - struct dm_region *reg; - - /* - * Ask the dirty log what's next. - */ - r = rh->log->type->get_resync_work(rh->log, ®ion); - if (r <= 0) - return r; - - /* - * Get this region, and start it quiescing by setting the - * recovering flag. - */ - read_lock(&rh->hash_lock); - reg = __rh_find(rh, region); - read_unlock(&rh->hash_lock); - - spin_lock_irq(&rh->region_lock); - reg->state = DM_RH_RECOVERING; - - /* Already quiesced ? */ - if (atomic_read(®->pending)) - list_del_init(®->list); - else - list_move(®->list, &rh->quiesced_regions); - - spin_unlock_irq(&rh->region_lock); - - return 1; -} - -void dm_rh_recovery_prepare(struct dm_region_hash *rh) -{ - /* Extra reference to avoid race with dm_rh_stop_recovery */ - atomic_inc(&rh->recovery_in_flight); - - while (!down_trylock(&rh->recovery_count)) { - atomic_inc(&rh->recovery_in_flight); - if (__rh_recovery_prepare(rh) <= 0) { - atomic_dec(&rh->recovery_in_flight); - up(&rh->recovery_count); - break; - } - } - - /* Drop the extra reference */ - if (atomic_dec_and_test(&rh->recovery_in_flight)) - rh->wakeup_all_recovery_waiters(rh->context); -} -EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); - -/* - * Returns any quiesced regions. - */ -struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) -{ - struct dm_region *reg = NULL; - - spin_lock_irq(&rh->region_lock); - if (!list_empty(&rh->quiesced_regions)) { - reg = list_entry(rh->quiesced_regions.next, - struct dm_region, list); - list_del_init(®->list); /* remove from the quiesced list */ - } - spin_unlock_irq(&rh->region_lock); - - return reg; -} -EXPORT_SYMBOL_GPL(dm_rh_recovery_start); - -void dm_rh_recovery_end(struct dm_region *reg, int success) -{ - struct dm_region_hash *rh = reg->rh; - - spin_lock_irq(&rh->region_lock); - if (success) - list_add(®->list, ®->rh->recovered_regions); - else - list_add(®->list, ®->rh->failed_recovered_regions); - - spin_unlock_irq(&rh->region_lock); - - rh->wakeup_workers(rh->context); -} -EXPORT_SYMBOL_GPL(dm_rh_recovery_end); - -/* Return recovery in flight count. 
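The recovery_count semaphore acts as a throttle: dm_rh_start_recovery() and dm_rh_stop_recovery(), just below, add and drain max_recovery permits, __rh_recovery_prepare() above claims one per region with down_trylock(), and complete_resync_work() returns it. A userspace analogue with POSIX semaphores and hypothetical numbers:

#include <semaphore.h>
#include <stdio.h>

static sem_t recovery_count;

/* dm_rh_start_recovery(): hand out max_recovery concurrent recovery slots. */
static void start_recovery(unsigned max_recovery)
{
	while (max_recovery--)
		sem_post(&recovery_count);
}

/* __rh_recovery_prepare() via down_trylock(): claim a slot without blocking. */
static int try_claim_slot(void)
{
	return sem_trywait(&recovery_count) == 0;
}

/* complete_resync_work(): give the slot back when a region finishes. */
static void recovery_done(void)
{
	sem_post(&recovery_count);
}

int main(void)
{
	sem_init(&recovery_count, 0, 0);	/* starts drained, like sema_init(..., 0) */
	start_recovery(2);			/* allow two recoveries at once */

	printf("%d %d %d\n", try_claim_slot(), try_claim_slot(), try_claim_slot());
	/* prints "1 1 0": a third region must wait until recovery_done() runs */
	recovery_done();
	return 0;
}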
*/ -int dm_rh_recovery_in_flight(struct dm_region_hash *rh) -{ - return atomic_read(&rh->recovery_in_flight); -} -EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); - -int dm_rh_flush(struct dm_region_hash *rh) -{ - return rh->log->type->flush(rh->log); -} -EXPORT_SYMBOL_GPL(dm_rh_flush); - -void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) -{ - struct dm_region *reg; - - read_lock(&rh->hash_lock); - reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); - bio_list_add(®->delayed_bios, bio); - read_unlock(&rh->hash_lock); -} -EXPORT_SYMBOL_GPL(dm_rh_delay); - -void dm_rh_stop_recovery(struct dm_region_hash *rh) -{ - int i; - - /* wait for any recovering regions */ - for (i = 0; i < rh->max_recovery; i++) - down(&rh->recovery_count); -} -EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); - -void dm_rh_start_recovery(struct dm_region_hash *rh) -{ - int i; - - for (i = 0; i < rh->max_recovery; i++) - up(&rh->recovery_count); - - rh->wakeup_workers(rh->context); -} -EXPORT_SYMBOL_GPL(dm_rh_start_recovery); - -MODULE_DESCRIPTION(DM_NAME " region hash"); -MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-round-robin.c b/ANDROID_3.4.5/drivers/md/dm-round-robin.c deleted file mode 100644 index 6ab1192c..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-round-robin.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * Module Author: Heinz Mauelshagen - * - * This file is released under the GPL. - * - * Round-robin path selector. - */ - -#include <linux/device-mapper.h> - -#include "dm-path-selector.h" - -#include <linux/slab.h> -#include <linux/module.h> - -#define DM_MSG_PREFIX "multipath round-robin" - -/*----------------------------------------------------------------- - * Path-handling code, paths are held in lists - *---------------------------------------------------------------*/ -struct path_info { - struct list_head list; - struct dm_path *path; - unsigned repeat_count; -}; - -static void free_paths(struct list_head *paths) -{ - struct path_info *pi, *next; - - list_for_each_entry_safe(pi, next, paths, list) { - list_del(&pi->list); - kfree(pi); - } -} - -/*----------------------------------------------------------------- - * Round-robin selector - *---------------------------------------------------------------*/ - -#define RR_MIN_IO 1000 - -struct selector { - struct list_head valid_paths; - struct list_head invalid_paths; -}; - -static struct selector *alloc_selector(void) -{ - struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - } - - return s; -} - -static int rr_create(struct path_selector *ps, unsigned argc, char **argv) -{ - struct selector *s; - - s = alloc_selector(); - if (!s) - return -ENOMEM; - - ps->context = s; - return 0; -} - -static void rr_destroy(struct path_selector *ps) -{ - struct selector *s = (struct selector *) ps->context; - - free_paths(&s->valid_paths); - free_paths(&s->invalid_paths); - kfree(s); - ps->context = NULL; -} - -static int rr_status(struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned int maxlen) -{ - struct path_info *pi; - int sz = 0; - - if (!path) - DMEMIT("0 "); - else { - switch(type) { - case STATUSTYPE_INFO: - break; - case STATUSTYPE_TABLE: - pi = path->pscontext; - DMEMIT("%u ", pi->repeat_count); - break; - } - } - - return sz; -} - -/* - * Called 
during initialisation to register each path with an - * optional repeat_count. - */ -static int rr_add_path(struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error) -{ - struct selector *s = (struct selector *) ps->context; - struct path_info *pi; - unsigned repeat_count = RR_MIN_IO; - char dummy; - - if (argc > 1) { - *error = "round-robin ps: incorrect number of arguments"; - return -EINVAL; - } - - /* First path argument is number of I/Os before switching path */ - if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { - *error = "round-robin ps: invalid repeat count"; - return -EINVAL; - } - - /* allocate the path */ - pi = kmalloc(sizeof(*pi), GFP_KERNEL); - if (!pi) { - *error = "round-robin ps: Error allocating path context"; - return -ENOMEM; - } - - pi->path = path; - pi->repeat_count = repeat_count; - - path->pscontext = pi; - - list_add_tail(&pi->list, &s->valid_paths); - - return 0; -} - -static void rr_fail_path(struct path_selector *ps, struct dm_path *p) -{ - struct selector *s = (struct selector *) ps->context; - struct path_info *pi = p->pscontext; - - list_move(&pi->list, &s->invalid_paths); -} - -static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) -{ - struct selector *s = (struct selector *) ps->context; - struct path_info *pi = p->pscontext; - - list_move(&pi->list, &s->valid_paths); - - return 0; -} - -static struct dm_path *rr_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) -{ - struct selector *s = (struct selector *) ps->context; - struct path_info *pi = NULL; - - if (!list_empty(&s->valid_paths)) { - pi = list_entry(s->valid_paths.next, struct path_info, list); - list_move_tail(&pi->list, &s->valid_paths); - *repeat_count = pi->repeat_count; - } - - return pi ? pi->path : NULL; -} - -static struct path_selector_type rr_ps = { - .name = "round-robin", - .module = THIS_MODULE, - .table_args = 1, - .info_args = 0, - .create = rr_create, - .destroy = rr_destroy, - .status = rr_status, - .add_path = rr_add_path, - .fail_path = rr_fail_path, - .reinstate_path = rr_reinstate_path, - .select_path = rr_select_path, -}; - -static int __init dm_rr_init(void) -{ - int r = dm_register_path_selector(&rr_ps); - - if (r < 0) - DMERR("register failed %d", r); - - DMINFO("version 1.0.0 loaded"); - - return r; -} - -static void __exit dm_rr_exit(void) -{ - int r = dm_unregister_path_selector(&rr_ps); - - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(dm_rr_init); -module_exit(dm_rr_exit); - -MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); -MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-service-time.c b/ANDROID_3.4.5/drivers/md/dm-service-time.c deleted file mode 100644 index 9df8f6bd..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-service-time.c +++ /dev/null @@ -1,343 +0,0 @@ -/* - * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. - * - * Module Author: Kiyoshi Ueda - * - * This file is released under the GPL. - * - * Throughput oriented path selector. 
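rr_select_path() in the round-robin selector above always returns the first entry of the valid list and rotates it to the tail, so paths are used strictly in turn every repeat_count I/Os. A trivial userspace sketch of that rotation; the device numbers are made up:

#include <stdio.h>

int main(void)
{
	/* Hypothetical paths; the real selector stores struct dm_path entries. */
	const char *paths[] = { "8:16", "8:32", "8:48" };
	unsigned nr = 3, head = 0, i;

	for (i = 0; i < 6; i++) {
		/* take the current head ... */
		printf("I/O group %u -> path %s\n", i, paths[head]);
		/* ... then rotate it to the tail, as list_move_tail() does above */
		head = (head + 1) % nr;
	}
	return 0;
}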
- */ - -#include "dm.h" -#include "dm-path-selector.h" - -#include <linux/slab.h> -#include <linux/module.h> - -#define DM_MSG_PREFIX "multipath service-time" -#define ST_MIN_IO 1 -#define ST_MAX_RELATIVE_THROUGHPUT 100 -#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 -#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) -#define ST_VERSION "0.2.0" - -struct selector { - struct list_head valid_paths; - struct list_head failed_paths; -}; - -struct path_info { - struct list_head list; - struct dm_path *path; - unsigned repeat_count; - unsigned relative_throughput; - atomic_t in_flight_size; /* Total size of in-flight I/Os */ -}; - -static struct selector *alloc_selector(void) -{ - struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->failed_paths); - } - - return s; -} - -static int st_create(struct path_selector *ps, unsigned argc, char **argv) -{ - struct selector *s = alloc_selector(); - - if (!s) - return -ENOMEM; - - ps->context = s; - return 0; -} - -static void free_paths(struct list_head *paths) -{ - struct path_info *pi, *next; - - list_for_each_entry_safe(pi, next, paths, list) { - list_del(&pi->list); - kfree(pi); - } -} - -static void st_destroy(struct path_selector *ps) -{ - struct selector *s = ps->context; - - free_paths(&s->valid_paths); - free_paths(&s->failed_paths); - kfree(s); - ps->context = NULL; -} - -static int st_status(struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned maxlen) -{ - unsigned sz = 0; - struct path_info *pi; - - if (!path) - DMEMIT("0 "); - else { - pi = path->pscontext; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), - pi->relative_throughput); - break; - case STATUSTYPE_TABLE: - DMEMIT("%u %u ", pi->repeat_count, - pi->relative_throughput); - break; - } - } - - return sz; -} - -static int st_add_path(struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error) -{ - struct selector *s = ps->context; - struct path_info *pi; - unsigned repeat_count = ST_MIN_IO; - unsigned relative_throughput = 1; - char dummy; - - /* - * Arguments: [<repeat_count> [<relative_throughput>]] - * <repeat_count>: The number of I/Os before switching path. - * If not given, default (ST_MIN_IO) is used. - * <relative_throughput>: The relative throughput value of - * the path among all paths in the path-group. - * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> - * If not given, minimum value '1' is used. - * If '0' is given, the path isn't selected while - * other paths having a positive value are - * available. 
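The sscanf(arg, "%u%c", &val, &dummy) checks used by these constructors accept an argument only when it is a bare unsigned integer: a second successful conversion would mean trailing junk. A quick illustration of the idiom:

#include <stdio.h>

static int parse_unsigned(const char *s, unsigned *val)
{
	char dummy;

	/* Exactly one conversion must succeed; trailing text bumps it to 2. */
	return sscanf(s, "%u%c", val, &dummy) == 1;
}

int main(void)
{
	unsigned v;

	printf("%d %d %d\n",
	       parse_unsigned("100", &v),	/* 1: valid repeat count */
	       parse_unsigned("100x", &v),	/* 0: trailing character */
	       parse_unsigned("", &v));		/* 0: nothing converted  */
	return 0;
}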
- */ - if (argc > 2) { - *error = "service-time ps: incorrect number of arguments"; - return -EINVAL; - } - - if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { - *error = "service-time ps: invalid repeat count"; - return -EINVAL; - } - - if ((argc == 2) && - (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || - relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { - *error = "service-time ps: invalid relative_throughput value"; - return -EINVAL; - } - - /* allocate the path */ - pi = kmalloc(sizeof(*pi), GFP_KERNEL); - if (!pi) { - *error = "service-time ps: Error allocating path context"; - return -ENOMEM; - } - - pi->path = path; - pi->repeat_count = repeat_count; - pi->relative_throughput = relative_throughput; - atomic_set(&pi->in_flight_size, 0); - - path->pscontext = pi; - - list_add_tail(&pi->list, &s->valid_paths); - - return 0; -} - -static void st_fail_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - - list_move(&pi->list, &s->failed_paths); -} - -static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - - list_move_tail(&pi->list, &s->valid_paths); - - return 0; -} - -/* - * Compare the estimated service time of 2 paths, pi1 and pi2, - * for the incoming I/O. - * - * Returns: - * < 0 : pi1 is better - * 0 : no difference between pi1 and pi2 - * > 0 : pi2 is better - * - * Description: - * Basically, the service time is estimated by: - * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' - * To reduce the calculation, some optimizations are made. - * (See comments inline) - */ -static int st_compare_load(struct path_info *pi1, struct path_info *pi2, - size_t incoming) -{ - size_t sz1, sz2, st1, st2; - - sz1 = atomic_read(&pi1->in_flight_size); - sz2 = atomic_read(&pi2->in_flight_size); - - /* - * Case 1: Both have same throughput value. Choose less loaded path. - */ - if (pi1->relative_throughput == pi2->relative_throughput) - return sz1 - sz2; - - /* - * Case 2a: Both have same load. Choose higher throughput path. - * Case 2b: One path has no throughput value. Choose the other one. - */ - if (sz1 == sz2 || - !pi1->relative_throughput || !pi2->relative_throughput) - return pi2->relative_throughput - pi1->relative_throughput; - - /* - * Case 3: Calculate service time. Choose faster path. - * Service time using pi1: - * st1 = (sz1 + incoming) / pi1->relative_throughput - * Service time using pi2: - * st2 = (sz2 + incoming) / pi2->relative_throughput - * - * To avoid the division, transform the expression to use - * multiplication. - * Because ->relative_throughput > 0 here, if st1 < st2, - * the expressions below are the same meaning: - * (sz1 + incoming) / pi1->relative_throughput < - * (sz2 + incoming) / pi2->relative_throughput - * (sz1 + incoming) * pi2->relative_throughput < - * (sz2 + incoming) * pi1->relative_throughput - * So use the later one. - */ - sz1 += incoming; - sz2 += incoming; - if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || - sz2 >= ST_MAX_INFLIGHT_SIZE)) { - /* - * Size may be too big for multiplying pi->relative_throughput - * and overflow. - * To avoid the overflow and mis-selection, shift down both. 
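st_compare_load() replaces the two divisions of the estimated service times with a single cross-multiplication, which is valid because both throughputs are positive at that point. A standalone sketch of the core comparison, leaving out the equal-throughput, zero-throughput and overflow special cases; the byte counts in main() are hypothetical:

#include <stddef.h>
#include <stdio.h>

/* Same sign convention as above: < 0 means path 1 is the better choice. */
static long compare_load(size_t in_flight1, unsigned throughput1,
			 size_t in_flight2, unsigned throughput2,
			 size_t incoming)
{
	/*
	 * (sz1 + incoming) / tp1  <  (sz2 + incoming) / tp2
	 *   <=>  (sz1 + incoming) * tp2  <  (sz2 + incoming) * tp1
	 * which holds because both throughputs are > 0 here.
	 */
	return (long)((in_flight1 + incoming) * throughput2) -
	       (long)((in_flight2 + incoming) * throughput1);
}

int main(void)
{
	/* 64KiB queued on a fast path vs. 16KiB on a slow one, 4KiB incoming. */
	long r = compare_load(65536, 100, 16384, 10, 4096);

	printf("%s\n", r < 0 ? "fast path wins" : "slow path wins");
	return 0;
}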
- */ - sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; - sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; - } - st1 = sz1 * pi2->relative_throughput; - st2 = sz2 * pi1->relative_throughput; - if (st1 != st2) - return st1 - st2; - - /* - * Case 4: Service time is equal. Choose higher throughput path. - */ - return pi2->relative_throughput - pi1->relative_throughput; -} - -static struct dm_path *st_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) -{ - struct selector *s = ps->context; - struct path_info *pi = NULL, *best = NULL; - - if (list_empty(&s->valid_paths)) - return NULL; - - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - - list_for_each_entry(pi, &s->valid_paths, list) - if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) - best = pi; - - if (!best) - return NULL; - - *repeat_count = best->repeat_count; - - return best->path; -} - -static int st_start_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) -{ - struct path_info *pi = path->pscontext; - - atomic_add(nr_bytes, &pi->in_flight_size); - - return 0; -} - -static int st_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) -{ - struct path_info *pi = path->pscontext; - - atomic_sub(nr_bytes, &pi->in_flight_size); - - return 0; -} - -static struct path_selector_type st_ps = { - .name = "service-time", - .module = THIS_MODULE, - .table_args = 2, - .info_args = 2, - .create = st_create, - .destroy = st_destroy, - .status = st_status, - .add_path = st_add_path, - .fail_path = st_fail_path, - .reinstate_path = st_reinstate_path, - .select_path = st_select_path, - .start_io = st_start_io, - .end_io = st_end_io, -}; - -static int __init dm_st_init(void) -{ - int r = dm_register_path_selector(&st_ps); - - if (r < 0) - DMERR("register failed %d", r); - - DMINFO("version " ST_VERSION " loaded"); - - return r; -} - -static void __exit dm_st_exit(void) -{ - int r = dm_unregister_path_selector(&st_ps); - - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(dm_st_init); -module_exit(dm_st_exit); - -MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); -MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-snap-persistent.c b/ANDROID_3.4.5/drivers/md/dm-snap-persistent.c deleted file mode 100644 index 3ac41567..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-snap-persistent.c +++ /dev/null @@ -1,898 +0,0 @@ -/* - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * Copyright (C) 2006-2008 Red Hat GmbH - * - * This file is released under the GPL. - */ - -#include "dm-exception-store.h" - -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/vmalloc.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/dm-io.h> - -#define DM_MSG_PREFIX "persistent snapshot" -#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ - -/*----------------------------------------------------------------- - * Persistent snapshots, by persistent we mean that the snapshot - * will survive a reboot. - *---------------------------------------------------------------*/ - -/* - * We need to store a record of which parts of the origin have - * been copied to the snapshot device. The snapshot code - * requires that we copy exception chunks to chunk aligned areas - * of the COW store. It makes sense therefore, to store the - * metadata in chunk size blocks. 
- * - * There is no backward or forward compatibility implemented, - * snapshots with different disk versions than the kernel will - * not be usable. It is expected that "lvcreate" will blank out - * the start of a fresh COW device before calling the snapshot - * constructor. - * - * The first chunk of the COW device just contains the header. - * After this there is a chunk filled with exception metadata, - * followed by as many exception chunks as can fit in the - * metadata areas. - * - * All on disk structures are in little-endian format. The end - * of the exceptions info is indicated by an exception with a - * new_chunk of 0, which is invalid since it would point to the - * header chunk. - */ - -/* - * Magic for persistent snapshots: "SnAp" - Feeble isn't it. - */ -#define SNAP_MAGIC 0x70416e53 - -/* - * The on-disk version of the metadata. - */ -#define SNAPSHOT_DISK_VERSION 1 - -#define NUM_SNAPSHOT_HDR_CHUNKS 1 - -struct disk_header { - __le32 magic; - - /* - * Is this snapshot valid. There is no way of recovering - * an invalid snapshot. - */ - __le32 valid; - - /* - * Simple, incrementing version. no backward - * compatibility. - */ - __le32 version; - - /* In sectors */ - __le32 chunk_size; -} __packed; - -struct disk_exception { - __le64 old_chunk; - __le64 new_chunk; -} __packed; - -struct core_exception { - uint64_t old_chunk; - uint64_t new_chunk; -}; - -struct commit_callback { - void (*callback)(void *, int success); - void *context; -}; - -/* - * The top level structure for a persistent exception store. - */ -struct pstore { - struct dm_exception_store *store; - int version; - int valid; - uint32_t exceptions_per_area; - - /* - * Now that we have an asynchronous kcopyd there is no - * need for large chunk sizes, so it wont hurt to have a - * whole chunks worth of metadata in memory at once. - */ - void *area; - - /* - * An area of zeros used to clear the next area. - */ - void *zero_area; - - /* - * An area used for header. The header can be written - * concurrently with metadata (when invalidating the snapshot), - * so it needs a separate buffer. - */ - void *header_area; - - /* - * Used to keep track of which metadata area the data in - * 'chunk' refers to. - */ - chunk_t current_area; - - /* - * The next free chunk for an exception. - * - * When creating exceptions, all the chunks here and above are - * free. It holds the next chunk to be allocated. On rare - * occasions (e.g. after a system crash) holes can be left in - * the exception store because chunks can be committed out of - * order. - * - * When merging exceptions, it does not necessarily mean all the - * chunks here and above are free. It holds the value it would - * have held if all chunks had been committed in order of - * allocation. Consequently the value may occasionally be - * slightly too low, but since it's only used for 'status' and - * it can never reach its minimum value too early this doesn't - * matter. - */ - - chunk_t next_free; - - /* - * The index of next free exception in the current - * metadata area. - */ - uint32_t current_committed; - - atomic_t pending_count; - uint32_t callback_count; - struct commit_callback *callbacks; - struct dm_io_client *io_client; - - struct workqueue_struct *metadata_wq; -}; - -static int alloc_area(struct pstore *ps) -{ - int r = -ENOMEM; - size_t len; - - len = ps->store->chunk_size << SECTOR_SHIFT; - - /* - * Allocate the chunk_size block of memory that will hold - * a single metadata area. 
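Concretely, the header chunk described above starts with the four little-endian fields of struct disk_header. The sketch below parses illustrative bytes for a valid version-1 header with the default 32-sector chunk size; it assumes a little-endian host, whereas the kernel code converts with le32_to_cpu():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct disk_header {
	uint32_t magic;		/* __le32 "SnAp" (0x70416e53)             */
	uint32_t valid;		/* __le32, 1 while the snapshot is usable */
	uint32_t version;	/* __le32, SNAPSHOT_DISK_VERSION = 1      */
	uint32_t chunk_size;	/* __le32, in 512-byte sectors            */
};

int main(void)
{
	/* Illustrative bytes of a valid v1 header with a 32-sector chunk. */
	unsigned char buf[16] = {
		0x53, 0x6e, 0x41, 0x70,		/* magic, little endian */
		0x01, 0x00, 0x00, 0x00,		/* valid                */
		0x01, 0x00, 0x00, 0x00,		/* version              */
		0x20, 0x00, 0x00, 0x00,		/* chunk_size = 32      */
	};
	struct disk_header dh;

	memcpy(&dh, buf, sizeof(dh));	/* no byte swap: little-endian host assumed */
	printf("magic 0x%x valid %u version %u chunk_size %u sectors\n",
	       dh.magic, dh.valid, dh.version, dh.chunk_size);
	return 0;
}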
- */ - ps->area = vmalloc(len); - if (!ps->area) - goto err_area; - - ps->zero_area = vzalloc(len); - if (!ps->zero_area) - goto err_zero_area; - - ps->header_area = vmalloc(len); - if (!ps->header_area) - goto err_header_area; - - return 0; - -err_header_area: - vfree(ps->zero_area); - -err_zero_area: - vfree(ps->area); - -err_area: - return r; -} - -static void free_area(struct pstore *ps) -{ - if (ps->area) - vfree(ps->area); - ps->area = NULL; - - if (ps->zero_area) - vfree(ps->zero_area); - ps->zero_area = NULL; - - if (ps->header_area) - vfree(ps->header_area); - ps->header_area = NULL; -} - -struct mdata_req { - struct dm_io_region *where; - struct dm_io_request *io_req; - struct work_struct work; - int result; -}; - -static void do_metadata(struct work_struct *work) -{ - struct mdata_req *req = container_of(work, struct mdata_req, work); - - req->result = dm_io(req->io_req, 1, req->where, NULL); -} - -/* - * Read or write a chunk aligned and sized block of data from a device. - */ -static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, - int metadata) -{ - struct dm_io_region where = { - .bdev = dm_snap_cow(ps->store->snap)->bdev, - .sector = ps->store->chunk_size * chunk, - .count = ps->store->chunk_size, - }; - struct dm_io_request io_req = { - .bi_rw = rw, - .mem.type = DM_IO_VMA, - .mem.ptr.vma = area, - .client = ps->io_client, - .notify.fn = NULL, - }; - struct mdata_req req; - - if (!metadata) - return dm_io(&io_req, 1, &where, NULL); - - req.where = &where; - req.io_req = &io_req; - - /* - * Issue the synchronous I/O from a different thread - * to avoid generic_make_request recursion. - */ - INIT_WORK_ONSTACK(&req.work, do_metadata); - queue_work(ps->metadata_wq, &req.work); - flush_work(&req.work); - - return req.result; -} - -/* - * Convert a metadata area index to a chunk index. - */ -static chunk_t area_location(struct pstore *ps, chunk_t area) -{ - return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); -} - -/* - * Read or write a metadata area. Remembering to skip the first - * chunk which holds the header. 
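area_location() above encodes the layout just described: one header chunk, then repeating groups of one metadata chunk followed by exceptions_per_area data chunks. A small standalone illustration; 1024 exceptions per area corresponds to the default 16KiB chunk divided by the 16-byte struct disk_exception:

#include <stdint.h>
#include <stdio.h>

#define NUM_SNAPSHOT_HDR_CHUNKS 1

typedef uint64_t chunk_t;

/* Same formula as above: skip the header, then whole groups of
 * (one metadata chunk + exceptions_per_area data chunks). */
static chunk_t area_location(uint32_t exceptions_per_area, chunk_t area)
{
	return NUM_SNAPSHOT_HDR_CHUNKS + (chunk_t)(exceptions_per_area + 1) * area;
}

int main(void)
{
	uint32_t exceptions_per_area = 1024;	/* 16KiB chunk / 16-byte disk_exception */
	chunk_t area;

	for (area = 0; area < 3; area++)
		printf("metadata area %llu -> COW chunk %llu\n",
		       (unsigned long long)area,
		       (unsigned long long)area_location(exceptions_per_area, area));
	return 0;
}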
- */ -static int area_io(struct pstore *ps, int rw) -{ - int r; - chunk_t chunk; - - chunk = area_location(ps, ps->current_area); - - r = chunk_io(ps, ps->area, chunk, rw, 0); - if (r) - return r; - - return 0; -} - -static void zero_memory_area(struct pstore *ps) -{ - memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); -} - -static int zero_disk_area(struct pstore *ps, chunk_t area) -{ - return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0); -} - -static int read_header(struct pstore *ps, int *new_snapshot) -{ - int r; - struct disk_header *dh; - unsigned chunk_size; - int chunk_size_supplied = 1; - char *chunk_err; - - /* - * Use default chunk size (or logical_block_size, if larger) - * if none supplied - */ - if (!ps->store->chunk_size) { - ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> - bdev) >> 9); - ps->store->chunk_mask = ps->store->chunk_size - 1; - ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; - chunk_size_supplied = 0; - } - - ps->io_client = dm_io_client_create(); - if (IS_ERR(ps->io_client)) - return PTR_ERR(ps->io_client); - - r = alloc_area(ps); - if (r) - return r; - - r = chunk_io(ps, ps->header_area, 0, READ, 1); - if (r) - goto bad; - - dh = ps->header_area; - - if (le32_to_cpu(dh->magic) == 0) { - *new_snapshot = 1; - return 0; - } - - if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { - DMWARN("Invalid or corrupt snapshot"); - r = -ENXIO; - goto bad; - } - - *new_snapshot = 0; - ps->valid = le32_to_cpu(dh->valid); - ps->version = le32_to_cpu(dh->version); - chunk_size = le32_to_cpu(dh->chunk_size); - - if (ps->store->chunk_size == chunk_size) - return 0; - - if (chunk_size_supplied) - DMWARN("chunk size %u in device metadata overrides " - "table chunk size of %u.", - chunk_size, ps->store->chunk_size); - - /* We had a bogus chunk_size. Fix stuff up. */ - free_area(ps); - - r = dm_exception_store_set_chunk_size(ps->store, chunk_size, - &chunk_err); - if (r) { - DMERR("invalid on-disk chunk size %u: %s.", - chunk_size, chunk_err); - return r; - } - - r = alloc_area(ps); - return r; - -bad: - free_area(ps); - return r; -} - -static int write_header(struct pstore *ps) -{ - struct disk_header *dh; - - memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT); - - dh = ps->header_area; - dh->magic = cpu_to_le32(SNAP_MAGIC); - dh->valid = cpu_to_le32(ps->valid); - dh->version = cpu_to_le32(ps->version); - dh->chunk_size = cpu_to_le32(ps->store->chunk_size); - - return chunk_io(ps, ps->header_area, 0, WRITE, 1); -} - -/* - * Access functions for the disk exceptions, these do the endian conversions. 
- */ -static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) -{ - BUG_ON(index >= ps->exceptions_per_area); - - return ((struct disk_exception *) ps->area) + index; -} - -static void read_exception(struct pstore *ps, - uint32_t index, struct core_exception *result) -{ - struct disk_exception *de = get_exception(ps, index); - - /* copy it */ - result->old_chunk = le64_to_cpu(de->old_chunk); - result->new_chunk = le64_to_cpu(de->new_chunk); -} - -static void write_exception(struct pstore *ps, - uint32_t index, struct core_exception *e) -{ - struct disk_exception *de = get_exception(ps, index); - - /* copy it */ - de->old_chunk = cpu_to_le64(e->old_chunk); - de->new_chunk = cpu_to_le64(e->new_chunk); -} - -static void clear_exception(struct pstore *ps, uint32_t index) -{ - struct disk_exception *de = get_exception(ps, index); - - /* clear it */ - de->old_chunk = 0; - de->new_chunk = 0; -} - -/* - * Registers the exceptions that are present in the current area. - * 'full' is filled in to indicate if the area has been - * filled. - */ -static int insert_exceptions(struct pstore *ps, - int (*callback)(void *callback_context, - chunk_t old, chunk_t new), - void *callback_context, - int *full) -{ - int r; - unsigned int i; - struct core_exception e; - - /* presume the area is full */ - *full = 1; - - for (i = 0; i < ps->exceptions_per_area; i++) { - read_exception(ps, i, &e); - - /* - * If the new_chunk is pointing at the start of - * the COW device, where the first metadata area - * is we know that we've hit the end of the - * exceptions. Therefore the area is not full. - */ - if (e.new_chunk == 0LL) { - ps->current_committed = i; - *full = 0; - break; - } - - /* - * Keep track of the start of the free chunks. - */ - if (ps->next_free <= e.new_chunk) - ps->next_free = e.new_chunk + 1; - - /* - * Otherwise we add the exception to the snapshot. - */ - r = callback(callback_context, e.old_chunk, e.new_chunk); - if (r) - return r; - } - - return 0; -} - -static int read_exceptions(struct pstore *ps, - int (*callback)(void *callback_context, chunk_t old, - chunk_t new), - void *callback_context) -{ - int r, full = 1; - - /* - * Keeping reading chunks and inserting exceptions until - * we find a partially full area. - */ - for (ps->current_area = 0; full; ps->current_area++) { - r = area_io(ps, READ); - if (r) - return r; - - r = insert_exceptions(ps, callback, callback_context, &full); - if (r) - return r; - } - - ps->current_area--; - - return 0; -} - -static struct pstore *get_info(struct dm_exception_store *store) -{ - return (struct pstore *) store->context; -} - -static void persistent_usage(struct dm_exception_store *store, - sector_t *total_sectors, - sector_t *sectors_allocated, - sector_t *metadata_sectors) -{ - struct pstore *ps = get_info(store); - - *sectors_allocated = ps->next_free * store->chunk_size; - *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); - - /* - * First chunk is the fixed header. - * Then there are (ps->current_area + 1) metadata chunks, each one - * separated from the next by ps->exceptions_per_area data chunks. 
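A worked example of the accounting persistent_usage() describes above, using the default 32-sector chunk and hypothetical progress values for current_area and next_free:

#include <stdint.h>
#include <stdio.h>

#define NUM_SNAPSHOT_HDR_CHUNKS 1

int main(void)
{
	uint64_t chunk_size = 32;	/* sectors: the 16KiB default               */
	uint64_t current_area = 2;	/* hypothetical: third metadata area in use */
	uint64_t next_free = 2100;	/* hypothetical: next chunk to be allocated */

	/* Same arithmetic persistent_usage() applies to the real pstore. */
	uint64_t sectors_allocated = next_free * chunk_size;
	uint64_t metadata_sectors =
		(current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * chunk_size;

	printf("allocated %llu sectors, %llu of them header+metadata\n",
	       (unsigned long long)sectors_allocated,
	       (unsigned long long)metadata_sectors);
	return 0;
}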
- */ - *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * - store->chunk_size; -} - -static void persistent_dtr(struct dm_exception_store *store) -{ - struct pstore *ps = get_info(store); - - destroy_workqueue(ps->metadata_wq); - - /* Created in read_header */ - if (ps->io_client) - dm_io_client_destroy(ps->io_client); - free_area(ps); - - /* Allocated in persistent_read_metadata */ - if (ps->callbacks) - vfree(ps->callbacks); - - kfree(ps); -} - -static int persistent_read_metadata(struct dm_exception_store *store, - int (*callback)(void *callback_context, - chunk_t old, chunk_t new), - void *callback_context) -{ - int r, uninitialized_var(new_snapshot); - struct pstore *ps = get_info(store); - - /* - * Read the snapshot header. - */ - r = read_header(ps, &new_snapshot); - if (r) - return r; - - /* - * Now we know correct chunk_size, complete the initialisation. - */ - ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / - sizeof(struct disk_exception); - ps->callbacks = dm_vcalloc(ps->exceptions_per_area, - sizeof(*ps->callbacks)); - if (!ps->callbacks) - return -ENOMEM; - - /* - * Do we need to setup a new snapshot ? - */ - if (new_snapshot) { - r = write_header(ps); - if (r) { - DMWARN("write_header failed"); - return r; - } - - ps->current_area = 0; - zero_memory_area(ps); - r = zero_disk_area(ps, 0); - if (r) - DMWARN("zero_disk_area(0) failed"); - return r; - } - /* - * Sanity checks. - */ - if (ps->version != SNAPSHOT_DISK_VERSION) { - DMWARN("unable to handle snapshot disk version %d", - ps->version); - return -EINVAL; - } - - /* - * Metadata are valid, but snapshot is invalidated - */ - if (!ps->valid) - return 1; - - /* - * Read the metadata. - */ - r = read_exceptions(ps, callback, callback_context); - - return r; -} - -static int persistent_prepare_exception(struct dm_exception_store *store, - struct dm_exception *e) -{ - struct pstore *ps = get_info(store); - uint32_t stride; - chunk_t next_free; - sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); - - /* Is there enough room ? */ - if (size < ((ps->next_free + 1) * store->chunk_size)) - return -ENOSPC; - - e->new_chunk = ps->next_free; - - /* - * Move onto the next free pending, making sure to take - * into account the location of the metadata chunks. - */ - stride = (ps->exceptions_per_area + 1); - next_free = ++ps->next_free; - if (sector_div(next_free, stride) == 1) - ps->next_free++; - - atomic_inc(&ps->pending_count); - return 0; -} - -static void persistent_commit_exception(struct dm_exception_store *store, - struct dm_exception *e, - void (*callback) (void *, int success), - void *callback_context) -{ - unsigned int i; - struct pstore *ps = get_info(store); - struct core_exception ce; - struct commit_callback *cb; - - ce.old_chunk = e->old_chunk; - ce.new_chunk = e->new_chunk; - write_exception(ps, ps->current_committed++, &ce); - - /* - * Add the callback to the back of the array. This code - * is the only place where the callback array is - * manipulated, and we know that it will never be called - * multiple times concurrently. - */ - cb = ps->callbacks + ps->callback_count++; - cb->callback = callback; - cb->context = callback_context; - - /* - * If there are exceptions in flight and we have not yet - * filled this metadata area there's nothing more to do. - */ - if (!atomic_dec_and_test(&ps->pending_count) && - (ps->current_committed != ps->exceptions_per_area)) - return; - - /* - * If we completely filled the current area, then wipe the next one. 
- */ - if ((ps->current_committed == ps->exceptions_per_area) && - zero_disk_area(ps, ps->current_area + 1)) - ps->valid = 0; - - /* - * Commit exceptions to disk. - */ - if (ps->valid && area_io(ps, WRITE_FLUSH_FUA)) - ps->valid = 0; - - /* - * Advance to the next area if this one is full. - */ - if (ps->current_committed == ps->exceptions_per_area) { - ps->current_committed = 0; - ps->current_area++; - zero_memory_area(ps); - } - - for (i = 0; i < ps->callback_count; i++) { - cb = ps->callbacks + i; - cb->callback(cb->context, ps->valid); - } - - ps->callback_count = 0; -} - -static int persistent_prepare_merge(struct dm_exception_store *store, - chunk_t *last_old_chunk, - chunk_t *last_new_chunk) -{ - struct pstore *ps = get_info(store); - struct core_exception ce; - int nr_consecutive; - int r; - - /* - * When current area is empty, move back to preceding area. - */ - if (!ps->current_committed) { - /* - * Have we finished? - */ - if (!ps->current_area) - return 0; - - ps->current_area--; - r = area_io(ps, READ); - if (r < 0) - return r; - ps->current_committed = ps->exceptions_per_area; - } - - read_exception(ps, ps->current_committed - 1, &ce); - *last_old_chunk = ce.old_chunk; - *last_new_chunk = ce.new_chunk; - - /* - * Find number of consecutive chunks within the current area, - * working backwards. - */ - for (nr_consecutive = 1; nr_consecutive < ps->current_committed; - nr_consecutive++) { - read_exception(ps, ps->current_committed - 1 - nr_consecutive, - &ce); - if (ce.old_chunk != *last_old_chunk - nr_consecutive || - ce.new_chunk != *last_new_chunk - nr_consecutive) - break; - } - - return nr_consecutive; -} - -static int persistent_commit_merge(struct dm_exception_store *store, - int nr_merged) -{ - int r, i; - struct pstore *ps = get_info(store); - - BUG_ON(nr_merged > ps->current_committed); - - for (i = 0; i < nr_merged; i++) - clear_exception(ps, ps->current_committed - 1 - i); - - r = area_io(ps, WRITE_FLUSH_FUA); - if (r < 0) - return r; - - ps->current_committed -= nr_merged; - - /* - * At this stage, only persistent_usage() uses ps->next_free, so - * we make no attempt to keep ps->next_free strictly accurate - * as exceptions may have been committed out-of-order originally. - * Once a snapshot has become merging, we set it to the value it - * would have held had all the exceptions been committed in order. - * - * ps->current_area does not get reduced by prepare_merge() until - * after commit_merge() has removed the nr_merged previous exceptions. 
- */ - ps->next_free = area_location(ps, ps->current_area) + - ps->current_committed + 1; - - return 0; -} - -static void persistent_drop_snapshot(struct dm_exception_store *store) -{ - struct pstore *ps = get_info(store); - - ps->valid = 0; - if (write_header(ps)) - DMWARN("write header failed"); -} - -static int persistent_ctr(struct dm_exception_store *store, - unsigned argc, char **argv) -{ - struct pstore *ps; - - /* allocate the pstore */ - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - return -ENOMEM; - - ps->store = store; - ps->valid = 1; - ps->version = SNAPSHOT_DISK_VERSION; - ps->area = NULL; - ps->zero_area = NULL; - ps->header_area = NULL; - ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */ - ps->current_committed = 0; - - ps->callback_count = 0; - atomic_set(&ps->pending_count, 0); - ps->callbacks = NULL; - - ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); - if (!ps->metadata_wq) { - kfree(ps); - DMERR("couldn't start header metadata update thread"); - return -ENOMEM; - } - - store->context = ps; - - return 0; -} - -static unsigned persistent_status(struct dm_exception_store *store, - status_type_t status, char *result, - unsigned maxlen) -{ - unsigned sz = 0; - - switch (status) { - case STATUSTYPE_INFO: - break; - case STATUSTYPE_TABLE: - DMEMIT(" P %llu", (unsigned long long)store->chunk_size); - } - - return sz; -} - -static struct dm_exception_store_type _persistent_type = { - .name = "persistent", - .module = THIS_MODULE, - .ctr = persistent_ctr, - .dtr = persistent_dtr, - .read_metadata = persistent_read_metadata, - .prepare_exception = persistent_prepare_exception, - .commit_exception = persistent_commit_exception, - .prepare_merge = persistent_prepare_merge, - .commit_merge = persistent_commit_merge, - .drop_snapshot = persistent_drop_snapshot, - .usage = persistent_usage, - .status = persistent_status, -}; - -static struct dm_exception_store_type _persistent_compat_type = { - .name = "P", - .module = THIS_MODULE, - .ctr = persistent_ctr, - .dtr = persistent_dtr, - .read_metadata = persistent_read_metadata, - .prepare_exception = persistent_prepare_exception, - .commit_exception = persistent_commit_exception, - .prepare_merge = persistent_prepare_merge, - .commit_merge = persistent_commit_merge, - .drop_snapshot = persistent_drop_snapshot, - .usage = persistent_usage, - .status = persistent_status, -}; - -int dm_persistent_snapshot_init(void) -{ - int r; - - r = dm_exception_store_type_register(&_persistent_type); - if (r) { - DMERR("Unable to register persistent exception store type"); - return r; - } - - r = dm_exception_store_type_register(&_persistent_compat_type); - if (r) { - DMERR("Unable to register old-style persistent exception " - "store type"); - dm_exception_store_type_unregister(&_persistent_type); - return r; - } - - return r; -} - -void dm_persistent_snapshot_exit(void) -{ - dm_exception_store_type_unregister(&_persistent_type); - dm_exception_store_type_unregister(&_persistent_compat_type); -} diff --git a/ANDROID_3.4.5/drivers/md/dm-snap-transient.c b/ANDROID_3.4.5/drivers/md/dm-snap-transient.c deleted file mode 100644 index 1ce9a258..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-snap-transient.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * Copyright (C) 2006-2008 Red Hat GmbH - * - * This file is released under the GPL. 
- */ - -#include "dm-exception-store.h" - -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/vmalloc.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/dm-io.h> - -#define DM_MSG_PREFIX "transient snapshot" - -/*----------------------------------------------------------------- - * Implementation of the store for non-persistent snapshots. - *---------------------------------------------------------------*/ -struct transient_c { - sector_t next_free; -}; - -static void transient_dtr(struct dm_exception_store *store) -{ - kfree(store->context); -} - -static int transient_read_metadata(struct dm_exception_store *store, - int (*callback)(void *callback_context, - chunk_t old, chunk_t new), - void *callback_context) -{ - return 0; -} - -static int transient_prepare_exception(struct dm_exception_store *store, - struct dm_exception *e) -{ - struct transient_c *tc = store->context; - sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); - - if (size < (tc->next_free + store->chunk_size)) - return -1; - - e->new_chunk = sector_to_chunk(store, tc->next_free); - tc->next_free += store->chunk_size; - - return 0; -} - -static void transient_commit_exception(struct dm_exception_store *store, - struct dm_exception *e, - void (*callback) (void *, int success), - void *callback_context) -{ - /* Just succeed */ - callback(callback_context, 1); -} - -static void transient_usage(struct dm_exception_store *store, - sector_t *total_sectors, - sector_t *sectors_allocated, - sector_t *metadata_sectors) -{ - *sectors_allocated = ((struct transient_c *) store->context)->next_free; - *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); - *metadata_sectors = 0; -} - -static int transient_ctr(struct dm_exception_store *store, - unsigned argc, char **argv) -{ - struct transient_c *tc; - - tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); - if (!tc) - return -ENOMEM; - - tc->next_free = 0; - store->context = tc; - - return 0; -} - -static unsigned transient_status(struct dm_exception_store *store, - status_type_t status, char *result, - unsigned maxlen) -{ - unsigned sz = 0; - - switch (status) { - case STATUSTYPE_INFO: - break; - case STATUSTYPE_TABLE: - DMEMIT(" N %llu", (unsigned long long)store->chunk_size); - } - - return sz; -} - -static struct dm_exception_store_type _transient_type = { - .name = "transient", - .module = THIS_MODULE, - .ctr = transient_ctr, - .dtr = transient_dtr, - .read_metadata = transient_read_metadata, - .prepare_exception = transient_prepare_exception, - .commit_exception = transient_commit_exception, - .usage = transient_usage, - .status = transient_status, -}; - -static struct dm_exception_store_type _transient_compat_type = { - .name = "N", - .module = THIS_MODULE, - .ctr = transient_ctr, - .dtr = transient_dtr, - .read_metadata = transient_read_metadata, - .prepare_exception = transient_prepare_exception, - .commit_exception = transient_commit_exception, - .usage = transient_usage, - .status = transient_status, -}; - -int dm_transient_snapshot_init(void) -{ - int r; - - r = dm_exception_store_type_register(&_transient_type); - if (r) { - DMWARN("Unable to register transient exception store type"); - return r; - } - - r = dm_exception_store_type_register(&_transient_compat_type); - if (r) { - DMWARN("Unable to register old-style transient " - "exception store type"); - dm_exception_store_type_unregister(&_transient_type); - return r; - } - - return r; -} - -void dm_transient_snapshot_exit(void) -{ - 
dm_exception_store_type_unregister(&_transient_type); - dm_exception_store_type_unregister(&_transient_compat_type); -} diff --git a/ANDROID_3.4.5/drivers/md/dm-snap.c b/ANDROID_3.4.5/drivers/md/dm-snap.c deleted file mode 100644 index 6f758870..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-snap.c +++ /dev/null @@ -1,2329 +0,0 @@ -/* - * dm-snapshot.c - * - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * - * This file is released under the GPL. - */ - -#include <linux/blkdev.h> -#include <linux/device-mapper.h> -#include <linux/delay.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kdev_t.h> -#include <linux/list.h> -#include <linux/mempool.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/log2.h> -#include <linux/dm-kcopyd.h> - -#include "dm-exception-store.h" - -#define DM_MSG_PREFIX "snapshots" - -static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; - -#define dm_target_is_snapshot_merge(ti) \ - ((ti)->type->name == dm_snapshot_merge_target_name) - -/* - * The size of the mempool used to track chunks in use. - */ -#define MIN_IOS 256 - -#define DM_TRACKED_CHUNK_HASH_SIZE 16 -#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ - (DM_TRACKED_CHUNK_HASH_SIZE - 1)) - -struct dm_exception_table { - uint32_t hash_mask; - unsigned hash_shift; - struct list_head *table; -}; - -struct dm_snapshot { - struct rw_semaphore lock; - - struct dm_dev *origin; - struct dm_dev *cow; - - struct dm_target *ti; - - /* List of snapshots per Origin */ - struct list_head list; - - /* - * You can't use a snapshot if this is 0 (e.g. if full). - * A snapshot-merge target never clears this. - */ - int valid; - - /* Origin writes don't trigger exceptions until this is set */ - int active; - - atomic_t pending_exceptions_count; - - mempool_t *pending_pool; - - struct dm_exception_table pending; - struct dm_exception_table complete; - - /* - * pe_lock protects all pending_exception operations and access - * as well as the snapshot_bios list. - */ - spinlock_t pe_lock; - - /* Chunks with outstanding reads */ - spinlock_t tracked_chunk_lock; - mempool_t *tracked_chunk_pool; - struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; - - /* The on disk metadata handler */ - struct dm_exception_store *store; - - struct dm_kcopyd_client *kcopyd_client; - - /* Wait for events based on state_bits */ - unsigned long state_bits; - - /* Range of chunks currently being merged. */ - chunk_t first_merging_chunk; - int num_merging_chunks; - - /* - * The merge operation failed if this flag is set. - * Failure modes are handled as follows: - * - I/O error reading the header - * => don't load the target; abort. - * - Header does not have "valid" flag set - * => use the origin; forget about the snapshot. - * - I/O error when reading exceptions - * => don't load the target; abort. - * (We can't use the intermediate origin state.) - * - I/O error while merging - * => stop merging; set merge_failed; process I/O normally. - */ - int merge_failed; - - /* - * Incoming bios that overlap with chunks being merged must wait - * for them to be committed. - */ - struct bio_list bios_queued_during_merge; -}; - -/* - * state_bits: - * RUNNING_MERGE - Merge operation is in progress. - * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; - * cleared afterwards. 
- */ -#define RUNNING_MERGE 0 -#define SHUTDOWN_MERGE 1 - -struct dm_dev *dm_snap_origin(struct dm_snapshot *s) -{ - return s->origin; -} -EXPORT_SYMBOL(dm_snap_origin); - -struct dm_dev *dm_snap_cow(struct dm_snapshot *s) -{ - return s->cow; -} -EXPORT_SYMBOL(dm_snap_cow); - -static sector_t chunk_to_sector(struct dm_exception_store *store, - chunk_t chunk) -{ - return chunk << store->chunk_shift; -} - -static int bdev_equal(struct block_device *lhs, struct block_device *rhs) -{ - /* - * There is only ever one instance of a particular block - * device so we can compare pointers safely. - */ - return lhs == rhs; -} - -struct dm_snap_pending_exception { - struct dm_exception e; - - /* - * Origin buffers waiting for this to complete are held - * in a bio list - */ - struct bio_list origin_bios; - struct bio_list snapshot_bios; - - /* Pointer back to snapshot context */ - struct dm_snapshot *snap; - - /* - * 1 indicates the exception has already been sent to - * kcopyd. - */ - int started; - - /* - * For writing a complete chunk, bypassing the copy. - */ - struct bio *full_bio; - bio_end_io_t *full_bio_end_io; - void *full_bio_private; -}; - -/* - * Hash table mapping origin volumes to lists of snapshots and - * a lock to protect it - */ -static struct kmem_cache *exception_cache; -static struct kmem_cache *pending_cache; - -struct dm_snap_tracked_chunk { - struct hlist_node node; - chunk_t chunk; -}; - -static struct kmem_cache *tracked_chunk_cache; - -static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s, - chunk_t chunk) -{ - struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool, - GFP_NOIO); - unsigned long flags; - - c->chunk = chunk; - - spin_lock_irqsave(&s->tracked_chunk_lock, flags); - hlist_add_head(&c->node, - &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); - spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); - - return c; -} - -static void stop_tracking_chunk(struct dm_snapshot *s, - struct dm_snap_tracked_chunk *c) -{ - unsigned long flags; - - spin_lock_irqsave(&s->tracked_chunk_lock, flags); - hlist_del(&c->node); - spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); - - mempool_free(c, s->tracked_chunk_pool); -} - -static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) -{ - struct dm_snap_tracked_chunk *c; - struct hlist_node *hn; - int found = 0; - - spin_lock_irq(&s->tracked_chunk_lock); - - hlist_for_each_entry(c, hn, - &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { - if (c->chunk == chunk) { - found = 1; - break; - } - } - - spin_unlock_irq(&s->tracked_chunk_lock); - - return found; -} - -/* - * This conflicting I/O is extremely improbable in the caller, - * so msleep(1) is sufficient and there is no need for a wait queue. - */ -static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) -{ - while (__chunk_is_tracked(s, chunk)) - msleep(1); -} - -/* - * One of these per registered origin, held in the snapshot_origins hash - */ -struct origin { - /* The origin device */ - struct block_device *bdev; - - struct list_head hash_list; - - /* List of snapshots for this origin */ - struct list_head snapshots; -}; - -/* - * Size of the hash table for origin volumes. 
If we make this - * the size of the minors list then it should be nearly perfect - */ -#define ORIGIN_HASH_SIZE 256 -#define ORIGIN_MASK 0xFF -static struct list_head *_origins; -static struct rw_semaphore _origins_lock; - -static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); -static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); -static uint64_t _pending_exceptions_done_count; - -static int init_origin_hash(void) -{ - int i; - - _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), - GFP_KERNEL); - if (!_origins) { - DMERR("unable to allocate memory"); - return -ENOMEM; - } - - for (i = 0; i < ORIGIN_HASH_SIZE; i++) - INIT_LIST_HEAD(_origins + i); - init_rwsem(&_origins_lock); - - return 0; -} - -static void exit_origin_hash(void) -{ - kfree(_origins); -} - -static unsigned origin_hash(struct block_device *bdev) -{ - return bdev->bd_dev & ORIGIN_MASK; -} - -static struct origin *__lookup_origin(struct block_device *origin) -{ - struct list_head *ol; - struct origin *o; - - ol = &_origins[origin_hash(origin)]; - list_for_each_entry (o, ol, hash_list) - if (bdev_equal(o->bdev, origin)) - return o; - - return NULL; -} - -static void __insert_origin(struct origin *o) -{ - struct list_head *sl = &_origins[origin_hash(o->bdev)]; - list_add_tail(&o->hash_list, sl); -} - -/* - * _origins_lock must be held when calling this function. - * Returns number of snapshots registered using the supplied cow device, plus: - * snap_src - a snapshot suitable for use as a source of exception handover - * snap_dest - a snapshot capable of receiving exception handover. - * snap_merge - an existing snapshot-merge target linked to the same origin. - * There can be at most one snapshot-merge target. The parameter is optional. - * - * Possible return values and states of snap_src and snap_dest. - * 0: NULL, NULL - first new snapshot - * 1: snap_src, NULL - normal snapshot - * 2: snap_src, snap_dest - waiting for handover - * 2: snap_src, NULL - handed over, waiting for old to be deleted - * 1: NULL, snap_dest - source got destroyed without handover - */ -static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, - struct dm_snapshot **snap_src, - struct dm_snapshot **snap_dest, - struct dm_snapshot **snap_merge) -{ - struct dm_snapshot *s; - struct origin *o; - int count = 0; - int active; - - o = __lookup_origin(snap->origin->bdev); - if (!o) - goto out; - - list_for_each_entry(s, &o->snapshots, list) { - if (dm_target_is_snapshot_merge(s->ti) && snap_merge) - *snap_merge = s; - if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) - continue; - - down_read(&s->lock); - active = s->active; - up_read(&s->lock); - - if (active) { - if (snap_src) - *snap_src = s; - } else if (snap_dest) - *snap_dest = s; - - count++; - } - -out: - return count; -} - -/* - * On success, returns 1 if this snapshot is a handover destination, - * otherwise returns 0. - */ -static int __validate_exception_handover(struct dm_snapshot *snap) -{ - struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - struct dm_snapshot *snap_merge = NULL; - - /* Does snapshot need exceptions handed over to it? */ - if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, - &snap_merge) == 2) || - snap_dest) { - snap->ti->error = "Snapshot cow pairing for exception " - "table handover failed"; - return -EINVAL; - } - - /* - * If no snap_src was found, snap cannot become a handover - * destination. - */ - if (!snap_src) - return 0; - - /* - * Non-snapshot-merge handover? 
- */ - if (!dm_target_is_snapshot_merge(snap->ti)) - return 1; - - /* - * Do not allow more than one merging snapshot. - */ - if (snap_merge) { - snap->ti->error = "A snapshot is already merging."; - return -EINVAL; - } - - if (!snap_src->store->type->prepare_merge || - !snap_src->store->type->commit_merge) { - snap->ti->error = "Snapshot exception store does not " - "support snapshot-merge."; - return -EINVAL; - } - - return 1; -} - -static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) -{ - struct dm_snapshot *l; - - /* Sort the list according to chunk size, largest-first smallest-last */ - list_for_each_entry(l, &o->snapshots, list) - if (l->store->chunk_size < s->store->chunk_size) - break; - list_add_tail(&s->list, &l->list); -} - -/* - * Make a note of the snapshot and its origin so we can look it - * up when the origin has a write on it. - * - * Also validate snapshot exception store handovers. - * On success, returns 1 if this registration is a handover destination, - * otherwise returns 0. - */ -static int register_snapshot(struct dm_snapshot *snap) -{ - struct origin *o, *new_o = NULL; - struct block_device *bdev = snap->origin->bdev; - int r = 0; - - new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); - if (!new_o) - return -ENOMEM; - - down_write(&_origins_lock); - - r = __validate_exception_handover(snap); - if (r < 0) { - kfree(new_o); - goto out; - } - - o = __lookup_origin(bdev); - if (o) - kfree(new_o); - else { - /* New origin */ - o = new_o; - - /* Initialise the struct */ - INIT_LIST_HEAD(&o->snapshots); - o->bdev = bdev; - - __insert_origin(o); - } - - __insert_snapshot(o, snap); - -out: - up_write(&_origins_lock); - - return r; -} - -/* - * Move snapshot to correct place in list according to chunk size. - */ -static void reregister_snapshot(struct dm_snapshot *s) -{ - struct block_device *bdev = s->origin->bdev; - - down_write(&_origins_lock); - - list_del(&s->list); - __insert_snapshot(__lookup_origin(bdev), s); - - up_write(&_origins_lock); -} - -static void unregister_snapshot(struct dm_snapshot *s) -{ - struct origin *o; - - down_write(&_origins_lock); - o = __lookup_origin(s->origin->bdev); - - list_del(&s->list); - if (o && list_empty(&o->snapshots)) { - list_del(&o->hash_list); - kfree(o); - } - - up_write(&_origins_lock); -} - -/* - * Implementation of the exception hash tables. - * The lowest hash_shift bits of the chunk number are ignored, allowing - * some consecutive chunks to be grouped together. - */ -static int dm_exception_table_init(struct dm_exception_table *et, - uint32_t size, unsigned hash_shift) -{ - unsigned int i; - - et->hash_shift = hash_shift; - et->hash_mask = size - 1; - et->table = dm_vcalloc(size, sizeof(struct list_head)); - if (!et->table) - return -ENOMEM; - - for (i = 0; i < size; i++) - INIT_LIST_HEAD(et->table + i); - - return 0; -} - -static void dm_exception_table_exit(struct dm_exception_table *et, - struct kmem_cache *mem) -{ - struct list_head *slot; - struct dm_exception *ex, *next; - int i, size; - - size = et->hash_mask + 1; - for (i = 0; i < size; i++) { - slot = et->table + i; - - list_for_each_entry_safe (ex, next, slot, hash_list) - kmem_cache_free(mem, ex); - } - - vfree(et->table); -} - -static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) -{ - return (chunk >> et->hash_shift) & et->hash_mask; -} - -static void dm_remove_exception(struct dm_exception *e) -{ - list_del(&e->hash_list); -} - -/* - * Return the exception data for a sector, or NULL if not - * remapped. 
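 *
 * (A minimal sketch of the lookup below, using only the helpers above:
 *      slot = (chunk >> et->hash_shift) & et->hash_mask;
 *  and an entry matches any chunk in the inclusive range
 *      [e->old_chunk, e->old_chunk + dm_consecutive_chunk_count(e)]
 *  because runs of consecutive chunks share a single exception.)
 *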
- */ -static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, - chunk_t chunk) -{ - struct list_head *slot; - struct dm_exception *e; - - slot = &et->table[exception_hash(et, chunk)]; - list_for_each_entry (e, slot, hash_list) - if (chunk >= e->old_chunk && - chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) - return e; - - return NULL; -} - -static struct dm_exception *alloc_completed_exception(void) -{ - struct dm_exception *e; - - e = kmem_cache_alloc(exception_cache, GFP_NOIO); - if (!e) - e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); - - return e; -} - -static void free_completed_exception(struct dm_exception *e) -{ - kmem_cache_free(exception_cache, e); -} - -static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s) -{ - struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool, - GFP_NOIO); - - atomic_inc(&s->pending_exceptions_count); - pe->snap = s; - - return pe; -} - -static void free_pending_exception(struct dm_snap_pending_exception *pe) -{ - struct dm_snapshot *s = pe->snap; - - mempool_free(pe, s->pending_pool); - smp_mb__before_atomic_dec(); - atomic_dec(&s->pending_exceptions_count); -} - -static void dm_insert_exception(struct dm_exception_table *eh, - struct dm_exception *new_e) -{ - struct list_head *l; - struct dm_exception *e = NULL; - - l = &eh->table[exception_hash(eh, new_e->old_chunk)]; - - /* Add immediately if this table doesn't support consecutive chunks */ - if (!eh->hash_shift) - goto out; - - /* List is ordered by old_chunk */ - list_for_each_entry_reverse(e, l, hash_list) { - /* Insert after an existing chunk? */ - if (new_e->old_chunk == (e->old_chunk + - dm_consecutive_chunk_count(e) + 1) && - new_e->new_chunk == (dm_chunk_number(e->new_chunk) + - dm_consecutive_chunk_count(e) + 1)) { - dm_consecutive_chunk_count_inc(e); - free_completed_exception(new_e); - return; - } - - /* Insert before an existing chunk? */ - if (new_e->old_chunk == (e->old_chunk - 1) && - new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { - dm_consecutive_chunk_count_inc(e); - e->old_chunk--; - e->new_chunk--; - free_completed_exception(new_e); - return; - } - - if (new_e->old_chunk > e->old_chunk) - break; - } - -out: - list_add(&new_e->hash_list, e ? &e->hash_list : l); -} - -/* - * Callback used by the exception stores to load exceptions when - * initialising. - */ -static int dm_add_exception(void *context, chunk_t old, chunk_t new) -{ - struct dm_snapshot *s = context; - struct dm_exception *e; - - e = alloc_completed_exception(); - if (!e) - return -ENOMEM; - - e->old_chunk = old; - - /* Consecutive_count is implicitly initialised to zero */ - e->new_chunk = new; - - dm_insert_exception(&s->complete, e); - - return 0; -} - -/* - * Return a minimum chunk size of all snapshots that have the specified origin. - * Return zero if the origin has no snapshots. - */ -static sector_t __minimum_chunk_size(struct origin *o) -{ - struct dm_snapshot *snap; - unsigned chunk_size = 0; - - if (o) - list_for_each_entry(snap, &o->snapshots, list) - chunk_size = min_not_zero(chunk_size, - snap->store->chunk_size); - - return chunk_size; -} - -/* - * Hard coded magic. - */ -static int calc_max_buckets(void) -{ - /* use a fixed size of 2MB */ - unsigned long mem = 2 * 1024 * 1024; - mem /= sizeof(struct list_head); - - return mem; -} - -/* - * Allocate room for a suitable hash table. 
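 *
 * (Rough numbers, assuming a 64-bit build where struct list_head is
 *  16 bytes: calc_max_buckets() above caps the table at
 *  2MB / 16 = 131072 buckets, and the code below clamps the result to
 *  at least 64 and rounds it down to a power of two.)
 *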
- */ -static int init_hash_tables(struct dm_snapshot *s) -{ - sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; - - /* - * Calculate based on the size of the original volume or - * the COW volume... - */ - cow_dev_size = get_dev_size(s->cow->bdev); - origin_dev_size = get_dev_size(s->origin->bdev); - max_buckets = calc_max_buckets(); - - hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; - hash_size = min(hash_size, max_buckets); - - if (hash_size < 64) - hash_size = 64; - hash_size = rounddown_pow_of_two(hash_size); - if (dm_exception_table_init(&s->complete, hash_size, - DM_CHUNK_CONSECUTIVE_BITS)) - return -ENOMEM; - - /* - * Allocate hash table for in-flight exceptions - * Make this smaller than the real hash table - */ - hash_size >>= 3; - if (hash_size < 64) - hash_size = 64; - - if (dm_exception_table_init(&s->pending, hash_size, 0)) { - dm_exception_table_exit(&s->complete, exception_cache); - return -ENOMEM; - } - - return 0; -} - -static void merge_shutdown(struct dm_snapshot *s) -{ - clear_bit_unlock(RUNNING_MERGE, &s->state_bits); - smp_mb__after_clear_bit(); - wake_up_bit(&s->state_bits, RUNNING_MERGE); -} - -static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) -{ - s->first_merging_chunk = 0; - s->num_merging_chunks = 0; - - return bio_list_get(&s->bios_queued_during_merge); -} - -/* - * Remove one chunk from the index of completed exceptions. - */ -static int __remove_single_exception_chunk(struct dm_snapshot *s, - chunk_t old_chunk) -{ - struct dm_exception *e; - - e = dm_lookup_exception(&s->complete, old_chunk); - if (!e) { - DMERR("Corruption detected: exception for block %llu is " - "on disk but not in memory", - (unsigned long long)old_chunk); - return -EINVAL; - } - - /* - * If this is the only chunk using this exception, remove exception. - */ - if (!dm_consecutive_chunk_count(e)) { - dm_remove_exception(e); - free_completed_exception(e); - return 0; - } - - /* - * The chunk may be either at the beginning or the end of a - * group of consecutive chunks - never in the middle. We are - * removing chunks in the opposite order to that in which they - * were added, so this should always be true. - * Decrement the consecutive chunk counter and adjust the - * starting point if necessary. - */ - if (old_chunk == e->old_chunk) { - e->old_chunk++; - e->new_chunk++; - } else if (old_chunk != e->old_chunk + - dm_consecutive_chunk_count(e)) { - DMERR("Attempt to merge block %llu from the " - "middle of a chunk range [%llu - %llu]", - (unsigned long long)old_chunk, - (unsigned long long)e->old_chunk, - (unsigned long long) - e->old_chunk + dm_consecutive_chunk_count(e)); - return -EINVAL; - } - - dm_consecutive_chunk_count_dec(e); - - return 0; -} - -static void flush_bios(struct bio *bio); - -static int remove_single_exception_chunk(struct dm_snapshot *s) -{ - struct bio *b = NULL; - int r; - chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; - - down_write(&s->lock); - - /* - * Process chunks (and associated exceptions) in reverse order - * so that dm_consecutive_chunk_count_dec() accounting works. 
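 *
 * (Worked example with made-up numbers: if chunks 10..13 were just
 *  merged, first_merging_chunk is 10 and num_merging_chunks is 4, so
 *  old_chunk starts at 13 and the loop walks down to 10; each step
 *  therefore trims only the start or the end of a consecutive group,
 *  never the middle, as __remove_single_exception_chunk() requires.)
 *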
- */ - do { - r = __remove_single_exception_chunk(s, old_chunk); - if (r) - goto out; - } while (old_chunk-- > s->first_merging_chunk); - - b = __release_queued_bios_after_merge(s); - -out: - up_write(&s->lock); - if (b) - flush_bios(b); - - return r; -} - -static int origin_write_extent(struct dm_snapshot *merging_snap, - sector_t sector, unsigned chunk_size); - -static void merge_callback(int read_err, unsigned long write_err, - void *context); - -static uint64_t read_pending_exceptions_done_count(void) -{ - uint64_t pending_exceptions_done; - - spin_lock(&_pending_exceptions_done_spinlock); - pending_exceptions_done = _pending_exceptions_done_count; - spin_unlock(&_pending_exceptions_done_spinlock); - - return pending_exceptions_done; -} - -static void increment_pending_exceptions_done_count(void) -{ - spin_lock(&_pending_exceptions_done_spinlock); - _pending_exceptions_done_count++; - spin_unlock(&_pending_exceptions_done_spinlock); - - wake_up_all(&_pending_exceptions_done); -} - -static void snapshot_merge_next_chunks(struct dm_snapshot *s) -{ - int i, linear_chunks; - chunk_t old_chunk, new_chunk; - struct dm_io_region src, dest; - sector_t io_size; - uint64_t previous_count; - - BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); - if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) - goto shut; - - /* - * valid flag never changes during merge, so no lock required. - */ - if (!s->valid) { - DMERR("Snapshot is invalid: can't merge"); - goto shut; - } - - linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, - &new_chunk); - if (linear_chunks <= 0) { - if (linear_chunks < 0) { - DMERR("Read error in exception store: " - "shutting down merge"); - down_write(&s->lock); - s->merge_failed = 1; - up_write(&s->lock); - } - goto shut; - } - - /* Adjust old_chunk and new_chunk to reflect start of linear region */ - old_chunk = old_chunk + 1 - linear_chunks; - new_chunk = new_chunk + 1 - linear_chunks; - - /* - * Use one (potentially large) I/O to copy all 'linear_chunks' - * from the exception store to the origin - */ - io_size = linear_chunks * s->store->chunk_size; - - dest.bdev = s->origin->bdev; - dest.sector = chunk_to_sector(s->store, old_chunk); - dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); - - src.bdev = s->cow->bdev; - src.sector = chunk_to_sector(s->store, new_chunk); - src.count = dest.count; - - /* - * Reallocate any exceptions needed in other snapshots then - * wait for the pending exceptions to complete. - * Each time any pending exception (globally on the system) - * completes we are woken and repeat the process to find out - * if we can proceed. While this may not seem a particularly - * efficient algorithm, it is not expected to have any - * significant impact on performance. - */ - previous_count = read_pending_exceptions_done_count(); - while (origin_write_extent(s, dest.sector, io_size)) { - wait_event(_pending_exceptions_done, - (read_pending_exceptions_done_count() != - previous_count)); - /* Retry after the wait, until all exceptions are done. 
*/ - previous_count = read_pending_exceptions_done_count(); - } - - down_write(&s->lock); - s->first_merging_chunk = old_chunk; - s->num_merging_chunks = linear_chunks; - up_write(&s->lock); - - /* Wait until writes to all 'linear_chunks' drain */ - for (i = 0; i < linear_chunks; i++) - __check_for_conflicting_io(s, old_chunk + i); - - dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); - return; - -shut: - merge_shutdown(s); -} - -static void error_bios(struct bio *bio); - -static void merge_callback(int read_err, unsigned long write_err, void *context) -{ - struct dm_snapshot *s = context; - struct bio *b = NULL; - - if (read_err || write_err) { - if (read_err) - DMERR("Read error: shutting down merge."); - else - DMERR("Write error: shutting down merge."); - goto shut; - } - - if (s->store->type->commit_merge(s->store, - s->num_merging_chunks) < 0) { - DMERR("Write error in exception store: shutting down merge"); - goto shut; - } - - if (remove_single_exception_chunk(s) < 0) - goto shut; - - snapshot_merge_next_chunks(s); - - return; - -shut: - down_write(&s->lock); - s->merge_failed = 1; - b = __release_queued_bios_after_merge(s); - up_write(&s->lock); - error_bios(b); - - merge_shutdown(s); -} - -static void start_merge(struct dm_snapshot *s) -{ - if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) - snapshot_merge_next_chunks(s); -} - -static int wait_schedule(void *ptr) -{ - schedule(); - - return 0; -} - -/* - * Stop the merging process and wait until it finishes. - */ -static void stop_merge(struct dm_snapshot *s) -{ - set_bit(SHUTDOWN_MERGE, &s->state_bits); - wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, - TASK_UNINTERRUPTIBLE); - clear_bit(SHUTDOWN_MERGE, &s->state_bits); -} - -/* - * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> - */ -static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - struct dm_snapshot *s; - int i; - int r = -EINVAL; - char *origin_path, *cow_path; - unsigned args_used, num_flush_requests = 1; - fmode_t origin_mode = FMODE_READ; - - if (argc != 4) { - ti->error = "requires exactly 4 arguments"; - r = -EINVAL; - goto bad; - } - - if (dm_target_is_snapshot_merge(ti)) { - num_flush_requests = 2; - origin_mode = FMODE_WRITE; - } - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) { - ti->error = "Cannot allocate private snapshot structure"; - r = -ENOMEM; - goto bad; - } - - origin_path = argv[0]; - argv++; - argc--; - - r = dm_get_device(ti, origin_path, origin_mode, &s->origin); - if (r) { - ti->error = "Cannot get origin device"; - goto bad_origin; - } - - cow_path = argv[0]; - argv++; - argc--; - - r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); - if (r) { - ti->error = "Cannot get COW device"; - goto bad_cow; - } - - r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); - if (r) { - ti->error = "Couldn't create exception store"; - r = -EINVAL; - goto bad_store; - } - - argv += args_used; - argc -= args_used; - - s->ti = ti; - s->valid = 1; - s->active = 0; - atomic_set(&s->pending_exceptions_count, 0); - init_rwsem(&s->lock); - INIT_LIST_HEAD(&s->list); - spin_lock_init(&s->pe_lock); - s->state_bits = 0; - s->merge_failed = 0; - s->first_merging_chunk = 0; - s->num_merging_chunks = 0; - bio_list_init(&s->bios_queued_during_merge); - - /* Allocate hash table for COW data */ - if (init_hash_tables(s)) { - ti->error = "Unable to allocate hash table space"; - r = -ENOMEM; - goto bad_hash_tables; - } - - s->kcopyd_client = 
dm_kcopyd_client_create(); - if (IS_ERR(s->kcopyd_client)) { - r = PTR_ERR(s->kcopyd_client); - ti->error = "Could not create kcopyd client"; - goto bad_kcopyd; - } - - s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); - if (!s->pending_pool) { - ti->error = "Could not allocate mempool for pending exceptions"; - goto bad_pending_pool; - } - - s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS, - tracked_chunk_cache); - if (!s->tracked_chunk_pool) { - ti->error = "Could not allocate tracked_chunk mempool for " - "tracking reads"; - goto bad_tracked_chunk_pool; - } - - for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) - INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]); - - spin_lock_init(&s->tracked_chunk_lock); - - ti->private = s; - ti->num_flush_requests = num_flush_requests; - - /* Add snapshot to the list of snapshots for this origin */ - /* Exceptions aren't triggered till snapshot_resume() is called */ - r = register_snapshot(s); - if (r == -ENOMEM) { - ti->error = "Snapshot origin struct allocation failed"; - goto bad_load_and_register; - } else if (r < 0) { - /* invalid handover, register_snapshot has set ti->error */ - goto bad_load_and_register; - } - - /* - * Metadata must only be loaded into one table at once, so skip this - * if metadata will be handed over during resume. - * Chunk size will be set during the handover - set it to zero to - * ensure it's ignored. - */ - if (r > 0) { - s->store->chunk_size = 0; - return 0; - } - - r = s->store->type->read_metadata(s->store, dm_add_exception, - (void *)s); - if (r < 0) { - ti->error = "Failed to read snapshot metadata"; - goto bad_read_metadata; - } else if (r > 0) { - s->valid = 0; - DMWARN("Snapshot is marked invalid."); - } - - if (!s->store->chunk_size) { - ti->error = "Chunk size not set"; - goto bad_read_metadata; - } - ti->split_io = s->store->chunk_size; - - return 0; - -bad_read_metadata: - unregister_snapshot(s); - -bad_load_and_register: - mempool_destroy(s->tracked_chunk_pool); - -bad_tracked_chunk_pool: - mempool_destroy(s->pending_pool); - -bad_pending_pool: - dm_kcopyd_client_destroy(s->kcopyd_client); - -bad_kcopyd: - dm_exception_table_exit(&s->pending, pending_cache); - dm_exception_table_exit(&s->complete, exception_cache); - -bad_hash_tables: - dm_exception_store_destroy(s->store); - -bad_store: - dm_put_device(ti, s->cow); - -bad_cow: - dm_put_device(ti, s->origin); - -bad_origin: - kfree(s); - -bad: - return r; -} - -static void __free_exceptions(struct dm_snapshot *s) -{ - dm_kcopyd_client_destroy(s->kcopyd_client); - s->kcopyd_client = NULL; - - dm_exception_table_exit(&s->pending, pending_cache); - dm_exception_table_exit(&s->complete, exception_cache); -} - -static void __handover_exceptions(struct dm_snapshot *snap_src, - struct dm_snapshot *snap_dest) -{ - union { - struct dm_exception_table table_swap; - struct dm_exception_store *store_swap; - } u; - - /* - * Swap all snapshot context information between the two instances. - */ - u.table_swap = snap_dest->complete; - snap_dest->complete = snap_src->complete; - snap_src->complete = u.table_swap; - - u.store_swap = snap_dest->store; - snap_dest->store = snap_src->store; - snap_src->store = u.store_swap; - - snap_dest->store->snap = snap_dest; - snap_src->store->snap = snap_src; - - snap_dest->ti->split_io = snap_dest->store->chunk_size; - snap_dest->valid = snap_src->valid; - - /* - * Set source invalid to ensure it receives no further I/O. 
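 *
 * (This corresponds to the "handed over, waiting for old to be
 *  deleted" state listed above __find_snapshots_sharing_cow(): after
 *  the swap the source is left holding the destination's old store,
 *  so marking it invalid keeps any stray I/O away from the live
 *  exception table.)
 *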
- */ - snap_src->valid = 0; -} - -static void snapshot_dtr(struct dm_target *ti) -{ -#ifdef CONFIG_DM_DEBUG - int i; -#endif - struct dm_snapshot *s = ti->private; - struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - - down_read(&_origins_lock); - /* Check whether exception handover must be cancelled */ - (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); - if (snap_src && snap_dest && (s == snap_src)) { - down_write(&snap_dest->lock); - snap_dest->valid = 0; - up_write(&snap_dest->lock); - DMERR("Cancelling snapshot handover."); - } - up_read(&_origins_lock); - - if (dm_target_is_snapshot_merge(ti)) - stop_merge(s); - - /* Prevent further origin writes from using this snapshot. */ - /* After this returns there can be no new kcopyd jobs. */ - unregister_snapshot(s); - - while (atomic_read(&s->pending_exceptions_count)) - msleep(1); - /* - * Ensure instructions in mempool_destroy aren't reordered - * before atomic_read. - */ - smp_mb(); - -#ifdef CONFIG_DM_DEBUG - for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) - BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); -#endif - - mempool_destroy(s->tracked_chunk_pool); - - __free_exceptions(s); - - mempool_destroy(s->pending_pool); - - dm_exception_store_destroy(s->store); - - dm_put_device(ti, s->cow); - - dm_put_device(ti, s->origin); - - kfree(s); -} - -/* - * Flush a list of buffers. - */ -static void flush_bios(struct bio *bio) -{ - struct bio *n; - - while (bio) { - n = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = n; - } -} - -static int do_origin(struct dm_dev *origin, struct bio *bio); - -/* - * Flush a list of buffers. - */ -static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) -{ - struct bio *n; - int r; - - while (bio) { - n = bio->bi_next; - bio->bi_next = NULL; - r = do_origin(s->origin, bio); - if (r == DM_MAPIO_REMAPPED) - generic_make_request(bio); - bio = n; - } -} - -/* - * Error a list of buffers. - */ -static void error_bios(struct bio *bio) -{ - struct bio *n; - - while (bio) { - n = bio->bi_next; - bio->bi_next = NULL; - bio_io_error(bio); - bio = n; - } -} - -static void __invalidate_snapshot(struct dm_snapshot *s, int err) -{ - if (!s->valid) - return; - - if (err == -EIO) - DMERR("Invalidating snapshot: Error reading/writing."); - else if (err == -ENOMEM) - DMERR("Invalidating snapshot: Unable to allocate exception."); - - if (s->store->type->drop_snapshot) - s->store->type->drop_snapshot(s->store); - - s->valid = 0; - - dm_table_event(s->ti->table); -} - -static void pending_complete(struct dm_snap_pending_exception *pe, int success) -{ - struct dm_exception *e; - struct dm_snapshot *s = pe->snap; - struct bio *origin_bios = NULL; - struct bio *snapshot_bios = NULL; - struct bio *full_bio = NULL; - int error = 0; - - if (!success) { - /* Read/write error - snapshot is unusable */ - down_write(&s->lock); - __invalidate_snapshot(s, -EIO); - error = 1; - goto out; - } - - e = alloc_completed_exception(); - if (!e) { - down_write(&s->lock); - __invalidate_snapshot(s, -ENOMEM); - error = 1; - goto out; - } - *e = pe->e; - - down_write(&s->lock); - if (!s->valid) { - free_completed_exception(e); - error = 1; - goto out; - } - - /* Check for conflicting reads */ - __check_for_conflicting_io(s, pe->e.old_chunk); - - /* - * Add a proper exception, and remove the - * in-flight exception from the list. 
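 *
 * (Ordering note: __check_for_conflicting_io() above has already
 *  drained any tracked reads of this chunk -- those redirected to the
 *  origin in snapshot_map() -- so by the time the completed exception
 *  becomes visible in s->complete, no reader can still be relying on
 *  the origin copy of the data.)
 *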
- */ - dm_insert_exception(&s->complete, e); - -out: - dm_remove_exception(&pe->e); - snapshot_bios = bio_list_get(&pe->snapshot_bios); - origin_bios = bio_list_get(&pe->origin_bios); - full_bio = pe->full_bio; - if (full_bio) { - full_bio->bi_end_io = pe->full_bio_end_io; - full_bio->bi_private = pe->full_bio_private; - } - free_pending_exception(pe); - - increment_pending_exceptions_done_count(); - - up_write(&s->lock); - - /* Submit any pending write bios */ - if (error) { - if (full_bio) - bio_io_error(full_bio); - error_bios(snapshot_bios); - } else { - if (full_bio) - bio_endio(full_bio, 0); - flush_bios(snapshot_bios); - } - - retry_origin_bios(s, origin_bios); -} - -static void commit_callback(void *context, int success) -{ - struct dm_snap_pending_exception *pe = context; - - pending_complete(pe, success); -} - -/* - * Called when the copy I/O has finished. kcopyd actually runs - * this code so don't block. - */ -static void copy_callback(int read_err, unsigned long write_err, void *context) -{ - struct dm_snap_pending_exception *pe = context; - struct dm_snapshot *s = pe->snap; - - if (read_err || write_err) - pending_complete(pe, 0); - - else - /* Update the metadata if we are persistent */ - s->store->type->commit_exception(s->store, &pe->e, - commit_callback, pe); -} - -/* - * Dispatches the copy operation to kcopyd. - */ -static void start_copy(struct dm_snap_pending_exception *pe) -{ - struct dm_snapshot *s = pe->snap; - struct dm_io_region src, dest; - struct block_device *bdev = s->origin->bdev; - sector_t dev_size; - - dev_size = get_dev_size(bdev); - - src.bdev = bdev; - src.sector = chunk_to_sector(s->store, pe->e.old_chunk); - src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); - - dest.bdev = s->cow->bdev; - dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); - dest.count = src.count; - - /* Hand over to kcopyd */ - dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); -} - -static void full_bio_end_io(struct bio *bio, int error) -{ - void *callback_data = bio->bi_private; - - dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0); -} - -static void start_full_bio(struct dm_snap_pending_exception *pe, - struct bio *bio) -{ - struct dm_snapshot *s = pe->snap; - void *callback_data; - - pe->full_bio = bio; - pe->full_bio_end_io = bio->bi_end_io; - pe->full_bio_private = bio->bi_private; - - callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, - copy_callback, pe); - - bio->bi_end_io = full_bio_end_io; - bio->bi_private = callback_data; - - generic_make_request(bio); -} - -static struct dm_snap_pending_exception * -__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) -{ - struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); - - if (!e) - return NULL; - - return container_of(e, struct dm_snap_pending_exception, e); -} - -/* - * Looks to see if this snapshot already has a pending exception - * for this chunk, otherwise it allocates a new one and inserts - * it into the pending table. - * - * NOTE: a write lock must be held on snap->lock before calling - * this. 
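 *
 * (Callers follow a drop-and-recheck pattern around this: they release
 *  s->lock to call alloc_pending_exception(), which may sleep in the
 *  mempool, retake the lock, re-check the complete table, and only
 *  then call this function; if another thread won the race, the
 *  pre-allocated pe is freed and the existing pending exception is
 *  returned instead.)
 *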
- */ -static struct dm_snap_pending_exception * -__find_pending_exception(struct dm_snapshot *s, - struct dm_snap_pending_exception *pe, chunk_t chunk) -{ - struct dm_snap_pending_exception *pe2; - - pe2 = __lookup_pending_exception(s, chunk); - if (pe2) { - free_pending_exception(pe); - return pe2; - } - - pe->e.old_chunk = chunk; - bio_list_init(&pe->origin_bios); - bio_list_init(&pe->snapshot_bios); - pe->started = 0; - pe->full_bio = NULL; - - if (s->store->type->prepare_exception(s->store, &pe->e)) { - free_pending_exception(pe); - return NULL; - } - - dm_insert_exception(&s->pending, &pe->e); - - return pe; -} - -static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, - struct bio *bio, chunk_t chunk) -{ - bio->bi_bdev = s->cow->bdev; - bio->bi_sector = chunk_to_sector(s->store, - dm_chunk_number(e->new_chunk) + - (chunk - e->old_chunk)) + - (bio->bi_sector & - s->store->chunk_mask); -} - -static int snapshot_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - struct dm_exception *e; - struct dm_snapshot *s = ti->private; - int r = DM_MAPIO_REMAPPED; - chunk_t chunk; - struct dm_snap_pending_exception *pe = NULL; - - if (bio->bi_rw & REQ_FLUSH) { - bio->bi_bdev = s->cow->bdev; - return DM_MAPIO_REMAPPED; - } - - chunk = sector_to_chunk(s->store, bio->bi_sector); - - /* Full snapshots are not usable */ - /* To get here the table must be live so s->active is always set. */ - if (!s->valid) - return -EIO; - - /* FIXME: should only take write lock if we need - * to copy an exception */ - down_write(&s->lock); - - if (!s->valid) { - r = -EIO; - goto out_unlock; - } - - /* If the block is already remapped - use that, else remap it */ - e = dm_lookup_exception(&s->complete, chunk); - if (e) { - remap_exception(s, e, bio, chunk); - goto out_unlock; - } - - /* - * Write to snapshot - higher level takes care of RW/RO - * flags so we should only get this if we are - * writeable. - */ - if (bio_rw(bio) == WRITE) { - pe = __lookup_pending_exception(s, chunk); - if (!pe) { - up_write(&s->lock); - pe = alloc_pending_exception(s); - down_write(&s->lock); - - if (!s->valid) { - free_pending_exception(pe); - r = -EIO; - goto out_unlock; - } - - e = dm_lookup_exception(&s->complete, chunk); - if (e) { - free_pending_exception(pe); - remap_exception(s, e, bio, chunk); - goto out_unlock; - } - - pe = __find_pending_exception(s, pe, chunk); - if (!pe) { - __invalidate_snapshot(s, -ENOMEM); - r = -EIO; - goto out_unlock; - } - } - - remap_exception(s, &pe->e, bio, chunk); - - r = DM_MAPIO_SUBMITTED; - - if (!pe->started && - bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { - pe->started = 1; - up_write(&s->lock); - start_full_bio(pe, bio); - goto out; - } - - bio_list_add(&pe->snapshot_bios, bio); - - if (!pe->started) { - /* this is protected by snap->lock */ - pe->started = 1; - up_write(&s->lock); - start_copy(pe); - goto out; - } - } else { - bio->bi_bdev = s->origin->bdev; - map_context->ptr = track_chunk(s, chunk); - } - -out_unlock: - up_write(&s->lock); -out: - return r; -} - -/* - * A snapshot-merge target behaves like a combination of a snapshot - * target and a snapshot-origin target. It only generates new - * exceptions in other snapshots and not in the one that is being - * merged. - * - * For each chunk, if there is an existing exception, it is used to - * redirect I/O to the cow device. Otherwise I/O is sent to the origin, - * which in turn might generate exceptions in other snapshots. 
- * If merging is currently taking place on the chunk in question, the - * I/O is deferred by adding it to s->bios_queued_during_merge. - */ -static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - struct dm_exception *e; - struct dm_snapshot *s = ti->private; - int r = DM_MAPIO_REMAPPED; - chunk_t chunk; - - if (bio->bi_rw & REQ_FLUSH) { - if (!map_context->target_request_nr) - bio->bi_bdev = s->origin->bdev; - else - bio->bi_bdev = s->cow->bdev; - map_context->ptr = NULL; - return DM_MAPIO_REMAPPED; - } - - chunk = sector_to_chunk(s->store, bio->bi_sector); - - down_write(&s->lock); - - /* Full merging snapshots are redirected to the origin */ - if (!s->valid) - goto redirect_to_origin; - - /* If the block is already remapped - use that */ - e = dm_lookup_exception(&s->complete, chunk); - if (e) { - /* Queue writes overlapping with chunks being merged */ - if (bio_rw(bio) == WRITE && - chunk >= s->first_merging_chunk && - chunk < (s->first_merging_chunk + - s->num_merging_chunks)) { - bio->bi_bdev = s->origin->bdev; - bio_list_add(&s->bios_queued_during_merge, bio); - r = DM_MAPIO_SUBMITTED; - goto out_unlock; - } - - remap_exception(s, e, bio, chunk); - - if (bio_rw(bio) == WRITE) - map_context->ptr = track_chunk(s, chunk); - goto out_unlock; - } - -redirect_to_origin: - bio->bi_bdev = s->origin->bdev; - - if (bio_rw(bio) == WRITE) { - up_write(&s->lock); - return do_origin(s->origin, bio); - } - -out_unlock: - up_write(&s->lock); - - return r; -} - -static int snapshot_end_io(struct dm_target *ti, struct bio *bio, - int error, union map_info *map_context) -{ - struct dm_snapshot *s = ti->private; - struct dm_snap_tracked_chunk *c = map_context->ptr; - - if (c) - stop_tracking_chunk(s, c); - - return 0; -} - -static void snapshot_merge_presuspend(struct dm_target *ti) -{ - struct dm_snapshot *s = ti->private; - - stop_merge(s); -} - -static int snapshot_preresume(struct dm_target *ti) -{ - int r = 0; - struct dm_snapshot *s = ti->private; - struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - - down_read(&_origins_lock); - (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); - if (snap_src && snap_dest) { - down_read(&snap_src->lock); - if (s == snap_src) { - DMERR("Unable to resume snapshot source until " - "handover completes."); - r = -EINVAL; - } else if (!dm_suspended(snap_src->ti)) { - DMERR("Unable to perform snapshot handover until " - "source is suspended."); - r = -EINVAL; - } - up_read(&snap_src->lock); - } - up_read(&_origins_lock); - - return r; -} - -static void snapshot_resume(struct dm_target *ti) -{ - struct dm_snapshot *s = ti->private; - struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - - down_read(&_origins_lock); - (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); - if (snap_src && snap_dest) { - down_write(&snap_src->lock); - down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); - __handover_exceptions(snap_src, snap_dest); - up_write(&snap_dest->lock); - up_write(&snap_src->lock); - } - up_read(&_origins_lock); - - /* Now we have correct chunk size, reregister */ - reregister_snapshot(s); - - down_write(&s->lock); - s->active = 1; - up_write(&s->lock); -} - -static sector_t get_origin_minimum_chunksize(struct block_device *bdev) -{ - sector_t min_chunksize; - - down_read(&_origins_lock); - min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); - up_read(&_origins_lock); - - return min_chunksize; -} - -static void snapshot_merge_resume(struct dm_target *ti) 
-{ - struct dm_snapshot *s = ti->private; - - /* - * Handover exceptions from existing snapshot. - */ - snapshot_resume(ti); - - /* - * snapshot-merge acts as an origin, so set ti->split_io - */ - ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); - - start_merge(s); -} - -static int snapshot_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) -{ - unsigned sz = 0; - struct dm_snapshot *snap = ti->private; - - switch (type) { - case STATUSTYPE_INFO: - - down_write(&snap->lock); - - if (!snap->valid) - DMEMIT("Invalid"); - else if (snap->merge_failed) - DMEMIT("Merge failed"); - else { - if (snap->store->type->usage) { - sector_t total_sectors, sectors_allocated, - metadata_sectors; - snap->store->type->usage(snap->store, - &total_sectors, - §ors_allocated, - &metadata_sectors); - DMEMIT("%llu/%llu %llu", - (unsigned long long)sectors_allocated, - (unsigned long long)total_sectors, - (unsigned long long)metadata_sectors); - } - else - DMEMIT("Unknown"); - } - - up_write(&snap->lock); - - break; - - case STATUSTYPE_TABLE: - /* - * kdevname returns a static pointer so we need - * to make private copies if the output is to - * make sense. - */ - DMEMIT("%s %s", snap->origin->name, snap->cow->name); - snap->store->type->status(snap->store, type, result + sz, - maxlen - sz); - break; - } - - return 0; -} - -static int snapshot_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct dm_snapshot *snap = ti->private; - int r; - - r = fn(ti, snap->origin, 0, ti->len, data); - - if (!r) - r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); - - return r; -} - - -/*----------------------------------------------------------------- - * Origin methods - *---------------------------------------------------------------*/ - -/* - * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any - * supplied bio was ignored. The caller may submit it immediately. - * (No remapping actually occurs as the origin is always a direct linear - * map.) - * - * If further exceptions are required, DM_MAPIO_SUBMITTED is returned - * and any supplied bio is added to a list to be submitted once all - * the necessary exceptions exist. - */ -static int __origin_write(struct list_head *snapshots, sector_t sector, - struct bio *bio) -{ - int r = DM_MAPIO_REMAPPED; - struct dm_snapshot *snap; - struct dm_exception *e; - struct dm_snap_pending_exception *pe; - struct dm_snap_pending_exception *pe_to_start_now = NULL; - struct dm_snap_pending_exception *pe_to_start_last = NULL; - chunk_t chunk; - - /* Do all the snapshots on this origin */ - list_for_each_entry (snap, snapshots, list) { - /* - * Don't make new exceptions in a merging snapshot - * because it has effectively been deleted - */ - if (dm_target_is_snapshot_merge(snap->ti)) - continue; - - down_write(&snap->lock); - - /* Only deal with valid and active snapshots */ - if (!snap->valid || !snap->active) - goto next_snapshot; - - /* Nothing to do if writing beyond end of snapshot */ - if (sector >= dm_table_get_size(snap->ti->table)) - goto next_snapshot; - - /* - * Remember, different snapshots can have - * different chunk sizes. - */ - chunk = sector_to_chunk(snap->store, sector); - - /* - * Check exception table to see if block - * is already remapped in this snapshot - * and trigger an exception if not. 
- */ - e = dm_lookup_exception(&snap->complete, chunk); - if (e) - goto next_snapshot; - - pe = __lookup_pending_exception(snap, chunk); - if (!pe) { - up_write(&snap->lock); - pe = alloc_pending_exception(snap); - down_write(&snap->lock); - - if (!snap->valid) { - free_pending_exception(pe); - goto next_snapshot; - } - - e = dm_lookup_exception(&snap->complete, chunk); - if (e) { - free_pending_exception(pe); - goto next_snapshot; - } - - pe = __find_pending_exception(snap, pe, chunk); - if (!pe) { - __invalidate_snapshot(snap, -ENOMEM); - goto next_snapshot; - } - } - - r = DM_MAPIO_SUBMITTED; - - /* - * If an origin bio was supplied, queue it to wait for the - * completion of this exception, and start this one last, - * at the end of the function. - */ - if (bio) { - bio_list_add(&pe->origin_bios, bio); - bio = NULL; - - if (!pe->started) { - pe->started = 1; - pe_to_start_last = pe; - } - } - - if (!pe->started) { - pe->started = 1; - pe_to_start_now = pe; - } - -next_snapshot: - up_write(&snap->lock); - - if (pe_to_start_now) { - start_copy(pe_to_start_now); - pe_to_start_now = NULL; - } - } - - /* - * Submit the exception against which the bio is queued last, - * to give the other exceptions a head start. - */ - if (pe_to_start_last) - start_copy(pe_to_start_last); - - return r; -} - -/* - * Called on a write from the origin driver. - */ -static int do_origin(struct dm_dev *origin, struct bio *bio) -{ - struct origin *o; - int r = DM_MAPIO_REMAPPED; - - down_read(&_origins_lock); - o = __lookup_origin(origin->bdev); - if (o) - r = __origin_write(&o->snapshots, bio->bi_sector, bio); - up_read(&_origins_lock); - - return r; -} - -/* - * Trigger exceptions in all non-merging snapshots. - * - * The chunk size of the merging snapshot may be larger than the chunk - * size of some other snapshot so we may need to reallocate multiple - * chunks in other snapshots. - * - * We scan all the overlapping exceptions in the other snapshots. - * Returns 1 if anything was reallocated and must be waited for, - * otherwise returns 0. - * - * size must be a multiple of merging_snap's chunk_size. - */ -static int origin_write_extent(struct dm_snapshot *merging_snap, - sector_t sector, unsigned size) -{ - int must_wait = 0; - sector_t n; - struct origin *o; - - /* - * The origin's __minimum_chunk_size() got stored in split_io - * by snapshot_merge_resume(). - */ - down_read(&_origins_lock); - o = __lookup_origin(merging_snap->origin->bdev); - for (n = 0; n < size; n += merging_snap->ti->split_io) - if (__origin_write(&o->snapshots, sector + n, NULL) == - DM_MAPIO_SUBMITTED) - must_wait = 1; - up_read(&_origins_lock); - - return must_wait; -} - -/* - * Origin: maps a linear range of a device, with hooks for snapshotting. - */ - -/* - * Construct an origin mapping: <dev_path> - * The context for an origin is merely a 'struct dm_dev *' - * pointing to the real device. 
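 *
 * (Hypothetical user-space sketch, matching the argument counts in
 *  origin_ctr() below and snapshot_ctr() above; the device names,
 *  sector count placeholder and 32-sector chunk size are made up:
 *      dmsetup create base-origin --table \
 *          "0 <origin_sectors> snapshot-origin /dev/vg/base"
 *      dmsetup create snap1 --table \
 *          "0 <origin_sectors> snapshot /dev/vg/base /dev/vg/cow P 32"
 *  )
 *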
- */ -static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - int r; - struct dm_dev *dev; - - if (argc != 1) { - ti->error = "origin: incorrect number of arguments"; - return -EINVAL; - } - - r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); - if (r) { - ti->error = "Cannot get target device"; - return r; - } - - ti->private = dev; - ti->num_flush_requests = 1; - - return 0; -} - -static void origin_dtr(struct dm_target *ti) -{ - struct dm_dev *dev = ti->private; - dm_put_device(ti, dev); -} - -static int origin_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - struct dm_dev *dev = ti->private; - bio->bi_bdev = dev->bdev; - - if (bio->bi_rw & REQ_FLUSH) - return DM_MAPIO_REMAPPED; - - /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; -} - -/* - * Set the target "split_io" field to the minimum of all the snapshots' - * chunk sizes. - */ -static void origin_resume(struct dm_target *ti) -{ - struct dm_dev *dev = ti->private; - - ti->split_io = get_origin_minimum_chunksize(dev->bdev); -} - -static int origin_status(struct dm_target *ti, status_type_t type, char *result, - unsigned int maxlen) -{ - struct dm_dev *dev = ti->private; - - switch (type) { - case STATUSTYPE_INFO: - result[0] = '\0'; - break; - - case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s", dev->name); - break; - } - - return 0; -} - -static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct dm_dev *dev = ti->private; - struct request_queue *q = bdev_get_queue(dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = dev->bdev; - bvm->bi_sector = bvm->bi_sector; - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - -static int origin_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct dm_dev *dev = ti->private; - - return fn(ti, dev, 0, ti->len, data); -} - -static struct target_type origin_target = { - .name = "snapshot-origin", - .version = {1, 7, 1}, - .module = THIS_MODULE, - .ctr = origin_ctr, - .dtr = origin_dtr, - .map = origin_map, - .resume = origin_resume, - .status = origin_status, - .merge = origin_merge, - .iterate_devices = origin_iterate_devices, -}; - -static struct target_type snapshot_target = { - .name = "snapshot", - .version = {1, 10, 0}, - .module = THIS_MODULE, - .ctr = snapshot_ctr, - .dtr = snapshot_dtr, - .map = snapshot_map, - .end_io = snapshot_end_io, - .preresume = snapshot_preresume, - .resume = snapshot_resume, - .status = snapshot_status, - .iterate_devices = snapshot_iterate_devices, -}; - -static struct target_type merge_target = { - .name = dm_snapshot_merge_target_name, - .version = {1, 1, 0}, - .module = THIS_MODULE, - .ctr = snapshot_ctr, - .dtr = snapshot_dtr, - .map = snapshot_merge_map, - .end_io = snapshot_end_io, - .presuspend = snapshot_merge_presuspend, - .preresume = snapshot_preresume, - .resume = snapshot_merge_resume, - .status = snapshot_status, - .iterate_devices = snapshot_iterate_devices, -}; - -static int __init dm_snapshot_init(void) -{ - int r; - - r = dm_exception_store_init(); - if (r) { - DMERR("Failed to initialize exception stores"); - return r; - } - - r = dm_register_target(&snapshot_target); - if (r < 0) { - DMERR("snapshot target register failed %d", r); - goto bad_register_snapshot_target; - } - - r = dm_register_target(&origin_target); - if (r < 0) { - DMERR("Origin target 
register failed %d", r); - goto bad_register_origin_target; - } - - r = dm_register_target(&merge_target); - if (r < 0) { - DMERR("Merge target register failed %d", r); - goto bad_register_merge_target; - } - - r = init_origin_hash(); - if (r) { - DMERR("init_origin_hash failed."); - goto bad_origin_hash; - } - - exception_cache = KMEM_CACHE(dm_exception, 0); - if (!exception_cache) { - DMERR("Couldn't create exception cache."); - r = -ENOMEM; - goto bad_exception_cache; - } - - pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); - if (!pending_cache) { - DMERR("Couldn't create pending cache."); - r = -ENOMEM; - goto bad_pending_cache; - } - - tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); - if (!tracked_chunk_cache) { - DMERR("Couldn't create cache to track chunks in use."); - r = -ENOMEM; - goto bad_tracked_chunk_cache; - } - - return 0; - -bad_tracked_chunk_cache: - kmem_cache_destroy(pending_cache); -bad_pending_cache: - kmem_cache_destroy(exception_cache); -bad_exception_cache: - exit_origin_hash(); -bad_origin_hash: - dm_unregister_target(&merge_target); -bad_register_merge_target: - dm_unregister_target(&origin_target); -bad_register_origin_target: - dm_unregister_target(&snapshot_target); -bad_register_snapshot_target: - dm_exception_store_exit(); - - return r; -} - -static void __exit dm_snapshot_exit(void) -{ - dm_unregister_target(&snapshot_target); - dm_unregister_target(&origin_target); - dm_unregister_target(&merge_target); - - exit_origin_hash(); - kmem_cache_destroy(pending_cache); - kmem_cache_destroy(exception_cache); - kmem_cache_destroy(tracked_chunk_cache); - - dm_exception_store_exit(); -} - -/* Module hooks */ -module_init(dm_snapshot_init); -module_exit(dm_snapshot_exit); - -MODULE_DESCRIPTION(DM_NAME " snapshot target"); -MODULE_AUTHOR("Joe Thornber"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-stripe.c b/ANDROID_3.4.5/drivers/md/dm-stripe.c deleted file mode 100644 index 35c94ff2..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-stripe.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Copyright (C) 2001-2003 Sistina Software (UK) Limited. - * - * This file is released under the GPL. - */ - -#include <linux/device-mapper.h> - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/bio.h> -#include <linux/slab.h> -#include <linux/log2.h> - -#define DM_MSG_PREFIX "striped" -#define DM_IO_ERROR_THRESHOLD 15 - -struct stripe { - struct dm_dev *dev; - sector_t physical_start; - - atomic_t error_count; -}; - -struct stripe_c { - uint32_t stripes; - int stripes_shift; - sector_t stripes_mask; - - /* The size of this target / num. stripes */ - sector_t stripe_width; - - /* stripe chunk size */ - uint32_t chunk_shift; - sector_t chunk_mask; - - /* Needed for handling events */ - struct dm_target *ti; - - /* Work struct used for triggering events*/ - struct work_struct trigger_event; - - struct stripe stripe[0]; -}; - -/* - * An event is triggered whenever a drive - * drops out of a stripe volume. 
- */ -static void trigger_event(struct work_struct *work) -{ - struct stripe_c *sc = container_of(work, struct stripe_c, - trigger_event); - dm_table_event(sc->ti->table); -} - -static inline struct stripe_c *alloc_context(unsigned int stripes) -{ - size_t len; - - if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), - stripes)) - return NULL; - - len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); - - return kmalloc(len, GFP_KERNEL); -} - -/* - * Parse a single <dev> <sector> pair - */ -static int get_stripe(struct dm_target *ti, struct stripe_c *sc, - unsigned int stripe, char **argv) -{ - unsigned long long start; - char dummy; - - if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) - return -EINVAL; - - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), - &sc->stripe[stripe].dev)) - return -ENXIO; - - sc->stripe[stripe].physical_start = start; - - return 0; -} - -/* - * Construct a striped mapping. - * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+ - */ -static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - struct stripe_c *sc; - sector_t width; - uint32_t stripes; - uint32_t chunk_size; - char *end; - int r; - unsigned int i; - - if (argc < 2) { - ti->error = "Not enough arguments"; - return -EINVAL; - } - - stripes = simple_strtoul(argv[0], &end, 10); - if (!stripes || *end) { - ti->error = "Invalid stripe count"; - return -EINVAL; - } - - chunk_size = simple_strtoul(argv[1], &end, 10); - if (*end) { - ti->error = "Invalid chunk_size"; - return -EINVAL; - } - - /* - * chunk_size is a power of two - */ - if (!is_power_of_2(chunk_size) || - (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { - ti->error = "Invalid chunk size"; - return -EINVAL; - } - - if (ti->len & (chunk_size - 1)) { - ti->error = "Target length not divisible by " - "chunk size"; - return -EINVAL; - } - - width = ti->len; - if (sector_div(width, stripes)) { - ti->error = "Target length not divisible by " - "number of stripes"; - return -EINVAL; - } - - /* - * Do we have enough arguments for that many stripes ? - */ - if (argc != (2 + 2 * stripes)) { - ti->error = "Not enough destinations " - "specified"; - return -EINVAL; - } - - sc = alloc_context(stripes); - if (!sc) { - ti->error = "Memory allocation for striped context " - "failed"; - return -ENOMEM; - } - - INIT_WORK(&sc->trigger_event, trigger_event); - - /* Set pointer to dm target; used in trigger_event */ - sc->ti = ti; - sc->stripes = stripes; - sc->stripe_width = width; - - if (stripes & (stripes - 1)) - sc->stripes_shift = -1; - else { - sc->stripes_shift = ffs(stripes) - 1; - sc->stripes_mask = ((sector_t) stripes) - 1; - } - - ti->split_io = chunk_size; - ti->num_flush_requests = stripes; - ti->num_discard_requests = stripes; - - sc->chunk_shift = ffs(chunk_size) - 1; - sc->chunk_mask = ((sector_t) chunk_size) - 1; - - /* - * Get the stripe destinations. 
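 *
 * (Worked example for the shift/mask values computed above, with
 *  made-up numbers: 4 stripes and a 128-sector chunk give
 *  chunk_shift = 7, chunk_mask = 127, stripes_shift = 2, so in
 *  stripe_map_sector() a target offset of 1000 sectors maps to
 *      chunk = 1000 >> 7 = 7, stripe = 7 & 3 = 3,
 *      result = ((7 >> 2) << 7) | (1000 & 127) = 232
 *  i.e. sector 232 within stripe 3, before physical_start is added.)
 *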
- */ - for (i = 0; i < stripes; i++) { - argv += 2; - - r = get_stripe(ti, sc, i, argv); - if (r < 0) { - ti->error = "Couldn't parse stripe destination"; - while (i--) - dm_put_device(ti, sc->stripe[i].dev); - kfree(sc); - return r; - } - atomic_set(&(sc->stripe[i].error_count), 0); - } - - ti->private = sc; - - return 0; -} - -static void stripe_dtr(struct dm_target *ti) -{ - unsigned int i; - struct stripe_c *sc = (struct stripe_c *) ti->private; - - for (i = 0; i < sc->stripes; i++) - dm_put_device(ti, sc->stripe[i].dev); - - flush_work_sync(&sc->trigger_event); - kfree(sc); -} - -static void stripe_map_sector(struct stripe_c *sc, sector_t sector, - uint32_t *stripe, sector_t *result) -{ - sector_t offset = dm_target_offset(sc->ti, sector); - sector_t chunk = offset >> sc->chunk_shift; - - if (sc->stripes_shift < 0) - *stripe = sector_div(chunk, sc->stripes); - else { - *stripe = chunk & sc->stripes_mask; - chunk >>= sc->stripes_shift; - } - - *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); -} - -static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, - uint32_t target_stripe, sector_t *result) -{ - uint32_t stripe; - - stripe_map_sector(sc, sector, &stripe, result); - if (stripe == target_stripe) - return; - *result &= ~sc->chunk_mask; /* round down */ - if (target_stripe < stripe) - *result += sc->chunk_mask + 1; /* next chunk */ -} - -static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, - uint32_t target_stripe) -{ - sector_t begin, end; - - stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); - stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), - target_stripe, &end); - if (begin < end) { - bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; - bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; - bio->bi_size = to_bytes(end - begin); - return DM_MAPIO_REMAPPED; - } else { - /* The range doesn't map to the target stripe */ - bio_endio(bio, 0); - return DM_MAPIO_SUBMITTED; - } -} - -static int stripe_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - struct stripe_c *sc = ti->private; - uint32_t stripe; - unsigned target_request_nr; - - if (bio->bi_rw & REQ_FLUSH) { - target_request_nr = map_context->target_request_nr; - BUG_ON(target_request_nr >= sc->stripes); - bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; - return DM_MAPIO_REMAPPED; - } - if (unlikely(bio->bi_rw & REQ_DISCARD)) { - target_request_nr = map_context->target_request_nr; - BUG_ON(target_request_nr >= sc->stripes); - return stripe_map_discard(sc, bio, target_request_nr); - } - - stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); - - bio->bi_sector += sc->stripe[stripe].physical_start; - bio->bi_bdev = sc->stripe[stripe].dev->bdev; - - return DM_MAPIO_REMAPPED; -} - -/* - * Stripe status: - * - * INFO - * #stripes [stripe_name <stripe_name>] [group word count] - * [error count 'A|D' <error count 'A|D'>] - * - * TABLE - * #stripes [stripe chunk size] - * [stripe_name physical_start <stripe_name physical_start>] - * - */ - -static int stripe_status(struct dm_target *ti, - status_type_t type, char *result, unsigned int maxlen) -{ - struct stripe_c *sc = (struct stripe_c *) ti->private; - char buffer[sc->stripes + 1]; - unsigned int sz = 0; - unsigned int i; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%d ", sc->stripes); - for (i = 0; i < sc->stripes; i++) { - DMEMIT("%s ", sc->stripe[i].dev->name); - buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ? 
- 'D' : 'A'; - } - buffer[i] = '\0'; - DMEMIT("1 %s", buffer); - break; - - case STATUSTYPE_TABLE: - DMEMIT("%d %llu", sc->stripes, - (unsigned long long)sc->chunk_mask + 1); - for (i = 0; i < sc->stripes; i++) - DMEMIT(" %s %llu", sc->stripe[i].dev->name, - (unsigned long long)sc->stripe[i].physical_start); - break; - } - return 0; -} - -static int stripe_end_io(struct dm_target *ti, struct bio *bio, - int error, union map_info *map_context) -{ - unsigned i; - char major_minor[16]; - struct stripe_c *sc = ti->private; - - if (!error) - return 0; /* I/O complete */ - - if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) - return error; - - if (error == -EOPNOTSUPP) - return error; - - memset(major_minor, 0, sizeof(major_minor)); - sprintf(major_minor, "%d:%d", - MAJOR(disk_devt(bio->bi_bdev->bd_disk)), - MINOR(disk_devt(bio->bi_bdev->bd_disk))); - - /* - * Test to see which stripe drive triggered the event - * and increment error count for all stripes on that device. - * If the error count for a given device exceeds the threshold - * value we will no longer trigger any further events. - */ - for (i = 0; i < sc->stripes; i++) - if (!strcmp(sc->stripe[i].dev->name, major_minor)) { - atomic_inc(&(sc->stripe[i].error_count)); - if (atomic_read(&(sc->stripe[i].error_count)) < - DM_IO_ERROR_THRESHOLD) - schedule_work(&sc->trigger_event); - } - - return error; -} - -static int stripe_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct stripe_c *sc = ti->private; - int ret = 0; - unsigned i = 0; - - do { - ret = fn(ti, sc->stripe[i].dev, - sc->stripe[i].physical_start, - sc->stripe_width, data); - } while (!ret && ++i < sc->stripes); - - return ret; -} - -static void stripe_io_hints(struct dm_target *ti, - struct queue_limits *limits) -{ - struct stripe_c *sc = ti->private; - unsigned chunk_size = (sc->chunk_mask + 1) << 9; - - blk_limits_io_min(limits, chunk_size); - blk_limits_io_opt(limits, chunk_size * sc->stripes); -} - -static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct stripe_c *sc = ti->private; - sector_t bvm_sector = bvm->bi_sector; - uint32_t stripe; - struct request_queue *q; - - stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector); - - q = bdev_get_queue(sc->stripe[stripe].dev->bdev); - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = sc->stripe[stripe].dev->bdev; - bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector; - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - -static struct target_type stripe_target = { - .name = "striped", - .version = {1, 4, 0}, - .module = THIS_MODULE, - .ctr = stripe_ctr, - .dtr = stripe_dtr, - .map = stripe_map, - .end_io = stripe_end_io, - .status = stripe_status, - .iterate_devices = stripe_iterate_devices, - .io_hints = stripe_io_hints, - .merge = stripe_merge, -}; - -int __init dm_stripe_init(void) -{ - int r; - - r = dm_register_target(&stripe_target); - if (r < 0) { - DMWARN("target registration failed"); - return r; - } - - return r; -} - -void dm_stripe_exit(void) -{ - dm_unregister_target(&stripe_target); -} diff --git a/ANDROID_3.4.5/drivers/md/dm-sysfs.c b/ANDROID_3.4.5/drivers/md/dm-sysfs.c deleted file mode 100644 index 84d2b91e..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-sysfs.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. 
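 *
 * (Orientation note: the three read-only attributes defined in this
 *  file are registered in a kobject named "dm" under the disk's sysfs
 *  directory, so they appear as e.g. /sys/block/dm-0/dm/name,
 *  /sys/block/dm-0/dm/uuid and /sys/block/dm-0/dm/suspended.)
 *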
- */ - -#include <linux/sysfs.h> -#include <linux/dm-ioctl.h> -#include "dm.h" - -struct dm_sysfs_attr { - struct attribute attr; - ssize_t (*show)(struct mapped_device *, char *); - ssize_t (*store)(struct mapped_device *, char *); -}; - -#define DM_ATTR_RO(_name) \ -struct dm_sysfs_attr dm_attr_##_name = \ - __ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL) - -static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct dm_sysfs_attr *dm_attr; - struct mapped_device *md; - ssize_t ret; - - dm_attr = container_of(attr, struct dm_sysfs_attr, attr); - if (!dm_attr->show) - return -EIO; - - md = dm_get_from_kobject(kobj); - if (!md) - return -EINVAL; - - ret = dm_attr->show(md, page); - dm_put(md); - - return ret; -} - -static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf) -{ - if (dm_copy_name_and_uuid(md, buf, NULL)) - return -EIO; - - strcat(buf, "\n"); - return strlen(buf); -} - -static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) -{ - if (dm_copy_name_and_uuid(md, NULL, buf)) - return -EIO; - - strcat(buf, "\n"); - return strlen(buf); -} - -static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) -{ - sprintf(buf, "%d\n", dm_suspended_md(md)); - - return strlen(buf); -} - -static DM_ATTR_RO(name); -static DM_ATTR_RO(uuid); -static DM_ATTR_RO(suspended); - -static struct attribute *dm_attrs[] = { - &dm_attr_name.attr, - &dm_attr_uuid.attr, - &dm_attr_suspended.attr, - NULL, -}; - -static const struct sysfs_ops dm_sysfs_ops = { - .show = dm_attr_show, -}; - -/* - * dm kobject is embedded in mapped_device structure - * no need to define release function here - */ -static struct kobj_type dm_ktype = { - .sysfs_ops = &dm_sysfs_ops, - .default_attrs = dm_attrs, -}; - -/* - * Initialize kobj - * because nobody using md yet, no need to call explicit dm_get/put - */ -int dm_sysfs_init(struct mapped_device *md) -{ - return kobject_init_and_add(dm_kobject(md), &dm_ktype, - &disk_to_dev(dm_disk(md))->kobj, - "%s", "dm"); -} - -/* - * Remove kobj, called after all references removed - */ -void dm_sysfs_exit(struct mapped_device *md) -{ - kobject_put(dm_kobject(md)); -} diff --git a/ANDROID_3.4.5/drivers/md/dm-table.c b/ANDROID_3.4.5/drivers/md/dm-table.c deleted file mode 100644 index 2e227fbf..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-table.c +++ /dev/null @@ -1,1577 +0,0 @@ -/* - * Copyright (C) 2001 Sistina Software (UK) Limited. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ - -#include "dm.h" - -#include <linux/module.h> -#include <linux/vmalloc.h> -#include <linux/blkdev.h> -#include <linux/namei.h> -#include <linux/ctype.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/interrupt.h> -#include <linux/mutex.h> -#include <linux/delay.h> -#include <linux/atomic.h> - -#define DM_MSG_PREFIX "table" - -#define MAX_DEPTH 16 -#define NODE_SIZE L1_CACHE_BYTES -#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) -#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) - -/* - * The table has always exactly one reference from either mapped_device->map - * or hash_cell->new_map. This reference is not counted in table->holders. - * A pair of dm_create_table/dm_destroy_table functions is used for table - * creation/destruction. - * - * Temporary references from the other code increase table->holders. A pair - * of dm_table_get/dm_table_put functions is used to manipulate it. 
- * - * When the table is about to be destroyed, we wait for table->holders to - * drop to zero. - */ - -struct dm_table { - struct mapped_device *md; - atomic_t holders; - unsigned type; - - /* btree table */ - unsigned int depth; - unsigned int counts[MAX_DEPTH]; /* in nodes */ - sector_t *index[MAX_DEPTH]; - - unsigned int num_targets; - unsigned int num_allocated; - sector_t *highs; - struct dm_target *targets; - - struct target_type *immutable_target_type; - unsigned integrity_supported:1; - unsigned singleton:1; - - /* - * Indicates the rw permissions for the new logical - * device. This should be a combination of FMODE_READ - * and FMODE_WRITE. - */ - fmode_t mode; - - /* a list of devices used by this table */ - struct list_head devices; - - /* events get handed up using this callback */ - void (*event_fn)(void *); - void *event_context; - - struct dm_md_mempools *mempools; - - struct list_head target_callbacks; -}; - -/* - * Similar to ceiling(log_size(n)) - */ -static unsigned int int_log(unsigned int n, unsigned int base) -{ - int result = 0; - - while (n > 1) { - n = dm_div_up(n, base); - result++; - } - - return result; -} - -/* - * Calculate the index of the child node of the n'th node k'th key. - */ -static inline unsigned int get_child(unsigned int n, unsigned int k) -{ - return (n * CHILDREN_PER_NODE) + k; -} - -/* - * Return the n'th node of level l from table t. - */ -static inline sector_t *get_node(struct dm_table *t, - unsigned int l, unsigned int n) -{ - return t->index[l] + (n * KEYS_PER_NODE); -} - -/* - * Return the highest key that you could lookup from the n'th - * node on level l of the btree. - */ -static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) -{ - for (; l < t->depth - 1; l++) - n = get_child(n, CHILDREN_PER_NODE - 1); - - if (n >= t->counts[l]) - return (sector_t) - 1; - - return get_node(t, l, n)[KEYS_PER_NODE - 1]; -} - -/* - * Fills in a level of the btree based on the highs of the level - * below it. - */ -static int setup_btree_index(unsigned int l, struct dm_table *t) -{ - unsigned int n, k; - sector_t *node; - - for (n = 0U; n < t->counts[l]; n++) { - node = get_node(t, l, n); - - for (k = 0U; k < KEYS_PER_NODE; k++) - node[k] = high(t, l + 1, get_child(n, k)); - } - - return 0; -} - -void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) -{ - unsigned long size; - void *addr; - - /* - * Check that we're not going to overflow. - */ - if (nmemb > (ULONG_MAX / elem_size)) - return NULL; - - size = nmemb * elem_size; - addr = vzalloc(size); - - return addr; -} -EXPORT_SYMBOL(dm_vcalloc); - -/* - * highs, and targets are managed as dynamic arrays during a - * table load. - */ -static int alloc_targets(struct dm_table *t, unsigned int num) -{ - sector_t *n_highs; - struct dm_target *n_targets; - int n = t->num_targets; - - /* - * Allocate both the target array and offset array at once. - * Append an empty entry to catch sectors beyond the end of - * the device. 
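dm_vcalloc() above guards the nmemb * elem_size multiplication against overflow before allocating. A userspace sketch of the same check, using malloc/memset as a stand-in for vzalloc (the extra elem_size test only avoids dividing by zero):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

/* Zeroed memory for nmemb elements of elem_size bytes, or NULL if the
 * multiplication would overflow -- the same ULONG_MAX check dm_vcalloc()
 * makes before calling vzalloc(). */
static void *vcalloc_like(unsigned long nmemb, unsigned long elem_size)
{
	if (elem_size && nmemb > ULONG_MAX / elem_size)
		return NULL;

	void *p = malloc(nmemb * elem_size);
	if (p)
		memset(p, 0, nmemb * elem_size);
	return p;
}

int main(void)
{
	unsigned long *p = vcalloc_like(16, sizeof(unsigned long));

	printf("%s\n", p ? "allocated" : "overflow or allocation failure");
	free(p);
	return 0;
}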
- */ - n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) + - sizeof(sector_t)); - if (!n_highs) - return -ENOMEM; - - n_targets = (struct dm_target *) (n_highs + num); - - if (n) { - memcpy(n_highs, t->highs, sizeof(*n_highs) * n); - memcpy(n_targets, t->targets, sizeof(*n_targets) * n); - } - - memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); - vfree(t->highs); - - t->num_allocated = num; - t->highs = n_highs; - t->targets = n_targets; - - return 0; -} - -int dm_table_create(struct dm_table **result, fmode_t mode, - unsigned num_targets, struct mapped_device *md) -{ - struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); - - if (!t) - return -ENOMEM; - - INIT_LIST_HEAD(&t->devices); - INIT_LIST_HEAD(&t->target_callbacks); - atomic_set(&t->holders, 0); - - if (!num_targets) - num_targets = KEYS_PER_NODE; - - num_targets = dm_round_up(num_targets, KEYS_PER_NODE); - - if (alloc_targets(t, num_targets)) { - kfree(t); - t = NULL; - return -ENOMEM; - } - - t->mode = mode; - t->md = md; - *result = t; - return 0; -} - -static void free_devices(struct list_head *devices) -{ - struct list_head *tmp, *next; - - list_for_each_safe(tmp, next, devices) { - struct dm_dev_internal *dd = - list_entry(tmp, struct dm_dev_internal, list); - DMWARN("dm_table_destroy: dm_put_device call missing for %s", - dd->dm_dev.name); - kfree(dd); - } -} - -void dm_table_destroy(struct dm_table *t) -{ - unsigned int i; - - if (!t) - return; - - while (atomic_read(&t->holders)) - msleep(1); - smp_mb(); - - /* free the indexes */ - if (t->depth >= 2) - vfree(t->index[t->depth - 2]); - - /* free the targets */ - for (i = 0; i < t->num_targets; i++) { - struct dm_target *tgt = t->targets + i; - - if (tgt->type->dtr) - tgt->type->dtr(tgt); - - dm_put_target_type(tgt->type); - } - - vfree(t->highs); - - /* free the device list */ - free_devices(&t->devices); - - dm_free_md_mempools(t->mempools); - - kfree(t); -} - -void dm_table_get(struct dm_table *t) -{ - atomic_inc(&t->holders); -} -EXPORT_SYMBOL(dm_table_get); - -void dm_table_put(struct dm_table *t) -{ - if (!t) - return; - - smp_mb__before_atomic_dec(); - atomic_dec(&t->holders); -} -EXPORT_SYMBOL(dm_table_put); - -/* - * Checks to see if we need to extend highs or targets. - */ -static inline int check_space(struct dm_table *t) -{ - if (t->num_targets >= t->num_allocated) - return alloc_targets(t, t->num_allocated * 2); - - return 0; -} - -/* - * See if we've already got a device in the list. - */ -static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) -{ - struct dm_dev_internal *dd; - - list_for_each_entry (dd, l, list) - if (dd->dm_dev.bdev->bd_dev == dev) - return dd; - - return NULL; -} - -/* - * Open a device so we can use it as a map destination. - */ -static int open_dev(struct dm_dev_internal *d, dev_t dev, - struct mapped_device *md) -{ - static char *_claim_ptr = "I belong to device-mapper"; - struct block_device *bdev; - - int r; - - BUG_ON(d->dm_dev.bdev); - - bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - - r = bd_link_disk_holder(bdev, dm_disk(md)); - if (r) { - blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); - return r; - } - - d->dm_dev.bdev = bdev; - return 0; -} - -/* - * Close a device that we've been using. 
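The holders counter manipulated by dm_table_get()/dm_table_put() above, and the msleep() loop in dm_table_destroy() that waits for it to reach zero, can be sketched with C11 atomics. This is a simplification for illustration only: the table struct is invented, the wait is a plain spin, and the kernel's memory barriers are omitted.

#include <stdatomic.h>
#include <stdio.h>

struct table {
	atomic_int holders;   /* temporary references, as in dm_table->holders */
};

static void table_get(struct table *t) { atomic_fetch_add(&t->holders, 1); }
static void table_put(struct table *t) { atomic_fetch_sub(&t->holders, 1); }

static void table_destroy(struct table *t)
{
	/* Wait until every temporary holder has dropped its reference,
	 * mirroring the msleep(1) loop in dm_table_destroy(). */
	while (atomic_load(&t->holders) > 0)
		;
	printf("destroying table\n");
}

int main(void)
{
	struct table t;

	atomic_init(&t.holders, 0);
	table_get(&t);
	table_put(&t);
	table_destroy(&t);   /* returns immediately: holders is back to zero */
	return 0;
}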
- */ -static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) -{ - if (!d->dm_dev.bdev) - return; - - bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); - blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); - d->dm_dev.bdev = NULL; -} - -/* - * If possible, this checks an area of a destination device is invalid. - */ -static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q; - struct queue_limits *limits = data; - struct block_device *bdev = dev->bdev; - sector_t dev_size = - i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; - unsigned short logical_block_size_sectors = - limits->logical_block_size >> SECTOR_SHIFT; - char b[BDEVNAME_SIZE]; - - /* - * Some devices exist without request functions, - * such as loop devices not yet bound to backing files. - * Forbid the use of such devices. - */ - q = bdev_get_queue(bdev); - if (!q || !q->make_request_fn) { - DMWARN("%s: %s is not yet initialised: " - "start=%llu, len=%llu, dev_size=%llu", - dm_device_name(ti->table->md), bdevname(bdev, b), - (unsigned long long)start, - (unsigned long long)len, - (unsigned long long)dev_size); - return 1; - } - - if (!dev_size) - return 0; - - if ((start >= dev_size) || (start + len > dev_size)) { - DMWARN("%s: %s too small for target: " - "start=%llu, len=%llu, dev_size=%llu", - dm_device_name(ti->table->md), bdevname(bdev, b), - (unsigned long long)start, - (unsigned long long)len, - (unsigned long long)dev_size); - return 1; - } - - if (logical_block_size_sectors <= 1) - return 0; - - if (start & (logical_block_size_sectors - 1)) { - DMWARN("%s: start=%llu not aligned to h/w " - "logical block size %u of %s", - dm_device_name(ti->table->md), - (unsigned long long)start, - limits->logical_block_size, bdevname(bdev, b)); - return 1; - } - - if (len & (logical_block_size_sectors - 1)) { - DMWARN("%s: len=%llu not aligned to h/w " - "logical block size %u of %s", - dm_device_name(ti->table->md), - (unsigned long long)len, - limits->logical_block_size, bdevname(bdev, b)); - return 1; - } - - return 0; -} - -/* - * This upgrades the mode on an already open dm_dev, being - * careful to leave things as they were if we fail to reopen the - * device and not to touch the existing bdev field in case - * it is accessed concurrently inside dm_table_any_congested(). - */ -static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, - struct mapped_device *md) -{ - int r; - struct dm_dev_internal dd_new, dd_old; - - dd_new = dd_old = *dd; - - dd_new.dm_dev.mode |= new_mode; - dd_new.dm_dev.bdev = NULL; - - r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md); - if (r) - return r; - - dd->dm_dev.mode |= new_mode; - close_dev(&dd_old, md); - - return 0; -} - -/* - * Add a device to the list, or just increment the usage count if - * it's already present. 
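The alignment tests in device_area_is_invalid() above reduce to masking the start and length against the logical block size expressed in 512-byte sectors. A small sketch of that arithmetic, assuming a 4096-byte logical block size and power-of-two block sizes (which the mask trick requires):

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9

/* Returns 1 if either the start or the length (both in 512-byte sectors)
 * is not a multiple of the device's logical block size. */
static int misaligned(uint64_t start, uint64_t len, unsigned logical_block_size)
{
	unsigned lbs_sectors = logical_block_size >> SECTOR_SHIFT;

	if (lbs_sectors <= 1)
		return 0;
	return (start & (lbs_sectors - 1)) || (len & (lbs_sectors - 1));
}

int main(void)
{
	/* 4096-byte blocks: sector offsets must be multiples of 8. */
	printf("%d\n", misaligned(8, 16, 4096));    /* 0: aligned */
	printf("%d\n", misaligned(10, 16, 4096));   /* 1: start misaligned */
	return 0;
}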
- */ -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, - struct dm_dev **result) -{ - int r; - dev_t uninitialized_var(dev); - struct dm_dev_internal *dd; - unsigned int major, minor; - struct dm_table *t = ti->table; - char dummy; - - BUG_ON(!t); - - if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { - /* Extract the major/minor numbers */ - dev = MKDEV(major, minor); - if (MAJOR(dev) != major || MINOR(dev) != minor) - return -EOVERFLOW; - } else { - /* convert the path to a device */ - struct block_device *bdev = lookup_bdev(path); - - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - dev = bdev->bd_dev; - bdput(bdev); - } - - dd = find_device(&t->devices, dev); - if (!dd) { - dd = kmalloc(sizeof(*dd), GFP_KERNEL); - if (!dd) - return -ENOMEM; - - dd->dm_dev.mode = mode; - dd->dm_dev.bdev = NULL; - - if ((r = open_dev(dd, dev, t->md))) { - kfree(dd); - return r; - } - - format_dev_t(dd->dm_dev.name, dev); - - atomic_set(&dd->count, 0); - list_add(&dd->list, &t->devices); - - } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) { - r = upgrade_mode(dd, mode, t->md); - if (r) - return r; - } - atomic_inc(&dd->count); - - *result = &dd->dm_dev; - return 0; -} -EXPORT_SYMBOL(dm_get_device); - -int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct queue_limits *limits = data; - struct block_device *bdev = dev->bdev; - struct request_queue *q = bdev_get_queue(bdev); - char b[BDEVNAME_SIZE]; - - if (unlikely(!q)) { - DMWARN("%s: Cannot set limits for nonexistent device %s", - dm_device_name(ti->table->md), bdevname(bdev, b)); - return 0; - } - - if (bdev_stack_limits(limits, bdev, start) < 0) - DMWARN("%s: adding target device %s caused an alignment inconsistency: " - "physical_block_size=%u, logical_block_size=%u, " - "alignment_offset=%u, start=%llu", - dm_device_name(ti->table->md), bdevname(bdev, b), - q->limits.physical_block_size, - q->limits.logical_block_size, - q->limits.alignment_offset, - (unsigned long long) start << SECTOR_SHIFT); - - /* - * Check if merge fn is supported. - * If not we'll force DM to use PAGE_SIZE or - * smaller I/O, just to be safe. - */ - if (dm_queue_merge_is_compulsory(q) && !ti->type->merge) - blk_limits_max_hw_sectors(limits, - (unsigned int) (PAGE_SIZE >> 9)); - return 0; -} -EXPORT_SYMBOL_GPL(dm_set_device_limits); - -/* - * Decrement a device's use count and remove it if necessary. - */ -void dm_put_device(struct dm_target *ti, struct dm_dev *d) -{ - struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal, - dm_dev); - - if (atomic_dec_and_test(&dd->count)) { - close_dev(dd, ti->table->md); - list_del(&dd->list); - kfree(dd); - } -} -EXPORT_SYMBOL(dm_put_device); - -/* - * Checks to see if the target joins onto the end of the table. - */ -static int adjoin(struct dm_table *table, struct dm_target *ti) -{ - struct dm_target *prev; - - if (!table->num_targets) - return !ti->begin; - - prev = &table->targets[table->num_targets - 1]; - return (ti->begin == (prev->begin + prev->len)); -} - -/* - * Used to dynamically allocate the arg array. - */ -static char **realloc_argv(unsigned *array_size, char **old_argv) -{ - char **argv; - unsigned new_size; - - new_size = *array_size ? 
*array_size * 2 : 64; - argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); - if (argv) { - memcpy(argv, old_argv, *array_size * sizeof(*argv)); - *array_size = new_size; - } - - kfree(old_argv); - return argv; -} - -/* - * Destructively splits up the argument list to pass to ctr. - */ -int dm_split_args(int *argc, char ***argvp, char *input) -{ - char *start, *end = input, *out, **argv = NULL; - unsigned array_size = 0; - - *argc = 0; - - if (!input) { - *argvp = NULL; - return 0; - } - - argv = realloc_argv(&array_size, argv); - if (!argv) - return -ENOMEM; - - while (1) { - /* Skip whitespace */ - start = skip_spaces(end); - - if (!*start) - break; /* success, we hit the end */ - - /* 'out' is used to remove any back-quotes */ - end = out = start; - while (*end) { - /* Everything apart from '\0' can be quoted */ - if (*end == '\\' && *(end + 1)) { - *out++ = *(end + 1); - end += 2; - continue; - } - - if (isspace(*end)) - break; /* end of token */ - - *out++ = *end++; - } - - /* have we already filled the array ? */ - if ((*argc + 1) > array_size) { - argv = realloc_argv(&array_size, argv); - if (!argv) - return -ENOMEM; - } - - /* we know this is whitespace */ - if (*end) - end++; - - /* terminate the string and put it in the array */ - *out = '\0'; - argv[*argc] = start; - (*argc)++; - } - - *argvp = argv; - return 0; -} - -/* - * Impose necessary and sufficient conditions on a devices's table such - * that any incoming bio which respects its logical_block_size can be - * processed successfully. If it falls across the boundary between - * two or more targets, the size of each piece it gets split into must - * be compatible with the logical_block_size of the target processing it. - */ -static int validate_hardware_logical_block_alignment(struct dm_table *table, - struct queue_limits *limits) -{ - /* - * This function uses arithmetic modulo the logical_block_size - * (in units of 512-byte sectors). - */ - unsigned short device_logical_block_size_sects = - limits->logical_block_size >> SECTOR_SHIFT; - - /* - * Offset of the start of the next table entry, mod logical_block_size. - */ - unsigned short next_target_start = 0; - - /* - * Given an aligned bio that extends beyond the end of a - * target, how many sectors must the next target handle? - */ - unsigned short remaining = 0; - - struct dm_target *uninitialized_var(ti); - struct queue_limits ti_limits; - unsigned i = 0; - - /* - * Check each entry in the table in turn. - */ - while (i < dm_table_get_num_targets(table)) { - ti = dm_table_get_target(table, i++); - - blk_set_stacking_limits(&ti_limits); - - /* combine all target devices' limits */ - if (ti->type->iterate_devices) - ti->type->iterate_devices(ti, dm_set_device_limits, - &ti_limits); - - /* - * If the remaining sectors fall entirely within this - * table entry are they compatible with its logical_block_size? - */ - if (remaining < ti->len && - remaining & ((ti_limits.logical_block_size >> - SECTOR_SHIFT) - 1)) - break; /* Error */ - - next_target_start = - (unsigned short) ((next_target_start + ti->len) & - (device_logical_block_size_sects - 1)); - remaining = next_target_start ? 
- device_logical_block_size_sects - next_target_start : 0; - } - - if (remaining) { - DMWARN("%s: table line %u (start sect %llu len %llu) " - "not aligned to h/w logical block size %u", - dm_device_name(table->md), i, - (unsigned long long) ti->begin, - (unsigned long long) ti->len, - limits->logical_block_size); - return -EINVAL; - } - - return 0; -} - -int dm_table_add_target(struct dm_table *t, const char *type, - sector_t start, sector_t len, char *params) -{ - int r = -EINVAL, argc; - char **argv; - struct dm_target *tgt; - - if (t->singleton) { - DMERR("%s: target type %s must appear alone in table", - dm_device_name(t->md), t->targets->type->name); - return -EINVAL; - } - - if ((r = check_space(t))) - return r; - - tgt = t->targets + t->num_targets; - memset(tgt, 0, sizeof(*tgt)); - - if (!len) { - DMERR("%s: zero-length target", dm_device_name(t->md)); - return -EINVAL; - } - - tgt->type = dm_get_target_type(type); - if (!tgt->type) { - DMERR("%s: %s: unknown target type", dm_device_name(t->md), - type); - return -EINVAL; - } - - if (dm_target_needs_singleton(tgt->type)) { - if (t->num_targets) { - DMERR("%s: target type %s must appear alone in table", - dm_device_name(t->md), type); - return -EINVAL; - } - t->singleton = 1; - } - - if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) { - DMERR("%s: target type %s may not be included in read-only tables", - dm_device_name(t->md), type); - return -EINVAL; - } - - if (t->immutable_target_type) { - if (t->immutable_target_type != tgt->type) { - DMERR("%s: immutable target type %s cannot be mixed with other target types", - dm_device_name(t->md), t->immutable_target_type->name); - return -EINVAL; - } - } else if (dm_target_is_immutable(tgt->type)) { - if (t->num_targets) { - DMERR("%s: immutable target type %s cannot be mixed with other target types", - dm_device_name(t->md), tgt->type->name); - return -EINVAL; - } - t->immutable_target_type = tgt->type; - } - - tgt->table = t; - tgt->begin = start; - tgt->len = len; - tgt->error = "Unknown error"; - - /* - * Does this target adjoin the previous one ? - */ - if (!adjoin(t, tgt)) { - tgt->error = "Gap in table"; - r = -EINVAL; - goto bad; - } - - r = dm_split_args(&argc, &argv, params); - if (r) { - tgt->error = "couldn't split parameters (insufficient memory)"; - goto bad; - } - - r = tgt->type->ctr(tgt, argc, argv); - kfree(argv); - if (r) - goto bad; - - t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - - if (!tgt->num_discard_requests && tgt->discards_supported) - DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", - dm_device_name(t->md), type); - - return 0; - - bad: - DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error); - dm_put_target_type(tgt->type); - return r; -} - -/* - * Target argument parsing helpers. 
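dm_table_add_target() above rejects any target whose start does not butt up against the end of the previous one ("Gap in table"). The contiguity rule is just this check over (start, len) pairs; the table entries below are made up for illustration:

#include <stdio.h>
#include <stdint.h>

struct entry { uint64_t begin, len; };   /* sectors, like dm_target begin/len */

/* Returns 1 if the first entry starts at sector 0 and every later entry
 * starts exactly where the previous one ended -- the adjoin() rule. */
static int contiguous(const struct entry *e, unsigned n)
{
	uint64_t expected = 0;

	for (unsigned i = 0; i < n; i++) {
		if (e[i].begin != expected)
			return 0;
		expected = e[i].begin + e[i].len;
	}
	return 1;
}

int main(void)
{
	struct entry ok[]  = { {0, 100}, {100, 50} };
	struct entry gap[] = { {0, 100}, {120, 50} };

	printf("%d %d\n", contiguous(ok, 2), contiguous(gap, 2));   /* 1 0 */
	return 0;
}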
- */ -static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, - unsigned *value, char **error, unsigned grouped) -{ - const char *arg_str = dm_shift_arg(arg_set); - char dummy; - - if (!arg_str || - (sscanf(arg_str, "%u%c", value, &dummy) != 1) || - (*value < arg->min) || - (*value > arg->max) || - (grouped && arg_set->argc < *value)) { - *error = arg->error; - return -EINVAL; - } - - return 0; -} - -int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, - unsigned *value, char **error) -{ - return validate_next_arg(arg, arg_set, value, error, 0); -} -EXPORT_SYMBOL(dm_read_arg); - -int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, - unsigned *value, char **error) -{ - return validate_next_arg(arg, arg_set, value, error, 1); -} -EXPORT_SYMBOL(dm_read_arg_group); - -const char *dm_shift_arg(struct dm_arg_set *as) -{ - char *r; - - if (as->argc) { - as->argc--; - r = *as->argv; - as->argv++; - return r; - } - - return NULL; -} -EXPORT_SYMBOL(dm_shift_arg); - -void dm_consume_args(struct dm_arg_set *as, unsigned num_args) -{ - BUG_ON(as->argc < num_args); - as->argc -= num_args; - as->argv += num_args; -} -EXPORT_SYMBOL(dm_consume_args); - -static int dm_table_set_type(struct dm_table *t) -{ - unsigned i; - unsigned bio_based = 0, request_based = 0; - struct dm_target *tgt; - struct dm_dev_internal *dd; - struct list_head *devices; - - for (i = 0; i < t->num_targets; i++) { - tgt = t->targets + i; - if (dm_target_request_based(tgt)) - request_based = 1; - else - bio_based = 1; - - if (bio_based && request_based) { - DMWARN("Inconsistent table: different target types" - " can't be mixed up"); - return -EINVAL; - } - } - - if (bio_based) { - /* We must use this table as bio-based */ - t->type = DM_TYPE_BIO_BASED; - return 0; - } - - BUG_ON(!request_based); /* No targets in this table */ - - /* Non-request-stackable devices can't be used for request-based dm */ - devices = dm_table_get_devices(t); - list_for_each_entry(dd, devices, list) { - if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { - DMWARN("table load rejected: including" - " non-request-stackable devices"); - return -EINVAL; - } - } - - /* - * Request-based dm supports only tables that have a single target now. - * To support multiple targets, request splitting support is needed, - * and that needs lots of changes in the block-layer. - * (e.g. request completion process for partial completion.) 
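validate_next_arg() above uses the sscanf("%u%c") idiom to reject trailing junk before range-checking the value. The same idiom in a standalone sketch (the bounds are arbitrary):

#include <stdio.h>

/* Parse a strictly numeric unsigned argument and range-check it, as
 * validate_next_arg() does: the extra %c must NOT match, otherwise the
 * string had trailing characters. */
static int read_arg(const char *arg, unsigned min, unsigned max, unsigned *value)
{
	char dummy;

	if (!arg ||
	    sscanf(arg, "%u%c", value, &dummy) != 1 ||
	    *value < min || *value > max)
		return -1;
	return 0;
}

int main(void)
{
	unsigned v;

	printf("%d\n", read_arg("42", 1, 100, &v));    /*  0: accepted */
	printf("%d\n", read_arg("42x", 1, 100, &v));   /* -1: trailing junk */
	printf("%d\n", read_arg("500", 1, 100, &v));   /* -1: out of range */
	return 0;
}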
- */ - if (t->num_targets > 1) { - DMWARN("Request-based dm doesn't support multiple targets yet"); - return -EINVAL; - } - - t->type = DM_TYPE_REQUEST_BASED; - - return 0; -} - -unsigned dm_table_get_type(struct dm_table *t) -{ - return t->type; -} - -struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) -{ - return t->immutable_target_type; -} - -bool dm_table_request_based(struct dm_table *t) -{ - return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; -} - -int dm_table_alloc_md_mempools(struct dm_table *t) -{ - unsigned type = dm_table_get_type(t); - - if (unlikely(type == DM_TYPE_NONE)) { - DMWARN("no table type is set, can't allocate mempools"); - return -EINVAL; - } - - t->mempools = dm_alloc_md_mempools(type, t->integrity_supported); - if (!t->mempools) - return -ENOMEM; - - return 0; -} - -void dm_table_free_md_mempools(struct dm_table *t) -{ - dm_free_md_mempools(t->mempools); - t->mempools = NULL; -} - -struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) -{ - return t->mempools; -} - -static int setup_indexes(struct dm_table *t) -{ - int i; - unsigned int total = 0; - sector_t *indexes; - - /* allocate the space for *all* the indexes */ - for (i = t->depth - 2; i >= 0; i--) { - t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); - total += t->counts[i]; - } - - indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE); - if (!indexes) - return -ENOMEM; - - /* set up internal nodes, bottom-up */ - for (i = t->depth - 2; i >= 0; i--) { - t->index[i] = indexes; - indexes += (KEYS_PER_NODE * t->counts[i]); - setup_btree_index(i, t); - } - - return 0; -} - -/* - * Builds the btree to index the map. - */ -static int dm_table_build_index(struct dm_table *t) -{ - int r = 0; - unsigned int leaf_nodes; - - /* how many indexes will the btree have ? */ - leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); - t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); - - /* leaf layer has already been set up */ - t->counts[t->depth - 1] = leaf_nodes; - t->index[t->depth - 1] = t->highs; - - if (t->depth >= 2) - r = setup_indexes(t); - - return r; -} - -/* - * Get a disk whose integrity profile reflects the table's profile. - * If %match_all is true, all devices' profiles must match. - * If %match_all is false, all devices must at least have an - * allocated integrity profile; but uninitialized is ok. - * Returns NULL if integrity support was inconsistent or unavailable. - */ -static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, - bool match_all) -{ - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd = NULL; - struct gendisk *prev_disk = NULL, *template_disk = NULL; - - list_for_each_entry(dd, devices, list) { - template_disk = dd->dm_dev.bdev->bd_disk; - if (!blk_get_integrity(template_disk)) - goto no_integrity; - if (!match_all && !blk_integrity_is_initialized(template_disk)) - continue; /* skip uninitialized profiles */ - else if (prev_disk && - blk_integrity_compare(prev_disk, template_disk) < 0) - goto no_integrity; - prev_disk = template_disk; - } - - return template_disk; - -no_integrity: - if (prev_disk) - DMWARN("%s: integrity not set: %s and %s profile mismatch", - dm_device_name(t->md), - prev_disk->disk_name, - template_disk->disk_name); - return NULL; -} - -/* - * Register the mapped device for blk_integrity support if - * the underlying devices have an integrity profile. 
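dm_table_build_index() above sizes the lookup btree from the target count: leaf nodes hold KEYS_PER_NODE keys and the depth is one plus a ceiling log in base CHILDREN_PER_NODE. A sketch of that sizing arithmetic, assuming a 64-byte cache line and an 8-byte sector_t (so 8 keys and 9 children per node; other machines differ):

#include <stdio.h>

/* Assumed geometry: NODE_SIZE = 64 (L1 cache line), sizeof(sector_t) = 8. */
#define KEYS_PER_NODE      8
#define CHILDREN_PER_NODE  (KEYS_PER_NODE + 1)

static unsigned div_up(unsigned n, unsigned d) { return (n + d - 1) / d; }

/* Ceiling of log_base(n), computed the way int_log() does. */
static unsigned int_log(unsigned n, unsigned base)
{
	unsigned result = 0;

	while (n > 1) {
		n = div_up(n, base);
		result++;
	}
	return result;
}

int main(void)
{
	unsigned num_targets = 100;
	unsigned leaf_nodes = div_up(num_targets, KEYS_PER_NODE);      /* 13 */
	unsigned depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);   /* 3 */

	printf("leaves=%u depth=%u\n", leaf_nodes, depth);
	return 0;
}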
But all devices - * may not have matching profiles (checking all devices isn't reliable - * during table load because this table may use other DM device(s) which - * must be resumed before they will have an initialized integity profile). - * Stacked DM devices force a 2 stage integrity profile validation: - * 1 - during load, validate all initialized integrity profiles match - * 2 - during resume, validate all integrity profiles match - */ -static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) -{ - struct gendisk *template_disk = NULL; - - template_disk = dm_table_get_integrity_disk(t, false); - if (!template_disk) - return 0; - - if (!blk_integrity_is_initialized(dm_disk(md))) { - t->integrity_supported = 1; - return blk_integrity_register(dm_disk(md), NULL); - } - - /* - * If DM device already has an initalized integrity - * profile the new profile should not conflict. - */ - if (blk_integrity_is_initialized(template_disk) && - blk_integrity_compare(dm_disk(md), template_disk) < 0) { - DMWARN("%s: conflict with existing integrity profile: " - "%s profile mismatch", - dm_device_name(t->md), - template_disk->disk_name); - return 1; - } - - /* Preserve existing initialized integrity profile */ - t->integrity_supported = 1; - return 0; -} - -/* - * Prepares the table for use by building the indices, - * setting the type, and allocating mempools. - */ -int dm_table_complete(struct dm_table *t) -{ - int r; - - r = dm_table_set_type(t); - if (r) { - DMERR("unable to set table type"); - return r; - } - - r = dm_table_build_index(t); - if (r) { - DMERR("unable to build btrees"); - return r; - } - - r = dm_table_prealloc_integrity(t, t->md); - if (r) { - DMERR("could not register integrity profile."); - return r; - } - - r = dm_table_alloc_md_mempools(t); - if (r) - DMERR("unable to allocate mempools"); - - return r; -} - -static DEFINE_MUTEX(_event_lock); -void dm_table_event_callback(struct dm_table *t, - void (*fn)(void *), void *context) -{ - mutex_lock(&_event_lock); - t->event_fn = fn; - t->event_context = context; - mutex_unlock(&_event_lock); -} - -void dm_table_event(struct dm_table *t) -{ - /* - * You can no longer call dm_table_event() from interrupt - * context, use a bottom half instead. - */ - BUG_ON(in_interrupt()); - - mutex_lock(&_event_lock); - if (t->event_fn) - t->event_fn(t->event_context); - mutex_unlock(&_event_lock); -} -EXPORT_SYMBOL(dm_table_event); - -sector_t dm_table_get_size(struct dm_table *t) -{ - return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; -} -EXPORT_SYMBOL(dm_table_get_size); - -struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) -{ - if (index >= t->num_targets) - return NULL; - - return t->targets + index; -} - -/* - * Search the btree for the correct target. - * - * Caller should check returned pointer with dm_target_is_valid() - * to trap I/O beyond end of device. - */ -struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) -{ - unsigned int l, n = 0, k = 0; - sector_t *node; - - for (l = 0; l < t->depth; l++) { - n = get_child(n, k); - node = get_node(t, l, n); - - for (k = 0; k < KEYS_PER_NODE; k++) - if (node[k] >= sector) - break; - } - - return &t->targets[(KEYS_PER_NODE * n) + k]; -} - -/* - * Establish the new table's queue_limits and validate them. 
- */ -int dm_calculate_queue_limits(struct dm_table *table, - struct queue_limits *limits) -{ - struct dm_target *uninitialized_var(ti); - struct queue_limits ti_limits; - unsigned i = 0; - - blk_set_stacking_limits(limits); - - while (i < dm_table_get_num_targets(table)) { - blk_set_stacking_limits(&ti_limits); - - ti = dm_table_get_target(table, i++); - - if (!ti->type->iterate_devices) - goto combine_limits; - - /* - * Combine queue limits of all the devices this target uses. - */ - ti->type->iterate_devices(ti, dm_set_device_limits, - &ti_limits); - - /* Set I/O hints portion of queue limits */ - if (ti->type->io_hints) - ti->type->io_hints(ti, &ti_limits); - - /* - * Check each device area is consistent with the target's - * overall queue limits. - */ - if (ti->type->iterate_devices(ti, device_area_is_invalid, - &ti_limits)) - return -EINVAL; - -combine_limits: - /* - * Merge this target's queue limits into the overall limits - * for the table. - */ - if (blk_stack_limits(limits, &ti_limits, 0) < 0) - DMWARN("%s: adding target device " - "(start sect %llu len %llu) " - "caused an alignment inconsistency", - dm_device_name(table->md), - (unsigned long long) ti->begin, - (unsigned long long) ti->len); - } - - return validate_hardware_logical_block_alignment(table, limits); -} - -/* - * Set the integrity profile for this device if all devices used have - * matching profiles. We're quite deep in the resume path but still - * don't know if all devices (particularly DM devices this device - * may be stacked on) have matching profiles. Even if the profiles - * don't match we have no way to fail (to resume) at this point. - */ -static void dm_table_set_integrity(struct dm_table *t) -{ - struct gendisk *template_disk = NULL; - - if (!blk_get_integrity(dm_disk(t->md))) - return; - - template_disk = dm_table_get_integrity_disk(t, true); - if (template_disk) - blk_integrity_register(dm_disk(t->md), - blk_get_integrity(template_disk)); - else if (blk_integrity_is_initialized(dm_disk(t->md))) - DMWARN("%s: device no longer has a valid integrity profile", - dm_device_name(t->md)); - else - DMWARN("%s: unable to establish an integrity profile", - dm_device_name(t->md)); -} - -static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - unsigned flush = (*(unsigned *)data); - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && (q->flush_flags & flush); -} - -static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) -{ - struct dm_target *ti; - unsigned i = 0; - - /* - * Require at least one underlying device to support flushes. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting flushes must provide. - */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (!ti->num_flush_requests) - continue; - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_flush_capable, &flush)) - return 1; - } - - return 0; -} - -static bool dm_table_discard_zeroes_data(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* Ensure that all targets supports discard_zeroes_data. 
*/ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (ti->discard_zeroes_data_unsupported) - return 0; - } - - return 1; -} - -static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && blk_queue_nonrot(q); -} - -static bool dm_table_is_nonrot(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* Ensure that all underlying device are non-rotational. */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_nonrot, NULL)) - return 0; - } - - return 1; -} - -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, - struct queue_limits *limits) -{ - unsigned flush = 0; - - /* - * Copy table's limits to the DM device's request_queue - */ - q->limits = *limits; - - if (!dm_table_supports_discards(t)) - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); - else - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - - if (dm_table_supports_flush(t, REQ_FLUSH)) { - flush |= REQ_FLUSH; - if (dm_table_supports_flush(t, REQ_FUA)) - flush |= REQ_FUA; - } - blk_queue_flush(q, flush); - - if (!dm_table_discard_zeroes_data(t)) - q->limits.discard_zeroes_data = 0; - - if (dm_table_is_nonrot(t)) - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); - else - queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); - - dm_table_set_integrity(t); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. 
- */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); -} - -unsigned int dm_table_get_num_targets(struct dm_table *t) -{ - return t->num_targets; -} - -struct list_head *dm_table_get_devices(struct dm_table *t) -{ - return &t->devices; -} - -fmode_t dm_table_get_mode(struct dm_table *t) -{ - return t->mode; -} -EXPORT_SYMBOL(dm_table_get_mode); - -static void suspend_targets(struct dm_table *t, unsigned postsuspend) -{ - int i = t->num_targets; - struct dm_target *ti = t->targets; - - while (i--) { - if (postsuspend) { - if (ti->type->postsuspend) - ti->type->postsuspend(ti); - } else if (ti->type->presuspend) - ti->type->presuspend(ti); - - ti++; - } -} - -void dm_table_presuspend_targets(struct dm_table *t) -{ - if (!t) - return; - - suspend_targets(t, 0); -} - -void dm_table_postsuspend_targets(struct dm_table *t) -{ - if (!t) - return; - - suspend_targets(t, 1); -} - -int dm_table_resume_targets(struct dm_table *t) -{ - int i, r = 0; - - for (i = 0; i < t->num_targets; i++) { - struct dm_target *ti = t->targets + i; - - if (!ti->type->preresume) - continue; - - r = ti->type->preresume(ti); - if (r) - return r; - } - - for (i = 0; i < t->num_targets; i++) { - struct dm_target *ti = t->targets + i; - - if (ti->type->resume) - ti->type->resume(ti); - } - - return 0; -} - -void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) -{ - list_add(&cb->list, &t->target_callbacks); -} -EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); - -int dm_table_any_congested(struct dm_table *t, int bdi_bits) -{ - struct dm_dev_internal *dd; - struct list_head *devices = dm_table_get_devices(t); - struct dm_target_callbacks *cb; - int r = 0; - - list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); - char b[BDEVNAME_SIZE]; - - if (likely(q)) - r |= bdi_congested(&q->backing_dev_info, bdi_bits); - else - DMWARN_LIMIT("%s: any_congested: nonexistent device %s", - dm_device_name(t->md), - bdevname(dd->dm_dev.bdev, b)); - } - - list_for_each_entry(cb, &t->target_callbacks, list) - if (cb->congested_fn) - r |= cb->congested_fn(cb, bdi_bits); - - return r; -} - -int dm_table_any_busy_target(struct dm_table *t) -{ - unsigned i; - struct dm_target *ti; - - for (i = 0; i < t->num_targets; i++) { - ti = t->targets + i; - if (ti->type->busy && ti->type->busy(ti)) - return 1; - } - - return 0; -} - -struct mapped_device *dm_table_get_md(struct dm_table *t) -{ - return t->md; -} -EXPORT_SYMBOL(dm_table_get_md); - -static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && blk_queue_discard(q); -} - -bool dm_table_supports_discards(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* - * Unless any target used by the table set discards_supported, - * require at least one underlying device to support discards. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting discard selectively must provide. 
- */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (!ti->num_discard_requests) - continue; - - if (ti->discards_supported) - return 1; - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_discard_capable, NULL)) - return 1; - } - - return 0; -} diff --git a/ANDROID_3.4.5/drivers/md/dm-target.c b/ANDROID_3.4.5/drivers/md/dm-target.c deleted file mode 100644 index 8da366cf..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-target.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (C) 2001 Sistina Software (UK) Limited - * - * This file is released under the GPL. - */ - -#include "dm.h" - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/kmod.h> -#include <linux/bio.h> - -#define DM_MSG_PREFIX "target" - -static LIST_HEAD(_targets); -static DECLARE_RWSEM(_lock); - -#define DM_MOD_NAME_SIZE 32 - -static inline struct target_type *__find_target_type(const char *name) -{ - struct target_type *tt; - - list_for_each_entry(tt, &_targets, list) - if (!strcmp(name, tt->name)) - return tt; - - return NULL; -} - -static struct target_type *get_target_type(const char *name) -{ - struct target_type *tt; - - down_read(&_lock); - - tt = __find_target_type(name); - if (tt && !try_module_get(tt->module)) - tt = NULL; - - up_read(&_lock); - return tt; -} - -static void load_module(const char *name) -{ - request_module("dm-%s", name); -} - -struct target_type *dm_get_target_type(const char *name) -{ - struct target_type *tt = get_target_type(name); - - if (!tt) { - load_module(name); - tt = get_target_type(name); - } - - return tt; -} - -void dm_put_target_type(struct target_type *tt) -{ - down_read(&_lock); - module_put(tt->module); - up_read(&_lock); -} - -int dm_target_iterate(void (*iter_func)(struct target_type *tt, - void *param), void *param) -{ - struct target_type *tt; - - down_read(&_lock); - list_for_each_entry(tt, &_targets, list) - iter_func(tt, param); - up_read(&_lock); - - return 0; -} - -int dm_register_target(struct target_type *tt) -{ - int rv = 0; - - down_write(&_lock); - if (__find_target_type(tt->name)) - rv = -EEXIST; - else - list_add(&tt->list, &_targets); - - up_write(&_lock); - return rv; -} - -void dm_unregister_target(struct target_type *tt) -{ - down_write(&_lock); - if (!__find_target_type(tt->name)) { - DMCRIT("Unregistering unrecognised target: %s", tt->name); - BUG(); - } - - list_del(&tt->list); - - up_write(&_lock); -} - -/* - * io-err: always fails an io, useful for bringing - * up LVs that have holes in them. 
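The register/get/unregister logic above is a name-keyed registry guarded by a reader/writer lock. A much-reduced userspace sketch of the same shape, with a pthread rwlock standing in for the kernel rwsem and no module reference counting or unregister path:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct target_type {
	const char *name;
	struct target_type *next;
};

static struct target_type *targets;
static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static struct target_type *find(const char *name)
{
	for (struct target_type *tt = targets; tt; tt = tt->next)
		if (!strcmp(tt->name, name))
			return tt;
	return NULL;
}

static int register_target(struct target_type *tt)
{
	int rv = 0;

	pthread_rwlock_wrlock(&lock);
	if (find(tt->name))
		rv = -1;             /* -EEXIST in the kernel code */
	else {
		tt->next = targets;
		targets = tt;
	}
	pthread_rwlock_unlock(&lock);
	return rv;
}

static struct target_type *get_target(const char *name)
{
	pthread_rwlock_rdlock(&lock);
	struct target_type *tt = find(name);
	pthread_rwlock_unlock(&lock);
	return tt;
}

int main(void)
{
	static struct target_type error_target = { .name = "error" };

	register_target(&error_target);
	printf("found: %s\n", get_target("error") ? "yes" : "no");
	return 0;
}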
- */ -static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) -{ - /* - * Return error for discards instead of -EOPNOTSUPP - */ - tt->num_discard_requests = 1; - - return 0; -} - -static void io_err_dtr(struct dm_target *tt) -{ - /* empty */ -} - -static int io_err_map(struct dm_target *tt, struct bio *bio, - union map_info *map_context) -{ - return -EIO; -} - -static struct target_type error_target = { - .name = "error", - .version = {1, 0, 1}, - .ctr = io_err_ctr, - .dtr = io_err_dtr, - .map = io_err_map, -}; - -int __init dm_target_init(void) -{ - return dm_register_target(&error_target); -} - -void dm_target_exit(void) -{ - dm_unregister_target(&error_target); -} - -EXPORT_SYMBOL(dm_register_target); -EXPORT_SYMBOL(dm_unregister_target); diff --git a/ANDROID_3.4.5/drivers/md/dm-thin-metadata.c b/ANDROID_3.4.5/drivers/md/dm-thin-metadata.c deleted file mode 100644 index 737d3886..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-thin-metadata.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#include "dm-thin-metadata.h" -#include "persistent-data/dm-btree.h" -#include "persistent-data/dm-space-map.h" -#include "persistent-data/dm-space-map-disk.h" -#include "persistent-data/dm-transaction-manager.h" - -#include <linux/list.h> -#include <linux/device-mapper.h> -#include <linux/workqueue.h> - -/*-------------------------------------------------------------------------- - * As far as the metadata goes, there is: - * - * - A superblock in block zero, taking up fewer than 512 bytes for - * atomic writes. - * - * - A space map managing the metadata blocks. - * - * - A space map managing the data blocks. - * - * - A btree mapping our internal thin dev ids onto struct disk_device_details. - * - * - A hierarchical btree, with 2 levels which effectively maps (thin - * dev id, virtual block) -> block_time. Block time is a 64-bit - * field holding the time in the low 24 bits, and block in the top 48 - * bits. - * - * BTrees consist solely of btree_nodes, that fill a block. Some are - * internal nodes, as such their values are a __le64 pointing to other - * nodes. Leaf nodes can store data of any reasonable size (ie. much - * smaller than the block size). The nodes consist of the header, - * followed by an array of keys, followed by an array of values. We have - * to binary search on the keys so they're all held together to help the - * cpu cache. - * - * Space maps have 2 btrees: - * - * - One maps a uint64_t onto a struct index_entry. Which points to a - * bitmap block, and has some details about how many free entries there - * are etc. - * - * - The bitmap blocks have a header (for the checksum). Then the rest - * of the block is pairs of bits. With the meaning being: - * - * 0 - ref count is 0 - * 1 - ref count is 1 - * 2 - ref count is 2 - * 3 - ref count is higher than 2 - * - * - If the count is higher than 2 then the ref count is entered in a - * second btree that directly maps the block_address to a uint32_t ref - * count. - * - * The space map metadata variant doesn't have a bitmaps btree. Instead - * it has one single blocks worth of index_entries. This avoids - * recursive issues with the bitmap btree needing to allocate space in - * order to insert. With a small data block size such as 64k the - * metadata support data devices that are hundreds of terrabytes. - * - * The space maps allocate space linearly from front to back. 
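The bitmap blocks described above store two reference-count bits per data block, with the value 3 meaning "count exceeds 2, consult the overflow btree". A tiny sketch of reading and writing such 2-bit entries in an ordinary byte array; the exact on-disk bit order and block header are not reproduced here.

#include <stdio.h>
#include <stdint.h>

/* Two bits per block: 0, 1, 2 are literal counts, 3 is the overflow marker. */
static unsigned get2(const uint8_t *bits, uint64_t block)
{
	return (bits[block / 4] >> ((block % 4) * 2)) & 3;
}

static void set2(uint8_t *bits, uint64_t block, unsigned count)
{
	unsigned shift = (block % 4) * 2;

	bits[block / 4] &= ~(3u << shift);
	bits[block / 4] |= (count & 3) << shift;
}

int main(void)
{
	uint8_t bits[16] = { 0 };   /* covers 64 blocks */

	set2(bits, 5, 2);
	set2(bits, 6, 3);           /* overflow marker */
	printf("block 5 -> %u, block 6 -> %u\n", get2(bits, 5), get2(bits, 6));
	return 0;
}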
Space that - * is freed in a transaction is never recycled within that transaction. - * To try and avoid fragmenting _free_ space the allocator always goes - * back and fills in gaps. - * - * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks - * from the block manager. - *--------------------------------------------------------------------------*/ - -#define DM_MSG_PREFIX "thin metadata" - -#define THIN_SUPERBLOCK_MAGIC 27022010 -#define THIN_SUPERBLOCK_LOCATION 0 -#define THIN_VERSION 1 -#define THIN_METADATA_CACHE_SIZE 64 -#define SECTOR_TO_BLOCK_SHIFT 3 - -/* This should be plenty */ -#define SPACE_MAP_ROOT_SIZE 128 - -/* - * Little endian on-disk superblock and device details. - */ -struct thin_disk_superblock { - __le32 csum; /* Checksum of superblock except for this field. */ - __le32 flags; - __le64 blocknr; /* This block number, dm_block_t. */ - - __u8 uuid[16]; - __le64 magic; - __le32 version; - __le32 time; - - __le64 trans_id; - - /* - * Root held by userspace transactions. - */ - __le64 held_root; - - __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; - __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; - - /* - * 2-level btree mapping (dev_id, (dev block, time)) -> data block - */ - __le64 data_mapping_root; - - /* - * Device detail root mapping dev_id -> device_details - */ - __le64 device_details_root; - - __le32 data_block_size; /* In 512-byte sectors. */ - - __le32 metadata_block_size; /* In 512-byte sectors. */ - __le64 metadata_nr_blocks; - - __le32 compat_flags; - __le32 compat_ro_flags; - __le32 incompat_flags; -} __packed; - -struct disk_device_details { - __le64 mapped_blocks; - __le64 transaction_id; /* When created. */ - __le32 creation_time; - __le32 snapshotted_time; -} __packed; - -struct dm_pool_metadata { - struct hlist_node hash; - - struct block_device *bdev; - struct dm_block_manager *bm; - struct dm_space_map *metadata_sm; - struct dm_space_map *data_sm; - struct dm_transaction_manager *tm; - struct dm_transaction_manager *nb_tm; - - /* - * Two-level btree. - * First level holds thin_dev_t. - * Second level holds mappings. - */ - struct dm_btree_info info; - - /* - * Non-blocking version of the above. - */ - struct dm_btree_info nb_info; - - /* - * Just the top level for deleting whole devices. - */ - struct dm_btree_info tl_info; - - /* - * Just the bottom level for creating new devices. - */ - struct dm_btree_info bl_info; - - /* - * Describes the device details btree. 
- */ - struct dm_btree_info details_info; - - struct rw_semaphore root_lock; - uint32_t time; - int need_commit; - dm_block_t root; - dm_block_t details_root; - struct list_head thin_devices; - uint64_t trans_id; - unsigned long flags; - sector_t data_block_size; -}; - -struct dm_thin_device { - struct list_head list; - struct dm_pool_metadata *pmd; - dm_thin_id id; - - int open_count; - int changed; - uint64_t mapped_blocks; - uint64_t transaction_id; - uint32_t creation_time; - uint32_t snapshotted_time; -}; - -/*---------------------------------------------------------------- - * superblock validator - *--------------------------------------------------------------*/ - -#define SUPERBLOCK_CSUM_XOR 160774 - -static void sb_prepare_for_write(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct thin_disk_superblock *disk_super = dm_block_data(b); - - disk_super->blocknr = cpu_to_le64(dm_block_location(b)); - disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, - block_size - sizeof(__le32), - SUPERBLOCK_CSUM_XOR)); -} - -static int sb_check(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct thin_disk_superblock *disk_super = dm_block_data(b); - __le32 csum_le; - - if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { - DMERR("sb_check failed: blocknr %llu: " - "wanted %llu", le64_to_cpu(disk_super->blocknr), - (unsigned long long)dm_block_location(b)); - return -ENOTBLK; - } - - if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) { - DMERR("sb_check failed: magic %llu: " - "wanted %llu", le64_to_cpu(disk_super->magic), - (unsigned long long)THIN_SUPERBLOCK_MAGIC); - return -EILSEQ; - } - - csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, - block_size - sizeof(__le32), - SUPERBLOCK_CSUM_XOR)); - if (csum_le != disk_super->csum) { - DMERR("sb_check failed: csum %u: wanted %u", - le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); - return -EILSEQ; - } - - return 0; -} - -static struct dm_block_validator sb_validator = { - .name = "superblock", - .prepare_for_write = sb_prepare_for_write, - .check = sb_check -}; - -/*---------------------------------------------------------------- - * Methods for the btree value types - *--------------------------------------------------------------*/ - -static uint64_t pack_block_time(dm_block_t b, uint32_t t) -{ - return (b << 24) | t; -} - -static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) -{ - *b = v >> 24; - *t = v & ((1 << 24) - 1); -} - -static void data_block_inc(void *context, void *value_le) -{ - struct dm_space_map *sm = context; - __le64 v_le; - uint64_t b; - uint32_t t; - - memcpy(&v_le, value_le, sizeof(v_le)); - unpack_block_time(le64_to_cpu(v_le), &b, &t); - dm_sm_inc_block(sm, b); -} - -static void data_block_dec(void *context, void *value_le) -{ - struct dm_space_map *sm = context; - __le64 v_le; - uint64_t b; - uint32_t t; - - memcpy(&v_le, value_le, sizeof(v_le)); - unpack_block_time(le64_to_cpu(v_le), &b, &t); - dm_sm_dec_block(sm, b); -} - -static int data_block_equal(void *context, void *value1_le, void *value2_le) -{ - __le64 v1_le, v2_le; - uint64_t b1, b2; - uint32_t t; - - memcpy(&v1_le, value1_le, sizeof(v1_le)); - memcpy(&v2_le, value2_le, sizeof(v2_le)); - unpack_block_time(le64_to_cpu(v1_le), &b1, &t); - unpack_block_time(le64_to_cpu(v2_le), &b2, &t); - - return b1 == b2; -} - -static void subtree_inc(void *context, void *value) -{ - struct dm_btree_info *info = context; - __le64 root_le; - uint64_t 
root; - - memcpy(&root_le, value, sizeof(root_le)); - root = le64_to_cpu(root_le); - dm_tm_inc(info->tm, root); -} - -static void subtree_dec(void *context, void *value) -{ - struct dm_btree_info *info = context; - __le64 root_le; - uint64_t root; - - memcpy(&root_le, value, sizeof(root_le)); - root = le64_to_cpu(root_le); - if (dm_btree_del(info, root)) - DMERR("btree delete failed\n"); -} - -static int subtree_equal(void *context, void *value1_le, void *value2_le) -{ - __le64 v1_le, v2_le; - memcpy(&v1_le, value1_le, sizeof(v1_le)); - memcpy(&v2_le, value2_le, sizeof(v2_le)); - - return v1_le == v2_le; -} - -/*----------------------------------------------------------------*/ - -static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) -{ - int r; - unsigned i; - struct dm_block *b; - __le64 *data_le, zero = cpu_to_le64(0); - unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64); - - /* - * We can't use a validator here - it may be all zeroes. - */ - r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b); - if (r) - return r; - - data_le = dm_block_data(b); - *result = 1; - for (i = 0; i < block_size; i++) { - if (data_le[i] != zero) { - *result = 0; - break; - } - } - - return dm_bm_unlock(b); -} - -static int init_pmd(struct dm_pool_metadata *pmd, - struct dm_block_manager *bm, - dm_block_t nr_blocks, int create) -{ - int r; - struct dm_space_map *sm, *data_sm; - struct dm_transaction_manager *tm; - struct dm_block *sblock; - - if (create) { - r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION, - &sb_validator, &tm, &sm, &sblock); - if (r < 0) { - DMERR("tm_create_with_sm failed"); - return r; - } - - data_sm = dm_sm_disk_create(tm, nr_blocks); - if (IS_ERR(data_sm)) { - DMERR("sm_disk_create failed"); - dm_tm_unlock(tm, sblock); - r = PTR_ERR(data_sm); - goto bad; - } - } else { - struct thin_disk_superblock *disk_super = NULL; - size_t space_map_root_offset = - offsetof(struct thin_disk_superblock, metadata_space_map_root); - - r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION, - &sb_validator, space_map_root_offset, - SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock); - if (r < 0) { - DMERR("tm_open_with_sm failed"); - return r; - } - - disk_super = dm_block_data(sblock); - data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root, - sizeof(disk_super->data_space_map_root)); - if (IS_ERR(data_sm)) { - DMERR("sm_disk_open failed"); - r = PTR_ERR(data_sm); - goto bad; - } - } - - - r = dm_tm_unlock(tm, sblock); - if (r < 0) { - DMERR("couldn't unlock superblock"); - goto bad_data_sm; - } - - pmd->bm = bm; - pmd->metadata_sm = sm; - pmd->data_sm = data_sm; - pmd->tm = tm; - pmd->nb_tm = dm_tm_create_non_blocking_clone(tm); - if (!pmd->nb_tm) { - DMERR("could not create clone tm"); - r = -ENOMEM; - goto bad_data_sm; - } - - pmd->info.tm = tm; - pmd->info.levels = 2; - pmd->info.value_type.context = pmd->data_sm; - pmd->info.value_type.size = sizeof(__le64); - pmd->info.value_type.inc = data_block_inc; - pmd->info.value_type.dec = data_block_dec; - pmd->info.value_type.equal = data_block_equal; - - memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); - pmd->nb_info.tm = pmd->nb_tm; - - pmd->tl_info.tm = tm; - pmd->tl_info.levels = 1; - pmd->tl_info.value_type.context = &pmd->info; - pmd->tl_info.value_type.size = sizeof(__le64); - pmd->tl_info.value_type.inc = subtree_inc; - pmd->tl_info.value_type.dec = subtree_dec; - pmd->tl_info.value_type.equal = subtree_equal; - - pmd->bl_info.tm = tm; - pmd->bl_info.levels = 1; - pmd->bl_info.value_type.context = 
pmd->data_sm; - pmd->bl_info.value_type.size = sizeof(__le64); - pmd->bl_info.value_type.inc = data_block_inc; - pmd->bl_info.value_type.dec = data_block_dec; - pmd->bl_info.value_type.equal = data_block_equal; - - pmd->details_info.tm = tm; - pmd->details_info.levels = 1; - pmd->details_info.value_type.context = NULL; - pmd->details_info.value_type.size = sizeof(struct disk_device_details); - pmd->details_info.value_type.inc = NULL; - pmd->details_info.value_type.dec = NULL; - pmd->details_info.value_type.equal = NULL; - - pmd->root = 0; - - init_rwsem(&pmd->root_lock); - pmd->time = 0; - pmd->need_commit = 0; - pmd->details_root = 0; - pmd->trans_id = 0; - pmd->flags = 0; - INIT_LIST_HEAD(&pmd->thin_devices); - - return 0; - -bad_data_sm: - dm_sm_destroy(data_sm); -bad: - dm_tm_destroy(tm); - dm_sm_destroy(sm); - - return r; -} - -static int __begin_transaction(struct dm_pool_metadata *pmd) -{ - int r; - u32 features; - struct thin_disk_superblock *disk_super; - struct dm_block *sblock; - - /* - * __maybe_commit_transaction() resets these - */ - WARN_ON(pmd->need_commit); - - /* - * We re-read the superblock every time. Shouldn't need to do this - * really. - */ - r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, - &sb_validator, &sblock); - if (r) - return r; - - disk_super = dm_block_data(sblock); - pmd->time = le32_to_cpu(disk_super->time); - pmd->root = le64_to_cpu(disk_super->data_mapping_root); - pmd->details_root = le64_to_cpu(disk_super->device_details_root); - pmd->trans_id = le64_to_cpu(disk_super->trans_id); - pmd->flags = le32_to_cpu(disk_super->flags); - pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); - - features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; - if (features) { - DMERR("could not access metadata due to " - "unsupported optional features (%lx).", - (unsigned long)features); - r = -EINVAL; - goto out; - } - - /* - * Check for read-only metadata to skip the following RDWR checks. - */ - if (get_disk_ro(pmd->bdev->bd_disk)) - goto out; - - features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; - if (features) { - DMERR("could not access metadata RDWR due to " - "unsupported optional features (%lx).", - (unsigned long)features); - r = -EINVAL; - } - -out: - dm_bm_unlock(sblock); - return r; -} - -static int __write_changed_details(struct dm_pool_metadata *pmd) -{ - int r; - struct dm_thin_device *td, *tmp; - struct disk_device_details details; - uint64_t key; - - list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { - if (!td->changed) - continue; - - key = td->id; - - details.mapped_blocks = cpu_to_le64(td->mapped_blocks); - details.transaction_id = cpu_to_le64(td->transaction_id); - details.creation_time = cpu_to_le32(td->creation_time); - details.snapshotted_time = cpu_to_le32(td->snapshotted_time); - __dm_bless_for_disk(&details); - - r = dm_btree_insert(&pmd->details_info, pmd->details_root, - &key, &details, &pmd->details_root); - if (r) - return r; - - if (td->open_count) - td->changed = 0; - else { - list_del(&td->list); - kfree(td); - } - - pmd->need_commit = 1; - } - - return 0; -} - -static int __commit_transaction(struct dm_pool_metadata *pmd) -{ - /* - * FIXME: Associated pool should be made read-only on failure. - */ - int r; - size_t metadata_len, data_len; - struct thin_disk_superblock *disk_super; - struct dm_block *sblock; - - /* - * We need to know if the thin_disk_superblock exceeds a 512-byte sector. 
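The mapping btree values handled by data_block_inc()/data_block_dec() above are packed (block, time) pairs: the timestamp lives in the low 24 bits and the data block number in the bits above it. A round-trip sketch of that encoding:

#include <stdio.h>
#include <stdint.h>

/* Pack a data block number and a 24-bit timestamp into one 64-bit value,
 * as pack_block_time()/unpack_block_time() do. */
static uint64_t pack_block_time(uint64_t b, uint32_t t)
{
	return (b << 24) | t;
}

static void unpack_block_time(uint64_t v, uint64_t *b, uint32_t *t)
{
	*b = v >> 24;
	*t = v & ((1u << 24) - 1);
}

int main(void)
{
	uint64_t b;
	uint32_t t;

	unpack_block_time(pack_block_time(123456, 42), &b, &t);
	printf("block=%llu time=%u\n", (unsigned long long)b, t);   /* 123456 42 */
	return 0;
}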
- */ - BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); - - r = __write_changed_details(pmd); - if (r < 0) - goto out; - - if (!pmd->need_commit) - goto out; - - r = dm_sm_commit(pmd->data_sm); - if (r < 0) - goto out; - - r = dm_tm_pre_commit(pmd->tm); - if (r < 0) - goto out; - - r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); - if (r < 0) - goto out; - - r = dm_sm_root_size(pmd->data_sm, &data_len); - if (r < 0) - goto out; - - r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, - &sb_validator, &sblock); - if (r) - goto out; - - disk_super = dm_block_data(sblock); - disk_super->time = cpu_to_le32(pmd->time); - disk_super->data_mapping_root = cpu_to_le64(pmd->root); - disk_super->device_details_root = cpu_to_le64(pmd->details_root); - disk_super->trans_id = cpu_to_le64(pmd->trans_id); - disk_super->flags = cpu_to_le32(pmd->flags); - - r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, - metadata_len); - if (r < 0) - goto out_locked; - - r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, - data_len); - if (r < 0) - goto out_locked; - - r = dm_tm_commit(pmd->tm, sblock); - if (!r) - pmd->need_commit = 0; - -out: - return r; - -out_locked: - dm_bm_unlock(sblock); - return r; -} - -struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, - sector_t data_block_size) -{ - int r; - struct thin_disk_superblock *disk_super; - struct dm_pool_metadata *pmd; - sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; - struct dm_block_manager *bm; - int create; - struct dm_block *sblock; - - pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); - if (!pmd) { - DMERR("could not allocate metadata struct"); - return ERR_PTR(-ENOMEM); - } - - /* - * Max hex locks: - * 3 for btree insert + - * 2 for btree lookup used within space map - */ - bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, - THIN_METADATA_CACHE_SIZE, 5); - if (!bm) { - DMERR("could not create block manager"); - kfree(pmd); - return ERR_PTR(-ENOMEM); - } - - r = superblock_all_zeroes(bm, &create); - if (r) { - dm_block_manager_destroy(bm); - kfree(pmd); - return ERR_PTR(r); - } - - - r = init_pmd(pmd, bm, 0, create); - if (r) { - dm_block_manager_destroy(bm); - kfree(pmd); - return ERR_PTR(r); - } - pmd->bdev = bdev; - - if (!create) { - r = __begin_transaction(pmd); - if (r < 0) - goto bad; - return pmd; - } - - /* - * Create. 
- */ - r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, - &sb_validator, &sblock); - if (r) - goto bad; - - if (bdev_size > THIN_METADATA_MAX_SECTORS) - bdev_size = THIN_METADATA_MAX_SECTORS; - - disk_super = dm_block_data(sblock); - disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); - disk_super->version = cpu_to_le32(THIN_VERSION); - disk_super->time = 0; - disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); - disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); - disk_super->data_block_size = cpu_to_le32(data_block_size); - - r = dm_bm_unlock(sblock); - if (r < 0) - goto bad; - - r = dm_btree_empty(&pmd->info, &pmd->root); - if (r < 0) - goto bad; - - r = dm_btree_empty(&pmd->details_info, &pmd->details_root); - if (r < 0) { - DMERR("couldn't create devices root"); - goto bad; - } - - pmd->flags = 0; - pmd->need_commit = 1; - r = dm_pool_commit_metadata(pmd); - if (r < 0) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - goto bad; - } - - return pmd; - -bad: - if (dm_pool_metadata_close(pmd) < 0) - DMWARN("%s: dm_pool_metadata_close() failed.", __func__); - return ERR_PTR(r); -} - -int dm_pool_metadata_close(struct dm_pool_metadata *pmd) -{ - int r; - unsigned open_devices = 0; - struct dm_thin_device *td, *tmp; - - down_read(&pmd->root_lock); - list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { - if (td->open_count) - open_devices++; - else { - list_del(&td->list); - kfree(td); - } - } - up_read(&pmd->root_lock); - - if (open_devices) { - DMERR("attempt to close pmd when %u device(s) are still open", - open_devices); - return -EBUSY; - } - - r = __commit_transaction(pmd); - if (r < 0) - DMWARN("%s: __commit_transaction() failed, error = %d", - __func__, r); - - dm_tm_destroy(pmd->tm); - dm_tm_destroy(pmd->nb_tm); - dm_block_manager_destroy(pmd->bm); - dm_sm_destroy(pmd->metadata_sm); - dm_sm_destroy(pmd->data_sm); - kfree(pmd); - - return 0; -} - -/* - * __open_device: Returns @td corresponding to device with id @dev, - * creating it if @create is set and incrementing @td->open_count. - * On failure, @td is undefined. - */ -static int __open_device(struct dm_pool_metadata *pmd, - dm_thin_id dev, int create, - struct dm_thin_device **td) -{ - int r, changed = 0; - struct dm_thin_device *td2; - uint64_t key = dev; - struct disk_device_details details_le; - - /* - * If the device is already open, return it. - */ - list_for_each_entry(td2, &pmd->thin_devices, list) - if (td2->id == dev) { - /* - * May not create an already-open device. - */ - if (create) - return -EEXIST; - - td2->open_count++; - *td = td2; - return 0; - } - - /* - * Check the device exists. - */ - r = dm_btree_lookup(&pmd->details_info, pmd->details_root, - &key, &details_le); - if (r) { - if (r != -ENODATA || !create) - return r; - - /* - * Create new device. 
- */ - changed = 1; - details_le.mapped_blocks = 0; - details_le.transaction_id = cpu_to_le64(pmd->trans_id); - details_le.creation_time = cpu_to_le32(pmd->time); - details_le.snapshotted_time = cpu_to_le32(pmd->time); - } - - *td = kmalloc(sizeof(**td), GFP_NOIO); - if (!*td) - return -ENOMEM; - - (*td)->pmd = pmd; - (*td)->id = dev; - (*td)->open_count = 1; - (*td)->changed = changed; - (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); - (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); - (*td)->creation_time = le32_to_cpu(details_le.creation_time); - (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time); - - list_add(&(*td)->list, &pmd->thin_devices); - - return 0; -} - -static void __close_device(struct dm_thin_device *td) -{ - --td->open_count; -} - -static int __create_thin(struct dm_pool_metadata *pmd, - dm_thin_id dev) -{ - int r; - dm_block_t dev_root; - uint64_t key = dev; - struct disk_device_details details_le; - struct dm_thin_device *td; - __le64 value; - - r = dm_btree_lookup(&pmd->details_info, pmd->details_root, - &key, &details_le); - if (!r) - return -EEXIST; - - /* - * Create an empty btree for the mappings. - */ - r = dm_btree_empty(&pmd->bl_info, &dev_root); - if (r) - return r; - - /* - * Insert it into the main mapping tree. - */ - value = cpu_to_le64(dev_root); - __dm_bless_for_disk(&value); - r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); - if (r) { - dm_btree_del(&pmd->bl_info, dev_root); - return r; - } - - r = __open_device(pmd, dev, 1, &td); - if (r) { - dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); - dm_btree_del(&pmd->bl_info, dev_root); - return r; - } - __close_device(td); - - return r; -} - -int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) -{ - int r; - - down_write(&pmd->root_lock); - r = __create_thin(pmd, dev); - up_write(&pmd->root_lock); - - return r; -} - -static int __set_snapshot_details(struct dm_pool_metadata *pmd, - struct dm_thin_device *snap, - dm_thin_id origin, uint32_t time) -{ - int r; - struct dm_thin_device *td; - - r = __open_device(pmd, origin, 0, &td); - if (r) - return r; - - td->changed = 1; - td->snapshotted_time = time; - - snap->mapped_blocks = td->mapped_blocks; - snap->snapshotted_time = time; - __close_device(td); - - return 0; -} - -static int __create_snap(struct dm_pool_metadata *pmd, - dm_thin_id dev, dm_thin_id origin) -{ - int r; - dm_block_t origin_root; - uint64_t key = origin, dev_key = dev; - struct dm_thin_device *td; - struct disk_device_details details_le; - __le64 value; - - /* check this device is unused */ - r = dm_btree_lookup(&pmd->details_info, pmd->details_root, - &dev_key, &details_le); - if (!r) - return -EEXIST; - - /* find the mapping tree for the origin */ - r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); - if (r) - return r; - origin_root = le64_to_cpu(value); - - /* clone the origin, an inc will do */ - dm_tm_inc(pmd->tm, origin_root); - - /* insert into the main mapping tree */ - value = cpu_to_le64(origin_root); - __dm_bless_for_disk(&value); - key = dev; - r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); - if (r) { - dm_tm_dec(pmd->tm, origin_root); - return r; - } - - pmd->time++; - - r = __open_device(pmd, dev, 1, &td); - if (r) - goto bad; - - r = __set_snapshot_details(pmd, td, origin, pmd->time); - __close_device(td); - - if (r) - goto bad; - - return 0; - -bad: - dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); - 
dm_btree_remove(&pmd->details_info, pmd->details_root, - &key, &pmd->details_root); - return r; -} - -int dm_pool_create_snap(struct dm_pool_metadata *pmd, - dm_thin_id dev, - dm_thin_id origin) -{ - int r; - - down_write(&pmd->root_lock); - r = __create_snap(pmd, dev, origin); - up_write(&pmd->root_lock); - - return r; -} - -static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) -{ - int r; - uint64_t key = dev; - struct dm_thin_device *td; - - /* TODO: failure should mark the transaction invalid */ - r = __open_device(pmd, dev, 0, &td); - if (r) - return r; - - if (td->open_count > 1) { - __close_device(td); - return -EBUSY; - } - - list_del(&td->list); - kfree(td); - r = dm_btree_remove(&pmd->details_info, pmd->details_root, - &key, &pmd->details_root); - if (r) - return r; - - r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); - if (r) - return r; - - pmd->need_commit = 1; - - return 0; -} - -int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, - dm_thin_id dev) -{ - int r; - - down_write(&pmd->root_lock); - r = __delete_device(pmd, dev); - up_write(&pmd->root_lock); - - return r; -} - -int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, - uint64_t current_id, - uint64_t new_id) -{ - down_write(&pmd->root_lock); - if (pmd->trans_id != current_id) { - up_write(&pmd->root_lock); - DMERR("mismatched transaction id"); - return -EINVAL; - } - - pmd->trans_id = new_id; - pmd->need_commit = 1; - up_write(&pmd->root_lock); - - return 0; -} - -int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, - uint64_t *result) -{ - down_read(&pmd->root_lock); - *result = pmd->trans_id; - up_read(&pmd->root_lock); - - return 0; -} - -static int __get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result) -{ - int r; - struct thin_disk_superblock *disk_super; - struct dm_block *sblock; - - r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, - &sb_validator, &sblock); - if (r) - return r; - - disk_super = dm_block_data(sblock); - *result = le64_to_cpu(disk_super->held_root); - - return dm_bm_unlock(sblock); -} - -int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result) -{ - int r; - - down_read(&pmd->root_lock); - r = __get_held_metadata_root(pmd, result); - up_read(&pmd->root_lock); - - return r; -} - -int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, - struct dm_thin_device **td) -{ - int r; - - down_write(&pmd->root_lock); - r = __open_device(pmd, dev, 0, td); - up_write(&pmd->root_lock); - - return r; -} - -int dm_pool_close_thin_device(struct dm_thin_device *td) -{ - down_write(&td->pmd->root_lock); - __close_device(td); - up_write(&td->pmd->root_lock); - - return 0; -} - -dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) -{ - return td->id; -} - -static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) -{ - return td->snapshotted_time > time; -} - -int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_block, struct dm_thin_lookup_result *result) -{ - int r; - uint64_t block_time = 0; - __le64 value; - struct dm_pool_metadata *pmd = td->pmd; - dm_block_t keys[2] = { td->id, block }; - - if (can_block) { - down_read(&pmd->root_lock); - r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); - if (!r) - block_time = le64_to_cpu(value); - up_read(&pmd->root_lock); - - } else if (down_read_trylock(&pmd->root_lock)) { - r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value); - if (!r) - block_time = 
le64_to_cpu(value); - up_read(&pmd->root_lock); - - } else - return -EWOULDBLOCK; - - if (!r) { - dm_block_t exception_block; - uint32_t exception_time; - unpack_block_time(block_time, &exception_block, - &exception_time); - result->block = exception_block; - result->shared = __snapshotted_since(td, exception_time); - } - - return r; -} - -static int __insert(struct dm_thin_device *td, dm_block_t block, - dm_block_t data_block) -{ - int r, inserted; - __le64 value; - struct dm_pool_metadata *pmd = td->pmd; - dm_block_t keys[2] = { td->id, block }; - - pmd->need_commit = 1; - value = cpu_to_le64(pack_block_time(data_block, pmd->time)); - __dm_bless_for_disk(&value); - - r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value, - &pmd->root, &inserted); - if (r) - return r; - - if (inserted) { - td->mapped_blocks++; - td->changed = 1; - } - - return 0; -} - -int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, - dm_block_t data_block) -{ - int r; - - down_write(&td->pmd->root_lock); - r = __insert(td, block, data_block); - up_write(&td->pmd->root_lock); - - return r; -} - -static int __remove(struct dm_thin_device *td, dm_block_t block) -{ - int r; - struct dm_pool_metadata *pmd = td->pmd; - dm_block_t keys[2] = { td->id, block }; - - r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root); - if (r) - return r; - - td->mapped_blocks--; - td->changed = 1; - pmd->need_commit = 1; - - return 0; -} - -int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) -{ - int r; - - down_write(&td->pmd->root_lock); - r = __remove(td, block); - up_write(&td->pmd->root_lock); - - return r; -} - -int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) -{ - int r; - - down_write(&pmd->root_lock); - - r = dm_sm_new_block(pmd->data_sm, result); - pmd->need_commit = 1; - - up_write(&pmd->root_lock); - - return r; -} - -int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) -{ - int r; - - down_write(&pmd->root_lock); - - r = __commit_transaction(pmd); - if (r <= 0) - goto out; - - /* - * Open the next transaction. 
- */ - r = __begin_transaction(pmd); -out: - up_write(&pmd->root_lock); - return r; -} - -int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) -{ - int r; - - down_read(&pmd->root_lock); - r = dm_sm_get_nr_free(pmd->data_sm, result); - up_read(&pmd->root_lock); - - return r; -} - -int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, - dm_block_t *result) -{ - int r; - - down_read(&pmd->root_lock); - r = dm_sm_get_nr_free(pmd->metadata_sm, result); - up_read(&pmd->root_lock); - - return r; -} - -int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, - dm_block_t *result) -{ - int r; - - down_read(&pmd->root_lock); - r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); - up_read(&pmd->root_lock); - - return r; -} - -int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) -{ - down_read(&pmd->root_lock); - *result = pmd->data_block_size; - up_read(&pmd->root_lock); - - return 0; -} - -int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) -{ - int r; - - down_read(&pmd->root_lock); - r = dm_sm_get_nr_blocks(pmd->data_sm, result); - up_read(&pmd->root_lock); - - return r; -} - -int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) -{ - struct dm_pool_metadata *pmd = td->pmd; - - down_read(&pmd->root_lock); - *result = td->mapped_blocks; - up_read(&pmd->root_lock); - - return 0; -} - -static int __highest_block(struct dm_thin_device *td, dm_block_t *result) -{ - int r; - __le64 value_le; - dm_block_t thin_root; - struct dm_pool_metadata *pmd = td->pmd; - - r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le); - if (r) - return r; - - thin_root = le64_to_cpu(value_le); - - return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result); -} - -int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, - dm_block_t *result) -{ - int r; - struct dm_pool_metadata *pmd = td->pmd; - - down_read(&pmd->root_lock); - r = __highest_block(td, result); - up_read(&pmd->root_lock); - - return r; -} - -static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) -{ - int r; - dm_block_t old_count; - - r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count); - if (r) - return r; - - if (new_count == old_count) - return 0; - - if (new_count < old_count) { - DMERR("cannot reduce size of data device"); - return -EINVAL; - } - - r = dm_sm_extend(pmd->data_sm, new_count - old_count); - if (!r) - pmd->need_commit = 1; - - return r; -} - -int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) -{ - int r; - - down_write(&pmd->root_lock); - r = __resize_data_dev(pmd, new_count); - up_write(&pmd->root_lock); - - return r; -} diff --git a/ANDROID_3.4.5/drivers/md/dm-thin-metadata.h b/ANDROID_3.4.5/drivers/md/dm-thin-metadata.h deleted file mode 100644 index ed4725e6..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-thin-metadata.h +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (C) 2010-2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef DM_THIN_METADATA_H -#define DM_THIN_METADATA_H - -#include "persistent-data/dm-block-manager.h" - -#define THIN_METADATA_BLOCK_SIZE 4096 - -/* - * The metadata device is currently limited in size. - * - * We have one block of index, which can hold 255 index entries. Each - * index entry contains allocation info about 16k metadata blocks. 
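For reference, the limit defined just below works out to 255 index entries * 16384 metadata blocks per entry * 8 sectors per 4KB metadata block = 33,423,360 sectors, i.e. just under 16 GiB of metadata, which is why the warning threshold that follows is 16GB. A minimal standalone sketch of that arithmetic (assuming the kernel's usual 512-byte sector, SECTOR_SHIFT == 9):

#include <stdio.h>

#define SECTOR_SHIFT 9
#define THIN_METADATA_BLOCK_SIZE 4096

int main(void)
{
	// 255 index entries * 16k metadata blocks per entry * 8 sectors per block
	unsigned long long max_sectors =
		255ULL * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT));

	printf("max sectors = %llu (~%.1f GiB)\n", max_sectors,
	       (double)(max_sectors << SECTOR_SHIFT) / (double)(1ULL << 30));
	return 0;
}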
- */ -#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) - -/* - * A metadata device larger than 16GB triggers a warning. - */ -#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) - -/*----------------------------------------------------------------*/ - -struct dm_pool_metadata; -struct dm_thin_device; - -/* - * Device identifier - */ -typedef uint64_t dm_thin_id; - -/* - * Reopens or creates a new, empty metadata volume. - */ -struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, - sector_t data_block_size); - -int dm_pool_metadata_close(struct dm_pool_metadata *pmd); - -/* - * Compat feature flags. Any incompat flags beyond the ones - * specified below will prevent use of the thin metadata. - */ -#define THIN_FEATURE_COMPAT_SUPP 0UL -#define THIN_FEATURE_COMPAT_RO_SUPP 0UL -#define THIN_FEATURE_INCOMPAT_SUPP 0UL - -/* - * Device creation/deletion. - */ -int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev); - -/* - * An internal snapshot. - * - * You can only snapshot a quiesced origin i.e. one that is either - * suspended or not instanced at all. - */ -int dm_pool_create_snap(struct dm_pool_metadata *pmd, dm_thin_id dev, - dm_thin_id origin); - -/* - * Deletes a virtual device from the metadata. It _is_ safe to call this - * when that device is open. Operations on that device will just start - * failing. You still need to call close() on the device. - */ -int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, - dm_thin_id dev); - -/* - * Commits _all_ metadata changes: device creation, deletion, mapping - * updates. - */ -int dm_pool_commit_metadata(struct dm_pool_metadata *pmd); - -/* - * Set/get userspace transaction id. - */ -int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, - uint64_t current_id, - uint64_t new_id); - -int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, - uint64_t *result); - -/* - * Hold/get root for userspace transaction. - */ -int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd); - -int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result); - -/* - * Actions on a single virtual device. - */ - -/* - * Opening the same device more than once will fail with -EBUSY. - */ -int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, - struct dm_thin_device **td); - -int dm_pool_close_thin_device(struct dm_thin_device *td); - -dm_thin_id dm_thin_dev_id(struct dm_thin_device *td); - -struct dm_thin_lookup_result { - dm_block_t block; - int shared; -}; - -/* - * Returns: - * -EWOULDBLOCK iff @can_block is set and would block. - * -ENODATA iff that mapping is not present. - * 0 success - */ -int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_block, struct dm_thin_lookup_result *result); - -/* - * Obtain an unused block. - */ -int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result); - -/* - * Insert or remove block. - */ -int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, - dm_block_t data_block); - -int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); - -/* - * Queries. 
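Taken together, the calls declared above are driven in a fairly fixed order by the pool target: open the metadata, create or open a thin device, allocate and insert data blocks, then commit. The sketch below only illustrates that order using the declarations in this header; it is not code from the driver, error unwinding is trimmed, and the device id 0 and block number 0 are made up:

static int example_provision_first_block(struct block_device *metadata_bdev,
					 sector_t data_block_size)
{
	struct dm_pool_metadata *pmd;
	struct dm_thin_device *td;
	dm_block_t data_block;
	int r;

	pmd = dm_pool_metadata_open(metadata_bdev, data_block_size);
	if (IS_ERR(pmd))
		return PTR_ERR(pmd);

	r = dm_pool_create_thin(pmd, 0);		// illustrative device id
	if (!r)
		r = dm_pool_open_thin_device(pmd, 0, &td);
	if (!r) {
		r = dm_pool_alloc_data_block(pmd, &data_block);
		if (!r)
			r = dm_thin_insert_block(td, 0, data_block);
		if (!r)
			r = dm_pool_commit_metadata(pmd);	// nothing is durable before this
		dm_pool_close_thin_device(td);
	}

	dm_pool_metadata_close(pmd);
	return r;
}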
- */ -int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, - dm_block_t *highest_mapped); - -int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result); - -int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, - dm_block_t *result); - -int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, - dm_block_t *result); - -int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, - dm_block_t *result); - -int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result); - -int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); - -/* - * Returns -ENOSPC if the new size is too small and already allocated - * blocks would be lost. - */ -int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/ANDROID_3.4.5/drivers/md/dm-thin.c b/ANDROID_3.4.5/drivers/md/dm-thin.c deleted file mode 100644 index eb3d138f..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-thin.c +++ /dev/null @@ -1,2774 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat UK. - * - * This file is released under the GPL. - */ - -#include "dm-thin-metadata.h" - -#include <linux/device-mapper.h> -#include <linux/dm-io.h> -#include <linux/dm-kcopyd.h> -#include <linux/list.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/slab.h> - -#define DM_MSG_PREFIX "thin" - -/* - * Tunable constants - */ -#define ENDIO_HOOK_POOL_SIZE 10240 -#define DEFERRED_SET_SIZE 64 -#define MAPPING_POOL_SIZE 1024 -#define PRISON_CELLS 1024 -#define COMMIT_PERIOD HZ - -/* - * The block size of the device holding pool data must be - * between 64KB and 1GB. - */ -#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT) -#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) - -/* - * Device id is restricted to 24 bits. - */ -#define MAX_DEV_ID ((1 << 24) - 1) - -/* - * How do we handle breaking sharing of data blocks? - * ================================================= - * - * We use a standard copy-on-write btree to store the mappings for the - * devices (note I'm talking about copy-on-write of the metadata here, not - * the data). When you take an internal snapshot you clone the root node - * of the origin btree. After this there is no concept of an origin or a - * snapshot. They are just two device trees that happen to point to the - * same data blocks. - * - * When we get a write in we decide if it's to a shared data block using - * some timestamp magic. If it is, we have to break sharing. - * - * Let's say we write to a shared block in what was the origin. The - * steps are: - * - * i) plug io further to this physical block. (see bio_prison code). - * - * ii) quiesce any read io to that shared data block. Obviously - * including all devices that share this block. (see deferred_set code) - * - * iii) copy the data block to a newly allocate block. This step can be - * missed out if the io covers the block. (schedule_copy). - * - * iv) insert the new mapping into the origin's btree - * (process_prepared_mapping). This act of inserting breaks some - * sharing of btree nodes between the two devices. Breaking sharing only - * effects the btree of that specific device. Btrees for the other - * devices that share the block never change. The btree for the origin - * device as it was after the last commit is untouched, ie. we're using - * persistent data structures in the functional programming sense. 
- * - * v) unplug io to this physical block, including the io that triggered - * the breaking of sharing. - * - * Steps (ii) and (iii) occur in parallel. - * - * The metadata _doesn't_ need to be committed before the io continues. We - * get away with this because the io is always written to a _new_ block. - * If there's a crash, then: - * - * - The origin mapping will point to the old origin block (the shared - * one). This will contain the data as it was before the io that triggered - * the breaking of sharing came in. - * - * - The snap mapping still points to the old block. As it would after - * the commit. - * - * The downside of this scheme is the timestamp magic isn't perfect, and - * will continue to think that data block in the snapshot device is shared - * even after the write to the origin has broken sharing. I suspect data - * blocks will typically be shared by many different devices, so we're - * breaking sharing n + 1 times, rather than n, where n is the number of - * devices that reference this data block. At the moment I think the - * benefits far, far outweigh the disadvantages. - */ - -/*----------------------------------------------------------------*/ - -/* - * Sometimes we can't deal with a bio straight away. We put them in prison - * where they can't cause any mischief. Bios are put in a cell identified - * by a key, multiple bios can be in the same cell. When the cell is - * subsequently unlocked the bios become available. - */ -struct bio_prison; - -struct cell_key { - int virtual; - dm_thin_id dev; - dm_block_t block; -}; - -struct cell { - struct hlist_node list; - struct bio_prison *prison; - struct cell_key key; - struct bio *holder; - struct bio_list bios; -}; - -struct bio_prison { - spinlock_t lock; - mempool_t *cell_pool; - - unsigned nr_buckets; - unsigned hash_mask; - struct hlist_head *cells; -}; - -static uint32_t calc_nr_buckets(unsigned nr_cells) -{ - uint32_t n = 128; - - nr_cells /= 4; - nr_cells = min(nr_cells, 8192u); - - while (n < nr_cells) - n <<= 1; - - return n; -} - -/* - * @nr_cells should be the number of cells you want in use _concurrently_. - * Don't confuse it with the number of distinct keys. 
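For example, the pool target below creates its prison with PRISON_CELLS (1024) cells, so calc_nr_buckets() divides that by four, caps the result at 8192 and rounds up to a power of two: n doubles from the initial 128 to 256, giving 256 hash buckets.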
- */ -static struct bio_prison *prison_create(unsigned nr_cells) -{ - unsigned i; - uint32_t nr_buckets = calc_nr_buckets(nr_cells); - size_t len = sizeof(struct bio_prison) + - (sizeof(struct hlist_head) * nr_buckets); - struct bio_prison *prison = kmalloc(len, GFP_KERNEL); - - if (!prison) - return NULL; - - spin_lock_init(&prison->lock); - prison->cell_pool = mempool_create_kmalloc_pool(nr_cells, - sizeof(struct cell)); - if (!prison->cell_pool) { - kfree(prison); - return NULL; - } - - prison->nr_buckets = nr_buckets; - prison->hash_mask = nr_buckets - 1; - prison->cells = (struct hlist_head *) (prison + 1); - for (i = 0; i < nr_buckets; i++) - INIT_HLIST_HEAD(prison->cells + i); - - return prison; -} - -static void prison_destroy(struct bio_prison *prison) -{ - mempool_destroy(prison->cell_pool); - kfree(prison); -} - -static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key) -{ - const unsigned long BIG_PRIME = 4294967291UL; - uint64_t hash = key->block * BIG_PRIME; - - return (uint32_t) (hash & prison->hash_mask); -} - -static int keys_equal(struct cell_key *lhs, struct cell_key *rhs) -{ - return (lhs->virtual == rhs->virtual) && - (lhs->dev == rhs->dev) && - (lhs->block == rhs->block); -} - -static struct cell *__search_bucket(struct hlist_head *bucket, - struct cell_key *key) -{ - struct cell *cell; - struct hlist_node *tmp; - - hlist_for_each_entry(cell, tmp, bucket, list) - if (keys_equal(&cell->key, key)) - return cell; - - return NULL; -} - -/* - * This may block if a new cell needs allocating. You must ensure that - * cells will be unlocked even if the calling thread is blocked. - * - * Returns 1 if the cell was already held, 0 if @inmate is the new holder. - */ -static int bio_detain(struct bio_prison *prison, struct cell_key *key, - struct bio *inmate, struct cell **ref) -{ - int r = 1; - unsigned long flags; - uint32_t hash = hash_key(prison, key); - struct cell *cell, *cell2; - - BUG_ON(hash > prison->nr_buckets); - - spin_lock_irqsave(&prison->lock, flags); - - cell = __search_bucket(prison->cells + hash, key); - if (cell) { - bio_list_add(&cell->bios, inmate); - goto out; - } - - /* - * Allocate a new cell - */ - spin_unlock_irqrestore(&prison->lock, flags); - cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); - spin_lock_irqsave(&prison->lock, flags); - - /* - * We've been unlocked, so we have to double check that - * nobody else has inserted this cell in the meantime. - */ - cell = __search_bucket(prison->cells + hash, key); - if (cell) { - mempool_free(cell2, prison->cell_pool); - bio_list_add(&cell->bios, inmate); - goto out; - } - - /* - * Use new cell. 
- */ - cell = cell2; - - cell->prison = prison; - memcpy(&cell->key, key, sizeof(cell->key)); - cell->holder = inmate; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, prison->cells + hash); - - r = 0; - -out: - spin_unlock_irqrestore(&prison->lock, flags); - - *ref = cell; - - return r; -} - -/* - * @inmates must have been initialised prior to this call - */ -static void __cell_release(struct cell *cell, struct bio_list *inmates) -{ - struct bio_prison *prison = cell->prison; - - hlist_del(&cell->list); - - if (inmates) { - bio_list_add(inmates, cell->holder); - bio_list_merge(inmates, &cell->bios); - } - - mempool_free(cell, prison->cell_pool); -} - -static void cell_release(struct cell *cell, struct bio_list *bios) -{ - unsigned long flags; - struct bio_prison *prison = cell->prison; - - spin_lock_irqsave(&prison->lock, flags); - __cell_release(cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); -} - -/* - * There are a couple of places where we put a bio into a cell briefly - * before taking it out again. In these situations we know that no other - * bio may be in the cell. This function releases the cell, and also does - * a sanity check. - */ -static void __cell_release_singleton(struct cell *cell, struct bio *bio) -{ - BUG_ON(cell->holder != bio); - BUG_ON(!bio_list_empty(&cell->bios)); - - __cell_release(cell, NULL); -} - -static void cell_release_singleton(struct cell *cell, struct bio *bio) -{ - unsigned long flags; - struct bio_prison *prison = cell->prison; - - spin_lock_irqsave(&prison->lock, flags); - __cell_release_singleton(cell, bio); - spin_unlock_irqrestore(&prison->lock, flags); -} - -/* - * Sometimes we don't want the holder, just the additional bios. - */ -static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) -{ - struct bio_prison *prison = cell->prison; - - hlist_del(&cell->list); - bio_list_merge(inmates, &cell->bios); - - mempool_free(cell, prison->cell_pool); -} - -static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) -{ - unsigned long flags; - struct bio_prison *prison = cell->prison; - - spin_lock_irqsave(&prison->lock, flags); - __cell_release_no_holder(cell, inmates); - spin_unlock_irqrestore(&prison->lock, flags); -} - -static void cell_error(struct cell *cell) -{ - struct bio_prison *prison = cell->prison; - struct bio_list bios; - struct bio *bio; - unsigned long flags; - - bio_list_init(&bios); - - spin_lock_irqsave(&prison->lock, flags); - __cell_release(cell, &bios); - spin_unlock_irqrestore(&prison->lock, flags); - - while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); -} - -/*----------------------------------------------------------------*/ - -/* - * We use the deferred set to keep track of pending reads to shared blocks. - * We do this to ensure the new mapping caused by a write isn't performed - * until these prior reads have completed. Otherwise the insertion of the - * new mapping could free the old block that the read bios are mapped to. 
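Concretely, in the code below a read of a shared block takes an entry with ds_inc(&pool->shared_read_ds) (see process_shared_bio()), and when a write later breaks sharing, schedule_copy() calls ds_add_work() on the same set: the new mapping is marked quiesced immediately only if no such reads are outstanding, otherwise it waits on the entry's work list until the completing reads drop their counts via ds_dec() and the sweep releases it.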
- */ - -struct deferred_set; -struct deferred_entry { - struct deferred_set *ds; - unsigned count; - struct list_head work_items; -}; - -struct deferred_set { - spinlock_t lock; - unsigned current_entry; - unsigned sweeper; - struct deferred_entry entries[DEFERRED_SET_SIZE]; -}; - -static void ds_init(struct deferred_set *ds) -{ - int i; - - spin_lock_init(&ds->lock); - ds->current_entry = 0; - ds->sweeper = 0; - for (i = 0; i < DEFERRED_SET_SIZE; i++) { - ds->entries[i].ds = ds; - ds->entries[i].count = 0; - INIT_LIST_HEAD(&ds->entries[i].work_items); - } -} - -static struct deferred_entry *ds_inc(struct deferred_set *ds) -{ - unsigned long flags; - struct deferred_entry *entry; - - spin_lock_irqsave(&ds->lock, flags); - entry = ds->entries + ds->current_entry; - entry->count++; - spin_unlock_irqrestore(&ds->lock, flags); - - return entry; -} - -static unsigned ds_next(unsigned index) -{ - return (index + 1) % DEFERRED_SET_SIZE; -} - -static void __sweep(struct deferred_set *ds, struct list_head *head) -{ - while ((ds->sweeper != ds->current_entry) && - !ds->entries[ds->sweeper].count) { - list_splice_init(&ds->entries[ds->sweeper].work_items, head); - ds->sweeper = ds_next(ds->sweeper); - } - - if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) - list_splice_init(&ds->entries[ds->sweeper].work_items, head); -} - -static void ds_dec(struct deferred_entry *entry, struct list_head *head) -{ - unsigned long flags; - - spin_lock_irqsave(&entry->ds->lock, flags); - BUG_ON(!entry->count); - --entry->count; - __sweep(entry->ds, head); - spin_unlock_irqrestore(&entry->ds->lock, flags); -} - -/* - * Returns 1 if deferred or 0 if no pending items to delay job. - */ -static int ds_add_work(struct deferred_set *ds, struct list_head *work) -{ - int r = 1; - unsigned long flags; - unsigned next_entry; - - spin_lock_irqsave(&ds->lock, flags); - if ((ds->sweeper == ds->current_entry) && - !ds->entries[ds->current_entry].count) - r = 0; - else { - list_add(work, &ds->entries[ds->current_entry].work_items); - next_entry = ds_next(ds->current_entry); - if (!ds->entries[next_entry].count) - ds->current_entry = next_entry; - } - spin_unlock_irqrestore(&ds->lock, flags); - - return r; -} - -/*----------------------------------------------------------------*/ - -/* - * Key building. - */ -static void build_data_key(struct dm_thin_device *td, - dm_block_t b, struct cell_key *key) -{ - key->virtual = 0; - key->dev = dm_thin_dev_id(td); - key->block = b; -} - -static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, - struct cell_key *key) -{ - key->virtual = 1; - key->dev = dm_thin_dev_id(td); - key->block = b; -} - -/*----------------------------------------------------------------*/ - -/* - * A pool device ties together a metadata device and a data device. It - * also provides the interface for creating and destroying internal - * devices. 
- */ -struct new_mapping; - -struct pool_features { - unsigned zero_new_blocks:1; - unsigned discard_enabled:1; - unsigned discard_passdown:1; -}; - -struct pool { - struct list_head list; - struct dm_target *ti; /* Only set if a pool target is bound */ - - struct mapped_device *pool_md; - struct block_device *md_dev; - struct dm_pool_metadata *pmd; - - uint32_t sectors_per_block; - unsigned block_shift; - dm_block_t offset_mask; - dm_block_t low_water_blocks; - - struct pool_features pf; - unsigned low_water_triggered:1; /* A dm event has been sent */ - unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ - - struct bio_prison *prison; - struct dm_kcopyd_client *copier; - - struct workqueue_struct *wq; - struct work_struct worker; - struct delayed_work waker; - - unsigned ref_count; - unsigned long last_commit_jiffies; - - spinlock_t lock; - struct bio_list deferred_bios; - struct bio_list deferred_flush_bios; - struct list_head prepared_mappings; - struct list_head prepared_discards; - - struct bio_list retry_on_resume_list; - - struct deferred_set shared_read_ds; - struct deferred_set all_io_ds; - - struct new_mapping *next_mapping; - mempool_t *mapping_pool; - mempool_t *endio_hook_pool; -}; - -/* - * Target context for a pool. - */ -struct pool_c { - struct dm_target *ti; - struct pool *pool; - struct dm_dev *data_dev; - struct dm_dev *metadata_dev; - struct dm_target_callbacks callbacks; - - dm_block_t low_water_blocks; - struct pool_features pf; -}; - -/* - * Target context for a thin. - */ -struct thin_c { - struct dm_dev *pool_dev; - struct dm_dev *origin_dev; - dm_thin_id dev_id; - - struct pool *pool; - struct dm_thin_device *td; -}; - -/*----------------------------------------------------------------*/ - -/* - * A global list of pools that uses a struct mapped_device as a key. 
- */ -static struct dm_thin_pool_table { - struct mutex mutex; - struct list_head pools; -} dm_thin_pool_table; - -static void pool_table_init(void) -{ - mutex_init(&dm_thin_pool_table.mutex); - INIT_LIST_HEAD(&dm_thin_pool_table.pools); -} - -static void __pool_table_insert(struct pool *pool) -{ - BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); - list_add(&pool->list, &dm_thin_pool_table.pools); -} - -static void __pool_table_remove(struct pool *pool) -{ - BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); - list_del(&pool->list); -} - -static struct pool *__pool_table_lookup(struct mapped_device *md) -{ - struct pool *pool = NULL, *tmp; - - BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); - - list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { - if (tmp->pool_md == md) { - pool = tmp; - break; - } - } - - return pool; -} - -static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) -{ - struct pool *pool = NULL, *tmp; - - BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); - - list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { - if (tmp->md_dev == md_dev) { - pool = tmp; - break; - } - } - - return pool; -} - -/*----------------------------------------------------------------*/ - -struct endio_hook { - struct thin_c *tc; - struct deferred_entry *shared_read_entry; - struct deferred_entry *all_io_entry; - struct new_mapping *overwrite_mapping; -}; - -static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) -{ - struct bio *bio; - struct bio_list bios; - - bio_list_init(&bios); - bio_list_merge(&bios, master); - bio_list_init(master); - - while ((bio = bio_list_pop(&bios))) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - if (h->tc == tc) - bio_endio(bio, DM_ENDIO_REQUEUE); - else - bio_list_add(master, bio); - } -} - -static void requeue_io(struct thin_c *tc) -{ - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&pool->lock, flags); - __requeue_bio_list(tc, &pool->deferred_bios); - __requeue_bio_list(tc, &pool->retry_on_resume_list); - spin_unlock_irqrestore(&pool->lock, flags); -} - -/* - * This section of code contains the logic for processing a thin device's IO. - * Much of the code depends on pool object resources (lists, workqueues, etc) - * but most is exclusively called from the thin target rather than the thin-pool - * target. - */ - -static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) -{ - return bio->bi_sector >> tc->pool->block_shift; -} - -static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) -{ - struct pool *pool = tc->pool; - - bio->bi_bdev = tc->pool_dev->bdev; - bio->bi_sector = (block << pool->block_shift) + - (bio->bi_sector & pool->offset_mask); -} - -static void remap_to_origin(struct thin_c *tc, struct bio *bio) -{ - bio->bi_bdev = tc->origin_dev->bdev; -} - -static void issue(struct thin_c *tc, struct bio *bio) -{ - struct pool *pool = tc->pool; - unsigned long flags; - - /* - * Batch together any FUA/FLUSH bios we find and then issue - * a single commit for them in process_deferred_bios(). 
- */ - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { - spin_lock_irqsave(&pool->lock, flags); - bio_list_add(&pool->deferred_flush_bios, bio); - spin_unlock_irqrestore(&pool->lock, flags); - } else - generic_make_request(bio); -} - -static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) -{ - remap_to_origin(tc, bio); - issue(tc, bio); -} - -static void remap_and_issue(struct thin_c *tc, struct bio *bio, - dm_block_t block) -{ - remap(tc, bio, block); - issue(tc, bio); -} - -/* - * wake_worker() is used when new work is queued and when pool_resume is - * ready to continue deferred IO processing. - */ -static void wake_worker(struct pool *pool) -{ - queue_work(pool->wq, &pool->worker); -} - -/*----------------------------------------------------------------*/ - -/* - * Bio endio functions. - */ -struct new_mapping { - struct list_head list; - - unsigned quiesced:1; - unsigned prepared:1; - unsigned pass_discard:1; - - struct thin_c *tc; - dm_block_t virt_block; - dm_block_t data_block; - struct cell *cell, *cell2; - int err; - - /* - * If the bio covers the whole area of a block then we can avoid - * zeroing or copying. Instead this bio is hooked. The bio will - * still be in the cell, so care has to be taken to avoid issuing - * the bio twice. - */ - struct bio *bio; - bio_end_io_t *saved_bi_end_io; -}; - -static void __maybe_add_mapping(struct new_mapping *m) -{ - struct pool *pool = m->tc->pool; - - if (m->quiesced && m->prepared) { - list_add(&m->list, &pool->prepared_mappings); - wake_worker(pool); - } -} - -static void copy_complete(int read_err, unsigned long write_err, void *context) -{ - unsigned long flags; - struct new_mapping *m = context; - struct pool *pool = m->tc->pool; - - m->err = read_err || write_err ? -EIO : 0; - - spin_lock_irqsave(&pool->lock, flags); - m->prepared = 1; - __maybe_add_mapping(m); - spin_unlock_irqrestore(&pool->lock, flags); -} - -static void overwrite_endio(struct bio *bio, int err) -{ - unsigned long flags; - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - struct new_mapping *m = h->overwrite_mapping; - struct pool *pool = m->tc->pool; - - m->err = err; - - spin_lock_irqsave(&pool->lock, flags); - m->prepared = 1; - __maybe_add_mapping(m); - spin_unlock_irqrestore(&pool->lock, flags); -} - -/*----------------------------------------------------------------*/ - -/* - * Workqueue. - */ - -/* - * Prepared mapping jobs. - */ - -/* - * This sends the bios in the cell back to the deferred_bios list. - */ -static void cell_defer(struct thin_c *tc, struct cell *cell, - dm_block_t data_block) -{ - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&pool->lock, flags); - cell_release(cell, &pool->deferred_bios); - spin_unlock_irqrestore(&tc->pool->lock, flags); - - wake_worker(pool); -} - -/* - * Same as cell_defer above, except it omits one particular detainee, - * a write bio that covers the block and has already been processed. 
- */ -static void cell_defer_except(struct thin_c *tc, struct cell *cell) -{ - struct bio_list bios; - struct pool *pool = tc->pool; - unsigned long flags; - - bio_list_init(&bios); - - spin_lock_irqsave(&pool->lock, flags); - cell_release_no_holder(cell, &pool->deferred_bios); - spin_unlock_irqrestore(&pool->lock, flags); - - wake_worker(pool); -} - -static void process_prepared_mapping(struct new_mapping *m) -{ - struct thin_c *tc = m->tc; - struct bio *bio; - int r; - - bio = m->bio; - if (bio) - bio->bi_end_io = m->saved_bi_end_io; - - if (m->err) { - cell_error(m->cell); - return; - } - - /* - * Commit the prepared block into the mapping btree. - * Any I/O for this block arriving after this point will get - * remapped to it directly. - */ - r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); - if (r) { - DMERR("dm_thin_insert_block() failed"); - cell_error(m->cell); - return; - } - - /* - * Release any bios held while the block was being provisioned. - * If we are processing a write bio that completely covers the block, - * we already processed it so can ignore it now when processing - * the bios in the cell. - */ - if (bio) { - cell_defer_except(tc, m->cell); - bio_endio(bio, 0); - } else - cell_defer(tc, m->cell, m->data_block); - - list_del(&m->list); - mempool_free(m, tc->pool->mapping_pool); -} - -static void process_prepared_discard(struct new_mapping *m) -{ - int r; - struct thin_c *tc = m->tc; - - r = dm_thin_remove_block(tc->td, m->virt_block); - if (r) - DMERR("dm_thin_remove_block() failed"); - - /* - * Pass the discard down to the underlying device? - */ - if (m->pass_discard) - remap_and_issue(tc, m->bio, m->data_block); - else - bio_endio(m->bio, 0); - - cell_defer_except(tc, m->cell); - cell_defer_except(tc, m->cell2); - mempool_free(m, tc->pool->mapping_pool); -} - -static void process_prepared(struct pool *pool, struct list_head *head, - void (*fn)(struct new_mapping *)) -{ - unsigned long flags; - struct list_head maps; - struct new_mapping *m, *tmp; - - INIT_LIST_HEAD(&maps); - spin_lock_irqsave(&pool->lock, flags); - list_splice_init(head, &maps); - spin_unlock_irqrestore(&pool->lock, flags); - - list_for_each_entry_safe(m, tmp, &maps, list) - fn(m); -} - -/* - * Deferred bio jobs. - */ -static int io_overlaps_block(struct pool *pool, struct bio *bio) -{ - return !(bio->bi_sector & pool->offset_mask) && - (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); - -} - -static int io_overwrites_block(struct pool *pool, struct bio *bio) -{ - return (bio_data_dir(bio) == WRITE) && - io_overlaps_block(pool, bio); -} - -static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, - bio_end_io_t *fn) -{ - *save = bio->bi_end_io; - bio->bi_end_io = fn; -} - -static int ensure_next_mapping(struct pool *pool) -{ - if (pool->next_mapping) - return 0; - - pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); - - return pool->next_mapping ? 
0 : -ENOMEM; -} - -static struct new_mapping *get_next_mapping(struct pool *pool) -{ - struct new_mapping *r = pool->next_mapping; - - BUG_ON(!pool->next_mapping); - - pool->next_mapping = NULL; - - return r; -} - -static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, - struct dm_dev *origin, dm_block_t data_origin, - dm_block_t data_dest, - struct cell *cell, struct bio *bio) -{ - int r; - struct pool *pool = tc->pool; - struct new_mapping *m = get_next_mapping(pool); - - INIT_LIST_HEAD(&m->list); - m->quiesced = 0; - m->prepared = 0; - m->tc = tc; - m->virt_block = virt_block; - m->data_block = data_dest; - m->cell = cell; - m->err = 0; - m->bio = NULL; - - if (!ds_add_work(&pool->shared_read_ds, &m->list)) - m->quiesced = 1; - - /* - * IO to pool_dev remaps to the pool target's data_dev. - * - * If the whole block of data is being overwritten, we can issue the - * bio immediately. Otherwise we use kcopyd to clone the data first. - */ - if (io_overwrites_block(pool, bio)) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - remap_and_issue(tc, bio, data_dest); - } else { - struct dm_io_region from, to; - - from.bdev = origin->bdev; - from.sector = data_origin * pool->sectors_per_block; - from.count = pool->sectors_per_block; - - to.bdev = tc->pool_dev->bdev; - to.sector = data_dest * pool->sectors_per_block; - to.count = pool->sectors_per_block; - - r = dm_kcopyd_copy(pool->copier, &from, 1, &to, - 0, copy_complete, m); - if (r < 0) { - mempool_free(m, pool->mapping_pool); - DMERR("dm_kcopyd_copy() failed"); - cell_error(cell); - } - } -} - -static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_origin, dm_block_t data_dest, - struct cell *cell, struct bio *bio) -{ - schedule_copy(tc, virt_block, tc->pool_dev, - data_origin, data_dest, cell, bio); -} - -static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_dest, - struct cell *cell, struct bio *bio) -{ - schedule_copy(tc, virt_block, tc->origin_dev, - virt_block, data_dest, cell, bio); -} - -static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_block, struct cell *cell, - struct bio *bio) -{ - struct pool *pool = tc->pool; - struct new_mapping *m = get_next_mapping(pool); - - INIT_LIST_HEAD(&m->list); - m->quiesced = 1; - m->prepared = 0; - m->tc = tc; - m->virt_block = virt_block; - m->data_block = data_block; - m->cell = cell; - m->err = 0; - m->bio = NULL; - - /* - * If the whole block of data is being overwritten or we are not - * zeroing pre-existing data, we can issue the bio immediately. - * Otherwise we use kcopyd to zero the data first. 
- */ - if (!pool->pf.zero_new_blocks) - process_prepared_mapping(m); - - else if (io_overwrites_block(pool, bio)) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - remap_and_issue(tc, bio, data_block); - - } else { - int r; - struct dm_io_region to; - - to.bdev = tc->pool_dev->bdev; - to.sector = data_block * pool->sectors_per_block; - to.count = pool->sectors_per_block; - - r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); - if (r < 0) { - mempool_free(m, pool->mapping_pool); - DMERR("dm_kcopyd_zero() failed"); - cell_error(cell); - } - } -} - -static int alloc_data_block(struct thin_c *tc, dm_block_t *result) -{ - int r; - dm_block_t free_blocks; - unsigned long flags; - struct pool *pool = tc->pool; - - r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); - if (r) - return r; - - if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { - DMWARN("%s: reached low water mark, sending event.", - dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); - pool->low_water_triggered = 1; - spin_unlock_irqrestore(&pool->lock, flags); - dm_table_event(pool->ti->table); - } - - if (!free_blocks) { - if (pool->no_free_space) - return -ENOSPC; - else { - /* - * Try to commit to see if that will free up some - * more space. - */ - r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - return r; - } - - r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); - if (r) - return r; - - /* - * If we still have no space we set a flag to avoid - * doing all this checking and return -ENOSPC. - */ - if (!free_blocks) { - DMWARN("%s: no free space available.", - dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); - pool->no_free_space = 1; - spin_unlock_irqrestore(&pool->lock, flags); - return -ENOSPC; - } - } - } - - r = dm_pool_alloc_data_block(pool->pmd, result); - if (r) - return r; - - return 0; -} - -/* - * If we have run out of space, queue bios until the device is - * resumed, presumably after having been reloaded with more space. - */ -static void retry_on_resume(struct bio *bio) -{ - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - struct thin_c *tc = h->tc; - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&pool->lock, flags); - bio_list_add(&pool->retry_on_resume_list, bio); - spin_unlock_irqrestore(&pool->lock, flags); -} - -static void no_space(struct cell *cell) -{ - struct bio *bio; - struct bio_list bios; - - bio_list_init(&bios); - cell_release(cell, &bios); - - while ((bio = bio_list_pop(&bios))) - retry_on_resume(bio); -} - -static void process_discard(struct thin_c *tc, struct bio *bio) -{ - int r; - unsigned long flags; - struct pool *pool = tc->pool; - struct cell *cell, *cell2; - struct cell_key key, key2; - dm_block_t block = get_bio_block(tc, bio); - struct dm_thin_lookup_result lookup_result; - struct new_mapping *m; - - build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool->prison, &key, bio, &cell)) - return; - - r = dm_thin_find_block(tc->td, block, 1, &lookup_result); - switch (r) { - case 0: - /* - * Check nobody is fiddling with this pool block. This can - * happen if someone's in the process of breaking sharing - * on this block. 
- */ - build_data_key(tc->td, lookup_result.block, &key2); - if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { - cell_release_singleton(cell, bio); - break; - } - - if (io_overlaps_block(pool, bio)) { - /* - * IO may still be going to the destination block. We must - * quiesce before we can do the removal. - */ - m = get_next_mapping(pool); - m->tc = tc; - m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; - m->virt_block = block; - m->data_block = lookup_result.block; - m->cell = cell; - m->cell2 = cell2; - m->err = 0; - m->bio = bio; - - if (!ds_add_work(&pool->all_io_ds, &m->list)) { - spin_lock_irqsave(&pool->lock, flags); - list_add(&m->list, &pool->prepared_discards); - spin_unlock_irqrestore(&pool->lock, flags); - wake_worker(pool); - } - } else { - /* - * This path is hit if people are ignoring - * limits->discard_granularity. It ignores any - * part of the discard that is in a subsequent - * block. - */ - sector_t offset = bio->bi_sector - (block << pool->block_shift); - unsigned remaining = (pool->sectors_per_block - offset) << 9; - bio->bi_size = min(bio->bi_size, remaining); - - cell_release_singleton(cell, bio); - cell_release_singleton(cell2, bio); - remap_and_issue(tc, bio, lookup_result.block); - } - break; - - case -ENODATA: - /* - * It isn't provisioned, just forget it. - */ - cell_release_singleton(cell, bio); - bio_endio(bio, 0); - break; - - default: - DMERR("discard: find block unexpectedly returned %d", r); - cell_release_singleton(cell, bio); - bio_io_error(bio); - break; - } -} - -static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct cell_key *key, - struct dm_thin_lookup_result *lookup_result, - struct cell *cell) -{ - int r; - dm_block_t data_block; - - r = alloc_data_block(tc, &data_block); - switch (r) { - case 0: - schedule_internal_copy(tc, block, lookup_result->block, - data_block, cell, bio); - break; - - case -ENOSPC: - no_space(cell); - break; - - default: - DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); - cell_error(cell); - break; - } -} - -static void process_shared_bio(struct thin_c *tc, struct bio *bio, - dm_block_t block, - struct dm_thin_lookup_result *lookup_result) -{ - struct cell *cell; - struct pool *pool = tc->pool; - struct cell_key key; - - /* - * If cell is already occupied, then sharing is already in the process - * of being broken so we have nothing further to do here. - */ - build_data_key(tc->td, lookup_result->block, &key); - if (bio_detain(pool->prison, &key, bio, &cell)) - return; - - if (bio_data_dir(bio) == WRITE) - break_sharing(tc, bio, block, &key, lookup_result, cell); - else { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - - h->shared_read_entry = ds_inc(&pool->shared_read_ds); - - cell_release_singleton(cell, bio); - remap_and_issue(tc, bio, lookup_result->block); - } -} - -static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct cell *cell) -{ - int r; - dm_block_t data_block; - - /* - * Remap empty bios (flushes) immediately, without provisioning. - */ - if (!bio->bi_size) { - cell_release_singleton(cell, bio); - remap_and_issue(tc, bio, 0); - return; - } - - /* - * Fill read bios with zeroes and complete them immediately. 
- */ - if (bio_data_dir(bio) == READ) { - zero_fill_bio(bio); - cell_release_singleton(cell, bio); - bio_endio(bio, 0); - return; - } - - r = alloc_data_block(tc, &data_block); - switch (r) { - case 0: - if (tc->origin_dev) - schedule_external_copy(tc, block, data_block, cell, bio); - else - schedule_zero(tc, block, data_block, cell, bio); - break; - - case -ENOSPC: - no_space(cell); - break; - - default: - DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); - cell_error(cell); - break; - } -} - -static void process_bio(struct thin_c *tc, struct bio *bio) -{ - int r; - dm_block_t block = get_bio_block(tc, bio); - struct cell *cell; - struct cell_key key; - struct dm_thin_lookup_result lookup_result; - - /* - * If cell is already occupied, then the block is already - * being provisioned so we have nothing further to do here. - */ - build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool->prison, &key, bio, &cell)) - return; - - r = dm_thin_find_block(tc->td, block, 1, &lookup_result); - switch (r) { - case 0: - /* - * We can release this cell now. This thread is the only - * one that puts bios into a cell, and we know there were - * no preceding bios. - */ - /* - * TODO: this will probably have to change when discard goes - * back in. - */ - cell_release_singleton(cell, bio); - - if (lookup_result.shared) - process_shared_bio(tc, bio, block, &lookup_result); - else - remap_and_issue(tc, bio, lookup_result.block); - break; - - case -ENODATA: - if (bio_data_dir(bio) == READ && tc->origin_dev) { - cell_release_singleton(cell, bio); - remap_to_origin_and_issue(tc, bio); - } else - provision_block(tc, bio, block, cell); - break; - - default: - DMERR("dm_thin_find_block() failed, error = %d", r); - cell_release_singleton(cell, bio); - bio_io_error(bio); - break; - } -} - -static int need_commit_due_to_time(struct pool *pool) -{ - return jiffies < pool->last_commit_jiffies || - jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; -} - -static void process_deferred_bios(struct pool *pool) -{ - unsigned long flags; - struct bio *bio; - struct bio_list bios; - int r; - - bio_list_init(&bios); - - spin_lock_irqsave(&pool->lock, flags); - bio_list_merge(&bios, &pool->deferred_bios); - bio_list_init(&pool->deferred_bios); - spin_unlock_irqrestore(&pool->lock, flags); - - while ((bio = bio_list_pop(&bios))) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - struct thin_c *tc = h->tc; - - /* - * If we've got no free new_mapping structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. - */ - if (ensure_next_mapping(pool)) { - spin_lock_irqsave(&pool->lock, flags); - bio_list_merge(&pool->deferred_bios, &bios); - spin_unlock_irqrestore(&pool->lock, flags); - - break; - } - - if (bio->bi_rw & REQ_DISCARD) - process_discard(tc, bio); - else - process_bio(tc, bio); - } - - /* - * If there are any deferred flush bios, we must commit - * the metadata before issuing them. 
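Completing a FLUSH/FUA bio promises durability to the layer above, so any block mappings created since the last commit must reach the metadata device before these bios are allowed to finish.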
- */ - bio_list_init(&bios); - spin_lock_irqsave(&pool->lock, flags); - bio_list_merge(&bios, &pool->deferred_flush_bios); - bio_list_init(&pool->deferred_flush_bios); - spin_unlock_irqrestore(&pool->lock, flags); - - if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) - return; - - r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); - return; - } - pool->last_commit_jiffies = jiffies; - - while ((bio = bio_list_pop(&bios))) - generic_make_request(bio); -} - -static void do_worker(struct work_struct *ws) -{ - struct pool *pool = container_of(ws, struct pool, worker); - - process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); - process_prepared(pool, &pool->prepared_discards, process_prepared_discard); - process_deferred_bios(pool); -} - -/* - * We want to commit periodically so that not too much - * unwritten data builds up. - */ -static void do_waker(struct work_struct *ws) -{ - struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); - wake_worker(pool); - queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); -} - -/*----------------------------------------------------------------*/ - -/* - * Mapping functions. - */ - -/* - * Called only while mapping a thin bio to hand it over to the workqueue. - */ -static void thin_defer_bio(struct thin_c *tc, struct bio *bio) -{ - unsigned long flags; - struct pool *pool = tc->pool; - - spin_lock_irqsave(&pool->lock, flags); - bio_list_add(&pool->deferred_bios, bio); - spin_unlock_irqrestore(&pool->lock, flags); - - wake_worker(pool); -} - -static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) -{ - struct pool *pool = tc->pool; - struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); - - h->tc = tc; - h->shared_read_entry = NULL; - h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); - h->overwrite_mapping = NULL; - - return h; -} - -/* - * Non-blocking function called from the thin target's map function. - */ -static int thin_bio_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - int r; - struct thin_c *tc = ti->private; - dm_block_t block = get_bio_block(tc, bio); - struct dm_thin_device *td = tc->td; - struct dm_thin_lookup_result result; - - map_context->ptr = thin_hook_bio(tc, bio); - if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { - thin_defer_bio(tc, bio); - return DM_MAPIO_SUBMITTED; - } - - r = dm_thin_find_block(td, block, 0, &result); - - /* - * Note that we defer readahead too. - */ - switch (r) { - case 0: - if (unlikely(result.shared)) { - /* - * We have a race condition here between the - * result.shared value returned by the lookup and - * snapshot creation, which may cause new - * sharing. - * - * To avoid this always quiesce the origin before - * taking the snap. You want to do this anyway to - * ensure a consistent application view - * (i.e. lockfs). - * - * More distant ancestors are irrelevant. The - * shared flag will be set in their case. - */ - thin_defer_bio(tc, bio); - r = DM_MAPIO_SUBMITTED; - } else { - remap(tc, bio, result.block); - r = DM_MAPIO_REMAPPED; - } - break; - - case -ENODATA: - /* - * In future, the failed dm_thin_find_block above could - * provide the hint to load the metadata into cache. 
- */ - case -EWOULDBLOCK: - thin_defer_bio(tc, bio); - r = DM_MAPIO_SUBMITTED; - break; - } - - return r; -} - -static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) -{ - int r; - unsigned long flags; - struct pool_c *pt = container_of(cb, struct pool_c, callbacks); - - spin_lock_irqsave(&pt->pool->lock, flags); - r = !bio_list_empty(&pt->pool->retry_on_resume_list); - spin_unlock_irqrestore(&pt->pool->lock, flags); - - if (!r) { - struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - r = bdi_congested(&q->backing_dev_info, bdi_bits); - } - - return r; -} - -static void __requeue_bios(struct pool *pool) -{ - bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); - bio_list_init(&pool->retry_on_resume_list); -} - -/*---------------------------------------------------------------- - * Binding of control targets to a pool object - *--------------------------------------------------------------*/ -static int bind_control_target(struct pool *pool, struct dm_target *ti) -{ - struct pool_c *pt = ti->private; - - pool->ti = ti; - pool->low_water_blocks = pt->low_water_blocks; - pool->pf = pt->pf; - - /* - * If discard_passdown was enabled verify that the data device - * supports discards. Disable discard_passdown if not; otherwise - * -EOPNOTSUPP will be returned. - */ - if (pt->pf.discard_passdown) { - struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - if (!q || !blk_queue_discard(q)) { - char buf[BDEVNAME_SIZE]; - DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", - bdevname(pt->data_dev->bdev, buf)); - pool->pf.discard_passdown = 0; - } - } - - return 0; -} - -static void unbind_control_target(struct pool *pool, struct dm_target *ti) -{ - if (pool->ti == ti) - pool->ti = NULL; -} - -/*---------------------------------------------------------------- - * Pool creation - *--------------------------------------------------------------*/ -/* Initialize pool features. 
*/ -static void pool_features_init(struct pool_features *pf) -{ - pf->zero_new_blocks = 1; - pf->discard_enabled = 1; - pf->discard_passdown = 1; -} - -static void __pool_destroy(struct pool *pool) -{ - __pool_table_remove(pool); - - if (dm_pool_metadata_close(pool->pmd) < 0) - DMWARN("%s: dm_pool_metadata_close() failed.", __func__); - - prison_destroy(pool->prison); - dm_kcopyd_client_destroy(pool->copier); - - if (pool->wq) - destroy_workqueue(pool->wq); - - if (pool->next_mapping) - mempool_free(pool->next_mapping, pool->mapping_pool); - mempool_destroy(pool->mapping_pool); - mempool_destroy(pool->endio_hook_pool); - kfree(pool); -} - -static struct pool *pool_create(struct mapped_device *pool_md, - struct block_device *metadata_dev, - unsigned long block_size, char **error) -{ - int r; - void *err_p; - struct pool *pool; - struct dm_pool_metadata *pmd; - - pmd = dm_pool_metadata_open(metadata_dev, block_size); - if (IS_ERR(pmd)) { - *error = "Error creating metadata object"; - return (struct pool *)pmd; - } - - pool = kmalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - *error = "Error allocating memory for pool"; - err_p = ERR_PTR(-ENOMEM); - goto bad_pool; - } - - pool->pmd = pmd; - pool->sectors_per_block = block_size; - pool->block_shift = ffs(block_size) - 1; - pool->offset_mask = block_size - 1; - pool->low_water_blocks = 0; - pool_features_init(&pool->pf); - pool->prison = prison_create(PRISON_CELLS); - if (!pool->prison) { - *error = "Error creating pool's bio prison"; - err_p = ERR_PTR(-ENOMEM); - goto bad_prison; - } - - pool->copier = dm_kcopyd_client_create(); - if (IS_ERR(pool->copier)) { - r = PTR_ERR(pool->copier); - *error = "Error creating pool's kcopyd client"; - err_p = ERR_PTR(r); - goto bad_kcopyd_client; - } - - /* - * Create singlethreaded workqueue that will service all devices - * that use this metadata. 
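/*
 * [Editor's sketch, not part of the driver] pool_create() above derives
 * block_shift and offset_mask from the power-of-two block size. A
 * hypothetical user-space illustration of how a bio sector splits into a
 * virtual block number plus an in-block offset, and how the offset is
 * re-applied to the data block returned by the metadata lookup (all
 * numbers are made up):
 */
#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
        unsigned long sectors_per_block = 128;  /* 64KiB data blocks */
        unsigned block_shift = ffs((int)sectors_per_block) - 1;   /* 7 */
        unsigned long offset_mask = sectors_per_block - 1;

        unsigned long bi_sector = 1000;         /* incoming bio sector   */
        unsigned long virt_block = bi_sector >> block_shift;      /* 7 */
        unsigned long data_block = 42;          /* pretend lookup result */
        unsigned long remapped = (data_block << block_shift) |
                                 (bi_sector & offset_mask);       /* 5480 */

        printf("virtual block %lu -> data sector %lu\n", virt_block, remapped);
        return 0;
}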
- */ - pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); - if (!pool->wq) { - *error = "Error creating pool's workqueue"; - err_p = ERR_PTR(-ENOMEM); - goto bad_wq; - } - - INIT_WORK(&pool->worker, do_worker); - INIT_DELAYED_WORK(&pool->waker, do_waker); - spin_lock_init(&pool->lock); - bio_list_init(&pool->deferred_bios); - bio_list_init(&pool->deferred_flush_bios); - INIT_LIST_HEAD(&pool->prepared_mappings); - INIT_LIST_HEAD(&pool->prepared_discards); - pool->low_water_triggered = 0; - pool->no_free_space = 0; - bio_list_init(&pool->retry_on_resume_list); - ds_init(&pool->shared_read_ds); - ds_init(&pool->all_io_ds); - - pool->next_mapping = NULL; - pool->mapping_pool = - mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping)); - if (!pool->mapping_pool) { - *error = "Error creating pool's mapping mempool"; - err_p = ERR_PTR(-ENOMEM); - goto bad_mapping_pool; - } - - pool->endio_hook_pool = - mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook)); - if (!pool->endio_hook_pool) { - *error = "Error creating pool's endio_hook mempool"; - err_p = ERR_PTR(-ENOMEM); - goto bad_endio_hook_pool; - } - pool->ref_count = 1; - pool->last_commit_jiffies = jiffies; - pool->pool_md = pool_md; - pool->md_dev = metadata_dev; - __pool_table_insert(pool); - - return pool; - -bad_endio_hook_pool: - mempool_destroy(pool->mapping_pool); -bad_mapping_pool: - destroy_workqueue(pool->wq); -bad_wq: - dm_kcopyd_client_destroy(pool->copier); -bad_kcopyd_client: - prison_destroy(pool->prison); -bad_prison: - kfree(pool); -bad_pool: - if (dm_pool_metadata_close(pmd)) - DMWARN("%s: dm_pool_metadata_close() failed.", __func__); - - return err_p; -} - -static void __pool_inc(struct pool *pool) -{ - BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); - pool->ref_count++; -} - -static void __pool_dec(struct pool *pool) -{ - BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); - BUG_ON(!pool->ref_count); - if (!--pool->ref_count) - __pool_destroy(pool); -} - -static struct pool *__pool_find(struct mapped_device *pool_md, - struct block_device *metadata_dev, - unsigned long block_size, char **error, - int *created) -{ - struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); - - if (pool) { - if (pool->pool_md != pool_md) - return ERR_PTR(-EBUSY); - __pool_inc(pool); - - } else { - pool = __pool_table_lookup(pool_md); - if (pool) { - if (pool->md_dev != metadata_dev) - return ERR_PTR(-EINVAL); - __pool_inc(pool); - - } else { - pool = pool_create(pool_md, metadata_dev, block_size, error); - *created = 1; - } - } - - return pool; -} - -/*---------------------------------------------------------------- - * Pool target methods - *--------------------------------------------------------------*/ -static void pool_dtr(struct dm_target *ti) -{ - struct pool_c *pt = ti->private; - - mutex_lock(&dm_thin_pool_table.mutex); - - unbind_control_target(pt->pool, ti); - __pool_dec(pt->pool); - dm_put_device(ti, pt->metadata_dev); - dm_put_device(ti, pt->data_dev); - kfree(pt); - - mutex_unlock(&dm_thin_pool_table.mutex); -} - -static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, - struct dm_target *ti) -{ - int r; - unsigned argc; - const char *arg_name; - - static struct dm_arg _args[] = { - {0, 3, "Invalid number of pool feature arguments"}, - }; - - /* - * No feature arguments supplied. 
- */ - if (!as->argc) - return 0; - - r = dm_read_arg_group(_args, as, &argc, &ti->error); - if (r) - return -EINVAL; - - while (argc && !r) { - arg_name = dm_shift_arg(as); - argc--; - - if (!strcasecmp(arg_name, "skip_block_zeroing")) { - pf->zero_new_blocks = 0; - continue; - } else if (!strcasecmp(arg_name, "ignore_discard")) { - pf->discard_enabled = 0; - continue; - } else if (!strcasecmp(arg_name, "no_discard_passdown")) { - pf->discard_passdown = 0; - continue; - } - - ti->error = "Unrecognised pool feature requested"; - r = -EINVAL; - } - - return r; -} - -/* - * thin-pool <metadata dev> <data dev> - * <data block size (sectors)> - * <low water mark (blocks)> - * [<#feature args> [<arg>]*] - * - * Optional feature arguments are: - * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. - * ignore_discard: disable discard - * no_discard_passdown: don't pass discards down to the data device - */ -static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) -{ - int r, pool_created = 0; - struct pool_c *pt; - struct pool *pool; - struct pool_features pf; - struct dm_arg_set as; - struct dm_dev *data_dev; - unsigned long block_size; - dm_block_t low_water_blocks; - struct dm_dev *metadata_dev; - sector_t metadata_dev_size; - char b[BDEVNAME_SIZE]; - - /* - * FIXME Remove validation from scope of lock. - */ - mutex_lock(&dm_thin_pool_table.mutex); - - if (argc < 4) { - ti->error = "Invalid argument count"; - r = -EINVAL; - goto out_unlock; - } - as.argc = argc; - as.argv = argv; - - r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); - if (r) { - ti->error = "Error opening metadata block device"; - goto out_unlock; - } - - metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; - if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) - DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", - bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); - - r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); - if (r) { - ti->error = "Error getting data device"; - goto out_metadata; - } - - if (kstrtoul(argv[2], 10, &block_size) || !block_size || - block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || - block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || - !is_power_of_2(block_size)) { - ti->error = "Invalid block size"; - r = -EINVAL; - goto out; - } - - if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { - ti->error = "Invalid low water mark"; - r = -EINVAL; - goto out; - } - - /* - * Set default pool features. - */ - pool_features_init(&pf); - - dm_consume_args(&as, 4); - r = parse_pool_features(&as, &pf, ti); - if (r) - goto out; - - pt = kzalloc(sizeof(*pt), GFP_KERNEL); - if (!pt) { - r = -ENOMEM; - goto out; - } - - pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, - block_size, &ti->error, &pool_created); - if (IS_ERR(pool)) { - r = PTR_ERR(pool); - goto out_free_pt; - } - - /* - * 'pool_created' reflects whether this is the first table load. - * Top level discard support is not allowed to be changed after - * initial load. This would require a pool reload to trigger thin - * device changes. 
- */ - if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { - ti->error = "Discard support cannot be disabled once enabled"; - r = -EINVAL; - goto out_flags_changed; - } - - pt->pool = pool; - pt->ti = ti; - pt->metadata_dev = metadata_dev; - pt->data_dev = data_dev; - pt->low_water_blocks = low_water_blocks; - pt->pf = pf; - ti->num_flush_requests = 1; - /* - * Only need to enable discards if the pool should pass - * them down to the data device. The thin device's discard - * processing will cause mappings to be removed from the btree. - */ - if (pf.discard_enabled && pf.discard_passdown) { - ti->num_discard_requests = 1; - /* - * Setting 'discards_supported' circumvents the normal - * stacking of discard limits (this keeps the pool and - * thin devices' discard limits consistent). - */ - ti->discards_supported = 1; - } - ti->private = pt; - - pt->callbacks.congested_fn = pool_is_congested; - dm_table_add_target_callbacks(ti->table, &pt->callbacks); - - mutex_unlock(&dm_thin_pool_table.mutex); - - return 0; - -out_flags_changed: - __pool_dec(pool); -out_free_pt: - kfree(pt); -out: - dm_put_device(ti, data_dev); -out_metadata: - dm_put_device(ti, metadata_dev); -out_unlock: - mutex_unlock(&dm_thin_pool_table.mutex); - - return r; -} - -static int pool_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - int r; - struct pool_c *pt = ti->private; - struct pool *pool = pt->pool; - unsigned long flags; - - /* - * As this is a singleton target, ti->begin is always zero. - */ - spin_lock_irqsave(&pool->lock, flags); - bio->bi_bdev = pt->data_dev->bdev; - r = DM_MAPIO_REMAPPED; - spin_unlock_irqrestore(&pool->lock, flags); - - return r; -} - -/* - * Retrieves the number of blocks of the data device from - * the superblock and compares it to the actual device size, - * thus resizing the data device in case it has grown. - * - * This both copes with opening preallocated data devices in the ctr - * being followed by a resume - * -and- - * calling the resume method individually after userspace has - * grown the data device in reaction to a table event. - */ -static int pool_preresume(struct dm_target *ti) -{ - int r; - struct pool_c *pt = ti->private; - struct pool *pool = pt->pool; - dm_block_t data_size, sb_data_size; - - /* - * Take control of the pool object. 
- */ - r = bind_control_target(pool, ti); - if (r) - return r; - - data_size = ti->len >> pool->block_shift; - r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); - if (r) { - DMERR("failed to retrieve data device size"); - return r; - } - - if (data_size < sb_data_size) { - DMERR("pool target too small, is %llu blocks (expected %llu)", - data_size, sb_data_size); - return -EINVAL; - - } else if (data_size > sb_data_size) { - r = dm_pool_resize_data_dev(pool->pmd, data_size); - if (r) { - DMERR("failed to resize data device"); - return r; - } - - r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - return r; - } - } - - return 0; -} - -static void pool_resume(struct dm_target *ti) -{ - struct pool_c *pt = ti->private; - struct pool *pool = pt->pool; - unsigned long flags; - - spin_lock_irqsave(&pool->lock, flags); - pool->low_water_triggered = 0; - pool->no_free_space = 0; - __requeue_bios(pool); - spin_unlock_irqrestore(&pool->lock, flags); - - do_waker(&pool->waker.work); -} - -static void pool_postsuspend(struct dm_target *ti) -{ - int r; - struct pool_c *pt = ti->private; - struct pool *pool = pt->pool; - - cancel_delayed_work(&pool->waker); - flush_workqueue(pool->wq); - - r = dm_pool_commit_metadata(pool->pmd); - if (r < 0) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/ - } -} - -static int check_arg_count(unsigned argc, unsigned args_required) -{ - if (argc != args_required) { - DMWARN("Message received with %u arguments instead of %u.", - argc, args_required); - return -EINVAL; - } - - return 0; -} - -static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) -{ - if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && - *dev_id <= MAX_DEV_ID) - return 0; - - if (warning) - DMWARN("Message received with invalid device id: %s", arg); - - return -EINVAL; -} - -static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) -{ - dm_thin_id dev_id; - int r; - - r = check_arg_count(argc, 2); - if (r) - return r; - - r = read_dev_id(argv[1], &dev_id, 1); - if (r) - return r; - - r = dm_pool_create_thin(pool->pmd, dev_id); - if (r) { - DMWARN("Creation of new thinly-provisioned device with id %s failed.", - argv[1]); - return r; - } - - return 0; -} - -static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) -{ - dm_thin_id dev_id; - dm_thin_id origin_dev_id; - int r; - - r = check_arg_count(argc, 3); - if (r) - return r; - - r = read_dev_id(argv[1], &dev_id, 1); - if (r) - return r; - - r = read_dev_id(argv[2], &origin_dev_id, 1); - if (r) - return r; - - r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); - if (r) { - DMWARN("Creation of new snapshot %s of device %s failed.", - argv[1], argv[2]); - return r; - } - - return 0; -} - -static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) -{ - dm_thin_id dev_id; - int r; - - r = check_arg_count(argc, 2); - if (r) - return r; - - r = read_dev_id(argv[1], &dev_id, 1); - if (r) - return r; - - r = dm_pool_delete_thin_device(pool->pmd, dev_id); - if (r) - DMWARN("Deletion of thin device %s failed.", argv[1]); - - return r; -} - -static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) -{ - dm_thin_id old_id, new_id; - int r; - - r = check_arg_count(argc, 3); - if (r) - return r; - - if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { - 
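/*
 * [Editor's sketch, not part of the driver] The message handlers in this
 * block are reached through the device-mapper target-message ioctl,
 * e.g. "dmsetup message <pool> 0 create_thin 0". A hedged user-space
 * sketch using libdevmapper (link with -ldevmapper); the pool name and
 * device id are hypothetical:
 */
#include <stdio.h>
#include <libdevmapper.h>

static int send_pool_message(const char *pool_name, const char *msg)
{
        struct dm_task *dmt = dm_task_create(DM_DEVICE_TARGET_MSG);
        int ok = 0;

        if (!dmt)
                return -1;
        if (dm_task_set_name(dmt, pool_name) &&
            dm_task_set_sector(dmt, 0) &&       /* pool is a singleton target */
            dm_task_set_message(dmt, msg))
                ok = dm_task_run(dmt);
        dm_task_destroy(dmt);
        return ok ? 0 : -1;
}

int main(void)
{
        if (send_pool_message("pool", "create_thin 0")) {
                fprintf(stderr, "sending pool message failed\n");
                return 1;
        }
        return 0;
}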
DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); - return -EINVAL; - } - - if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { - DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); - return -EINVAL; - } - - r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); - if (r) { - DMWARN("Failed to change transaction id from %s to %s.", - argv[1], argv[2]); - return r; - } - - return 0; -} - -/* - * Messages supported: - * create_thin <dev_id> - * create_snap <dev_id> <origin_id> - * delete <dev_id> - * trim <dev_id> <new_size_in_sectors> - * set_transaction_id <current_trans_id> <new_trans_id> - */ -static int pool_message(struct dm_target *ti, unsigned argc, char **argv) -{ - int r = -EINVAL; - struct pool_c *pt = ti->private; - struct pool *pool = pt->pool; - - if (!strcasecmp(argv[0], "create_thin")) - r = process_create_thin_mesg(argc, argv, pool); - - else if (!strcasecmp(argv[0], "create_snap")) - r = process_create_snap_mesg(argc, argv, pool); - - else if (!strcasecmp(argv[0], "delete")) - r = process_delete_mesg(argc, argv, pool); - - else if (!strcasecmp(argv[0], "set_transaction_id")) - r = process_set_transaction_id_mesg(argc, argv, pool); - - else - DMWARN("Unrecognised thin pool target message received: %s", argv[0]); - - if (!r) { - r = dm_pool_commit_metadata(pool->pmd); - if (r) - DMERR("%s message: dm_pool_commit_metadata() failed, error = %d", - argv[0], r); - } - - return r; -} - -/* - * Status line is: - * <transaction id> <used metadata sectors>/<total metadata sectors> - * <used data sectors>/<total data sectors> <held metadata root> - */ -static int pool_status(struct dm_target *ti, status_type_t type, - char *result, unsigned maxlen) -{ - int r, count; - unsigned sz = 0; - uint64_t transaction_id; - dm_block_t nr_free_blocks_data; - dm_block_t nr_free_blocks_metadata; - dm_block_t nr_blocks_data; - dm_block_t nr_blocks_metadata; - dm_block_t held_root; - char buf[BDEVNAME_SIZE]; - char buf2[BDEVNAME_SIZE]; - struct pool_c *pt = ti->private; - struct pool *pool = pt->pool; - - switch (type) { - case STATUSTYPE_INFO: - r = dm_pool_get_metadata_transaction_id(pool->pmd, - &transaction_id); - if (r) - return r; - - r = dm_pool_get_free_metadata_block_count(pool->pmd, - &nr_free_blocks_metadata); - if (r) - return r; - - r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); - if (r) - return r; - - r = dm_pool_get_free_block_count(pool->pmd, - &nr_free_blocks_data); - if (r) - return r; - - r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); - if (r) - return r; - - r = dm_pool_get_held_metadata_root(pool->pmd, &held_root); - if (r) - return r; - - DMEMIT("%llu %llu/%llu %llu/%llu ", - (unsigned long long)transaction_id, - (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), - (unsigned long long)nr_blocks_metadata, - (unsigned long long)(nr_blocks_data - nr_free_blocks_data), - (unsigned long long)nr_blocks_data); - - if (held_root) - DMEMIT("%llu", held_root); - else - DMEMIT("-"); - - break; - - case STATUSTYPE_TABLE: - DMEMIT("%s %s %lu %llu ", - format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), - format_dev_t(buf2, pt->data_dev->bdev->bd_dev), - (unsigned long)pool->sectors_per_block, - (unsigned long long)pt->low_water_blocks); - - count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + - !pt->pf.discard_passdown; - DMEMIT("%u ", count); - - if (!pool->pf.zero_new_blocks) - DMEMIT("skip_block_zeroing "); - - if (!pool->pf.discard_enabled) - DMEMIT("ignore_discard "); 
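/*
 * [Editor's sketch, not part of the driver] A hypothetical consumer of
 * the STATUSTYPE_INFO line emitted above, i.e.
 * "<transaction id> <used meta>/<total meta> <used data>/<total data> <held root or '-'>";
 * the sample string and the "percent used" calculation are illustrative only:
 */
#include <stdio.h>

int main(void)
{
        const char *status = "1 406/4096 1024/262144 -";
        unsigned long long trans_id, used_meta, total_meta, used_data, total_data;

        if (sscanf(status, "%llu %llu/%llu %llu/%llu",
                   &trans_id, &used_meta, &total_meta,
                   &used_data, &total_data) != 5)
                return 1;

        printf("transaction %llu, data space %.1f%% used\n",
               trans_id, 100.0 * used_data / total_data);
        return 0;
}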
- - if (!pt->pf.discard_passdown) - DMEMIT("no_discard_passdown "); - - break; - } - - return 0; -} - -static int pool_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct pool_c *pt = ti->private; - - return fn(ti, pt->data_dev, 0, ti->len, data); -} - -static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct pool_c *pt = ti->private; - struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = pt->data_dev->bdev; - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - -static void set_discard_limits(struct pool *pool, struct queue_limits *limits) -{ - /* - * FIXME: these limits may be incompatible with the pool's data device - */ - limits->max_discard_sectors = pool->sectors_per_block; - - /* - * This is just a hint, and not enforced. We have to cope with - * bios that overlap 2 blocks. - */ - limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; - limits->discard_zeroes_data = pool->pf.zero_new_blocks; -} - -static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) -{ - struct pool_c *pt = ti->private; - struct pool *pool = pt->pool; - - blk_limits_io_min(limits, 0); - blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); - if (pool->pf.discard_enabled) - set_discard_limits(pool, limits); -} - -static struct target_type pool_target = { - .name = "thin-pool", - .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | - DM_TARGET_IMMUTABLE, - .version = {1, 1, 0}, - .module = THIS_MODULE, - .ctr = pool_ctr, - .dtr = pool_dtr, - .map = pool_map, - .postsuspend = pool_postsuspend, - .preresume = pool_preresume, - .resume = pool_resume, - .message = pool_message, - .status = pool_status, - .merge = pool_merge, - .iterate_devices = pool_iterate_devices, - .io_hints = pool_io_hints, -}; - -/*---------------------------------------------------------------- - * Thin target methods - *--------------------------------------------------------------*/ -static void thin_dtr(struct dm_target *ti) -{ - struct thin_c *tc = ti->private; - - mutex_lock(&dm_thin_pool_table.mutex); - - __pool_dec(tc->pool); - dm_pool_close_thin_device(tc->td); - dm_put_device(ti, tc->pool_dev); - if (tc->origin_dev) - dm_put_device(ti, tc->origin_dev); - kfree(tc); - - mutex_unlock(&dm_thin_pool_table.mutex); -} - -/* - * Thin target parameters: - * - * <pool_dev> <dev_id> [origin_dev] - * - * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) - * dev_id: the internal device identifier - * origin_dev: a device external to the pool that should act as the origin - * - * If the pool device has discards disabled, they get disabled for the thin - * device as well. 
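/*
 * [Editor's sketch, not part of the driver] Loading a "thin" table like
 * the one documented above is normally done with dmsetup or libdevmapper.
 * A hedged libdevmapper sketch (link with -ldevmapper); the device name,
 * size and pool path are hypothetical:
 */
#include <stdio.h>
#include <libdevmapper.h>

int main(void)
{
        struct dm_task *dmt = dm_task_create(DM_DEVICE_CREATE);
        int ok = 0;

        if (!dmt)
                return 1;
        if (dm_task_set_name(dmt, "thin1") &&
            /* 1GiB device: start 0, 2097152 sectors, params "<pool_dev> <dev_id>" */
            dm_task_add_target(dmt, 0, 2097152, "thin", "/dev/mapper/pool 0"))
                ok = dm_task_run(dmt);
        dm_task_destroy(dmt);

        if (!ok)
                fprintf(stderr, "creating thin device failed\n");
        return ok ? 0 : 1;
}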
- */ -static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) -{ - int r; - struct thin_c *tc; - struct dm_dev *pool_dev, *origin_dev; - struct mapped_device *pool_md; - - mutex_lock(&dm_thin_pool_table.mutex); - - if (argc != 2 && argc != 3) { - ti->error = "Invalid argument count"; - r = -EINVAL; - goto out_unlock; - } - - tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); - if (!tc) { - ti->error = "Out of memory"; - r = -ENOMEM; - goto out_unlock; - } - - if (argc == 3) { - r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); - if (r) { - ti->error = "Error opening origin device"; - goto bad_origin_dev; - } - tc->origin_dev = origin_dev; - } - - r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); - if (r) { - ti->error = "Error opening pool device"; - goto bad_pool_dev; - } - tc->pool_dev = pool_dev; - - if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { - ti->error = "Invalid device id"; - r = -EINVAL; - goto bad_common; - } - - pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); - if (!pool_md) { - ti->error = "Couldn't get pool mapped device"; - r = -EINVAL; - goto bad_common; - } - - tc->pool = __pool_table_lookup(pool_md); - if (!tc->pool) { - ti->error = "Couldn't find pool object"; - r = -EINVAL; - goto bad_pool_lookup; - } - __pool_inc(tc->pool); - - r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); - if (r) { - ti->error = "Couldn't open thin internal device"; - goto bad_thin_open; - } - - ti->split_io = tc->pool->sectors_per_block; - ti->num_flush_requests = 1; - - /* In case the pool supports discards, pass them on. */ - if (tc->pool->pf.discard_enabled) { - ti->discards_supported = 1; - ti->num_discard_requests = 1; - } - - dm_put(pool_md); - - mutex_unlock(&dm_thin_pool_table.mutex); - - return 0; - -bad_thin_open: - __pool_dec(tc->pool); -bad_pool_lookup: - dm_put(pool_md); -bad_common: - dm_put_device(ti, tc->pool_dev); -bad_pool_dev: - if (tc->origin_dev) - dm_put_device(ti, tc->origin_dev); -bad_origin_dev: - kfree(tc); -out_unlock: - mutex_unlock(&dm_thin_pool_table.mutex); - - return r; -} - -static int thin_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - bio->bi_sector = dm_target_offset(ti, bio->bi_sector); - - return thin_bio_map(ti, bio, map_context); -} - -static int thin_endio(struct dm_target *ti, - struct bio *bio, int err, - union map_info *map_context) -{ - unsigned long flags; - struct endio_hook *h = map_context->ptr; - struct list_head work; - struct new_mapping *m, *tmp; - struct pool *pool = h->tc->pool; - - if (h->shared_read_entry) { - INIT_LIST_HEAD(&work); - ds_dec(h->shared_read_entry, &work); - - spin_lock_irqsave(&pool->lock, flags); - list_for_each_entry_safe(m, tmp, &work, list) { - list_del(&m->list); - m->quiesced = 1; - __maybe_add_mapping(m); - } - spin_unlock_irqrestore(&pool->lock, flags); - } - - if (h->all_io_entry) { - INIT_LIST_HEAD(&work); - ds_dec(h->all_io_entry, &work); - spin_lock_irqsave(&pool->lock, flags); - list_for_each_entry_safe(m, tmp, &work, list) - list_add(&m->list, &pool->prepared_discards); - spin_unlock_irqrestore(&pool->lock, flags); - } - - mempool_free(h, pool->endio_hook_pool); - - return 0; -} - -static void thin_postsuspend(struct dm_target *ti) -{ - if (dm_noflush_suspending(ti)) - requeue_io((struct thin_c *)ti->private); -} - -/* - * <nr mapped sectors> <highest mapped sector> - */ -static int thin_status(struct dm_target *ti, status_type_t type, - char *result, unsigned maxlen) -{ - int r; - ssize_t sz = 0; 
- dm_block_t mapped, highest; - char buf[BDEVNAME_SIZE]; - struct thin_c *tc = ti->private; - - if (!tc->td) - DMEMIT("-"); - else { - switch (type) { - case STATUSTYPE_INFO: - r = dm_thin_get_mapped_count(tc->td, &mapped); - if (r) - return r; - - r = dm_thin_get_highest_mapped_block(tc->td, &highest); - if (r < 0) - return r; - - DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); - if (r) - DMEMIT("%llu", ((highest + 1) * - tc->pool->sectors_per_block) - 1); - else - DMEMIT("-"); - break; - - case STATUSTYPE_TABLE: - DMEMIT("%s %lu", - format_dev_t(buf, tc->pool_dev->bdev->bd_dev), - (unsigned long) tc->dev_id); - if (tc->origin_dev) - DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); - break; - } - } - - return 0; -} - -static int thin_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - dm_block_t blocks; - struct thin_c *tc = ti->private; - - /* - * We can't call dm_pool_get_data_dev_size() since that blocks. So - * we follow a more convoluted path through to the pool's target. - */ - if (!tc->pool->ti) - return 0; /* nothing is bound */ - - blocks = tc->pool->ti->len >> tc->pool->block_shift; - if (blocks) - return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); - - return 0; -} - -static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) -{ - struct thin_c *tc = ti->private; - struct pool *pool = tc->pool; - - blk_limits_io_min(limits, 0); - blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); - set_discard_limits(pool, limits); -} - -static struct target_type thin_target = { - .name = "thin", - .version = {1, 1, 0}, - .module = THIS_MODULE, - .ctr = thin_ctr, - .dtr = thin_dtr, - .map = thin_map, - .end_io = thin_endio, - .postsuspend = thin_postsuspend, - .status = thin_status, - .iterate_devices = thin_iterate_devices, - .io_hints = thin_io_hints, -}; - -/*----------------------------------------------------------------*/ - -static int __init dm_thin_init(void) -{ - int r; - - pool_table_init(); - - r = dm_register_target(&thin_target); - if (r) - return r; - - r = dm_register_target(&pool_target); - if (r) - dm_unregister_target(&thin_target); - - return r; -} - -static void dm_thin_exit(void) -{ - dm_unregister_target(&thin_target); - dm_unregister_target(&pool_target); -} - -module_init(dm_thin_init); -module_exit(dm_thin_exit); - -MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); -MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-uevent.c b/ANDROID_3.4.5/drivers/md/dm-uevent.c deleted file mode 100644 index 8efe033b..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-uevent.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Device Mapper Uevent Support (dm-uevent) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - * Copyright IBM Corporation, 2007 - * Author: Mike Anderson <andmike@linux.vnet.ibm.com> - */ -#include <linux/list.h> -#include <linux/slab.h> -#include <linux/kobject.h> -#include <linux/dm-ioctl.h> -#include <linux/export.h> - -#include "dm.h" -#include "dm-uevent.h" - -#define DM_MSG_PREFIX "uevent" - -static const struct { - enum dm_uevent_type type; - enum kobject_action action; - char *name; -} _dm_uevent_type_names[] = { - {DM_UEVENT_PATH_FAILED, KOBJ_CHANGE, "PATH_FAILED"}, - {DM_UEVENT_PATH_REINSTATED, KOBJ_CHANGE, "PATH_REINSTATED"}, -}; - -static struct kmem_cache *_dm_event_cache; - -struct dm_uevent { - struct mapped_device *md; - enum kobject_action action; - struct kobj_uevent_env ku_env; - struct list_head elist; - char name[DM_NAME_LEN]; - char uuid[DM_UUID_LEN]; -}; - -static void dm_uevent_free(struct dm_uevent *event) -{ - kmem_cache_free(_dm_event_cache, event); -} - -static struct dm_uevent *dm_uevent_alloc(struct mapped_device *md) -{ - struct dm_uevent *event; - - event = kmem_cache_zalloc(_dm_event_cache, GFP_ATOMIC); - if (!event) - return NULL; - - INIT_LIST_HEAD(&event->elist); - event->md = md; - - return event; -} - -static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md, - struct dm_target *ti, - enum kobject_action action, - const char *dm_action, - const char *path, - unsigned nr_valid_paths) -{ - struct dm_uevent *event; - - event = dm_uevent_alloc(md); - if (!event) { - DMERR("%s: dm_uevent_alloc() failed", __func__); - goto err_nomem; - } - - event->action = action; - - if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) { - DMERR("%s: add_uevent_var() for DM_TARGET failed", - __func__); - goto err_add; - } - - if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) { - DMERR("%s: add_uevent_var() for DM_ACTION failed", - __func__); - goto err_add; - } - - if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u", - dm_next_uevent_seq(md))) { - DMERR("%s: add_uevent_var() for DM_SEQNUM failed", - __func__); - goto err_add; - } - - if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) { - DMERR("%s: add_uevent_var() for DM_PATH failed", __func__); - goto err_add; - } - - if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d", - nr_valid_paths)) { - DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed", - __func__); - goto err_add; - } - - return event; - -err_add: - dm_uevent_free(event); -err_nomem: - return ERR_PTR(-ENOMEM); -} - -/** - * dm_send_uevents - send uevents for given list - * - * @events: list of events to send - * @kobj: kobject generating event - * - */ -void dm_send_uevents(struct list_head *events, struct kobject *kobj) -{ - int r; - struct dm_uevent *event, *next; - - list_for_each_entry_safe(event, next, events, elist) { - list_del_init(&event->elist); - - /* - * When a device is being removed this copy fails and we - * discard these unsent events. 
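/*
 * [Editor's sketch, not part of the driver] The environment assembled in
 * this file is what a udev rule or helper program sees for a PATH_FAILED
 * or PATH_REINSTATED event. A hypothetical consumer, reading back the
 * same keys that the add_uevent_var() calls here set:
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        static const char *keys[] = {
                "DM_TARGET", "DM_ACTION", "DM_SEQNUM", "DM_PATH",
                "DM_NR_VALID_PATHS", "DM_NAME", "DM_UUID",
        };
        unsigned i;

        for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
                const char *val = getenv(keys[i]);

                printf("%s=%s\n", keys[i], val ? val : "(unset)");
        }
        return 0;
}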
- */ - if (dm_copy_name_and_uuid(event->md, event->name, - event->uuid)) { - DMINFO("%s: skipping sending uevent for lost device", - __func__); - goto uevent_free; - } - - if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) { - DMERR("%s: add_uevent_var() for DM_NAME failed", - __func__); - goto uevent_free; - } - - if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) { - DMERR("%s: add_uevent_var() for DM_UUID failed", - __func__); - goto uevent_free; - } - - r = kobject_uevent_env(kobj, event->action, event->ku_env.envp); - if (r) - DMERR("%s: kobject_uevent_env failed", __func__); -uevent_free: - dm_uevent_free(event); - } -} -EXPORT_SYMBOL_GPL(dm_send_uevents); - -/** - * dm_path_uevent - called to create a new path event and queue it - * - * @event_type: path event type enum - * @ti: pointer to a dm_target - * @path: string containing pathname - * @nr_valid_paths: number of valid paths remaining - * - */ -void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, - const char *path, unsigned nr_valid_paths) -{ - struct mapped_device *md = dm_table_get_md(ti->table); - struct dm_uevent *event; - - if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { - DMERR("%s: Invalid event_type %d", __func__, event_type); - return; - } - - event = dm_build_path_uevent(md, ti, - _dm_uevent_type_names[event_type].action, - _dm_uevent_type_names[event_type].name, - path, nr_valid_paths); - if (IS_ERR(event)) - return; - - dm_uevent_add(md, &event->elist); -} -EXPORT_SYMBOL_GPL(dm_path_uevent); - -int dm_uevent_init(void) -{ - _dm_event_cache = KMEM_CACHE(dm_uevent, 0); - if (!_dm_event_cache) - return -ENOMEM; - - DMINFO("version 1.0.3"); - - return 0; -} - -void dm_uevent_exit(void) -{ - kmem_cache_destroy(_dm_event_cache); -} diff --git a/ANDROID_3.4.5/drivers/md/dm-uevent.h b/ANDROID_3.4.5/drivers/md/dm-uevent.h deleted file mode 100644 index 2eccc8bd..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-uevent.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Device Mapper Uevent Support - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - * Copyright IBM Corporation, 2007 - * Author: Mike Anderson <andmike@linux.vnet.ibm.com> - */ -#ifndef DM_UEVENT_H -#define DM_UEVENT_H - -enum dm_uevent_type { - DM_UEVENT_PATH_FAILED, - DM_UEVENT_PATH_REINSTATED, -}; - -#ifdef CONFIG_DM_UEVENT - -extern int dm_uevent_init(void); -extern void dm_uevent_exit(void); -extern void dm_send_uevents(struct list_head *events, struct kobject *kobj); -extern void dm_path_uevent(enum dm_uevent_type event_type, - struct dm_target *ti, const char *path, - unsigned nr_valid_paths); - -#else - -static inline int dm_uevent_init(void) -{ - return 0; -} -static inline void dm_uevent_exit(void) -{ -} -static inline void dm_send_uevents(struct list_head *events, - struct kobject *kobj) -{ -} -static inline void dm_path_uevent(enum dm_uevent_type event_type, - struct dm_target *ti, const char *path, - unsigned nr_valid_paths) -{ -} - -#endif /* CONFIG_DM_UEVENT */ - -#endif /* DM_UEVENT_H */ diff --git a/ANDROID_3.4.5/drivers/md/dm-verity.c b/ANDROID_3.4.5/drivers/md/dm-verity.c deleted file mode 100644 index fa365d39..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-verity.c +++ /dev/null @@ -1,913 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat, Inc. - * - * Author: Mikulas Patocka <mpatocka@redhat.com> - * - * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors - * - * This file is released under the GPLv2. - * - * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set - * default prefetch value. Data are read in "prefetch_cluster" chunks from the - * hash device. Setting this greatly improves performance when data and hash - * are on the same disk on different partitions on devices with poor random - * access behavior. - */ - -#include "dm-bufio.h" - -#include <linux/module.h> -#include <linux/device-mapper.h> -#include <crypto/hash.h> - -#define DM_MSG_PREFIX "verity" - -#define DM_VERITY_IO_VEC_INLINE 16 -#define DM_VERITY_MEMPOOL_SIZE 4 -#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 - -#define DM_VERITY_MAX_LEVELS 63 - -static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; - -module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); - -struct dm_verity { - struct dm_dev *data_dev; - struct dm_dev *hash_dev; - struct dm_target *ti; - struct dm_bufio_client *bufio; - char *alg_name; - struct crypto_shash *tfm; - u8 *root_digest; /* digest of the root block */ - u8 *salt; /* salt: its size is salt_size */ - unsigned salt_size; - sector_t data_start; /* data offset in 512-byte sectors */ - sector_t hash_start; /* hash start in blocks */ - sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ - unsigned char data_dev_block_bits; /* log2(data blocksize) */ - unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ - unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ - unsigned char levels; /* the number of tree levels */ - unsigned char version; - unsigned digest_size; /* digest size for the current hash algorithm */ - unsigned shash_descsize;/* the size of temporary space for crypto */ - int hash_failed; /* set to 1 if hash of any block failed */ - - mempool_t *io_mempool; /* mempool of struct dm_verity_io */ - mempool_t *vec_mempool; /* mempool of bio vector */ - - struct workqueue_struct *verify_wq; - - /* starting blocks for each tree level. 0 is the lowest level. 
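/*
 * [Editor's sketch, not part of the driver] How the level count and the
 * per-level hash positions fall out of hash_per_block_bits. The figures
 * (4KiB hash blocks, 32-byte digests, 2^20 data blocks) are made up; the
 * block index printed is relative to that level's starting block, so the
 * driver would still add hash_level_block[0] to it:
 */
#include <stdio.h>

int main(void)
{
        unsigned hash_per_block_bits = 7;       /* 4096 / 32 = 128 hashes per block */
        unsigned long long data_blocks = 1ULL << 20;
        unsigned levels = 0;
        unsigned long long block = 123456;

        /* same loop shape as the constructor: add levels until one block covers all */
        while ((data_blocks - 1) >> (hash_per_block_bits * levels))
                levels++;

        /* level-0 hash of one data block: which hash block, which slot inside it */
        printf("levels=%u, level-0 hash block %llu, slot %llu\n",
               levels,
               block >> hash_per_block_bits,
               block & ((1ULL << hash_per_block_bits) - 1));
        return 0;
}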
*/ - sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; -}; - -struct dm_verity_io { - struct dm_verity *v; - struct bio *bio; - - /* original values of bio->bi_end_io and bio->bi_private */ - bio_end_io_t *orig_bi_end_io; - void *orig_bi_private; - - sector_t block; - unsigned n_blocks; - - /* saved bio vector */ - struct bio_vec *io_vec; - unsigned io_vec_size; - - struct work_struct work; - - /* A space for short vectors; longer vectors are allocated separately. */ - struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE]; - - /* - * Three variably-size fields follow this struct: - * - * u8 hash_desc[v->shash_descsize]; - * u8 real_digest[v->digest_size]; - * u8 want_digest[v->digest_size]; - * - * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). - */ -}; - -static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) -{ - return (struct shash_desc *)(io + 1); -} - -static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize; -} - -static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; -} - -/* - * Auxiliary structure appended to each dm-bufio buffer. If the value - * hash_verified is nonzero, hash of the block has been verified. - * - * The variable hash_verified is set to 0 when allocating the buffer, then - * it can be changed to 1 and it is never reset to 0 again. - * - * There is no lock around this value, a race condition can at worst cause - * that multiple processes verify the hash of the same buffer simultaneously - * and write 1 to hash_verified simultaneously. - * This condition is harmless, so we don't need locking. - */ -struct buffer_aux { - int hash_verified; -}; - -/* - * Initialize struct buffer_aux for a freshly created buffer. - */ -static void dm_bufio_alloc_callback(struct dm_buffer *buf) -{ - struct buffer_aux *aux = dm_bufio_get_aux_data(buf); - - aux->hash_verified = 0; -} - -/* - * Translate input sector number to the sector number on the target device. - */ -static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector) -{ - return v->data_start + dm_target_offset(v->ti, bi_sector); -} - -/* - * Return hash position of a specified block at a specified tree level - * (0 is the lowest level). - * The lowest "hash_per_block_bits"-bits of the result denote hash position - * inside a hash block. The remaining bits denote location of the hash block. - */ -static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, - int level) -{ - return block >> (level * v->hash_per_block_bits); -} - -static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, - sector_t *hash_block, unsigned *offset) -{ - sector_t position = verity_position_at_level(v, block, level); - unsigned idx; - - *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits); - - if (!offset) - return; - - idx = position & ((1 << v->hash_per_block_bits) - 1); - if (!v->version) - *offset = idx * v->digest_size; - else - *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits); -} - -/* - * Verify hash of a metadata block pertaining to the specified data block - * ("block" argument) at a specified level ("level" argument). - * - * On successful return, io_want_digest(v, io) contains the hash value for - * a lower tree level or for the data block (if we're at the lowest leve). 
- * - * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. - * If "skip_unverified" is false, unverified buffer is hashed and verified - * against current value of io_want_digest(v, io). - */ -static int verity_verify_level(struct dm_verity_io *io, sector_t block, - int level, bool skip_unverified) -{ - struct dm_verity *v = io->v; - struct dm_buffer *buf; - struct buffer_aux *aux; - u8 *data; - int r; - sector_t hash_block; - unsigned offset; - - verity_hash_at_level(v, block, level, &hash_block, &offset); - - data = dm_bufio_read(v->bufio, hash_block, &buf); - if (unlikely(IS_ERR(data))) - return PTR_ERR(data); - - aux = dm_bufio_get_aux_data(buf); - - if (!aux->hash_verified) { - struct shash_desc *desc; - u8 *result; - - if (skip_unverified) { - r = 1; - goto release_ret_r; - } - - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); - goto release_ret_r; - } - - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - - r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); - goto release_ret_r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - DMERR_LIMIT("metadata block %llu is corrupted", - (unsigned long long)hash_block); - v->hash_failed = 1; - r = -EIO; - goto release_ret_r; - } else - aux->hash_verified = 1; - } - - data += offset; - - memcpy(io_want_digest(v, io), data, v->digest_size); - - dm_bufio_release(buf); - return 0; - -release_ret_r: - dm_bufio_release(buf); - - return r; -} - -/* - * Verify one "dm_verity_io" structure. - */ -static int verity_verify_io(struct dm_verity_io *io) -{ - struct dm_verity *v = io->v; - unsigned b; - int i; - unsigned vector = 0, offset = 0; - - for (b = 0; b < io->n_blocks; b++) { - struct shash_desc *desc; - u8 *result; - int r; - unsigned todo; - - if (likely(v->levels)) { - /* - * First, we try to get the requested hash for - * the current block. If the hash block itself is - * verified, zero is returned. If it isn't, this - * function returns 0 and we fall back to whole - * chain verification. 
- */ - int r = verity_verify_level(io, io->block + b, 0, true); - if (likely(!r)) - goto test_block_hash; - if (r < 0) - return r; - } - - memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); - - for (i = v->levels - 1; i >= 0; i--) { - int r = verity_verify_level(io, io->block + b, i, false); - if (unlikely(r)) - return r; - } - -test_block_hash: - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); - return r; - } - - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - - todo = 1 << v->data_dev_block_bits; - do { - struct bio_vec *bv; - u8 *page; - unsigned len; - - BUG_ON(vector >= io->io_vec_size); - bv = &io->io_vec[vector]; - page = kmap_atomic(bv->bv_page); - len = bv->bv_len - offset; - if (likely(len >= todo)) - len = todo; - r = crypto_shash_update(desc, - page + bv->bv_offset + offset, len); - kunmap_atomic(page); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - offset += len; - if (likely(offset == bv->bv_len)) { - offset = 0; - vector++; - } - todo -= len; - } while (todo); - - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); - return r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - DMERR_LIMIT("data block %llu is corrupted", - (unsigned long long)(io->block + b)); - v->hash_failed = 1; - return -EIO; - } - } - BUG_ON(vector != io->io_vec_size); - BUG_ON(offset); - - return 0; -} - -/* - * End one "io" structure with a given error. - */ -static void verity_finish_io(struct dm_verity_io *io, int error) -{ - struct bio *bio = io->bio; - struct dm_verity *v = io->v; - - bio->bi_end_io = io->orig_bi_end_io; - bio->bi_private = io->orig_bi_private; - - if (io->io_vec != io->io_vec_inline) - mempool_free(io->io_vec, v->vec_mempool); - - mempool_free(io, v->io_mempool); - - bio_endio(bio, error); -} - -static void verity_work(struct work_struct *w) -{ - struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); - - verity_finish_io(io, verity_verify_io(io)); -} - -static void verity_end_io(struct bio *bio, int error) -{ - struct dm_verity_io *io = bio->bi_private; - - if (error) { - verity_finish_io(io, error); - return; - } - - INIT_WORK(&io->work, verity_work); - queue_work(io->v->verify_wq, &io->work); -} - -/* - * Prefetch buffers for the specified io. - * The root buffer is not prefetched, it is assumed that it will be cached - * all the time. 
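/*
 * [Editor's sketch, not part of the driver] verity_prefetch_io() below
 * widens the level-0 prefetch range to "prefetch_cluster" boundaries
 * after rounding the cluster down to a power of two. A hypothetical
 * user-space model of that rounding and alignment (fls() is open-coded
 * here because standard user space has no such helper):
 */
#include <stdio.h>

static unsigned fls_u(unsigned x)
{
        unsigned r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        unsigned cluster = 48;                  /* not a power of two        */
        unsigned long start = 1000, end = 1003; /* hash blocks wanted anyway */

        if (cluster & (cluster - 1))
                cluster = 1U << (fls_u(cluster) - 1);   /* -> 32 */

        start &= ~(unsigned long)(cluster - 1);         /* -> 992  */
        end |= cluster - 1;                             /* -> 1023 */

        printf("cluster=%u, prefetch blocks [%lu, %lu]\n", cluster, start, end);
        return 0;
}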
- */ -static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) -{ - int i; - - for (i = v->levels - 2; i >= 0; i--) { - sector_t hash_block_start; - sector_t hash_block_end; - verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); - verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); - if (!i) { - unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster; - - cluster >>= v->data_dev_block_bits; - if (unlikely(!cluster)) - goto no_prefetch_cluster; - - if (unlikely(cluster & (cluster - 1))) - cluster = 1 << (fls(cluster) - 1); - - hash_block_start &= ~(sector_t)(cluster - 1); - hash_block_end |= cluster - 1; - if (unlikely(hash_block_end >= v->hash_blocks)) - hash_block_end = v->hash_blocks - 1; - } -no_prefetch_cluster: - dm_bufio_prefetch(v->bufio, hash_block_start, - hash_block_end - hash_block_start + 1); - } -} - -/* - * Bio map function. It allocates dm_verity_io structure and bio vector and - * fills them. Then it issues prefetches and the I/O. - */ -static int verity_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - struct dm_verity *v = ti->private; - struct dm_verity_io *io; - - bio->bi_bdev = v->data_dev->bdev; - bio->bi_sector = verity_map_sector(v, bio->bi_sector); - - if (((unsigned)bio->bi_sector | bio_sectors(bio)) & - ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { - DMERR_LIMIT("unaligned io"); - return -EIO; - } - - if ((bio->bi_sector + bio_sectors(bio)) >> - (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { - DMERR_LIMIT("io out of range"); - return -EIO; - } - - if (bio_data_dir(bio) == WRITE) - return -EIO; - - io = mempool_alloc(v->io_mempool, GFP_NOIO); - io->v = v; - io->bio = bio; - io->orig_bi_end_io = bio->bi_end_io; - io->orig_bi_private = bio->bi_private; - io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); - io->n_blocks = bio->bi_size >> v->data_dev_block_bits; - - bio->bi_end_io = verity_end_io; - bio->bi_private = io; - io->io_vec_size = bio->bi_vcnt - bio->bi_idx; - if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) - io->io_vec = io->io_vec_inline; - else - io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); - memcpy(io->io_vec, bio_iovec(bio), - io->io_vec_size * sizeof(struct bio_vec)); - - verity_prefetch_io(v, io); - - generic_make_request(bio); - - return DM_MAPIO_SUBMITTED; -} - -/* - * Status: V (valid) or C (corruption found) - */ -static int verity_status(struct dm_target *ti, status_type_t type, - char *result, unsigned maxlen) -{ - struct dm_verity *v = ti->private; - unsigned sz = 0; - unsigned x; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%c", v->hash_failed ? 'C' : 'V'); - break; - case STATUSTYPE_TABLE: - DMEMIT("%u %s %s %u %u %llu %llu %s ", - v->version, - v->data_dev->name, - v->hash_dev->name, - 1 << v->data_dev_block_bits, - 1 << v->hash_dev_block_bits, - (unsigned long long)v->data_blocks, - (unsigned long long)v->hash_start, - v->alg_name - ); - for (x = 0; x < v->digest_size; x++) - DMEMIT("%02x", v->root_digest[x]); - DMEMIT(" "); - if (!v->salt_size) - DMEMIT("-"); - else - for (x = 0; x < v->salt_size; x++) - DMEMIT("%02x", v->salt[x]); - break; - } - - return 0; -} - -static int verity_ioctl(struct dm_target *ti, unsigned cmd, - unsigned long arg) -{ - struct dm_verity *v = ti->private; - int r = 0; - - if (v->data_start || - ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); - - return r ? 
: __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, - cmd, arg); -} - -static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct dm_verity *v = ti->private; - struct request_queue *q = bdev_get_queue(v->data_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = v->data_dev->bdev; - bvm->bi_sector = verity_map_sector(v, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - -static int verity_iterate_devices(struct dm_target *ti, - iterate_devices_callout_fn fn, void *data) -{ - struct dm_verity *v = ti->private; - - return fn(ti, v->data_dev, v->data_start, ti->len, data); -} - -static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) -{ - struct dm_verity *v = ti->private; - - if (limits->logical_block_size < 1 << v->data_dev_block_bits) - limits->logical_block_size = 1 << v->data_dev_block_bits; - - if (limits->physical_block_size < 1 << v->data_dev_block_bits) - limits->physical_block_size = 1 << v->data_dev_block_bits; - - blk_limits_io_min(limits, limits->logical_block_size); -} - -static void verity_dtr(struct dm_target *ti) -{ - struct dm_verity *v = ti->private; - - if (v->verify_wq) - destroy_workqueue(v->verify_wq); - - if (v->vec_mempool) - mempool_destroy(v->vec_mempool); - - if (v->io_mempool) - mempool_destroy(v->io_mempool); - - if (v->bufio) - dm_bufio_client_destroy(v->bufio); - - kfree(v->salt); - kfree(v->root_digest); - - if (v->tfm) - crypto_free_shash(v->tfm); - - kfree(v->alg_name); - - if (v->hash_dev) - dm_put_device(ti, v->hash_dev); - - if (v->data_dev) - dm_put_device(ti, v->data_dev); - - kfree(v); -} - -/* - * Target parameters: - * <version> The current format is version 1. - * Vsn 0 is compatible with original Chromium OS releases. - * <data device> - * <hash device> - * <data block size> - * <hash block size> - * <the number of data blocks> - * <hash start block> - * <algorithm> - * <digest> - * <salt> Hex string or "-" if no salt. 
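/*
 * [Editor's sketch, not part of the driver] A table line for the target
 * documented above might look like (all values hypothetical):
 *   "1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 <64 hex digits> <salt or ->"
 * The constructor below validates each numeric field with the
 * sscanf("%u%c") idiom: the trailing %c only matches when junk follows
 * the number, so any return value other than 1 rejects the argument.
 * A small user-space illustration of that idiom:
 */
#include <stdio.h>

static int parse_u32(const char *s, unsigned *out)
{
        char dummy;

        return sscanf(s, "%u%c", out, &dummy) == 1 ? 0 : -1;
}

int main(void)
{
        unsigned v;

        printf("%d\n", parse_u32("4096", &v));  /*  0: clean number   */
        printf("%d\n", parse_u32("4096x", &v)); /* -1: trailing junk  */
        printf("%d\n", parse_u32("", &v));      /* -1: nothing parsed */
        return 0;
}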
- */ -static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) -{ - struct dm_verity *v; - unsigned num; - unsigned long long num_ll; - int r; - int i; - sector_t hash_position; - char dummy; - - v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); - if (!v) { - ti->error = "Cannot allocate verity structure"; - return -ENOMEM; - } - ti->private = v; - v->ti = ti; - - if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { - ti->error = "Device must be readonly"; - r = -EINVAL; - goto bad; - } - - if (argc != 10) { - ti->error = "Invalid argument count: exactly 10 arguments required"; - r = -EINVAL; - goto bad; - } - - if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 || - num < 0 || num > 1) { - ti->error = "Invalid version"; - r = -EINVAL; - goto bad; - } - v->version = num; - - r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); - if (r) { - ti->error = "Data device lookup failed"; - goto bad; - } - - r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); - if (r) { - ti->error = "Data device lookup failed"; - goto bad; - } - - if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 || - !num || (num & (num - 1)) || - num < bdev_logical_block_size(v->data_dev->bdev) || - num > PAGE_SIZE) { - ti->error = "Invalid data device block size"; - r = -EINVAL; - goto bad; - } - v->data_dev_block_bits = ffs(num) - 1; - - if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || - !num || (num & (num - 1)) || - num < bdev_logical_block_size(v->hash_dev->bdev) || - num > INT_MAX) { - ti->error = "Invalid hash device block size"; - r = -EINVAL; - goto bad; - } - v->hash_dev_block_bits = ffs(num) - 1; - - if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || - num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) != - (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) { - ti->error = "Invalid data blocks"; - r = -EINVAL; - goto bad; - } - v->data_blocks = num_ll; - - if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) { - ti->error = "Data device is too small"; - r = -EINVAL; - goto bad; - } - - if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || - num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) != - (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) { - ti->error = "Invalid hash start"; - r = -EINVAL; - goto bad; - } - v->hash_start = num_ll; - - v->alg_name = kstrdup(argv[7], GFP_KERNEL); - if (!v->alg_name) { - ti->error = "Cannot allocate algorithm name"; - r = -ENOMEM; - goto bad; - } - - v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); - if (IS_ERR(v->tfm)) { - ti->error = "Cannot initialize hash function"; - r = PTR_ERR(v->tfm); - v->tfm = NULL; - goto bad; - } - v->digest_size = crypto_shash_digestsize(v->tfm); - if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { - ti->error = "Digest size too big"; - r = -EINVAL; - goto bad; - } - v->shash_descsize = - sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); - - v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); - if (!v->root_digest) { - ti->error = "Cannot allocate root digest"; - r = -ENOMEM; - goto bad; - } - if (strlen(argv[8]) != v->digest_size * 2 || - hex2bin(v->root_digest, argv[8], v->digest_size)) { - ti->error = "Invalid root digest"; - r = -EINVAL; - goto bad; - } - - if (strcmp(argv[9], "-")) { - v->salt_size = strlen(argv[9]) / 2; - v->salt = kmalloc(v->salt_size, GFP_KERNEL); - if (!v->salt) { - ti->error = "Cannot allocate salt"; - r = -ENOMEM; - goto bad; - } - if (strlen(argv[9]) != v->salt_size * 2 || - hex2bin(v->salt, argv[9], v->salt_size)) { - ti->error = "Invalid 
salt"; - r = -EINVAL; - goto bad; - } - } - - v->hash_per_block_bits = - fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1; - - v->levels = 0; - if (v->data_blocks) - while (v->hash_per_block_bits * v->levels < 64 && - (unsigned long long)(v->data_blocks - 1) >> - (v->hash_per_block_bits * v->levels)) - v->levels++; - - if (v->levels > DM_VERITY_MAX_LEVELS) { - ti->error = "Too many tree levels"; - r = -E2BIG; - goto bad; - } - - hash_position = v->hash_start; - for (i = v->levels - 1; i >= 0; i--) { - sector_t s; - v->hash_level_block[i] = hash_position; - s = verity_position_at_level(v, v->data_blocks, i); - s = (s >> v->hash_per_block_bits) + - !!(s & ((1 << v->hash_per_block_bits) - 1)); - if (hash_position + s < hash_position) { - ti->error = "Hash device offset overflow"; - r = -E2BIG; - goto bad; - } - hash_position += s; - } - v->hash_blocks = hash_position; - - v->bufio = dm_bufio_client_create(v->hash_dev->bdev, - 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), - dm_bufio_alloc_callback, NULL); - if (IS_ERR(v->bufio)) { - ti->error = "Cannot initialize dm-bufio"; - r = PTR_ERR(v->bufio); - v->bufio = NULL; - goto bad; - } - - if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { - ti->error = "Hash device is too small"; - r = -E2BIG; - goto bad; - } - - v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, - sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2); - if (!v->io_mempool) { - ti->error = "Cannot allocate io mempool"; - r = -ENOMEM; - goto bad; - } - - v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, - BIO_MAX_PAGES * sizeof(struct bio_vec)); - if (!v->vec_mempool) { - ti->error = "Cannot allocate vector mempool"; - r = -ENOMEM; - goto bad; - } - - /* WQ_UNBOUND greatly improves performance when running on ramdisk */ - v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); - if (!v->verify_wq) { - ti->error = "Cannot allocate workqueue"; - r = -ENOMEM; - goto bad; - } - - return 0; - -bad: - verity_dtr(ti); - - return r; -} - -static struct target_type verity_target = { - .name = "verity", - .version = {1, 0, 0}, - .module = THIS_MODULE, - .ctr = verity_ctr, - .dtr = verity_dtr, - .map = verity_map, - .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, - .iterate_devices = verity_iterate_devices, - .io_hints = verity_io_hints, -}; - -static int __init dm_verity_init(void) -{ - int r; - - r = dm_register_target(&verity_target); - if (r < 0) - DMERR("register failed %d", r); - - return r; -} - -static void __exit dm_verity_exit(void) -{ - dm_unregister_target(&verity_target); -} - -module_init(dm_verity_init); -module_exit(dm_verity_exit); - -MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); -MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>"); -MODULE_AUTHOR("Will Drewry <wad@chromium.org>"); -MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm-zero.c b/ANDROID_3.4.5/drivers/md/dm-zero.c deleted file mode 100644 index cc2b3cb8..00000000 --- a/ANDROID_3.4.5/drivers/md/dm-zero.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (C) 2003 Christophe Saout <christophe@saout.de> - * - * This file is released under the GPL. 
- */ - -#include <linux/device-mapper.h> - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bio.h> - -#define DM_MSG_PREFIX "zero" - -/* - * Construct a dummy mapping that only returns zeros - */ -static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) -{ - if (argc != 0) { - ti->error = "No arguments required"; - return -EINVAL; - } - - /* - * Silently drop discards, avoiding -EOPNOTSUPP. - */ - ti->num_discard_requests = 1; - - return 0; -} - -/* - * Return zeros only on reads - */ -static int zero_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) -{ - switch(bio_rw(bio)) { - case READ: - zero_fill_bio(bio); - break; - case READA: - /* readahead of null bytes only wastes buffer cache */ - return -EIO; - case WRITE: - /* writes get silently dropped */ - break; - } - - bio_endio(bio, 0); - - /* accepted bio, don't make new request */ - return DM_MAPIO_SUBMITTED; -} - -static struct target_type zero_target = { - .name = "zero", - .version = {1, 0, 0}, - .module = THIS_MODULE, - .ctr = zero_ctr, - .map = zero_map, -}; - -static int __init dm_zero_init(void) -{ - int r = dm_register_target(&zero_target); - - if (r < 0) - DMERR("register failed %d", r); - - return r; -} - -static void __exit dm_zero_exit(void) -{ - dm_unregister_target(&zero_target); -} - -module_init(dm_zero_init) -module_exit(dm_zero_exit) - -MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); -MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm.c b/ANDROID_3.4.5/drivers/md/dm.c deleted file mode 100644 index e24143cc..00000000 --- a/ANDROID_3.4.5/drivers/md/dm.c +++ /dev/null @@ -1,2780 +0,0 @@ -/* - * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ - -#include "dm.h" -#include "dm-uevent.h" - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/moduleparam.h> -#include <linux/blkpg.h> -#include <linux/bio.h> -#include <linux/mempool.h> -#include <linux/slab.h> -#include <linux/idr.h> -#include <linux/hdreg.h> -#include <linux/delay.h> - -#include <trace/events/block.h> - -#define DM_MSG_PREFIX "core" - -#ifdef CONFIG_PRINTK -/* - * ratelimit state to be used in DMXXX_LIMIT(). - */ -DEFINE_RATELIMIT_STATE(dm_ratelimit_state, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); -EXPORT_SYMBOL(dm_ratelimit_state); -#endif - -/* - * Cookies are numeric values sent with CHANGE and REMOVE - * uevents while resuming, removing or renaming the device. - */ -#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" -#define DM_COOKIE_LENGTH 24 - -static const char *_name = DM_NAME; - -static unsigned int major = 0; -static unsigned int _major = 0; - -static DEFINE_IDR(_minor_idr); - -static DEFINE_SPINLOCK(_minor_lock); -/* - * For bio-based dm. - * One of these is allocated per bio. - */ -struct dm_io { - struct mapped_device *md; - int error; - atomic_t io_count; - struct bio *bio; - unsigned long start_time; - spinlock_t endio_lock; -}; - -/* - * For bio-based dm. - * One of these is allocated per target within a bio. Hopefully - * this will be simplified out one day. - */ -struct dm_target_io { - struct dm_io *io; - struct dm_target *ti; - union map_info info; -}; - -/* - * For request-based dm. - * One of these is allocated per request. 
- */ -struct dm_rq_target_io { - struct mapped_device *md; - struct dm_target *ti; - struct request *orig, clone; - int error; - union map_info info; -}; - -/* - * For request-based dm. - * One of these is allocated per bio. - */ -struct dm_rq_clone_bio_info { - struct bio *orig; - struct dm_rq_target_io *tio; -}; - -union map_info *dm_get_mapinfo(struct bio *bio) -{ - if (bio && bio->bi_private) - return &((struct dm_target_io *)bio->bi_private)->info; - return NULL; -} - -union map_info *dm_get_rq_mapinfo(struct request *rq) -{ - if (rq && rq->end_io_data) - return &((struct dm_rq_target_io *)rq->end_io_data)->info; - return NULL; -} -EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); - -#define MINOR_ALLOCED ((void *)-1) - -/* - * Bits for the md->flags field. - */ -#define DMF_BLOCK_IO_FOR_SUSPEND 0 -#define DMF_SUSPENDED 1 -#define DMF_FROZEN 2 -#define DMF_FREEING 3 -#define DMF_DELETING 4 -#define DMF_NOFLUSH_SUSPENDING 5 -#define DMF_MERGE_IS_OPTIONAL 6 - -/* - * Work processed by per-device workqueue. - */ -struct mapped_device { - struct rw_semaphore io_lock; - struct mutex suspend_lock; - rwlock_t map_lock; - atomic_t holders; - atomic_t open_count; - - unsigned long flags; - - struct request_queue *queue; - unsigned type; - /* Protect queue and type against concurrent access. */ - struct mutex type_lock; - - struct target_type *immutable_target_type; - - struct gendisk *disk; - char name[16]; - - void *interface_ptr; - - /* - * A list of ios that arrived while we were suspended. - */ - atomic_t pending[2]; - wait_queue_head_t wait; - struct work_struct work; - struct bio_list deferred; - spinlock_t deferred_lock; - - /* - * Processing queue (flush) - */ - struct workqueue_struct *wq; - - /* - * The current mapping. - */ - struct dm_table *map; - - /* - * io objects are allocated from here. - */ - mempool_t *io_pool; - mempool_t *tio_pool; - - struct bio_set *bs; - - /* - * Event handling. - */ - atomic_t event_nr; - wait_queue_head_t eventq; - atomic_t uevent_seq; - struct list_head uevent_list; - spinlock_t uevent_lock; /* Protect access to uevent_list */ - - /* - * freeze/thaw support require holding onto a super block - */ - struct super_block *frozen_sb; - struct block_device *bdev; - - /* forced geometry settings */ - struct hd_geometry geometry; - - /* sysfs handle */ - struct kobject kobj; - - /* zero-length flush that will be cloned and submitted to targets */ - struct bio flush_bio; -}; - -/* - * For mempools pre-allocation at the table loading time. 
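 *
 * These pools are created while a table is loaded and handed over to the
 * mapped_device by __bind_mempools() below; whatever is left in the table
 * afterwards is released via dm_table_free_md_mempools().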
- */ -struct dm_md_mempools { - mempool_t *io_pool; - mempool_t *tio_pool; - struct bio_set *bs; -}; - -#define MIN_IOS 256 -static struct kmem_cache *_io_cache; -static struct kmem_cache *_tio_cache; -static struct kmem_cache *_rq_tio_cache; -static struct kmem_cache *_rq_bio_info_cache; - -static int __init local_init(void) -{ - int r = -ENOMEM; - - /* allocate a slab for the dm_ios */ - _io_cache = KMEM_CACHE(dm_io, 0); - if (!_io_cache) - return r; - - /* allocate a slab for the target ios */ - _tio_cache = KMEM_CACHE(dm_target_io, 0); - if (!_tio_cache) - goto out_free_io_cache; - - _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); - if (!_rq_tio_cache) - goto out_free_tio_cache; - - _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); - if (!_rq_bio_info_cache) - goto out_free_rq_tio_cache; - - r = dm_uevent_init(); - if (r) - goto out_free_rq_bio_info_cache; - - _major = major; - r = register_blkdev(_major, _name); - if (r < 0) - goto out_uevent_exit; - - if (!_major) - _major = r; - - return 0; - -out_uevent_exit: - dm_uevent_exit(); -out_free_rq_bio_info_cache: - kmem_cache_destroy(_rq_bio_info_cache); -out_free_rq_tio_cache: - kmem_cache_destroy(_rq_tio_cache); -out_free_tio_cache: - kmem_cache_destroy(_tio_cache); -out_free_io_cache: - kmem_cache_destroy(_io_cache); - - return r; -} - -static void local_exit(void) -{ - kmem_cache_destroy(_rq_bio_info_cache); - kmem_cache_destroy(_rq_tio_cache); - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); - unregister_blkdev(_major, _name); - dm_uevent_exit(); - - _major = 0; - - DMINFO("cleaned up"); -} - -static int (*_inits[])(void) __initdata = { - local_init, - dm_target_init, - dm_linear_init, - dm_stripe_init, - dm_io_init, - dm_kcopyd_init, - dm_interface_init, -}; - -static void (*_exits[])(void) = { - local_exit, - dm_target_exit, - dm_linear_exit, - dm_stripe_exit, - dm_io_exit, - dm_kcopyd_exit, - dm_interface_exit, -}; - -static int __init dm_init(void) -{ - const int count = ARRAY_SIZE(_inits); - - int r, i; - - for (i = 0; i < count; i++) { - r = _inits[i](); - if (r) - goto bad; - } - - return 0; - - bad: - while (i--) - _exits[i](); - - return r; -} - -static void __exit dm_exit(void) -{ - int i = ARRAY_SIZE(_exits); - - while (i--) - _exits[i](); - - /* - * Should be empty by this point. - */ - idr_remove_all(&_minor_idr); - idr_destroy(&_minor_idr); -} - -/* - * Block device functions - */ -int dm_deleting_md(struct mapped_device *md) -{ - return test_bit(DMF_DELETING, &md->flags); -} - -static int dm_blk_open(struct block_device *bdev, fmode_t mode) -{ - struct mapped_device *md; - - spin_lock(&_minor_lock); - - md = bdev->bd_disk->private_data; - if (!md) - goto out; - - if (test_bit(DMF_FREEING, &md->flags) || - dm_deleting_md(md)) { - md = NULL; - goto out; - } - - dm_get(md); - atomic_inc(&md->open_count); - -out: - spin_unlock(&_minor_lock); - - return md ? 0 : -ENXIO; -} - -static int dm_blk_close(struct gendisk *disk, fmode_t mode) -{ - struct mapped_device *md = disk->private_data; - - spin_lock(&_minor_lock); - - atomic_dec(&md->open_count); - dm_put(md); - - spin_unlock(&_minor_lock); - - return 0; -} - -int dm_open_count(struct mapped_device *md) -{ - return atomic_read(&md->open_count); -} - -/* - * Guarantees nothing is using the device before it's deleted. 
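 * Returns -EBUSY if the device is still open; otherwise DMF_DELETING is
 * set, which makes dm_blk_open() refuse any further opens.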
- */ -int dm_lock_for_deletion(struct mapped_device *md) -{ - int r = 0; - - spin_lock(&_minor_lock); - - if (dm_open_count(md)) - r = -EBUSY; - else - set_bit(DMF_DELETING, &md->flags); - - spin_unlock(&_minor_lock); - - return r; -} - -static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct mapped_device *md = bdev->bd_disk->private_data; - - return dm_get_geometry(md, geo); -} - -static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct mapped_device *md = bdev->bd_disk->private_data; - struct dm_table *map = dm_get_live_table(md); - struct dm_target *tgt; - int r = -ENOTTY; - - if (!map || !dm_table_get_size(map)) - goto out; - - /* We only support devices that have a single target */ - if (dm_table_get_num_targets(map) != 1) - goto out; - - tgt = dm_table_get_target(map, 0); - - if (dm_suspended_md(md)) { - r = -EAGAIN; - goto out; - } - - if (tgt->type->ioctl) - r = tgt->type->ioctl(tgt, cmd, arg); - -out: - dm_table_put(map); - - return r; -} - -static struct dm_io *alloc_io(struct mapped_device *md) -{ - return mempool_alloc(md->io_pool, GFP_NOIO); -} - -static void free_io(struct mapped_device *md, struct dm_io *io) -{ - mempool_free(io, md->io_pool); -} - -static void free_tio(struct mapped_device *md, struct dm_target_io *tio) -{ - mempool_free(tio, md->tio_pool); -} - -static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, - gfp_t gfp_mask) -{ - return mempool_alloc(md->tio_pool, gfp_mask); -} - -static void free_rq_tio(struct dm_rq_target_io *tio) -{ - mempool_free(tio, tio->md->tio_pool); -} - -static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) -{ - return mempool_alloc(md->io_pool, GFP_ATOMIC); -} - -static void free_bio_info(struct dm_rq_clone_bio_info *info) -{ - mempool_free(info, info->tio->md->io_pool); -} - -static int md_in_flight(struct mapped_device *md) -{ - return atomic_read(&md->pending[READ]) + - atomic_read(&md->pending[WRITE]); -} - -static void start_io_acct(struct dm_io *io) -{ - struct mapped_device *md = io->md; - int cpu; - int rw = bio_data_dir(io->bio); - - io->start_time = jiffies; - - cpu = part_stat_lock(); - part_round_stats(cpu, &dm_disk(md)->part0); - part_stat_unlock(); - atomic_set(&dm_disk(md)->part0.in_flight[rw], - atomic_inc_return(&md->pending[rw])); -} - -static void end_io_acct(struct dm_io *io) -{ - struct mapped_device *md = io->md; - struct bio *bio = io->bio; - unsigned long duration = jiffies - io->start_time; - int pending, cpu; - int rw = bio_data_dir(bio); - - cpu = part_stat_lock(); - part_round_stats(cpu, &dm_disk(md)->part0); - part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); - part_stat_unlock(); - - /* - * After this is decremented the bio must not be touched if it is - * a flush. - */ - pending = atomic_dec_return(&md->pending[rw]); - atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); - pending += atomic_read(&md->pending[rw^0x1]); - - /* nudge anyone waiting on suspend queue */ - if (!pending) - wake_up(&md->wait); -} - -/* - * Add the bio to the list of deferred io. 
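 * Deferred bios are replayed by dm_wq_work() once dm_queue_flush()
 * clears DMF_BLOCK_IO_FOR_SUSPEND.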
- */ -static void queue_io(struct mapped_device *md, struct bio *bio) -{ - unsigned long flags; - - spin_lock_irqsave(&md->deferred_lock, flags); - bio_list_add(&md->deferred, bio); - spin_unlock_irqrestore(&md->deferred_lock, flags); - queue_work(md->wq, &md->work); -} - -/* - * Everyone (including functions in this file), should use this - * function to access the md->map field, and make sure they call - * dm_table_put() when finished. - */ -struct dm_table *dm_get_live_table(struct mapped_device *md) -{ - struct dm_table *t; - unsigned long flags; - - read_lock_irqsave(&md->map_lock, flags); - t = md->map; - if (t) - dm_table_get(t); - read_unlock_irqrestore(&md->map_lock, flags); - - return t; -} - -/* - * Get the geometry associated with a dm device - */ -int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) -{ - *geo = md->geometry; - - return 0; -} - -/* - * Set the geometry of a device. - */ -int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) -{ - sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; - - if (geo->start > sz) { - DMWARN("Start sector is beyond the geometry limits."); - return -EINVAL; - } - - md->geometry = *geo; - - return 0; -} - -/*----------------------------------------------------------------- - * CRUD START: - * A more elegant soln is in the works that uses the queue - * merge fn, unfortunately there are a couple of changes to - * the block layer that I want to make for this. So in the - * interests of getting something for people to use I give - * you this clearly demarcated crap. - *---------------------------------------------------------------*/ - -static int __noflush_suspending(struct mapped_device *md) -{ - return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); -} - -/* - * Decrements the number of outstanding ios that a bio has been - * cloned into, completing the original io if necc. - */ -static void dec_pending(struct dm_io *io, int error) -{ - unsigned long flags; - int io_error; - struct bio *bio; - struct mapped_device *md = io->md; - - /* Push-back supersedes any I/O errors */ - if (unlikely(error)) { - spin_lock_irqsave(&io->endio_lock, flags); - if (!(io->error > 0 && __noflush_suspending(md))) - io->error = error; - spin_unlock_irqrestore(&io->endio_lock, flags); - } - - if (atomic_dec_and_test(&io->io_count)) { - if (io->error == DM_ENDIO_REQUEUE) { - /* - * Target requested pushing back the I/O. - */ - spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md)) - bio_list_add_head(&md->deferred, io->bio); - else - /* noflush suspend was interrupted. */ - io->error = -EIO; - spin_unlock_irqrestore(&md->deferred_lock, flags); - } - - io_error = io->error; - bio = io->bio; - end_io_acct(io); - free_io(md, io); - - if (io_error == DM_ENDIO_REQUEUE) - return; - - if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { - /* - * Preflush done for flush with data, reissue - * without REQ_FLUSH. 
- */ - bio->bi_rw &= ~REQ_FLUSH; - queue_io(md, bio); - } else { - /* done with normal IO or empty flush */ - trace_block_bio_complete(md->queue, bio, io_error); - bio_endio(bio, io_error); - } - } -} - -static void clone_endio(struct bio *bio, int error) -{ - int r = 0; - struct dm_target_io *tio = bio->bi_private; - struct dm_io *io = tio->io; - struct mapped_device *md = tio->io->md; - dm_endio_fn endio = tio->ti->type->end_io; - - if (!bio_flagged(bio, BIO_UPTODATE) && !error) - error = -EIO; - - if (endio) { - r = endio(tio->ti, bio, error, &tio->info); - if (r < 0 || r == DM_ENDIO_REQUEUE) - /* - * error and requeue request are handled - * in dec_pending(). - */ - error = r; - else if (r == DM_ENDIO_INCOMPLETE) - /* The target will handle the io */ - return; - else if (r) { - DMWARN("unimplemented target endio return value: %d", r); - BUG(); - } - } - - /* - * Store md for cleanup instead of tio which is about to get freed. - */ - bio->bi_private = md->bs; - - free_tio(md, tio); - bio_put(bio); - dec_pending(io, error); -} - -/* - * Partial completion handling for request-based dm - */ -static void end_clone_bio(struct bio *clone, int error) -{ - struct dm_rq_clone_bio_info *info = clone->bi_private; - struct dm_rq_target_io *tio = info->tio; - struct bio *bio = info->orig; - unsigned int nr_bytes = info->orig->bi_size; - - bio_put(clone); - - if (tio->error) - /* - * An error has already been detected on the request. - * Once error occurred, just let clone->end_io() handle - * the remainder. - */ - return; - else if (error) { - /* - * Don't notice the error to the upper layer yet. - * The error handling decision is made by the target driver, - * when the request is completed. - */ - tio->error = error; - return; - } - - /* - * I/O for the bio successfully completed. - * Notice the data completion to the upper layer. - */ - - /* - * bios are processed from the head of the list. - * So the completing bio should always be rq->bio. - * If it's not, something wrong is happening. - */ - if (tio->orig->bio != bio) - DMERR("bio completion is going in the middle of the request"); - - /* - * Update the original request. - * Do not use blk_end_request() here, because it may complete - * the original request before the clone, and break the ordering. - */ - blk_update_request(tio->orig, 0, nr_bytes); -} - -/* - * Don't touch any member of the md after calling this function because - * the md may be freed in dm_put() at the end of this function. - * Or do dm_get() before calling this function and dm_put() later. - */ -static void rq_completed(struct mapped_device *md, int rw, int run_queue) -{ - atomic_dec(&md->pending[rw]); - - /* nudge anyone waiting on suspend queue */ - if (!md_in_flight(md)) - wake_up(&md->wait); - - if (run_queue) - blk_run_queue(md->queue); - - /* - * dm_put() must be at the end of this function. See the comment above - */ - dm_put(md); -} - -static void free_rq_clone(struct request *clone) -{ - struct dm_rq_target_io *tio = clone->end_io_data; - - blk_rq_unprep_clone(clone); - free_rq_tio(tio); -} - -/* - * Complete the clone and the original request. - * Must be called without queue lock. 
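 * (blk_end_request_all() and the queue run in rq_completed() take the
 * queue lock themselves, hence the restriction.)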
- */ -static void dm_end_request(struct request *clone, int error) -{ - int rw = rq_data_dir(clone); - struct dm_rq_target_io *tio = clone->end_io_data; - struct mapped_device *md = tio->md; - struct request *rq = tio->orig; - - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { - rq->errors = clone->errors; - rq->resid_len = clone->resid_len; - - if (rq->sense) - /* - * We are using the sense buffer of the original - * request. - * So setting the length of the sense data is enough. - */ - rq->sense_len = clone->sense_len; - } - - free_rq_clone(clone); - blk_end_request_all(rq, error); - rq_completed(md, rw, true); -} - -static void dm_unprep_request(struct request *rq) -{ - struct request *clone = rq->special; - - rq->special = NULL; - rq->cmd_flags &= ~REQ_DONTPREP; - - free_rq_clone(clone); -} - -/* - * Requeue the original request of a clone. - */ -void dm_requeue_unmapped_request(struct request *clone) -{ - int rw = rq_data_dir(clone); - struct dm_rq_target_io *tio = clone->end_io_data; - struct mapped_device *md = tio->md; - struct request *rq = tio->orig; - struct request_queue *q = rq->q; - unsigned long flags; - - dm_unprep_request(rq); - - spin_lock_irqsave(q->queue_lock, flags); - blk_requeue_request(q, rq); - spin_unlock_irqrestore(q->queue_lock, flags); - - rq_completed(md, rw, 0); -} -EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); - -static void __stop_queue(struct request_queue *q) -{ - blk_stop_queue(q); -} - -static void stop_queue(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - __stop_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void __start_queue(struct request_queue *q) -{ - if (blk_queue_stopped(q)) - blk_start_queue(q); -} - -static void start_queue(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - __start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void dm_done(struct request *clone, int error, bool mapped) -{ - int r = error; - struct dm_rq_target_io *tio = clone->end_io_data; - dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; - - if (mapped && rq_end_io) - r = rq_end_io(tio->ti, clone, error, &tio->info); - - if (r <= 0) - /* The target wants to complete the I/O */ - dm_end_request(clone, r); - else if (r == DM_ENDIO_INCOMPLETE) - /* The target will handle the I/O */ - return; - else if (r == DM_ENDIO_REQUEUE) - /* The target wants to requeue the I/O */ - dm_requeue_unmapped_request(clone); - else { - DMWARN("unimplemented target endio return value: %d", r); - BUG(); - } -} - -/* - * Request completion handler for request-based dm - */ -static void dm_softirq_done(struct request *rq) -{ - bool mapped = true; - struct request *clone = rq->completion_data; - struct dm_rq_target_io *tio = clone->end_io_data; - - if (rq->cmd_flags & REQ_FAILED) - mapped = false; - - dm_done(clone, tio->error, mapped); -} - -/* - * Complete the clone and the original request with the error status - * through softirq context. - */ -static void dm_complete_request(struct request *clone, int error) -{ - struct dm_rq_target_io *tio = clone->end_io_data; - struct request *rq = tio->orig; - - tio->error = error; - rq->completion_data = clone; - blk_complete_request(rq); -} - -/* - * Complete the not-mapped clone and the original request with the error status - * through softirq context. - * Target's rq_end_io() function isn't called. - * This may be used when the target's map_rq() function fails. 
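 *
 * Completion then happens in softirq context: dm_softirq_done() sees
 * REQ_FAILED on the original request, passes mapped=false to dm_done(),
 * and the target's rq_end_io handler is therefore skipped.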
- */ -void dm_kill_unmapped_request(struct request *clone, int error) -{ - struct dm_rq_target_io *tio = clone->end_io_data; - struct request *rq = tio->orig; - - rq->cmd_flags |= REQ_FAILED; - dm_complete_request(clone, error); -} -EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); - -/* - * Called with the queue lock held - */ -static void end_clone_request(struct request *clone, int error) -{ - /* - * For just cleaning up the information of the queue in which - * the clone was dispatched. - * The clone is *NOT* freed actually here because it is alloced from - * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. - */ - __blk_put_request(clone->q, clone); - - /* - * Actual request completion is done in a softirq context which doesn't - * hold the queue lock. Otherwise, deadlock could occur because: - * - another request may be submitted by the upper level driver - * of the stacking during the completion - * - the submission which requires queue lock may be done - * against this queue - */ - dm_complete_request(clone, error); -} - -/* - * Return maximum size of I/O possible at the supplied sector up to the current - * target boundary. - */ -static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) -{ - sector_t target_offset = dm_target_offset(ti, sector); - - return ti->len - target_offset; -} - -static sector_t max_io_len(sector_t sector, struct dm_target *ti) -{ - sector_t len = max_io_len_target_boundary(sector, ti); - - /* - * Does the target need to split even further ? - */ - if (ti->split_io) { - sector_t boundary; - sector_t offset = dm_target_offset(ti, sector); - boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) - - offset; - if (len > boundary) - len = boundary; - } - - return len; -} - -static void __map_bio(struct dm_target *ti, struct bio *clone, - struct dm_target_io *tio) -{ - int r; - sector_t sector; - struct mapped_device *md; - - clone->bi_end_io = clone_endio; - clone->bi_private = tio; - - /* - * Map the clone. If r == 0 we don't need to do - * anything, the target has assumed ownership of - * this io. - */ - atomic_inc(&tio->io->io_count); - sector = clone->bi_sector; - r = ti->type->map(ti, clone, &tio->info); - if (r == DM_MAPIO_REMAPPED) { - /* the bio has been remapped so dispatch it */ - - trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, sector); - - generic_make_request(clone); - } else if (r < 0 || r == DM_MAPIO_REQUEUE) { - /* error the io and bail out, or requeue it if needed */ - md = tio->io->md; - dec_pending(tio->io, r); - /* - * Store bio_set for cleanup. - */ - clone->bi_end_io = NULL; - clone->bi_private = md->bs; - bio_put(clone); - free_tio(md, tio); - } else if (r) { - DMWARN("unimplemented target map return value: %d", r); - BUG(); - } -} - -struct clone_info { - struct mapped_device *md; - struct dm_table *map; - struct bio *bio; - struct dm_io *io; - sector_t sector; - sector_t sector_count; - unsigned short idx; -}; - -static void dm_bio_destructor(struct bio *bio) -{ - struct bio_set *bs = bio->bi_private; - - bio_free(bio, bs); -} - -/* - * Creates a little bio that just does part of a bvec. 
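 *
 * For example, a bvec that crosses a target boundary 4 sectors in is
 * handled by __clone_and_map() issuing a 4-sector clone through this
 * helper and then continuing with the rest of the bvec at the next
 * target.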
- */ -static struct bio *split_bvec(struct bio *bio, sector_t sector, - unsigned short idx, unsigned int offset, - unsigned int len, struct bio_set *bs) -{ - struct bio *clone; - struct bio_vec *bv = bio->bi_io_vec + idx; - - clone = bio_alloc_bioset(GFP_NOIO, 1, bs); - clone->bi_destructor = dm_bio_destructor; - *clone->bi_io_vec = *bv; - - clone->bi_sector = sector; - clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; - clone->bi_vcnt = 1; - clone->bi_size = to_bytes(len); - clone->bi_io_vec->bv_offset = offset; - clone->bi_io_vec->bv_len = clone->bi_size; - clone->bi_flags |= 1 << BIO_CLONED; - - if (bio_integrity(bio)) { - bio_integrity_clone(clone, bio, GFP_NOIO, bs); - bio_integrity_trim(clone, - bio_sector_offset(bio, idx, offset), len); - } - - return clone; -} - -/* - * Creates a bio that consists of range of complete bvecs. - */ -static struct bio *clone_bio(struct bio *bio, sector_t sector, - unsigned short idx, unsigned short bv_count, - unsigned int len, struct bio_set *bs) -{ - struct bio *clone; - - clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); - __bio_clone(clone, bio); - clone->bi_destructor = dm_bio_destructor; - clone->bi_sector = sector; - clone->bi_idx = idx; - clone->bi_vcnt = idx + bv_count; - clone->bi_size = to_bytes(len); - clone->bi_flags &= ~(1 << BIO_SEG_VALID); - - if (bio_integrity(bio)) { - bio_integrity_clone(clone, bio, GFP_NOIO, bs); - - if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) - bio_integrity_trim(clone, - bio_sector_offset(bio, idx, 0), len); - } - - return clone; -} - -static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti) -{ - struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); - - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); - - return tio; -} - -static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, - unsigned request_nr, sector_t len) -{ - struct dm_target_io *tio = alloc_tio(ci, ti); - struct bio *clone; - - tio->info.target_request_nr = request_nr; - - /* - * Discard requests require the bio's inline iovecs be initialized. - * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush - * and discard, so no need for concern about wasted bvec allocations. - */ - clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); - __bio_clone(clone, ci->bio); - clone->bi_destructor = dm_bio_destructor; - if (len) { - clone->bi_sector = ci->sector; - clone->bi_size = to_bytes(len); - } - - __map_bio(ti, clone, tio); -} - -static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, - unsigned num_requests, sector_t len) -{ - unsigned request_nr; - - for (request_nr = 0; request_nr < num_requests; request_nr++) - __issue_target_request(ci, ti, request_nr, len); -} - -static int __clone_and_map_empty_flush(struct clone_info *ci) -{ - unsigned target_nr = 0; - struct dm_target *ti; - - BUG_ON(bio_has_data(ci->bio)); - while ((ti = dm_table_get_target(ci->map, target_nr++))) - __issue_target_requests(ci, ti, ti->num_flush_requests, 0); - - return 0; -} - -/* - * Perform all io with a single clone. 
- */ -static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) -{ - struct bio *clone, *bio = ci->bio; - struct dm_target_io *tio; - - tio = alloc_tio(ci, ti); - clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count, - ci->md->bs); - __map_bio(ti, clone, tio); - ci->sector_count = 0; -} - -static int __clone_and_map_discard(struct clone_info *ci) -{ - struct dm_target *ti; - sector_t len; - - do { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - /* - * Even though the device advertised discard support, - * that does not mean every target supports it, and - * reconfiguration might also have changed that since the - * check was performed. - */ - if (!ti->num_discard_requests) - return -EOPNOTSUPP; - - len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); - - __issue_target_requests(ci, ti, ti->num_discard_requests, len); - - ci->sector += len; - } while (ci->sector_count -= len); - - return 0; -} - -static int __clone_and_map(struct clone_info *ci) -{ - struct bio *clone, *bio = ci->bio; - struct dm_target *ti; - sector_t len = 0, max; - struct dm_target_io *tio; - - if (unlikely(bio->bi_rw & REQ_DISCARD)) - return __clone_and_map_discard(ci); - - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - max = max_io_len(ci->sector, ti); - - if (ci->sector_count <= max) { - /* - * Optimise for the simple case where we can do all of - * the remaining io with a single clone. - */ - __clone_and_map_simple(ci, ti); - - } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { - /* - * There are some bvecs that don't span targets. - * Do as many of these as possible. - */ - int i; - sector_t remaining = max; - sector_t bv_len; - - for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { - bv_len = to_sector(bio->bi_io_vec[i].bv_len); - - if (bv_len > remaining) - break; - - remaining -= bv_len; - len += bv_len; - } - - tio = alloc_tio(ci, ti); - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, - ci->md->bs); - __map_bio(ti, clone, tio); - - ci->sector += len; - ci->sector_count -= len; - ci->idx = i; - - } else { - /* - * Handle a bvec that must be split between two or more targets. - */ - struct bio_vec *bv = bio->bi_io_vec + ci->idx; - sector_t remaining = to_sector(bv->bv_len); - unsigned int offset = 0; - - do { - if (offset) { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - max = max_io_len(ci->sector, ti); - } - - len = min(remaining, max); - - tio = alloc_tio(ci, ti); - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + offset, len, - ci->md->bs); - - __map_bio(ti, clone, tio); - - ci->sector += len; - ci->sector_count -= len; - offset += to_bytes(len); - } while (remaining -= len); - - ci->idx++; - } - - return 0; -} - -/* - * Split the bio into several clones and submit it to targets. 
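 *
 * Flush bios become one empty clone per target, discards are issued per
 * target by __clone_and_map_discard(), and everything else is handled by
 * __clone_and_map(): clone the whole remainder if it fits one target,
 * clone a run of complete bvecs, or split a bvec that straddles a
 * target boundary.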
- */ -static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) -{ - struct clone_info ci; - int error = 0; - - ci.map = dm_get_live_table(md); - if (unlikely(!ci.map)) { - bio_io_error(bio); - return; - } - - ci.md = md; - ci.io = alloc_io(md); - ci.io->error = 0; - atomic_set(&ci.io->io_count, 1); - ci.io->bio = bio; - ci.io->md = md; - spin_lock_init(&ci.io->endio_lock); - ci.sector = bio->bi_sector; - ci.idx = bio->bi_idx; - - start_io_acct(ci.io); - if (bio->bi_rw & REQ_FLUSH) { - ci.bio = &ci.md->flush_bio; - ci.sector_count = 0; - error = __clone_and_map_empty_flush(&ci); - /* dec_pending submits any data associated with flush */ - } else { - ci.bio = bio; - ci.sector_count = bio_sectors(bio); - while (ci.sector_count && !error) - error = __clone_and_map(&ci); - } - - /* drop the extra reference count */ - dec_pending(ci.io, error); - dm_table_put(ci.map); -} -/*----------------------------------------------------------------- - * CRUD END - *---------------------------------------------------------------*/ - -static int dm_merge_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); - struct dm_target *ti; - sector_t max_sectors; - int max_size = 0; - - if (unlikely(!map)) - goto out; - - ti = dm_table_find_target(map, bvm->bi_sector); - if (!dm_target_is_valid(ti)) - goto out_table; - - /* - * Find maximum amount of I/O that won't need splitting - */ - max_sectors = min(max_io_len(bvm->bi_sector, ti), - (sector_t) BIO_MAX_SECTORS); - max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - if (max_size < 0) - max_size = 0; - - /* - * merge_bvec_fn() returns number of bytes - * it can accept at this offset - * max is precomputed maximal io size - */ - if (max_size && ti->type->merge) - max_size = ti->type->merge(ti, bvm, biovec, max_size); - /* - * If the target doesn't support merge method and some of the devices - * provided their merge_bvec method (we know this by looking at - * queue_max_hw_sectors), then we can't allow bios with multiple vector - * entries. So always set max_size to 0, and the code below allows - * just one page. - */ - else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) - - max_size = 0; - -out_table: - dm_table_put(map); - -out: - /* - * Always allow an entire first page - */ - if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) - max_size = biovec->bv_len; - - return max_size; -} - -/* - * The request function that just remaps the bio built up by - * dm_merge_bvec. 
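 *
 * dm_merge_bvec() is registered as the queue's merge_bvec_fn in
 * dm_init_md_queue(), so by the time a bio arrives here it normally
 * already fits within a single target.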
- */ -static void _dm_request(struct request_queue *q, struct bio *bio) -{ - int rw = bio_data_dir(bio); - struct mapped_device *md = q->queuedata; - int cpu; - - down_read(&md->io_lock); - - cpu = part_stat_lock(); - part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); - part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); - part_stat_unlock(); - - /* if we're suspended, we have to queue this io for later */ - if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { - up_read(&md->io_lock); - - if (bio_rw(bio) != READA) - queue_io(md, bio); - else - bio_io_error(bio); - return; - } - - __split_and_process_bio(md, bio); - up_read(&md->io_lock); - return; -} - -static int dm_request_based(struct mapped_device *md) -{ - return blk_queue_stackable(md->queue); -} - -static void dm_request(struct request_queue *q, struct bio *bio) -{ - struct mapped_device *md = q->queuedata; - - if (dm_request_based(md)) - blk_queue_bio(q, bio); - else - _dm_request(q, bio); -} - -void dm_dispatch_request(struct request *rq) -{ - int r; - - if (blk_queue_io_stat(rq->q)) - rq->cmd_flags |= REQ_IO_STAT; - - rq->start_time = jiffies; - r = blk_insert_cloned_request(rq->q, rq); - if (r) - dm_complete_request(rq, r); -} -EXPORT_SYMBOL_GPL(dm_dispatch_request); - -static void dm_rq_bio_destructor(struct bio *bio) -{ - struct dm_rq_clone_bio_info *info = bio->bi_private; - struct mapped_device *md = info->tio->md; - - free_bio_info(info); - bio_free(bio, md->bs); -} - -static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, - void *data) -{ - struct dm_rq_target_io *tio = data; - struct mapped_device *md = tio->md; - struct dm_rq_clone_bio_info *info = alloc_bio_info(md); - - if (!info) - return -ENOMEM; - - info->orig = bio_orig; - info->tio = tio; - bio->bi_end_io = end_clone_bio; - bio->bi_private = info; - bio->bi_destructor = dm_rq_bio_destructor; - - return 0; -} - -static int setup_clone(struct request *clone, struct request *rq, - struct dm_rq_target_io *tio) -{ - int r; - - r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, - dm_rq_bio_constructor, tio); - if (r) - return r; - - clone->cmd = rq->cmd; - clone->cmd_len = rq->cmd_len; - clone->sense = rq->sense; - clone->buffer = rq->buffer; - clone->end_io = end_clone_request; - clone->end_io_data = tio; - - return 0; -} - -static struct request *clone_rq(struct request *rq, struct mapped_device *md, - gfp_t gfp_mask) -{ - struct request *clone; - struct dm_rq_target_io *tio; - - tio = alloc_rq_tio(md, gfp_mask); - if (!tio) - return NULL; - - tio->md = md; - tio->ti = NULL; - tio->orig = rq; - tio->error = 0; - memset(&tio->info, 0, sizeof(tio->info)); - - clone = &tio->clone; - if (setup_clone(clone, rq, tio)) { - /* -ENOMEM */ - free_rq_tio(tio); - return NULL; - } - - return clone; -} - -/* - * Called with the queue lock held. 
- */ -static int dm_prep_fn(struct request_queue *q, struct request *rq) -{ - struct mapped_device *md = q->queuedata; - struct request *clone; - - if (unlikely(rq->special)) { - DMWARN("Already has something in rq->special."); - return BLKPREP_KILL; - } - - clone = clone_rq(rq, md, GFP_ATOMIC); - if (!clone) - return BLKPREP_DEFER; - - rq->special = clone; - rq->cmd_flags |= REQ_DONTPREP; - - return BLKPREP_OK; -} - -/* - * Returns: - * 0 : the request has been processed (not requeued) - * !0 : the request has been requeued - */ -static int map_request(struct dm_target *ti, struct request *clone, - struct mapped_device *md) -{ - int r, requeued = 0; - struct dm_rq_target_io *tio = clone->end_io_data; - - /* - * Hold the md reference here for the in-flight I/O. - * We can't rely on the reference count by device opener, - * because the device may be closed during the request completion - * when all bios are completed. - * See the comment in rq_completed() too. - */ - dm_get(md); - - tio->ti = ti; - r = ti->type->map_rq(ti, clone, &tio->info); - switch (r) { - case DM_MAPIO_SUBMITTED: - /* The target has taken the I/O to submit by itself later */ - break; - case DM_MAPIO_REMAPPED: - /* The target has remapped the I/O so dispatch it */ - trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), - blk_rq_pos(tio->orig)); - dm_dispatch_request(clone); - break; - case DM_MAPIO_REQUEUE: - /* The target wants to requeue the I/O */ - dm_requeue_unmapped_request(clone); - requeued = 1; - break; - default: - if (r > 0) { - DMWARN("unimplemented target map return value: %d", r); - BUG(); - } - - /* The target wants to complete the I/O */ - dm_kill_unmapped_request(clone, r); - break; - } - - return requeued; -} - -/* - * q->request_fn for request-based dm. - * Called with the queue lock held. - */ -static void dm_request_fn(struct request_queue *q) -{ - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); - struct dm_target *ti; - struct request *rq, *clone; - sector_t pos; - - /* - * For suspend, check blk_queue_stopped() and increment - * ->pending within a single queue_lock not to increment the - * number of in-flight I/Os after the queue is stopped in - * dm_suspend(). 
- */ - while (!blk_queue_stopped(q)) { - rq = blk_peek_request(q); - if (!rq) - goto delay_and_out; - - /* always use block 0 to find the target for flushes for now */ - pos = 0; - if (!(rq->cmd_flags & REQ_FLUSH)) - pos = blk_rq_pos(rq); - - ti = dm_table_find_target(map, pos); - BUG_ON(!dm_target_is_valid(ti)); - - if (ti->type->busy && ti->type->busy(ti)) - goto delay_and_out; - - blk_start_request(rq); - clone = rq->special; - atomic_inc(&md->pending[rq_data_dir(clone)]); - - spin_unlock(q->queue_lock); - if (map_request(ti, clone, md)) - goto requeued; - - BUG_ON(!irqs_disabled()); - spin_lock(q->queue_lock); - } - - goto out; - -requeued: - BUG_ON(!irqs_disabled()); - spin_lock(q->queue_lock); - -delay_and_out: - blk_delay_queue(q, HZ / 10); -out: - dm_table_put(map); - - return; -} - -int dm_underlying_device_busy(struct request_queue *q) -{ - return blk_lld_busy(q); -} -EXPORT_SYMBOL_GPL(dm_underlying_device_busy); - -static int dm_lld_busy(struct request_queue *q) -{ - int r; - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); - - if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) - r = 1; - else - r = dm_table_any_busy_target(map); - - dm_table_put(map); - - return r; -} - -static int dm_any_congested(void *congested_data, int bdi_bits) -{ - int r = bdi_bits; - struct mapped_device *md = congested_data; - struct dm_table *map; - - if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_live_table(md); - if (map) { - /* - * Request-based dm cares about only own queue for - * the query about congestion status of request_queue - */ - if (dm_request_based(md)) - r = md->queue->backing_dev_info.state & - bdi_bits; - else - r = dm_table_any_congested(map, bdi_bits); - - dm_table_put(map); - } - } - - return r; -} - -/*----------------------------------------------------------------- - * An IDR is used to keep track of allocated minor numbers. - *---------------------------------------------------------------*/ -static void free_minor(int minor) -{ - spin_lock(&_minor_lock); - idr_remove(&_minor_idr, minor); - spin_unlock(&_minor_lock); -} - -/* - * See if the device with a specific minor # is free. - */ -static int specific_minor(int minor) -{ - int r, m; - - if (minor >= (1 << MINORBITS)) - return -EINVAL; - - r = idr_pre_get(&_minor_idr, GFP_KERNEL); - if (!r) - return -ENOMEM; - - spin_lock(&_minor_lock); - - if (idr_find(&_minor_idr, minor)) { - r = -EBUSY; - goto out; - } - - r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); - if (r) - goto out; - - if (m != minor) { - idr_remove(&_minor_idr, m); - r = -EBUSY; - goto out; - } - -out: - spin_unlock(&_minor_lock); - return r; -} - -static int next_free_minor(int *minor) -{ - int r, m; - - r = idr_pre_get(&_minor_idr, GFP_KERNEL); - if (!r) - return -ENOMEM; - - spin_lock(&_minor_lock); - - r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); - if (r) - goto out; - - if (m >= (1 << MINORBITS)) { - idr_remove(&_minor_idr, m); - r = -ENOSPC; - goto out; - } - - *minor = m; - -out: - spin_unlock(&_minor_lock); - return r; -} - -static const struct block_device_operations dm_blk_dops; - -static void dm_wq_work(struct work_struct *work); - -static void dm_init_md_queue(struct mapped_device *md) -{ - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device has not been decided yet. - * The type is decided at the first table loading time. 
- * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - - md->queue->queuedata = md; - md->queue->backing_dev_info.congested_fn = dm_any_congested; - md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); - blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); - blk_queue_merge_bvec(md->queue, dm_merge_bvec); -} - -/* - * Allocate and initialise a blank device with a given minor. - */ -static struct mapped_device *alloc_dev(int minor) -{ - int r; - struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); - void *old_md; - - if (!md) { - DMWARN("unable to allocate device, out of memory."); - return NULL; - } - - if (!try_module_get(THIS_MODULE)) - goto bad_module_get; - - /* get a minor number for the dev */ - if (minor == DM_ANY_MINOR) - r = next_free_minor(&minor); - else - r = specific_minor(minor); - if (r < 0) - goto bad_minor; - - md->type = DM_TYPE_NONE; - init_rwsem(&md->io_lock); - mutex_init(&md->suspend_lock); - mutex_init(&md->type_lock); - spin_lock_init(&md->deferred_lock); - rwlock_init(&md->map_lock); - atomic_set(&md->holders, 1); - atomic_set(&md->open_count, 0); - atomic_set(&md->event_nr, 0); - atomic_set(&md->uevent_seq, 0); - INIT_LIST_HEAD(&md->uevent_list); - spin_lock_init(&md->uevent_lock); - - md->queue = blk_alloc_queue(GFP_KERNEL); - if (!md->queue) - goto bad_queue; - - dm_init_md_queue(md); - - md->disk = alloc_disk(1); - if (!md->disk) - goto bad_disk; - - atomic_set(&md->pending[0], 0); - atomic_set(&md->pending[1], 0); - init_waitqueue_head(&md->wait); - INIT_WORK(&md->work, dm_wq_work); - init_waitqueue_head(&md->eventq); - - md->disk->major = _major; - md->disk->first_minor = minor; - md->disk->fops = &dm_blk_dops; - md->disk->queue = md->queue; - md->disk->private_data = md; - sprintf(md->disk->disk_name, "dm-%d", minor); - add_disk(md->disk); - format_dev_t(md->name, MKDEV(_major, minor)); - - md->wq = alloc_workqueue("kdmflush", - WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); - if (!md->wq) - goto bad_thread; - - md->bdev = bdget_disk(md->disk, 0); - if (!md->bdev) - goto bad_bdev; - - bio_init(&md->flush_bio); - md->flush_bio.bi_bdev = md->bdev; - md->flush_bio.bi_rw = WRITE_FLUSH; - - /* Populate the mapping, nobody knows we exist yet */ - spin_lock(&_minor_lock); - old_md = idr_replace(&_minor_idr, md, minor); - spin_unlock(&_minor_lock); - - BUG_ON(old_md != MINOR_ALLOCED); - - return md; - -bad_bdev: - destroy_workqueue(md->wq); -bad_thread: - del_gendisk(md->disk); - put_disk(md->disk); -bad_disk: - blk_cleanup_queue(md->queue); -bad_queue: - free_minor(minor); -bad_minor: - module_put(THIS_MODULE); -bad_module_get: - kfree(md); - return NULL; -} - -static void unlock_fs(struct mapped_device *md); - -static void free_dev(struct mapped_device *md) -{ - int minor = MINOR(disk_devt(md->disk)); - - unlock_fs(md); - bdput(md->bdev); - destroy_workqueue(md->wq); - if (md->tio_pool) - mempool_destroy(md->tio_pool); - if (md->io_pool) - mempool_destroy(md->io_pool); - if (md->bs) - bioset_free(md->bs); - blk_integrity_unregister(md->disk); - del_gendisk(md->disk); - free_minor(minor); - - spin_lock(&_minor_lock); - md->disk->private_data = NULL; - spin_unlock(&_minor_lock); - - put_disk(md->disk); - blk_cleanup_queue(md->queue); - module_put(THIS_MODULE); - kfree(md); -} - -static void __bind_mempools(struct mapped_device *md, struct 
dm_table *t) -{ - struct dm_md_mempools *p; - - if (md->io_pool && md->tio_pool && md->bs) - /* the md already has necessary mempools */ - goto out; - - p = dm_table_get_md_mempools(t); - BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); - - md->io_pool = p->io_pool; - p->io_pool = NULL; - md->tio_pool = p->tio_pool; - p->tio_pool = NULL; - md->bs = p->bs; - p->bs = NULL; - -out: - /* mempool bind completed, now no need any mempools in the table */ - dm_table_free_md_mempools(t); -} - -/* - * Bind a table to the device. - */ -static void event_callback(void *context) -{ - unsigned long flags; - LIST_HEAD(uevents); - struct mapped_device *md = (struct mapped_device *) context; - - spin_lock_irqsave(&md->uevent_lock, flags); - list_splice_init(&md->uevent_list, &uevents); - spin_unlock_irqrestore(&md->uevent_lock, flags); - - dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); - - atomic_inc(&md->event_nr); - wake_up(&md->eventq); -} - -/* - * Protected by md->suspend_lock obtained by dm_swap_table(). - */ -static void __set_size(struct mapped_device *md, sector_t size) -{ - set_capacity(md->disk, size); - - i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); -} - -/* - * Return 1 if the queue has a compulsory merge_bvec_fn function. - * - * If this function returns 0, then the device is either a non-dm - * device without a merge_bvec_fn, or it is a dm device that is - * able to split any bios it receives that are too big. - */ -int dm_queue_merge_is_compulsory(struct request_queue *q) -{ - struct mapped_device *dev_md; - - if (!q->merge_bvec_fn) - return 0; - - if (q->make_request_fn == dm_request) { - dev_md = q->queuedata; - if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) - return 0; - } - - return 1; -} - -static int dm_device_merge_is_compulsory(struct dm_target *ti, - struct dm_dev *dev, sector_t start, - sector_t len, void *data) -{ - struct block_device *bdev = dev->bdev; - struct request_queue *q = bdev_get_queue(bdev); - - return dm_queue_merge_is_compulsory(q); -} - -/* - * Return 1 if it is acceptable to ignore merge_bvec_fn based - * on the properties of the underlying devices. - */ -static int dm_table_merge_is_optional(struct dm_table *table) -{ - unsigned i = 0; - struct dm_target *ti; - - while (i < dm_table_get_num_targets(table)) { - ti = dm_table_get_target(table, i++); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) - return 0; - } - - return 1; -} - -/* - * Returns old map, which caller must destroy. - */ -static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, - struct queue_limits *limits) -{ - struct dm_table *old_map; - struct request_queue *q = md->queue; - sector_t size; - unsigned long flags; - int merge_is_optional; - - size = dm_table_get_size(t); - - /* - * Wipe any geometry if the size of the table changed. - */ - if (size != get_capacity(md->disk)) - memset(&md->geometry, 0, sizeof(md->geometry)); - - __set_size(md, size); - - dm_table_event_callback(t, event_callback, md); - - /* - * The queue hasn't been stopped yet, if the old table type wasn't - * for request-based during suspension. So stop it to prevent - * I/O mapping before resume. - * This must be done before setting the queue restrictions, - * because request-based dm may be run just after the setting. 
- */ - if (dm_table_request_based(t) && !blk_queue_stopped(q)) - stop_queue(q); - - __bind_mempools(md, t); - - merge_is_optional = dm_table_merge_is_optional(t); - - write_lock_irqsave(&md->map_lock, flags); - old_map = md->map; - md->map = t; - md->immutable_target_type = dm_table_get_immutable_target_type(t); - - dm_table_set_restrictions(t, q, limits); - if (merge_is_optional) - set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - else - clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - write_unlock_irqrestore(&md->map_lock, flags); - - return old_map; -} - -/* - * Returns unbound table for the caller to free. - */ -static struct dm_table *__unbind(struct mapped_device *md) -{ - struct dm_table *map = md->map; - unsigned long flags; - - if (!map) - return NULL; - - dm_table_event_callback(map, NULL, NULL); - write_lock_irqsave(&md->map_lock, flags); - md->map = NULL; - write_unlock_irqrestore(&md->map_lock, flags); - - return map; -} - -/* - * Constructor for a new device. - */ -int dm_create(int minor, struct mapped_device **result) -{ - struct mapped_device *md; - - md = alloc_dev(minor); - if (!md) - return -ENXIO; - - dm_sysfs_init(md); - - *result = md; - return 0; -} - -/* - * Functions to manage md->type. - * All are required to hold md->type_lock. - */ -void dm_lock_md_type(struct mapped_device *md) -{ - mutex_lock(&md->type_lock); -} - -void dm_unlock_md_type(struct mapped_device *md) -{ - mutex_unlock(&md->type_lock); -} - -void dm_set_md_type(struct mapped_device *md, unsigned type) -{ - md->type = type; -} - -unsigned dm_get_md_type(struct mapped_device *md) -{ - return md->type; -} - -struct target_type *dm_get_immutable_target_type(struct mapped_device *md) -{ - return md->immutable_target_type; -} - -/* - * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
- */ -static int dm_init_request_based_queue(struct mapped_device *md) -{ - struct request_queue *q = NULL; - - if (md->queue->elevator) - return 1; - - /* Fully initialize the queue */ - q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); - if (!q) - return 0; - - md->queue = q; - dm_init_md_queue(md); - blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_prep_fn); - blk_queue_lld_busy(md->queue, dm_lld_busy); - - elv_register_queue(md->queue); - - return 1; -} - -/* - * Setup the DM device's queue based on md's type - */ -int dm_setup_md_queue(struct mapped_device *md) -{ - if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && - !dm_init_request_based_queue(md)) { - DMWARN("Cannot initialize queue for request-based mapped device"); - return -EINVAL; - } - - return 0; -} - -static struct mapped_device *dm_find_md(dev_t dev) -{ - struct mapped_device *md; - unsigned minor = MINOR(dev); - - if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) - return NULL; - - spin_lock(&_minor_lock); - - md = idr_find(&_minor_idr, minor); - if (md && (md == MINOR_ALLOCED || - (MINOR(disk_devt(dm_disk(md))) != minor) || - dm_deleting_md(md) || - test_bit(DMF_FREEING, &md->flags))) { - md = NULL; - goto out; - } - -out: - spin_unlock(&_minor_lock); - - return md; -} - -struct mapped_device *dm_get_md(dev_t dev) -{ - struct mapped_device *md = dm_find_md(dev); - - if (md) - dm_get(md); - - return md; -} -EXPORT_SYMBOL_GPL(dm_get_md); - -void *dm_get_mdptr(struct mapped_device *md) -{ - return md->interface_ptr; -} - -void dm_set_mdptr(struct mapped_device *md, void *ptr) -{ - md->interface_ptr = ptr; -} - -void dm_get(struct mapped_device *md) -{ - atomic_inc(&md->holders); - BUG_ON(test_bit(DMF_FREEING, &md->flags)); -} - -const char *dm_device_name(struct mapped_device *md) -{ - return md->name; -} -EXPORT_SYMBOL_GPL(dm_device_name); - -static void __dm_destroy(struct mapped_device *md, bool wait) -{ - struct dm_table *map; - - might_sleep(); - - spin_lock(&_minor_lock); - map = dm_get_live_table(md); - idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); - set_bit(DMF_FREEING, &md->flags); - spin_unlock(&_minor_lock); - - if (!dm_suspended_md(md)) { - dm_table_presuspend_targets(map); - dm_table_postsuspend_targets(map); - } - - /* - * Rare, but there may be I/O requests still going to complete, - * for example. Wait for all references to disappear. - * No one should increment the reference count of the mapped_device, - * after the mapped_device state becomes DMF_FREEING. - */ - if (wait) - while (atomic_read(&md->holders)) - msleep(1); - else if (atomic_read(&md->holders)) - DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", - dm_device_name(md), atomic_read(&md->holders)); - - dm_sysfs_exit(md); - dm_table_put(map); - dm_table_destroy(__unbind(md)); - free_dev(md); -} - -void dm_destroy(struct mapped_device *md) -{ - __dm_destroy(md, true); -} - -void dm_destroy_immediate(struct mapped_device *md) -{ - __dm_destroy(md, false); -} - -void dm_put(struct mapped_device *md) -{ - atomic_dec(&md->holders); -} -EXPORT_SYMBOL_GPL(dm_put); - -static int dm_wait_for_completion(struct mapped_device *md, int interruptible) -{ - int r = 0; - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&md->wait, &wait); - - while (1) { - set_current_state(interruptible); - - if (!md_in_flight(md)) - break; - - if (interruptible == TASK_INTERRUPTIBLE && - signal_pending(current)) { - r = -EINTR; - break; - } - - io_schedule(); - } - set_current_state(TASK_RUNNING); - - remove_wait_queue(&md->wait, &wait); - - return r; -} - -/* - * Process the deferred bios - */ -static void dm_wq_work(struct work_struct *work) -{ - struct mapped_device *md = container_of(work, struct mapped_device, - work); - struct bio *c; - - down_read(&md->io_lock); - - while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - spin_lock_irq(&md->deferred_lock); - c = bio_list_pop(&md->deferred); - spin_unlock_irq(&md->deferred_lock); - - if (!c) - break; - - up_read(&md->io_lock); - - if (dm_request_based(md)) - generic_make_request(c); - else - __split_and_process_bio(md, c); - - down_read(&md->io_lock); - } - - up_read(&md->io_lock); -} - -static void dm_queue_flush(struct mapped_device *md) -{ - clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - smp_mb__after_clear_bit(); - queue_work(md->wq, &md->work); -} - -/* - * Swap in a new table, returning the old one for the caller to destroy. - */ -struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) -{ - struct dm_table *map = ERR_PTR(-EINVAL); - struct queue_limits limits; - int r; - - mutex_lock(&md->suspend_lock); - - /* device must be suspended */ - if (!dm_suspended_md(md)) - goto out; - - r = dm_calculate_queue_limits(table, &limits); - if (r) { - map = ERR_PTR(r); - goto out; - } - - map = __bind(md, table, &limits); - -out: - mutex_unlock(&md->suspend_lock); - return map; -} - -/* - * Functions to lock and unlock any filesystem running on the - * device. - */ -static int lock_fs(struct mapped_device *md) -{ - int r; - - WARN_ON(md->frozen_sb); - - md->frozen_sb = freeze_bdev(md->bdev); - if (IS_ERR(md->frozen_sb)) { - r = PTR_ERR(md->frozen_sb); - md->frozen_sb = NULL; - return r; - } - - set_bit(DMF_FROZEN, &md->flags); - - return 0; -} - -static void unlock_fs(struct mapped_device *md) -{ - if (!test_bit(DMF_FROZEN, &md->flags)) - return; - - thaw_bdev(md->bdev, md->frozen_sb); - md->frozen_sb = NULL; - clear_bit(DMF_FROZEN, &md->flags); -} - -/* - * We need to be able to change a mapping table under a mounted - * filesystem. For example we might want to move some data in - * the background. Before the table can be swapped with - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ -/* - * Suspend mechanism in request-based dm. - * - * 1. Flush all I/Os by lock_fs() if needed. - * 2. Stop dispatching any I/O by stopping the request_queue. - * 3. Wait for all in-flight I/Os to be completed or requeued. - * - * To abort suspend, start the request_queue. 
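dm_wq_work() above drains md->deferred by popping one bio at a time under deferred_lock, processing it with the lock dropped, and stopping early once DMF_BLOCK_IO_FOR_SUSPEND is set. A minimal userspace analogue of that drain loop, using a pthread mutex and a singly linked list with invented names, might look like this:

```c
/* Sketch of the "pop under lock, process unlocked, stop on flag" loop
 * used by dm_wq_work().  All names here are illustrative only. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct work_item {
	int id;
	struct work_item *next;
};

static pthread_mutex_t deferred_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work_item *deferred;      /* list of deferred items */
static bool block_for_suspend;          /* stand-in for DMF_BLOCK_IO_FOR_SUSPEND */

static void push_deferred(int id)
{
	struct work_item *w = malloc(sizeof(*w));

	if (!w)
		return;
	w->id = id;
	pthread_mutex_lock(&deferred_lock);
	w->next = deferred;
	deferred = w;
	pthread_mutex_unlock(&deferred_lock);
}

static void drain_deferred(void)
{
	while (!block_for_suspend) {
		struct work_item *w;

		pthread_mutex_lock(&deferred_lock);
		w = deferred;
		if (w)
			deferred = w->next;
		pthread_mutex_unlock(&deferred_lock);

		if (!w)
			break;              /* list empty */

		/* Process outside the lock, as dm_wq_work does. */
		printf("processing item %d\n", w->id);
		free(w);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		push_deferred(i);
	drain_deferred();
	return 0;
}
```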
- */ -int dm_suspend(struct mapped_device *md, unsigned suspend_flags) -{ - struct dm_table *map = NULL; - int r = 0; - int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; - int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - - mutex_lock(&md->suspend_lock); - - if (dm_suspended_md(md)) { - r = -EINVAL; - goto out_unlock; - } - - map = dm_get_live_table(md); - - /* - * DMF_NOFLUSH_SUSPENDING must be set before presuspend. - * This flag is cleared before dm_suspend returns. - */ - if (noflush) - set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - - /* This does not get reverted if there's an error later. */ - dm_table_presuspend_targets(map); - - /* - * Flush I/O to the device. - * Any I/O submitted after lock_fs() may not be flushed. - * noflush takes precedence over do_lockfs. - * (lock_fs() flushes I/Os and waits for them to complete.) - */ - if (!noflush && do_lockfs) { - r = lock_fs(md); - if (r) - goto out; - } - - /* - * Here we must make sure that no processes are submitting requests - * to target drivers i.e. no one may be executing - * __split_and_process_bio. This is called from dm_request and - * dm_wq_work. - * - * To get all processes out of __split_and_process_bio in dm_request, - * we take the write lock. To prevent any process from reentering - * __split_and_process_bio from dm_request and quiesce the thread - * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call - * flush_workqueue(md->wq). - */ - down_write(&md->io_lock); - set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - up_write(&md->io_lock); - - /* - * Stop md->queue before flushing md->wq in case request-based - * dm defers requests to md->wq from md->queue. - */ - if (dm_request_based(md)) - stop_queue(md->queue); - - flush_workqueue(md->wq); - - /* - * At this point no more requests are entering target request routines. - * We call dm_wait_for_completion to wait for all existing requests - * to finish. - */ - r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); - - down_write(&md->io_lock); - if (noflush) - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - up_write(&md->io_lock); - - /* were we interrupted ? */ - if (r < 0) { - dm_queue_flush(md); - - if (dm_request_based(md)) - start_queue(md->queue); - - unlock_fs(md); - goto out; /* pushback list is already flushed, so skip flush */ - } - - /* - * If dm_wait_for_completion returned 0, the device is completely - * quiescent now. There is no request-processing activity. All new - * requests are being added to md->deferred list. - */ - - set_bit(DMF_SUSPENDED, &md->flags); - - dm_table_postsuspend_targets(map); - -out: - dm_table_put(map); - -out_unlock: - mutex_unlock(&md->suspend_lock); - return r; -} - -int dm_resume(struct mapped_device *md) -{ - int r = -EINVAL; - struct dm_table *map = NULL; - - mutex_lock(&md->suspend_lock); - if (!dm_suspended_md(md)) - goto out; - - map = dm_get_live_table(md); - if (!map || !dm_table_get_size(map)) - goto out; - - r = dm_table_resume_targets(map); - if (r) - goto out; - - dm_queue_flush(md); - - /* - * Flushing deferred I/Os must be done after targets are resumed - * so that mapping of targets can work correctly. - * Request-based dm is queueing the deferred I/Os in its request_queue. - */ - if (dm_request_based(md)) - start_queue(md->queue); - - unlock_fs(md); - - clear_bit(DMF_SUSPENDED, &md->flags); - - r = 0; -out: - dm_table_put(map); - mutex_unlock(&md->suspend_lock); - - return r; -} - -/*----------------------------------------------------------------- - * Event notification. 
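dm_suspend() above derives do_lockfs and noflush from the two suspend_flags bits declared further down in dm.h (DM_SUSPEND_LOCKFS_FLAG and DM_SUSPEND_NOFLUSH_FLAG), and noflush takes precedence over lockfs when both are set. A tiny sketch of that decode and precedence, reusing the same bit values:

```c
/* Sketch: decode suspend flags the way dm_suspend() above does.
 * The bit values are the ones declared later in dm.h. */
#include <stdio.h>

#define DM_SUSPEND_LOCKFS_FLAG  (1 << 0)
#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)

static void describe_suspend(unsigned suspend_flags)
{
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush   = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	/* noflush takes precedence: lock_fs() is skipped entirely. */
	int will_lock_fs = !noflush && do_lockfs;

	printf("flags=%#x lockfs=%d noflush=%d -> lock_fs()=%s\n",
	       suspend_flags, do_lockfs, noflush,
	       will_lock_fs ? "called" : "skipped");
}

int main(void)
{
	describe_suspend(0);
	describe_suspend(DM_SUSPEND_LOCKFS_FLAG);
	describe_suspend(DM_SUSPEND_LOCKFS_FLAG | DM_SUSPEND_NOFLUSH_FLAG);
	return 0;
}
```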
- *---------------------------------------------------------------*/ -int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, - unsigned cookie) -{ - char udev_cookie[DM_COOKIE_LENGTH]; - char *envp[] = { udev_cookie, NULL }; - - if (!cookie) - return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); - else { - snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", - DM_COOKIE_ENV_VAR_NAME, cookie); - return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, - action, envp); - } -} - -uint32_t dm_next_uevent_seq(struct mapped_device *md) -{ - return atomic_add_return(1, &md->uevent_seq); -} - -uint32_t dm_get_event_nr(struct mapped_device *md) -{ - return atomic_read(&md->event_nr); -} - -int dm_wait_event(struct mapped_device *md, int event_nr) -{ - return wait_event_interruptible(md->eventq, - (event_nr != atomic_read(&md->event_nr))); -} - -void dm_uevent_add(struct mapped_device *md, struct list_head *elist) -{ - unsigned long flags; - - spin_lock_irqsave(&md->uevent_lock, flags); - list_add(elist, &md->uevent_list); - spin_unlock_irqrestore(&md->uevent_lock, flags); -} - -/* - * The gendisk is only valid as long as you have a reference - * count on 'md'. - */ -struct gendisk *dm_disk(struct mapped_device *md) -{ - return md->disk; -} - -struct kobject *dm_kobject(struct mapped_device *md) -{ - return &md->kobj; -} - -/* - * struct mapped_device should not be exported outside of dm.c - * so use this check to verify that kobj is part of md structure - */ -struct mapped_device *dm_get_from_kobject(struct kobject *kobj) -{ - struct mapped_device *md; - - md = container_of(kobj, struct mapped_device, kobj); - if (&md->kobj != kobj) - return NULL; - - if (test_bit(DMF_FREEING, &md->flags) || - dm_deleting_md(md)) - return NULL; - - dm_get(md); - return md; -} - -int dm_suspended_md(struct mapped_device *md) -{ - return test_bit(DMF_SUSPENDED, &md->flags); -} - -int dm_suspended(struct dm_target *ti) -{ - return dm_suspended_md(dm_table_get_md(ti->table)); -} -EXPORT_SYMBOL_GPL(dm_suspended); - -int dm_noflush_suspending(struct dm_target *ti) -{ - return __noflush_suspending(dm_table_get_md(ti->table)); -} -EXPORT_SYMBOL_GPL(dm_noflush_suspending); - -struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) -{ - struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); - unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; - - if (!pools) - return NULL; - - pools->io_pool = (type == DM_TYPE_BIO_BASED) ? - mempool_create_slab_pool(MIN_IOS, _io_cache) : - mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); - if (!pools->io_pool) - goto free_pools_and_out; - - pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? 
- mempool_create_slab_pool(MIN_IOS, _tio_cache) : - mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); - if (!pools->tio_pool) - goto free_io_pool_and_out; - - pools->bs = bioset_create(pool_size, 0); - if (!pools->bs) - goto free_tio_pool_and_out; - - if (integrity && bioset_integrity_create(pools->bs, pool_size)) - goto free_bioset_and_out; - - return pools; - -free_bioset_and_out: - bioset_free(pools->bs); - -free_tio_pool_and_out: - mempool_destroy(pools->tio_pool); - -free_io_pool_and_out: - mempool_destroy(pools->io_pool); - -free_pools_and_out: - kfree(pools); - - return NULL; -} - -void dm_free_md_mempools(struct dm_md_mempools *pools) -{ - if (!pools) - return; - - if (pools->io_pool) - mempool_destroy(pools->io_pool); - - if (pools->tio_pool) - mempool_destroy(pools->tio_pool); - - if (pools->bs) - bioset_free(pools->bs); - - kfree(pools); -} - -static const struct block_device_operations dm_blk_dops = { - .open = dm_blk_open, - .release = dm_blk_close, - .ioctl = dm_blk_ioctl, - .getgeo = dm_blk_getgeo, - .owner = THIS_MODULE -}; - -EXPORT_SYMBOL(dm_get_mapinfo); - -/* - * module hooks - */ -module_init(dm_init); -module_exit(dm_exit); - -module_param(major, uint, 0); -MODULE_PARM_DESC(major, "The major number of the device mapper"); -MODULE_DESCRIPTION(DM_NAME " driver"); -MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/drivers/md/dm.h b/ANDROID_3.4.5/drivers/md/dm.h deleted file mode 100644 index b7dacd59..00000000 --- a/ANDROID_3.4.5/drivers/md/dm.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Internal header file for device mapper - * - * Copyright (C) 2001, 2002 Sistina Software - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This file is released under the LGPL. - */ - -#ifndef DM_INTERNAL_H -#define DM_INTERNAL_H - -#include <linux/fs.h> -#include <linux/device-mapper.h> -#include <linux/list.h> -#include <linux/blkdev.h> -#include <linux/hdreg.h> - -/* - * Suspend feature flags - */ -#define DM_SUSPEND_LOCKFS_FLAG (1 << 0) -#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) - -/* - * Type of table and mapped_device's mempool - */ -#define DM_TYPE_NONE 0 -#define DM_TYPE_BIO_BASED 1 -#define DM_TYPE_REQUEST_BASED 2 - -/* - * List of devices that a metadevice uses and should open/close. - */ -struct dm_dev_internal { - struct list_head list; - atomic_t count; - struct dm_dev dm_dev; -}; - -struct dm_table; -struct dm_md_mempools; - -/*----------------------------------------------------------------- - * Internal table functions. 
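dm_alloc_md_mempools() above sets aside slab-backed mempools and a bio_set so that the per-I/O structures can still be obtained when memory is tight. The sketch below is emphatically not the kernel mempool API; it is only a userspace free-list with a guaranteed number of preallocated elements, meant to show the reservation idea (names and sizes invented).

```c
/* Sketch: a fixed reserve of preallocated objects, loosely mirroring what
 * the io/tio mempools above guarantee.  This is not the kernel API. */
#include <stdio.h>
#include <stdlib.h>

#define MIN_IOS 16                      /* reserve size (illustrative) */

struct io_obj {
	struct io_obj *next;
	char payload[64];
};

struct simple_pool {
	struct io_obj *free_list;
};

static int pool_init(struct simple_pool *p, int nr)
{
	p->free_list = NULL;
	for (int i = 0; i < nr; i++) {
		struct io_obj *o = malloc(sizeof(*o));

		if (!o)
			return -1;
		o->next = p->free_list;
		p->free_list = o;
	}
	return 0;
}

static struct io_obj *pool_alloc(struct simple_pool *p)
{
	struct io_obj *o = p->free_list;

	if (o)
		p->free_list = o->next;     /* take from the reserve */
	return o;                           /* NULL once the reserve is empty */
}

static void pool_free(struct simple_pool *p, struct io_obj *o)
{
	o->next = p->free_list;             /* return to the reserve */
	p->free_list = o;
}

int main(void)
{
	struct simple_pool pool;
	struct io_obj *o;

	if (pool_init(&pool, MIN_IOS))
		return 1;
	o = pool_alloc(&pool);
	printf("got object: %s\n", o ? "yes" : "no");
	pool_free(&pool, o);

	while ((o = pool_alloc(&pool)))     /* tear the reserve down again */
		free(o);
	return 0;
}
```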
- *---------------------------------------------------------------*/ -void dm_table_destroy(struct dm_table *t); -void dm_table_event_callback(struct dm_table *t, - void (*fn)(void *), void *context); -struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); -struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); -int dm_calculate_queue_limits(struct dm_table *table, - struct queue_limits *limits); -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, - struct queue_limits *limits); -struct list_head *dm_table_get_devices(struct dm_table *t); -void dm_table_presuspend_targets(struct dm_table *t); -void dm_table_postsuspend_targets(struct dm_table *t); -int dm_table_resume_targets(struct dm_table *t); -int dm_table_any_congested(struct dm_table *t, int bdi_bits); -int dm_table_any_busy_target(struct dm_table *t); -unsigned dm_table_get_type(struct dm_table *t); -struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); -bool dm_table_request_based(struct dm_table *t); -bool dm_table_supports_discards(struct dm_table *t); -int dm_table_alloc_md_mempools(struct dm_table *t); -void dm_table_free_md_mempools(struct dm_table *t); -struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); - -int dm_queue_merge_is_compulsory(struct request_queue *q); - -void dm_lock_md_type(struct mapped_device *md); -void dm_unlock_md_type(struct mapped_device *md); -void dm_set_md_type(struct mapped_device *md, unsigned type); -unsigned dm_get_md_type(struct mapped_device *md); -struct target_type *dm_get_immutable_target_type(struct mapped_device *md); - -int dm_setup_md_queue(struct mapped_device *md); - -/* - * To check the return value from dm_table_find_target(). - */ -#define dm_target_is_valid(t) ((t)->table) - -/* - * To check whether the target type is request-based or not (bio-based). - */ -#define dm_target_request_based(t) ((t)->type->map_rq != NULL) - -/*----------------------------------------------------------------- - * A registry of target types. - *---------------------------------------------------------------*/ -int dm_target_init(void); -void dm_target_exit(void); -struct target_type *dm_get_target_type(const char *name); -void dm_put_target_type(struct target_type *tt); -int dm_target_iterate(void (*iter_func)(struct target_type *tt, - void *param), void *param); - -int dm_split_args(int *argc, char ***argvp, char *input); - -/* - * Is this mapped_device being deleted? - */ -int dm_deleting_md(struct mapped_device *md); - -/* - * Is this mapped_device suspended? - */ -int dm_suspended_md(struct mapped_device *md); - -/* - * The device-mapper can be driven through one of two interfaces; - * ioctl or filesystem, depending which patch you have applied. 
- */ -int dm_interface_init(void); -void dm_interface_exit(void); - -/* - * sysfs interface - */ -int dm_sysfs_init(struct mapped_device *md); -void dm_sysfs_exit(struct mapped_device *md); -struct kobject *dm_kobject(struct mapped_device *md); -struct mapped_device *dm_get_from_kobject(struct kobject *kobj); - -/* - * Targets for linear and striped mappings - */ -int dm_linear_init(void); -void dm_linear_exit(void); - -int dm_stripe_init(void); -void dm_stripe_exit(void); - -/* - * mapped_device operations - */ -void dm_destroy(struct mapped_device *md); -void dm_destroy_immediate(struct mapped_device *md); -int dm_open_count(struct mapped_device *md); -int dm_lock_for_deletion(struct mapped_device *md); - -int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, - unsigned cookie); - -int dm_io_init(void); -void dm_io_exit(void); - -int dm_kcopyd_init(void); -void dm_kcopyd_exit(void); - -/* - * Mempool operations - */ -struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity); -void dm_free_md_mempools(struct dm_md_mempools *pools); - -#endif diff --git a/ANDROID_3.4.5/drivers/md/faulty.c b/ANDROID_3.4.5/drivers/md/faulty.c deleted file mode 100644 index 45135f69..00000000 --- a/ANDROID_3.4.5/drivers/md/faulty.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * faulty.c : Multiple Devices driver for Linux - * - * Copyright (C) 2004 Neil Brown - * - * fautly-device-simulator personality for md - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -/* - * The "faulty" personality causes some requests to fail. - * - * Possible failure modes are: - * reads fail "randomly" but succeed on retry - * writes fail "randomly" but succeed on retry - * reads for some address fail and then persist until a write - * reads for some address fail and then persist irrespective of write - * writes for some address fail and persist - * all writes fail - * - * Different modes can be active at a time, but only - * one can be set at array creation. Others can be added later. - * A mode can be one-shot or recurrent with the recurrence being - * once in every N requests. - * The bottom 5 bits of the "layout" indicate the mode. The - * remainder indicate a period, or 0 for one-shot. - * - * There is an implementation limit on the number of concurrently - * persisting-faulty blocks. When a new fault is requested that would - * exceed the limit, it is ignored. - * All current faults can be clear using a layout of "0". - * - * Requests are always sent to the device. If they are to fail, - * we clone the bio and insert a new b_end_io into the chain. 
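The header comment above explains that the faulty personality packs its configuration into the array "layout": the bottom 5 bits select the failure mode and the remaining bits give the recurrence period, with 0 meaning one-shot. That is exactly what reshape() later unpacks with ModeMask and ModeShift. A small userspace sketch of the encoding, reusing the constants defined just below:

```c
/* Sketch: encode/decode the faulty personality's "layout" word,
 * using the ModeMask/ModeShift convention defined below. */
#include <stdio.h>

#define ModeMask  0x1f
#define ModeShift 5

#define WriteTransient  0
#define ReadTransient   1
#define WritePersistent 2
#define ReadPersistent  3
#define WriteAll        4
#define ReadFixable     5

static int layout_encode(int mode, int period)
{
	return (period << ModeShift) | (mode & ModeMask);
}

static void layout_decode(int layout, int *mode, int *period)
{
	*mode = layout & ModeMask;          /* bottom 5 bits: failure mode */
	*period = layout >> ModeShift;      /* the rest: period, 0 = one-shot */
}

int main(void)
{
	int mode, period;
	int layout = layout_encode(ReadPersistent, 100); /* fail ~every 100 reads */

	layout_decode(layout, &mode, &period);
	printf("layout=%d -> mode=%d period=%d\n", layout, mode, period);
	return 0;
}
```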
- */ - -#define WriteTransient 0 -#define ReadTransient 1 -#define WritePersistent 2 -#define ReadPersistent 3 -#define WriteAll 4 /* doesn't go to device */ -#define ReadFixable 5 -#define Modes 6 - -#define ClearErrors 31 -#define ClearFaults 30 - -#define AllPersist 100 /* internal use only */ -#define NoPersist 101 - -#define ModeMask 0x1f -#define ModeShift 5 - -#define MaxFault 50 -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/raid/md_u.h> -#include <linux/slab.h> -#include "md.h" -#include <linux/seq_file.h> - - -static void faulty_fail(struct bio *bio, int error) -{ - struct bio *b = bio->bi_private; - - b->bi_size = bio->bi_size; - b->bi_sector = bio->bi_sector; - - bio_put(bio); - - bio_io_error(b); -} - -struct faulty_conf { - int period[Modes]; - atomic_t counters[Modes]; - sector_t faults[MaxFault]; - int modes[MaxFault]; - int nfaults; - struct md_rdev *rdev; -}; - -static int check_mode(struct faulty_conf *conf, int mode) -{ - if (conf->period[mode] == 0 && - atomic_read(&conf->counters[mode]) <= 0) - return 0; /* no failure, no decrement */ - - - if (atomic_dec_and_test(&conf->counters[mode])) { - if (conf->period[mode]) - atomic_set(&conf->counters[mode], conf->period[mode]); - return 1; - } - return 0; -} - -static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) -{ - /* If we find a ReadFixable sector, we fix it ... */ - int i; - for (i=0; i<conf->nfaults; i++) - if (conf->faults[i] >= start && - conf->faults[i] < end) { - /* found it ... */ - switch (conf->modes[i] * 2 + dir) { - case WritePersistent*2+WRITE: return 1; - case ReadPersistent*2+READ: return 1; - case ReadFixable*2+READ: return 1; - case ReadFixable*2+WRITE: - conf->modes[i] = NoPersist; - return 0; - case AllPersist*2+READ: - case AllPersist*2+WRITE: return 1; - default: - return 0; - } - } - return 0; -} - -static void add_sector(struct faulty_conf *conf, sector_t start, int mode) -{ - int i; - int n = conf->nfaults; - for (i=0; i<conf->nfaults; i++) - if (conf->faults[i] == start) { - switch(mode) { - case NoPersist: conf->modes[i] = mode; return; - case WritePersistent: - if (conf->modes[i] == ReadPersistent || - conf->modes[i] == ReadFixable) - conf->modes[i] = AllPersist; - else - conf->modes[i] = WritePersistent; - return; - case ReadPersistent: - if (conf->modes[i] == WritePersistent) - conf->modes[i] = AllPersist; - else - conf->modes[i] = ReadPersistent; - return; - case ReadFixable: - if (conf->modes[i] == WritePersistent || - conf->modes[i] == ReadPersistent) - conf->modes[i] = AllPersist; - else - conf->modes[i] = ReadFixable; - return; - } - } else if (conf->modes[i] == NoPersist) - n = i; - - if (n >= MaxFault) - return; - conf->faults[n] = start; - conf->modes[n] = mode; - if (conf->nfaults == n) - conf->nfaults = n+1; -} - -static void make_request(struct mddev *mddev, struct bio *bio) -{ - struct faulty_conf *conf = mddev->private; - int failit = 0; - - if (bio_data_dir(bio) == WRITE) { - /* write request */ - if (atomic_read(&conf->counters[WriteAll])) { - /* special case - don't decrement, don't generic_make_request, - * just fail immediately - */ - bio_endio(bio, -EIO); - return; - } - - if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), - WRITE)) - failit = 1; - if (check_mode(conf, WritePersistent)) { - add_sector(conf, bio->bi_sector, WritePersistent); - failit = 1; - } - if (check_mode(conf, WriteTransient)) - failit = 1; - } else { - /* read request */ - if (check_sector(conf, bio->bi_sector, 
bio->bi_sector + (bio->bi_size>>9), - READ)) - failit = 1; - if (check_mode(conf, ReadTransient)) - failit = 1; - if (check_mode(conf, ReadPersistent)) { - add_sector(conf, bio->bi_sector, ReadPersistent); - failit = 1; - } - if (check_mode(conf, ReadFixable)) { - add_sector(conf, bio->bi_sector, ReadFixable); - failit = 1; - } - } - if (failit) { - struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); - - b->bi_bdev = conf->rdev->bdev; - b->bi_private = bio; - b->bi_end_io = faulty_fail; - bio = b; - } else - bio->bi_bdev = conf->rdev->bdev; - - generic_make_request(bio); -} - -static void status(struct seq_file *seq, struct mddev *mddev) -{ - struct faulty_conf *conf = mddev->private; - int n; - - if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) - seq_printf(seq, " WriteTransient=%d(%d)", - n, conf->period[WriteTransient]); - - if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) - seq_printf(seq, " ReadTransient=%d(%d)", - n, conf->period[ReadTransient]); - - if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) - seq_printf(seq, " WritePersistent=%d(%d)", - n, conf->period[WritePersistent]); - - if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) - seq_printf(seq, " ReadPersistent=%d(%d)", - n, conf->period[ReadPersistent]); - - - if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) - seq_printf(seq, " ReadFixable=%d(%d)", - n, conf->period[ReadFixable]); - - if ((n=atomic_read(&conf->counters[WriteAll])) != 0) - seq_printf(seq, " WriteAll"); - - seq_printf(seq, " nfaults=%d", conf->nfaults); -} - - -static int reshape(struct mddev *mddev) -{ - int mode = mddev->new_layout & ModeMask; - int count = mddev->new_layout >> ModeShift; - struct faulty_conf *conf = mddev->private; - - if (mddev->new_layout < 0) - return 0; - - /* new layout */ - if (mode == ClearFaults) - conf->nfaults = 0; - else if (mode == ClearErrors) { - int i; - for (i=0 ; i < Modes ; i++) { - conf->period[i] = 0; - atomic_set(&conf->counters[i], 0); - } - } else if (mode < Modes) { - conf->period[mode] = count; - if (!count) count++; - atomic_set(&conf->counters[mode], count); - } else - return -EINVAL; - mddev->new_layout = -1; - mddev->layout = -1; /* makes sure further changes come through */ - return 0; -} - -static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - WARN_ONCE(raid_disks, - "%s does not support generic reshape\n", __func__); - - if (sectors == 0) - return mddev->dev_sectors; - - return sectors; -} - -static int run(struct mddev *mddev) -{ - struct md_rdev *rdev; - int i; - struct faulty_conf *conf; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - - conf = kmalloc(sizeof(*conf), GFP_KERNEL); - if (!conf) - return -ENOMEM; - - for (i=0; i<Modes; i++) { - atomic_set(&conf->counters[i], 0); - conf->period[i] = 0; - } - conf->nfaults = 0; - - rdev_for_each(rdev, mddev) - conf->rdev = rdev; - - md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); - mddev->private = conf; - - reshape(mddev); - - return 0; -} - -static int stop(struct mddev *mddev) -{ - struct faulty_conf *conf = mddev->private; - - kfree(conf); - mddev->private = NULL; - return 0; -} - -static struct md_personality faulty_personality = -{ - .name = "faulty", - .level = LEVEL_FAULTY, - .owner = THIS_MODULE, - .make_request = make_request, - .run = run, - .stop = stop, - .status = status, - .check_reshape = reshape, - .size = faulty_size, -}; - -static int __init raid_init(void) -{ - return register_md_personality(&faulty_personality); -} - -static void raid_exit(void) -{ 
- unregister_md_personality(&faulty_personality); -} - -module_init(raid_init); -module_exit(raid_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Fault injection personality for MD"); -MODULE_ALIAS("md-personality-10"); /* faulty */ -MODULE_ALIAS("md-faulty"); -MODULE_ALIAS("md-level--5"); diff --git a/ANDROID_3.4.5/drivers/md/linear.c b/ANDROID_3.4.5/drivers/md/linear.c deleted file mode 100644 index fa211d80..00000000 --- a/ANDROID_3.4.5/drivers/md/linear.c +++ /dev/null @@ -1,369 +0,0 @@ -/* - linear.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - <zyngier@ufr-info-p7.ibp.fr> or - <maz@gloups.fdn.fr> - - Linear mode management functions. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#include <linux/blkdev.h> -#include <linux/raid/md_u.h> -#include <linux/seq_file.h> -#include <linux/module.h> -#include <linux/slab.h> -#include "md.h" -#include "linear.h" - -/* - * find which device holds a particular offset - */ -static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) -{ - int lo, mid, hi; - struct linear_conf *conf; - - lo = 0; - hi = mddev->raid_disks - 1; - conf = rcu_dereference(mddev->private); - - /* - * Binary Search - */ - - while (hi > lo) { - - mid = (hi + lo) / 2; - if (sector < conf->disks[mid].end_sector) - hi = mid; - else - lo = mid + 1; - } - - return conf->disks + lo; -} - -/** - * linear_mergeable_bvec -- tell bio layer if two requests can be merged - * @q: request queue - * @bvm: properties of new bio - * @biovec: the request that could be merged to it. 
- * - * Return amount of bytes we can take at this offset - */ -static int linear_mergeable_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mddev *mddev = q->queuedata; - struct dev_info *dev0; - unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int maxbytes = biovec->bv_len; - struct request_queue *subq; - - rcu_read_lock(); - dev0 = which_dev(mddev, sector); - maxsectors = dev0->end_sector - sector; - subq = bdev_get_queue(dev0->rdev->bdev); - if (subq->merge_bvec_fn) { - bvm->bi_bdev = dev0->rdev->bdev; - bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; - maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, - biovec)); - } - rcu_read_unlock(); - - if (maxsectors < bio_sectors) - maxsectors = 0; - else - maxsectors -= bio_sectors; - - if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) - return maxbytes; - - if (maxsectors > (maxbytes >> 9)) - return maxbytes; - else - return maxsectors << 9; -} - -static int linear_congested(void *data, int bits) -{ - struct mddev *mddev = data; - struct linear_conf *conf; - int i, ret = 0; - - if (mddev_congested(mddev, bits)) - return 1; - - rcu_read_lock(); - conf = rcu_dereference(mddev->private); - - for (i = 0; i < mddev->raid_disks && !ret ; i++) { - struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); - } - - rcu_read_unlock(); - return ret; -} - -static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - struct linear_conf *conf; - sector_t array_sectors; - - rcu_read_lock(); - conf = rcu_dereference(mddev->private); - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - array_sectors = conf->array_sectors; - rcu_read_unlock(); - - return array_sectors; -} - -static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) -{ - struct linear_conf *conf; - struct md_rdev *rdev; - int i, cnt; - - conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), - GFP_KERNEL); - if (!conf) - return NULL; - - cnt = 0; - conf->array_sectors = 0; - - rdev_for_each(rdev, mddev) { - int j = rdev->raid_disk; - struct dev_info *disk = conf->disks + j; - sector_t sectors; - - if (j < 0 || j >= raid_disks || disk->rdev) { - printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", - mdname(mddev)); - goto out; - } - - disk->rdev = rdev; - if (mddev->chunk_sectors) { - sectors = rdev->sectors; - sector_div(sectors, mddev->chunk_sectors); - rdev->sectors = sectors * mddev->chunk_sectors; - } - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - conf->array_sectors += rdev->sectors; - cnt++; - - } - if (cnt != raid_disks) { - printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", - mdname(mddev)); - goto out; - } - - /* - * Here we calculate the device offsets. 
- */ - conf->disks[0].end_sector = conf->disks[0].rdev->sectors; - - for (i = 1; i < raid_disks; i++) - conf->disks[i].end_sector = - conf->disks[i-1].end_sector + - conf->disks[i].rdev->sectors; - - return conf; - -out: - kfree(conf); - return NULL; -} - -static int linear_run (struct mddev *mddev) -{ - struct linear_conf *conf; - int ret; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - conf = linear_conf(mddev, mddev->raid_disks); - - if (!conf) - return 1; - mddev->private = conf; - md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - - blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->backing_dev_info.congested_fn = linear_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - - ret = md_integrity_register(mddev); - if (ret) { - kfree(conf); - mddev->private = NULL; - } - return ret; -} - -static int linear_add(struct mddev *mddev, struct md_rdev *rdev) -{ - /* Adding a drive to a linear array allows the array to grow. - * It is permitted if the new drive has a matching superblock - * already on it, with raid_disk equal to raid_disks. - * It is achieved by creating a new linear_private_data structure - * and swapping it in in-place of the current one. - * The current one is never freed until the array is stopped. - * This avoids races. - */ - struct linear_conf *newconf, *oldconf; - - if (rdev->saved_raid_disk != mddev->raid_disks) - return -EINVAL; - - rdev->raid_disk = rdev->saved_raid_disk; - rdev->saved_raid_disk = -1; - - newconf = linear_conf(mddev,mddev->raid_disks+1); - - if (!newconf) - return -ENOMEM; - - oldconf = rcu_dereference(mddev->private); - mddev->raid_disks++; - rcu_assign_pointer(mddev->private, newconf); - md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - kfree_rcu(oldconf, rcu); - return 0; -} - -static int linear_stop (struct mddev *mddev) -{ - struct linear_conf *conf = mddev->private; - - /* - * We do not require rcu protection here since - * we hold reconfig_mutex for both linear_add and - * linear_stop, so they cannot race. - * We should make sure any old 'conf's are properly - * freed though. - */ - rcu_barrier(); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kfree(conf); - mddev->private = NULL; - - return 0; -} - -static void linear_make_request(struct mddev *mddev, struct bio *bio) -{ - struct dev_info *tmp_dev; - sector_t start_sector; - - if (unlikely(bio->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bio); - return; - } - - rcu_read_lock(); - tmp_dev = which_dev(mddev, bio->bi_sector); - start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; - - - if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) - || (bio->bi_sector < start_sector))) { - char b[BDEVNAME_SIZE]; - - printk(KERN_ERR - "md/linear:%s: make_request: Sector %llu out of bounds on " - "dev %s: %llu sectors, offset %llu\n", - mdname(mddev), - (unsigned long long)bio->bi_sector, - bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->rdev->sectors, - (unsigned long long)start_sector); - rcu_read_unlock(); - bio_io_error(bio); - return; - } - if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > - tmp_dev->end_sector)) { - /* This bio crosses a device boundary, so we have to - * split it. 
- */ - struct bio_pair *bp; - sector_t end_sector = tmp_dev->end_sector; - - rcu_read_unlock(); - - bp = bio_split(bio, end_sector - bio->bi_sector); - - linear_make_request(mddev, &bp->bio1); - linear_make_request(mddev, &bp->bio2); - bio_pair_release(bp); - return; - } - - bio->bi_bdev = tmp_dev->rdev->bdev; - bio->bi_sector = bio->bi_sector - start_sector - + tmp_dev->rdev->data_offset; - rcu_read_unlock(); - generic_make_request(bio); -} - -static void linear_status (struct seq_file *seq, struct mddev *mddev) -{ - - seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); -} - - -static struct md_personality linear_personality = -{ - .name = "linear", - .level = LEVEL_LINEAR, - .owner = THIS_MODULE, - .make_request = linear_make_request, - .run = linear_run, - .stop = linear_stop, - .status = linear_status, - .hot_add_disk = linear_add, - .size = linear_size, -}; - -static int __init linear_init (void) -{ - return register_md_personality (&linear_personality); -} - -static void linear_exit (void) -{ - unregister_md_personality (&linear_personality); -} - - -module_init(linear_init); -module_exit(linear_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Linear device concatenation personality for MD"); -MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ -MODULE_ALIAS("md-linear"); -MODULE_ALIAS("md-level--1"); diff --git a/ANDROID_3.4.5/drivers/md/linear.h b/ANDROID_3.4.5/drivers/md/linear.h deleted file mode 100644 index b685ddd7..00000000 --- a/ANDROID_3.4.5/drivers/md/linear.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct dev_info { - struct md_rdev *rdev; - sector_t end_sector; -}; - -struct linear_conf -{ - struct rcu_head rcu; - sector_t array_sectors; - struct dev_info disks[0]; -}; -#endif diff --git a/ANDROID_3.4.5/drivers/md/md.c b/ANDROID_3.4.5/drivers/md/md.c deleted file mode 100644 index 2b30ffdb..00000000 --- a/ANDROID_3.4.5/drivers/md/md.c +++ /dev/null @@ -1,8342 +0,0 @@ -/* - md.c : Multiple Devices driver for Linux - Copyright (C) 1998, 1999, 2000 Ingo Molnar - - completely rewritten, based on the MD driver code from Marc Zyngier - - Changes: - - - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar - - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> - - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> - - kerneld support by Boris Tobotras <boris@xtalk.msk.su> - - kmod support by: Cyrus Durgin - - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> - - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> - - - lots of fixes and improvements to the RAID1/RAID5 and generic - RAID code (such as request based resynchronization): - - Neil Brown <neilb@cse.unsw.edu.au>. - - - persistent bitmap code - Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
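which_dev() above binary-searches conf->disks[] by cumulative end_sector, and linear_make_request() then rebases the bio with bi_sector - start_sector + data_offset (splitting it first if it crosses a device boundary). The userspace sketch below reproduces the lookup and rebase over a small table of invented device sizes; the struct loosely mirrors dev_info from linear.h, minus the rdev pointer and data_offset.

```c
/* Sketch: map a logical sector of a linear array to (device, offset)
 * the way which_dev() + linear_make_request() do.  Sizes are invented. */
#include <stdint.h>
#include <stdio.h>

struct dev_span {
	uint64_t sectors;      /* size of this member device */
	uint64_t end_sector;   /* cumulative end, filled in below */
};

static int which_dev(const struct dev_span *d, int n, uint64_t sector)
{
	int lo = 0, hi = n - 1;

	while (hi > lo) {                   /* binary search on end_sector */
		int mid = (hi + lo) / 2;

		if (sector < d[mid].end_sector)
			hi = mid;
		else
			lo = mid + 1;
	}
	return lo;
}

int main(void)
{
	struct dev_span disks[] = {
		{ .sectors = 1000 }, { .sectors = 4000 }, { .sectors = 2500 },
	};
	int n = 3;
	uint64_t cum = 0;

	for (int i = 0; i < n; i++) {       /* same cumulative layout as linear_conf() */
		cum += disks[i].sectors;
		disks[i].end_sector = cum;
	}

	uint64_t sector = 4200;
	int i = which_dev(disks, n, sector);
	uint64_t start = disks[i].end_sector - disks[i].sectors;

	printf("sector %llu -> disk %d, device-relative sector %llu\n",
	       (unsigned long long)sector, i,
	       (unsigned long long)(sector - start));
	return 0;
}
```

A request whose last sector lands beyond disks[i].end_sector would have to be split first, which is the bio_pair case handled just above.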
-*/ - -#include <linux/kthread.h> -#include <linux/blkdev.h> -#include <linux/sysctl.h> -#include <linux/seq_file.h> -#include <linux/fs.h> -#include <linux/poll.h> -#include <linux/ctype.h> -#include <linux/string.h> -#include <linux/hdreg.h> -#include <linux/proc_fs.h> -#include <linux/random.h> -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/file.h> -#include <linux/compat.h> -#include <linux/delay.h> -#include <linux/raid/md_p.h> -#include <linux/raid/md_u.h> -#include <linux/slab.h> -#include "md.h" -#include "bitmap.h" - -#ifndef MODULE -static void autostart_arrays(int part); -#endif - -/* pers_list is a list of registered personalities protected - * by pers_lock. - * pers_lock does extra service to protect accesses to - * mddev->thread when the mutex cannot be held. - */ -static LIST_HEAD(pers_list); -static DEFINE_SPINLOCK(pers_lock); - -static void md_print_devices(void); - -static DECLARE_WAIT_QUEUE_HEAD(resync_wait); -static struct workqueue_struct *md_wq; -static struct workqueue_struct *md_misc_wq; - -#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } - -/* - * Default number of read corrections we'll attempt on an rdev - * before ejecting it from the array. We divide the read error - * count by 2 for every hour elapsed between read errors. - */ -#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 -/* - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' - * is 1000 KB/sec, so the extra system load does not show up that much. - * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwidth if the IO - * subsystem is idle. There is also an 'absolute maximum' reconstruction - * speed limit - in case reconstruction slows down your system despite - * idle IO detection. - * - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. - * or /sys/block/mdX/md/sync_speed_{min,max} - */ - -static int sysctl_speed_limit_min = 1000; -static int sysctl_speed_limit_max = 200000; -static inline int speed_min(struct mddev *mddev) -{ - return mddev->sync_speed_min ? - mddev->sync_speed_min : sysctl_speed_limit_min; -} - -static inline int speed_max(struct mddev *mddev) -{ - return mddev->sync_speed_max ? 
- mddev->sync_speed_max : sysctl_speed_limit_max; -} - -static struct ctl_table_header *raid_table_header; - -static ctl_table raid_table[] = { - { - .procname = "speed_limit_min", - .data = &sysctl_speed_limit_min, - .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, - .proc_handler = proc_dointvec, - }, - { - .procname = "speed_limit_max", - .data = &sysctl_speed_limit_max, - .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, - .proc_handler = proc_dointvec, - }, - { } -}; - -static ctl_table raid_dir_table[] = { - { - .procname = "raid", - .maxlen = 0, - .mode = S_IRUGO|S_IXUGO, - .child = raid_table, - }, - { } -}; - -static ctl_table raid_root_table[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = raid_dir_table, - }, - { } -}; - -static const struct block_device_operations md_fops; - -static int start_readonly; - -/* bio_clone_mddev - * like bio_clone, but with a local bio set - */ - -static void mddev_bio_destructor(struct bio *bio) -{ - struct mddev *mddev, **mddevp; - - mddevp = (void*)bio; - mddev = mddevp[-1]; - - bio_free(bio, mddev->bio_set); -} - -struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, - struct mddev *mddev) -{ - struct bio *b; - struct mddev **mddevp; - - if (!mddev || !mddev->bio_set) - return bio_alloc(gfp_mask, nr_iovecs); - - b = bio_alloc_bioset(gfp_mask, nr_iovecs, - mddev->bio_set); - if (!b) - return NULL; - mddevp = (void*)b; - mddevp[-1] = mddev; - b->bi_destructor = mddev_bio_destructor; - return b; -} -EXPORT_SYMBOL_GPL(bio_alloc_mddev); - -struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev) -{ - struct bio *b; - struct mddev **mddevp; - - if (!mddev || !mddev->bio_set) - return bio_clone(bio, gfp_mask); - - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, - mddev->bio_set); - if (!b) - return NULL; - mddevp = (void*)b; - mddevp[-1] = mddev; - b->bi_destructor = mddev_bio_destructor; - __bio_clone(b, bio); - if (bio_integrity(bio)) { - int ret; - - ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); - - if (ret < 0) { - bio_put(b); - return NULL; - } - } - - return b; -} -EXPORT_SYMBOL_GPL(bio_clone_mddev); - -void md_trim_bio(struct bio *bio, int offset, int size) -{ - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. 
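bio_alloc_mddev()/bio_clone_mddev() above appear to rely on mddev->bio_set having been created (elsewhere in md.c, not in this hunk) with enough front padding to hide a struct mddev * immediately before the bio, which mddev_bio_destructor() then recovers with mddevp[-1]. The sketch below shows that front-pad idiom with plain malloc(); the names are invented and it is obviously not the bio_set machinery itself.

```c
/* Sketch: stash an owner pointer in front padding before an object,
 * the way bio_alloc_mddev() hides the mddev pointer before the bio. */
#include <stdio.h>
#include <stdlib.h>

struct owner { const char *name; };
struct object { int payload; };

/* Allocate an object with room for one back-pointer in front of it. */
static struct object *obj_alloc(struct owner *who)
{
	void *raw = malloc(sizeof(struct owner *) + sizeof(struct object));
	struct object *obj;
	struct owner **slot;

	if (!raw)
		return NULL;
	obj = (struct object *)((struct owner **)raw + 1);
	slot = (struct owner **)obj;
	slot[-1] = who;                     /* mirrors mddevp[-1] = mddev */
	return obj;
}

/* Recover the owner and release the original allocation. */
static void obj_free(struct object *obj)
{
	struct owner **slot = (struct owner **)obj;
	struct owner *who = slot[-1];

	printf("freeing object owned by %s\n", who->name);
	free(slot - 1);                     /* free the real start of the block */
}

int main(void)
{
	struct owner md = { .name = "md0" };
	struct object *o = obj_alloc(&md);

	if (!o)
		return 1;
	o->payload = 42;
	obj_free(o);
	return 0;
}
```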
- * This requires adjusting bi_sector, bi_size, and bi_io_vec - */ - int i; - struct bio_vec *bvec; - int sofar = 0; - - size <<= 9; - if (offset == 0 && size == bio->bi_size) - return; - - bio->bi_sector += offset; - bio->bi_size = size; - offset <<= 9; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - - while (bio->bi_idx < bio->bi_vcnt && - bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { - /* remove this whole bio_vec */ - offset -= bio->bi_io_vec[bio->bi_idx].bv_len; - bio->bi_idx++; - } - if (bio->bi_idx < bio->bi_vcnt) { - bio->bi_io_vec[bio->bi_idx].bv_offset += offset; - bio->bi_io_vec[bio->bi_idx].bv_len -= offset; - } - /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, - (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_idx; - bio->bi_idx = 0; - } - /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment(bvec, bio, i) { - if (sofar + bvec->bv_len > size) - bvec->bv_len = size - sofar; - if (bvec->bv_len == 0) { - bio->bi_vcnt = i; - break; - } - sofar += bvec->bv_len; - } -} -EXPORT_SYMBOL_GPL(md_trim_bio); - -/* - * We have a system wide 'event count' that is incremented - * on any 'interesting' event, and readers of /proc/mdstat - * can use 'poll' or 'select' to find out when the event - * count increases. - * - * Events are: - * start array, stop array, error, add device, remove device, - * start build, activate spare - */ -static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); -static atomic_t md_event_count; -void md_new_event(struct mddev *mddev) -{ - atomic_inc(&md_event_count); - wake_up(&md_event_waiters); -} -EXPORT_SYMBOL_GPL(md_new_event); - -/* Alternate version that can be called from interrupts - * when calling sysfs_notify isn't needed. - */ -static void md_new_event_inintr(struct mddev *mddev) -{ - atomic_inc(&md_event_count); - wake_up(&md_event_waiters); -} - -/* - * Enables to iterate over all existing md arrays - * all_mddevs_lock protects this list. - */ -static LIST_HEAD(all_mddevs); -static DEFINE_SPINLOCK(all_mddevs_lock); - - -/* - * iterates through all used mddevs in the system. - * We take care to grab the all_mddevs_lock whenever navigating - * the list, and to always hold a refcount when unlocked. - * Any code which breaks out of this loop while own - * a reference to the current mddev and must mddev_put it. - */ -#define for_each_mddev(_mddev,_tmp) \ - \ - for (({ spin_lock(&all_mddevs_lock); \ - _tmp = all_mddevs.next; \ - _mddev = NULL;}); \ - ({ if (_tmp != &all_mddevs) \ - mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ - spin_unlock(&all_mddevs_lock); \ - if (_mddev) mddev_put(_mddev); \ - _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ - _tmp != &all_mddevs;}); \ - ({ spin_lock(&all_mddevs_lock); \ - _tmp = _tmp->next;}) \ - ) - - -/* Rather than calling directly into the personality make_request function, - * IO requests come here first so that we can check if the device is - * being suspended pending a reconfiguration. - * We hold a refcount over the call to ->make_request. By the time that - * call has finished, the bio has been linked into some internal structure - * and so is visible to ->quiesce(), so we don't need the refcount any more. 
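md_trim_bio() above clips a cloned bio to an (offset, size) window by advancing bi_sector, dropping whole leading bio_vecs, shaving the front of the first remaining one, then capping and compacting the tail so the vector never exceeds the requested size. The same arithmetic on a plain array of segment lengths, with invented byte values instead of sectors, looks like this:

```c
/* Sketch: trim an array of segment lengths to an (offset, size) window,
 * following the same steps as md_trim_bio().  Units are bytes here. */
#include <stdio.h>

static int trim_segments(unsigned lens[], int nsegs,
			 unsigned offset, unsigned size)
{
	int first = 0, out = 0;
	unsigned sofar = 0;

	/* Drop whole leading segments covered by the offset. */
	while (first < nsegs && lens[first] <= offset) {
		offset -= lens[first];
		first++;
	}
	/* Shave the front of the first remaining segment. */
	if (first < nsegs)
		lens[first] -= offset;

	/* Cap the tail so the total never exceeds 'size', compacting as we
	 * go (the kernel does this with a memmove when bi_idx is non-zero). */
	for (int i = first; i < nsegs; i++) {
		if (sofar + lens[i] > size)
			lens[i] = size - sofar;
		if (lens[i] == 0)
			break;
		sofar += lens[i];
		lens[out++] = lens[i];
	}
	return out;                             /* new segment count */
}

int main(void)
{
	unsigned lens[] = { 4096, 4096, 4096, 4096 };
	int n = trim_segments(lens, 4, 6144, 8192);

	printf("kept %d segments:", n);
	for (int i = 0; i < n; i++)
		printf(" %u", lens[i]);
	printf("\n");
	return 0;
}
```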
- */ -static void md_make_request(struct request_queue *q, struct bio *bio) -{ - const int rw = bio_data_dir(bio); - struct mddev *mddev = q->queuedata; - int cpu; - unsigned int sectors; - - if (mddev == NULL || mddev->pers == NULL - || !mddev->ready) { - bio_io_error(bio); - return; - } - smp_rmb(); /* Ensure implications of 'active' are visible */ - rcu_read_lock(); - if (mddev->suspended) { - DEFINE_WAIT(__wait); - for (;;) { - prepare_to_wait(&mddev->sb_wait, &__wait, - TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) - break; - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - } - finish_wait(&mddev->sb_wait, &__wait); - } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); - - /* - * save the sectors now since our bio can - * go away inside make_request - */ - sectors = bio_sectors(bio); - mddev->pers->make_request(mddev, bio); - - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); - part_stat_unlock(); - - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); -} - -/* mddev_suspend makes sure no new requests are submitted - * to the device, and that any requests that have been submitted - * are completely handled. - * Once ->stop is called and completes, the module will be completely - * unused. - */ -void mddev_suspend(struct mddev *mddev) -{ - BUG_ON(mddev->suspended); - mddev->suspended = 1; - synchronize_rcu(); - wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); - mddev->pers->quiesce(mddev, 1); - - del_timer_sync(&mddev->safemode_timer); -} -EXPORT_SYMBOL_GPL(mddev_suspend); - -void mddev_resume(struct mddev *mddev) -{ - mddev->suspended = 0; - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 0); - - md_wakeup_thread(mddev->thread); - md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ -} -EXPORT_SYMBOL_GPL(mddev_resume); - -int mddev_congested(struct mddev *mddev, int bits) -{ - return mddev->suspended; -} -EXPORT_SYMBOL(mddev_congested); - -/* - * Generic flush handling for md - */ - -static void md_end_flush(struct bio *bio, int err) -{ - struct md_rdev *rdev = bio->bi_private; - struct mddev *mddev = rdev->mddev; - - rdev_dec_pending(rdev, mddev); - - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pre-request flush has finished */ - queue_work(md_wq, &mddev->flush_work); - } - bio_put(bio); -} - -static void md_submit_flush_data(struct work_struct *ws); - -static void submit_flushes(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, flush_work); - struct md_rdev *rdev; - - INIT_WORK(&mddev->flush_work, md_submit_flush_data); - atomic_set(&mddev->flush_pending, 1); - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags)) { - /* Take two references, one is dropped - * when request finishes, one after - * we reclaim rcu_read_lock - */ - struct bio *bi; - atomic_inc(&rdev->nr_pending); - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); - bi->bi_end_io = md_end_flush; - bi->bi_private = rdev; - bi->bi_bdev = rdev->bdev; - atomic_inc(&mddev->flush_pending); - submit_bio(WRITE_FLUSH, bi); - rcu_read_lock(); - rdev_dec_pending(rdev, mddev); - } - rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) - queue_work(md_wq, &mddev->flush_work); -} - -static void md_submit_flush_data(struct work_struct *ws) -{ - struct mddev *mddev = 
container_of(ws, struct mddev, flush_work); - struct bio *bio = mddev->flush_bio; - - if (bio->bi_size == 0) - /* an empty barrier - all done */ - bio_endio(bio, 0); - else { - bio->bi_rw &= ~REQ_FLUSH; - mddev->pers->make_request(mddev, bio); - } - - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); -} - -void md_flush_request(struct mddev *mddev, struct bio *bio) -{ - spin_lock_irq(&mddev->write_lock); - wait_event_lock_irq(mddev->sb_wait, - !mddev->flush_bio, - mddev->write_lock, /*nothing*/); - mddev->flush_bio = bio; - spin_unlock_irq(&mddev->write_lock); - - INIT_WORK(&mddev->flush_work, submit_flushes); - queue_work(md_wq, &mddev->flush_work); -} -EXPORT_SYMBOL(md_flush_request); - -/* Support for plugging. - * This mirrors the plugging support in request_queue, but does not - * require having a whole queue or request structures. - * We allocate an md_plug_cb for each md device and each thread it gets - * plugged on. This links tot the private plug_handle structure in the - * personality data where we keep a count of the number of outstanding - * plugs so other code can see if a plug is active. - */ -struct md_plug_cb { - struct blk_plug_cb cb; - struct mddev *mddev; -}; - -static void plugger_unplug(struct blk_plug_cb *cb) -{ - struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); - if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) - md_wakeup_thread(mdcb->mddev->thread); - kfree(mdcb); -} - -/* Check that an unplug wakeup will come shortly. - * If not, wakeup the md thread immediately - */ -int mddev_check_plugged(struct mddev *mddev) -{ - struct blk_plug *plug = current->plug; - struct md_plug_cb *mdcb; - - if (!plug) - return 0; - - list_for_each_entry(mdcb, &plug->cb_list, cb.list) { - if (mdcb->cb.callback == plugger_unplug && - mdcb->mddev == mddev) { - /* Already on the list, move to top */ - if (mdcb != list_first_entry(&plug->cb_list, - struct md_plug_cb, - cb.list)) - list_move(&mdcb->cb.list, &plug->cb_list); - return 1; - } - } - /* Not currently on the callback list */ - mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); - if (!mdcb) - return 0; - - mdcb->mddev = mddev; - mdcb->cb.callback = plugger_unplug; - atomic_inc(&mddev->plug_cnt); - list_add(&mdcb->cb.list, &plug->cb_list); - return 1; -} -EXPORT_SYMBOL_GPL(mddev_check_plugged); - -static inline struct mddev *mddev_get(struct mddev *mddev) -{ - atomic_inc(&mddev->active); - return mddev; -} - -static void mddev_delayed_delete(struct work_struct *ws); - -static void mddev_put(struct mddev *mddev) -{ - struct bio_set *bs = NULL; - - if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) - return; - if (!mddev->raid_disks && list_empty(&mddev->disks) && - mddev->ctime == 0 && !mddev->hold_active) { - /* Array is not configured at all, and not held active, - * so destroy it */ - list_del_init(&mddev->all_mddevs); - bs = mddev->bio_set; - mddev->bio_set = NULL; - if (mddev->gendisk) { - /* We did a probe so need to clean up. Call - * queue_work inside the spinlock so that - * flush_workqueue() after mddev_find will - * succeed in waiting for the work to be done. 
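submit_flushes() above primes mddev->flush_pending with 1, bumps it once per flush bio it issues, and only lets md_end_flush()/the final decrement queue md_submit_flush_data() when the count reaches zero; the bias of 1 keeps the completion from firing while bios are still being submitted. A userspace sketch of that counting pattern with C11 atomics (device count and names invented):

```c
/* Sketch: "bias" completion counting as in submit_flushes()/md_end_flush().
 * The counter starts at 1 so completion cannot fire mid-submission. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int flush_pending;

static void flush_done(void)
{
	printf("all flushes completed, running follow-up work\n");
}

/* Called once per completed per-device flush. */
static void end_flush(void)
{
	if (atomic_fetch_sub(&flush_pending, 1) == 1)
		flush_done();               /* we dropped the last reference */
}

static void submit_flushes(int ndevices)
{
	atomic_store(&flush_pending, 1);    /* the bias */

	for (int i = 0; i < ndevices; i++) {
		atomic_fetch_add(&flush_pending, 1);
		/* In the kernel this submits a flush bio; here the "I/O"
		 * completes immediately. */
		end_flush();
	}

	/* Drop the bias; if everything already finished, complete now. */
	end_flush();
}

int main(void)
{
	submit_flushes(3);
	return 0;
}
```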
- */ - INIT_WORK(&mddev->del_work, mddev_delayed_delete); - queue_work(md_misc_wq, &mddev->del_work); - } else - kfree(mddev); - } - spin_unlock(&all_mddevs_lock); - if (bs) - bioset_free(bs); -} - -void mddev_init(struct mddev *mddev) -{ - mutex_init(&mddev->open_mutex); - mutex_init(&mddev->reconfig_mutex); - mutex_init(&mddev->bitmap_info.mutex); - INIT_LIST_HEAD(&mddev->disks); - INIT_LIST_HEAD(&mddev->all_mddevs); - init_timer(&mddev->safemode_timer); - atomic_set(&mddev->active, 1); - atomic_set(&mddev->openers, 0); - atomic_set(&mddev->active_io, 0); - atomic_set(&mddev->plug_cnt, 0); - spin_lock_init(&mddev->write_lock); - atomic_set(&mddev->flush_pending, 0); - init_waitqueue_head(&mddev->sb_wait); - init_waitqueue_head(&mddev->recovery_wait); - mddev->reshape_position = MaxSector; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - mddev->level = LEVEL_NONE; -} -EXPORT_SYMBOL_GPL(mddev_init); - -static struct mddev * mddev_find(dev_t unit) -{ - struct mddev *mddev, *new = NULL; - - if (unit && MAJOR(unit) != MD_MAJOR) - unit &= ~((1<<MdpMinorShift)-1); - - retry: - spin_lock(&all_mddevs_lock); - - if (unit) { - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; - } - - if (new) { - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - new->hold_active = UNTIL_IOCTL; - return new; - } - } else if (new) { - /* find an unused unit number */ - static int next_minor = 512; - int start = next_minor; - int is_free = 0; - int dev = 0; - while (!is_free) { - dev = MKDEV(MD_MAJOR, next_minor); - next_minor++; - if (next_minor > MINORMASK) - next_minor = 0; - if (next_minor == start) { - /* Oh dear, all in use. */ - spin_unlock(&all_mddevs_lock); - kfree(new); - return NULL; - } - - is_free = 1; - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == dev) { - is_free = 0; - break; - } - } - new->unit = dev; - new->md_minor = MINOR(dev); - new->hold_active = UNTIL_STOP; - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - return new; - } - spin_unlock(&all_mddevs_lock); - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return NULL; - - new->unit = unit; - if (MAJOR(unit) == MD_MAJOR) - new->md_minor = MINOR(unit); - else - new->md_minor = MINOR(unit) >> MdpMinorShift; - - mddev_init(new); - - goto retry; -} - -static inline int mddev_lock(struct mddev * mddev) -{ - return mutex_lock_interruptible(&mddev->reconfig_mutex); -} - -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - -static inline int mddev_trylock(struct mddev * mddev) -{ - return mutex_trylock(&mddev->reconfig_mutex); -} - -static struct attribute_group md_redundancy_group; - -static void mddev_unlock(struct mddev * mddev) -{ - if (mddev->to_remove) { - /* These cannot be removed under reconfig_mutex as - * an access to the files will try to take reconfig_mutex - * while holding the file unremovable, which leads to - * a deadlock. - * So hold set sysfs_active while the remove in happeing, - * and anything else which might set ->to_remove or my - * otherwise change the sysfs namespace will fail with - * -EBUSY if sysfs_active is still set. - * We set sysfs_active under reconfig_mutex and elsewhere - * test it under the same mutex to ensure its correct value - * is seen. 
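When mddev_find() above is asked for an unspecified unit it walks minor numbers from a static cursor, wrapping at MINORMASK and giving up once it comes back to where it started. A compact sketch of that wrap-around scan over a toy in-use set; the mask value and the in-use helper are invented for the example.

```c
/* Sketch: wrap-around scan for a free minor number, as in mddev_find().
 * The in-use check and the mask are simplified stand-ins. */
#include <stdbool.h>
#include <stdio.h>

#define TOY_MINORMASK 1023              /* small mask for the example */

static bool minor_in_use(int minor)
{
	/* Pretend minors 512..515 are already taken. */
	return minor >= 512 && minor <= 515;
}

static int find_free_minor(void)
{
	static int next_minor = 512;        /* persistent cursor, as in md.c */
	int start = next_minor;

	for (;;) {
		int candidate = next_minor++;

		if (next_minor > TOY_MINORMASK)
			next_minor = 0;
		if (next_minor == start)
			return -1;          /* wrapped around: all in use */
		if (!minor_in_use(candidate))
			return candidate;
	}
}

int main(void)
{
	printf("first free minor: %d\n", find_free_minor());
	return 0;
}
```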
- */ - struct attribute_group *to_remove = mddev->to_remove; - mddev->to_remove = NULL; - mddev->sysfs_active = 1; - mutex_unlock(&mddev->reconfig_mutex); - - if (mddev->kobj.sd) { - if (to_remove != &md_redundancy_group) - sysfs_remove_group(&mddev->kobj, to_remove); - if (mddev->pers == NULL || - mddev->pers->sync_request == NULL) { - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; - } - } - mddev->sysfs_active = 0; - } else - mutex_unlock(&mddev->reconfig_mutex); - - /* As we've dropped the mutex we need a spinlock to - * make sure the thread doesn't disappear - */ - spin_lock(&pers_lock); - md_wakeup_thread(mddev->thread); - spin_unlock(&pers_lock); -} - -static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) -{ - struct md_rdev *rdev; - - rdev_for_each(rdev, mddev) - if (rdev->desc_nr == nr) - return rdev; - - return NULL; -} - -static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) -{ - struct md_rdev *rdev; - - rdev_for_each(rdev, mddev) - if (rdev->bdev->bd_dev == dev) - return rdev; - - return NULL; -} - -static struct md_personality *find_pers(int level, char *clevel) -{ - struct md_personality *pers; - list_for_each_entry(pers, &pers_list, list) { - if (level != LEVEL_NONE && pers->level == level) - return pers; - if (strcmp(pers->name, clevel)==0) - return pers; - } - return NULL; -} - -/* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) -{ - sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; - return MD_NEW_SIZE_SECTORS(num_sectors); -} - -static int alloc_disk_sb(struct md_rdev * rdev) -{ - if (rdev->sb_page) - MD_BUG(); - - rdev->sb_page = alloc_page(GFP_KERNEL); - if (!rdev->sb_page) { - printk(KERN_ALERT "md: out of memory.\n"); - return -ENOMEM; - } - - return 0; -} - -static void free_disk_sb(struct md_rdev * rdev) -{ - if (rdev->sb_page) { - put_page(rdev->sb_page); - rdev->sb_loaded = 0; - rdev->sb_page = NULL; - rdev->sb_start = 0; - rdev->sectors = 0; - } - if (rdev->bb_page) { - put_page(rdev->bb_page); - rdev->bb_page = NULL; - } -} - - -static void super_written(struct bio *bio, int error) -{ - struct md_rdev *rdev = bio->bi_private; - struct mddev *mddev = rdev->mddev; - - if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { - printk("md: super_written gets error=%d, uptodate=%d\n", - error, test_bit(BIO_UPTODATE, &bio->bi_flags)); - WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); - md_error(mddev, rdev); - } - - if (atomic_dec_and_test(&mddev->pending_writes)) - wake_up(&mddev->sb_wait); - bio_put(bio); -} - -void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page) -{ - /* write first size bytes of page to sector of rdev - * Increment mddev->pending_writes before returning - * and decrement it on completion, waking up sb_wait - * if zero is reached. - * If an error occurred, call md_error - */ - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); - - bio->bi_bdev = rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev; - bio->bi_sector = sector; - bio_add_page(bio, page, size, 0); - bio->bi_private = rdev; - bio->bi_end_io = super_written; - - atomic_inc(&mddev->pending_writes); - submit_bio(WRITE_FLUSH_FUA, bio); -} - -void md_super_wait(struct mddev *mddev) -{ - /* wait for all superblock writes that were scheduled to complete */ - DEFINE_WAIT(wq); - for(;;) { - prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); - if (atomic_read(&mddev->pending_writes)==0) - break; - schedule(); - } - finish_wait(&mddev->sb_wait, &wq); -} - -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - -int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, - struct page *page, int rw, bool metadata_op) -{ - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); - struct completion event; - int ret; - - rw |= REQ_SYNC; - - bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? - rdev->meta_bdev : rdev->bdev; - if (metadata_op) - bio->bi_sector = sector + rdev->sb_start; - else - bio->bi_sector = sector + rdev->data_offset; - bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - ret = test_bit(BIO_UPTODATE, &bio->bi_flags); - bio_put(bio); - return ret; -} -EXPORT_SYMBOL_GPL(sync_page_io); - -static int read_disk_sb(struct md_rdev * rdev, int size) -{ - char b[BDEVNAME_SIZE]; - if (!rdev->sb_page) { - MD_BUG(); - return -EINVAL; - } - if (rdev->sb_loaded) - return 0; - - - if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) - goto fail; - rdev->sb_loaded = 1; - return 0; - -fail: - printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", - bdevname(rdev->bdev,b)); - return -EINVAL; -} - -static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) -{ - return sb1->set_uuid0 == sb2->set_uuid0 && - sb1->set_uuid1 == sb2->set_uuid1 && - sb1->set_uuid2 == sb2->set_uuid2 && - sb1->set_uuid3 == sb2->set_uuid3; -} - -static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) -{ - int ret; - mdp_super_t *tmp1, *tmp2; - - tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); - tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); - - if (!tmp1 || !tmp2) { - ret = 0; - printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); - goto abort; - } - - *tmp1 = *sb1; - *tmp2 = *sb2; - - /* - * nr_disks is not constant - */ - tmp1->nr_disks = 0; - tmp2->nr_disks = 0; - - ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); -abort: - kfree(tmp1); - kfree(tmp2); - return ret; -} - - -static u32 md_csum_fold(u32 csum) -{ - csum = (csum & 0xffff) + (csum >> 16); - return (csum & 0xffff) + (csum >> 16); -} - -static unsigned int calc_sb_csum(mdp_super_t * sb) -{ - u64 newcsum = 0; - u32 *sb32 = (u32*)sb; - int i; - unsigned int disk_csum, csum; - - disk_csum = sb->sb_csum; - sb->sb_csum = 0; - - for (i = 0; i < MD_SB_BYTES/4 ; i++) - newcsum += sb32[i]; - csum = (newcsum & 0xffffffff) + (newcsum>>32); - - -#ifdef CONFIG_ALPHA - /* This used to use csum_partial, which was wrong for several - * reasons including that different results are returned on - * different architectures. It isn't critical that we get exactly - * the same return value as before (we always csum_fold before - * testing, and that removes any differences). However as we - * know that csum_partial always returned a 16bit value on - * alphas, do a fold to maximise conformity to previous behaviour. 
- */ - sb->sb_csum = md_csum_fold(disk_csum); -#else - sb->sb_csum = disk_csum; -#endif - return csum; -} - - -/* - * Handle superblock details. - * We want to be able to handle multiple superblock formats - * so we have a common interface to them all, and an array of - * different handlers. - * We rely on user-space to write the initial superblock, and support - * reading and updating of superblocks. - * Interface methods are: - * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) - * loads and validates a superblock on dev. - * if refdev != NULL, compare superblocks on both devices - * Return: - * 0 - dev has a superblock that is compatible with refdev - * 1 - dev has a superblock that is compatible and newer than refdev - * so dev should be used as the refdev in future - * -EINVAL superblock incompatible or invalid - * -othererror e.g. -EIO - * - * int validate_super(struct mddev *mddev, struct md_rdev *dev) - * Verify that dev is acceptable into mddev. - * The first time, mddev->raid_disks will be 0, and data from - * dev should be merged in. Subsequent calls check that dev - * is new enough. Return 0 or -EINVAL - * - * void sync_super(struct mddev *mddev, struct md_rdev *dev) - * Update the superblock for rdev with data in mddev - * This does not write to disc. - * - */ - -struct super_type { - char *name; - struct module *owner; - int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, - int minor_version); - int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); - void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); - unsigned long long (*rdev_size_change)(struct md_rdev *rdev, - sector_t num_sectors); -}; - -/* - * Check that the given mddev has no bitmap. - * - * This function is called from the run method of all personalities that do not - * support bitmaps. It prints an error message and returns non-zero if mddev - * has a bitmap. Otherwise, it returns 0. - * - */ -int md_check_no_bitmap(struct mddev *mddev) -{ - if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) - return 0; - printk(KERN_ERR "%s: bitmaps are not supported for %s\n", - mdname(mddev), mddev->pers->name); - return 1; -} -EXPORT_SYMBOL(md_check_no_bitmap); - -/* - * load_super for 0.90.0 - */ -static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) -{ - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - mdp_super_t *sb; - int ret; - - /* - * Calculate the position of the superblock (512byte sectors), - * it's at the end of the disk. - * - * It also happens to be a multiple of 4Kb. 
- */ - rdev->sb_start = calc_dev_sboffset(rdev); - - ret = read_disk_sb(rdev, MD_SB_BYTES); - if (ret) return ret; - - ret = -EINVAL; - - bdevname(rdev->bdev, b); - sb = page_address(rdev->sb_page); - - if (sb->md_magic != MD_SB_MAGIC) { - printk(KERN_ERR "md: invalid raid superblock magic on %s\n", - b); - goto abort; - } - - if (sb->major_version != 0 || - sb->minor_version < 90 || - sb->minor_version > 91) { - printk(KERN_WARNING "Bad version number %d.%d on %s\n", - sb->major_version, sb->minor_version, - b); - goto abort; - } - - if (sb->raid_disks <= 0) - goto abort; - - if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { - printk(KERN_WARNING "md: invalid superblock checksum on %s\n", - b); - goto abort; - } - - rdev->preferred_minor = sb->md_minor; - rdev->data_offset = 0; - rdev->sb_size = MD_SB_BYTES; - rdev->badblocks.shift = -1; - - if (sb->level == LEVEL_MULTIPATH) - rdev->desc_nr = -1; - else - rdev->desc_nr = sb->this_disk.number; - - if (!refdev) { - ret = 1; - } else { - __u64 ev1, ev2; - mdp_super_t *refsb = page_address(refdev->sb_page); - if (!uuid_equal(refsb, sb)) { - printk(KERN_WARNING "md: %s has different UUID to %s\n", - b, bdevname(refdev->bdev,b2)); - goto abort; - } - if (!sb_equal(refsb, sb)) { - printk(KERN_WARNING "md: %s has same UUID" - " but different superblock to %s\n", - b, bdevname(refdev->bdev, b2)); - goto abort; - } - ev1 = md_event(sb); - ev2 = md_event(refsb); - if (ev1 > ev2) - ret = 1; - else - ret = 0; - } - rdev->sectors = rdev->sb_start; - /* Limit to 4TB as metadata cannot record more than that */ - if (rdev->sectors >= (2ULL << 32)) - rdev->sectors = (2ULL << 32) - 2; - - if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) - /* "this cannot possibly happen" ... */ - ret = -EINVAL; - - abort: - return ret; -} - -/* - * validate_super for 0.90.0 - */ -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) -{ - mdp_disk_t *desc; - mdp_super_t *sb = page_address(rdev->sb_page); - __u64 ev1 = md_event(sb); - - rdev->raid_disk = -1; - clear_bit(Faulty, &rdev->flags); - clear_bit(In_sync, &rdev->flags); - clear_bit(WriteMostly, &rdev->flags); - - if (mddev->raid_disks == 0) { - mddev->major_version = 0; - mddev->minor_version = sb->minor_version; - mddev->patch_version = sb->patch_version; - mddev->external = 0; - mddev->chunk_sectors = sb->chunk_size >> 9; - mddev->ctime = sb->ctime; - mddev->utime = sb->utime; - mddev->level = sb->level; - mddev->clevel[0] = 0; - mddev->layout = sb->layout; - mddev->raid_disks = sb->raid_disks; - mddev->dev_sectors = ((sector_t)sb->size) * 2; - mddev->events = ev1; - mddev->bitmap_info.offset = 0; - mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; - - if (mddev->minor_version >= 91) { - mddev->reshape_position = sb->reshape_position; - mddev->delta_disks = sb->delta_disks; - mddev->new_level = sb->new_level; - mddev->new_layout = sb->new_layout; - mddev->new_chunk_sectors = sb->new_chunk >> 9; - } else { - mddev->reshape_position = MaxSector; - mddev->delta_disks = 0; - mddev->new_level = mddev->level; - mddev->new_layout = mddev->layout; - mddev->new_chunk_sectors = mddev->chunk_sectors; - } - - if (sb->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; - else { - if (sb->events_hi == sb->cp_events_hi && - sb->events_lo == sb->cp_events_lo) { - mddev->recovery_cp = sb->recovery_cp; - } else - mddev->recovery_cp = 0; - } - - memcpy(mddev->uuid+0, &sb->set_uuid0, 4); - memcpy(mddev->uuid+4, &sb->set_uuid1, 4); - memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 
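As an aside on the 0.90 checksum that super_90_load() verifies above: the superblock is summed as 32-bit words with the csum field zeroed, and both sides are folded to 16 bits before comparison. A minimal user-space sketch, assuming MD_SB_BYTES is 4096 as in md_p.h; the buffer and the csum word index here are stand-ins, not a real mdp_super_t:

/* Sketch of the 0.90 superblock checksum; not the kernel code itself. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MD_SB_BYTES 4096   /* assumed value of the 0.90 superblock size */

static uint32_t csum_fold(uint32_t csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

/* Sum the superblock as 32-bit words, with the csum word itself zeroed. */
static uint32_t sb_csum(uint32_t *sb32, size_t csum_word_index)
{
	uint64_t newcsum = 0;
	uint32_t saved = sb32[csum_word_index];
	size_t i;

	sb32[csum_word_index] = 0;
	for (i = 0; i < MD_SB_BYTES / 4; i++)
		newcsum += sb32[i];
	sb32[csum_word_index] = saved;
	return (uint32_t)(newcsum & 0xffffffff) + (uint32_t)(newcsum >> 32);
}

int main(void)
{
	static uint32_t buf[MD_SB_BYTES / 4];

	memset(buf, 0xab, sizeof(buf));
	buf[1] = 0x12345678;   /* pretend this word is sb_csum */
	printf("folded csum: 0x%04x\n", csum_fold(sb_csum(buf, 1)));
	return 0;
}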
- memcpy(mddev->uuid+12,&sb->set_uuid3, 4); - - mddev->max_disks = MD_SB_DISKS; - - if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && - mddev->bitmap_info.file == NULL) - mddev->bitmap_info.offset = - mddev->bitmap_info.default_offset; - - } else if (mddev->pers == NULL) { - /* Insist on good event counter while assembling, except - * for spares (which don't need an event count) */ - ++ev1; - if (sb->disks[rdev->desc_nr].state & ( - (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) - if (ev1 < mddev->events) - return -EINVAL; - } else if (mddev->bitmap) { - /* if adding to array with a bitmap, then we can accept an - * older device ... but not too old. - */ - if (ev1 < mddev->bitmap->events_cleared) - return 0; - } else { - if (ev1 < mddev->events) - /* just a hot-add of a new device, leave raid_disk at -1 */ - return 0; - } - - if (mddev->level != LEVEL_MULTIPATH) { - desc = sb->disks + rdev->desc_nr; - - if (desc->state & (1<<MD_DISK_FAULTY)) - set_bit(Faulty, &rdev->flags); - else if (desc->state & (1<<MD_DISK_SYNC) /* && - desc->raid_disk < mddev->raid_disks */) { - set_bit(In_sync, &rdev->flags); - rdev->raid_disk = desc->raid_disk; - } else if (desc->state & (1<<MD_DISK_ACTIVE)) { - /* active but not in sync implies recovery up to - * reshape position. We don't know exactly where - * that is, so set to zero for now */ - if (mddev->minor_version >= 91) { - rdev->recovery_offset = 0; - rdev->raid_disk = desc->raid_disk; - } - } - if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) - set_bit(WriteMostly, &rdev->flags); - } else /* MULTIPATH are always insync */ - set_bit(In_sync, &rdev->flags); - return 0; -} - -/* - * sync_super for 0.90.0 - */ -static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) -{ - mdp_super_t *sb; - struct md_rdev *rdev2; - int next_spare = mddev->raid_disks; - - - /* make rdev->sb match mddev data.. - * - * 1/ zero out disks - * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); - * 3/ any empty disks < next_spare become removed - * - * disks[0] gets initialised to REMOVED because - * we cannot be sure from other fields if it has - * been initialised or not. 
- */ - int i; - int active=0, working=0,failed=0,spare=0,nr_disks=0; - - rdev->sb_size = MD_SB_BYTES; - - sb = page_address(rdev->sb_page); - - memset(sb, 0, sizeof(*sb)); - - sb->md_magic = MD_SB_MAGIC; - sb->major_version = mddev->major_version; - sb->patch_version = mddev->patch_version; - sb->gvalid_words = 0; /* ignored */ - memcpy(&sb->set_uuid0, mddev->uuid+0, 4); - memcpy(&sb->set_uuid1, mddev->uuid+4, 4); - memcpy(&sb->set_uuid2, mddev->uuid+8, 4); - memcpy(&sb->set_uuid3, mddev->uuid+12,4); - - sb->ctime = mddev->ctime; - sb->level = mddev->level; - sb->size = mddev->dev_sectors / 2; - sb->raid_disks = mddev->raid_disks; - sb->md_minor = mddev->md_minor; - sb->not_persistent = 0; - sb->utime = mddev->utime; - sb->state = 0; - sb->events_hi = (mddev->events>>32); - sb->events_lo = (u32)mddev->events; - - if (mddev->reshape_position == MaxSector) - sb->minor_version = 90; - else { - sb->minor_version = 91; - sb->reshape_position = mddev->reshape_position; - sb->new_level = mddev->new_level; - sb->delta_disks = mddev->delta_disks; - sb->new_layout = mddev->new_layout; - sb->new_chunk = mddev->new_chunk_sectors << 9; - } - mddev->minor_version = sb->minor_version; - if (mddev->in_sync) - { - sb->recovery_cp = mddev->recovery_cp; - sb->cp_events_hi = (mddev->events>>32); - sb->cp_events_lo = (u32)mddev->events; - if (mddev->recovery_cp == MaxSector) - sb->state = (1<< MD_SB_CLEAN); - } else - sb->recovery_cp = 0; - - sb->layout = mddev->layout; - sb->chunk_size = mddev->chunk_sectors << 9; - - if (mddev->bitmap && mddev->bitmap_info.file == NULL) - sb->state |= (1<<MD_SB_BITMAP_PRESENT); - - sb->disks[0].state = (1<<MD_DISK_REMOVED); - rdev_for_each(rdev2, mddev) { - mdp_disk_t *d; - int desc_nr; - int is_active = test_bit(In_sync, &rdev2->flags); - - if (rdev2->raid_disk >= 0 && - sb->minor_version >= 91) - /* we have nowhere to store the recovery_offset, - * but if it is not below the reshape_position, - * we can piggy-back on that. 
- */ - is_active = 1; - if (rdev2->raid_disk < 0 || - test_bit(Faulty, &rdev2->flags)) - is_active = 0; - if (is_active) - desc_nr = rdev2->raid_disk; - else - desc_nr = next_spare++; - rdev2->desc_nr = desc_nr; - d = &sb->disks[rdev2->desc_nr]; - nr_disks++; - d->number = rdev2->desc_nr; - d->major = MAJOR(rdev2->bdev->bd_dev); - d->minor = MINOR(rdev2->bdev->bd_dev); - if (is_active) - d->raid_disk = rdev2->raid_disk; - else - d->raid_disk = rdev2->desc_nr; /* compatibility */ - if (test_bit(Faulty, &rdev2->flags)) - d->state = (1<<MD_DISK_FAULTY); - else if (is_active) { - d->state = (1<<MD_DISK_ACTIVE); - if (test_bit(In_sync, &rdev2->flags)) - d->state |= (1<<MD_DISK_SYNC); - active++; - working++; - } else { - d->state = 0; - spare++; - working++; - } - if (test_bit(WriteMostly, &rdev2->flags)) - d->state |= (1<<MD_DISK_WRITEMOSTLY); - } - /* now set the "removed" and "faulty" bits on any missing devices */ - for (i=0 ; i < mddev->raid_disks ; i++) { - mdp_disk_t *d = &sb->disks[i]; - if (d->state == 0 && d->number == 0) { - d->number = i; - d->raid_disk = i; - d->state = (1<<MD_DISK_REMOVED); - d->state |= (1<<MD_DISK_FAULTY); - failed++; - } - } - sb->nr_disks = nr_disks; - sb->active_disks = active; - sb->working_disks = working; - sb->failed_disks = failed; - sb->spare_disks = spare; - - sb->this_disk = sb->disks[rdev->desc_nr]; - sb->sb_csum = calc_sb_csum(sb); -} - -/* - * rdev_size_change for 0.90.0 - */ -static unsigned long long -super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) -{ - if (num_sectors && num_sectors < rdev->mddev->dev_sectors) - return 0; /* component must fit device */ - if (rdev->mddev->bitmap_info.offset) - return 0; /* can't move bitmap */ - rdev->sb_start = calc_dev_sboffset(rdev); - if (!num_sectors || num_sectors > rdev->sb_start) - num_sectors = rdev->sb_start; - /* Limit to 4TB as metadata cannot record more than that. - * 4TB == 2^32 KB, or 2*2^32 sectors. - */ - if (num_sectors >= (2ULL << 32)) - num_sectors = (2ULL << 32) - 2; - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); - md_super_wait(rdev->mddev); - return num_sectors; -} - - -/* - * version 1 superblock - */ - -static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) -{ - __le32 disk_csum; - u32 csum; - unsigned long long newcsum; - int size = 256 + le32_to_cpu(sb->max_dev)*2; - __le32 *isuper = (__le32*)sb; - int i; - - disk_csum = sb->sb_csum; - sb->sb_csum = 0; - newcsum = 0; - for (i=0; size>=4; size -= 4 ) - newcsum += le32_to_cpu(*isuper++); - - if (size == 2) - newcsum += le16_to_cpu(*(__le16*) isuper); - - csum = (newcsum & 0xffffffff) + (newcsum >> 32); - sb->sb_csum = disk_csum; - return cpu_to_le32(csum); -} - -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); -static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) -{ - struct mdp_superblock_1 *sb; - int ret; - sector_t sb_start; - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - int bmask; - - /* - * Calculate the position of the superblock in 512byte sectors. - * It is always aligned to a 4K boundary and - * depeding on minor_version, it can be: - * 0: At least 8K, but less than 12K, from end of device - * 1: At start of device - * 2: 4K from start of device. 
- */ - switch(minor_version) { - case 0: - sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; - sb_start -= 8*2; - sb_start &= ~(sector_t)(4*2-1); - break; - case 1: - sb_start = 0; - break; - case 2: - sb_start = 8; - break; - default: - return -EINVAL; - } - rdev->sb_start = sb_start; - - /* superblock is rarely larger than 1K, but it can be larger, - * and it is safe to read 4k, so we do that - */ - ret = read_disk_sb(rdev, 4096); - if (ret) return ret; - - - sb = page_address(rdev->sb_page); - - if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || - sb->major_version != cpu_to_le32(1) || - le32_to_cpu(sb->max_dev) > (4096-256)/2 || - le64_to_cpu(sb->super_offset) != rdev->sb_start || - (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) - return -EINVAL; - - if (calc_sb_1_csum(sb) != sb->sb_csum) { - printk("md: invalid superblock checksum on %s\n", - bdevname(rdev->bdev,b)); - return -EINVAL; - } - if (le64_to_cpu(sb->data_size) < 10) { - printk("md: data_size too small on %s\n", - bdevname(rdev->bdev,b)); - return -EINVAL; - } - - rdev->preferred_minor = 0xffff; - rdev->data_offset = le64_to_cpu(sb->data_offset); - atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); - - rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; - if (rdev->sb_size & bmask) - rdev->sb_size = (rdev->sb_size | bmask) + 1; - - if (minor_version - && rdev->data_offset < sb_start + (rdev->sb_size/512)) - return -EINVAL; - - if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) - rdev->desc_nr = -1; - else - rdev->desc_nr = le32_to_cpu(sb->dev_number); - - if (!rdev->bb_page) { - rdev->bb_page = alloc_page(GFP_KERNEL); - if (!rdev->bb_page) - return -ENOMEM; - } - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && - rdev->badblocks.count == 0) { - /* need to load the bad block list. - * Currently we limit it to one page. 
- */ - s32 offset; - sector_t bb_sector; - u64 *bbp; - int i; - int sectors = le16_to_cpu(sb->bblog_size); - if (sectors > (PAGE_SIZE / 512)) - return -EINVAL; - offset = le32_to_cpu(sb->bblog_offset); - if (offset == 0) - return -EINVAL; - bb_sector = (long long)offset; - if (!sync_page_io(rdev, bb_sector, sectors << 9, - rdev->bb_page, READ, true)) - return -EIO; - bbp = (u64 *)page_address(rdev->bb_page); - rdev->badblocks.shift = sb->bblog_shift; - for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { - u64 bb = le64_to_cpu(*bbp); - int count = bb & (0x3ff); - u64 sector = bb >> 10; - sector <<= sb->bblog_shift; - count <<= sb->bblog_shift; - if (bb + 1 == 0) - break; - if (md_set_badblocks(&rdev->badblocks, - sector, count, 1) == 0) - return -EINVAL; - } - } else if (sb->bblog_offset == 0) - rdev->badblocks.shift = -1; - - if (!refdev) { - ret = 1; - } else { - __u64 ev1, ev2; - struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); - - if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || - sb->level != refsb->level || - sb->layout != refsb->layout || - sb->chunksize != refsb->chunksize) { - printk(KERN_WARNING "md: %s has strangely different" - " superblock to %s\n", - bdevname(rdev->bdev,b), - bdevname(refdev->bdev,b2)); - return -EINVAL; - } - ev1 = le64_to_cpu(sb->events); - ev2 = le64_to_cpu(refsb->events); - - if (ev1 > ev2) - ret = 1; - else - ret = 0; - } - if (minor_version) - rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - - le64_to_cpu(sb->data_offset); - else - rdev->sectors = rdev->sb_start; - if (rdev->sectors < le64_to_cpu(sb->data_size)) - return -EINVAL; - rdev->sectors = le64_to_cpu(sb->data_size); - if (le64_to_cpu(sb->size) > rdev->sectors) - return -EINVAL; - return ret; -} - -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - __u64 ev1 = le64_to_cpu(sb->events); - - rdev->raid_disk = -1; - clear_bit(Faulty, &rdev->flags); - clear_bit(In_sync, &rdev->flags); - clear_bit(WriteMostly, &rdev->flags); - - if (mddev->raid_disks == 0) { - mddev->major_version = 1; - mddev->patch_version = 0; - mddev->external = 0; - mddev->chunk_sectors = le32_to_cpu(sb->chunksize); - mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); - mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); - mddev->level = le32_to_cpu(sb->level); - mddev->clevel[0] = 0; - mddev->layout = le32_to_cpu(sb->layout); - mddev->raid_disks = le32_to_cpu(sb->raid_disks); - mddev->dev_sectors = le64_to_cpu(sb->size); - mddev->events = ev1; - mddev->bitmap_info.offset = 0; - mddev->bitmap_info.default_offset = 1024 >> 9; - - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); - memcpy(mddev->uuid, sb->set_uuid, 16); - - mddev->max_disks = (4096-256)/2; - - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && - mddev->bitmap_info.file == NULL ) - mddev->bitmap_info.offset = - (__s32)le32_to_cpu(sb->bitmap_offset); - - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { - mddev->reshape_position = le64_to_cpu(sb->reshape_position); - mddev->delta_disks = le32_to_cpu(sb->delta_disks); - mddev->new_level = le32_to_cpu(sb->new_level); - mddev->new_layout = le32_to_cpu(sb->new_layout); - mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); - } else { - mddev->reshape_position = MaxSector; - mddev->delta_disks = 0; - mddev->new_level = mddev->level; - mddev->new_layout = mddev->layout; - mddev->new_chunk_sectors = mddev->chunk_sectors; - } - - } else if (mddev->pers == NULL) { - 
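For reference, the v1.x superblock placement that super_1_load() computes above depends only on minor_version and the device size. A small user-space sketch of those placement rules; the 1 TiB device size is just an example:

/* Sketch of v1.x superblock placement, in 512-byte sectors. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

static long long sb_start_v1(sector_t dev_sectors, int minor_version)
{
	sector_t sb_start;

	switch (minor_version) {
	case 0:				/* 8K-12K from the end, 4K aligned */
		sb_start = dev_sectors;
		sb_start -= 8 * 2;			/* back off 8K (16 sectors) */
		sb_start &= ~(sector_t)(4 * 2 - 1);	/* round down to 4K */
		return (long long)sb_start;
	case 1:				/* at the very start of the device */
		return 0;
	case 2:				/* 4K from the start */
		return 8;
	default:
		return -1;
	}
}

int main(void)
{
	sector_t dev = 1ULL << 31;	/* a hypothetical 1 TiB device */

	for (int mv = 0; mv <= 2; mv++)
		printf("minor_version %d -> sb_start %lld\n",
		       mv, sb_start_v1(dev, mv));
	return 0;
}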
/* Insist of good event counter while assembling, except for - * spares (which don't need an event count) */ - ++ev1; - if (rdev->desc_nr >= 0 && - rdev->desc_nr < le32_to_cpu(sb->max_dev) && - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) - if (ev1 < mddev->events) - return -EINVAL; - } else if (mddev->bitmap) { - /* If adding to array with a bitmap, then we can accept an - * older device, but not too old. - */ - if (ev1 < mddev->bitmap->events_cleared) - return 0; - } else { - if (ev1 < mddev->events) - /* just a hot-add of a new device, leave raid_disk at -1 */ - return 0; - } - if (mddev->level != LEVEL_MULTIPATH) { - int role; - if (rdev->desc_nr < 0 || - rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = 0xffff; - rdev->desc_nr = -1; - } else - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); - switch(role) { - case 0xffff: /* spare */ - break; - case 0xfffe: /* faulty */ - set_bit(Faulty, &rdev->flags); - break; - default: - if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) - rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - else - set_bit(In_sync, &rdev->flags); - rdev->raid_disk = role; - break; - } - if (sb->devflags & WriteMostly1) - set_bit(WriteMostly, &rdev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) - set_bit(Replacement, &rdev->flags); - } else /* MULTIPATH are always insync */ - set_bit(In_sync, &rdev->flags); - - return 0; -} - -static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mdp_superblock_1 *sb; - struct md_rdev *rdev2; - int max_dev, i; - /* make rdev->sb match mddev and rdev data. */ - - sb = page_address(rdev->sb_page); - - sb->feature_map = 0; - sb->pad0 = 0; - sb->recovery_offset = cpu_to_le64(0); - memset(sb->pad1, 0, sizeof(sb->pad1)); - memset(sb->pad3, 0, sizeof(sb->pad3)); - - sb->utime = cpu_to_le64((__u64)mddev->utime); - sb->events = cpu_to_le64(mddev->events); - if (mddev->in_sync) - sb->resync_offset = cpu_to_le64(mddev->recovery_cp); - else - sb->resync_offset = cpu_to_le64(0); - - sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); - - sb->raid_disks = cpu_to_le32(mddev->raid_disks); - sb->size = cpu_to_le64(mddev->dev_sectors); - sb->chunksize = cpu_to_le32(mddev->chunk_sectors); - sb->level = cpu_to_le32(mddev->level); - sb->layout = cpu_to_le32(mddev->layout); - - if (test_bit(WriteMostly, &rdev->flags)) - sb->devflags |= WriteMostly1; - else - sb->devflags &= ~WriteMostly1; - - if (mddev->bitmap && mddev->bitmap_info.file == NULL) { - sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); - sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); - } - - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags)) { - sb->feature_map |= - cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = - cpu_to_le64(rdev->recovery_offset); - } - if (test_bit(Replacement, &rdev->flags)) - sb->feature_map |= - cpu_to_le32(MD_FEATURE_REPLACEMENT); - - if (mddev->reshape_position != MaxSector) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); - sb->reshape_position = cpu_to_le64(mddev->reshape_position); - sb->new_layout = cpu_to_le32(mddev->new_layout); - sb->delta_disks = cpu_to_le32(mddev->delta_disks); - sb->new_level = cpu_to_le32(mddev->new_level); - sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); - } - - if (rdev->badblocks.count == 0) - /* Nothing to do for bad blocks*/ ; - else if (sb->bblog_offset == 0) - /* Cannot record bad blocks on this device */ - md_error(mddev, rdev); - else { - struct 
badblocks *bb = &rdev->badblocks; - u64 *bbp = (u64 *)page_address(rdev->bb_page); - u64 *p = bb->page; - sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); - if (bb->changed) { - unsigned seq; - -retry: - seq = read_seqbegin(&bb->lock); - - memset(bbp, 0xff, PAGE_SIZE); - - for (i = 0 ; i < bb->count ; i++) { - u64 internal_bb = *p++; - u64 store_bb = ((BB_OFFSET(internal_bb) << 10) - | BB_LEN(internal_bb)); - *bbp++ = cpu_to_le64(store_bb); - } - bb->changed = 0; - if (read_seqretry(&bb->lock, seq)) - goto retry; - - bb->sector = (rdev->sb_start + - (int)le32_to_cpu(sb->bblog_offset)); - bb->size = le16_to_cpu(sb->bblog_size); - } - } - - max_dev = 0; - rdev_for_each(rdev2, mddev) - if (rdev2->desc_nr+1 > max_dev) - max_dev = rdev2->desc_nr+1; - - if (max_dev > le32_to_cpu(sb->max_dev)) { - int bmask; - sb->max_dev = cpu_to_le32(max_dev); - rdev->sb_size = max_dev * 2 + 256; - bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; - if (rdev->sb_size & bmask) - rdev->sb_size = (rdev->sb_size | bmask) + 1; - } else - max_dev = le32_to_cpu(sb->max_dev); - - for (i=0; i<max_dev;i++) - sb->dev_roles[i] = cpu_to_le16(0xfffe); - - rdev_for_each(rdev2, mddev) { - i = rdev2->desc_nr; - if (test_bit(Faulty, &rdev2->flags)) - sb->dev_roles[i] = cpu_to_le16(0xfffe); - else if (test_bit(In_sync, &rdev2->flags)) - sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); - else if (rdev2->raid_disk >= 0) - sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); - else - sb->dev_roles[i] = cpu_to_le16(0xffff); - } - - sb->sb_csum = calc_sb_1_csum(sb); -} - -static unsigned long long -super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) -{ - struct mdp_superblock_1 *sb; - sector_t max_sectors; - if (num_sectors && num_sectors < rdev->mddev->dev_sectors) - return 0; /* component must fit device */ - if (rdev->sb_start < rdev->data_offset) { - /* minor versions 1 and 2; superblock before data */ - max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; - max_sectors -= rdev->data_offset; - if (!num_sectors || num_sectors > max_sectors) - num_sectors = max_sectors; - } else if (rdev->mddev->bitmap_info.offset) { - /* minor version 0 with bitmap we can't move */ - return 0; - } else { - /* minor version 0; superblock after data */ - sector_t sb_start; - sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; - sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->sectors + sb_start - rdev->sb_start; - if (!num_sectors || num_sectors > max_sectors) - num_sectors = max_sectors; - rdev->sb_start = sb_start; - } - sb = page_address(rdev->sb_page); - sb->data_size = cpu_to_le64(num_sectors); - sb->super_offset = rdev->sb_start; - sb->sb_csum = calc_sb_1_csum(sb); - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); - md_super_wait(rdev->mddev); - return num_sectors; -} - -static struct super_type super_types[] = { - [0] = { - .name = "0.90.0", - .owner = THIS_MODULE, - .load_super = super_90_load, - .validate_super = super_90_validate, - .sync_super = super_90_sync, - .rdev_size_change = super_90_rdev_size_change, - }, - [1] = { - .name = "md-1", - .owner = THIS_MODULE, - .load_super = super_1_load, - .validate_super = super_1_validate, - .sync_super = super_1_sync, - .rdev_size_change = super_1_rdev_size_change, - }, -}; - -static void sync_super(struct mddev *mddev, struct md_rdev *rdev) -{ - if (mddev->sync_super) { - mddev->sync_super(mddev, rdev); - return; - } - - BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); - - 
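The bad-block records that super_1_sync() writes above (and super_1_load() decodes earlier) pack a start and a length into one 64-bit word: the low 10 bits hold the length and the upper 54 bits the start, both in units of 2^bblog_shift sectors. A user-space sketch of that packing, ignoring the on-disk little-endian conversion:

/* Sketch of the v1.x bad-block record layout; byte order is omitted. */
#include <stdint.h>
#include <stdio.h>

static uint64_t bb_pack(uint64_t start, unsigned int len)
{
	return (start << 10) | (len & 0x3ff);
}

static void bb_unpack(uint64_t bb, int bblog_shift,
		      uint64_t *sector, uint64_t *count)
{
	*count = (bb & 0x3ff) << bblog_shift;
	*sector = (bb >> 10) << bblog_shift;
}

int main(void)
{
	uint64_t sector, count;
	uint64_t rec = bb_pack(123456, 8);	/* 8 units starting at 123456 */

	bb_unpack(rec, 0, &sector, &count);
	printf("start %llu, length %llu sectors\n",
	       (unsigned long long)sector, (unsigned long long)count);
	return 0;
}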
super_types[mddev->major_version].sync_super(mddev, rdev); -} - -static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) -{ - struct md_rdev *rdev, *rdev2; - - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev1) - rdev_for_each_rcu(rdev2, mddev2) - if (rdev->bdev->bd_contains == - rdev2->bdev->bd_contains) { - rcu_read_unlock(); - return 1; - } - rcu_read_unlock(); - return 0; -} - -static LIST_HEAD(pending_raid_disks); - -/* - * Try to register data integrity profile for an mddev - * - * This is called when an array is started and after a disk has been kicked - * from the array. It only succeeds if all working and active component devices - * are integrity capable with matching profiles. - */ -int md_integrity_register(struct mddev *mddev) -{ - struct md_rdev *rdev, *reference = NULL; - - if (list_empty(&mddev->disks)) - return 0; /* nothing to do */ - if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) - return 0; /* shouldn't register, or already is */ - rdev_for_each(rdev, mddev) { - /* skip spares and non-functional disks */ - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->raid_disk < 0) - continue; - if (!reference) { - /* Use the first rdev as the reference */ - reference = rdev; - continue; - } - /* does this rdev's profile match the reference profile? */ - if (blk_integrity_compare(reference->bdev->bd_disk, - rdev->bdev->bd_disk) < 0) - return -EINVAL; - } - if (!reference || !bdev_get_integrity(reference->bdev)) - return 0; - /* - * All component devices are integrity capable and have matching - * profiles, register the common profile for the md device. - */ - if (blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)) != 0) { - printk(KERN_ERR "md: failed to register integrity for %s\n", - mdname(mddev)); - return -EINVAL; - } - printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { - printk(KERN_ERR "md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } - return 0; -} -EXPORT_SYMBOL(md_integrity_register); - -/* Disable data integrity if non-capable/non-matching disk is being added */ -void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); - - if (!bi_mddev) /* nothing to do */ - return; - if (rdev->raid_disk < 0) /* skip spares */ - return; - if (bi_rdev && blk_integrity_compare(mddev->gendisk, - rdev->bdev->bd_disk) >= 0) - return; - printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); - blk_integrity_unregister(mddev->gendisk); -} -EXPORT_SYMBOL(md_integrity_add_rdev); - -static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) -{ - char b[BDEVNAME_SIZE]; - struct kobject *ko; - char *s; - int err; - - if (rdev->mddev) { - MD_BUG(); - return -EINVAL; - } - - /* prevent duplicates */ - if (find_rdev(mddev, rdev->bdev->bd_dev)) - return -EEXIST; - - /* make sure rdev->sectors exceeds mddev->dev_sectors */ - if (rdev->sectors && (mddev->dev_sectors == 0 || - rdev->sectors < mddev->dev_sectors)) { - if (mddev->pers) { - /* Cannot change size, so fail - * If mddev->level <= 0, then we don't care - * about aligning sizes (e.g. linear) - */ - if (mddev->level > 0) - return -ENOSPC; - } else - mddev->dev_sectors = rdev->sectors; - } - - /* Verify rdev->desc_nr is unique. 
- * If it is -1, assign a free number, else - * check number is not in use - */ - if (rdev->desc_nr < 0) { - int choice = 0; - if (mddev->pers) choice = mddev->raid_disks; - while (find_rdev_nr(mddev, choice)) - choice++; - rdev->desc_nr = choice; - } else { - if (find_rdev_nr(mddev, rdev->desc_nr)) - return -EBUSY; - } - if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { - printk(KERN_WARNING "md: %s: array is limited to %d devices\n", - mdname(mddev), mddev->max_disks); - return -EBUSY; - } - bdevname(rdev->bdev,b); - while ( (s=strchr(b, '/')) != NULL) - *s = '!'; - - rdev->mddev = mddev; - printk(KERN_INFO "md: bind<%s>\n", b); - - if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) - goto fail; - - ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if (sysfs_create_link(&rdev->kobj, ko, "block")) - /* failure here is OK */; - rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); - - list_add_rcu(&rdev->same_set, &mddev->disks); - bd_link_disk_holder(rdev->bdev, mddev->gendisk); - - /* May as well allow recovery to be retried once */ - mddev->recovery_disabled++; - - return 0; - - fail: - printk(KERN_WARNING "md: failed to register dev-%s for %s\n", - b, mdname(mddev)); - return err; -} - -static void md_delayed_delete(struct work_struct *ws) -{ - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); - kobject_del(&rdev->kobj); - kobject_put(&rdev->kobj); -} - -static void unbind_rdev_from_array(struct md_rdev * rdev) -{ - char b[BDEVNAME_SIZE]; - if (!rdev->mddev) { - MD_BUG(); - return; - } - bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); - list_del_rcu(&rdev->same_set); - printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); - rdev->mddev = NULL; - sysfs_remove_link(&rdev->kobj, "block"); - sysfs_put(rdev->sysfs_state); - rdev->sysfs_state = NULL; - kfree(rdev->badblocks.page); - rdev->badblocks.count = 0; - rdev->badblocks.page = NULL; - /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state". We also need - * to delay it due to rcu usage. - */ - synchronize_rcu(); - INIT_WORK(&rdev->del_work, md_delayed_delete); - kobject_get(&rdev->kobj); - queue_work(md_misc_wq, &rdev->del_work); -} - -/* - * prevent the device from being mounted, repartitioned or - * otherwise reused by a RAID array (or any other kernel - * subsystem), by bd_claiming the device. - */ -static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) -{ - int err = 0; - struct block_device *bdev; - char b[BDEVNAME_SIZE]; - - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - shared ? 
(struct md_rdev *)lock_rdev : rdev); - if (IS_ERR(bdev)) { - printk(KERN_ERR "md: could not open %s.\n", - __bdevname(dev, b)); - return PTR_ERR(bdev); - } - rdev->bdev = bdev; - return err; -} - -static void unlock_rdev(struct md_rdev *rdev) -{ - struct block_device *bdev = rdev->bdev; - rdev->bdev = NULL; - if (!bdev) - MD_BUG(); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - -void md_autodetect_dev(dev_t dev); - -static void export_rdev(struct md_rdev * rdev) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: export_rdev(%s)\n", - bdevname(rdev->bdev,b)); - if (rdev->mddev) - MD_BUG(); - free_disk_sb(rdev); -#ifndef MODULE - if (test_bit(AutoDetected, &rdev->flags)) - md_autodetect_dev(rdev->bdev->bd_dev); -#endif - unlock_rdev(rdev); - kobject_put(&rdev->kobj); -} - -static void kick_rdev_from_array(struct md_rdev * rdev) -{ - unbind_rdev_from_array(rdev); - export_rdev(rdev); -} - -static void export_array(struct mddev *mddev) -{ - struct md_rdev *rdev, *tmp; - - rdev_for_each_safe(rdev, tmp, mddev) { - if (!rdev->mddev) { - MD_BUG(); - continue; - } - kick_rdev_from_array(rdev); - } - if (!list_empty(&mddev->disks)) - MD_BUG(); - mddev->raid_disks = 0; - mddev->major_version = 0; -} - -static void print_desc(mdp_disk_t *desc) -{ - printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, - desc->major,desc->minor,desc->raid_disk,desc->state); -} - -static void print_sb_90(mdp_super_t *sb) -{ - int i; - - printk(KERN_INFO - "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", - sb->major_version, sb->minor_version, sb->patch_version, - sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, - sb->ctime); - printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", - sb->level, sb->size, sb->nr_disks, sb->raid_disks, - sb->md_minor, sb->layout, sb->chunk_size); - printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" - " FD:%d SD:%d CSUM:%08x E:%08lx\n", - sb->utime, sb->state, sb->active_disks, sb->working_disks, - sb->failed_disks, sb->spare_disks, - sb->sb_csum, (unsigned long)sb->events_lo); - - printk(KERN_INFO); - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc; - - desc = sb->disks + i; - if (desc->number || desc->major || desc->minor || - desc->raid_disk || (desc->state && (desc->state != 4))) { - printk(" D %2d: ", i); - print_desc(desc); - } - } - printk(KERN_INFO "md: THIS: "); - print_desc(&sb->this_disk); -} - -static void print_sb_1(struct mdp_superblock_1 *sb) -{ - __u8 *uuid; - - uuid = sb->set_uuid; - printk(KERN_INFO - "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" - "md: Name: \"%s\" CT:%llu\n", - le32_to_cpu(sb->major_version), - le32_to_cpu(sb->feature_map), - uuid, - sb->set_name, - (unsigned long long)le64_to_cpu(sb->ctime) - & MD_SUPERBLOCK_1_TIME_SEC_MASK); - - uuid = sb->device_uuid; - printk(KERN_INFO - "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" - " RO:%llu\n" - "md: Dev:%08x UUID: %pU\n" - "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" - "md: (MaxDev:%u) \n", - le32_to_cpu(sb->level), - (unsigned long long)le64_to_cpu(sb->size), - le32_to_cpu(sb->raid_disks), - le32_to_cpu(sb->layout), - le32_to_cpu(sb->chunksize), - (unsigned long long)le64_to_cpu(sb->data_offset), - (unsigned long long)le64_to_cpu(sb->data_size), - (unsigned long long)le64_to_cpu(sb->super_offset), - (unsigned long long)le64_to_cpu(sb->recovery_offset), - le32_to_cpu(sb->dev_number), - uuid, - sb->devflags, - (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, - (unsigned long 
long)le64_to_cpu(sb->events), - (unsigned long long)le64_to_cpu(sb->resync_offset), - le32_to_cpu(sb->sb_csum), - le32_to_cpu(sb->max_dev) - ); -} - -static void print_rdev(struct md_rdev *rdev, int major_version) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, - test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), - rdev->desc_nr); - if (rdev->sb_loaded) { - printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); - switch (major_version) { - case 0: - print_sb_90(page_address(rdev->sb_page)); - break; - case 1: - print_sb_1(page_address(rdev->sb_page)); - break; - } - } else - printk(KERN_INFO "md: no rdev superblock!\n"); -} - -static void md_print_devices(void) -{ - struct list_head *tmp; - struct md_rdev *rdev; - struct mddev *mddev; - char b[BDEVNAME_SIZE]; - - printk("\n"); - printk("md: **********************************\n"); - printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); - printk("md: **********************************\n"); - for_each_mddev(mddev, tmp) { - - if (mddev->bitmap) - bitmap_print_sb(mddev->bitmap); - else - printk("%s: ", mdname(mddev)); - rdev_for_each(rdev, mddev) - printk("<%s>", bdevname(rdev->bdev,b)); - printk("\n"); - - rdev_for_each(rdev, mddev) - print_rdev(rdev, mddev->major_version); - } - printk("md: **********************************\n"); - printk("\n"); -} - - -static void sync_sbs(struct mddev * mddev, int nospares) -{ - /* Update each superblock (in-memory image), but - * if we are allowed to, skip spares which already - * have the right event counter, or have one earlier - * (which would mean they aren't being marked as dirty - * with the rest of the array) - */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) { - if (rdev->sb_events == mddev->events || - (nospares && - rdev->raid_disk < 0 && - rdev->sb_events+1 == mddev->events)) { - /* Don't update this superblock */ - rdev->sb_loaded = 2; - } else { - sync_super(mddev, rdev); - rdev->sb_loaded = 1; - } - } -} - -static void md_update_sb(struct mddev * mddev, int force_change) -{ - struct md_rdev *rdev; - int sync_req; - int nospares = 0; - int any_badblocks_changed = 0; - -repeat: - /* First make sure individual recovery_offsets are correct */ - rdev_for_each(rdev, mddev) { - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(In_sync, &rdev->flags) && - mddev->curr_resync_completed > rdev->recovery_offset) - rdev->recovery_offset = mddev->curr_resync_completed; - - } - if (!mddev->persistent) { - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); - clear_bit(MD_CHANGE_DEVS, &mddev->flags); - if (!mddev->external) { - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - rdev_for_each(rdev, mddev) { - if (rdev->badblocks.changed) { - rdev->badblocks.changed = 0; - md_ack_all_badblocks(&rdev->badblocks); - md_error(mddev, rdev); - } - clear_bit(Blocked, &rdev->flags); - clear_bit(BlockedBadBlocks, &rdev->flags); - wake_up(&rdev->blocked_wait); - } - } - wake_up(&mddev->sb_wait); - return; - } - - spin_lock_irq(&mddev->write_lock); - - mddev->utime = get_seconds(); - - if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) - force_change = 1; - if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) - /* just a clean<-> dirty transition, possibly leave spares alone, - * though if events isn't the right even/odd, we will have to do - * spares after all - */ - nospares = 1; - if (force_change) - nospares = 0; - if (mddev->degraded) - /* If the array is degraded, 
then skipping spares is both - * dangerous and fairly pointless. - * Dangerous because a device that was removed from the array - * might have a event_count that still looks up-to-date, - * so it can be re-added without a resync. - * Pointless because if there are any spares to skip, - * then a recovery will happen and soon that array won't - * be degraded any more and the spare can go back to sleep then. - */ - nospares = 0; - - sync_req = mddev->in_sync; - - /* If this is just a dirty<->clean transition, and the array is clean - * and 'events' is odd, we can roll back to the previous clean state */ - if (nospares - && (mddev->in_sync && mddev->recovery_cp == MaxSector) - && mddev->can_decrease_events - && mddev->events != 1) { - mddev->events--; - mddev->can_decrease_events = 0; - } else { - /* otherwise we have to go forward and ... */ - mddev->events ++; - mddev->can_decrease_events = nospares; - } - - if (!mddev->events) { - /* - * oops, this 64-bit counter should never wrap. - * Either we are in around ~1 trillion A.C., assuming - * 1 reboot per second, or we have a bug: - */ - MD_BUG(); - mddev->events --; - } - - rdev_for_each(rdev, mddev) { - if (rdev->badblocks.changed) - any_badblocks_changed++; - if (test_bit(Faulty, &rdev->flags)) - set_bit(FaultRecorded, &rdev->flags); - } - - sync_sbs(mddev, nospares); - spin_unlock_irq(&mddev->write_lock); - - pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", - mdname(mddev), mddev->in_sync); - - bitmap_update_sb(mddev->bitmap); - rdev_for_each(rdev, mddev) { - char b[BDEVNAME_SIZE]; - - if (rdev->sb_loaded != 1) - continue; /* no noise on spare devices */ - - if (!test_bit(Faulty, &rdev->flags) && - rdev->saved_raid_disk == -1) { - md_super_write(mddev,rdev, - rdev->sb_start, rdev->sb_size, - rdev->sb_page); - pr_debug("md: (write) %s's sb offset: %llu\n", - bdevname(rdev->bdev, b), - (unsigned long long)rdev->sb_start); - rdev->sb_events = mddev->events; - if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); - rdev->badblocks.size = 0; - } - - } else if (test_bit(Faulty, &rdev->flags)) - pr_debug("md: %s (skipping faulty)\n", - bdevname(rdev->bdev, b)); - else - pr_debug("(skipping incremental s/r "); - - if (mddev->level == LEVEL_MULTIPATH) - /* only need to write one superblock... */ - break; - } - md_super_wait(mddev); - /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ - - spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync != sync_req || - test_bit(MD_CHANGE_DEVS, &mddev->flags)) { - /* have to write it out again */ - spin_unlock_irq(&mddev->write_lock); - goto repeat; - } - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock_irq(&mddev->write_lock); - wake_up(&mddev->sb_wait); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - - rdev_for_each(rdev, mddev) { - if (test_and_clear_bit(FaultRecorded, &rdev->flags)) - clear_bit(Blocked, &rdev->flags); - - if (any_badblocks_changed) - md_ack_all_badblocks(&rdev->badblocks); - clear_bit(BlockedBadBlocks, &rdev->flags); - wake_up(&rdev->blocked_wait); - } -} - -/* words written to sysfs files may, or may not, be \n terminated. - * We want to accept with case. For this we use cmd_match. - */ -static int cmd_match(const char *cmd, const char *str) -{ - /* See if cmd, written into a sysfs file, matches - * str. 
They must either be the same, or cmd can - * have a trailing newline - */ - while (*cmd && *str && *cmd == *str) { - cmd++; - str++; - } - if (*cmd == '\n') - cmd++; - if (*str || *cmd) - return 0; - return 1; -} - -struct rdev_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct md_rdev *, char *); - ssize_t (*store)(struct md_rdev *, const char *, size_t); -}; - -static ssize_t -state_show(struct md_rdev *rdev, char *page) -{ - char *sep = ""; - size_t len = 0; - - if (test_bit(Faulty, &rdev->flags) || - rdev->badblocks.unacked_exist) { - len+= sprintf(page+len, "%sfaulty",sep); - sep = ","; - } - if (test_bit(In_sync, &rdev->flags)) { - len += sprintf(page+len, "%sin_sync",sep); - sep = ","; - } - if (test_bit(WriteMostly, &rdev->flags)) { - len += sprintf(page+len, "%swrite_mostly",sep); - sep = ","; - } - if (test_bit(Blocked, &rdev->flags) || - (rdev->badblocks.unacked_exist - && !test_bit(Faulty, &rdev->flags))) { - len += sprintf(page+len, "%sblocked", sep); - sep = ","; - } - if (!test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) { - len += sprintf(page+len, "%sspare", sep); - sep = ","; - } - if (test_bit(WriteErrorSeen, &rdev->flags)) { - len += sprintf(page+len, "%swrite_error", sep); - sep = ","; - } - if (test_bit(WantReplacement, &rdev->flags)) { - len += sprintf(page+len, "%swant_replacement", sep); - sep = ","; - } - if (test_bit(Replacement, &rdev->flags)) { - len += sprintf(page+len, "%sreplacement", sep); - sep = ","; - } - - return len+sprintf(page+len, "\n"); -} - -static ssize_t -state_store(struct md_rdev *rdev, const char *buf, size_t len) -{ - /* can write - * faulty - simulates an error - * remove - disconnects the device - * writemostly - sets write_mostly - * -writemostly - clears write_mostly - * blocked - sets the Blocked flags - * -blocked - clears the Blocked and possibly simulates an error - * insync - sets Insync providing device isn't active - * write_error - sets WriteErrorSeen - * -write_error - clears WriteErrorSeen - */ - int err = -EINVAL; - if (cmd_match(buf, "faulty") && rdev->mddev->pers) { - md_error(rdev->mddev, rdev); - if (test_bit(Faulty, &rdev->flags)) - err = 0; - else - err = -EBUSY; - } else if (cmd_match(buf, "remove")) { - if (rdev->raid_disk >= 0) - err = -EBUSY; - else { - struct mddev *mddev = rdev->mddev; - kick_rdev_from_array(rdev); - if (mddev->pers) - md_update_sb(mddev, 1); - md_new_event(mddev); - err = 0; - } - } else if (cmd_match(buf, "writemostly")) { - set_bit(WriteMostly, &rdev->flags); - err = 0; - } else if (cmd_match(buf, "-writemostly")) { - clear_bit(WriteMostly, &rdev->flags); - err = 0; - } else if (cmd_match(buf, "blocked")) { - set_bit(Blocked, &rdev->flags); - err = 0; - } else if (cmd_match(buf, "-blocked")) { - if (!test_bit(Faulty, &rdev->flags) && - rdev->badblocks.unacked_exist) { - /* metadata handler doesn't understand badblocks, - * so we need to fail the device - */ - md_error(rdev->mddev, rdev); - } - clear_bit(Blocked, &rdev->flags); - clear_bit(BlockedBadBlocks, &rdev->flags); - wake_up(&rdev->blocked_wait); - set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); - - err = 0; - } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { - set_bit(In_sync, &rdev->flags); - err = 0; - } else if (cmd_match(buf, "write_error")) { - set_bit(WriteErrorSeen, &rdev->flags); - err = 0; - } else if (cmd_match(buf, "-write_error")) { - clear_bit(WriteErrorSeen, &rdev->flags); - err = 0; - } else if (cmd_match(buf, "want_replacement")) 
{ - /* Any non-spare device that is not a replacement can - * become want_replacement at any time, but we then need to - * check if recovery is needed. - */ - if (rdev->raid_disk >= 0 && - !test_bit(Replacement, &rdev->flags)) - set_bit(WantReplacement, &rdev->flags); - set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); - err = 0; - } else if (cmd_match(buf, "-want_replacement")) { - /* Clearing 'want_replacement' is always allowed. - * Once replacements starts it is too late though. - */ - err = 0; - clear_bit(WantReplacement, &rdev->flags); - } else if (cmd_match(buf, "replacement")) { - /* Can only set a device as a replacement when array has not - * yet been started. Once running, replacement is automatic - * from spares, or by assigning 'slot'. - */ - if (rdev->mddev->pers) - err = -EBUSY; - else { - set_bit(Replacement, &rdev->flags); - err = 0; - } - } else if (cmd_match(buf, "-replacement")) { - /* Similarly, can only clear Replacement before start */ - if (rdev->mddev->pers) - err = -EBUSY; - else { - clear_bit(Replacement, &rdev->flags); - err = 0; - } - } - if (!err) - sysfs_notify_dirent_safe(rdev->sysfs_state); - return err ? err : len; -} -static struct rdev_sysfs_entry rdev_state = -__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); - -static ssize_t -errors_show(struct md_rdev *rdev, char *page) -{ - return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); -} - -static ssize_t -errors_store(struct md_rdev *rdev, const char *buf, size_t len) -{ - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - if (*buf && (*e == 0 || *e == '\n')) { - atomic_set(&rdev->corrected_errors, n); - return len; - } - return -EINVAL; -} -static struct rdev_sysfs_entry rdev_errors = -__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); - -static ssize_t -slot_show(struct md_rdev *rdev, char *page) -{ - if (rdev->raid_disk < 0) - return sprintf(page, "none\n"); - else - return sprintf(page, "%d\n", rdev->raid_disk); -} - -static ssize_t -slot_store(struct md_rdev *rdev, const char *buf, size_t len) -{ - char *e; - int err; - int slot = simple_strtoul(buf, &e, 10); - if (strncmp(buf, "none", 4)==0) - slot = -1; - else if (e==buf || (*e && *e!= '\n')) - return -EINVAL; - if (rdev->mddev->pers && slot == -1) { - /* Setting 'slot' on an active array requires also - * updating the 'rd%d' link, and communicating - * with the personality with ->hot_*_disk. - * For now we only support removing - * failed/spare devices. This normally happens automatically, - * but not when the metadata is externally managed. - */ - if (rdev->raid_disk == -1) - return -EEXIST; - /* personality does all needed checks */ - if (rdev->mddev->pers->hot_remove_disk == NULL) - return -EINVAL; - err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev); - if (err) - return err; - sysfs_unlink_rdev(rdev->mddev, rdev); - rdev->raid_disk = -1; - set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); - } else if (rdev->mddev->pers) { - /* Activating a spare .. or possibly reactivating - * if we ever get bitmaps working here. 
- */ - - if (rdev->raid_disk != -1) - return -EBUSY; - - if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) - return -EBUSY; - - if (rdev->mddev->pers->hot_add_disk == NULL) - return -EINVAL; - - if (slot >= rdev->mddev->raid_disks && - slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) - return -ENOSPC; - - rdev->raid_disk = slot; - if (test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = slot; - else - rdev->saved_raid_disk = -1; - clear_bit(In_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); - if (err) { - rdev->raid_disk = -1; - return err; - } else - sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; - /* don't wakeup anyone, leave that to userspace. */ - } else { - if (slot >= rdev->mddev->raid_disks && - slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) - return -ENOSPC; - rdev->raid_disk = slot; - /* assume it is working */ - clear_bit(Faulty, &rdev->flags); - clear_bit(WriteMostly, &rdev->flags); - set_bit(In_sync, &rdev->flags); - sysfs_notify_dirent_safe(rdev->sysfs_state); - } - return len; -} - - -static struct rdev_sysfs_entry rdev_slot = -__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); - -static ssize_t -offset_show(struct md_rdev *rdev, char *page) -{ - return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); -} - -static ssize_t -offset_store(struct md_rdev *rdev, const char *buf, size_t len) -{ - char *e; - unsigned long long offset = simple_strtoull(buf, &e, 10); - if (e==buf || (*e && *e != '\n')) - return -EINVAL; - if (rdev->mddev->pers && rdev->raid_disk >= 0) - return -EBUSY; - if (rdev->sectors && rdev->mddev->external) - /* Must set offset before size, so overlap checks - * can be sane */ - return -EBUSY; - rdev->data_offset = offset; - return len; -} - -static struct rdev_sysfs_entry rdev_offset = -__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); - -static ssize_t -rdev_size_show(struct md_rdev *rdev, char *page) -{ - return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); -} - -static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) -{ - /* check if two start/length pairs overlap */ - if (s1+l1 <= s2) - return 0; - if (s2+l2 <= s1) - return 0; - return 1; -} - -static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) -{ - unsigned long long blocks; - sector_t new; - - if (strict_strtoull(buf, 10, &blocks) < 0) - return -EINVAL; - - if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) - return -EINVAL; /* sector conversion overflow */ - - new = blocks * 2; - if (new != blocks * 2) - return -EINVAL; /* unsigned long long to sector_t overflow */ - - *sectors = new; - return 0; -} - -static ssize_t -rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) -{ - struct mddev *my_mddev = rdev->mddev; - sector_t oldsectors = rdev->sectors; - sector_t sectors; - - if (strict_blocks_to_sectors(buf, §ors) < 0) - return -EINVAL; - if (my_mddev->pers && rdev->raid_disk >= 0) { - if (my_mddev->persistent) { - sectors = super_types[my_mddev->major_version]. 
- rdev_size_change(rdev, sectors); - if (!sectors) - return -EBUSY; - } else if (!sectors) - sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - - rdev->data_offset; - } - if (sectors < my_mddev->dev_sectors) - return -EINVAL; /* component must fit device */ - - rdev->sectors = sectors; - if (sectors > oldsectors && my_mddev->external) { - /* need to check that all other rdevs with the same ->bdev - * do not overlap. We need to unlock the mddev to avoid - * a deadlock. We have already changed rdev->sectors, and if - * we have to change it back, we will have the lock again. - */ - struct mddev *mddev; - int overlap = 0; - struct list_head *tmp; - - mddev_unlock(my_mddev); - for_each_mddev(mddev, tmp) { - struct md_rdev *rdev2; - - mddev_lock(mddev); - rdev_for_each(rdev2, mddev) - if (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->sectors, - rdev2->data_offset, - rdev2->sectors)) { - overlap = 1; - break; - } - mddev_unlock(mddev); - if (overlap) { - mddev_put(mddev); - break; - } - } - mddev_lock(my_mddev); - if (overlap) { - /* Someone else could have slipped in a size - * change here, but doing so is just silly. - * We put oldsectors back because we *know* it is - * safe, and trust userspace not to race with - * itself - */ - rdev->sectors = oldsectors; - return -EBUSY; - } - } - return len; -} - -static struct rdev_sysfs_entry rdev_size = -__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); - - -static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) -{ - unsigned long long recovery_start = rdev->recovery_offset; - - if (test_bit(In_sync, &rdev->flags) || - recovery_start == MaxSector) - return sprintf(page, "none\n"); - - return sprintf(page, "%llu\n", recovery_start); -} - -static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) -{ - unsigned long long recovery_start; - - if (cmd_match(buf, "none")) - recovery_start = MaxSector; - else if (strict_strtoull(buf, 10, &recovery_start)) - return -EINVAL; - - if (rdev->mddev->pers && - rdev->raid_disk >= 0) - return -EBUSY; - - rdev->recovery_offset = recovery_start; - if (recovery_start == MaxSector) - set_bit(In_sync, &rdev->flags); - else - clear_bit(In_sync, &rdev->flags); - return len; -} - -static struct rdev_sysfs_entry rdev_recovery_start = -__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); - - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack); -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); - -static ssize_t bb_show(struct md_rdev *rdev, char *page) -{ - return badblocks_show(&rdev->badblocks, page, 0); -} -static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) -{ - int rv = badblocks_store(&rdev->badblocks, page, len, 0); - /* Maybe that ack was all we needed */ - if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) - wake_up(&rdev->blocked_wait); - return rv; -} -static struct rdev_sysfs_entry rdev_bad_blocks = -__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); - - -static ssize_t ubb_show(struct md_rdev *rdev, char *page) -{ - return badblocks_show(&rdev->badblocks, page, 1); -} -static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) -{ - return badblocks_store(&rdev->badblocks, page, len, 1); -} -static struct rdev_sysfs_entry rdev_unack_bad_blocks = -__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); - -static struct attribute *rdev_default_attrs[] = { - 
&rdev_state.attr, - &rdev_errors.attr, - &rdev_slot.attr, - &rdev_offset.attr, - &rdev_size.attr, - &rdev_recovery_start.attr, - &rdev_bad_blocks.attr, - &rdev_unack_bad_blocks.attr, - NULL, -}; -static ssize_t -rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) -{ - struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); - struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); - struct mddev *mddev = rdev->mddev; - ssize_t rv; - - if (!entry->show) - return -EIO; - - rv = mddev ? mddev_lock(mddev) : -EBUSY; - if (!rv) { - if (rdev->mddev == NULL) - rv = -EBUSY; - else - rv = entry->show(rdev, page); - mddev_unlock(mddev); - } - return rv; -} - -static ssize_t -rdev_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); - struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); - ssize_t rv; - struct mddev *mddev = rdev->mddev; - - if (!entry->store) - return -EIO; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - rv = mddev ? mddev_lock(mddev): -EBUSY; - if (!rv) { - if (rdev->mddev == NULL) - rv = -EBUSY; - else - rv = entry->store(rdev, page, length); - mddev_unlock(mddev); - } - return rv; -} - -static void rdev_free(struct kobject *ko) -{ - struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); - kfree(rdev); -} -static const struct sysfs_ops rdev_sysfs_ops = { - .show = rdev_attr_show, - .store = rdev_attr_store, -}; -static struct kobj_type rdev_ktype = { - .release = rdev_free, - .sysfs_ops = &rdev_sysfs_ops, - .default_attrs = rdev_default_attrs, -}; - -int md_rdev_init(struct md_rdev *rdev) -{ - rdev->desc_nr = -1; - rdev->saved_raid_disk = -1; - rdev->raid_disk = -1; - rdev->flags = 0; - rdev->data_offset = 0; - rdev->sb_events = 0; - rdev->last_read_error.tv_sec = 0; - rdev->last_read_error.tv_nsec = 0; - rdev->sb_loaded = 0; - rdev->bb_page = NULL; - atomic_set(&rdev->nr_pending, 0); - atomic_set(&rdev->read_errors, 0); - atomic_set(&rdev->corrected_errors, 0); - - INIT_LIST_HEAD(&rdev->same_set); - init_waitqueue_head(&rdev->blocked_wait); - - /* Add space to store bad block list. - * This reserves the space even on arrays where it cannot - * be used - I wonder if that matters - */ - rdev->badblocks.count = 0; - rdev->badblocks.shift = 0; - rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); - seqlock_init(&rdev->badblocks.lock); - if (rdev->badblocks.page == NULL) - return -ENOMEM; - - return 0; -} -EXPORT_SYMBOL_GPL(md_rdev_init); -/* - * Import a device. If 'super_format' >= 0, then sanity check the superblock - * - * mark the device faulty if: - * - * - the device is nonexistent (zero size) - * - the device has no valid superblock - * - * a faulty rdev _never_ has rdev->sb set. 
- */ -static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) -{ - char b[BDEVNAME_SIZE]; - int err; - struct md_rdev *rdev; - sector_t size; - - rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); - if (!rdev) { - printk(KERN_ERR "md: could not alloc mem for new device!\n"); - return ERR_PTR(-ENOMEM); - } - - err = md_rdev_init(rdev); - if (err) - goto abort_free; - err = alloc_disk_sb(rdev); - if (err) - goto abort_free; - - err = lock_rdev(rdev, newdev, super_format == -2); - if (err) - goto abort_free; - - kobject_init(&rdev->kobj, &rdev_ktype); - - size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; - if (!size) { - printk(KERN_WARNING - "md: %s has zero or unknown size, marking faulty!\n", - bdevname(rdev->bdev,b)); - err = -EINVAL; - goto abort_free; - } - - if (super_format >= 0) { - err = super_types[super_format]. - load_super(rdev, NULL, super_minor); - if (err == -EINVAL) { - printk(KERN_WARNING - "md: %s does not have a valid v%d.%d " - "superblock, not importing!\n", - bdevname(rdev->bdev,b), - super_format, super_minor); - goto abort_free; - } - if (err < 0) { - printk(KERN_WARNING - "md: could not read %s's sb, not importing!\n", - bdevname(rdev->bdev,b)); - goto abort_free; - } - } - if (super_format == -1) - /* hot-add for 0.90, or non-persistent: so no badblocks */ - rdev->badblocks.shift = -1; - - return rdev; - -abort_free: - if (rdev->bdev) - unlock_rdev(rdev); - free_disk_sb(rdev); - kfree(rdev->badblocks.page); - kfree(rdev); - return ERR_PTR(err); -} - -/* - * Check a full RAID array for plausibility - */ - - -static void analyze_sbs(struct mddev * mddev) -{ - int i; - struct md_rdev *rdev, *freshest, *tmp; - char b[BDEVNAME_SIZE]; - - freshest = NULL; - rdev_for_each_safe(rdev, tmp, mddev) - switch (super_types[mddev->major_version]. - load_super(rdev, freshest, mddev->minor_version)) { - case 1: - freshest = rdev; - break; - case 0: - break; - default: - printk( KERN_ERR \ - "md: fatal superblock inconsistency in %s" - " -- removing from array\n", - bdevname(rdev->bdev,b)); - kick_rdev_from_array(rdev); - } - - - super_types[mddev->major_version]. - validate_super(mddev, freshest); - - i = 0; - rdev_for_each_safe(rdev, tmp, mddev) { - if (mddev->max_disks && - (rdev->desc_nr >= mddev->max_disks || - i > mddev->max_disks)) { - printk(KERN_WARNING - "md: %s: %s: only %d devices permitted\n", - mdname(mddev), bdevname(rdev->bdev, b), - mddev->max_disks); - kick_rdev_from_array(rdev); - continue; - } - if (rdev != freshest) - if (super_types[mddev->major_version]. - validate_super(mddev, rdev)) { - printk(KERN_WARNING "md: kicking non-fresh %s" - " from array!\n", - bdevname(rdev->bdev,b)); - kick_rdev_from_array(rdev); - continue; - } - if (mddev->level == LEVEL_MULTIPATH) { - rdev->desc_nr = i++; - rdev->raid_disk = rdev->desc_nr; - set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { - rdev->raid_disk = -1; - clear_bit(In_sync, &rdev->flags); - } - } -} - -/* Read a fixed-point number. - * Numbers in sysfs attributes should be in "standard" units where - * possible, so time should be in seconds. - * However we internally use a a much smaller unit such as - * milliseconds or jiffies. - * This function takes a decimal number with a possible fractional - * component, and produces an integer which is the result of - * multiplying that number by 10^'scale'. - * all without any floating-point arithmetic. 
- */ -int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) -{ - unsigned long result = 0; - long decimals = -1; - while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { - if (*cp == '.') - decimals = 0; - else if (decimals < scale) { - unsigned int value; - value = *cp - '0'; - result = result * 10 + value; - if (decimals >= 0) - decimals++; - } - cp++; - } - if (*cp == '\n') - cp++; - if (*cp) - return -EINVAL; - if (decimals < 0) - decimals = 0; - while (decimals < scale) { - result *= 10; - decimals ++; - } - *res = result; - return 0; -} - - -static void md_safemode_timeout(unsigned long data); - -static ssize_t -safe_delay_show(struct mddev *mddev, char *page) -{ - int msec = (mddev->safemode_delay*1000)/HZ; - return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); -} -static ssize_t -safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) -{ - unsigned long msec; - - if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) - return -EINVAL; - if (msec == 0) - mddev->safemode_delay = 0; - else { - unsigned long old_delay = mddev->safemode_delay; - mddev->safemode_delay = (msec*HZ)/1000; - if (mddev->safemode_delay == 0) - mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay) - md_safemode_timeout((unsigned long)mddev); - } - return len; -} -static struct md_sysfs_entry md_safe_delay = -__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); - -static ssize_t -level_show(struct mddev *mddev, char *page) -{ - struct md_personality *p = mddev->pers; - if (p) - return sprintf(page, "%s\n", p->name); - else if (mddev->clevel[0]) - return sprintf(page, "%s\n", mddev->clevel); - else if (mddev->level != LEVEL_NONE) - return sprintf(page, "%d\n", mddev->level); - else - return 0; -} - -static ssize_t -level_store(struct mddev *mddev, const char *buf, size_t len) -{ - char clevel[16]; - ssize_t rv = len; - struct md_personality *pers; - long level; - void *priv; - struct md_rdev *rdev; - - if (mddev->pers == NULL) { - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') - len--; - mddev->clevel[len] = 0; - mddev->level = LEVEL_NONE; - return rv; - } - - /* request to change the personality. Need to ensure: - * - array is not engaged in resync/recovery/reshape - * - old personality can be suspended - * - new personality will access other array. - */ - - if (mddev->sync_thread || - mddev->reshape_position != MaxSector || - mddev->sysfs_active) - return -EBUSY; - - if (!mddev->pers->quiesce) { - printk(KERN_WARNING "md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); - return -EINVAL; - } - - /* Now find the new personality */ - if (len == 0 || len >= sizeof(clevel)) - return -EINVAL; - strncpy(clevel, buf, len); - if (clevel[len-1] == '\n') - len--; - clevel[len] = 0; - if (strict_strtol(clevel, 10, &level)) - level = LEVEL_NONE; - - if (request_module("md-%s", clevel) != 0) - request_module("md-level-%s", clevel); - spin_lock(&pers_lock); - pers = find_pers(level, clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - printk(KERN_WARNING "md: personality %s not loaded\n", clevel); - return -EINVAL; - } - spin_unlock(&pers_lock); - - if (pers == mddev->pers) { - /* Nothing to do! 
*/ - module_put(pers->owner); - return rv; - } - if (!pers->takeover) { - module_put(pers->owner); - printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", - mdname(mddev), clevel); - return -EINVAL; - } - - rdev_for_each(rdev, mddev) - rdev->new_raid_disk = rdev->raid_disk; - - /* ->takeover must set new_* and/or delta_disks - * if it succeeds, and may set them when it fails. - */ - priv = pers->takeover(mddev); - if (IS_ERR(priv)) { - mddev->new_level = mddev->level; - mddev->new_layout = mddev->layout; - mddev->new_chunk_sectors = mddev->chunk_sectors; - mddev->raid_disks -= mddev->delta_disks; - mddev->delta_disks = 0; - module_put(pers->owner); - printk(KERN_WARNING "md: %s: %s would not accept array\n", - mdname(mddev), clevel); - return PTR_ERR(priv); - } - - /* Looks like we have a winner */ - mddev_suspend(mddev); - mddev->pers->stop(mddev); - - if (mddev->pers->sync_request == NULL && - pers->sync_request != NULL) { - /* need to add the md_redundancy_group */ - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); - } - if (mddev->pers->sync_request != NULL && - pers->sync_request == NULL) { - /* need to remove the md_redundancy_group */ - if (mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - } - - if (mddev->pers->sync_request == NULL && - mddev->external) { - /* We are converting from a no-redundancy array - * to a redundancy array and metadata is managed - * externally so we need to be sure that writes - * won't block due to a need to transition - * clean->dirty - * until external management is started. - */ - mddev->in_sync = 0; - mddev->safemode_delay = 0; - mddev->safemode = 0; - } - - rdev_for_each(rdev, mddev) { - if (rdev->raid_disk < 0) - continue; - if (rdev->new_raid_disk >= mddev->raid_disks) - rdev->new_raid_disk = -1; - if (rdev->new_raid_disk == rdev->raid_disk) - continue; - sysfs_unlink_rdev(mddev, rdev); - } - rdev_for_each(rdev, mddev) { - if (rdev->raid_disk < 0) - continue; - if (rdev->new_raid_disk == rdev->raid_disk) - continue; - rdev->raid_disk = rdev->new_raid_disk; - if (rdev->raid_disk < 0) - clear_bit(In_sync, &rdev->flags); - else { - if (sysfs_link_rdev(mddev, rdev)) - printk(KERN_WARNING "md: cannot register rd%d" - " for %s after level change\n", - rdev->raid_disk, mdname(mddev)); - } - } - - module_put(mddev->pers->owner); - mddev->pers = pers; - mddev->private = priv; - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - mddev->level = mddev->new_level; - mddev->layout = mddev->new_layout; - mddev->chunk_sectors = mddev->new_chunk_sectors; - mddev->delta_disks = 0; - mddev->degraded = 0; - if (mddev->pers->sync_request == NULL) { - /* this is now an array without redundancy, so - * it must always be in_sync - */ - mddev->in_sync = 1; - del_timer_sync(&mddev->safemode_timer); - } - pers->run(mddev); - mddev_resume(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - sysfs_notify(&mddev->kobj, NULL, "level"); - md_new_event(mddev); - return rv; -} - -static struct md_sysfs_entry md_level = -__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); - - -static ssize_t -layout_show(struct mddev *mddev, char *page) -{ - /* just a number, not meaningful for all levels */ - if (mddev->reshape_position != MaxSector && - mddev->layout != 
mddev->new_layout) - return sprintf(page, "%d (%d)\n", - mddev->new_layout, mddev->layout); - return sprintf(page, "%d\n", mddev->layout); -} - -static ssize_t -layout_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - - if (!*buf || (*e && *e != '\n')) - return -EINVAL; - - if (mddev->pers) { - int err; - if (mddev->pers->check_reshape == NULL) - return -EBUSY; - mddev->new_layout = n; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_layout = mddev->layout; - return err; - } - } else { - mddev->new_layout = n; - if (mddev->reshape_position == MaxSector) - mddev->layout = n; - } - return len; -} -static struct md_sysfs_entry md_layout = -__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); - - -static ssize_t -raid_disks_show(struct mddev *mddev, char *page) -{ - if (mddev->raid_disks == 0) - return 0; - if (mddev->reshape_position != MaxSector && - mddev->delta_disks != 0) - return sprintf(page, "%d (%d)\n", mddev->raid_disks, - mddev->raid_disks - mddev->delta_disks); - return sprintf(page, "%d\n", mddev->raid_disks); -} - -static int update_raid_disks(struct mddev *mddev, int raid_disks); - -static ssize_t -raid_disks_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *e; - int rv = 0; - unsigned long n = simple_strtoul(buf, &e, 10); - - if (!*buf || (*e && *e != '\n')) - return -EINVAL; - - if (mddev->pers) - rv = update_raid_disks(mddev, n); - else if (mddev->reshape_position != MaxSector) { - int olddisks = mddev->raid_disks - mddev->delta_disks; - mddev->delta_disks = n - olddisks; - mddev->raid_disks = n; - } else - mddev->raid_disks = n; - return rv ? rv : len; -} -static struct md_sysfs_entry md_raid_disks = -__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); - -static ssize_t -chunk_size_show(struct mddev *mddev, char *page) -{ - if (mddev->reshape_position != MaxSector && - mddev->chunk_sectors != mddev->new_chunk_sectors) - return sprintf(page, "%d (%d)\n", - mddev->new_chunk_sectors << 9, - mddev->chunk_sectors << 9); - return sprintf(page, "%d\n", mddev->chunk_sectors << 9); -} - -static ssize_t -chunk_size_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - - if (!*buf || (*e && *e != '\n')) - return -EINVAL; - - if (mddev->pers) { - int err; - if (mddev->pers->check_reshape == NULL) - return -EBUSY; - mddev->new_chunk_sectors = n >> 9; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_chunk_sectors = mddev->chunk_sectors; - return err; - } - } else { - mddev->new_chunk_sectors = n >> 9; - if (mddev->reshape_position == MaxSector) - mddev->chunk_sectors = n >> 9; - } - return len; -} -static struct md_sysfs_entry md_chunk_size = -__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); - -static ssize_t -resync_start_show(struct mddev *mddev, char *page) -{ - if (mddev->recovery_cp == MaxSector) - return sprintf(page, "none\n"); - return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); -} - -static ssize_t -resync_start_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *e; - unsigned long long n = simple_strtoull(buf, &e, 10); - - if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - return -EBUSY; - if (cmd_match(buf, "none")) - n = MaxSector; - else if (!*buf || (*e && *e != '\n')) - return -EINVAL; - - mddev->recovery_cp = n; - return len; -} -static struct md_sysfs_entry md_resync_start = 
-__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); - -/* - * The array state can be: - * - * clear - * No devices, no size, no level - * Equivalent to STOP_ARRAY ioctl - * inactive - * May have some settings, but array is not active - * all IO results in error - * When written, doesn't tear down array, but just stops it - * suspended (not supported yet) - * All IO requests will block. The array can be reconfigured. - * Writing this, if accepted, will block until array is quiescent - * readonly - * no resync can happen. no superblocks get written. - * write requests fail - * read-auto - * like readonly, but behaves like 'clean' on a write request. - * - * clean - no pending writes, but otherwise active. - * When written to inactive array, starts without resync - * If a write request arrives then - * if metadata is known, mark 'dirty' and switch to 'active'. - * if not known, block and switch to write-pending - * If written to an active array that has pending writes, then fails. - * active - * fully active: IO and resync can be happening. - * When written to inactive array, starts with resync - * - * write-pending - * clean, but writes are blocked waiting for 'active' to be written. - * - * active-idle - * like active, but no writes have been seen for a while (100msec). - * - */ -enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, - write_pending, active_idle, bad_word}; -static char *array_states[] = { - "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", - "write-pending", "active-idle", NULL }; - -static int match_word(const char *word, char **list) -{ - int n; - for (n=0; list[n]; n++) - if (cmd_match(word, list[n])) - break; - return n; -} - -static ssize_t -array_state_show(struct mddev *mddev, char *page) -{ - enum array_state st = inactive; - - if (mddev->pers) - switch(mddev->ro) { - case 1: - st = readonly; - break; - case 2: - st = read_auto; - break; - case 0: - if (mddev->in_sync) - st = clean; - else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) - st = write_pending; - else if (mddev->safemode) - st = active_idle; - else - st = active; - } - else { - if (list_empty(&mddev->disks) && - mddev->raid_disks == 0 && - mddev->dev_sectors == 0) - st = clear; - else - st = inactive; - } - return sprintf(page, "%s\n", array_states[st]); -} - -static int do_md_stop(struct mddev * mddev, int ro, int is_open); -static int md_set_readonly(struct mddev * mddev, int is_open); -static int do_md_run(struct mddev * mddev); -static int restart_array(struct mddev *mddev); - -static ssize_t -array_state_store(struct mddev *mddev, const char *buf, size_t len) -{ - int err = -EINVAL; - enum array_state st = match_word(buf, array_states); - switch(st) { - case bad_word: - break; - case clear: - /* stopping an active array */ - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; - err = do_md_stop(mddev, 0, 0); - break; - case inactive: - /* stopping an active array */ - if (mddev->pers) { - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; - err = do_md_stop(mddev, 2, 0); - } else - err = 0; /* already inactive */ - break; - case suspended: - break; /* not supported yet */ - case readonly: - if (mddev->pers) - err = md_set_readonly(mddev, 0); - else { - mddev->ro = 1; - set_disk_ro(mddev->gendisk, 1); - err = do_md_run(mddev); - } - break; - case read_auto: - if (mddev->pers) { - if (mddev->ro == 0) - err = md_set_readonly(mddev, 0); - else if (mddev->ro == 1) - err = restart_array(mddev); - if (err == 0) { - 
mddev->ro = 2; - set_disk_ro(mddev->gendisk, 0); - } - } else { - mddev->ro = 2; - err = do_md_run(mddev); - } - break; - case clean: - if (mddev->pers) { - restart_array(mddev); - spin_lock_irq(&mddev->write_lock); - if (atomic_read(&mddev->writes_pending) == 0) { - if (mddev->in_sync == 0) { - mddev->in_sync = 1; - if (mddev->safemode == 1) - mddev->safemode = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - } - err = 0; - } else - err = -EBUSY; - spin_unlock_irq(&mddev->write_lock); - } else - err = -EINVAL; - break; - case active: - if (mddev->pers) { - restart_array(mddev); - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - wake_up(&mddev->sb_wait); - err = 0; - } else { - mddev->ro = 0; - set_disk_ro(mddev->gendisk, 0); - err = do_md_run(mddev); - } - break; - case write_pending: - case active_idle: - /* these cannot be set */ - break; - } - if (err) - return err; - else { - if (mddev->hold_active == UNTIL_IOCTL) - mddev->hold_active = 0; - sysfs_notify_dirent_safe(mddev->sysfs_state); - return len; - } -} -static struct md_sysfs_entry md_array_state = -__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); - -static ssize_t -max_corrected_read_errors_show(struct mddev *mddev, char *page) { - return sprintf(page, "%d\n", - atomic_read(&mddev->max_corr_read_errors)); -} - -static ssize_t -max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - - if (*buf && (*e == 0 || *e == '\n')) { - atomic_set(&mddev->max_corr_read_errors, n); - return len; - } - return -EINVAL; -} - -static struct md_sysfs_entry max_corr_read_errors = -__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, - max_corrected_read_errors_store); - -static ssize_t -null_show(struct mddev *mddev, char *page) -{ - return -EINVAL; -} - -static ssize_t -new_dev_store(struct mddev *mddev, const char *buf, size_t len) -{ - /* buf must be %d:%d\n? giving major and minor numbers */ - /* The new device is added to the array. - * If the array has a persistent superblock, we read the - * superblock to initialise info and check validity. - * Otherwise, only checking done is that in bind_rdev_to_array, - * which mainly checks size. - */ - char *e; - int major = simple_strtoul(buf, &e, 10); - int minor; - dev_t dev; - struct md_rdev *rdev; - int err; - - if (!*buf || *e != ':' || !e[1] || e[1] == '\n') - return -EINVAL; - minor = simple_strtoul(e+1, &e, 10); - if (*e && *e != '\n') - return -EINVAL; - dev = MKDEV(major, minor); - if (major != MAJOR(dev) || - minor != MINOR(dev)) - return -EOVERFLOW; - - - if (mddev->persistent) { - rdev = md_import_device(dev, mddev->major_version, - mddev->minor_version); - if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { - struct md_rdev *rdev0 - = list_entry(mddev->disks.next, - struct md_rdev, same_set); - err = super_types[mddev->major_version] - .load_super(rdev, rdev0, mddev->minor_version); - if (err < 0) - goto out; - } - } else if (mddev->external) - rdev = md_import_device(dev, -2, -1); - else - rdev = md_import_device(dev, -1, -1); - - if (IS_ERR(rdev)) - return PTR_ERR(rdev); - err = bind_rdev_to_array(rdev, mddev); - out: - if (err) - export_rdev(rdev); - return err ? 
err : len; -} - -static struct md_sysfs_entry md_new_device = -__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); - -static ssize_t -bitmap_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *end; - unsigned long chunk, end_chunk; - - if (!mddev->bitmap) - goto out; - /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ - while (*buf) { - chunk = end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; - if (*end == '-') { /* range */ - buf = end + 1; - end_chunk = simple_strtoul(buf, &end, 0); - if (buf == end) break; - } - if (*end && !isspace(*end)) break; - bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); - buf = skip_spaces(end); - } - bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ -out: - return len; -} - -static struct md_sysfs_entry md_bitmap = -__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); - -static ssize_t -size_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%llu\n", - (unsigned long long)mddev->dev_sectors / 2); -} - -static int update_size(struct mddev *mddev, sector_t num_sectors); - -static ssize_t -size_store(struct mddev *mddev, const char *buf, size_t len) -{ - /* If array is inactive, we can reduce the component size, but - * not increase it (except from 0). - * If array is active, we can try an on-line resize - */ - sector_t sectors; - int err = strict_blocks_to_sectors(buf, &sectors); - - if (err < 0) - return err; - if (mddev->pers) { - err = update_size(mddev, sectors); - md_update_sb(mddev, 1); - } else { - if (mddev->dev_sectors == 0 || - mddev->dev_sectors > sectors) - mddev->dev_sectors = sectors; - else - err = -ENOSPC; - } - return err ? err : len; -} - -static struct md_sysfs_entry md_size = -__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); - - -/* Metadata version. - * This is one of - * 'none' for arrays with no metadata (good luck...) - * 'external' for arrays with externally managed metadata, - * or N.M for internally known formats - */ -static ssize_t -metadata_show(struct mddev *mddev, char *page) -{ - if (mddev->persistent) - return sprintf(page, "%d.%d\n", - mddev->major_version, mddev->minor_version); - else if (mddev->external) - return sprintf(page, "external:%s\n", mddev->metadata_type); - else - return sprintf(page, "none\n"); -} - -static ssize_t -metadata_store(struct mddev *mddev, const char *buf, size_t len) -{ - int major, minor; - char *e; - /* Changing the details of 'external' metadata is - * always permitted. Otherwise there must be - * no devices attached to the array. 
- */ - if (mddev->external && strncmp(buf, "external:", 9) == 0) - ; - else if (!list_empty(&mddev->disks)) - return -EBUSY; - - if (cmd_match(buf, "none")) { - mddev->persistent = 0; - mddev->external = 0; - mddev->major_version = 0; - mddev->minor_version = 90; - return len; - } - if (strncmp(buf, "external:", 9) == 0) { - size_t namelen = len-9; - if (namelen >= sizeof(mddev->metadata_type)) - namelen = sizeof(mddev->metadata_type)-1; - strncpy(mddev->metadata_type, buf+9, namelen); - mddev->metadata_type[namelen] = 0; - if (namelen && mddev->metadata_type[namelen-1] == '\n') - mddev->metadata_type[--namelen] = 0; - mddev->persistent = 0; - mddev->external = 1; - mddev->major_version = 0; - mddev->minor_version = 90; - return len; - } - major = simple_strtoul(buf, &e, 10); - if (e==buf || *e != '.') - return -EINVAL; - buf = e+1; - minor = simple_strtoul(buf, &e, 10); - if (e==buf || (*e && *e != '\n') ) - return -EINVAL; - if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) - return -ENOENT; - mddev->major_version = major; - mddev->minor_version = minor; - mddev->persistent = 1; - mddev->external = 0; - return len; -} - -static struct md_sysfs_entry md_metadata = -__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); - -static ssize_t -action_show(struct mddev *mddev, char *page) -{ - char *type = "idle"; - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) - type = "check"; - else - type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) - type = "recover"; - } - return sprintf(page, "%s\n", type); -} - -static void reap_sync_thread(struct mddev *mddev); - -static ssize_t -action_store(struct mddev *mddev, const char *page, size_t len) -{ - if (!mddev->pers || !mddev->pers->sync_request) - return -EINVAL; - - if (cmd_match(page, "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - - if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); - } - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) - return -EBUSY; - else if (cmd_match(page, "resync")) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - else if (cmd_match(page, "recover")) { - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - } else if (cmd_match(page, "reshape")) { - int err; - if (mddev->pers->start_reshape == NULL) - return -EINVAL; - err = mddev->pers->start_reshape(mddev); - if (err) - return err; - sysfs_notify(&mddev->kobj, NULL, "degraded"); - } else { - if (cmd_match(page, "check")) - set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!cmd_match(page, "repair")) - return -EINVAL; - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - } - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - sysfs_notify_dirent_safe(mddev->sysfs_action); - return len; -} - -static ssize_t 
-mismatch_cnt_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); -} - -static struct md_sysfs_entry md_scan_mode = -__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); - - -static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); - -static ssize_t -sync_min_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%d (%s)\n", speed_min(mddev), - mddev->sync_speed_min ? "local": "system"); -} - -static ssize_t -sync_min_store(struct mddev *mddev, const char *buf, size_t len) -{ - int min; - char *e; - if (strncmp(buf, "system", 6)==0) { - mddev->sync_speed_min = 0; - return len; - } - min = simple_strtoul(buf, &e, 10); - if (buf == e || (*e && *e != '\n') || min <= 0) - return -EINVAL; - mddev->sync_speed_min = min; - return len; -} - -static struct md_sysfs_entry md_sync_min = -__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); - -static ssize_t -sync_max_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%d (%s)\n", speed_max(mddev), - mddev->sync_speed_max ? "local": "system"); -} - -static ssize_t -sync_max_store(struct mddev *mddev, const char *buf, size_t len) -{ - int max; - char *e; - if (strncmp(buf, "system", 6)==0) { - mddev->sync_speed_max = 0; - return len; - } - max = simple_strtoul(buf, &e, 10); - if (buf == e || (*e && *e != '\n') || max <= 0) - return -EINVAL; - mddev->sync_speed_max = max; - return len; -} - -static struct md_sysfs_entry md_sync_max = -__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); - -static ssize_t -degraded_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%d\n", mddev->degraded); -} -static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); - -static ssize_t -sync_force_parallel_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%d\n", mddev->parallel_resync); -} - -static ssize_t -sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) -{ - long n; - - if (strict_strtol(buf, 10, &n)) - return -EINVAL; - - if (n != 0 && n != 1) - return -EINVAL; - - mddev->parallel_resync = n; - - if (mddev->sync_thread) - wake_up(&resync_wait); - - return len; -} - -/* force parallel resync, even with shared block devices */ -static struct md_sysfs_entry md_sync_force_parallel = -__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, - sync_force_parallel_show, sync_force_parallel_store); - -static ssize_t -sync_speed_show(struct mddev *mddev, char *page) -{ - unsigned long resync, dt, db; - if (mddev->curr_resync == 0) - return sprintf(page, "none\n"); - resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); - dt = (jiffies - mddev->resync_mark) / HZ; - if (!dt) dt++; - db = resync - mddev->resync_mark_cnt; - return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ -} - -static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); - -static ssize_t -sync_completed_show(struct mddev *mddev, char *page) -{ - unsigned long long max_sectors, resync; - - if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return sprintf(page, "none\n"); - - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_sectors = mddev->resync_max_sectors; - else - max_sectors = mddev->dev_sectors; - - resync = mddev->curr_resync_completed; - return sprintf(page, "%llu / %llu\n", resync, max_sectors); -} - -static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); - -static ssize_t -min_sync_show(struct mddev *mddev, char *page) -{ - return 
sprintf(page, "%llu\n", - (unsigned long long)mddev->resync_min); -} -static ssize_t -min_sync_store(struct mddev *mddev, const char *buf, size_t len) -{ - unsigned long long min; - if (strict_strtoull(buf, 10, &min)) - return -EINVAL; - if (min > mddev->resync_max) - return -EINVAL; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - - /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { - sector_t temp = min; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; - } - mddev->resync_min = min; - - return len; -} - -static struct md_sysfs_entry md_min_sync = -__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); - -static ssize_t -max_sync_show(struct mddev *mddev, char *page) -{ - if (mddev->resync_max == MaxSector) - return sprintf(page, "max\n"); - else - return sprintf(page, "%llu\n", - (unsigned long long)mddev->resync_max); -} -static ssize_t -max_sync_store(struct mddev *mddev, const char *buf, size_t len) -{ - if (strncmp(buf, "max", 3) == 0) - mddev->resync_max = MaxSector; - else { - unsigned long long max; - if (strict_strtoull(buf, 10, &max)) - return -EINVAL; - if (max < mddev->resync_min) - return -EINVAL; - if (max < mddev->resync_max && - mddev->ro == 0 && - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - - /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { - sector_t temp = max; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; - } - mddev->resync_max = max; - } - wake_up(&mddev->recovery_wait); - return len; -} - -static struct md_sysfs_entry md_max_sync = -__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); - -static ssize_t -suspend_lo_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); -} - -static ssize_t -suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_lo; - - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; - if (buf == e || (*e && *e != '\n')) - return -EINVAL; - - mddev->suspend_lo = new; - if (new >= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } - return len; -} -static struct md_sysfs_entry md_suspend_lo = -__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); - - -static ssize_t -suspend_hi_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); -} - -static ssize_t -suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_hi; - - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; - if (buf == e || (*e && *e != '\n')) - return -EINVAL; - - mddev->suspend_hi = new; - if (new <= old) - /* Shrinking suspended region */ - mddev->pers->quiesce(mddev, 2); - else { - /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } - return len; -} -static struct md_sysfs_entry md_suspend_hi = -__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); - -static ssize_t -reshape_position_show(struct mddev *mddev, char *page) -{ - if (mddev->reshape_position != MaxSector) - return 
sprintf(page, "%llu\n", - (unsigned long long)mddev->reshape_position); - strcpy(page, "none\n"); - return 5; -} - -static ssize_t -reshape_position_store(struct mddev *mddev, const char *buf, size_t len) -{ - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers) - return -EBUSY; - if (buf == e || (*e && *e != '\n')) - return -EINVAL; - mddev->reshape_position = new; - mddev->delta_disks = 0; - mddev->new_level = mddev->level; - mddev->new_layout = mddev->layout; - mddev->new_chunk_sectors = mddev->chunk_sectors; - return len; -} - -static struct md_sysfs_entry md_reshape_position = -__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, - reshape_position_store); - -static ssize_t -array_size_show(struct mddev *mddev, char *page) -{ - if (mddev->external_size) - return sprintf(page, "%llu\n", - (unsigned long long)mddev->array_sectors/2); - else - return sprintf(page, "default\n"); -} - -static ssize_t -array_size_store(struct mddev *mddev, const char *buf, size_t len) -{ - sector_t sectors; - - if (strncmp(buf, "default", 7) == 0) { - if (mddev->pers) - sectors = mddev->pers->size(mddev, 0, 0); - else - sectors = mddev->array_sectors; - - mddev->external_size = 0; - } else { - if (strict_blocks_to_sectors(buf, &sectors) < 0) - return -EINVAL; - if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -E2BIG; - - mddev->external_size = 1; - } - - mddev->array_sectors = sectors; - if (mddev->pers) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - } - return len; -} - -static struct md_sysfs_entry md_array_size = -__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, - array_size_store); - -static struct attribute *md_default_attrs[] = { - &md_level.attr, - &md_layout.attr, - &md_raid_disks.attr, - &md_chunk_size.attr, - &md_size.attr, - &md_resync_start.attr, - &md_metadata.attr, - &md_new_device.attr, - &md_safe_delay.attr, - &md_array_state.attr, - &md_reshape_position.attr, - &md_array_size.attr, - &max_corr_read_errors.attr, - NULL, -}; - -static struct attribute *md_redundancy_attrs[] = { - &md_scan_mode.attr, - &md_mismatches.attr, - &md_sync_min.attr, - &md_sync_max.attr, - &md_sync_speed.attr, - &md_sync_force_parallel.attr, - &md_sync_completed.attr, - &md_min_sync.attr, - &md_max_sync.attr, - &md_suspend_lo.attr, - &md_suspend_hi.attr, - &md_bitmap.attr, - &md_degraded.attr, - NULL, -}; -static struct attribute_group md_redundancy_group = { - .name = NULL, - .attrs = md_redundancy_attrs, -}; - - -static ssize_t -md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) -{ - struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); - struct mddev *mddev = container_of(kobj, struct mddev, kobj); - ssize_t rv; - - if (!entry->show) - return -EIO; - spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { - spin_unlock(&all_mddevs_lock); - return -EBUSY; - } - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->show(mddev, page); - mddev_unlock(mddev); - } - mddev_put(mddev); - return rv; -} - -static ssize_t -md_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); - struct mddev *mddev = container_of(kobj, struct mddev, kobj); - ssize_t rv; - - if (!entry->store) - return -EIO; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - spin_lock(&all_mddevs_lock); - if 
(list_empty(&mddev->all_mddevs)) { - spin_unlock(&all_mddevs_lock); - return -EBUSY; - } - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->store(mddev, page, length); - mddev_unlock(mddev); - } - mddev_put(mddev); - return rv; -} - -static void md_free(struct kobject *ko) -{ - struct mddev *mddev = container_of(ko, struct mddev, kobj); - - if (mddev->sysfs_state) - sysfs_put(mddev->sysfs_state); - - if (mddev->gendisk) { - del_gendisk(mddev->gendisk); - put_disk(mddev->gendisk); - } - if (mddev->queue) - blk_cleanup_queue(mddev->queue); - - kfree(mddev); -} - -static const struct sysfs_ops md_sysfs_ops = { - .show = md_attr_show, - .store = md_attr_store, -}; -static struct kobj_type md_ktype = { - .release = md_free, - .sysfs_ops = &md_sysfs_ops, - .default_attrs = md_default_attrs, -}; - -int mdp_major = 0; - -static void mddev_delayed_delete(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, del_work); - - sysfs_remove_group(&mddev->kobj, &md_bitmap_group); - kobject_del(&mddev->kobj); - kobject_put(&mddev->kobj); -} - -static int md_alloc(dev_t dev, char *name) -{ - static DEFINE_MUTEX(disks_mutex); - struct mddev *mddev = mddev_find(dev); - struct gendisk *disk; - int partitioned; - int shift; - int unit; - int error; - - if (!mddev) - return -ENODEV; - - partitioned = (MAJOR(mddev->unit) != MD_MAJOR); - shift = partitioned ? MdpMinorShift : 0; - unit = MINOR(mddev->unit) >> shift; - - /* wait for any previous instance of this device to be - * completely removed (mddev_delayed_delete). - */ - flush_workqueue(md_misc_wq); - - mutex_lock(&disks_mutex); - error = -EEXIST; - if (mddev->gendisk) - goto abort; - - if (name) { - /* Need to ensure that 'name' is not a duplicate. - */ - struct mddev *mddev2; - spin_lock(&all_mddevs_lock); - - list_for_each_entry(mddev2, &all_mddevs, all_mddevs) - if (mddev2->gendisk && - strcmp(mddev2->gendisk->disk_name, name) == 0) { - spin_unlock(&all_mddevs_lock); - goto abort; - } - spin_unlock(&all_mddevs_lock); - } - - error = -ENOMEM; - mddev->queue = blk_alloc_queue(GFP_KERNEL); - if (!mddev->queue) - goto abort; - mddev->queue->queuedata = mddev; - - blk_queue_make_request(mddev->queue, md_make_request); - blk_set_stacking_limits(&mddev->queue->limits); - - disk = alloc_disk(1 << shift); - if (!disk) { - blk_cleanup_queue(mddev->queue); - mddev->queue = NULL; - goto abort; - } - disk->major = MAJOR(mddev->unit); - disk->first_minor = unit << shift; - if (name) - strcpy(disk->disk_name, name); - else if (partitioned) - sprintf(disk->disk_name, "md_d%d", unit); - else - sprintf(disk->disk_name, "md%d", unit); - disk->fops = &md_fops; - disk->private_data = mddev; - disk->queue = mddev->queue; - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); - /* Allow extended partitions. This makes the - * 'mdp' device redundant, but we can't really - * remove it now. 
- */ - disk->flags |= GENHD_FL_EXT_DEVT; - mddev->gendisk = disk; - /* As soon as we call add_disk(), another thread could get - * through to md_open, so make sure it doesn't get too far - */ - mutex_lock(&mddev->open_mutex); - add_disk(disk); - - error = kobject_init_and_add(&mddev->kobj, &md_ktype, - &disk_to_dev(disk)->kobj, "%s", "md"); - if (error) { - /* This isn't possible, but as kobject_init_and_add is marked - * __must_check, we must do something with the result - */ - printk(KERN_WARNING "md: cannot register %s/md - name in use\n", - disk->disk_name); - error = 0; - } - if (mddev->kobj.sd && - sysfs_create_group(&mddev->kobj, &md_bitmap_group)) - printk(KERN_DEBUG "pointless warning\n"); - mutex_unlock(&mddev->open_mutex); - abort: - mutex_unlock(&disks_mutex); - if (!error && mddev->kobj.sd) { - kobject_uevent(&mddev->kobj, KOBJ_ADD); - mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); - } - mddev_put(mddev); - return error; -} - -static struct kobject *md_probe(dev_t dev, int *part, void *data) -{ - md_alloc(dev, NULL); - return NULL; -} - -static int add_named_array(const char *val, struct kernel_param *kp) -{ - /* val must be "md_*" where * is not all digits. - * We allocate an array with a large free minor number, and - * set the name to val. val must not already be an active name. - */ - int len = strlen(val); - char buf[DISK_NAME_LEN]; - - while (len && val[len-1] == '\n') - len--; - if (len >= DISK_NAME_LEN) - return -E2BIG; - strlcpy(buf, val, len+1); - if (strncmp(buf, "md_", 3) != 0) - return -EINVAL; - return md_alloc(0, buf); -} - -static void md_safemode_timeout(unsigned long data) -{ - struct mddev *mddev = (struct mddev *) data; - - if (!atomic_read(&mddev->writes_pending)) { - mddev->safemode = 1; - if (mddev->external) - sysfs_notify_dirent_safe(mddev->sysfs_state); - } - md_wakeup_thread(mddev->thread); -} - -static int start_dirty_degraded; - -int md_run(struct mddev *mddev) -{ - int err; - struct md_rdev *rdev; - struct md_personality *pers; - - if (list_empty(&mddev->disks)) - /* cannot run an array with no devices.. */ - return -EINVAL; - - if (mddev->pers) - return -EBUSY; - /* Cannot run until previous stop completes properly */ - if (mddev->sysfs_active) - return -EBUSY; - - /* - * Analyze all RAID superblock(s) - */ - if (!mddev->raid_disks) { - if (!mddev->persistent) - return -EINVAL; - analyze_sbs(mddev); - } - - if (mddev->level != LEVEL_NONE) - request_module("md-level-%d", mddev->level); - else if (mddev->clevel[0]) - request_module("md-%s", mddev->clevel); - - /* - * Drop all container device buffers, from now on - * the only valid external interface is through the md - * device. - */ - rdev_for_each(rdev, mddev) { - if (test_bit(Faulty, &rdev->flags)) - continue; - sync_blockdev(rdev->bdev); - invalidate_bdev(rdev->bdev); - - /* perform some consistency tests on the device. - * We don't want the data to overlap the metadata, - * Internal Bitmap issues have been handled elsewhere. 
- */ - if (rdev->meta_bdev) { - /* Nothing to check */; - } else if (rdev->data_offset < rdev->sb_start) { - if (mddev->dev_sectors && - rdev->data_offset + mddev->dev_sectors - > rdev->sb_start) { - printk("md: %s: data overlaps metadata\n", - mdname(mddev)); - return -EINVAL; - } - } else { - if (rdev->sb_start + rdev->sb_size/512 - > rdev->data_offset) { - printk("md: %s: metadata overlaps data\n", - mdname(mddev)); - return -EINVAL; - } - } - sysfs_notify_dirent_safe(rdev->sysfs_state); - } - - if (mddev->bio_set == NULL) - mddev->bio_set = bioset_create(BIO_POOL_SIZE, - sizeof(struct mddev *)); - - spin_lock(&pers_lock); - pers = find_pers(mddev->level, mddev->clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - if (mddev->level != LEVEL_NONE) - printk(KERN_WARNING "md: personality for level %d is not loaded!\n", - mddev->level); - else - printk(KERN_WARNING "md: personality for level %s is not loaded!\n", - mddev->clevel); - return -EINVAL; - } - mddev->pers = pers; - spin_unlock(&pers_lock); - if (mddev->level != pers->level) { - mddev->level = pers->level; - mddev->new_level = pers->level; - } - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - - if (mddev->reshape_position != MaxSector && - pers->start_reshape == NULL) { - /* This personality cannot handle reshaping... */ - mddev->pers = NULL; - module_put(pers->owner); - return -EINVAL; - } - - if (pers->sync_request) { - /* Warn if this is a potentially silly - * configuration. - */ - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - struct md_rdev *rdev2; - int warned = 0; - - rdev_for_each(rdev, mddev) - rdev_for_each(rdev2, mddev) { - if (rdev < rdev2 && - rdev->bdev->bd_contains == - rdev2->bdev->bd_contains) { - printk(KERN_WARNING - "%s: WARNING: %s appears to be" - " on the same physical disk as" - " %s.\n", - mdname(mddev), - bdevname(rdev->bdev,b), - bdevname(rdev2->bdev,b2)); - warned = 1; - } - } - - if (warned) - printk(KERN_WARNING - "True protection against single-disk" - " failure might be compromised.\n"); - } - - mddev->recovery = 0; - /* may be over-ridden by personality */ - mddev->resync_max_sectors = mddev->dev_sectors; - - mddev->ok_start_degraded = start_dirty_degraded; - - if (start_readonly && mddev->ro == 0) - mddev->ro = 2; /* read-only, but switch on first write */ - - err = mddev->pers->run(mddev); - if (err) - printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { - WARN_ONCE(!mddev->external_size, "%s: default size too small," - " but 'external_size' not in effect?\n", __func__); - printk(KERN_ERR - "md: invalid array_size %llu > default size %llu\n", - (unsigned long long)mddev->array_sectors / 2, - (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); - err = -EINVAL; - mddev->pers->stop(mddev); - } - if (err == 0 && mddev->pers->sync_request) { - err = bitmap_create(mddev); - if (err) { - printk(KERN_ERR "%s: failed to create bitmap (%d)\n", - mdname(mddev), err); - mddev->pers->stop(mddev); - } - } - if (err) { - module_put(mddev->pers->owner); - mddev->pers = NULL; - bitmap_destroy(mddev); - return err; - } - if (mddev->pers->sync_request) { - if (mddev->kobj.sd && - sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); - } else if (mddev->ro == 2) /* auto-readonly not meaningful */ - mddev->ro = 0; - - 
atomic_set(&mddev->writes_pending,0); - atomic_set(&mddev->max_corr_read_errors, - MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); - mddev->safemode = 0; - mddev->safemode_timer.function = md_safemode_timeout; - mddev->safemode_timer.data = (unsigned long) mddev; - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ - mddev->in_sync = 1; - smp_wmb(); - mddev->ready = 1; - rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0) - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; - - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - - if (mddev->flags) - md_update_sb(mddev, 0); - - md_new_event(mddev); - sysfs_notify_dirent_safe(mddev->sysfs_state); - sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); - return 0; -} -EXPORT_SYMBOL_GPL(md_run); - -static int do_md_run(struct mddev *mddev) -{ - int err; - - err = md_run(mddev); - if (err) - goto out; - err = bitmap_load(mddev); - if (err) { - bitmap_destroy(mddev); - goto out; - } - - md_wakeup_thread(mddev->thread); - md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - mddev->changed = 1; - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); -out: - return err; -} - -static int restart_array(struct mddev *mddev) -{ - struct gendisk *disk = mddev->gendisk; - - /* Complain if it has no devices */ - if (list_empty(&mddev->disks)) - return -ENXIO; - if (!mddev->pers) - return -EINVAL; - if (!mddev->ro) - return -EBUSY; - mddev->safemode = 0; - mddev->ro = 0; - set_disk_ro(disk, 0); - printk(KERN_INFO "md: %s switched to read-write mode.\n", - mdname(mddev)); - /* Kick recovery or resync if necessary */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent_safe(mddev->sysfs_state); - return 0; -} - -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - - return 0; -} - -void restore_bitmap_write_access(struct file *file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - atomic_set(&inode->i_writecount, 1); - spin_unlock(&inode->i_lock); -} - -static void md_clean(struct mddev *mddev) -{ - mddev->array_sectors = 0; - mddev->external_size = 0; - mddev->dev_sectors = 0; - mddev->raid_disks = 0; - mddev->recovery_cp = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - mddev->reshape_position = MaxSector; - mddev->external = 0; - mddev->persistent = 0; - mddev->level = LEVEL_NONE; - mddev->clevel[0] = 0; - mddev->flags = 0; - mddev->ro = 0; - mddev->metadata_type[0] = 0; - mddev->chunk_sectors = 0; - mddev->ctime = mddev->utime = 0; - mddev->layout = 0; - mddev->max_disks = 0; - mddev->events = 0; - mddev->can_decrease_events = 0; - mddev->delta_disks = 0; - mddev->new_level = LEVEL_NONE; - mddev->new_layout = 0; - mddev->new_chunk_sectors = 0; - mddev->curr_resync = 0; - mddev->resync_mismatches = 0; - mddev->suspend_lo = mddev->suspend_hi = 0; - mddev->sync_speed_min = mddev->sync_speed_max = 0; - mddev->recovery = 0; - mddev->in_sync = 0; - mddev->changed = 0; - mddev->degraded = 
0; - mddev->safemode = 0; - mddev->merge_check_needed = 0; - mddev->bitmap_info.offset = 0; - mddev->bitmap_info.default_offset = 0; - mddev->bitmap_info.chunksize = 0; - mddev->bitmap_info.daemon_sleep = 0; - mddev->bitmap_info.max_write_behind = 0; -} - -static void __md_stop_writes(struct mddev *mddev) -{ - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); - } - - del_timer_sync(&mddev->safemode_timer); - - bitmap_flush(mddev); - md_super_wait(mddev); - - if (!mddev->in_sync || mddev->flags) { - /* mark array as shutdown cleanly */ - mddev->in_sync = 1; - md_update_sb(mddev, 1); - } -} - -void md_stop_writes(struct mddev *mddev) -{ - mddev_lock(mddev); - __md_stop_writes(mddev); - mddev_unlock(mddev); -} -EXPORT_SYMBOL_GPL(md_stop_writes); - -void md_stop(struct mddev *mddev) -{ - mddev->ready = 0; - mddev->pers->stop(mddev); - if (mddev->pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - module_put(mddev->pers->owner); - mddev->pers = NULL; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); -} -EXPORT_SYMBOL_GPL(md_stop); - -static int md_set_readonly(struct mddev *mddev, int is_open) -{ - int err = 0; - mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open) { - printk("md: %s still in use.\n",mdname(mddev)); - err = -EBUSY; - goto out; - } - if (mddev->pers) { - __md_stop_writes(mddev); - - err = -ENXIO; - if (mddev->ro==1) - goto out; - mddev->ro = 1; - set_disk_ro(mddev->gendisk, 1); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_state); - err = 0; - } -out: - mutex_unlock(&mddev->open_mutex); - return err; -} - -/* mode: - * 0 - completely stop and dis-assemble array - * 2 - stop but do not disassemble array - */ -static int do_md_stop(struct mddev * mddev, int mode, int is_open) -{ - struct gendisk *disk = mddev->gendisk; - struct md_rdev *rdev; - - mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open || - mddev->sysfs_active) { - printk("md: %s still in use.\n",mdname(mddev)); - mutex_unlock(&mddev->open_mutex); - return -EBUSY; - } - - if (mddev->pers) { - if (mddev->ro) - set_disk_ro(disk, 0); - - __md_stop_writes(mddev); - md_stop(mddev); - mddev->queue->merge_bvec_fn = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; - - /* tell userspace to handle 'inactive' */ - sysfs_notify_dirent_safe(mddev->sysfs_state); - - rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0) - sysfs_unlink_rdev(mddev, rdev); - - set_capacity(disk, 0); - mutex_unlock(&mddev->open_mutex); - mddev->changed = 1; - revalidate_disk(disk); - - if (mddev->ro) - mddev->ro = 0; - } else - mutex_unlock(&mddev->open_mutex); - /* - * Free resources if final stop - */ - if (mode == 0) { - printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); - - bitmap_destroy(mddev); - if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; - } - mddev->bitmap_info.offset = 0; - - export_array(mddev); - - md_clean(mddev); - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); - if (mddev->hold_active == UNTIL_STOP) - mddev->hold_active = 0; - } - blk_integrity_unregister(disk); - md_new_event(mddev); - sysfs_notify_dirent_safe(mddev->sysfs_state); - return 0; -} - -#ifndef MODULE -static void autorun_array(struct mddev *mddev) -{ - struct md_rdev *rdev; - int err; - - if 
(list_empty(&mddev->disks)) - return; - - printk(KERN_INFO "md: running: "); - - rdev_for_each(rdev, mddev) { - char b[BDEVNAME_SIZE]; - printk("<%s>", bdevname(rdev->bdev,b)); - } - printk("\n"); - - err = do_md_run(mddev); - if (err) { - printk(KERN_WARNING "md: do_md_run() returned %d\n", err); - do_md_stop(mddev, 0, 0); - } -} - -/* - * lets try to run arrays based on all disks that have arrived - * until now. (those are in pending_raid_disks) - * - * the method: pick the first pending disk, collect all disks with - * the same UUID, remove all from the pending list and put them into - * the 'same_array' list. Then order this list based on superblock - * update time (freshest comes first), kick out 'old' disks and - * compare superblocks. If everything's fine then run it. - * - * If "unit" is allocated, then bump its reference count - */ -static void autorun_devices(int part) -{ - struct md_rdev *rdev0, *rdev, *tmp; - struct mddev *mddev; - char b[BDEVNAME_SIZE]; - - printk(KERN_INFO "md: autorun ...\n"); - while (!list_empty(&pending_raid_disks)) { - int unit; - dev_t dev; - LIST_HEAD(candidates); - rdev0 = list_entry(pending_raid_disks.next, - struct md_rdev, same_set); - - printk(KERN_INFO "md: considering %s ...\n", - bdevname(rdev0->bdev,b)); - INIT_LIST_HEAD(&candidates); - rdev_for_each_list(rdev, tmp, &pending_raid_disks) - if (super_90_load(rdev, rdev0, 0) >= 0) { - printk(KERN_INFO "md: adding %s ...\n", - bdevname(rdev->bdev,b)); - list_move(&rdev->same_set, &candidates); - } - /* - * now we have a set of devices, with all of them having - * mostly sane superblocks. It's time to allocate the - * mddev. - */ - if (part) { - dev = MKDEV(mdp_major, - rdev0->preferred_minor << MdpMinorShift); - unit = MINOR(dev) >> MdpMinorShift; - } else { - dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); - unit = MINOR(dev); - } - if (rdev0->preferred_minor != unit) { - printk(KERN_INFO "md: unit number in %s is bad: %d\n", - bdevname(rdev0->bdev, b), rdev0->preferred_minor); - break; - } - - md_probe(dev, NULL, NULL); - mddev = mddev_find(dev); - if (!mddev || !mddev->gendisk) { - if (mddev) - mddev_put(mddev); - printk(KERN_ERR - "md: cannot allocate memory for md drive.\n"); - break; - } - if (mddev_lock(mddev)) - printk(KERN_WARNING "md: %s locked, cannot run\n", - mdname(mddev)); - else if (mddev->raid_disks || mddev->major_version - || !list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: %s already running, cannot run %s\n", - mdname(mddev), bdevname(rdev0->bdev,b)); - mddev_unlock(mddev); - } else { - printk(KERN_INFO "md: created %s\n", mdname(mddev)); - mddev->persistent = 1; - rdev_for_each_list(rdev, tmp, &candidates) { - list_del_init(&rdev->same_set); - if (bind_rdev_to_array(rdev, mddev)) - export_rdev(rdev); - } - autorun_array(mddev); - mddev_unlock(mddev); - } - /* on success, candidates will be empty, on error - * it won't... - */ - rdev_for_each_list(rdev, tmp, &candidates) { - list_del_init(&rdev->same_set); - export_rdev(rdev); - } - mddev_put(mddev); - } - printk(KERN_INFO "md: ... 
autorun DONE.\n"); -} -#endif /* !MODULE */ - -static int get_version(void __user * arg) -{ - mdu_version_t ver; - - ver.major = MD_MAJOR_VERSION; - ver.minor = MD_MINOR_VERSION; - ver.patchlevel = MD_PATCHLEVEL_VERSION; - - if (copy_to_user(arg, &ver, sizeof(ver))) - return -EFAULT; - - return 0; -} - -static int get_array_info(struct mddev * mddev, void __user * arg) -{ - mdu_array_info_t info; - int nr,working,insync,failed,spare; - struct md_rdev *rdev; - - nr=working=insync=failed=spare=0; - rdev_for_each(rdev, mddev) { - nr++; - if (test_bit(Faulty, &rdev->flags)) - failed++; - else { - working++; - if (test_bit(In_sync, &rdev->flags)) - insync++; - else - spare++; - } - } - - info.major_version = mddev->major_version; - info.minor_version = mddev->minor_version; - info.patch_version = MD_PATCHLEVEL_VERSION; - info.ctime = mddev->ctime; - info.level = mddev->level; - info.size = mddev->dev_sectors / 2; - if (info.size != mddev->dev_sectors / 2) /* overflow */ - info.size = -1; - info.nr_disks = nr; - info.raid_disks = mddev->raid_disks; - info.md_minor = mddev->md_minor; - info.not_persistent= !mddev->persistent; - - info.utime = mddev->utime; - info.state = 0; - if (mddev->in_sync) - info.state = (1<<MD_SB_CLEAN); - if (mddev->bitmap && mddev->bitmap_info.offset) - info.state = (1<<MD_SB_BITMAP_PRESENT); - info.active_disks = insync; - info.working_disks = working; - info.failed_disks = failed; - info.spare_disks = spare; - - info.layout = mddev->layout; - info.chunk_size = mddev->chunk_sectors << 9; - - if (copy_to_user(arg, &info, sizeof(info))) - return -EFAULT; - - return 0; -} - -static int get_bitmap_file(struct mddev * mddev, void __user * arg) -{ - mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ - char *ptr, *buf = NULL; - int err = -ENOMEM; - - if (md_allow_write(mddev)) - file = kmalloc(sizeof(*file), GFP_NOIO); - else - file = kmalloc(sizeof(*file), GFP_KERNEL); - - if (!file) - goto out; - - /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->file) { - file->pathname[0] = '\0'; - goto copy_out; - } - - buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); - if (!buf) - goto out; - - ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); - if (IS_ERR(ptr)) - goto out; - - strcpy(file->pathname, ptr); - -copy_out: - err = 0; - if (copy_to_user(arg, file, sizeof(*file))) - err = -EFAULT; -out: - kfree(buf); - kfree(file); - return err; -} - -static int get_disk_info(struct mddev * mddev, void __user * arg) -{ - mdu_disk_info_t info; - struct md_rdev *rdev; - - if (copy_from_user(&info, arg, sizeof(info))) - return -EFAULT; - - rdev = find_rdev_nr(mddev, info.number); - if (rdev) { - info.major = MAJOR(rdev->bdev->bd_dev); - info.minor = MINOR(rdev->bdev->bd_dev); - info.raid_disk = rdev->raid_disk; - info.state = 0; - if (test_bit(Faulty, &rdev->flags)) - info.state |= (1<<MD_DISK_FAULTY); - else if (test_bit(In_sync, &rdev->flags)) { - info.state |= (1<<MD_DISK_ACTIVE); - info.state |= (1<<MD_DISK_SYNC); - } - if (test_bit(WriteMostly, &rdev->flags)) - info.state |= (1<<MD_DISK_WRITEMOSTLY); - } else { - info.major = info.minor = 0; - info.raid_disk = -1; - info.state = (1<<MD_DISK_REMOVED); - } - - if (copy_to_user(arg, &info, sizeof(info))) - return -EFAULT; - - return 0; -} - -static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) -{ - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - struct md_rdev *rdev; - dev_t dev = MKDEV(info->major,info->minor); - - if (info->major != MAJOR(dev) 
|| info->minor != MINOR(dev)) - return -EOVERFLOW; - - if (!mddev->raid_disks) { - int err; - /* expecting a device which has a superblock */ - rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); - if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: md_import_device returned %ld\n", - PTR_ERR(rdev)); - return PTR_ERR(rdev); - } - if (!list_empty(&mddev->disks)) { - struct md_rdev *rdev0 - = list_entry(mddev->disks.next, - struct md_rdev, same_set); - err = super_types[mddev->major_version] - .load_super(rdev, rdev0, mddev->minor_version); - if (err < 0) { - printk(KERN_WARNING - "md: %s has different UUID to %s\n", - bdevname(rdev->bdev,b), - bdevname(rdev0->bdev,b2)); - export_rdev(rdev); - return -EINVAL; - } - } - err = bind_rdev_to_array(rdev, mddev); - if (err) - export_rdev(rdev); - return err; - } - - /* - * add_new_disk can be used once the array is assembled - * to add "hot spares". They must already have a superblock - * written - */ - if (mddev->pers) { - int err; - if (!mddev->pers->hot_add_disk) { - printk(KERN_WARNING - "%s: personality does not support diskops!\n", - mdname(mddev)); - return -EINVAL; - } - if (mddev->persistent) - rdev = md_import_device(dev, mddev->major_version, - mddev->minor_version); - else - rdev = md_import_device(dev, -1, -1); - if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: md_import_device returned %ld\n", - PTR_ERR(rdev)); - return PTR_ERR(rdev); - } - /* set saved_raid_disk if appropriate */ - if (!mddev->persistent) { - if (info->state & (1<<MD_DISK_SYNC) && - info->raid_disk < mddev->raid_disks) { - rdev->raid_disk = info->raid_disk; - set_bit(In_sync, &rdev->flags); - } else - rdev->raid_disk = -1; - } else - super_types[mddev->major_version]. - validate_super(mddev, rdev); - if ((info->state & (1<<MD_DISK_SYNC)) && - (!test_bit(In_sync, &rdev->flags) || - rdev->raid_disk != info->raid_disk)) { - /* This was a hot-add request, but events doesn't - * match, so reject it. - */ - export_rdev(rdev); - return -EINVAL; - } - - if (test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = rdev->raid_disk; - else - rdev->saved_raid_disk = -1; - - clear_bit(In_sync, &rdev->flags); /* just to be sure */ - if (info->state & (1<<MD_DISK_WRITEMOSTLY)) - set_bit(WriteMostly, &rdev->flags); - else - clear_bit(WriteMostly, &rdev->flags); - - rdev->raid_disk = -1; - err = bind_rdev_to_array(rdev, mddev); - if (!err && !mddev->pers->hot_remove_disk) { - /* If there is hot_add_disk but no hot_remove_disk - * then added disks for geometry changes, - * and should be added immediately. - */ - super_types[mddev->major_version]. 
- validate_super(mddev, rdev); - err = mddev->pers->hot_add_disk(mddev, rdev); - if (err) - unbind_rdev_from_array(rdev); - } - if (err) - export_rdev(rdev); - else - sysfs_notify_dirent_safe(rdev->sysfs_state); - - md_update_sb(mddev, 1); - if (mddev->degraded) - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!err) - md_new_event(mddev); - md_wakeup_thread(mddev->thread); - return err; - } - - /* otherwise, add_new_disk is only allowed - * for major_version==0 superblocks - */ - if (mddev->major_version != 0) { - printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", - mdname(mddev)); - return -EINVAL; - } - - if (!(info->state & (1<<MD_DISK_FAULTY))) { - int err; - rdev = md_import_device(dev, -1, 0); - if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: error, md_import_device() returned %ld\n", - PTR_ERR(rdev)); - return PTR_ERR(rdev); - } - rdev->desc_nr = info->number; - if (info->raid_disk < mddev->raid_disks) - rdev->raid_disk = info->raid_disk; - else - rdev->raid_disk = -1; - - if (rdev->raid_disk < mddev->raid_disks) - if (info->state & (1<<MD_DISK_SYNC)) - set_bit(In_sync, &rdev->flags); - - if (info->state & (1<<MD_DISK_WRITEMOSTLY)) - set_bit(WriteMostly, &rdev->flags); - - if (!mddev->persistent) { - printk(KERN_INFO "md: nonpersistent superblock ...\n"); - rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; - } else - rdev->sb_start = calc_dev_sboffset(rdev); - rdev->sectors = rdev->sb_start; - - err = bind_rdev_to_array(rdev, mddev); - if (err) { - export_rdev(rdev); - return err; - } - } - - return 0; -} - -static int hot_remove_disk(struct mddev * mddev, dev_t dev) -{ - char b[BDEVNAME_SIZE]; - struct md_rdev *rdev; - - rdev = find_rdev(mddev, dev); - if (!rdev) - return -ENXIO; - - if (rdev->raid_disk >= 0) - goto busy; - - kick_rdev_from_array(rdev); - md_update_sb(mddev, 1); - md_new_event(mddev); - - return 0; -busy: - printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", - bdevname(rdev->bdev,b), mdname(mddev)); - return -EBUSY; -} - -static int hot_add_disk(struct mddev * mddev, dev_t dev) -{ - char b[BDEVNAME_SIZE]; - int err; - struct md_rdev *rdev; - - if (!mddev->pers) - return -ENODEV; - - if (mddev->major_version != 0) { - printk(KERN_WARNING "%s: HOT_ADD may only be used with" - " version-0 superblocks.\n", - mdname(mddev)); - return -EINVAL; - } - if (!mddev->pers->hot_add_disk) { - printk(KERN_WARNING - "%s: personality does not support diskops!\n", - mdname(mddev)); - return -EINVAL; - } - - rdev = md_import_device(dev, -1, 0); - if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: error, md_import_device() returned %ld\n", - PTR_ERR(rdev)); - return -EINVAL; - } - - if (mddev->persistent) - rdev->sb_start = calc_dev_sboffset(rdev); - else - rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; - - rdev->sectors = rdev->sb_start; - - if (test_bit(Faulty, &rdev->flags)) { - printk(KERN_WARNING - "md: can not hot-add faulty %s disk to %s!\n", - bdevname(rdev->bdev,b), mdname(mddev)); - err = -EINVAL; - goto abort_export; - } - clear_bit(In_sync, &rdev->flags); - rdev->desc_nr = -1; - rdev->saved_raid_disk = -1; - err = bind_rdev_to_array(rdev, mddev); - if (err) - goto abort_export; - - /* - * The rest should better be atomic, we can have disk failures - * noticed in interrupt contexts ... - */ - - rdev->raid_disk = -1; - - md_update_sb(mddev, 1); - - /* - * Kick recovery, maybe this spare has to be added to the - * array immediately. 
- */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - md_new_event(mddev); - return 0; - -abort_export: - export_rdev(rdev); - return err; -} - -static int set_bitmap_file(struct mddev *mddev, int fd) -{ - int err; - - if (mddev->pers) { - if (!mddev->pers->quiesce) - return -EBUSY; - if (mddev->recovery || mddev->sync_thread) - return -EBUSY; - /* we should be able to change the bitmap.. */ - } - - - if (fd >= 0) { - if (mddev->bitmap) - return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_info.file = fget(fd); - - if (mddev->bitmap_info.file == NULL) { - printk(KERN_ERR "%s: error: failed to get bitmap file\n", - mdname(mddev)); - return -EBADF; - } - - err = deny_bitmap_write_access(mddev->bitmap_info.file); - if (err) { - printk(KERN_ERR "%s: error: bitmap file is already in use\n", - mdname(mddev)); - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; - return err; - } - mddev->bitmap_info.offset = 0; /* file overrides offset */ - } else if (mddev->bitmap == NULL) - return -ENOENT; /* cannot remove what isn't there */ - err = 0; - if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); - if (fd >= 0) { - err = bitmap_create(mddev); - if (!err) - err = bitmap_load(mddev); - } - if (fd < 0 || err) { - bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ - } - mddev->pers->quiesce(mddev, 0); - } - if (fd < 0) { - if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); - fput(mddev->bitmap_info.file); - } - mddev->bitmap_info.file = NULL; - } - - return err; -} - -/* - * set_array_info is used two different ways - * The original usage is when creating a new array. - * In this usage, raid_disks is > 0 and it together with - * level, size, not_persistent,layout,chunksize determine the - * shape of the array. - * This will always create an array with a type-0.90.0 superblock. - * The newer usage is when assembling an array. - * In this case raid_disks will be 0, and the major_version field is - * use to determine which style super-blocks are to be found on the devices. - * The minor and patch _version numbers are also kept incase the - * super_block handler wishes to interpret them. - */ -static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) -{ - - if (info->raid_disks == 0) { - /* just setting version number for superblock loading */ - if (info->major_version < 0 || - info->major_version >= ARRAY_SIZE(super_types) || - super_types[info->major_version].name == NULL) { - /* maybe try to auto-load a module? */ - printk(KERN_INFO - "md: superblock version %d not known\n", - info->major_version); - return -EINVAL; - } - mddev->major_version = info->major_version; - mddev->minor_version = info->minor_version; - mddev->patch_version = info->patch_version; - mddev->persistent = !info->not_persistent; - /* ensure mddev_put doesn't delete this now that there - * is some minimal configuration. - */ - mddev->ctime = get_seconds(); - return 0; - } - mddev->major_version = MD_MAJOR_VERSION; - mddev->minor_version = MD_MINOR_VERSION; - mddev->patch_version = MD_PATCHLEVEL_VERSION; - mddev->ctime = get_seconds(); - - mddev->level = info->level; - mddev->clevel[0] = 0; - mddev->dev_sectors = 2 * (sector_t)info->size; - mddev->raid_disks = info->raid_disks; - /* don't set md_minor, it is determined by which /dev/md* was - * openned - */ - if (info->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; - else - mddev->recovery_cp = 0; - mddev->persistent = ! 
info->not_persistent; - mddev->external = 0; - - mddev->layout = info->layout; - mddev->chunk_sectors = info->chunk_size >> 9; - - mddev->max_disks = MD_SB_DISKS; - - if (mddev->persistent) - mddev->flags = 0; - set_bit(MD_CHANGE_DEVS, &mddev->flags); - - mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; - mddev->bitmap_info.offset = 0; - - mddev->reshape_position = MaxSector; - - /* - * Generate a 128 bit UUID - */ - get_random_bytes(mddev->uuid, 16); - - mddev->new_level = mddev->level; - mddev->new_chunk_sectors = mddev->chunk_sectors; - mddev->new_layout = mddev->layout; - mddev->delta_disks = 0; - - return 0; -} - -void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) -{ - WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); - - if (mddev->external_size) - return; - - mddev->array_sectors = array_sectors; -} -EXPORT_SYMBOL(md_set_array_sectors); - -static int update_size(struct mddev *mddev, sector_t num_sectors) -{ - struct md_rdev *rdev; - int rv; - int fit = (num_sectors == 0); - - if (mddev->pers->resize == NULL) - return -EINVAL; - /* The "num_sectors" is the number of sectors of each device that - * is used. This can only make sense for arrays with redundancy. - * linear and raid0 always use whatever space is available. We can only - * consider changing this number if no resync or reconstruction is - * happening, and if the new size is acceptable. It must fit before the - * sb_start or, if that is <data_offset, it must fit before the size - * of each device. If num_sectors is zero, we find the largest size - * that fits. - */ - if (mddev->sync_thread) - return -EBUSY; - if (mddev->bitmap) - /* Sorry, cannot grow a bitmap yet, just remove it, - * grow, and re-add. - */ - return -EBUSY; - rdev_for_each(rdev, mddev) { - sector_t avail = rdev->sectors; - - if (fit && (num_sectors == 0 || num_sectors > avail)) - num_sectors = avail; - if (avail < num_sectors) - return -ENOSPC; - } - rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) - revalidate_disk(mddev->gendisk); - return rv; -} - -static int update_raid_disks(struct mddev *mddev, int raid_disks) -{ - int rv; - /* change the number of raid disks */ - if (mddev->pers->check_reshape == NULL) - return -EINVAL; - if (raid_disks <= 0 || - (mddev->max_disks && raid_disks >= mddev->max_disks)) - return -EINVAL; - if (mddev->sync_thread || mddev->reshape_position != MaxSector) - return -EBUSY; - mddev->delta_disks = raid_disks - mddev->raid_disks; - - rv = mddev->pers->check_reshape(mddev); - if (rv < 0) - mddev->delta_disks = 0; - return rv; -} - - -/* - * update_array_info is used to change the configuration of an - * on-line array. - * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size - * fields in the info are checked against the array. - * Any differences that cannot be handled will cause an error. - * Normally, only one change can be managed at a time. 
- */ -static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) -{ - int rv = 0; - int cnt = 0; - int state = 0; - - /* calculate expected state,ignoring low bits */ - if (mddev->bitmap && mddev->bitmap_info.offset) - state |= (1 << MD_SB_BITMAP_PRESENT); - - if (mddev->major_version != info->major_version || - mddev->minor_version != info->minor_version || -/* mddev->patch_version != info->patch_version || */ - mddev->ctime != info->ctime || - mddev->level != info->level || -/* mddev->layout != info->layout || */ - !mddev->persistent != info->not_persistent|| - mddev->chunk_sectors != info->chunk_size >> 9 || - /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ - ((state^info->state) & 0xfffffe00) - ) - return -EINVAL; - /* Check there is only one change */ - if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) - cnt++; - if (mddev->raid_disks != info->raid_disks) - cnt++; - if (mddev->layout != info->layout) - cnt++; - if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) - cnt++; - if (cnt == 0) - return 0; - if (cnt > 1) - return -EINVAL; - - if (mddev->layout != info->layout) { - /* Change layout - * we don't need to do anything at the md level, the - * personality will take care of it all. - */ - if (mddev->pers->check_reshape == NULL) - return -EINVAL; - else { - mddev->new_layout = info->layout; - rv = mddev->pers->check_reshape(mddev); - if (rv) - mddev->new_layout = mddev->layout; - return rv; - } - } - if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) - rv = update_size(mddev, (sector_t)info->size * 2); - - if (mddev->raid_disks != info->raid_disks) - rv = update_raid_disks(mddev, info->raid_disks); - - if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { - if (mddev->pers->quiesce == NULL) - return -EINVAL; - if (mddev->recovery || mddev->sync_thread) - return -EBUSY; - if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { - /* add the bitmap */ - if (mddev->bitmap) - return -EEXIST; - if (mddev->bitmap_info.default_offset == 0) - return -EINVAL; - mddev->bitmap_info.offset = - mddev->bitmap_info.default_offset; - mddev->pers->quiesce(mddev, 1); - rv = bitmap_create(mddev); - if (!rv) - rv = bitmap_load(mddev); - if (rv) - bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); - } else { - /* remove the bitmap */ - if (!mddev->bitmap) - return -ENOENT; - if (mddev->bitmap->file) - return -EINVAL; - mddev->pers->quiesce(mddev, 1); - bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); - mddev->bitmap_info.offset = 0; - } - } - md_update_sb(mddev, 1); - return rv; -} - -static int set_disk_faulty(struct mddev *mddev, dev_t dev) -{ - struct md_rdev *rdev; - - if (mddev->pers == NULL) - return -ENODEV; - - rdev = find_rdev(mddev, dev); - if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - return -EBUSY; - return 0; -} - -/* - * We have a problem here : there is no easy way to give a CHS - * virtual geometry. We currently pretend that we have a 2 heads - * 4 sectors (with a BIG number of cylinders...). This drives - * dosfs just mad... 
;-) - */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct mddev *mddev = bdev->bd_disk->private_data; - - geo->heads = 2; - geo->sectors = 4; - geo->cylinders = mddev->array_sectors / 8; - return 0; -} - -static int md_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - int err = 0; - void __user *argp = (void __user *)arg; - struct mddev *mddev = NULL; - int ro; - - switch (cmd) { - case RAID_VERSION: - case GET_ARRAY_INFO: - case GET_DISK_INFO: - break; - default: - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - /* - * Commands dealing with the RAID driver but not any - * particular array: - */ - switch (cmd) - { - case RAID_VERSION: - err = get_version(argp); - goto done; - - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done; - -#ifndef MODULE - case RAID_AUTORUN: - err = 0; - autostart_arrays(arg); - goto done; -#endif - default:; - } - - /* - * Commands creating/starting a new array: - */ - - mddev = bdev->bd_disk->private_data; - - if (!mddev) { - BUG(); - goto abort; - } - - err = mddev_lock(mddev); - if (err) { - printk(KERN_INFO - "md: ioctl lock interrupted, reason %d, cmd %d\n", - err, cmd); - goto abort; - } - - switch (cmd) - { - case SET_ARRAY_INFO: - { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto abort_unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't update" - " array info. %d\n", err); - goto abort_unlock; - } - goto done_unlock; - } - if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - err = set_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); - goto abort_unlock; - } - } - goto done_unlock; - - default:; - } - - /* - * Commands querying/configuring an existing array: - */ - /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, - * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ - if ((!mddev->raid_disks && !mddev->external) - && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY - && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE - && cmd != GET_BITMAP_FILE) { - err = -ENODEV; - goto abort_unlock; - } - - /* - * Commands even a read-only array can execute: - */ - switch (cmd) - { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; - - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto done_unlock; - - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); - goto done_unlock; - - case RESTART_ARRAY_RW: - err = restart_array(mddev); - goto done_unlock; - - case STOP_ARRAY: - err = do_md_stop(mddev, 0, 1); - goto done_unlock; - - case STOP_ARRAY_RO: - err = md_set_readonly(mddev, 1); - goto done_unlock; - - case BLKROSET: - if (get_user(ro, (int __user *)(arg))) { - err = -EFAULT; - goto done_unlock; - } - err = -EINVAL; - - /* if the bdev is going readonly the value of mddev->ro - * does not matter, no writes are coming - */ - if (ro) - goto done_unlock; - - /* are we are already prepared for writes? 
*/ - if (mddev->ro != 1) - goto done_unlock; - - /* transitioning to readauto need only happen for - * arrays that call md_write_start - */ - if (mddev->pers) { - err = restart_array(mddev); - if (err == 0) { - mddev->ro = 2; - set_disk_ro(mddev->gendisk, 0); - } - } - goto done_unlock; - } - - /* - * The remaining ioctls are changing the state of the - * superblock, so we do not allow them on read-only arrays. - * However non-MD ioctls (e.g. get-size) will still come through - * here and hit the 'default' below, so only disallow - * 'md' ioctls, and switch to rw mode if started auto-readonly. - */ - if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { - if (mddev->ro == 2) { - mddev->ro = 0; - sysfs_notify_dirent_safe(mddev->sysfs_state); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - } else { - err = -EROFS; - goto abort_unlock; - } - } - - switch (cmd) - { - case ADD_NEW_DISK: - { - mdu_disk_info_t info; - if (copy_from_user(&info, argp, sizeof(info))) - err = -EFAULT; - else - err = add_new_disk(mddev, &info); - goto done_unlock; - } - - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; - - case HOT_ADD_DISK: - err = hot_add_disk(mddev, new_decode_dev(arg)); - goto done_unlock; - - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; - - case RUN_ARRAY: - err = do_md_run(mddev); - goto done_unlock; - - case SET_BITMAP_FILE: - err = set_bitmap_file(mddev, (int)arg); - goto done_unlock; - - default: - err = -EINVAL; - goto abort_unlock; - } - -done_unlock: -abort_unlock: - if (mddev->hold_active == UNTIL_IOCTL && - err != -EINVAL) - mddev->hold_active = 0; - mddev_unlock(mddev); - - return err; -done: - if (err) - MD_BUG(); -abort: - return err; -} -#ifdef CONFIG_COMPAT -static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case HOT_REMOVE_DISK: - case HOT_ADD_DISK: - case SET_DISK_FAULTY: - case SET_BITMAP_FILE: - /* These take in integer arg, do not convert */ - break; - default: - arg = (unsigned long)compat_ptr(arg); - break; - } - - return md_ioctl(bdev, mode, cmd, arg); -} -#endif /* CONFIG_COMPAT */ - -static int md_open(struct block_device *bdev, fmode_t mode) -{ - /* - * Succeed if we can lock the mddev, which confirms that - * it isn't being stopped right now. - */ - struct mddev *mddev = mddev_find(bdev->bd_dev); - int err; - - if (mddev->gendisk != bdev->bd_disk) { - /* we are racing with mddev_put which is discarding this - * bd_disk. 
- */ - mddev_put(mddev); - /* Wait until bdev->bd_disk is definitely gone */ - flush_workqueue(md_misc_wq); - /* Then retry the open from the top */ - return -ERESTARTSYS; - } - BUG_ON(mddev != bdev->bd_disk->private_data); - - if ((err = mutex_lock_interruptible(&mddev->open_mutex))) - goto out; - - err = 0; - atomic_inc(&mddev->openers); - mutex_unlock(&mddev->open_mutex); - - check_disk_change(bdev); - out: - return err; -} - -static int md_release(struct gendisk *disk, fmode_t mode) -{ - struct mddev *mddev = disk->private_data; - - BUG_ON(!mddev); - atomic_dec(&mddev->openers); - mddev_put(mddev); - - return 0; -} - -static int md_media_changed(struct gendisk *disk) -{ - struct mddev *mddev = disk->private_data; - - return mddev->changed; -} - -static int md_revalidate(struct gendisk *disk) -{ - struct mddev *mddev = disk->private_data; - - mddev->changed = 0; - return 0; -} -static const struct block_device_operations md_fops = -{ - .owner = THIS_MODULE, - .open = md_open, - .release = md_release, - .ioctl = md_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = md_compat_ioctl, -#endif - .getgeo = md_getgeo, - .media_changed = md_media_changed, - .revalidate_disk= md_revalidate, -}; - -static int md_thread(void * arg) -{ - struct md_thread *thread = arg; - - /* - * md_thread is a 'system-thread', it's priority should be very - * high. We avoid resource deadlocks individually in each - * raid personality. (RAID5 does preallocation) We also use RR and - * the very same RT priority as kswapd, thus we will never get - * into a priority inversion deadlock. - * - * we definitely have to have equal or higher priority than - * bdflush, otherwise bdflush will deadlock if there are too - * many dirty RAID5 blocks. - */ - - allow_signal(SIGKILL); - while (!kthread_should_stop()) { - - /* We need to wait INTERRUPTIBLE so that - * we don't add to the load-average. 
- * That means we need to be sure no signals are - * pending - */ - if (signal_pending(current)) - flush_signals(current); - - wait_event_interruptible_timeout - (thread->wqueue, - test_bit(THREAD_WAKEUP, &thread->flags) - || kthread_should_stop(), - thread->timeout); - - clear_bit(THREAD_WAKEUP, &thread->flags); - if (!kthread_should_stop()) - thread->run(thread->mddev); - } - - return 0; -} - -void md_wakeup_thread(struct md_thread *thread) -{ - if (thread) { - pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); - } -} - -struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, - const char *name) -{ - struct md_thread *thread; - - thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); - if (!thread) - return NULL; - - init_waitqueue_head(&thread->wqueue); - - thread->run = run; - thread->mddev = mddev; - thread->timeout = MAX_SCHEDULE_TIMEOUT; - thread->tsk = kthread_run(md_thread, thread, - "%s_%s", - mdname(thread->mddev), - name ?: mddev->pers->name); - if (IS_ERR(thread->tsk)) { - kfree(thread); - return NULL; - } - return thread; -} - -void md_unregister_thread(struct md_thread **threadp) -{ - struct md_thread *thread = *threadp; - if (!thread) - return; - pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); - /* Locking ensures that mddev_unlock does not wake_up a - * non-existent thread - */ - spin_lock(&pers_lock); - *threadp = NULL; - spin_unlock(&pers_lock); - - kthread_stop(thread->tsk); - kfree(thread); -} - -void md_error(struct mddev *mddev, struct md_rdev *rdev) -{ - if (!mddev) { - MD_BUG(); - return; - } - - if (!rdev || test_bit(Faulty, &rdev->flags)) - return; - - if (!mddev->pers || !mddev->pers->error_handler) - return; - mddev->pers->error_handler(mddev,rdev); - if (mddev->degraded) - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - if (mddev->event_work.func) - queue_work(md_misc_wq, &mddev->event_work); - md_new_event_inintr(mddev); -} - -/* seq_file implementation /proc/mdstat */ - -static void status_unused(struct seq_file *seq) -{ - int i = 0; - struct md_rdev *rdev; - - seq_printf(seq, "unused devices: "); - - list_for_each_entry(rdev, &pending_raid_disks, same_set) { - char b[BDEVNAME_SIZE]; - i++; - seq_printf(seq, "%s ", - bdevname(rdev->bdev,b)); - } - if (!i) - seq_printf(seq, "<none>"); - - seq_printf(seq, "\n"); -} - - -static void status_resync(struct seq_file *seq, struct mddev * mddev) -{ - sector_t max_sectors, resync, res; - unsigned long dt, db; - sector_t rt; - int scale; - unsigned int per_milli; - - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); - - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_sectors = mddev->resync_max_sectors; - else - max_sectors = mddev->dev_sectors; - - /* - * Should not happen. - */ - if (!max_sectors) { - MD_BUG(); - return; - } - /* Pick 'scale' such that (resync>>scale)*1000 will fit - * in a sector_t, and (max_sectors>>scale) will fit in a - * u32, as those are the requirements for sector_div. 
- * Thus 'scale' must be at least 10 - */ - scale = 10; - if (sizeof(sector_t) > sizeof(unsigned long)) { - while ( max_sectors/2 > (1ULL<<(scale+32))) - scale++; - } - res = (resync>>scale)*1000; - sector_div(res, (u32)((max_sectors>>scale)+1)); - - per_milli = res; - { - int i, x = per_milli/50, y = 20-x; - seq_printf(seq, "["); - for (i = 0; i < x; i++) - seq_printf(seq, "="); - seq_printf(seq, ">"); - for (i = 0; i < y; i++) - seq_printf(seq, "."); - seq_printf(seq, "] "); - } - seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", - (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? - "reshape" : - (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? - "check" : - (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? - "resync" : "recovery"))), - per_milli/10, per_milli % 10, - (unsigned long long) resync/2, - (unsigned long long) max_sectors/2); - - /* - * dt: time from mark until now - * db: blocks written from mark until now - * rt: remaining time - * - * rt is a sector_t, so could be 32bit or 64bit. - * So we divide before multiply in case it is 32bit and close - * to the limit. - * We scale the divisor (db) by 32 to avoid losing precision - * near the end of resync when the number of remaining sectors - * is close to 'db'. - * We then divide rt by 32 after multiplying by db to compensate. - * The '+1' avoids division by zero if db is very small. - */ - dt = ((jiffies - mddev->resync_mark) / HZ); - if (!dt) dt++; - db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) - - mddev->resync_mark_cnt; - - rt = max_sectors - resync; /* number of remaining sectors */ - sector_div(rt, db/32+1); - rt *= dt; - rt >>= 5; - - seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, - ((unsigned long)rt % 60)/6); - - seq_printf(seq, " speed=%ldK/sec", db/2/dt); -} - -static void *md_seq_start(struct seq_file *seq, loff_t *pos) -{ - struct list_head *tmp; - loff_t l = *pos; - struct mddev *mddev; - - if (l >= 0x10000) - return NULL; - if (!l--) - /* header */ - return (void*)1; - - spin_lock(&all_mddevs_lock); - list_for_each(tmp,&all_mddevs) - if (!l--) { - mddev = list_entry(tmp, struct mddev, all_mddevs); - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - return mddev; - } - spin_unlock(&all_mddevs_lock); - if (!l--) - return (void*)2;/* tail */ - return NULL; -} - -static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct list_head *tmp; - struct mddev *next_mddev, *mddev = v; - - ++*pos; - if (v == (void*)2) - return NULL; - - spin_lock(&all_mddevs_lock); - if (v == (void*)1) - tmp = all_mddevs.next; - else - tmp = mddev->all_mddevs.next; - if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); - else { - next_mddev = (void*)2; - *pos = 0x10000; - } - spin_unlock(&all_mddevs_lock); - - if (v != (void*)1) - mddev_put(mddev); - return next_mddev; - -} - -static void md_seq_stop(struct seq_file *seq, void *v) -{ - struct mddev *mddev = v; - - if (mddev && v != (void*)1 && v != (void*)2) - mddev_put(mddev); -} - -static int md_seq_show(struct seq_file *seq, void *v) -{ - struct mddev *mddev = v; - sector_t sectors; - struct md_rdev *rdev; - - if (v == (void*)1) { - struct md_personality *pers; - seq_printf(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - - spin_unlock(&pers_lock); - seq_printf(seq, "\n"); - seq->poll_event = atomic_read(&md_event_count); - return 0; - } - if (v == (void*)2) { - status_unused(seq); - return 0; - } - - if 
(mddev_lock(mddev) < 0) - return -EINTR; - - if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { - seq_printf(seq, "%s : %sactive", mdname(mddev), - mddev->pers ? "" : "in"); - if (mddev->pers) { - if (mddev->ro==1) - seq_printf(seq, " (read-only)"); - if (mddev->ro==2) - seq_printf(seq, " (auto-read-only)"); - seq_printf(seq, " %s", mddev->pers->name); - } - - sectors = 0; - rdev_for_each(rdev, mddev) { - char b[BDEVNAME_SIZE]; - seq_printf(seq, " %s[%d]", - bdevname(rdev->bdev,b), rdev->desc_nr); - if (test_bit(WriteMostly, &rdev->flags)) - seq_printf(seq, "(W)"); - if (test_bit(Faulty, &rdev->flags)) { - seq_printf(seq, "(F)"); - continue; - } - if (rdev->raid_disk < 0) - seq_printf(seq, "(S)"); /* spare */ - if (test_bit(Replacement, &rdev->flags)) - seq_printf(seq, "(R)"); - sectors += rdev->sectors; - } - - if (!list_empty(&mddev->disks)) { - if (mddev->pers) - seq_printf(seq, "\n %llu blocks", - (unsigned long long) - mddev->array_sectors / 2); - else - seq_printf(seq, "\n %llu blocks", - (unsigned long long)sectors / 2); - } - if (mddev->persistent) { - if (mddev->major_version != 0 || - mddev->minor_version != 90) { - seq_printf(seq," super %d.%d", - mddev->major_version, - mddev->minor_version); - } - } else if (mddev->external) - seq_printf(seq, " super external:%s", - mddev->metadata_type); - else - seq_printf(seq, " super non-persistent"); - - if (mddev->pers) { - mddev->pers->status(seq, mddev); - seq_printf(seq, "\n "); - if (mddev->pers->sync_request) { - if (mddev->curr_resync > 2) { - status_resync(seq, mddev); - seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) - seq_printf(seq, "\tresync=DELAYED\n "); - else if (mddev->recovery_cp < MaxSector) - seq_printf(seq, "\tresync=PENDING\n "); - } - } else - seq_printf(seq, "\n "); - - bitmap_status(seq, mddev->bitmap); - - seq_printf(seq, "\n"); - } - mddev_unlock(mddev); - - return 0; -} - -static const struct seq_operations md_seq_ops = { - .start = md_seq_start, - .next = md_seq_next, - .stop = md_seq_stop, - .show = md_seq_show, -}; - -static int md_seq_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - int error; - - error = seq_open(file, &md_seq_ops); - if (error) - return error; - - seq = file->private_data; - seq->poll_event = atomic_read(&md_event_count); - return error; -} - -static unsigned int mdstat_poll(struct file *filp, poll_table *wait) -{ - struct seq_file *seq = filp->private_data; - int mask; - - poll_wait(filp, &md_event_waiters, wait); - - /* always allow read */ - mask = POLLIN | POLLRDNORM; - - if (seq->poll_event != atomic_read(&md_event_count)) - mask |= POLLERR | POLLPRI; - return mask; -} - -static const struct file_operations md_seq_fops = { - .owner = THIS_MODULE, - .open = md_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, - .poll = mdstat_poll, -}; - -int register_md_personality(struct md_personality *p) -{ - spin_lock(&pers_lock); - list_add_tail(&p->list, &pers_list); - printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); - spin_unlock(&pers_lock); - return 0; -} - -int unregister_md_personality(struct md_personality *p) -{ - printk(KERN_INFO "md: %s personality unregistered\n", p->name); - spin_lock(&pers_lock); - list_del_init(&p->list); - spin_unlock(&pers_lock); - return 0; -} - -static int is_mddev_idle(struct mddev *mddev, int init) -{ - struct md_rdev * rdev; - int idle; - int curr_events; - - idle = 1; - rcu_read_lock(); - 
rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]) - - atomic_read(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. - * So resync activity will cause curr_events to be smaller than - * when there was no such activity. - * non-sync IO will cause disk_stat to increase without - * increasing sync_io so curr_events will (eventually) - * be larger than it was before. Once it becomes - * substantially larger, the test below will cause - * the array to appear non-idle, and resync will slow - * down. - * If there is a lot of outstanding resync activity when - * we set last_event to curr_events, then all that activity - * completing might cause the array to appear non-idle - * and resync will be slowed down even though there might - * not have been non-resync activity. This will only - * happen once though. 'last_events' will soon reflect - * the state where there is little or no outstanding - * resync requests, and further resync activity will - * always make curr_events less than last_events. - * - */ - if (init || curr_events - rdev->last_events > 64) { - rdev->last_events = curr_events; - idle = 0; - } - } - rcu_read_unlock(); - return idle; -} - -void md_done_sync(struct mddev *mddev, int blocks, int ok) -{ - /* another "blocks" (512byte) blocks have been synced */ - atomic_sub(blocks, &mddev->recovery_active); - wake_up(&mddev->recovery_wait); - if (!ok) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_wakeup_thread(mddev->thread); - // stop recovery, signal do_sync .... - } -} - - -/* md_write_start(mddev, bi) - * If we need to update some array metadata (e.g. 'active' flag - * in superblock) before writing, schedule a superblock update - * and wait for it to complete. - */ -void md_write_start(struct mddev *mddev, struct bio *bi) -{ - int did_change = 0; - if (bio_data_dir(bi) != WRITE) - return; - - BUG_ON(mddev->ro == 1); - if (mddev->ro == 2) { - /* need to switch to read/write */ - mddev->ro = 0; - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - md_wakeup_thread(mddev->sync_thread); - did_change = 1; - } - atomic_inc(&mddev->writes_pending); - if (mddev->safemode == 1) - mddev->safemode = 0; - if (mddev->in_sync) { - spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync) { - mddev->in_sync = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); - md_wakeup_thread(mddev->thread); - did_change = 1; - } - spin_unlock_irq(&mddev->write_lock); - } - if (did_change) - sysfs_notify_dirent_safe(mddev->sysfs_state); - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); -} - -void md_write_end(struct mddev *mddev) -{ - if (atomic_dec_and_test(&mddev->writes_pending)) { - if (mddev->safemode == 2) - md_wakeup_thread(mddev->thread); - else if (mddev->safemode_delay) - mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); - } -} - -/* md_allow_write(mddev) - * Calling this ensures that the array is marked 'active' so that writes - * may proceed without blocking. It is important to call this before - * attempting a GFP_KERNEL allocation while holding the mddev lock. - * Must be called with mddev_lock held. 
- * - * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock - * is dropped, so return -EAGAIN after notifying userspace. - */ -int md_allow_write(struct mddev *mddev) -{ - if (!mddev->pers) - return 0; - if (mddev->ro) - return 0; - if (!mddev->pers->sync_request) - return 0; - - spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync) { - mddev->in_sync = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); - if (mddev->safemode_delay && - mddev->safemode == 0) - mddev->safemode = 1; - spin_unlock_irq(&mddev->write_lock); - md_update_sb(mddev, 0); - sysfs_notify_dirent_safe(mddev->sysfs_state); - } else - spin_unlock_irq(&mddev->write_lock); - - if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) - return -EAGAIN; - else - return 0; -} -EXPORT_SYMBOL_GPL(md_allow_write); - -#define SYNC_MARKS 10 -#define SYNC_MARK_STEP (3*HZ) -void md_do_sync(struct mddev *mddev) -{ - struct mddev *mddev2; - unsigned int currspeed = 0, - window; - sector_t max_sectors,j, io_sectors; - unsigned long mark[SYNC_MARKS]; - sector_t mark_cnt[SYNC_MARKS]; - int last_mark,m; - struct list_head *tmp; - sector_t last_check; - int skipped = 0; - struct md_rdev *rdev; - char *desc; - - /* just incase thread restarts... */ - if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) - return; - if (mddev->ro) /* never try to sync a read-only array */ - return; - - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) - desc = "data-check"; - else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - desc = "requested-resync"; - else - desc = "resync"; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - desc = "reshape"; - else - desc = "recovery"; - - /* we overload curr_resync somewhat here. - * 0 == not engaged in resync at all - * 2 == checking that there is no conflict with another sync - * 1 == like 2, but have yielded to allow conflicting resync to - * commense - * other == active in resync - this many blocks - * - * Before starting a resync we must have set curr_resync to - * 2, and then checked that every "conflicting" array has curr_resync - * less than ours. When we find one that is the same or higher - * we wait on resync_wait. To avoid deadlock, we reduce curr_resync - * to 1 if we choose to yield (based arbitrarily on address of mddev structure). - * This will mean we have to start checking from the beginning again. 
- * - */ - - do { - mddev->curr_resync = 2; - - try_again: - if (kthread_should_stop()) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - goto skip; - for_each_mddev(mddev2, tmp) { - if (mddev2 == mddev) - continue; - if (!mddev->parallel_resync - && mddev2->curr_resync - && match_mddev_units(mddev, mddev2)) { - DEFINE_WAIT(wq); - if (mddev < mddev2 && mddev->curr_resync == 2) { - /* arbitrarily yield */ - mddev->curr_resync = 1; - wake_up(&resync_wait); - } - if (mddev > mddev2 && mddev->curr_resync == 1) - /* no need to wait here, we can wait the next - * time 'round when curr_resync == 2 - */ - continue; - /* We need to wait 'interruptible' so as not to - * contribute to the load average, and not to - * be caught by 'softlockup' - */ - prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && - mddev2->curr_resync >= mddev->curr_resync) { - printk(KERN_INFO "md: delaying %s of %s" - " until %s has finished (they" - " share one or more physical units)\n", - desc, mdname(mddev), mdname(mddev2)); - mddev_put(mddev2); - if (signal_pending(current)) - flush_signals(current); - schedule(); - finish_wait(&resync_wait, &wq); - goto try_again; - } - finish_wait(&resync_wait, &wq); - } - } - } while (mddev->curr_resync < 2); - - j = 0; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* resync follows the size requested by the personality, - * which defaults to physical size, but can be virtual size - */ - max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; - /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - j = mddev->resync_min; - else if (!mddev->bitmap) - j = mddev->recovery_cp; - - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->dev_sectors; - else { - /* recovery follows the physical size of devices */ - max_sectors = mddev->dev_sectors; - j = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < j) - j = rdev->recovery_offset; - rcu_read_unlock(); - } - - printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); - printk(KERN_INFO "md: minimum _guaranteed_ speed:" - " %d KB/sec/disk.\n", speed_min(mddev)); - printk(KERN_INFO "md: using maximum available idle IO bandwidth " - "(but not more than %d KB/sec) for %s.\n", - speed_max(mddev), desc); - - is_mddev_idle(mddev, 1); /* this initializes IO event counters */ - - io_sectors = 0; - for (m = 0; m < SYNC_MARKS; m++) { - mark[m] = jiffies; - mark_cnt[m] = io_sectors; - } - last_mark = 0; - mddev->resync_mark = mark[last_mark]; - mddev->resync_mark_cnt = mark_cnt[last_mark]; - - /* - * Tune reconstruction: - */ - window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", - window/2, (unsigned long long)max_sectors/2); - - atomic_set(&mddev->recovery_active, 0); - last_check = 0; - - if (j>2) { - printk(KERN_INFO - "md: resuming %s of %s from checkpoint.\n", - desc, mdname(mddev)); - mddev->curr_resync = j; - } - mddev->curr_resync_completed = j; - - while (j < max_sectors) { - sector_t sectors; - - skipped = 0; - - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - ((mddev->curr_resync > mddev->curr_resync_completed && - (mddev->curr_resync - mddev->curr_resync_completed) - > (max_sectors >> 4)) || - (j - mddev->curr_resync_completed)*2 - >= 
mddev->resync_max - mddev->curr_resync_completed - )) { - /* time to update curr_resync_completed */ - wait_event(mddev->recovery_wait, - atomic_read(&mddev->recovery_active) == 0); - mddev->curr_resync_completed = j; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - } - - while (j >= mddev->resync_max && !kthread_should_stop()) { - /* As this condition is controlled by user-space, - * we can block indefinitely, so use '_interruptible' - * to avoid triggering warnings. - */ - flush_signals(current); /* just in case */ - wait_event_interruptible(mddev->recovery_wait, - mddev->resync_max > j - || kthread_should_stop()); - } - - if (kthread_should_stop()) - goto interrupted; - - sectors = mddev->pers->sync_request(mddev, j, &skipped, - currspeed < speed_min(mddev)); - if (sectors == 0) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } - - if (!skipped) { /* actual IO requested */ - io_sectors += sectors; - atomic_add(sectors, &mddev->recovery_active); - } - - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - break; - - j += sectors; - if (j>1) mddev->curr_resync = j; - mddev->curr_mark_cnt = io_sectors; - if (last_check == 0) - /* this is the earliest that rebuild will be - * visible in /proc/mdstat - */ - md_new_event(mddev); - - if (last_check + window > io_sectors || j == max_sectors) - continue; - - last_check = io_sectors; - repeat: - if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { - /* step marks */ - int next = (last_mark+1) % SYNC_MARKS; - - mddev->resync_mark = mark[next]; - mddev->resync_mark_cnt = mark_cnt[next]; - mark[next] = jiffies; - mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); - last_mark = next; - } - - - if (kthread_should_stop()) - goto interrupted; - - - /* - * this loop exits only if either when we are slower than - * the 'hard' speed limit, or the system was IO-idle for - * a jiffy. - * the system might be non-idle CPU-wise, but we only care - * about not overloading the IO subsystem. 
(things like an - * e2fsck being done on the RAID array should execute fast) - */ - cond_resched(); - - currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 - /((jiffies-mddev->resync_mark)/HZ +1) +1; - - if (currspeed > speed_min(mddev)) { - if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev, 0)) { - msleep(500); - goto repeat; - } - } - } - printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); - /* - * this also signals 'finished resyncing' to md_stop - */ - out: - wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); - - /* tell personality that we are finished */ - mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); - - if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > 2) { - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->recovery_cp) { - printk(KERN_INFO - "md: checkpointing %s of %s.\n", - desc, mdname(mddev)); - mddev->recovery_cp = - mddev->curr_resync_completed; - } - } else - mddev->recovery_cp = MaxSector; - } else { - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - mddev->curr_resync = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); - } - } - skip: - set_bit(MD_CHANGE_DEVS, &mddev->flags); - - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - /* We completed so min/max setting can be forgotten if used. */ - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - mddev->resync_min = mddev->curr_resync_completed; - mddev->curr_resync = 0; - wake_up(&resync_wait); - set_bit(MD_RECOVERY_DONE, &mddev->recovery); - md_wakeup_thread(mddev->thread); - return; - - interrupted: - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - -} -EXPORT_SYMBOL_GPL(md_do_sync); - -static int remove_and_add_spares(struct mddev *mddev) -{ - struct md_rdev *rdev; - int spares = 0; - int removed = 0; - - mddev->curr_resync_completed = 0; - - rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - (test_bit(Faulty, &rdev->flags) || - ! 
test_bit(In_sync, &rdev->flags)) && - atomic_read(&rdev->nr_pending)==0) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - removed++; - } - } - if (removed) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - - - rdev_for_each(rdev, mddev) { - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) - spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; - spares++; - md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } - } - } - return spares; -} - -static void reap_sync_thread(struct mddev *mddev) -{ - struct md_rdev *rdev; - - /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - - /* If array is no-longer degraded, then any saved_raid_disk - * information must be scrapped. Also if any device is now - * In_sync we must scrape the saved_raid_disk for that device - * do the superblock for an incrementally recovered device - * written out. - */ - rdev_for_each(rdev, mddev) - if (!mddev->degraded || - test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = -1; - - md_update_sb(mddev, 1); - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); - if (mddev->event_work.func) - queue_work(md_misc_wq, &mddev->event_work); -} - -/* - * This routine is regularly called by all per-raid-array threads to - * deal with generic issues like resync and super-block update. - * Raid personalities that don't have a thread (linear/raid0) do not - * need this as they never do any recovery or update the superblock. - * - * It does not do any resync itself, but rather "forks" off other threads - * to do that as needed. - * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in - * "->recovery" and create a thread at ->sync_thread. - * When the thread finishes it sets MD_RECOVERY_DONE - * and wakeups up this thread which will reap the thread and finish up. - * This thread also removes any faulty devices (with nr_pending == 0). - * - * The overall approach is: - * 1/ if the superblock needs updating, update it. - * 2/ If a recovery thread is running, don't do anything else. - * 3/ If recovery has finished, clean up, possibly marking spares active. - * 4/ If there are any faulty devices, remove them. - * 5/ If array is degraded, try to add spares devices - * 6/ If array has spares or is not in-sync, start a resync thread. 
- */ -void md_check_recovery(struct mddev *mddev) -{ - if (mddev->suspended) - return; - - if (mddev->bitmap) - bitmap_daemon_work(mddev); - - if (signal_pending(current)) { - if (mddev->pers->sync_request && !mddev->external) { - printk(KERN_INFO "md: %s in immediate safe mode\n", - mdname(mddev)); - mddev->safemode = 2; - } - flush_signals(current); - } - - if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) - return; - if ( ! ( - (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - (mddev->external == 0 && mddev->safemode == 1) || - (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) - && !mddev->in_sync && mddev->recovery_cp == MaxSector) - )) - return; - - if (mddev_trylock(mddev)) { - int spares = 0; - - if (mddev->ro) { - /* Only thing we do on a ro array is remove - * failed devices. - */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - test_bit(Faulty, &rdev->flags) && - atomic_read(&rdev->nr_pending)==0) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - } - } - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - - if (!mddev->external) { - int did_change = 0; - spin_lock_irq(&mddev->write_lock); - if (mddev->safemode && - !atomic_read(&mddev->writes_pending) && - !mddev->in_sync && - mddev->recovery_cp == MaxSector) { - mddev->in_sync = 1; - did_change = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - } - if (mddev->safemode == 1) - mddev->safemode = 0; - spin_unlock_irq(&mddev->write_lock); - if (did_change) - sysfs_notify_dirent_safe(mddev->sysfs_state); - } - - if (mddev->flags) - md_update_sb(mddev, 0); - - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - if (mddev->sync_thread) { - reap_sync_thread(mddev); - goto unlock; - } - /* Set RUNNING before clearing NEEDED to avoid - * any transients in the value of "sync_action". - */ - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - /* Clear some bits that don't mean anything, but - * might be left set - */ - clear_bit(MD_RECOVERY_INTR, &mddev->recovery); - clear_bit(MD_RECOVERY_DONE, &mddev->recovery); - - if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - goto unlock; - /* no recovery is running. - * remove any failed drives, then - * add spares if possible. - * Spare are also removed and re-added, to allow - * the personality to fail the re-add. - */ - - if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev) != 0) - /* Cannot proceed */ - goto unlock; - set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev))) { - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - /* nothing to be done ... 
*/ - goto unlock; - - if (mddev->pers->sync_request) { - if (spares && mddev->bitmap && ! mddev->bitmap->file) { - /* We are adding a device or devices to an array - * which has the bitmap stored on all devices. - * So make sure all bitmap pages get written - */ - bitmap_write_all(mddev->bitmap); - } - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); - if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); - /* leave the spares where they are, it shouldn't hurt */ - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - } else - md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); - } - unlock: - if (!mddev->sync_thread) { - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - if (test_and_clear_bit(MD_RECOVERY_RECOVER, - &mddev->recovery)) - if (mddev->sysfs_action) - sysfs_notify_dirent_safe(mddev->sysfs_action); - } - mddev_unlock(mddev); - } -} - -void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - sysfs_notify_dirent_safe(rdev->sysfs_state); - wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags) && - !test_bit(BlockedBadBlocks, &rdev->flags), - msecs_to_jiffies(5000)); - rdev_dec_pending(rdev, mddev); -} -EXPORT_SYMBOL(md_wait_for_blocked_rdev); - - -/* Bad block management. - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. - * - * Locking of the bad-block table uses a seqlock so md_is_badblock - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. - * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. - * We return - * 0 if there are no known bad blocks in the range - * 1 if there are known bad block which are all acknowledged - * -1 if there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. - */ -int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - int hi; - int lo = 0; - u64 *p = bb->page; - int rv = 0; - sector_t target = s + sectors; - unsigned seq; - - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<<bb->shift) - 1; - target >>= bb->shift; - sectors = target - s; - } - /* 'target' is now the first block after the bad range */ - -retry: - seq = read_seqbegin(&bb->lock); - - hi = bb->count; - - /* Binary search between lo and hi for 'target' - * i.e. 
for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. */ - lo = mid; - else - /* This and later ranges are definitely out. */ - hi = mid; - } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. - */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; - } - } - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return rv; -} -EXPORT_SYMBOL_GPL(md_is_badblock); - -/* - * Add a range of bad blocks to the table. - * This might extend the table, or might contract it - * if two adjacent ranges can be merged. - * We binary-search to find the 'insertion' point, then - * decide how best to handle it. - */ -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) -{ - u64 *p; - int lo, hi; - int rv = 1; - - if (bb->shift < 0) - /* badblocks are disabled */ - return 0; - - if (bb->shift) { - /* round the start down, and the end up */ - sector_t next = s + sectors; - s >>= bb->shift; - next += (1<<bb->shift) - 1; - next >>= bb->shift; - sectors = next - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; - } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; - } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - } - } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. 
- * Maybe we can merge with the start of that range */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - lo = hi; - hi++; - } - } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; - } - } - while (sectors) { - /* didn't merge (it all). - * Need to add a range just before 'hi' */ - if (bb->count >= MD_MAX_BADBLOCKS) { - /* No room for more */ - rv = 0; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; - - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } - } - - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - write_sequnlock_irq(&bb->lock); - - return rv; -} - -int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int acknowledged) -{ - int rv = md_set_badblocks(&rdev->badblocks, - s + rdev->data_offset, sectors, acknowledged); - if (rv) { - /* Make sure they get written out promptly */ - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); - md_wakeup_thread(rdev->mddev->thread); - } - return rv; -} -EXPORT_SYMBOL_GPL(rdev_set_badblocks); - -/* - * Remove a range of bad blocks from the table. - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - */ -static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) -{ - u64 *p; - int lo, hi; - sector_t target = s + sectors; - int rv = 0; - - if (bb->shift > 0) { - /* When clearing we round the start up and the end down. - * This should not matter as the shift should align with - * the block size and no rounding should ever be needed. - * However it is better the think a block is bad when it - * isn't than to think a block is not bad when it is. - */ - s += (1<<bb->shift) - 1; - s >>= bb->shift; - target >>= bb->shift; - sectors = target - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; - } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. 
- */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MD_MAX_BADBLOCKS) { - rv = 0; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. */ - break; - } - lo--; - } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded - */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); - } - } - - bb->changed = 1; -out: - write_sequnlock_irq(&bb->lock); - return rv; -} - -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) -{ - return md_clear_badblocks(&rdev->badblocks, - s + rdev->data_offset, - sectors); -} -EXPORT_SYMBOL_GPL(rdev_clear_badblocks); - -/* - * Acknowledge all bad blocks in a list. - * This only succeeds if ->changed is clear. It is used by - * in-kernel metadata updates - */ -void md_ack_all_badblocks(struct badblocks *bb) -{ - if (bb->page == NULL || bb->changed) - /* no point even trying */ - return; - write_seqlock_irq(&bb->lock); - - if (bb->changed == 0 && bb->unacked_exist) { - u64 *p = bb->page; - int i; - for (i = 0; i < bb->count ; i++) { - if (!BB_ACK(p[i])) { - sector_t start = BB_OFFSET(p[i]); - int len = BB_LEN(p[i]); - p[i] = BB_MAKE(start, len, 1); - } - } - bb->unacked_exist = 0; - } - write_sequnlock_irq(&bb->lock); -} -EXPORT_SYMBOL_GPL(md_ack_all_badblocks); - -/* sysfs access to bad-blocks list. - * We present two files. - * 'bad-blocks' lists sector numbers and lengths of ranges that - * are recorded as bad. The list is truncated to fit within - * the one-page limit of sysfs. - * Writing "sector length" to this file adds an acknowledged - * bad block list. - * 'unacknowledged-bad-blocks' lists bad blocks that have not yet - * been acknowledged. Writing to this file adds bad blocks - * without acknowledging them. This is largely for testing. - */ - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack) -{ - size_t len; - int i; - u64 *p = bb->page; - unsigned seq; - - if (bb->shift < 0) - return 0; - -retry: - seq = read_seqbegin(&bb->lock); - - len = 0; - i = 0; - - while (len < PAGE_SIZE && i < bb->count) { - sector_t s = BB_OFFSET(p[i]); - unsigned int length = BB_LEN(p[i]); - int ack = BB_ACK(p[i]); - i++; - - if (unack && ack) - continue; - - len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", - (unsigned long long)s << bb->shift, - length << bb->shift); - } - if (unack && len == 0) - bb->unacked_exist = 0; - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return len; -} - -#define DO_DEBUG 1 - -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) -{ - unsigned long long sector; - int length; - char newline; -#ifdef DO_DEBUG - /* Allow clearing via sysfs *only* for testing/debugging. 
- * Normally only a successful write may clear a badblock - */ - int clear = 0; - if (page[0] == '-') { - clear = 1; - page++; - } -#endif /* DO_DEBUG */ - - switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { - case 3: - if (newline != '\n') - return -EINVAL; - case 2: - if (length <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - -#ifdef DO_DEBUG - if (clear) { - md_clear_badblocks(bb, sector, length); - return len; - } -#endif /* DO_DEBUG */ - if (md_set_badblocks(bb, sector, length, !unack)) - return len; - else - return -ENOSPC; -} - -static int md_notify_reboot(struct notifier_block *this, - unsigned long code, void *x) -{ - struct list_head *tmp; - struct mddev *mddev; - int need_delay = 0; - - for_each_mddev(mddev, tmp) { - if (mddev_trylock(mddev)) { - if (mddev->pers) - __md_stop_writes(mddev); - mddev->safemode = 2; - mddev_unlock(mddev); - } - need_delay = 1; - } - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... - */ - if (need_delay) - mdelay(1000*1); - - return NOTIFY_DONE; -} - -static struct notifier_block md_notifier = { - .notifier_call = md_notify_reboot, - .next = NULL, - .priority = INT_MAX, /* before any real devices */ -}; - -static void md_geninit(void) -{ - pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); - - proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); -} - -static int __init md_init(void) -{ - int ret = -ENOMEM; - - md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); - if (!md_wq) - goto err_wq; - - md_misc_wq = alloc_workqueue("md_misc", 0, 0); - if (!md_misc_wq) - goto err_misc_wq; - - if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) - goto err_md; - - if ((ret = register_blkdev(0, "mdp")) < 0) - goto err_mdp; - mdp_major = ret; - - blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, - md_probe, NULL, NULL); - blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, - md_probe, NULL, NULL); - - register_reboot_notifier(&md_notifier); - raid_table_header = register_sysctl_table(raid_root_table); - - md_geninit(); - return 0; - -err_mdp: - unregister_blkdev(MD_MAJOR, "md"); -err_md: - destroy_workqueue(md_misc_wq); -err_misc_wq: - destroy_workqueue(md_wq); -err_wq: - return ret; -} - -#ifndef MODULE - -/* - * Searches all registered partitions for autorun RAID arrays - * at boot time.
- */ - -static LIST_HEAD(all_detected_devices); -struct detected_devices_node { - struct list_head list; - dev_t dev; -}; - -void md_autodetect_dev(dev_t dev) -{ - struct detected_devices_node *node_detected_dev; - - node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); - if (node_detected_dev) { - node_detected_dev->dev = dev; - list_add_tail(&node_detected_dev->list, &all_detected_devices); - } else { - printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" - ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); - } -} - - -static void autostart_arrays(int part) -{ - struct md_rdev *rdev; - struct detected_devices_node *node_detected_dev; - dev_t dev; - int i_scanned, i_passed; - - i_scanned = 0; - i_passed = 0; - - printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); - - while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { - i_scanned++; - node_detected_dev = list_entry(all_detected_devices.next, - struct detected_devices_node, list); - list_del(&node_detected_dev->list); - dev = node_detected_dev->dev; - kfree(node_detected_dev); - rdev = md_import_device(dev,0, 90); - if (IS_ERR(rdev)) - continue; - - if (test_bit(Faulty, &rdev->flags)) { - MD_BUG(); - continue; - } - set_bit(AutoDetected, &rdev->flags); - list_add(&rdev->same_set, &pending_raid_disks); - i_passed++; - } - - printk(KERN_INFO "md: Scanned %d and added %d devices.\n", - i_scanned, i_passed); - - autorun_devices(part); -} - -#endif /* !MODULE */ - -static __exit void md_exit(void) -{ - struct mddev *mddev; - struct list_head *tmp; - - blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); - blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); - - unregister_blkdev(MD_MAJOR,"md"); - unregister_blkdev(mdp_major, "mdp"); - unregister_reboot_notifier(&md_notifier); - unregister_sysctl_table(raid_table_header); - remove_proc_entry("mdstat", NULL); - for_each_mddev(mddev, tmp) { - export_array(mddev); - mddev->hold_active = 0; - } - destroy_workqueue(md_misc_wq); - destroy_workqueue(md_wq); -} - -subsys_initcall(md_init); -module_exit(md_exit) - -static int get_ro(char *buffer, struct kernel_param *kp) -{ - return sprintf(buffer, "%d", start_readonly); -} -static int set_ro(const char *val, struct kernel_param *kp) -{ - char *e; - int num = simple_strtoul(val, &e, 10); - if (*val && (*e == '\0' || *e == '\n')) { - start_readonly = num; - return 0; - } - return -EINVAL; -} - -module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); -module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); - -module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); - -EXPORT_SYMBOL(register_md_personality); -EXPORT_SYMBOL(unregister_md_personality); -EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_done_sync); -EXPORT_SYMBOL(md_write_start); -EXPORT_SYMBOL(md_write_end); -EXPORT_SYMBOL(md_register_thread); -EXPORT_SYMBOL(md_unregister_thread); -EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_check_recovery); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MD RAID framework"); -MODULE_ALIAS("md"); -MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); diff --git a/ANDROID_3.4.5/drivers/md/md.h b/ANDROID_3.4.5/drivers/md/md.h deleted file mode 100644 index 1c2063cc..00000000 --- a/ANDROID_3.4.5/drivers/md/md.h +++ /dev/null @@ -1,627 +0,0 @@ -/* - md.h : kernel internal structure of the Linux MD driver - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free 
Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_MD_H -#define _MD_MD_H - -#include <linux/blkdev.h> -#include <linux/kobject.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/mutex.h> -#include <linux/timer.h> -#include <linux/wait.h> -#include <linux/workqueue.h> - -#define MaxSector (~(sector_t)0) - -/* Bad block numbers are stored sorted in a single page. - * 64bits is used for each block or extent. - * 54 bits are sector number, 9 bits are extent size, - * 1 bit is an 'acknowledged' flag. - */ -#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) - -/* - * MD's 'extended' device - */ -struct md_rdev { - struct list_head same_set; /* RAID devices within the same set */ - - sector_t sectors; /* Device size (in 512bytes sectors) */ - struct mddev *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ - - /* - * If meta_bdev is non-NULL, it means that a separate device is - * being used to store the metadata (superblock/bitmap) which - * would otherwise be contained on the same device as the data (bdev). - */ - struct block_device *meta_bdev; - struct block_device *bdev; /* block device handle */ - - struct page *sb_page, *bb_page; - int sb_loaded; - __u64 sb_events; - sector_t data_offset; /* start of data in array */ - sector_t sb_start; /* offset of the super block (in 512byte sectors) */ - int sb_size; /* bytes in the superblock */ - int preferred_minor; /* autorun support */ - - struct kobject kobj; - - /* A device can be in one of three states based on two flags: - * Not working: faulty==1 in_sync==0 - * Fully working: faulty==0 in_sync==1 - * Working, but not - * in sync with array - * faulty==0 in_sync==0 - * - * It can never have faulty==1, in_sync==1 - * This reduces the burden of testing multiple flags in many cases - */ - - unsigned long flags; /* bit set of 'enum flag_bits' bits. */ - wait_queue_head_t blocked_wait; - - int desc_nr; /* descriptor index in the superblock */ - int raid_disk; /* role of device in array */ - int new_raid_disk; /* role that the device will have in - * the array after a level-change completes. - */ - int saved_raid_disk; /* role that device used to have in the - * array and could again if we did a partial - * resync from the bitmap - */ - sector_t recovery_offset;/* If this device has been partially - * recovered, this is where we were - * up to. - */ - - atomic_t nr_pending; /* number of pending requests. - * only maintained for arrays that - * support hot removal - */ - atomic_t read_errors; /* number of consecutive read errors that - * we have tried to ignore. - */ - struct timespec last_read_error; /* monotonic time since our - * last read error - */ - atomic_t corrected_errors; /* number of corrected read errors, - * for reporting to userspace and storing - * in superblock. - */ - struct work_struct del_work; /* used for delayed sysfs removal */ - - struct sysfs_dirent *sysfs_state; /* handle for 'state' - * sysfs entry */ - - struct badblocks { - int count; /* count of bad blocks */ - int unacked_exist; /* there probably are unacknowledged - * bad blocks. 
This is only cleared - * when a read discovers none - */ - int shift; /* shift from sectors to block size - * a -ve shift means badblocks are - * disabled.*/ - u64 *page; /* badblock list */ - int changed; - seqlock_t lock; - - sector_t sector; - sector_t size; /* in sectors */ - } badblocks; -}; -enum flag_bits { - Faulty, /* device is known to have a fault */ - In_sync, /* device is in_sync with rest of array */ - Unmerged, /* device is being added to array and should - * be considerred for bvec_merge_fn but not - * yet for actual IO - */ - WriteMostly, /* Avoid reading if at all possible */ - AutoDetected, /* added by auto-detect */ - Blocked, /* An error occurred but has not yet - * been acknowledged by the metadata - * handler, so don't allow writes - * until it is cleared */ - WriteErrorSeen, /* A write error has been seen on this - * device - */ - FaultRecorded, /* Intermediate state for clearing - * Blocked. The Fault is/will-be - * recorded in the metadata, but that - * metadata hasn't been stored safely - * on disk yet. - */ - BlockedBadBlocks, /* A writer is blocked because they - * found an unacknowledged bad-block. - * This can safely be cleared at any - * time, and the writer will re-check. - * It may be set at any time, and at - * worst the writer will timeout and - * re-check. So setting it as - * accurately as possible is good, but - * not absolutely critical. - */ - WantReplacement, /* This device is a candidate to be - * hot-replaced, either because it has - * reported some faults, or because - * of explicit request. - */ - Replacement, /* This device is a replacement for - * a want_replacement device with same - * raid_disk number. - */ -}; - -#define BB_LEN_MASK (0x00000000000001FFULL) -#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) -#define BB_ACK_MASK (0x8000000000000000ULL) -#define BB_MAX_LEN 512 -#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) -#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) -#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) -#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) - -extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); -static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - if (unlikely(rdev->badblocks.count)) { - int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, - sectors, - first_bad, bad_sectors); - if (rv) - *first_bad -= rdev->data_offset; - return rv; - } - return 0; -} -extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int acknowledged); -extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); -extern void md_ack_all_badblocks(struct badblocks *bb); - -struct mddev { - void *private; - struct md_personality *pers; - dev_t unit; - int md_minor; - struct list_head disks; - unsigned long flags; -#define MD_CHANGE_DEVS 0 /* Some device status has changed */ -#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ -#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ -#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ - - int suspended; - atomic_t active_io; - int ro; - int sysfs_active; /* set when sysfs deletes - * are happening, so run/ - * takeover/stop are not safe - */ - int ready; /* See when safe to pass - * IO requests down */ - struct gendisk *gendisk; - - struct kobject kobj; - int hold_active; -#define UNTIL_IOCTL 1 -#define UNTIL_STOP 2 - - /* Superblock 
information */ - int major_version, - minor_version, - patch_version; - int persistent; - int external; /* metadata is - * managed externally */ - char metadata_type[17]; /* externally set*/ - int chunk_sectors; - time_t ctime, utime; - int level, layout; - char clevel[16]; - int raid_disks; - int max_disks; - sector_t dev_sectors; /* used size of - * component devices */ - sector_t array_sectors; /* exported array size */ - int external_size; /* size managed - * externally */ - __u64 events; - /* If the last 'event' was simply a clean->dirty transition, and - * we didn't write it to the spares, then it is safe and simple - * to just decrement the event count on a dirty->clean transition. - * So we record that possibility here. - */ - int can_decrease_events; - - char uuid[16]; - - /* If the array is being reshaped, we need to record the - * new shape and an indication of where we are up to. - * This is written to the superblock. - * If reshape_position is MaxSector, then no reshape is happening (yet). - */ - sector_t reshape_position; - int delta_disks, new_level, new_layout; - int new_chunk_sectors; - - atomic_t plug_cnt; /* If device is expecting - * more bios soon. - */ - struct md_thread *thread; /* management thread */ - struct md_thread *sync_thread; /* doing resync or reconstruct */ - sector_t curr_resync; /* last block scheduled */ - /* As resync requests can complete out of order, we cannot easily track - * how much resync has been completed. So we occasionally pause until - * everything completes, then set curr_resync_completed to curr_resync. - * As such it may be well behind the real resync mark, but it is a value - * we are certain of. - */ - sector_t curr_resync_completed; - unsigned long resync_mark; /* a recent timestamp */ - sector_t resync_mark_cnt;/* blocks written at resync_mark */ - sector_t curr_mark_cnt; /* blocks scheduled now */ - - sector_t resync_max_sectors; /* may be set by personality */ - - sector_t resync_mismatches; /* count of sectors where - * parity/replica mismatch found - */ - - /* allow user-space to request suspension of IO to regions of the array */ - sector_t suspend_lo; - sector_t suspend_hi; - /* if zero, use the system-wide default */ - int sync_speed_min; - int sync_speed_max; - - /* resync even though the same disks are shared among md-devices */ - int parallel_resync; - - int ok_start_degraded; - /* recovery/resync flags - * NEEDED: we might need to start a resync/recover - * RUNNING: a thread is running, or about to be started - * SYNC: actually doing a resync, not a recovery - * RECOVER: doing recovery, or need to try it. - * INTR: resync needs to be aborted for some reason - * DONE: thread is done and is waiting to be reaped - * REQUEST: user-space has requested a sync (used with SYNC) - * CHECK: user-space request for check-only, no repair - * RESHAPE: A reshape is happening - * - * If neither SYNC or RESHAPE are set, then it is a recovery. - */ -#define MD_RECOVERY_RUNNING 0 -#define MD_RECOVERY_SYNC 1 -#define MD_RECOVERY_RECOVER 2 -#define MD_RECOVERY_INTR 3 -#define MD_RECOVERY_DONE 4 -#define MD_RECOVERY_NEEDED 5 -#define MD_RECOVERY_REQUESTED 6 -#define MD_RECOVERY_CHECK 7 -#define MD_RECOVERY_RESHAPE 8 -#define MD_RECOVERY_FROZEN 9 - - unsigned long recovery; - /* If a RAID personality determines that recovery (of a particular - * device) will fail due to a read error on the source device, it - * takes a copy of this number and does not attempt recovery again - * until this number changes. 
- */ - int recovery_disabled; - - int in_sync; /* know to not need resync */ - /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so - * that we are never stopping an array while it is open. - * 'reconfig_mutex' protects all other reconfiguration. - * These locks are separate due to conflicting interactions - * with bdev->bd_mutex. - * Lock ordering is: - * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk - * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open - */ - struct mutex open_mutex; - struct mutex reconfig_mutex; - atomic_t active; /* general refcount */ - atomic_t openers; /* number of active opens */ - - int changed; /* True if we might need to - * reread partition info */ - int degraded; /* whether md should consider - * adding a spare - */ - int merge_check_needed; /* at least one - * member device - * has a - * merge_bvec_fn */ - - atomic_t recovery_active; /* blocks scheduled, but not written */ - wait_queue_head_t recovery_wait; - sector_t recovery_cp; - sector_t resync_min; /* user requested sync - * starts here */ - sector_t resync_max; /* resync should pause - * when it gets here */ - - struct sysfs_dirent *sysfs_state; /* handle for 'array_state' - * file in sysfs. - */ - struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ - - struct work_struct del_work; /* used for delayed sysfs removal */ - - spinlock_t write_lock; - wait_queue_head_t sb_wait; /* for waiting on superblock updates */ - atomic_t pending_writes; /* number of active superblock writes */ - - unsigned int safemode; /* if set, update "clean" superblock - * when no writes pending. - */ - unsigned int safemode_delay; - struct timer_list safemode_timer; - atomic_t writes_pending; - struct request_queue *queue; /* for plugging ... */ - - struct bitmap *bitmap; /* the bitmap for the device */ - struct { - struct file *file; /* the bitmap file */ - loff_t offset; /* offset from superblock of - * start of bitmap. May be - * negative, but not '0' - * For external metadata, offset - * from start of device. - */ - loff_t default_offset; /* this is the offset to use when - * hot-adding a bitmap. It should - * eventually be settable by sysfs. - */ - struct mutex mutex; - unsigned long chunksize; - unsigned long daemon_sleep; /* how many jiffies between updates? */ - unsigned long max_write_behind; /* write-behind mode */ - int external; - } bitmap_info; - - atomic_t max_corr_read_errors; /* max read retries */ - struct list_head all_mddevs; - - struct attribute_group *to_remove; - - struct bio_set *bio_set; - - /* Generic flush handling. - * The last to finish preflush schedules a worker to submit - * the rest of the request (without the REQ_FLUSH flag). 
- */ - struct bio *flush_bio; - atomic_t flush_pending; - struct work_struct flush_work; - struct work_struct event_work; /* used by dm to report failure event */ - void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); -}; - - -static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) -{ - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); -} - -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -{ - atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); -} - -struct md_personality -{ - char *name; - int level; - struct list_head list; - struct module *owner; - void (*make_request)(struct mddev *mddev, struct bio *bio); - int (*run)(struct mddev *mddev); - int (*stop)(struct mddev *mddev); - void (*status)(struct seq_file *seq, struct mddev *mddev); - /* error_handler must set ->faulty and clear ->in_sync - * if appropriate, and should abort recovery if needed - */ - void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); - int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); - int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); - int (*spare_active) (struct mddev *mddev); - sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); - int (*resize) (struct mddev *mddev, sector_t sectors); - sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); - int (*check_reshape) (struct mddev *mddev); - int (*start_reshape) (struct mddev *mddev); - void (*finish_reshape) (struct mddev *mddev); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved - */ - void (*quiesce) (struct mddev *mddev, int state); - /* takeover is used to transition an array from one - * personality to another. The new personality must be able - * to handle the data in the current layout. - * e.g. 2drive raid1 -> 2drive raid5 - * ndrive raid5 -> degraded n+1drive raid6 with special layout - * If the takeover succeeds, a new 'private' structure is returned. - * This needs to be installed and then ->run used to activate the - * array. - */ - void *(*takeover) (struct mddev *mddev); -}; - - -struct md_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct mddev *, char *); - ssize_t (*store)(struct mddev *, const char *, size_t); -}; -extern struct attribute_group md_bitmap_group; - -static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) -{ - if (sd) - return sysfs_get_dirent(sd, NULL, name); - return sd; -} -static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) -{ - if (sd) - sysfs_notify_dirent(sd); -} - -static inline char * mdname (struct mddev * mddev) -{ - return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; -} - -static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) -{ - char nm[20]; - if (!test_bit(Replacement, &rdev->flags)) { - sprintf(nm, "rd%d", rdev->raid_disk); - return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); - } else - return 0; -} - -static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) -{ - char nm[20]; - if (!test_bit(Replacement, &rdev->flags)) { - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } -} - -/* - * iterates through some rdev ringlist. It's safe to remove the - * current 'rdev'. Dont touch 'tmp' though. 
- */ -#define rdev_for_each_list(rdev, tmp, head) \ - list_for_each_entry_safe(rdev, tmp, head, same_set) - -/* - * iterates through the 'same array disks' ringlist - */ -#define rdev_for_each(rdev, mddev) \ - list_for_each_entry(rdev, &((mddev)->disks), same_set) - -#define rdev_for_each_safe(rdev, tmp, mddev) \ - list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) - -#define rdev_for_each_rcu(rdev, mddev) \ - list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) - -struct md_thread { - void (*run) (struct mddev *mddev); - struct mddev *mddev; - wait_queue_head_t wqueue; - unsigned long flags; - struct task_struct *tsk; - unsigned long timeout; -}; - -#define THREAD_WAKEUP 0 - -#define __wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - set_current_state(TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, cmd); \ -} while (0) - -static inline void safe_put_page(struct page *p) -{ - if (p) put_page(p); -} - -extern int register_md_personality(struct md_personality *p); -extern int unregister_md_personality(struct md_personality *p); -extern struct md_thread *md_register_thread( - void (*run)(struct mddev *mddev), - struct mddev *mddev, - const char *name); -extern void md_unregister_thread(struct md_thread **threadp); -extern void md_wakeup_thread(struct md_thread *thread); -extern void md_check_recovery(struct mddev *mddev); -extern void md_write_start(struct mddev *mddev, struct bio *bi); -extern void md_write_end(struct mddev *mddev); -extern void md_done_sync(struct mddev *mddev, int blocks, int ok); -extern void md_error(struct mddev *mddev, struct md_rdev *rdev); - -extern int mddev_congested(struct mddev *mddev, int bits); -extern void md_flush_request(struct mddev *mddev, struct bio *bio); -extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(struct mddev *mddev); -extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, - struct page *page, int rw, bool metadata_op); -extern void md_do_sync(struct mddev *mddev); -extern void md_new_event(struct mddev *mddev); -extern int md_allow_write(struct mddev *mddev); -extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); -extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); -extern int md_check_no_bitmap(struct mddev *mddev); -extern int md_integrity_register(struct mddev *mddev); -extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); -extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); -extern void restore_bitmap_write_access(struct file *file); - -extern void mddev_init(struct mddev *mddev); -extern int md_run(struct mddev *mddev); -extern void md_stop(struct mddev *mddev); -extern void md_stop_writes(struct mddev *mddev); -extern int md_rdev_init(struct md_rdev *rdev); - -extern void mddev_suspend(struct mddev *mddev); -extern void mddev_resume(struct mddev *mddev); -extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev); -extern struct 
bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, - struct mddev *mddev); -extern int mddev_check_plugged(struct mddev *mddev); -extern void md_trim_bio(struct bio *bio, int offset, int size); -#endif /* _MD_MD_H */ diff --git a/ANDROID_3.4.5/drivers/md/multipath.c b/ANDROID_3.4.5/drivers/md/multipath.c deleted file mode 100644 index 9339e67f..00000000 --- a/ANDROID_3.4.5/drivers/md/multipath.c +++ /dev/null @@ -1,557 +0,0 @@ -/* - * multipath.c : Multiple Devices driver for Linux - * - * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat - * - * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * - * MULTIPATH management functions. - * - * derived from raid1.c. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/raid/md_u.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include "md.h" -#include "multipath.h" - -#define MAX_WORK_PER_DISK 128 - -#define NR_RESERVED_BUFS 32 - - -static int multipath_map (struct mpconf *conf) -{ - int i, disks = conf->raid_disks; - - /* - * Later we do read balancing on the read side - * now we use the first available disk. - */ - - rcu_read_lock(); - for (i = 0; i < disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - return i; - } - } - rcu_read_unlock(); - - printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); - return (-1); -} - -static void multipath_reschedule_retry (struct multipath_bh *mp_bh) -{ - unsigned long flags; - struct mddev *mddev = mp_bh->mddev; - struct mpconf *conf = mddev->private; - - spin_lock_irqsave(&conf->device_lock, flags); - list_add(&mp_bh->retry_list, &conf->retry_list); - spin_unlock_irqrestore(&conf->device_lock, flags); - md_wakeup_thread(mddev->thread); -} - - -/* - * multipath_end_bh_io() is called when we have finished servicing a multipathed - * operation and are ready to return a success/failure code to the buffer - * cache layer. 
- */ -static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) -{ - struct bio *bio = mp_bh->master_bio; - struct mpconf *conf = mp_bh->mddev->private; - - bio_endio(bio, err); - mempool_free(mp_bh, conf->pool); -} - -static void multipath_end_request(struct bio *bio, int error) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct multipath_bh *mp_bh = bio->bi_private; - struct mpconf *conf = mp_bh->mddev->private; - struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; - - if (uptodate) - multipath_end_bh_io(mp_bh, 0); - else if (!(bio->bi_rw & REQ_RAHEAD)) { - /* - * oops, IO error: - */ - char b[BDEVNAME_SIZE]; - md_error (mp_bh->mddev, rdev); - printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", - bdevname(rdev->bdev,b), - (unsigned long long)bio->bi_sector); - multipath_reschedule_retry(mp_bh); - } else - multipath_end_bh_io(mp_bh, error); - rdev_dec_pending(rdev, conf->mddev); -} - -static void multipath_make_request(struct mddev *mddev, struct bio * bio) -{ - struct mpconf *conf = mddev->private; - struct multipath_bh * mp_bh; - struct multipath_info *multipath; - - if (unlikely(bio->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bio); - return; - } - - mp_bh = mempool_alloc(conf->pool, GFP_NOIO); - - mp_bh->master_bio = bio; - mp_bh->mddev = mddev; - - mp_bh->path = multipath_map(conf); - if (mp_bh->path < 0) { - bio_endio(bio, -EIO); - mempool_free(mp_bh, conf->pool); - return; - } - multipath = conf->multipaths + mp_bh->path; - - mp_bh->bio = *bio; - mp_bh->bio.bi_sector += multipath->rdev->data_offset; - mp_bh->bio.bi_bdev = multipath->rdev->bdev; - mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; - mp_bh->bio.bi_end_io = multipath_end_request; - mp_bh->bio.bi_private = mp_bh; - generic_make_request(&mp_bh->bio); - return; -} - -static void multipath_status (struct seq_file *seq, struct mddev *mddev) -{ - struct mpconf *conf = mddev->private; - int i; - - seq_printf (seq, " [%d/%d] [", conf->raid_disks, - conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) - seq_printf (seq, "%s", - conf->multipaths[i].rdev && - test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_"); - seq_printf (seq, "]"); -} - -static int multipath_congested(void *data, int bits) -{ - struct mddev *mddev = data; - struct mpconf *conf = mddev->private; - int i, ret = 0; - - if (mddev_congested(mddev, bits)) - return 1; - - rcu_read_lock(); - for (i = 0; i < mddev->raid_disks ; i++) { - struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = bdev_get_queue(rdev->bdev); - - ret |= bdi_congested(&q->backing_dev_info, bits); - /* Just like multipath_map, we just check the - * first available device - */ - break; - } - } - rcu_read_unlock(); - return ret; -} - -/* - * Careful, this can execute in IRQ contexts as well! - */ -static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - char b[BDEVNAME_SIZE]; - - if (conf->raid_disks - mddev->degraded <= 1) { - /* - * Uh oh, we can do nothing if this is our last path, but - * first check if this is a queued request for a device - * which has just failed. - */ - printk(KERN_ALERT - "multipath: only one IO path left and IO error.\n"); - /* leave it active... 
it's all we have */ - return; - } - /* - * Mark disk as unusable - */ - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - } - set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "multipath: IO failure on %s," - " disabling IO path.\n" - "multipath: Operation continuing" - " on %d IO paths.\n", - bdevname(rdev->bdev, b), - conf->raid_disks - mddev->degraded); -} - -static void print_multipath_conf (struct mpconf *conf) -{ - int i; - struct multipath_info *tmp; - - printk("MULTIPATH conf printout:\n"); - if (!conf) { - printk("(conf==NULL)\n"); - return; - } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); - - for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; - tmp = conf->multipaths + i; - if (tmp->rdev) - printk(" disk%d, o:%d, dev:%s\n", - i,!test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); - } -} - - -static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - struct request_queue *q; - int err = -EEXIST; - int path; - struct multipath_info *p; - int first = 0; - int last = mddev->raid_disks - 1; - - if (rdev->raid_disk >= 0) - first = last = rdev->raid_disk; - - print_multipath_conf(conf); - - for (path = first; path <= last; path++) - if ((p=conf->multipaths+path)->rdev == NULL) { - q = rdev->bdev->bd_disk->queue; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_segments to one, lying - * within a single page. - * (Note: it is very unlikely that a device with - * merge_bvec_fn will be involved in multipath.) - */ - if (q->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } - - spin_lock_irq(&conf->device_lock); - mddev->degraded--; - rdev->raid_disk = path; - set_bit(In_sync, &rdev->flags); - spin_unlock_irq(&conf->device_lock); - rcu_assign_pointer(p->rdev, rdev); - err = 0; - md_integrity_add_rdev(rdev, mddev); - break; - } - - print_multipath_conf(conf); - - return err; -} - -static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - int err = 0; - int number = rdev->raid_disk; - struct multipath_info *p = conf->multipaths + number; - - print_multipath_conf(conf); - - if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - printk(KERN_ERR "hot-remove-disk, slot %d is identified" - " but is still operational!\n", number); - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - err = md_integrity_register(mddev); - } -abort: - - print_multipath_conf(conf); - return err; -} - - - -/* - * This is a kernel thread which: - * - * 1. Retries failed read operations on working multipaths. - * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. 
- */ - -static void multipathd (struct mddev *mddev) -{ - struct multipath_bh *mp_bh; - struct bio *bio; - unsigned long flags; - struct mpconf *conf = mddev->private; - struct list_head *head = &conf->retry_list; - - md_check_recovery(mddev); - for (;;) { - char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - if (list_empty(head)) - break; - mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); - list_del(head->prev); - spin_unlock_irqrestore(&conf->device_lock, flags); - - bio = &mp_bh->bio; - bio->bi_sector = mp_bh->master_bio->bi_sector; - - if ((mp_bh->path = multipath_map (conf))<0) { - printk(KERN_ALERT "multipath: %s: unrecoverable IO read" - " error for block %llu\n", - bdevname(bio->bi_bdev,b), - (unsigned long long)bio->bi_sector); - multipath_end_bh_io(mp_bh, -EIO); - } else { - printk(KERN_ERR "multipath: %s: redirecting sector %llu" - " to another IO path\n", - bdevname(bio->bi_bdev,b), - (unsigned long long)bio->bi_sector); - *bio = *(mp_bh->master_bio); - bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; - bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; - bio->bi_rw |= REQ_FAILFAST_TRANSPORT; - bio->bi_end_io = multipath_end_request; - bio->bi_private = mp_bh; - generic_make_request(bio); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -} - -static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - - return mddev->dev_sectors; -} - -static int multipath_run (struct mddev *mddev) -{ - struct mpconf *conf; - int disk_idx; - struct multipath_info *disk; - struct md_rdev *rdev; - int working_disks; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - - if (mddev->level != LEVEL_MULTIPATH) { - printk("multipath: %s: raid level not set to multipath IO (%d)\n", - mdname(mddev), mddev->level); - goto out; - } - /* - * copy the already verified devices into our private MULTIPATH - * bookkeeping area. 
[whatever we allocate in multipath_run(), - * should be freed in multipath_stop()] - */ - - conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); - mddev->private = conf; - if (!conf) { - printk(KERN_ERR - "multipath: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out; - } - - conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, - GFP_KERNEL); - if (!conf->multipaths) { - printk(KERN_ERR - "multipath: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out_free_conf; - } - - working_disks = 0; - rdev_for_each(rdev, mddev) { - disk_idx = rdev->raid_disk; - if (disk_idx < 0 || - disk_idx >= mddev->raid_disks) - continue; - - disk = conf->multipaths + disk_idx; - disk->rdev = rdev; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, not that we ever expect a device with - * a merge_bvec_fn to be involved in multipath */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } - - if (!test_bit(Faulty, &rdev->flags)) - working_disks++; - } - - conf->raid_disks = mddev->raid_disks; - conf->mddev = mddev; - spin_lock_init(&conf->device_lock); - INIT_LIST_HEAD(&conf->retry_list); - - if (!working_disks) { - printk(KERN_ERR "multipath: no operational IO paths for %s\n", - mdname(mddev)); - goto out_free_conf; - } - mddev->degraded = conf->raid_disks - working_disks; - - conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, - sizeof(struct multipath_bh)); - if (conf->pool == NULL) { - printk(KERN_ERR - "multipath: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out_free_conf; - } - - { - mddev->thread = md_register_thread(multipathd, mddev, NULL); - if (!mddev->thread) { - printk(KERN_ERR "multipath: couldn't allocate thread" - " for %s\n", mdname(mddev)); - goto out_free_conf; - } - } - - printk(KERN_INFO - "multipath: array %s active with %d out of %d IO paths\n", - mdname(mddev), conf->raid_disks - mddev->degraded, - mddev->raid_disks); - /* - * Ok, everything is just fine now - */ - md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - - mddev->queue->backing_dev_info.congested_fn = multipath_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - - if (md_integrity_register(mddev)) - goto out_free_conf; - - return 0; - -out_free_conf: - if (conf->pool) - mempool_destroy(conf->pool); - kfree(conf->multipaths); - kfree(conf); - mddev->private = NULL; -out: - return -EIO; -} - - -static int multipath_stop (struct mddev *mddev) -{ - struct mpconf *conf = mddev->private; - - md_unregister_thread(&mddev->thread); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - mempool_destroy(conf->pool); - kfree(conf->multipaths); - kfree(conf); - mddev->private = NULL; - return 0; -} - -static struct md_personality multipath_personality = -{ - .name = "multipath", - .level = LEVEL_MULTIPATH, - .owner = THIS_MODULE, - .make_request = multipath_make_request, - .run = multipath_run, - .stop = multipath_stop, - .status = multipath_status, - .error_handler = multipath_error, - .hot_add_disk = multipath_add_disk, - .hot_remove_disk= multipath_remove_disk, - .size = multipath_size, -}; - -static int __init multipath_init (void) -{ - return register_md_personality (&multipath_personality); -} - -static void __exit multipath_exit (void) -{ - unregister_md_personality (&multipath_personality); -} - 
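As an aside on the bad-block bookkeeping removed earlier in this diff (md_is_badblock()/md_set_badblocks() in md.c and the BB_* macros in md.h): each table entry is a single 64-bit word packing a 54-bit start sector, a 9-bit length field encoding 1-512 sectors, and an 'acknowledged' flag in the most significant bit. The stand-alone user-space program below is only an illustrative sketch and is not part of the deleted sources; the BB_* macro bodies are copied from the md.h hunk above, while main() and the sample sector values are hypothetical.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* Copied from the md.h hunk above: one bad range per 64-bit word. */
#define BB_LEN_MASK    (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK    (0x8000000000000000ULL)
#define BB_MAX_LEN     512
#define BB_OFFSET(x)   (((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)      (((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)      (!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

int main(void)
{
    /* Hypothetical example: an acknowledged 512-sector bad range
     * starting at sector 123456 (any start below 2^54 fits). */
    u64 entry = BB_MAKE((u64)123456, 512, 1);

    /* The three fields round-trip exactly. */
    assert(BB_OFFSET(entry) == 123456);
    assert(BB_LEN(entry) == 512);
    assert(BB_ACK(entry) == 1);

    printf("start=%llu len=%llu ack=%d\n",
           (unsigned long long)BB_OFFSET(entry),
           (unsigned long long)BB_LEN(entry),
           BB_ACK(entry));
    return 0;
}

Packing a whole range into one word keeps the table to a single page (MD_MAX_BADBLOCKS is PAGE_SIZE/8 entries, per md.h) and lets md_is_badblock() binary-search it under the seqlock described in md.c above.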
-module_init(multipath_init); -module_exit(multipath_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("simple multi-path personality for MD"); -MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ -MODULE_ALIAS("md-multipath"); -MODULE_ALIAS("md-level--4"); diff --git a/ANDROID_3.4.5/drivers/md/multipath.h b/ANDROID_3.4.5/drivers/md/multipath.h deleted file mode 100644 index 717c60f6..00000000 --- a/ANDROID_3.4.5/drivers/md/multipath.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _MULTIPATH_H -#define _MULTIPATH_H - -struct multipath_info { - struct md_rdev *rdev; -}; - -struct mpconf { - struct mddev *mddev; - struct multipath_info *multipaths; - int raid_disks; - spinlock_t device_lock; - struct list_head retry_list; - - mempool_t *pool; -}; - -/* - * this is our 'private' 'collective' MULTIPATH buffer head. - * it contains information about what kind of IO operations were started - * for this MULTIPATH operation, and about their status: - */ - -struct multipath_bh { - struct mddev *mddev; - struct bio *master_bio; - struct bio bio; - int path; - struct list_head retry_list; -}; -#endif diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/Kconfig b/ANDROID_3.4.5/drivers/md/persistent-data/Kconfig deleted file mode 100644 index ceb35905..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -config DM_PERSISTENT_DATA - tristate - depends on BLK_DEV_DM && EXPERIMENTAL - select LIBCRC32C - select DM_BUFIO - ---help--- - Library providing immutable on-disk data structure support for - device-mapper targets such as the thin provisioning target. diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/Makefile b/ANDROID_3.4.5/drivers/md/persistent-data/Makefile deleted file mode 100644 index cfa95f66..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o -dm-persistent-data-objs := \ - dm-block-manager.o \ - dm-space-map-checker.o \ - dm-space-map-common.o \ - dm-space-map-disk.o \ - dm-space-map-metadata.o \ - dm-transaction-manager.o \ - dm-btree.o \ - dm-btree-remove.o \ - dm-btree-spine.o diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-block-manager.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-block-manager.c deleted file mode 100644 index 0317ecdc..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-block-manager.c +++ /dev/null @@ -1,620 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ -#include "dm-block-manager.h" -#include "dm-persistent-data-internal.h" -#include "../dm-bufio.h" - -#include <linux/crc32c.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/rwsem.h> -#include <linux/device-mapper.h> -#include <linux/stacktrace.h> - -#define DM_MSG_PREFIX "block manager" - -/*----------------------------------------------------------------*/ - -/* - * This is a read/write semaphore with a couple of differences. - * - * i) There is a restriction on the number of concurrent read locks that - * may be held at once. This is just an implementation detail. - * - * ii) Recursive locking attempts are detected and return EINVAL. A stack - * trace is also emitted for the previous lock aquisition. - * - * iii) Priority is given to write locks. 
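[Editor's note, added for clarity and derived from the implementation below: the lock state is encoded in ->count, where 0 means unlocked, a positive value is the number of active readers (bounded by MAX_HOLDERS), and -1 marks a single writer. Blocked tasks sit on ->waiters; write waiters are queued at the head of that list, which is how writers get priority over readers that arrive later.]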
- */ -#define MAX_HOLDERS 4 -#define MAX_STACK 10 - -typedef unsigned long stack_entries[MAX_STACK]; - -struct block_lock { - spinlock_t lock; - __s32 count; - struct list_head waiters; - struct task_struct *holders[MAX_HOLDERS]; - -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - struct stack_trace traces[MAX_HOLDERS]; - stack_entries entries[MAX_HOLDERS]; -#endif -}; - -struct waiter { - struct list_head list; - struct task_struct *task; - int wants_write; -}; - -static unsigned __find_holder(struct block_lock *lock, - struct task_struct *task) -{ - unsigned i; - - for (i = 0; i < MAX_HOLDERS; i++) - if (lock->holders[i] == task) - break; - - BUG_ON(i == MAX_HOLDERS); - return i; -} - -/* call this *after* you increment lock->count */ -static void __add_holder(struct block_lock *lock, struct task_struct *task) -{ - unsigned h = __find_holder(lock, NULL); -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - struct stack_trace *t; -#endif - - get_task_struct(task); - lock->holders[h] = task; - -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - t = lock->traces + h; - t->nr_entries = 0; - t->max_entries = MAX_STACK; - t->entries = lock->entries[h]; - t->skip = 2; - save_stack_trace(t); -#endif -} - -/* call this *before* you decrement lock->count */ -static void __del_holder(struct block_lock *lock, struct task_struct *task) -{ - unsigned h = __find_holder(lock, task); - lock->holders[h] = NULL; - put_task_struct(task); -} - -static int __check_holder(struct block_lock *lock) -{ - unsigned i; -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - static struct stack_trace t; - static stack_entries entries; -#endif - - for (i = 0; i < MAX_HOLDERS; i++) { - if (lock->holders[i] == current) { - DMERR("recursive lock detected in pool metadata"); -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - DMERR("previously held here:"); - print_stack_trace(lock->traces + i, 4); - - DMERR("subsequent aquisition attempted here:"); - t.nr_entries = 0; - t.max_entries = MAX_STACK; - t.entries = entries; - t.skip = 3; - save_stack_trace(&t); - print_stack_trace(&t, 4); -#endif - return -EINVAL; - } - } - - return 0; -} - -static void __wait(struct waiter *w) -{ - for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); - - if (!w->task) - break; - - schedule(); - } - - set_task_state(current, TASK_RUNNING); -} - -static void __wake_waiter(struct waiter *w) -{ - struct task_struct *task; - - list_del(&w->list); - task = w->task; - smp_mb(); - w->task = NULL; - wake_up_process(task); -} - -/* - * We either wake a few readers or a single writer. 
- */ -static void __wake_many(struct block_lock *lock) -{ - struct waiter *w, *tmp; - - BUG_ON(lock->count < 0); - list_for_each_entry_safe(w, tmp, &lock->waiters, list) { - if (lock->count >= MAX_HOLDERS) - return; - - if (w->wants_write) { - if (lock->count > 0) - return; /* still read locked */ - - lock->count = -1; - __add_holder(lock, w->task); - __wake_waiter(w); - return; - } - - lock->count++; - __add_holder(lock, w->task); - __wake_waiter(w); - } -} - -static void bl_init(struct block_lock *lock) -{ - int i; - - spin_lock_init(&lock->lock); - lock->count = 0; - INIT_LIST_HEAD(&lock->waiters); - for (i = 0; i < MAX_HOLDERS; i++) - lock->holders[i] = NULL; -} - -static int __available_for_read(struct block_lock *lock) -{ - return lock->count >= 0 && - lock->count < MAX_HOLDERS && - list_empty(&lock->waiters); -} - -static int bl_down_read(struct block_lock *lock) -{ - int r; - struct waiter w; - - spin_lock(&lock->lock); - r = __check_holder(lock); - if (r) { - spin_unlock(&lock->lock); - return r; - } - - if (__available_for_read(lock)) { - lock->count++; - __add_holder(lock, current); - spin_unlock(&lock->lock); - return 0; - } - - get_task_struct(current); - - w.task = current; - w.wants_write = 0; - list_add_tail(&w.list, &lock->waiters); - spin_unlock(&lock->lock); - - __wait(&w); - put_task_struct(current); - return 0; -} - -static int bl_down_read_nonblock(struct block_lock *lock) -{ - int r; - - spin_lock(&lock->lock); - r = __check_holder(lock); - if (r) - goto out; - - if (__available_for_read(lock)) { - lock->count++; - __add_holder(lock, current); - r = 0; - } else - r = -EWOULDBLOCK; - -out: - spin_unlock(&lock->lock); - return r; -} - -static void bl_up_read(struct block_lock *lock) -{ - spin_lock(&lock->lock); - BUG_ON(lock->count <= 0); - __del_holder(lock, current); - --lock->count; - if (!list_empty(&lock->waiters)) - __wake_many(lock); - spin_unlock(&lock->lock); -} - -static int bl_down_write(struct block_lock *lock) -{ - int r; - struct waiter w; - - spin_lock(&lock->lock); - r = __check_holder(lock); - if (r) { - spin_unlock(&lock->lock); - return r; - } - - if (lock->count == 0 && list_empty(&lock->waiters)) { - lock->count = -1; - __add_holder(lock, current); - spin_unlock(&lock->lock); - return 0; - } - - get_task_struct(current); - w.task = current; - w.wants_write = 1; - - /* - * Writers given priority. We know there's only one mutator in the - * system, so ignoring the ordering reversal. - */ - list_add(&w.list, &lock->waiters); - spin_unlock(&lock->lock); - - __wait(&w); - put_task_struct(current); - - return 0; -} - -static void bl_up_write(struct block_lock *lock) -{ - spin_lock(&lock->lock); - __del_holder(lock, current); - lock->count = 0; - if (!list_empty(&lock->waiters)) - __wake_many(lock); - spin_unlock(&lock->lock); -} - -static void report_recursive_bug(dm_block_t b, int r) -{ - if (r == -EINVAL) - DMERR("recursive acquisition of block %llu requested.", - (unsigned long long) b); -} - -/*----------------------------------------------------------------*/ - -/* - * Block manager is currently implemented using dm-bufio. struct - * dm_block_manager and struct dm_block map directly onto a couple of - * structs in the bufio interface. I want to retain the freedom to move - * away from bufio in the future. So these structs are just cast within - * this .c file, rather than making it through to the public interface. 
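[Editor's note: concretely, dm_block_manager_create() below returns the dm_bufio_client pointer cast to struct dm_block_manager *, and to_bufio()/to_buffer() cast back at each call site, so the public types stay opaque and the bufio dependency never leaks into the interface.]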
- */ -static struct dm_buffer *to_buffer(struct dm_block *b) -{ - return (struct dm_buffer *) b; -} - -static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm) -{ - return (struct dm_bufio_client *) bm; -} - -dm_block_t dm_block_location(struct dm_block *b) -{ - return dm_bufio_get_block_number(to_buffer(b)); -} -EXPORT_SYMBOL_GPL(dm_block_location); - -void *dm_block_data(struct dm_block *b) -{ - return dm_bufio_get_block_data(to_buffer(b)); -} -EXPORT_SYMBOL_GPL(dm_block_data); - -struct buffer_aux { - struct dm_block_validator *validator; - struct block_lock lock; - int write_locked; -}; - -static void dm_block_manager_alloc_callback(struct dm_buffer *buf) -{ - struct buffer_aux *aux = dm_bufio_get_aux_data(buf); - aux->validator = NULL; - bl_init(&aux->lock); -} - -static void dm_block_manager_write_callback(struct dm_buffer *buf) -{ - struct buffer_aux *aux = dm_bufio_get_aux_data(buf); - if (aux->validator) { - aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf, - dm_bufio_get_block_size(dm_bufio_get_client(buf))); - } -} - -/*---------------------------------------------------------------- - * Public interface - *--------------------------------------------------------------*/ -struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, - unsigned block_size, - unsigned cache_size, - unsigned max_held_per_thread) -{ - return (struct dm_block_manager *) - dm_bufio_client_create(bdev, block_size, max_held_per_thread, - sizeof(struct buffer_aux), - dm_block_manager_alloc_callback, - dm_block_manager_write_callback); -} -EXPORT_SYMBOL_GPL(dm_block_manager_create); - -void dm_block_manager_destroy(struct dm_block_manager *bm) -{ - return dm_bufio_client_destroy(to_bufio(bm)); -} -EXPORT_SYMBOL_GPL(dm_block_manager_destroy); - -unsigned dm_bm_block_size(struct dm_block_manager *bm) -{ - return dm_bufio_get_block_size(to_bufio(bm)); -} -EXPORT_SYMBOL_GPL(dm_bm_block_size); - -dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) -{ - return dm_bufio_get_device_size(to_bufio(bm)); -} - -static int dm_bm_validate_buffer(struct dm_block_manager *bm, - struct dm_buffer *buf, - struct buffer_aux *aux, - struct dm_block_validator *v) -{ - if (unlikely(!aux->validator)) { - int r; - if (!v) - return 0; - r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm))); - if (unlikely(r)) - return r; - aux->validator = v; - } else { - if (unlikely(aux->validator != v)) { - DMERR("validator mismatch (old=%s vs new=%s) for block %llu", - aux->validator->name, v ? 
v->name : "NULL", - (unsigned long long) - dm_bufio_get_block_number(buf)); - return -EINVAL; - } - } - - return 0; -} -int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, - struct dm_block **result) -{ - struct buffer_aux *aux; - void *p; - int r; - - p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) - return PTR_ERR(p); - - aux = dm_bufio_get_aux_data(to_buffer(*result)); - r = bl_down_read(&aux->lock); - if (unlikely(r)) { - dm_bufio_release(to_buffer(*result)); - report_recursive_bug(b, r); - return r; - } - - aux->write_locked = 0; - - r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); - if (unlikely(r)) { - bl_up_read(&aux->lock); - dm_bufio_release(to_buffer(*result)); - return r; - } - - return 0; -} -EXPORT_SYMBOL_GPL(dm_bm_read_lock); - -int dm_bm_write_lock(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, - struct dm_block **result) -{ - struct buffer_aux *aux; - void *p; - int r; - - p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) - return PTR_ERR(p); - - aux = dm_bufio_get_aux_data(to_buffer(*result)); - r = bl_down_write(&aux->lock); - if (r) { - dm_bufio_release(to_buffer(*result)); - report_recursive_bug(b, r); - return r; - } - - aux->write_locked = 1; - - r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); - if (unlikely(r)) { - bl_up_write(&aux->lock); - dm_bufio_release(to_buffer(*result)); - return r; - } - - return 0; -} -EXPORT_SYMBOL_GPL(dm_bm_write_lock); - -int dm_bm_read_try_lock(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, - struct dm_block **result) -{ - struct buffer_aux *aux; - void *p; - int r; - - p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) - return PTR_ERR(p); - if (unlikely(!p)) - return -EWOULDBLOCK; - - aux = dm_bufio_get_aux_data(to_buffer(*result)); - r = bl_down_read_nonblock(&aux->lock); - if (r < 0) { - dm_bufio_release(to_buffer(*result)); - report_recursive_bug(b, r); - return r; - } - aux->write_locked = 0; - - r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); - if (unlikely(r)) { - bl_up_read(&aux->lock); - dm_bufio_release(to_buffer(*result)); - return r; - } - - return 0; -} - -int dm_bm_write_lock_zero(struct dm_block_manager *bm, - dm_block_t b, struct dm_block_validator *v, - struct dm_block **result) -{ - int r; - struct buffer_aux *aux; - void *p; - - p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result); - if (unlikely(IS_ERR(p))) - return PTR_ERR(p); - - memset(p, 0, dm_bm_block_size(bm)); - - aux = dm_bufio_get_aux_data(to_buffer(*result)); - r = bl_down_write(&aux->lock); - if (r) { - dm_bufio_release(to_buffer(*result)); - return r; - } - - aux->write_locked = 1; - aux->validator = v; - - return 0; -} - -int dm_bm_unlock(struct dm_block *b) -{ - struct buffer_aux *aux; - aux = dm_bufio_get_aux_data(to_buffer(b)); - - if (aux->write_locked) { - dm_bufio_mark_buffer_dirty(to_buffer(b)); - bl_up_write(&aux->lock); - } else - bl_up_read(&aux->lock); - - dm_bufio_release(to_buffer(b)); - - return 0; -} -EXPORT_SYMBOL_GPL(dm_bm_unlock); - -int dm_bm_unlock_move(struct dm_block *b, dm_block_t n) -{ - struct buffer_aux *aux; - - aux = dm_bufio_get_aux_data(to_buffer(b)); - - if (aux->write_locked) { - dm_bufio_mark_buffer_dirty(to_buffer(b)); - bl_up_write(&aux->lock); - } else - bl_up_read(&aux->lock); - - dm_bufio_release_move(to_buffer(b), n); - return 0; -} - -int 
dm_bm_flush_and_unlock(struct dm_block_manager *bm, - struct dm_block *superblock) -{ - int r; - - r = dm_bufio_write_dirty_buffers(to_bufio(bm)); - if (unlikely(r)) - return r; - r = dm_bufio_issue_flush(to_bufio(bm)); - if (unlikely(r)) - return r; - - dm_bm_unlock(superblock); - - r = dm_bufio_write_dirty_buffers(to_bufio(bm)); - if (unlikely(r)) - return r; - r = dm_bufio_issue_flush(to_bufio(bm)); - if (unlikely(r)) - return r; - - return 0; -} - -u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) -{ - return crc32c(~(u32) 0, data, len) ^ init_xor; -} -EXPORT_SYMBOL_GPL(dm_bm_checksum); - -/*----------------------------------------------------------------*/ - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); -MODULE_DESCRIPTION("Immutable metadata library for dm"); - -/*----------------------------------------------------------------*/ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-block-manager.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-block-manager.h deleted file mode 100644 index 924833d2..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-block-manager.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef _LINUX_DM_BLOCK_MANAGER_H -#define _LINUX_DM_BLOCK_MANAGER_H - -#include <linux/types.h> -#include <linux/blkdev.h> - -/*----------------------------------------------------------------*/ - -/* - * Block number. - */ -typedef uint64_t dm_block_t; -struct dm_block; - -dm_block_t dm_block_location(struct dm_block *b); -void *dm_block_data(struct dm_block *b); - -/*----------------------------------------------------------------*/ - -/* - * @name should be a unique identifier for the block manager, no longer - * than 32 chars. - * - * @max_held_per_thread should be the maximum number of locks, read or - * write, that an individual thread holds at any one time. - */ -struct dm_block_manager; -struct dm_block_manager *dm_block_manager_create( - struct block_device *bdev, unsigned block_size, - unsigned cache_size, unsigned max_held_per_thread); -void dm_block_manager_destroy(struct dm_block_manager *bm); - -unsigned dm_bm_block_size(struct dm_block_manager *bm); -dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm); - -/*----------------------------------------------------------------*/ - -/* - * The validator allows the caller to verify newly-read data and modify - * the data just before writing, e.g. to calculate checksums. It's - * important to be consistent with your use of validators. The only time - * you can change validators is if you call dm_bm_write_lock_zero. - */ -struct dm_block_validator { - const char *name; - void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); - - /* - * Return 0 if the checksum is valid or < 0 on error. - */ - int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); -}; - -/*----------------------------------------------------------------*/ - -/* - * You can have multiple concurrent readers or a single writer holding a - * block lock. - */ - -/* - * dm_bm_lock() locks a block and returns through @result a pointer to - * memory that holds a copy of that block. If you have write-locked the - * block then any changes you make to memory pointed to by @result will be - * written back to the disk sometime after dm_bm_unlock is called. 
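A minimal usage sketch of the read-lock idiom (editor's addition; bm, block_nr, my_validator and process() are placeholders, not names from this header):

	struct dm_block *blk;
	int r;

	r = dm_bm_read_lock(bm, block_nr, &my_validator, &blk);
	if (r < 0)
		return r;		/* lock or validation failed */

	process(dm_block_data(blk));	/* data is stable while the lock is held */

	dm_bm_unlock(blk);		/* releases the lock and the buffer */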
- */ -int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, - struct dm_block **result); - -int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, - struct dm_block **result); - -/* - * The *_try_lock variants return -EWOULDBLOCK if the block isn't - * available immediately. - */ -int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, - struct dm_block **result); - -/* - * Use dm_bm_write_lock_zero() when you know you're going to - * overwrite the block completely. It saves a disk read. - */ -int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, - struct dm_block_validator *v, - struct dm_block **result); - -int dm_bm_unlock(struct dm_block *b); - -/* - * An optimisation; we often want to copy a block's contents to a new - * block. eg, as part of the shadowing operation. It's far better for - * bufio to do this move behind the scenes than hold 2 locks and memcpy the - * data. - */ -int dm_bm_unlock_move(struct dm_block *b, dm_block_t n); - -/* - * It's a common idiom to have a superblock that should be committed last. - * - * @superblock should be write-locked on entry. It will be unlocked during - * this function. All dirty blocks are guaranteed to be written and flushed - * before the superblock. - * - * This method always blocks. - */ -int dm_bm_flush_and_unlock(struct dm_block_manager *bm, - struct dm_block *superblock); - -u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); - -/*----------------------------------------------------------------*/ - -#endif /* _LINUX_DM_BLOCK_MANAGER_H */ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-internal.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-internal.h deleted file mode 100644 index 5709bfea..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-internal.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef DM_BTREE_INTERNAL_H -#define DM_BTREE_INTERNAL_H - -#include "dm-btree.h" - -/*----------------------------------------------------------------*/ - -/* - * We'll need 2 accessor functions for n->csum and n->blocknr - * to support dm-btree-spine.c in that case. - */ - -enum node_flags { - INTERNAL_NODE = 1, - LEAF_NODE = 1 << 1 -}; - -/* - * Every btree node begins with this structure. Make sure it's a multiple - * of 8-bytes in size, otherwise the 64bit keys will be mis-aligned. - */ -struct node_header { - __le32 csum; - __le32 flags; - __le64 blocknr; /* Block this node is supposed to live in. */ - - __le32 nr_entries; - __le32 max_entries; - __le32 value_size; - __le32 padding; -} __packed; - -struct node { - struct node_header header; - __le64 keys[0]; -} __packed; - - -void inc_children(struct dm_transaction_manager *tm, struct node *n, - struct dm_btree_value_type *vt); - -int new_block(struct dm_btree_info *info, struct dm_block **result); -int unlock_block(struct dm_btree_info *info, struct dm_block *b); - -/* - * Spines keep track of the rolling locks. There are 2 variants, read-only - * and one that uses shadowing. These are separate structs to allow the - * type checker to spot misuse, for example accidentally calling read_lock - * on a shadow spine. 
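[Editor's note: in practice a spine holds at most two locked blocks at a time, the current node and its parent; each ro_step()/shadow_step() locks the next child and releases the grandparent, so a descent never accumulates more than two block locks regardless of tree depth.]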
- */ -struct ro_spine { - struct dm_btree_info *info; - - int count; - struct dm_block *nodes[2]; -}; - -void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); -int exit_ro_spine(struct ro_spine *s); -int ro_step(struct ro_spine *s, dm_block_t new_child); -struct node *ro_node(struct ro_spine *s); - -struct shadow_spine { - struct dm_btree_info *info; - - int count; - struct dm_block *nodes[2]; - - dm_block_t root; -}; - -void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info); -int exit_shadow_spine(struct shadow_spine *s); - -int shadow_step(struct shadow_spine *s, dm_block_t b, - struct dm_btree_value_type *vt); - -/* - * The spine must have at least one entry before calling this. - */ -struct dm_block *shadow_current(struct shadow_spine *s); - -/* - * The spine must have at least two entries before calling this. - */ -struct dm_block *shadow_parent(struct shadow_spine *s); - -int shadow_has_parent(struct shadow_spine *s); - -int shadow_root(struct shadow_spine *s); - -/* - * Some inlines. - */ -static inline __le64 *key_ptr(struct node *n, uint32_t index) -{ - return n->keys + index; -} - -static inline void *value_base(struct node *n) -{ - return &n->keys[le32_to_cpu(n->header.max_entries)]; -} - -static inline void *value_ptr(struct node *n, uint32_t index) -{ - uint32_t value_size = le32_to_cpu(n->header.value_size); - return value_base(n) + (value_size * index); -} - -/* - * Assumes the values are suitably-aligned and converts to core format. - */ -static inline uint64_t value64(struct node *n, uint32_t index) -{ - __le64 *values_le = value_base(n); - - return le64_to_cpu(values_le[index]); -} - -/* - * Searching for a key within a single node. - */ -int lower_bound(struct node *n, uint64_t key); - -extern struct dm_block_validator btree_node_validator; - -#endif /* DM_BTREE_INTERNAL_H */ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-remove.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-remove.c deleted file mode 100644 index aa71e235..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-remove.c +++ /dev/null @@ -1,590 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#include "dm-btree.h" -#include "dm-btree-internal.h" -#include "dm-transaction-manager.h" - -#include <linux/export.h> - -/* - * Removing an entry from a btree - * ============================== - * - * A very important constraint for our btree is that no node, except the - * root, may have fewer than a certain number of entries. - * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). - * - * Ensuring this is complicated by the way we want to only ever hold the - * locks on 2 nodes concurrently, and only change nodes in a top to bottom - * fashion. - * - * Each node may have a left or right sibling. When decending the spine, - * if a node contains only MIN_ENTRIES then we try and increase this to at - * least MIN_ENTRIES + 1. We do this in the following ways: - * - * [A] No siblings => this can only happen if the node is the root, in which - * case we copy the childs contents over the root. 
- * - * [B] No left sibling - * ==> rebalance(node, right sibling) - * - * [C] No right sibling - * ==> rebalance(left sibling, node) - * - * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD - * ==> delete node adding it's contents to left and right - * - * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD - * ==> rebalance(left, node, right) - * - * After these operations it's possible that the our original node no - * longer contains the desired sub tree. For this reason this rebalancing - * is performed on the children of the current node. This also avoids - * having a special case for the root. - * - * Once this rebalancing has occurred we can then step into the child node - * for internal nodes. Or delete the entry for leaf nodes. - */ - -/* - * Some little utilities for moving node data around. - */ -static void node_shift(struct node *n, int shift) -{ - uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); - uint32_t value_size = le32_to_cpu(n->header.value_size); - - if (shift < 0) { - shift = -shift; - BUG_ON(shift > nr_entries); - BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); - memmove(key_ptr(n, 0), - key_ptr(n, shift), - (nr_entries - shift) * sizeof(__le64)); - memmove(value_ptr(n, 0), - value_ptr(n, shift), - (nr_entries - shift) * value_size); - } else { - BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); - memmove(key_ptr(n, shift), - key_ptr(n, 0), - nr_entries * sizeof(__le64)); - memmove(value_ptr(n, shift), - value_ptr(n, 0), - nr_entries * value_size); - } -} - -static void node_copy(struct node *left, struct node *right, int shift) -{ - uint32_t nr_left = le32_to_cpu(left->header.nr_entries); - uint32_t value_size = le32_to_cpu(left->header.value_size); - BUG_ON(value_size != le32_to_cpu(right->header.value_size)); - - if (shift < 0) { - shift = -shift; - BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries)); - memcpy(key_ptr(left, nr_left), - key_ptr(right, 0), - shift * sizeof(__le64)); - memcpy(value_ptr(left, nr_left), - value_ptr(right, 0), - shift * value_size); - } else { - BUG_ON(shift > le32_to_cpu(right->header.max_entries)); - memcpy(key_ptr(right, 0), - key_ptr(left, nr_left - shift), - shift * sizeof(__le64)); - memcpy(value_ptr(right, 0), - value_ptr(left, nr_left - shift), - shift * value_size); - } -} - -/* - * Delete a specific entry from a leaf node. 
- */ -static void delete_at(struct node *n, unsigned index) -{ - unsigned nr_entries = le32_to_cpu(n->header.nr_entries); - unsigned nr_to_copy = nr_entries - (index + 1); - uint32_t value_size = le32_to_cpu(n->header.value_size); - BUG_ON(index >= nr_entries); - - if (nr_to_copy) { - memmove(key_ptr(n, index), - key_ptr(n, index + 1), - nr_to_copy * sizeof(__le64)); - - memmove(value_ptr(n, index), - value_ptr(n, index + 1), - nr_to_copy * value_size); - } - - n->header.nr_entries = cpu_to_le32(nr_entries - 1); -} - -static unsigned merge_threshold(struct node *n) -{ - return le32_to_cpu(n->header.max_entries) / 3; -} - -struct child { - unsigned index; - struct dm_block *block; - struct node *n; -}; - -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - -static int init_child(struct dm_btree_info *info, struct node *parent, - unsigned index, struct child *result) -{ - int r, inc; - dm_block_t root; - - result->index = index; - root = value64(parent, index); - - r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, - &result->block, &inc); - if (r) - return r; - - result->n = dm_block_data(result->block); - - if (inc) - inc_children(info->tm, result->n, &le64_type); - - *((__le64 *) value_ptr(parent, index)) = - cpu_to_le64(dm_block_location(result->block)); - - return 0; -} - -static int exit_child(struct dm_btree_info *info, struct child *c) -{ - return dm_tm_unlock(info->tm, c->block); -} - -static void shift(struct node *left, struct node *right, int count) -{ - uint32_t nr_left = le32_to_cpu(left->header.nr_entries); - uint32_t nr_right = le32_to_cpu(right->header.nr_entries); - uint32_t max_entries = le32_to_cpu(left->header.max_entries); - uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); - - BUG_ON(max_entries != r_max_entries); - BUG_ON(nr_left - count > max_entries); - BUG_ON(nr_right + count > max_entries); - - if (!count) - return; - - if (count > 0) { - node_shift(right, count); - node_copy(left, right, count); - } else { - node_copy(left, right, count); - node_shift(right, count); - } - - left->header.nr_entries = cpu_to_le32(nr_left - count); - right->header.nr_entries = cpu_to_le32(nr_right + count); -} - -static void __rebalance2(struct dm_btree_info *info, struct node *parent, - struct child *l, struct child *r) -{ - struct node *left = l->n; - struct node *right = r->n; - uint32_t nr_left = le32_to_cpu(left->header.nr_entries); - uint32_t nr_right = le32_to_cpu(right->header.nr_entries); - unsigned threshold = 2 * merge_threshold(left) + 1; - - if (nr_left + nr_right < threshold) { - /* - * Merge - */ - node_copy(left, right, -nr_right); - left->header.nr_entries = cpu_to_le32(nr_left + nr_right); - delete_at(parent, r->index); - - /* - * We need to decrement the right block, but not it's - * children, since they're still referenced by left. - */ - dm_tm_dec(info->tm, dm_block_location(r->block)); - } else { - /* - * Rebalance. 
- */ - unsigned target_left = (nr_left + nr_right) / 2; - shift(left, right, nr_left - target_left); - *key_ptr(parent, r->index) = right->keys[0]; - } -} - -static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) -{ - int r; - struct node *parent; - struct child left, right; - - parent = dm_block_data(shadow_current(s)); - - r = init_child(info, parent, left_index, &left); - if (r) - return r; - - r = init_child(info, parent, left_index + 1, &right); - if (r) { - exit_child(info, &left); - return r; - } - - __rebalance2(info, parent, &left, &right); - - r = exit_child(info, &left); - if (r) { - exit_child(info, &right); - return r; - } - - return exit_child(info, &right); -} - -/* - * We dump as many entries from center as possible into left, then the rest - * in right, then rebalance2. This wastes some cpu, but I want something - * simple atm. - */ -static void delete_center_node(struct dm_btree_info *info, struct node *parent, - struct child *l, struct child *c, struct child *r, - struct node *left, struct node *center, struct node *right, - uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) -{ - uint32_t max_entries = le32_to_cpu(left->header.max_entries); - unsigned shift = min(max_entries - nr_left, nr_center); - - BUG_ON(nr_left + shift > max_entries); - node_copy(left, center, -shift); - left->header.nr_entries = cpu_to_le32(nr_left + shift); - - if (shift != nr_center) { - shift = nr_center - shift; - BUG_ON((nr_right + shift) > max_entries); - node_shift(right, shift); - node_copy(center, right, shift); - right->header.nr_entries = cpu_to_le32(nr_right + shift); - } - *key_ptr(parent, r->index) = right->keys[0]; - - delete_at(parent, c->index); - r->index--; - - dm_tm_dec(info->tm, dm_block_location(c->block)); - __rebalance2(info, parent, l, r); -} - -/* - * Redistributes entries among 3 sibling nodes. 
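[Editor's worked example, not in the original comment: with max_entries = 252 (a 4096-byte block holding 8-byte values, see calc_max_entries() in dm-btree.c), merge_threshold() is 84. __rebalance2() above therefore merges two siblings only when they hold fewer than 2*84 + 1 = 169 entries between them, and __rebalance3() below collapses three siblings into two only when they hold fewer than 4*84 + 1 = 337; otherwise it calls redistribute3(), which follows, to level the three nodes out to roughly total/3 entries each.]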
- */ -static void redistribute3(struct dm_btree_info *info, struct node *parent, - struct child *l, struct child *c, struct child *r, - struct node *left, struct node *center, struct node *right, - uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) -{ - int s; - uint32_t max_entries = le32_to_cpu(left->header.max_entries); - unsigned target = (nr_left + nr_center + nr_right) / 3; - BUG_ON(target > max_entries); - - if (nr_left < nr_right) { - s = nr_left - target; - - if (s < 0 && nr_center < -s) { - /* not enough in central node */ - shift(left, center, nr_center); - s = nr_center - target; - shift(left, right, s); - nr_right += s; - } else - shift(left, center, s); - - shift(center, right, target - nr_right); - - } else { - s = target - nr_right; - if (s > 0 && nr_center < s) { - /* not enough in central node */ - shift(center, right, nr_center); - s = target - nr_center; - shift(left, right, s); - nr_left -= s; - } else - shift(center, right, s); - - shift(left, center, nr_left - target); - } - - *key_ptr(parent, c->index) = center->keys[0]; - *key_ptr(parent, r->index) = right->keys[0]; -} - -static void __rebalance3(struct dm_btree_info *info, struct node *parent, - struct child *l, struct child *c, struct child *r) -{ - struct node *left = l->n; - struct node *center = c->n; - struct node *right = r->n; - - uint32_t nr_left = le32_to_cpu(left->header.nr_entries); - uint32_t nr_center = le32_to_cpu(center->header.nr_entries); - uint32_t nr_right = le32_to_cpu(right->header.nr_entries); - - unsigned threshold = merge_threshold(left) * 4 + 1; - - BUG_ON(left->header.max_entries != center->header.max_entries); - BUG_ON(center->header.max_entries != right->header.max_entries); - - if ((nr_left + nr_center + nr_right) < threshold) - delete_center_node(info, parent, l, c, r, left, center, right, - nr_left, nr_center, nr_right); - else - redistribute3(info, parent, l, c, r, left, center, right, - nr_left, nr_center, nr_right); -} - -static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) -{ - int r; - struct node *parent = dm_block_data(shadow_current(s)); - struct child left, center, right; - - /* - * FIXME: fill out an array? 
- */ - r = init_child(info, parent, left_index, &left); - if (r) - return r; - - r = init_child(info, parent, left_index + 1, ¢er); - if (r) { - exit_child(info, &left); - return r; - } - - r = init_child(info, parent, left_index + 2, &right); - if (r) { - exit_child(info, &left); - exit_child(info, ¢er); - return r; - } - - __rebalance3(info, parent, &left, ¢er, &right); - - r = exit_child(info, &left); - if (r) { - exit_child(info, ¢er); - exit_child(info, &right); - return r; - } - - r = exit_child(info, ¢er); - if (r) { - exit_child(info, &right); - return r; - } - - r = exit_child(info, &right); - if (r) - return r; - - return 0; -} - -static int get_nr_entries(struct dm_transaction_manager *tm, - dm_block_t b, uint32_t *result) -{ - int r; - struct dm_block *block; - struct node *n; - - r = dm_tm_read_lock(tm, b, &btree_node_validator, &block); - if (r) - return r; - - n = dm_block_data(block); - *result = le32_to_cpu(n->header.nr_entries); - - return dm_tm_unlock(tm, block); -} - -static int rebalance_children(struct shadow_spine *s, - struct dm_btree_info *info, uint64_t key) -{ - int i, r, has_left_sibling, has_right_sibling; - uint32_t child_entries; - struct node *n; - - n = dm_block_data(shadow_current(s)); - - if (le32_to_cpu(n->header.nr_entries) == 1) { - struct dm_block *child; - dm_block_t b = value64(n, 0); - - r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); - if (r) - return r; - - memcpy(n, dm_block_data(child), - dm_bm_block_size(dm_tm_get_bm(info->tm))); - r = dm_tm_unlock(info->tm, child); - if (r) - return r; - - dm_tm_dec(info->tm, dm_block_location(child)); - return 0; - } - - i = lower_bound(n, key); - if (i < 0) - return -ENODATA; - - r = get_nr_entries(info->tm, value64(n, i), &child_entries); - if (r) - return r; - - has_left_sibling = i > 0; - has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); - - if (!has_left_sibling) - r = rebalance2(s, info, i); - - else if (!has_right_sibling) - r = rebalance2(s, info, i - 1); - - else - r = rebalance3(s, info, i - 1); - - return r; -} - -static int do_leaf(struct node *n, uint64_t key, unsigned *index) -{ - int i = lower_bound(n, key); - - if ((i < 0) || - (i >= le32_to_cpu(n->header.nr_entries)) || - (le64_to_cpu(n->keys[i]) != key)) - return -ENODATA; - - *index = i; - - return 0; -} - -/* - * Prepares for removal from one level of the hierarchy. The caller must - * call delete_at() to remove the entry at index. - */ -static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, - struct dm_btree_value_type *vt, dm_block_t root, - uint64_t key, unsigned *index) -{ - int i = *index, r; - struct node *n; - - for (;;) { - r = shadow_step(s, root, vt); - if (r < 0) - break; - - /* - * We have to patch up the parent node, ugly, but I don't - * see a way to do this automatically as part of the spine - * op. 
- */ - if (shadow_has_parent(s)) { - __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); - memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), - &location, sizeof(__le64)); - } - - n = dm_block_data(shadow_current(s)); - - if (le32_to_cpu(n->header.flags) & LEAF_NODE) - return do_leaf(n, key, index); - - r = rebalance_children(s, info, key); - if (r) - break; - - n = dm_block_data(shadow_current(s)); - if (le32_to_cpu(n->header.flags) & LEAF_NODE) - return do_leaf(n, key, index); - - i = lower_bound(n, key); - - /* - * We know the key is present, or else - * rebalance_children would have returned - * -ENODATA - */ - root = value64(n, i); - } - - return r; -} - -int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, dm_block_t *new_root) -{ - unsigned level, last_level = info->levels - 1; - int index = 0, r = 0; - struct shadow_spine spine; - struct node *n; - - init_shadow_spine(&spine, info); - for (level = 0; level < info->levels; level++) { - r = remove_raw(&spine, info, - (level == last_level ? - &info->value_type : &le64_type), - root, keys[level], (unsigned *)&index); - if (r < 0) - break; - - n = dm_block_data(shadow_current(&spine)); - if (level != last_level) { - root = value64(n, index); - continue; - } - - BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); - - if (info->value_type.dec) - info->value_type.dec(info->value_type.context, - value_ptr(n, index)); - - delete_at(n, index); - } - - *new_root = shadow_root(&spine); - exit_shadow_spine(&spine); - - return r; -} -EXPORT_SYMBOL_GPL(dm_btree_remove); diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-spine.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-spine.c deleted file mode 100644 index d9a7912e..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree-spine.c +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. 
- */ - -#include "dm-btree-internal.h" -#include "dm-transaction-manager.h" - -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "btree spine" - -/*----------------------------------------------------------------*/ - -#define BTREE_CSUM_XOR 121107 - -static int node_check(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size); - -static void node_prepare_for_write(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct node *n = dm_block_data(b); - struct node_header *h = &n->header; - - h->blocknr = cpu_to_le64(dm_block_location(b)); - h->csum = cpu_to_le32(dm_bm_checksum(&h->flags, - block_size - sizeof(__le32), - BTREE_CSUM_XOR)); - - BUG_ON(node_check(v, b, 4096)); -} - -static int node_check(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct node *n = dm_block_data(b); - struct node_header *h = &n->header; - size_t value_size; - __le32 csum_disk; - uint32_t flags; - - if (dm_block_location(b) != le64_to_cpu(h->blocknr)) { - DMERR("node_check failed blocknr %llu wanted %llu", - le64_to_cpu(h->blocknr), dm_block_location(b)); - return -ENOTBLK; - } - - csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags, - block_size - sizeof(__le32), - BTREE_CSUM_XOR)); - if (csum_disk != h->csum) { - DMERR("node_check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); - return -EILSEQ; - } - - value_size = le32_to_cpu(h->value_size); - - if (sizeof(struct node_header) + - (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) { - DMERR("node_check failed: max_entries too large"); - return -EILSEQ; - } - - if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) { - DMERR("node_check failed, too many entries"); - return -EILSEQ; - } - - /* - * The node must be either INTERNAL or LEAF. 
- */ - flags = le32_to_cpu(h->flags); - if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) { - DMERR("node_check failed, node is neither INTERNAL or LEAF"); - return -EILSEQ; - } - - return 0; -} - -struct dm_block_validator btree_node_validator = { - .name = "btree_node", - .prepare_for_write = node_prepare_for_write, - .check = node_check -}; - -/*----------------------------------------------------------------*/ - -static int bn_read_lock(struct dm_btree_info *info, dm_block_t b, - struct dm_block **result) -{ - return dm_tm_read_lock(info->tm, b, &btree_node_validator, result); -} - -static int bn_shadow(struct dm_btree_info *info, dm_block_t orig, - struct dm_btree_value_type *vt, - struct dm_block **result) -{ - int r, inc; - - r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator, - result, &inc); - if (!r && inc) - inc_children(info->tm, dm_block_data(*result), vt); - - return r; -} - -int new_block(struct dm_btree_info *info, struct dm_block **result) -{ - return dm_tm_new_block(info->tm, &btree_node_validator, result); -} - -int unlock_block(struct dm_btree_info *info, struct dm_block *b) -{ - return dm_tm_unlock(info->tm, b); -} - -/*----------------------------------------------------------------*/ - -void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info) -{ - s->info = info; - s->count = 0; - s->nodes[0] = NULL; - s->nodes[1] = NULL; -} - -int exit_ro_spine(struct ro_spine *s) -{ - int r = 0, i; - - for (i = 0; i < s->count; i++) { - int r2 = unlock_block(s->info, s->nodes[i]); - if (r2 < 0) - r = r2; - } - - return r; -} - -int ro_step(struct ro_spine *s, dm_block_t new_child) -{ - int r; - - if (s->count == 2) { - r = unlock_block(s->info, s->nodes[0]); - if (r < 0) - return r; - s->nodes[0] = s->nodes[1]; - s->count--; - } - - r = bn_read_lock(s->info, new_child, s->nodes + s->count); - if (!r) - s->count++; - - return r; -} - -struct node *ro_node(struct ro_spine *s) -{ - struct dm_block *block; - - BUG_ON(!s->count); - block = s->nodes[s->count - 1]; - - return dm_block_data(block); -} - -/*----------------------------------------------------------------*/ - -void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info) -{ - s->info = info; - s->count = 0; -} - -int exit_shadow_spine(struct shadow_spine *s) -{ - int r = 0, i; - - for (i = 0; i < s->count; i++) { - int r2 = unlock_block(s->info, s->nodes[i]); - if (r2 < 0) - r = r2; - } - - return r; -} - -int shadow_step(struct shadow_spine *s, dm_block_t b, - struct dm_btree_value_type *vt) -{ - int r; - - if (s->count == 2) { - r = unlock_block(s->info, s->nodes[0]); - if (r < 0) - return r; - s->nodes[0] = s->nodes[1]; - s->count--; - } - - r = bn_shadow(s->info, b, vt, s->nodes + s->count); - if (!r) { - if (!s->count) - s->root = dm_block_location(s->nodes[0]); - - s->count++; - } - - return r; -} - -struct dm_block *shadow_current(struct shadow_spine *s) -{ - BUG_ON(!s->count); - - return s->nodes[s->count - 1]; -} - -struct dm_block *shadow_parent(struct shadow_spine *s) -{ - BUG_ON(s->count != 2); - - return s->count == 2 ? 
s->nodes[0] : NULL; -} - -int shadow_has_parent(struct shadow_spine *s) -{ - return s->count >= 2; -} - -int shadow_root(struct shadow_spine *s) -{ - return s->root; -} diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree.c deleted file mode 100644 index d12b2cc5..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree.c +++ /dev/null @@ -1,804 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#include "dm-btree-internal.h" -#include "dm-space-map.h" -#include "dm-transaction-manager.h" - -#include <linux/export.h> -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "btree" - -/*---------------------------------------------------------------- - * Array manipulation - *--------------------------------------------------------------*/ -static void memcpy_disk(void *dest, const void *src, size_t len) - __dm_written_to_disk(src) -{ - memcpy(dest, src, len); - __dm_unbless_for_disk(src); -} - -static void array_insert(void *base, size_t elt_size, unsigned nr_elts, - unsigned index, void *elt) - __dm_written_to_disk(elt) -{ - if (index < nr_elts) - memmove(base + (elt_size * (index + 1)), - base + (elt_size * index), - (nr_elts - index) * elt_size); - - memcpy_disk(base + (elt_size * index), elt, elt_size); -} - -/*----------------------------------------------------------------*/ - -/* makes the assumption that no two keys are the same. */ -static int bsearch(struct node *n, uint64_t key, int want_hi) -{ - int lo = -1, hi = le32_to_cpu(n->header.nr_entries); - - while (hi - lo > 1) { - int mid = lo + ((hi - lo) / 2); - uint64_t mid_key = le64_to_cpu(n->keys[mid]); - - if (mid_key == key) - return mid; - - if (mid_key < key) - lo = mid; - else - hi = mid; - } - - return want_hi ? hi : lo; -} - -int lower_bound(struct node *n, uint64_t key) -{ - return bsearch(n, key, 0); -} - -void inc_children(struct dm_transaction_manager *tm, struct node *n, - struct dm_btree_value_type *vt) -{ - unsigned i; - uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); - - if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) - for (i = 0; i < nr_entries; i++) - dm_tm_inc(tm, value64(n, i)); - else if (vt->inc) - for (i = 0; i < nr_entries; i++) - vt->inc(vt->context, value_ptr(n, i)); -} - -static int insert_at(size_t value_size, struct node *node, unsigned index, - uint64_t key, void *value) - __dm_written_to_disk(value) -{ - uint32_t nr_entries = le32_to_cpu(node->header.nr_entries); - __le64 key_le = cpu_to_le64(key); - - if (index > nr_entries || - index >= le32_to_cpu(node->header.max_entries)) { - DMERR("too many entries in btree node for insert"); - __dm_unbless_for_disk(value); - return -ENOMEM; - } - - __dm_bless_for_disk(&key_le); - - array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le); - array_insert(value_base(node), value_size, nr_entries, index, value); - node->header.nr_entries = cpu_to_le32(nr_entries + 1); - - return 0; -} - -/*----------------------------------------------------------------*/ - -/* - * We want 3n entries (for some n). This works more nicely for repeated - * insert remove loops than (2n + 1). 
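[Editor's worked example: for a 4096-byte block and 8-byte values, sizeof(struct node_header) is 32, leaving 4064 bytes; each entry costs an 8-byte key plus an 8-byte value, so 254 entries fit, and rounding down to a multiple of 3 gives max_entries = 3 * 84 = 252.]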
- */ -static uint32_t calc_max_entries(size_t value_size, size_t block_size) -{ - uint32_t total, n; - size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */ - - block_size -= sizeof(struct node_header); - total = block_size / elt_size; - n = total / 3; /* rounds down */ - - return 3 * n; -} - -int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) -{ - int r; - struct dm_block *b; - struct node *n; - size_t block_size; - uint32_t max_entries; - - r = new_block(info, &b); - if (r < 0) - return r; - - block_size = dm_bm_block_size(dm_tm_get_bm(info->tm)); - max_entries = calc_max_entries(info->value_type.size, block_size); - - n = dm_block_data(b); - memset(n, 0, block_size); - n->header.flags = cpu_to_le32(LEAF_NODE); - n->header.nr_entries = cpu_to_le32(0); - n->header.max_entries = cpu_to_le32(max_entries); - n->header.value_size = cpu_to_le32(info->value_type.size); - - *root = dm_block_location(b); - return unlock_block(info, b); -} -EXPORT_SYMBOL_GPL(dm_btree_empty); - -/*----------------------------------------------------------------*/ - -/* - * Deletion uses a recursive algorithm, since we have limited stack space - * we explicitly manage our own stack on the heap. - */ -#define MAX_SPINE_DEPTH 64 -struct frame { - struct dm_block *b; - struct node *n; - unsigned level; - unsigned nr_children; - unsigned current_child; -}; - -struct del_stack { - struct dm_transaction_manager *tm; - int top; - struct frame spine[MAX_SPINE_DEPTH]; -}; - -static int top_frame(struct del_stack *s, struct frame **f) -{ - if (s->top < 0) { - DMERR("btree deletion stack empty"); - return -EINVAL; - } - - *f = s->spine + s->top; - - return 0; -} - -static int unprocessed_frames(struct del_stack *s) -{ - return s->top >= 0; -} - -static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) -{ - int r; - uint32_t ref_count; - - if (s->top >= MAX_SPINE_DEPTH - 1) { - DMERR("btree deletion stack out of memory"); - return -ENOMEM; - } - - r = dm_tm_ref(s->tm, b, &ref_count); - if (r) - return r; - - if (ref_count > 1) - /* - * This is a shared node, so we can just decrement it's - * reference counter and leave the children. 
- */ - dm_tm_dec(s->tm, b); - - else { - struct frame *f = s->spine + ++s->top; - - r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); - if (r) { - s->top--; - return r; - } - - f->n = dm_block_data(f->b); - f->level = level; - f->nr_children = le32_to_cpu(f->n->header.nr_entries); - f->current_child = 0; - } - - return 0; -} - -static void pop_frame(struct del_stack *s) -{ - struct frame *f = s->spine + s->top--; - - dm_tm_dec(s->tm, dm_block_location(f->b)); - dm_tm_unlock(s->tm, f->b); -} - -int dm_btree_del(struct dm_btree_info *info, dm_block_t root) -{ - int r; - struct del_stack *s; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - s->tm = info->tm; - s->top = -1; - - r = push_frame(s, root, 1); - if (r) - goto out; - - while (unprocessed_frames(s)) { - uint32_t flags; - struct frame *f; - dm_block_t b; - - r = top_frame(s, &f); - if (r) - goto out; - - if (f->current_child >= f->nr_children) { - pop_frame(s); - continue; - } - - flags = le32_to_cpu(f->n->header.flags); - if (flags & INTERNAL_NODE) { - b = value64(f->n, f->current_child); - f->current_child++; - r = push_frame(s, b, f->level); - if (r) - goto out; - - } else if (f->level != (info->levels - 1)) { - b = value64(f->n, f->current_child); - f->current_child++; - r = push_frame(s, b, f->level + 1); - if (r) - goto out; - - } else { - if (info->value_type.dec) { - unsigned i; - - for (i = 0; i < f->nr_children; i++) - info->value_type.dec(info->value_type.context, - value_ptr(f->n, i)); - } - f->current_child = f->nr_children; - } - } - -out: - kfree(s); - return r; -} -EXPORT_SYMBOL_GPL(dm_btree_del); - -/*----------------------------------------------------------------*/ - -static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, - int (*search_fn)(struct node *, uint64_t), - uint64_t *result_key, void *v, size_t value_size) -{ - int i, r; - uint32_t flags, nr_entries; - - do { - r = ro_step(s, block); - if (r < 0) - return r; - - i = search_fn(ro_node(s), key); - - flags = le32_to_cpu(ro_node(s)->header.flags); - nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries); - if (i < 0 || i >= nr_entries) - return -ENODATA; - - if (flags & INTERNAL_NODE) - block = value64(ro_node(s), i); - - } while (!(flags & LEAF_NODE)); - - *result_key = le64_to_cpu(ro_node(s)->keys[i]); - memcpy(v, value_ptr(ro_node(s), i), value_size); - - return 0; -} - -int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, void *value_le) -{ - unsigned level, last_level = info->levels - 1; - int r = -ENODATA; - uint64_t rkey; - __le64 internal_value_le; - struct ro_spine spine; - - init_ro_spine(&spine, info); - for (level = 0; level < info->levels; level++) { - size_t size; - void *value_p; - - if (level == last_level) { - value_p = value_le; - size = info->value_type.size; - - } else { - value_p = &internal_value_le; - size = sizeof(uint64_t); - } - - r = btree_lookup_raw(&spine, root, keys[level], - lower_bound, &rkey, - value_p, size); - - if (!r) { - if (rkey != keys[level]) { - exit_ro_spine(&spine); - return -ENODATA; - } - } else { - exit_ro_spine(&spine); - return r; - } - - root = le64_to_cpu(internal_value_le); - } - exit_ro_spine(&spine); - - return r; -} -EXPORT_SYMBOL_GPL(dm_btree_lookup); - -/* - * Splits a node by creating a sibling node and shifting half the nodes - * contents across. Assumes there is a parent node, and it has room for - * another child. 
- * - * Before: - * +--------+ - * | Parent | - * +--------+ - * | - * v - * +----------+ - * | A ++++++ | - * +----------+ - * - * - * After: - * +--------+ - * | Parent | - * +--------+ - * | | - * v +------+ - * +---------+ | - * | A* +++ | v - * +---------+ +-------+ - * | B +++ | - * +-------+ - * - * Where A* is a shadow of A. - */ -static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, - unsigned parent_index, uint64_t key) -{ - int r; - size_t size; - unsigned nr_left, nr_right; - struct dm_block *left, *right, *parent; - struct node *ln, *rn, *pn; - __le64 location; - - left = shadow_current(s); - - r = new_block(s->info, &right); - if (r < 0) - return r; - - ln = dm_block_data(left); - rn = dm_block_data(right); - - nr_left = le32_to_cpu(ln->header.nr_entries) / 2; - nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left; - - ln->header.nr_entries = cpu_to_le32(nr_left); - - rn->header.flags = ln->header.flags; - rn->header.nr_entries = cpu_to_le32(nr_right); - rn->header.max_entries = ln->header.max_entries; - rn->header.value_size = ln->header.value_size; - memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0])); - - size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? - sizeof(uint64_t) : s->info->value_type.size; - memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), - size * nr_right); - - /* - * Patch up the parent - */ - parent = shadow_parent(s); - - pn = dm_block_data(parent); - location = cpu_to_le64(dm_block_location(left)); - __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(pn, parent_index), - &location, sizeof(__le64)); - - location = cpu_to_le64(dm_block_location(right)); - __dm_bless_for_disk(&location); - - r = insert_at(sizeof(__le64), pn, parent_index + 1, - le64_to_cpu(rn->keys[0]), &location); - if (r) - return r; - - if (key < le64_to_cpu(rn->keys[0])) { - unlock_block(s->info, right); - s->nodes[1] = left; - } else { - unlock_block(s->info, left); - s->nodes[1] = right; - } - - return 0; -} - -/* - * Splits a node by creating two new children beneath the given node. - * - * Before: - * +----------+ - * | A ++++++ | - * +----------+ - * - * - * After: - * +------------+ - * | A (shadow) | - * +------------+ - * | | - * +------+ +----+ - * | | - * v v - * +-------+ +-------+ - * | B +++ | | C +++ | - * +-------+ +-------+ - */ -static int btree_split_beneath(struct shadow_spine *s, uint64_t key) -{ - int r; - size_t size; - unsigned nr_left, nr_right; - struct dm_block *left, *right, *new_parent; - struct node *pn, *ln, *rn; - __le64 val; - - new_parent = shadow_current(s); - - r = new_block(s->info, &left); - if (r < 0) - return r; - - r = new_block(s->info, &right); - if (r < 0) { - /* FIXME: put left */ - return r; - } - - pn = dm_block_data(new_parent); - ln = dm_block_data(left); - rn = dm_block_data(right); - - nr_left = le32_to_cpu(pn->header.nr_entries) / 2; - nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left; - - ln->header.flags = pn->header.flags; - ln->header.nr_entries = cpu_to_le32(nr_left); - ln->header.max_entries = pn->header.max_entries; - ln->header.value_size = pn->header.value_size; - - rn->header.flags = pn->header.flags; - rn->header.nr_entries = cpu_to_le32(nr_right); - rn->header.max_entries = pn->header.max_entries; - rn->header.value_size = pn->header.value_size; - - memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); - memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0])); - - size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 
- sizeof(__le64) : s->info->value_type.size; - memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); - memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), - nr_right * size); - - /* new_parent should just point to l and r now */ - pn->header.flags = cpu_to_le32(INTERNAL_NODE); - pn->header.nr_entries = cpu_to_le32(2); - pn->header.max_entries = cpu_to_le32( - calc_max_entries(sizeof(__le64), - dm_bm_block_size( - dm_tm_get_bm(s->info->tm)))); - pn->header.value_size = cpu_to_le32(sizeof(__le64)); - - val = cpu_to_le64(dm_block_location(left)); - __dm_bless_for_disk(&val); - pn->keys[0] = ln->keys[0]; - memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); - - val = cpu_to_le64(dm_block_location(right)); - __dm_bless_for_disk(&val); - pn->keys[1] = rn->keys[0]; - memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); - - /* - * rejig the spine. This is ugly, since it knows too - * much about the spine - */ - if (s->nodes[0] != new_parent) { - unlock_block(s->info, s->nodes[0]); - s->nodes[0] = new_parent; - } - if (key < le64_to_cpu(rn->keys[0])) { - unlock_block(s->info, right); - s->nodes[1] = left; - } else { - unlock_block(s->info, left); - s->nodes[1] = right; - } - s->count = 2; - - return 0; -} - -static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, - struct dm_btree_value_type *vt, - uint64_t key, unsigned *index) -{ - int r, i = *index, top = 1; - struct node *node; - - for (;;) { - r = shadow_step(s, root, vt); - if (r < 0) - return r; - - node = dm_block_data(shadow_current(s)); - - /* - * We have to patch up the parent node, ugly, but I don't - * see a way to do this automatically as part of the spine - * op. - */ - if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */ - __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); - - __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), - &location, sizeof(__le64)); - } - - node = dm_block_data(shadow_current(s)); - - if (node->header.nr_entries == node->header.max_entries) { - if (top) - r = btree_split_beneath(s, key); - else - r = btree_split_sibling(s, root, i, key); - - if (r < 0) - return r; - } - - node = dm_block_data(shadow_current(s)); - - i = lower_bound(node, key); - - if (le32_to_cpu(node->header.flags) & LEAF_NODE) - break; - - if (i < 0) { - /* change the bounds on the lowest key */ - node->keys[0] = cpu_to_le64(key); - i = 0; - } - - root = value64(node, i); - top = 0; - } - - if (i < 0 || le64_to_cpu(node->keys[i]) != key) - i++; - - *index = i; - return 0; -} - -static int insert(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, void *value, dm_block_t *new_root, - int *inserted) - __dm_written_to_disk(value) -{ - int r, need_insert; - unsigned level, index = -1, last_level = info->levels - 1; - dm_block_t block = root; - struct shadow_spine spine; - struct node *n; - struct dm_btree_value_type le64_type; - - le64_type.context = NULL; - le64_type.size = sizeof(__le64); - le64_type.inc = NULL; - le64_type.dec = NULL; - le64_type.equal = NULL; - - init_shadow_spine(&spine, info); - - for (level = 0; level < (info->levels - 1); level++) { - r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index); - if (r < 0) - goto bad; - - n = dm_block_data(shadow_current(&spine)); - need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) || - (le64_to_cpu(n->keys[index]) != keys[level])); - - if (need_insert) { - dm_block_t new_tree; - __le64 new_le; - - r = dm_btree_empty(info, &new_tree); - if (r < 0) - goto bad; - 
- new_le = cpu_to_le64(new_tree); - __dm_bless_for_disk(&new_le); - - r = insert_at(sizeof(uint64_t), n, index, - keys[level], &new_le); - if (r) - goto bad; - } - - if (level < last_level) - block = value64(n, index); - } - - r = btree_insert_raw(&spine, block, &info->value_type, - keys[level], &index); - if (r < 0) - goto bad; - - n = dm_block_data(shadow_current(&spine)); - need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) || - (le64_to_cpu(n->keys[index]) != keys[level])); - - if (need_insert) { - if (inserted) - *inserted = 1; - - r = insert_at(info->value_type.size, n, index, - keys[level], value); - if (r) - goto bad_unblessed; - } else { - if (inserted) - *inserted = 0; - - if (info->value_type.dec && - (!info->value_type.equal || - !info->value_type.equal( - info->value_type.context, - value_ptr(n, index), - value))) { - info->value_type.dec(info->value_type.context, - value_ptr(n, index)); - } - memcpy_disk(value_ptr(n, index), - value, info->value_type.size); - } - - *new_root = shadow_root(&spine); - exit_shadow_spine(&spine); - - return 0; - -bad: - __dm_unbless_for_disk(value); -bad_unblessed: - exit_shadow_spine(&spine); - return r; -} - -int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, void *value, dm_block_t *new_root) - __dm_written_to_disk(value) -{ - return insert(info, root, keys, value, new_root, NULL); -} -EXPORT_SYMBOL_GPL(dm_btree_insert); - -int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, void *value, dm_block_t *new_root, - int *inserted) - __dm_written_to_disk(value) -{ - return insert(info, root, keys, value, new_root, inserted); -} -EXPORT_SYMBOL_GPL(dm_btree_insert_notify); - -/*----------------------------------------------------------------*/ - -static int find_highest_key(struct ro_spine *s, dm_block_t block, - uint64_t *result_key, dm_block_t *next_block) -{ - int i, r; - uint32_t flags; - - do { - r = ro_step(s, block); - if (r < 0) - return r; - - flags = le32_to_cpu(ro_node(s)->header.flags); - i = le32_to_cpu(ro_node(s)->header.nr_entries); - if (!i) - return -ENODATA; - else - i--; - - *result_key = le64_to_cpu(ro_node(s)->keys[i]); - if (next_block || flags & INTERNAL_NODE) - block = value64(ro_node(s), i); - - } while (flags & INTERNAL_NODE); - - if (next_block) - *next_block = block; - return 0; -} - -int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, - uint64_t *result_keys) -{ - int r = 0, count = 0, level; - struct ro_spine spine; - - init_ro_spine(&spine, info); - for (level = 0; level < info->levels; level++) { - r = find_highest_key(&spine, root, result_keys + level, - level == info->levels - 1 ? NULL : &root); - if (r == -ENODATA) { - r = 0; - break; - - } else if (r) - break; - - count++; - } - exit_ro_spine(&spine); - - return r ? r : count; -} -EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree.h deleted file mode 100644 index ae02c844..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-btree.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ -#ifndef _LINUX_DM_BTREE_H -#define _LINUX_DM_BTREE_H - -#include "dm-block-manager.h" - -struct dm_transaction_manager; - -/*----------------------------------------------------------------*/ - -/* - * Annotations used to check on-disk metadata is handled as little-endian. 
- */ -#ifdef __CHECKER__ -# define __dm_written_to_disk(x) __releases(x) -# define __dm_reads_from_disk(x) __acquires(x) -# define __dm_bless_for_disk(x) __acquire(x) -# define __dm_unbless_for_disk(x) __release(x) -#else -# define __dm_written_to_disk(x) -# define __dm_reads_from_disk(x) -# define __dm_bless_for_disk(x) -# define __dm_unbless_for_disk(x) -#endif - -/*----------------------------------------------------------------*/ - -/* - * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized - * values. - */ - -/* - * Infomation about the values stored within the btree. - */ -struct dm_btree_value_type { - void *context; - - /* - * The size in bytes of each value. - */ - uint32_t size; - - /* - * Any of these methods can be safely set to NULL if you do not - * need the corresponding feature. - */ - - /* - * The btree is making a duplicate of the value, for instance - * because previously-shared btree nodes have now diverged. - * @value argument is the new copy that the copy function may modify. - * (Probably it just wants to increment a reference count - * somewhere.) This method is _not_ called for insertion of a new - * value: It is assumed the ref count is already 1. - */ - void (*inc)(void *context, void *value); - - /* - * This value is being deleted. The btree takes care of freeing - * the memory pointed to by @value. Often the del function just - * needs to decrement a reference count somewhere. - */ - void (*dec)(void *context, void *value); - - /* - * A test for equality between two values. When a value is - * overwritten with a new one, the old one has the dec method - * called _unless_ the new and old value are deemed equal. - */ - int (*equal)(void *context, void *value1, void *value2); -}; - -/* - * The shape and contents of a btree. - */ -struct dm_btree_info { - struct dm_transaction_manager *tm; - - /* - * Number of nested btrees. (Not the depth of a single tree.) - */ - unsigned levels; - struct dm_btree_value_type value_type; -}; - -/* - * Set up an empty tree. O(1). - */ -int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root); - -/* - * Delete a tree. O(n) - this is the slow one! It can also block, so - * please don't call it on an IO path. - */ -int dm_btree_del(struct dm_btree_info *info, dm_block_t root); - -/* - * All the lookup functions return -ENODATA if the key cannot be found. - */ - -/* - * Tries to find a key that matches exactly. O(ln(n)) - */ -int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, void *value_le); - -/* - * Insertion (or overwrite an existing value). O(ln(n)) - */ -int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, void *value, dm_block_t *new_root) - __dm_written_to_disk(value); - -/* - * A variant of insert that indicates whether it actually inserted or just - * overwrote. Useful if you're keeping track of the number of entries in a - * tree. - */ -int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, void *value, dm_block_t *new_root, - int *inserted) - __dm_written_to_disk(value); - -/* - * Remove a key if present. This doesn't remove empty sub trees. Normally - * subtrees represent a separate entity, like a snapshot map, so this is - * correct behaviour. O(ln(n)). - */ -int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, - uint64_t *keys, dm_block_t *new_root); - -/* - * Returns < 0 on failure. Otherwise the number of key entries that have - * been filled out. 
Remember trees can have zero entries, and as such have - * no highest key. - */ -int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, - uint64_t *result_keys); - -#endif /* _LINUX_DM_BTREE_H */ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-persistent-data-internal.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-persistent-data-internal.h deleted file mode 100644 index c49e26ff..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-persistent-data-internal.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef _DM_PERSISTENT_DATA_INTERNAL_H -#define _DM_PERSISTENT_DATA_INTERNAL_H - -#include "dm-block-manager.h" - -static inline unsigned dm_hash_block(dm_block_t b, unsigned hash_mask) -{ - const unsigned BIG_PRIME = 4294967291UL; - - return (((unsigned) b) * BIG_PRIME) & hash_mask; -} - -#endif /* _PERSISTENT_DATA_INTERNAL_H */ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-checker.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-checker.c deleted file mode 100644 index fc90c116..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-checker.c +++ /dev/null @@ -1,446 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#include "dm-space-map-checker.h" - -#include <linux/device-mapper.h> -#include <linux/export.h> -#include <linux/vmalloc.h> - -#ifdef CONFIG_DM_DEBUG_SPACE_MAPS - -#define DM_MSG_PREFIX "space map checker" - -/*----------------------------------------------------------------*/ - -struct count_array { - dm_block_t nr; - dm_block_t nr_free; - - uint32_t *counts; -}; - -static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count) -{ - if (b >= ca->nr) - return -EINVAL; - - *count = ca->counts[b]; - return 0; -} - -static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r) -{ - if (b >= ca->nr) - return -EINVAL; - - *r = ca->counts[b] > 1; - return 0; -} - -static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count) -{ - uint32_t old_count; - - if (b >= ca->nr) - return -EINVAL; - - old_count = ca->counts[b]; - - if (!count && old_count) - ca->nr_free++; - - else if (count && !old_count) - ca->nr_free--; - - ca->counts[b] = count; - return 0; -} - -static int ca_inc_block(struct count_array *ca, dm_block_t b) -{ - if (b >= ca->nr) - return -EINVAL; - - ca_set_count(ca, b, ca->counts[b] + 1); - return 0; -} - -static int ca_dec_block(struct count_array *ca, dm_block_t b) -{ - if (b >= ca->nr) - return -EINVAL; - - BUG_ON(ca->counts[b] == 0); - ca_set_count(ca, b, ca->counts[b] - 1); - return 0; -} - -static int ca_create(struct count_array *ca, struct dm_space_map *sm) -{ - int r; - dm_block_t nr_blocks; - - r = dm_sm_get_nr_blocks(sm, &nr_blocks); - if (r) - return r; - - ca->nr = nr_blocks; - ca->nr_free = nr_blocks; - - if (!nr_blocks) - ca->counts = NULL; - else { - ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks); - if (!ca->counts) - return -ENOMEM; - } - - return 0; -} - -static void ca_destroy(struct count_array *ca) -{ - vfree(ca->counts); -} - -static int ca_load(struct count_array *ca, struct dm_space_map *sm) -{ - int r; - uint32_t count; - dm_block_t nr_blocks, i; - - r = dm_sm_get_nr_blocks(sm, &nr_blocks); - if (r) - return r; - - BUG_ON(ca->nr != nr_blocks); - - DMWARN("Loading debug space map from disk. 
This may take some time"); - for (i = 0; i < nr_blocks; i++) { - r = dm_sm_get_count(sm, i, &count); - if (r) { - DMERR("load failed"); - return r; - } - - ca_set_count(ca, i, count); - } - DMWARN("Load complete"); - - return 0; -} - -static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) -{ - dm_block_t nr_blocks = ca->nr + extra_blocks; - uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks); - if (!counts) - return -ENOMEM; - - if (ca->counts) { - memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); - ca_destroy(ca); - } - ca->nr = nr_blocks; - ca->nr_free += extra_blocks; - ca->counts = counts; - return 0; -} - -static int ca_commit(struct count_array *old, struct count_array *new) -{ - if (old->nr != new->nr) { - BUG_ON(old->nr > new->nr); - ca_extend(old, new->nr - old->nr); - } - - BUG_ON(old->nr != new->nr); - old->nr_free = new->nr_free; - memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr); - return 0; -} - -/*----------------------------------------------------------------*/ - -struct sm_checker { - struct dm_space_map sm; - - struct count_array old_counts; - struct count_array counts; - - struct dm_space_map *real_sm; -}; - -static void sm_checker_destroy(struct dm_space_map *sm) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - - dm_sm_destroy(smc->real_sm); - ca_destroy(&smc->old_counts); - ca_destroy(&smc->counts); - kfree(smc); -} - -static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_get_nr_blocks(smc->real_sm, count); - if (!r) - BUG_ON(smc->old_counts.nr != *count); - return r; -} - -static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_get_nr_free(smc->real_sm, count); - if (!r) { - /* - * Slow, but we know it's correct. 
- */ - dm_block_t b, n = 0; - for (b = 0; b < smc->old_counts.nr; b++) - if (smc->old_counts.counts[b] == 0 && - smc->counts.counts[b] == 0) - n++; - - if (n != *count) - DMERR("free block counts differ, checker %u, sm-disk:%u", - (unsigned) n, (unsigned) *count); - } - return r; -} - -static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_new_block(smc->real_sm, b); - - if (!r) { - BUG_ON(*b >= smc->old_counts.nr); - BUG_ON(smc->old_counts.counts[*b] != 0); - BUG_ON(*b >= smc->counts.nr); - BUG_ON(smc->counts.counts[*b] != 0); - ca_set_count(&smc->counts, *b, 1); - } - - return r; -} - -static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_inc_block(smc->real_sm, b); - int r2 = ca_inc_block(&smc->counts, b); - BUG_ON(r != r2); - return r; -} - -static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_dec_block(smc->real_sm, b); - int r2 = ca_dec_block(&smc->counts, b); - BUG_ON(r != r2); - return r; -} - -static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - uint32_t result2 = 0; - int r = dm_sm_get_count(smc->real_sm, b, result); - int r2 = ca_get_count(&smc->counts, b, &result2); - - BUG_ON(r != r2); - if (!r) - BUG_ON(*result != result2); - return r; -} - -static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int result2 = 0; - int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result); - int r2 = ca_count_more_than_one(&smc->counts, b, &result2); - - BUG_ON(r != r2); - if (!r) - BUG_ON(!(*result) && result2); - return r; -} - -static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - uint32_t old_rc; - int r = dm_sm_set_count(smc->real_sm, b, count); - int r2; - - BUG_ON(b >= smc->counts.nr); - old_rc = smc->counts.counts[b]; - r2 = ca_set_count(&smc->counts, b, count); - BUG_ON(r != r2); - - return r; -} - -static int sm_checker_commit(struct dm_space_map *sm) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r; - - r = dm_sm_commit(smc->real_sm); - if (r) - return r; - - r = ca_commit(&smc->old_counts, &smc->counts); - if (r) - return r; - - return 0; -} - -static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_extend(smc->real_sm, extra_blocks); - if (r) - return r; - - return ca_extend(&smc->counts, extra_blocks); -} - -static int sm_checker_root_size(struct dm_space_map *sm, size_t *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - return dm_sm_root_size(smc->real_sm, result); -} - -static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len); -} - -/*----------------------------------------------------------------*/ - -static struct dm_space_map ops_ = { - .destroy = sm_checker_destroy, - .get_nr_blocks = sm_checker_get_nr_blocks, 
- .get_nr_free = sm_checker_get_nr_free, - .inc_block = sm_checker_inc_block, - .dec_block = sm_checker_dec_block, - .new_block = sm_checker_new_block, - .get_count = sm_checker_get_count, - .count_is_more_than_one = sm_checker_count_more_than_one, - .set_count = sm_checker_set_count, - .commit = sm_checker_commit, - .extend = sm_checker_extend, - .root_size = sm_checker_root_size, - .copy_root = sm_checker_copy_root -}; - -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) -{ - int r; - struct sm_checker *smc; - - if (IS_ERR_OR_NULL(sm)) - return ERR_PTR(-EINVAL); - - smc = kmalloc(sizeof(*smc), GFP_KERNEL); - if (!smc) - return ERR_PTR(-ENOMEM); - - memcpy(&smc->sm, &ops_, sizeof(smc->sm)); - r = ca_create(&smc->old_counts, sm); - if (r) { - kfree(smc); - return ERR_PTR(r); - } - - r = ca_create(&smc->counts, sm); - if (r) { - ca_destroy(&smc->old_counts); - kfree(smc); - return ERR_PTR(r); - } - - smc->real_sm = sm; - - r = ca_load(&smc->counts, sm); - if (r) { - ca_destroy(&smc->counts); - ca_destroy(&smc->old_counts); - kfree(smc); - return ERR_PTR(r); - } - - r = ca_commit(&smc->old_counts, &smc->counts); - if (r) { - ca_destroy(&smc->counts); - ca_destroy(&smc->old_counts); - kfree(smc); - return ERR_PTR(r); - } - - return &smc->sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create); - -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) -{ - int r; - struct sm_checker *smc; - - if (IS_ERR_OR_NULL(sm)) - return ERR_PTR(-EINVAL); - - smc = kmalloc(sizeof(*smc), GFP_KERNEL); - if (!smc) - return ERR_PTR(-ENOMEM); - - memcpy(&smc->sm, &ops_, sizeof(smc->sm)); - r = ca_create(&smc->old_counts, sm); - if (r) { - kfree(smc); - return ERR_PTR(r); - } - - r = ca_create(&smc->counts, sm); - if (r) { - ca_destroy(&smc->old_counts); - kfree(smc); - return ERR_PTR(r); - } - - smc->real_sm = sm; - return &smc->sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); - -/*----------------------------------------------------------------*/ - -#else - -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) -{ - return sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create); - -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) -{ - return sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-checker.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-checker.h deleted file mode 100644 index 444dccf6..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-checker.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H -#define SNAPSHOTS_SPACE_MAP_CHECKER_H - -#include "dm-space-map.h" - -/*----------------------------------------------------------------*/ - -/* - * This space map wraps a real on-disk space map, and verifies all of its - * operations. It uses a lot of memory, so only use if you have a specific - * problem that you're debugging. - * - * Ownership of @sm passes. 
- */ -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm); -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-common.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-common.c deleted file mode 100644 index ff3beed6..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-common.c +++ /dev/null @@ -1,702 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#include "dm-space-map-common.h" -#include "dm-transaction-manager.h" - -#include <linux/bitops.h> -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "space map common" - -/*----------------------------------------------------------------*/ - -/* - * Index validator. - */ -#define INDEX_CSUM_XOR 160478 - -static void index_prepare_for_write(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct disk_metadata_index *mi_le = dm_block_data(b); - - mi_le->blocknr = cpu_to_le64(dm_block_location(b)); - mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding, - block_size - sizeof(__le32), - INDEX_CSUM_XOR)); -} - -static int index_check(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct disk_metadata_index *mi_le = dm_block_data(b); - __le32 csum_disk; - - if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) { - DMERR("index_check failed blocknr %llu wanted %llu", - le64_to_cpu(mi_le->blocknr), dm_block_location(b)); - return -ENOTBLK; - } - - csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding, - block_size - sizeof(__le32), - INDEX_CSUM_XOR)); - if (csum_disk != mi_le->csum) { - DMERR("index_check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); - return -EILSEQ; - } - - return 0; -} - -static struct dm_block_validator index_validator = { - .name = "index", - .prepare_for_write = index_prepare_for_write, - .check = index_check -}; - -/*----------------------------------------------------------------*/ - -/* - * Bitmap validator - */ -#define BITMAP_CSUM_XOR 240779 - -static void bitmap_prepare_for_write(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct disk_bitmap_header *disk_header = dm_block_data(b); - - disk_header->blocknr = cpu_to_le64(dm_block_location(b)); - disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, - block_size - sizeof(__le32), - BITMAP_CSUM_XOR)); -} - -static int bitmap_check(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct disk_bitmap_header *disk_header = dm_block_data(b); - __le32 csum_disk; - - if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) { - DMERR("bitmap check failed blocknr %llu wanted %llu", - le64_to_cpu(disk_header->blocknr), dm_block_location(b)); - return -ENOTBLK; - } - - csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, - block_size - sizeof(__le32), - BITMAP_CSUM_XOR)); - if (csum_disk != disk_header->csum) { - DMERR("bitmap check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); - return -EILSEQ; - } - - return 0; -} - -static struct dm_block_validator dm_sm_bitmap_validator = { - .name = "sm_bitmap", - .prepare_for_write = bitmap_prepare_for_write, - .check = bitmap_check -}; - -/*----------------------------------------------------------------*/ - -#define ENTRIES_PER_WORD 32 -#define 
ENTRIES_SHIFT 5 - -static void *dm_bitmap_data(struct dm_block *b) -{ - return dm_block_data(b) + sizeof(struct disk_bitmap_header); -} - -#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL - -static unsigned bitmap_word_used(void *addr, unsigned b) -{ - __le64 *words_le = addr; - __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); - - uint64_t bits = le64_to_cpu(*w_le); - uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH; - - return !(~bits & mask); -} - -static unsigned sm_lookup_bitmap(void *addr, unsigned b) -{ - __le64 *words_le = addr; - __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); - unsigned hi, lo; - - b = (b & (ENTRIES_PER_WORD - 1)) << 1; - hi = !!test_bit_le(b, (void *) w_le); - lo = !!test_bit_le(b + 1, (void *) w_le); - return (hi << 1) | lo; -} - -static void sm_set_bitmap(void *addr, unsigned b, unsigned val) -{ - __le64 *words_le = addr; - __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); - - b = (b & (ENTRIES_PER_WORD - 1)) << 1; - - if (val & 2) - __set_bit_le(b, (void *) w_le); - else - __clear_bit_le(b, (void *) w_le); - - if (val & 1) - __set_bit_le(b + 1, (void *) w_le); - else - __clear_bit_le(b + 1, (void *) w_le); -} - -static int sm_find_free(void *addr, unsigned begin, unsigned end, - unsigned *result) -{ - while (begin < end) { - if (!(begin & (ENTRIES_PER_WORD - 1)) && - bitmap_word_used(addr, begin)) { - begin += ENTRIES_PER_WORD; - continue; - } - - if (!sm_lookup_bitmap(addr, begin)) { - *result = begin; - return 0; - } - - begin++; - } - - return -ENOSPC; -} - -/*----------------------------------------------------------------*/ - -static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) -{ - ll->tm = tm; - - ll->bitmap_info.tm = tm; - ll->bitmap_info.levels = 1; - - /* - * Because the new bitmap blocks are created via a shadow - * operation, the old entry has already had its reference count - * decremented and we don't need the btree to do any bookkeeping. 
- */ - ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry); - ll->bitmap_info.value_type.inc = NULL; - ll->bitmap_info.value_type.dec = NULL; - ll->bitmap_info.value_type.equal = NULL; - - ll->ref_count_info.tm = tm; - ll->ref_count_info.levels = 1; - ll->ref_count_info.value_type.size = sizeof(uint32_t); - ll->ref_count_info.value_type.inc = NULL; - ll->ref_count_info.value_type.dec = NULL; - ll->ref_count_info.value_type.equal = NULL; - - ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm)); - - if (ll->block_size > (1 << 30)) { - DMERR("block size too big to hold bitmaps"); - return -EINVAL; - } - - ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) * - ENTRIES_PER_BYTE; - ll->nr_blocks = 0; - ll->bitmap_root = 0; - ll->ref_count_root = 0; - - return 0; -} - -int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) -{ - int r; - dm_block_t i, nr_blocks, nr_indexes; - unsigned old_blocks, blocks; - - nr_blocks = ll->nr_blocks + extra_blocks; - old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block); - blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block); - - nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block); - if (nr_indexes > ll->max_entries(ll)) { - DMERR("space map too large"); - return -EINVAL; - } - - for (i = old_blocks; i < blocks; i++) { - struct dm_block *b; - struct disk_index_entry idx; - - r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); - if (r < 0) - return r; - idx.blocknr = cpu_to_le64(dm_block_location(b)); - - r = dm_tm_unlock(ll->tm, b); - if (r < 0) - return r; - - idx.nr_free = cpu_to_le32(ll->entries_per_block); - idx.none_free_before = 0; - - r = ll->save_ie(ll, i, &idx); - if (r < 0) - return r; - } - - ll->nr_blocks = nr_blocks; - return 0; -} - -int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) -{ - int r; - dm_block_t index = b; - struct disk_index_entry ie_disk; - struct dm_block *blk; - - b = do_div(index, ll->entries_per_block); - r = ll->load_ie(ll, index, &ie_disk); - if (r < 0) - return r; - - r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), - &dm_sm_bitmap_validator, &blk); - if (r < 0) - return r; - - *result = sm_lookup_bitmap(dm_bitmap_data(blk), b); - - return dm_tm_unlock(ll->tm, blk); -} - -int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) -{ - __le32 le_rc; - int r = sm_ll_lookup_bitmap(ll, b, result); - - if (r) - return r; - - if (*result != 3) - return r; - - r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); - if (r < 0) - return r; - - *result = le32_to_cpu(le_rc); - - return r; -} - -int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, - dm_block_t end, dm_block_t *result) -{ - int r; - struct disk_index_entry ie_disk; - dm_block_t i, index_begin = begin; - dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block); - - /* - * FIXME: Use shifts - */ - begin = do_div(index_begin, ll->entries_per_block); - end = do_div(end, ll->entries_per_block); - - for (i = index_begin; i < index_end; i++, begin = 0) { - struct dm_block *blk; - unsigned position; - uint32_t bit_end; - - r = ll->load_ie(ll, i, &ie_disk); - if (r < 0) - return r; - - if (le32_to_cpu(ie_disk.nr_free) == 0) - continue; - - r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), - &dm_sm_bitmap_validator, &blk); - if (r < 0) - return r; - - bit_end = (i == index_end - 1) ? 
end : ll->entries_per_block; - - r = sm_find_free(dm_bitmap_data(blk), - max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)), - bit_end, &position); - if (r == -ENOSPC) { - /* - * This might happen because we started searching - * part way through the bitmap. - */ - dm_tm_unlock(ll->tm, blk); - continue; - - } else if (r < 0) { - dm_tm_unlock(ll->tm, blk); - return r; - } - - r = dm_tm_unlock(ll->tm, blk); - if (r < 0) - return r; - - *result = i * ll->entries_per_block + (dm_block_t) position; - return 0; - } - - return -ENOSPC; -} - -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, - uint32_t ref_count, enum allocation_event *ev) -{ - int r; - uint32_t bit, old; - struct dm_block *nb; - dm_block_t index = b; - struct disk_index_entry ie_disk; - void *bm_le; - int inc; - - bit = do_div(index, ll->entries_per_block); - r = ll->load_ie(ll, index, &ie_disk); - if (r < 0) - return r; - - r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr), - &dm_sm_bitmap_validator, &nb, &inc); - if (r < 0) { - DMERR("dm_tm_shadow_block() failed"); - return r; - } - ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); - - bm_le = dm_bitmap_data(nb); - old = sm_lookup_bitmap(bm_le, bit); - - if (ref_count <= 2) { - sm_set_bitmap(bm_le, bit, ref_count); - - r = dm_tm_unlock(ll->tm, nb); - if (r < 0) - return r; - - if (old > 2) { - r = dm_btree_remove(&ll->ref_count_info, - ll->ref_count_root, - &b, &ll->ref_count_root); - if (r) - return r; - } - - } else { - __le32 le_rc = cpu_to_le32(ref_count); - - sm_set_bitmap(bm_le, bit, 3); - r = dm_tm_unlock(ll->tm, nb); - if (r < 0) - return r; - - __dm_bless_for_disk(&le_rc); - r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, - &b, &le_rc, &ll->ref_count_root); - if (r < 0) { - DMERR("ref count insert failed"); - return r; - } - } - - if (ref_count && !old) { - *ev = SM_ALLOC; - ll->nr_allocated++; - ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1); - if (le32_to_cpu(ie_disk.none_free_before) == bit) - ie_disk.none_free_before = cpu_to_le32(bit + 1); - - } else if (old && !ref_count) { - *ev = SM_FREE; - ll->nr_allocated--; - ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1); - ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); - } - - return ll->save_ie(ll, index, &ie_disk); -} - -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) -{ - int r; - uint32_t rc; - - r = sm_ll_lookup(ll, b, &rc); - if (r) - return r; - - return sm_ll_insert(ll, b, rc + 1, ev); -} - -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) -{ - int r; - uint32_t rc; - - r = sm_ll_lookup(ll, b, &rc); - if (r) - return r; - - if (!rc) - return -EINVAL; - - return sm_ll_insert(ll, b, rc - 1, ev); -} - -int sm_ll_commit(struct ll_disk *ll) -{ - return ll->commit(ll); -} - -/*----------------------------------------------------------------*/ - -static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index, - struct disk_index_entry *ie) -{ - memcpy(ie, ll->mi_le.index + index, sizeof(*ie)); - return 0; -} - -static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, - struct disk_index_entry *ie) -{ - memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); - return 0; -} - -static int metadata_ll_init_index(struct ll_disk *ll) -{ - int r; - struct dm_block *b; - - r = dm_tm_new_block(ll->tm, &index_validator, &b); - if (r < 0) - return r; - - memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); - ll->bitmap_root = dm_block_location(b); - - 
return dm_tm_unlock(ll->tm, b); -} - -static int metadata_ll_open(struct ll_disk *ll) -{ - int r; - struct dm_block *block; - - r = dm_tm_read_lock(ll->tm, ll->bitmap_root, - &index_validator, &block); - if (r) - return r; - - memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le)); - return dm_tm_unlock(ll->tm, block); -} - -static dm_block_t metadata_ll_max_entries(struct ll_disk *ll) -{ - return MAX_METADATA_BITMAPS; -} - -static int metadata_ll_commit(struct ll_disk *ll) -{ - int r, inc; - struct dm_block *b; - - r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc); - if (r) - return r; - - memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); - ll->bitmap_root = dm_block_location(b); - - return dm_tm_unlock(ll->tm, b); -} - -int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm) -{ - int r; - - r = sm_ll_init(ll, tm); - if (r < 0) - return r; - - ll->load_ie = metadata_ll_load_ie; - ll->save_ie = metadata_ll_save_ie; - ll->init_index = metadata_ll_init_index; - ll->open_index = metadata_ll_open; - ll->max_entries = metadata_ll_max_entries; - ll->commit = metadata_ll_commit; - - ll->nr_blocks = 0; - ll->nr_allocated = 0; - - r = ll->init_index(ll); - if (r < 0) - return r; - - r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); - if (r < 0) - return r; - - return 0; -} - -int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - int r; - struct disk_sm_root *smr = root_le; - - if (len < sizeof(struct disk_sm_root)) { - DMERR("sm_metadata root too small"); - return -ENOMEM; - } - - r = sm_ll_init(ll, tm); - if (r < 0) - return r; - - ll->load_ie = metadata_ll_load_ie; - ll->save_ie = metadata_ll_save_ie; - ll->init_index = metadata_ll_init_index; - ll->open_index = metadata_ll_open; - ll->max_entries = metadata_ll_max_entries; - ll->commit = metadata_ll_commit; - - ll->nr_blocks = le64_to_cpu(smr->nr_blocks); - ll->nr_allocated = le64_to_cpu(smr->nr_allocated); - ll->bitmap_root = le64_to_cpu(smr->bitmap_root); - ll->ref_count_root = le64_to_cpu(smr->ref_count_root); - - return ll->open_index(ll); -} - -/*----------------------------------------------------------------*/ - -static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index, - struct disk_index_entry *ie) -{ - return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); -} - -static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index, - struct disk_index_entry *ie) -{ - __dm_bless_for_disk(ie); - return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, - &index, ie, &ll->bitmap_root); -} - -static int disk_ll_init_index(struct ll_disk *ll) -{ - return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root); -} - -static int disk_ll_open(struct ll_disk *ll) -{ - /* nothing to do */ - return 0; -} - -static dm_block_t disk_ll_max_entries(struct ll_disk *ll) -{ - return -1ULL; -} - -static int disk_ll_commit(struct ll_disk *ll) -{ - return 0; -} - -int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm) -{ - int r; - - r = sm_ll_init(ll, tm); - if (r < 0) - return r; - - ll->load_ie = disk_ll_load_ie; - ll->save_ie = disk_ll_save_ie; - ll->init_index = disk_ll_init_index; - ll->open_index = disk_ll_open; - ll->max_entries = disk_ll_max_entries; - ll->commit = disk_ll_commit; - - ll->nr_blocks = 0; - ll->nr_allocated = 0; - - r = ll->init_index(ll); - if (r < 0) - return r; - - r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); - if (r < 0) - return r; - - return 0; -} - 
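The ref-count bitmaps handled above store two bits per block, so with ENTRIES_PER_BYTE = 4 a 4KiB bitmap block (minus the 16-byte disk_bitmap_header) holds a little over 16k entries; counts 0, 1 and 2 live inline, and the value 3 means the real count has overflowed into the ref-count btree, which is why sm_ll_lookup() falls back to dm_btree_lookup() when sm_ll_lookup_bitmap() returns 3. The following is a minimal userspace sketch of that packing, with hypothetical lookup2()/set2() helpers and plain host-order shifts rather than the kernel's test_bit_le()/__set_bit_le() on __le64 on-disk words:

#include <stdint.h>
#include <stdio.h>

#define ENTRIES_PER_WORD 32	/* 64 bits per word / 2 bits per entry */

/* Stand-in for sm_lookup_bitmap(): read the 2-bit count of entry b. */
static unsigned lookup2(const uint64_t *words, unsigned b)
{
	unsigned shift = (b % ENTRIES_PER_WORD) * 2;

	return (words[b / ENTRIES_PER_WORD] >> shift) & 3;
}

/* Stand-in for sm_set_bitmap(): store a 2-bit count for entry b. */
static void set2(uint64_t *words, unsigned b, unsigned val)
{
	unsigned shift = (b % ENTRIES_PER_WORD) * 2;
	uint64_t *w = words + b / ENTRIES_PER_WORD;

	*w = (*w & ~(3ULL << shift)) | ((uint64_t)(val & 3) << shift);
}

int main(void)
{
	uint64_t bitmap[2] = { 0, 0 };	/* room for 64 entries */

	set2(bitmap, 10, 2);	/* count 2 is stored inline */
	set2(bitmap, 11, 3);	/* 3 == "many": real count lives in the ref-count btree */

	printf("block 10 -> %u, block 11 -> %u\n",
	       lookup2(bitmap, 10), lookup2(bitmap, 11));
	return 0;
}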
-int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - int r; - struct disk_sm_root *smr = root_le; - - if (len < sizeof(struct disk_sm_root)) { - DMERR("sm_metadata root too small"); - return -ENOMEM; - } - - r = sm_ll_init(ll, tm); - if (r < 0) - return r; - - ll->load_ie = disk_ll_load_ie; - ll->save_ie = disk_ll_save_ie; - ll->init_index = disk_ll_init_index; - ll->open_index = disk_ll_open; - ll->max_entries = disk_ll_max_entries; - ll->commit = disk_ll_commit; - - ll->nr_blocks = le64_to_cpu(smr->nr_blocks); - ll->nr_allocated = le64_to_cpu(smr->nr_allocated); - ll->bitmap_root = le64_to_cpu(smr->bitmap_root); - ll->ref_count_root = le64_to_cpu(smr->ref_count_root); - - return ll->open_index(ll); -} - -/*----------------------------------------------------------------*/ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-common.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-common.h deleted file mode 100644 index 8f220821..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-common.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef DM_SPACE_MAP_COMMON_H -#define DM_SPACE_MAP_COMMON_H - -#include "dm-btree.h" - -/*----------------------------------------------------------------*/ - -/* - * Low level disk format - * - * Bitmap btree - * ------------ - * - * Each value stored in the btree is an index_entry. This points to a - * block that is used as a bitmap. Within the bitmap hold 2 bits per - * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and - * REF_COUNT = many. - * - * Refcount btree - * -------------- - * - * Any entry that has a ref count higher than 2 gets entered in the ref - * count tree. The leaf values for this tree is the 32-bit ref count. - */ - -struct disk_index_entry { - __le64 blocknr; - __le32 nr_free; - __le32 none_free_before; -} __packed; - - -#define MAX_METADATA_BITMAPS 255 -struct disk_metadata_index { - __le32 csum; - __le32 padding; - __le64 blocknr; - - struct disk_index_entry index[MAX_METADATA_BITMAPS]; -} __packed; - -struct ll_disk; - -typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result); -typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie); -typedef int (*init_index_fn)(struct ll_disk *ll); -typedef int (*open_index_fn)(struct ll_disk *ll); -typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll); -typedef int (*commit_fn)(struct ll_disk *ll); - -struct ll_disk { - struct dm_transaction_manager *tm; - struct dm_btree_info bitmap_info; - struct dm_btree_info ref_count_info; - - uint32_t block_size; - uint32_t entries_per_block; - dm_block_t nr_blocks; - dm_block_t nr_allocated; - - /* - * bitmap_root may be a btree root or a simple index. 
- */ - dm_block_t bitmap_root; - - dm_block_t ref_count_root; - - struct disk_metadata_index mi_le; - load_ie_fn load_ie; - save_ie_fn save_ie; - init_index_fn init_index; - open_index_fn open_index; - max_index_entries_fn max_entries; - commit_fn commit; -}; - -struct disk_sm_root { - __le64 nr_blocks; - __le64 nr_allocated; - __le64 bitmap_root; - __le64 ref_count_root; -} __packed; - -#define ENTRIES_PER_BYTE 4 - -struct disk_bitmap_header { - __le32 csum; - __le32 not_used; - __le64 blocknr; -} __packed; - -enum allocation_event { - SM_NONE, - SM_ALLOC, - SM_FREE, -}; - -/*----------------------------------------------------------------*/ - -int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks); -int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); -int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); -int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, - dm_block_t end, dm_block_t *result); -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); -int sm_ll_commit(struct ll_disk *ll); - -int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm); -int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, - void *root_le, size_t len); - -int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm); -int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, - void *root_le, size_t len); - -/*----------------------------------------------------------------*/ - -#endif /* DM_SPACE_MAP_COMMON_H */ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-disk.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-disk.c deleted file mode 100644 index 3d0ed533..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-disk.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#include "dm-space-map-checker.h" -#include "dm-space-map-common.h" -#include "dm-space-map-disk.h" -#include "dm-space-map.h" -#include "dm-transaction-manager.h" - -#include <linux/list.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "space map disk" - -/*----------------------------------------------------------------*/ - -/* - * Space map interface. 
- */ -struct sm_disk { - struct dm_space_map sm; - - struct ll_disk ll; - struct ll_disk old_ll; - - dm_block_t begin; - dm_block_t nr_allocated_this_transaction; -}; - -static void sm_disk_destroy(struct dm_space_map *sm) -{ - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - kfree(smd); -} - -static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - return sm_ll_extend(&smd->ll, extra_blocks); -} - -static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - *count = smd->old_ll.nr_blocks; - - return 0; -} - -static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction; - - return 0; -} - -static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b, - uint32_t *result) -{ - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - return sm_ll_lookup(&smd->ll, b, result); -} - -static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b, - int *result) -{ - int r; - uint32_t count; - - r = sm_disk_get_count(sm, b, &count); - if (r) - return r; - - return count > 1; -} - -static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b, - uint32_t count) -{ - int r; - uint32_t old_count; - enum allocation_event ev; - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - r = sm_ll_insert(&smd->ll, b, count, &ev); - if (!r) { - switch (ev) { - case SM_NONE: - break; - - case SM_ALLOC: - /* - * This _must_ be free in the prior transaction - * otherwise we've lost atomicity. - */ - smd->nr_allocated_this_transaction++; - break; - - case SM_FREE: - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - break; - } - } - - return r; -} - -static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) -{ - int r; - enum allocation_event ev; - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - r = sm_ll_inc(&smd->ll, b, &ev); - if (!r && (ev == SM_ALLOC)) - /* - * This _must_ be free in the prior transaction - * otherwise we've lost atomicity. - */ - smd->nr_allocated_this_transaction++; - - return r; -} - -static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) -{ - int r; - uint32_t old_count; - enum allocation_event ev; - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - r = sm_ll_dec(&smd->ll, b, &ev); - if (!r && (ev == SM_FREE)) { - /* - * It's only free if it's also free in the last - * transaction. 
- */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - } - - return r; -} - -static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) -{ - int r; - enum allocation_event ev; - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - /* FIXME: we should loop round a couple of times */ - r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b); - if (r) - return r; - - smd->begin = *b + 1; - r = sm_ll_inc(&smd->ll, *b, &ev); - if (!r) { - BUG_ON(ev != SM_ALLOC); - smd->nr_allocated_this_transaction++; - } - - return r; -} - -static int sm_disk_commit(struct dm_space_map *sm) -{ - int r; - dm_block_t nr_free; - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - r = sm_disk_get_nr_free(sm, &nr_free); - if (r) - return r; - - r = sm_ll_commit(&smd->ll); - if (r) - return r; - - memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); - smd->begin = 0; - smd->nr_allocated_this_transaction = 0; - - r = sm_disk_get_nr_free(sm, &nr_free); - if (r) - return r; - - return 0; -} - -static int sm_disk_root_size(struct dm_space_map *sm, size_t *result) -{ - *result = sizeof(struct disk_sm_root); - - return 0; -} - -static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max) -{ - struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - struct disk_sm_root root_le; - - root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks); - root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated); - root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root); - root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root); - - if (max < sizeof(root_le)) - return -ENOSPC; - - memcpy(where_le, &root_le, sizeof(root_le)); - - return 0; -} - -/*----------------------------------------------------------------*/ - -static struct dm_space_map ops = { - .destroy = sm_disk_destroy, - .extend = sm_disk_extend, - .get_nr_blocks = sm_disk_get_nr_blocks, - .get_nr_free = sm_disk_get_nr_free, - .get_count = sm_disk_get_count, - .count_is_more_than_one = sm_disk_count_is_more_than_one, - .set_count = sm_disk_set_count, - .inc_block = sm_disk_inc_block, - .dec_block = sm_disk_dec_block, - .new_block = sm_disk_new_block, - .commit = sm_disk_commit, - .root_size = sm_disk_root_size, - .copy_root = sm_disk_copy_root -}; - -static struct dm_space_map *dm_sm_disk_create_real( - struct dm_transaction_manager *tm, - dm_block_t nr_blocks) -{ - int r; - struct sm_disk *smd; - - smd = kmalloc(sizeof(*smd), GFP_KERNEL); - if (!smd) - return ERR_PTR(-ENOMEM); - - smd->begin = 0; - smd->nr_allocated_this_transaction = 0; - memcpy(&smd->sm, &ops, sizeof(smd->sm)); - - r = sm_ll_new_disk(&smd->ll, tm); - if (r) - goto bad; - - r = sm_ll_extend(&smd->ll, nr_blocks); - if (r) - goto bad; - - r = sm_disk_commit(&smd->sm); - if (r) - goto bad; - - return &smd->sm; - -bad: - kfree(smd); - return ERR_PTR(r); -} - -struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, - dm_block_t nr_blocks) -{ - struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); - struct dm_space_map *smc; - - if (IS_ERR_OR_NULL(sm)) - return sm; - - smc = dm_sm_checker_create_fresh(sm); - if (IS_ERR(smc)) - dm_sm_destroy(sm); - - return smc; -} -EXPORT_SYMBOL_GPL(dm_sm_disk_create); - -static struct dm_space_map *dm_sm_disk_open_real( - struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - int r; - struct sm_disk *smd; - - smd = kmalloc(sizeof(*smd), GFP_KERNEL); - if (!smd) - return 
ERR_PTR(-ENOMEM); - - smd->begin = 0; - smd->nr_allocated_this_transaction = 0; - memcpy(&smd->sm, &ops, sizeof(smd->sm)); - - r = sm_ll_open_disk(&smd->ll, tm, root_le, len); - if (r) - goto bad; - - r = sm_disk_commit(&smd->sm); - if (r) - goto bad; - - return &smd->sm; - -bad: - kfree(smd); - return ERR_PTR(r); -} - -struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - return dm_sm_checker_create( - dm_sm_disk_open_real(tm, root_le, len)); -} -EXPORT_SYMBOL_GPL(dm_sm_disk_open); - -/*----------------------------------------------------------------*/ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-disk.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-disk.h deleted file mode 100644 index 447a0a9a..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-disk.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef _LINUX_DM_SPACE_MAP_DISK_H -#define _LINUX_DM_SPACE_MAP_DISK_H - -#include "dm-block-manager.h" - -struct dm_space_map; -struct dm_transaction_manager; - -/* - * Unfortunately we have to use two-phase construction due to the cycle - * between the tm and sm. - */ -struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, - dm_block_t nr_blocks); - -struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, - void *root, size_t len); - -#endif /* _LINUX_DM_SPACE_MAP_DISK_H */ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-metadata.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-metadata.c deleted file mode 100644 index e89ae5e7..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-metadata.c +++ /dev/null @@ -1,596 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#include "dm-space-map.h" -#include "dm-space-map-common.h" -#include "dm-space-map-metadata.h" - -#include <linux/list.h> -#include <linux/slab.h> -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "space map metadata" - -/*----------------------------------------------------------------*/ - -/* - * Space map interface. - * - * The low level disk format is written using the standard btree and - * transaction manager. This means that performing disk operations may - * cause us to recurse into the space map in order to allocate new blocks. - * For this reason we have a pool of pre-allocated blocks large enough to - * service any metadata_ll_disk operation. - */ - -/* - * FIXME: we should calculate this based on the size of the device. - * Only the metadata space map needs this functionality. 
- */ -#define MAX_RECURSIVE_ALLOCATIONS 1024 - -enum block_op_type { - BOP_INC, - BOP_DEC -}; - -struct block_op { - enum block_op_type type; - dm_block_t block; -}; - -struct sm_metadata { - struct dm_space_map sm; - - struct ll_disk ll; - struct ll_disk old_ll; - - dm_block_t begin; - - unsigned recursion_count; - unsigned allocated_this_transaction; - unsigned nr_uncommitted; - struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; -}; - -static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) -{ - struct block_op *op; - - if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { - DMERR("too many recursive allocations"); - return -ENOMEM; - } - - op = smm->uncommitted + smm->nr_uncommitted++; - op->type = type; - op->block = b; - - return 0; -} - -static int commit_bop(struct sm_metadata *smm, struct block_op *op) -{ - int r = 0; - enum allocation_event ev; - - switch (op->type) { - case BOP_INC: - r = sm_ll_inc(&smm->ll, op->block, &ev); - break; - - case BOP_DEC: - r = sm_ll_dec(&smm->ll, op->block, &ev); - break; - } - - return r; -} - -static void in(struct sm_metadata *smm) -{ - smm->recursion_count++; -} - -static int out(struct sm_metadata *smm) -{ - int r = 0; - - /* - * If we're not recursing then very bad things are happening. - */ - if (!smm->recursion_count) { - DMERR("lost track of recursion depth"); - return -ENOMEM; - } - - if (smm->recursion_count == 1 && smm->nr_uncommitted) { - while (smm->nr_uncommitted && !r) { - smm->nr_uncommitted--; - r = commit_bop(smm, smm->uncommitted + - smm->nr_uncommitted); - if (r) - break; - } - } - - smm->recursion_count--; - - return r; -} - -/* - * When using the out() function above, we often want to combine an error - * code for the operation run in the recursive context with that from - * out(). - */ -static int combine_errors(int r1, int r2) -{ - return r1 ? r1 : r2; -} - -static int recursing(struct sm_metadata *smm) -{ - return smm->recursion_count; -} - -static void sm_metadata_destroy(struct dm_space_map *sm) -{ - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - kfree(smm); -} - -static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - DMERR("doesn't support extend"); - return -EINVAL; -} - -static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - *count = smm->ll.nr_blocks; - - return 0; -} - -static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated - - smm->allocated_this_transaction; - - return 0; -} - -static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, - uint32_t *result) -{ - int r, i; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - unsigned adjustment = 0; - - /* - * We may have some uncommitted adjustments to add. This list - * should always be really short. 
- */ - for (i = 0; i < smm->nr_uncommitted; i++) { - struct block_op *op = smm->uncommitted + i; - - if (op->block != b) - continue; - - switch (op->type) { - case BOP_INC: - adjustment++; - break; - - case BOP_DEC: - adjustment--; - break; - } - } - - r = sm_ll_lookup(&smm->ll, b, result); - if (r) - return r; - - *result += adjustment; - - return 0; -} - -static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, - dm_block_t b, int *result) -{ - int r, i, adjustment = 0; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - uint32_t rc; - - /* - * We may have some uncommitted adjustments to add. This list - * should always be really short. - */ - for (i = 0; i < smm->nr_uncommitted; i++) { - struct block_op *op = smm->uncommitted + i; - - if (op->block != b) - continue; - - switch (op->type) { - case BOP_INC: - adjustment++; - break; - - case BOP_DEC: - adjustment--; - break; - } - } - - if (adjustment > 1) { - *result = 1; - return 0; - } - - r = sm_ll_lookup_bitmap(&smm->ll, b, &rc); - if (r) - return r; - - if (rc == 3) - /* - * We err on the side of caution, and always return true. - */ - *result = 1; - else - *result = rc + adjustment > 1; - - return 0; -} - -static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, - uint32_t count) -{ - int r, r2; - enum allocation_event ev; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - if (smm->recursion_count) { - DMERR("cannot recurse set_count()"); - return -EINVAL; - } - - in(smm); - r = sm_ll_insert(&smm->ll, b, count, &ev); - r2 = out(smm); - - return combine_errors(r, r2); -} - -static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b) -{ - int r, r2 = 0; - enum allocation_event ev; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - if (recursing(smm)) - r = add_bop(smm, BOP_INC, b); - else { - in(smm); - r = sm_ll_inc(&smm->ll, b, &ev); - r2 = out(smm); - } - - return combine_errors(r, r2); -} - -static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) -{ - int r, r2 = 0; - enum allocation_event ev; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - if (recursing(smm)) - r = add_bop(smm, BOP_DEC, b); - else { - in(smm); - r = sm_ll_dec(&smm->ll, b, &ev); - r2 = out(smm); - } - - return combine_errors(r, r2); -} - -static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) -{ - int r, r2 = 0; - enum allocation_event ev; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b); - if (r) - return r; - - smm->begin = *b + 1; - - if (recursing(smm)) - r = add_bop(smm, BOP_INC, *b); - else { - in(smm); - r = sm_ll_inc(&smm->ll, *b, &ev); - r2 = out(smm); - } - - if (!r) - smm->allocated_this_transaction++; - - return combine_errors(r, r2); -} - -static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) -{ - int r = sm_metadata_new_block_(sm, b); - if (r) - DMERR("out of metadata space"); - return r; -} - -static int sm_metadata_commit(struct dm_space_map *sm) -{ - int r; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - r = sm_ll_commit(&smm->ll); - if (r) - return r; - - memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); - smm->begin = 0; - smm->allocated_this_transaction = 0; - - return 0; -} - -static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result) -{ - *result = sizeof(struct disk_sm_root); - - return 0; -} - -static int 
sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
-{
-        struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
-        struct disk_sm_root root_le;
-
-        root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks);
-        root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated);
-        root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root);
-        root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root);
-
-        if (max < sizeof(root_le))
-                return -ENOSPC;
-
-        memcpy(where_le, &root_le, sizeof(root_le));
-
-        return 0;
-}
-
-static struct dm_space_map ops = {
-        .destroy = sm_metadata_destroy,
-        .extend = sm_metadata_extend,
-        .get_nr_blocks = sm_metadata_get_nr_blocks,
-        .get_nr_free = sm_metadata_get_nr_free,
-        .get_count = sm_metadata_get_count,
-        .count_is_more_than_one = sm_metadata_count_is_more_than_one,
-        .set_count = sm_metadata_set_count,
-        .inc_block = sm_metadata_inc_block,
-        .dec_block = sm_metadata_dec_block,
-        .new_block = sm_metadata_new_block,
-        .commit = sm_metadata_commit,
-        .root_size = sm_metadata_root_size,
-        .copy_root = sm_metadata_copy_root
-};
-
-/*----------------------------------------------------------------*/
-
-/*
- * When a new space map is created that manages its own space, we use
- * this tiny bootstrap allocator.
- */
-static void sm_bootstrap_destroy(struct dm_space_map *sm)
-{
-}
-
-static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
-{
-        DMERR("bootstrap doesn't support extend");
-
-        return -EINVAL;
-}
-
-static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
-{
-        struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
-
-        *count = smm->ll.nr_blocks;
-
-        return 0;
-}
-
-static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
-{
-        struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
-
-        *count = smm->ll.nr_blocks - smm->begin;
-
-        return 0;
-}
-
-static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
-                                  uint32_t *result)
-{
-        struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
-
-        /* blocks below smm->begin have already been handed out once */
-        *result = (b < smm->begin) ? 1 : 0;
-
-        return 0;
-}
-
-static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
-                                               dm_block_t b, int *result)
-{
-        *result = 0;
-
-        return 0;
-}
-
-static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
-                                  uint32_t count)
-{
-        DMERR("bootstrap doesn't support set_count");
-
-        return -EINVAL;
-}
-
-static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
-{
-        struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
-
-        /*
-         * We know the entire device is unused.
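-         * Allocation is simply a matter of handing out smm->begin
-         * and advancing it.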
- */ - if (smm->begin == smm->ll.nr_blocks) - return -ENOSPC; - - *b = smm->begin++; - - return 0; -} - -static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b) -{ - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - return add_bop(smm, BOP_INC, b); -} - -static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b) -{ - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - return add_bop(smm, BOP_DEC, b); -} - -static int sm_bootstrap_commit(struct dm_space_map *sm) -{ - return 0; -} - -static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) -{ - DMERR("boostrap doesn't support root_size"); - - return -EINVAL; -} - -static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, - size_t max) -{ - DMERR("boostrap doesn't support copy_root"); - - return -EINVAL; -} - -static struct dm_space_map bootstrap_ops = { - .destroy = sm_bootstrap_destroy, - .extend = sm_bootstrap_extend, - .get_nr_blocks = sm_bootstrap_get_nr_blocks, - .get_nr_free = sm_bootstrap_get_nr_free, - .get_count = sm_bootstrap_get_count, - .count_is_more_than_one = sm_bootstrap_count_is_more_than_one, - .set_count = sm_bootstrap_set_count, - .inc_block = sm_bootstrap_inc_block, - .dec_block = sm_bootstrap_dec_block, - .new_block = sm_bootstrap_new_block, - .commit = sm_bootstrap_commit, - .root_size = sm_bootstrap_root_size, - .copy_root = sm_bootstrap_copy_root -}; - -/*----------------------------------------------------------------*/ - -struct dm_space_map *dm_sm_metadata_init(void) -{ - struct sm_metadata *smm; - - smm = kmalloc(sizeof(*smm), GFP_KERNEL); - if (!smm) - return ERR_PTR(-ENOMEM); - - memcpy(&smm->sm, &ops, sizeof(smm->sm)); - - return &smm->sm; -} - -int dm_sm_metadata_create(struct dm_space_map *sm, - struct dm_transaction_manager *tm, - dm_block_t nr_blocks, - dm_block_t superblock) -{ - int r; - dm_block_t i; - enum allocation_event ev; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - smm->begin = superblock + 1; - smm->recursion_count = 0; - smm->allocated_this_transaction = 0; - smm->nr_uncommitted = 0; - - memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); - - r = sm_ll_new_metadata(&smm->ll, tm); - if (r) - return r; - - r = sm_ll_extend(&smm->ll, nr_blocks); - if (r) - return r; - - memcpy(&smm->sm, &ops, sizeof(smm->sm)); - - /* - * Now we need to update the newly created data structures with the - * allocated blocks that they were built from. - */ - for (i = superblock; !r && i < smm->begin; i++) - r = sm_ll_inc(&smm->ll, i, &ev); - - if (r) - return r; - - return sm_metadata_commit(sm); -} - -int dm_sm_metadata_open(struct dm_space_map *sm, - struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - int r; - struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - - r = sm_ll_open_metadata(&smm->ll, tm, root_le, len); - if (r) - return r; - - smm->begin = 0; - smm->recursion_count = 0; - smm->allocated_this_transaction = 0; - smm->nr_uncommitted = 0; - - memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); - return 0; -} diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-metadata.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-metadata.h deleted file mode 100644 index 39bba080..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map-metadata.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. 
- */ - -#ifndef DM_SPACE_MAP_METADATA_H -#define DM_SPACE_MAP_METADATA_H - -#include "dm-transaction-manager.h" - -/* - * Unfortunately we have to use two-phase construction due to the cycle - * between the tm and sm. - */ -struct dm_space_map *dm_sm_metadata_init(void); - -/* - * Create a fresh space map. - */ -int dm_sm_metadata_create(struct dm_space_map *sm, - struct dm_transaction_manager *tm, - dm_block_t nr_blocks, - dm_block_t superblock); - -/* - * Open from a previously-recorded root. - */ -int dm_sm_metadata_open(struct dm_space_map *sm, - struct dm_transaction_manager *tm, - void *root_le, size_t len); - -#endif /* DM_SPACE_MAP_METADATA_H */ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map.h deleted file mode 100644 index 1cbfc6b1..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-space-map.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef _LINUX_DM_SPACE_MAP_H -#define _LINUX_DM_SPACE_MAP_H - -#include "dm-block-manager.h" - -/* - * struct dm_space_map keeps a record of how many times each block in a device - * is referenced. It needs to be fixed on disk as part of the transaction. - */ -struct dm_space_map { - void (*destroy)(struct dm_space_map *sm); - - /* - * You must commit before allocating the newly added space. - */ - int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks); - - /* - * Extensions do not appear in this count until after commit has - * been called. - */ - int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count); - - /* - * Space maps must never allocate a block from the previous - * transaction, in case we need to rollback. This complicates the - * semantics of get_nr_free(), it should return the number of blocks - * that are available for allocation _now_. For instance you may - * have blocks with a zero reference count that will not be - * available for allocation until after the next commit. - */ - int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count); - - int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result); - int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b, - int *result); - int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count); - - int (*commit)(struct dm_space_map *sm); - - int (*inc_block)(struct dm_space_map *sm, dm_block_t b); - int (*dec_block)(struct dm_space_map *sm, dm_block_t b); - - /* - * new_block will increment the returned block. - */ - int (*new_block)(struct dm_space_map *sm, dm_block_t *b); - - /* - * The root contains all the information needed to fix the space map. - * Generally this info is small, so squirrel it away in a disk block - * along with other info. 
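- * (The metadata space map's root, for example, is just four
- * little-endian 64-bit fields.)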
- */ - int (*root_size)(struct dm_space_map *sm, size_t *result); - int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len); -}; - -/*----------------------------------------------------------------*/ - -static inline void dm_sm_destroy(struct dm_space_map *sm) -{ - sm->destroy(sm); -} - -static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - return sm->extend(sm, extra_blocks); -} - -static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) -{ - return sm->get_nr_blocks(sm, count); -} - -static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count) -{ - return sm->get_nr_free(sm, count); -} - -static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b, - uint32_t *result) -{ - return sm->get_count(sm, b, result); -} - -static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm, - dm_block_t b, int *result) -{ - return sm->count_is_more_than_one(sm, b, result); -} - -static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b, - uint32_t count) -{ - return sm->set_count(sm, b, count); -} - -static inline int dm_sm_commit(struct dm_space_map *sm) -{ - return sm->commit(sm); -} - -static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b) -{ - return sm->inc_block(sm, b); -} - -static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b) -{ - return sm->dec_block(sm, b); -} - -static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b) -{ - return sm->new_block(sm, b); -} - -static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result) -{ - return sm->root_size(sm, result); -} - -static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) -{ - return sm->copy_root(sm, copy_to_here_le, len); -} - -#endif /* _LINUX_DM_SPACE_MAP_H */ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-transaction-manager.c b/ANDROID_3.4.5/drivers/md/persistent-data/dm-transaction-manager.c deleted file mode 100644 index ba54aacf..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-transaction-manager.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ -#include "dm-transaction-manager.h" -#include "dm-space-map.h" -#include "dm-space-map-checker.h" -#include "dm-space-map-disk.h" -#include "dm-space-map-metadata.h" -#include "dm-persistent-data-internal.h" - -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/device-mapper.h> - -#define DM_MSG_PREFIX "transaction manager" - -/*----------------------------------------------------------------*/ - -struct shadow_info { - struct hlist_node hlist; - dm_block_t where; -}; - -/* - * It would be nice if we scaled with the size of transaction. 
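- * For now it is a fixed hash table of HASH_SIZE buckets.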
- */ -#define HASH_SIZE 256 -#define HASH_MASK (HASH_SIZE - 1) - -struct dm_transaction_manager { - int is_clone; - struct dm_transaction_manager *real; - - struct dm_block_manager *bm; - struct dm_space_map *sm; - - spinlock_t lock; - struct hlist_head buckets[HASH_SIZE]; -}; - -/*----------------------------------------------------------------*/ - -static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) -{ - int r = 0; - unsigned bucket = dm_hash_block(b, HASH_MASK); - struct shadow_info *si; - struct hlist_node *n; - - spin_lock(&tm->lock); - hlist_for_each_entry(si, n, tm->buckets + bucket, hlist) - if (si->where == b) { - r = 1; - break; - } - spin_unlock(&tm->lock); - - return r; -} - -/* - * This can silently fail if there's no memory. We're ok with this since - * creating redundant shadows causes no harm. - */ -static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) -{ - unsigned bucket; - struct shadow_info *si; - - si = kmalloc(sizeof(*si), GFP_NOIO); - if (si) { - si->where = b; - bucket = dm_hash_block(b, HASH_MASK); - spin_lock(&tm->lock); - hlist_add_head(&si->hlist, tm->buckets + bucket); - spin_unlock(&tm->lock); - } -} - -static void wipe_shadow_table(struct dm_transaction_manager *tm) -{ - struct shadow_info *si; - struct hlist_node *n, *tmp; - struct hlist_head *bucket; - int i; - - spin_lock(&tm->lock); - for (i = 0; i < HASH_SIZE; i++) { - bucket = tm->buckets + i; - hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) - kfree(si); - - INIT_HLIST_HEAD(bucket); - } - - spin_unlock(&tm->lock); -} - -/*----------------------------------------------------------------*/ - -static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, - struct dm_space_map *sm) -{ - int i; - struct dm_transaction_manager *tm; - - tm = kmalloc(sizeof(*tm), GFP_KERNEL); - if (!tm) - return ERR_PTR(-ENOMEM); - - tm->is_clone = 0; - tm->real = NULL; - tm->bm = bm; - tm->sm = sm; - - spin_lock_init(&tm->lock); - for (i = 0; i < HASH_SIZE; i++) - INIT_HLIST_HEAD(tm->buckets + i); - - return tm; -} - -struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real) -{ - struct dm_transaction_manager *tm; - - tm = kmalloc(sizeof(*tm), GFP_KERNEL); - if (tm) { - tm->is_clone = 1; - tm->real = real; - } - - return tm; -} -EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); - -void dm_tm_destroy(struct dm_transaction_manager *tm) -{ - if (!tm->is_clone) - wipe_shadow_table(tm); - - kfree(tm); -} -EXPORT_SYMBOL_GPL(dm_tm_destroy); - -int dm_tm_pre_commit(struct dm_transaction_manager *tm) -{ - int r; - - if (tm->is_clone) - return -EWOULDBLOCK; - - r = dm_sm_commit(tm->sm); - if (r < 0) - return r; - - return 0; -} -EXPORT_SYMBOL_GPL(dm_tm_pre_commit); - -int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) -{ - if (tm->is_clone) - return -EWOULDBLOCK; - - wipe_shadow_table(tm); - - return dm_bm_flush_and_unlock(tm->bm, root); -} -EXPORT_SYMBOL_GPL(dm_tm_commit); - -int dm_tm_new_block(struct dm_transaction_manager *tm, - struct dm_block_validator *v, - struct dm_block **result) -{ - int r; - dm_block_t new_block; - - if (tm->is_clone) - return -EWOULDBLOCK; - - r = dm_sm_new_block(tm->sm, &new_block); - if (r < 0) - return r; - - r = dm_bm_write_lock_zero(tm->bm, new_block, v, result); - if (r < 0) { - dm_sm_dec_block(tm->sm, new_block); - return r; - } - - /* - * New blocks count as shadows in that they don't need to be - * shadowed again. 
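- * They are entered into the shadow table straight away.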
- */ - insert_shadow(tm, new_block); - - return 0; -} - -static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, - struct dm_block **result) -{ - int r; - dm_block_t new; - struct dm_block *orig_block; - - r = dm_sm_new_block(tm->sm, &new); - if (r < 0) - return r; - - r = dm_sm_dec_block(tm->sm, orig); - if (r < 0) - return r; - - r = dm_bm_read_lock(tm->bm, orig, v, &orig_block); - if (r < 0) - return r; - - r = dm_bm_unlock_move(orig_block, new); - if (r < 0) { - dm_bm_unlock(orig_block); - return r; - } - - return dm_bm_write_lock(tm->bm, new, v, result); -} - -int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, struct dm_block **result, - int *inc_children) -{ - int r; - - if (tm->is_clone) - return -EWOULDBLOCK; - - r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children); - if (r < 0) - return r; - - if (is_shadow(tm, orig) && !*inc_children) - return dm_bm_write_lock(tm->bm, orig, v, result); - - r = __shadow_block(tm, orig, v, result); - if (r < 0) - return r; - insert_shadow(tm, dm_block_location(*result)); - - return r; -} - -int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, - struct dm_block_validator *v, - struct dm_block **blk) -{ - if (tm->is_clone) - return dm_bm_read_try_lock(tm->real->bm, b, v, blk); - - return dm_bm_read_lock(tm->bm, b, v, blk); -} - -int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) -{ - return dm_bm_unlock(b); -} -EXPORT_SYMBOL_GPL(dm_tm_unlock); - -void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b) -{ - /* - * The non-blocking clone doesn't support this. - */ - BUG_ON(tm->is_clone); - - dm_sm_inc_block(tm->sm, b); -} -EXPORT_SYMBOL_GPL(dm_tm_inc); - -void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) -{ - /* - * The non-blocking clone doesn't support this. 
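- * A refcount change may block, so calling this on a clone is a caller bug.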
- */ - BUG_ON(tm->is_clone); - - dm_sm_dec_block(tm->sm, b); -} -EXPORT_SYMBOL_GPL(dm_tm_dec); - -int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, - uint32_t *result) -{ - if (tm->is_clone) - return -EWOULDBLOCK; - - return dm_sm_get_count(tm->sm, b, result); -} - -struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) -{ - return tm->bm; -} - -/*----------------------------------------------------------------*/ - -static int dm_tm_create_internal(struct dm_block_manager *bm, - dm_block_t sb_location, - struct dm_block_validator *sb_validator, - size_t root_offset, size_t root_max_len, - struct dm_transaction_manager **tm, - struct dm_space_map **sm, - struct dm_block **sblock, - int create) -{ - int r; - struct dm_space_map *inner; - - inner = dm_sm_metadata_init(); - if (IS_ERR(inner)) - return PTR_ERR(inner); - - *tm = dm_tm_create(bm, inner); - if (IS_ERR(*tm)) { - dm_sm_destroy(inner); - return PTR_ERR(*tm); - } - - if (create) { - r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location, - sb_validator, sblock); - if (r < 0) { - DMERR("couldn't lock superblock"); - goto bad1; - } - - r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm), - sb_location); - if (r) { - DMERR("couldn't create metadata space map"); - goto bad2; - } - - *sm = dm_sm_checker_create(inner); - if (IS_ERR(*sm)) { - r = PTR_ERR(*sm); - goto bad2; - } - - } else { - r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, - sb_validator, sblock); - if (r < 0) { - DMERR("couldn't lock superblock"); - goto bad1; - } - - r = dm_sm_metadata_open(inner, *tm, - dm_block_data(*sblock) + root_offset, - root_max_len); - if (r) { - DMERR("couldn't open metadata space map"); - goto bad2; - } - - *sm = dm_sm_checker_create(inner); - if (IS_ERR(*sm)) { - r = PTR_ERR(*sm); - goto bad2; - } - } - - return 0; - -bad2: - dm_tm_unlock(*tm, *sblock); -bad1: - dm_tm_destroy(*tm); - dm_sm_destroy(inner); - return r; -} - -int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, - struct dm_transaction_manager **tm, - struct dm_space_map **sm, struct dm_block **sblock) -{ - return dm_tm_create_internal(bm, sb_location, sb_validator, - 0, 0, tm, sm, sblock, 1); -} -EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); - -int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, - size_t root_offset, size_t root_max_len, - struct dm_transaction_manager **tm, - struct dm_space_map **sm, struct dm_block **sblock) -{ - return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset, - root_max_len, tm, sm, sblock, 0); -} -EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); - -/*----------------------------------------------------------------*/ diff --git a/ANDROID_3.4.5/drivers/md/persistent-data/dm-transaction-manager.h b/ANDROID_3.4.5/drivers/md/persistent-data/dm-transaction-manager.h deleted file mode 100644 index 6da78487..00000000 --- a/ANDROID_3.4.5/drivers/md/persistent-data/dm-transaction-manager.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef _LINUX_DM_TRANSACTION_MANAGER_H -#define _LINUX_DM_TRANSACTION_MANAGER_H - -#include "dm-block-manager.h" - -struct dm_transaction_manager; -struct dm_space_map; - -/*----------------------------------------------------------------*/ - -/* - * This manages the scope of a transaction. 
It also enforces immutability - * of the on-disk data structures by limiting access to writeable blocks. - * - * Clients should not fiddle with the block manager directly. - */ - -void dm_tm_destroy(struct dm_transaction_manager *tm); - -/* - * The non-blocking version of a transaction manager is intended for use in - * fast path code that needs to do lookups e.g. a dm mapping function. - * You create the non-blocking variant from a normal tm. The interface is - * the same, except that most functions will just return -EWOULDBLOCK. - * Methods that return void yet may block should not be called on a clone - * viz. dm_tm_inc, dm_tm_dec. Call dm_tm_destroy() as you would with a normal - * tm when you've finished with it. You may not destroy the original prior - * to clones. - */ -struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real); - -/* - * We use a 2-phase commit here. - * - * i) In the first phase the block manager is told to start flushing, and - * the changes to the space map are written to disk. You should interrogate - * your particular space map to get detail of its root node etc. to be - * included in your superblock. - * - * ii) @root will be committed last. You shouldn't use more than the - * first 512 bytes of @root if you wish the transaction to survive a power - * failure. You *must* have a write lock held on @root for both stage (i) - * and (ii). The commit will drop the write lock. - */ -int dm_tm_pre_commit(struct dm_transaction_manager *tm); -int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root); - -/* - * These methods are the only way to get hold of a writeable block. - */ - -/* - * dm_tm_new_block() is pretty self-explanatory. Make sure you do actually - * write to the whole of @data before you unlock, otherwise you could get - * a data leak. (The other option is for tm_new_block() to zero new blocks - * before handing them out, which will be redundant in most, if not all, - * cases). - * Zeroes the new block and returns with write lock held. - */ -int dm_tm_new_block(struct dm_transaction_manager *tm, - struct dm_block_validator *v, - struct dm_block **result); - -/* - * dm_tm_shadow_block() allocates a new block and copies the data from @orig - * to it. It then decrements the reference count on original block. Use - * this to update the contents of a block in a data structure, don't - * confuse this with a clone - you shouldn't access the orig block after - * this operation. Because the tm knows the scope of the transaction it - * can optimise requests for a shadow of a shadow to a no-op. Don't forget - * to unlock when you've finished with the shadow. - * - * The @inc_children flag is used to tell the caller whether it needs to - * adjust reference counts for children. (Data in the block may refer to - * other blocks.) - * - * Shadowing implicitly drops a reference on @orig so you must not have - * it locked when you call this. - */ -int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, - struct dm_block_validator *v, - struct dm_block **result, int *inc_children); - -/* - * Read access. You can lock any block you want. If there's a write lock - * on it outstanding then it'll block. - */ -int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, - struct dm_block_validator *v, - struct dm_block **result); - -int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); - -/* - * Functions for altering the reference count of a block directly. 
- */ -void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b); - -void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b); - -int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, - uint32_t *result); - -struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); - -/* - * A little utility that ties the knot by producing a transaction manager - * that has a space map managed by the transaction manager... - * - * Returns a tm that has an open transaction to write the new disk sm. - * Caller should store the new sm root and commit. - */ -int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, - struct dm_transaction_manager **tm, - struct dm_space_map **sm, struct dm_block **sblock); - -int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, - struct dm_block_validator *sb_validator, - size_t root_offset, size_t root_max_len, - struct dm_transaction_manager **tm, - struct dm_space_map **sm, struct dm_block **sblock); - -#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ diff --git a/ANDROID_3.4.5/drivers/md/raid0.c b/ANDROID_3.4.5/drivers/md/raid0.c deleted file mode 100644 index de63a1fc..00000000 --- a/ANDROID_3.4.5/drivers/md/raid0.c +++ /dev/null @@ -1,739 +0,0 @@ -/* - raid0.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - <zyngier@ufr-info-p7.ibp.fr> or - <maz@gloups.fdn.fr> - Copyright (C) 1999, 2000 Ingo Molnar, Red Hat - - - RAID-0 management functions. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
-*/ - -#include <linux/blkdev.h> -#include <linux/seq_file.h> -#include <linux/module.h> -#include <linux/slab.h> -#include "md.h" -#include "raid0.h" -#include "raid5.h" - -static int raid0_congested(void *data, int bits) -{ - struct mddev *mddev = data; - struct r0conf *conf = mddev->private; - struct md_rdev **devlist = conf->devlist; - int raid_disks = conf->strip_zone[0].nb_dev; - int i, ret = 0; - - if (mddev_congested(mddev, bits)) - return 1; - - for (i = 0; i < raid_disks && !ret ; i++) { - struct request_queue *q = bdev_get_queue(devlist[i]->bdev); - - ret |= bdi_congested(&q->backing_dev_info, bits); - } - return ret; -} - -/* - * inform the user of the raid configuration -*/ -static void dump_zones(struct mddev *mddev) -{ - int j, k; - sector_t zone_size = 0; - sector_t zone_start = 0; - char b[BDEVNAME_SIZE]; - struct r0conf *conf = mddev->private; - int raid_disks = conf->strip_zone[0].nb_dev; - printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", - mdname(mddev), - conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); - for (j = 0; j < conf->nr_strip_zones; j++) { - printk(KERN_INFO "md: zone%d=[", j); - for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - printk(KERN_CONT "%s%s", k?"/":"", - bdevname(conf->devlist[j*raid_disks - + k]->bdev, b)); - printk(KERN_CONT "]\n"); - - zone_size = conf->strip_zone[j].zone_end - zone_start; - printk(KERN_INFO " zone-offset=%10lluKB, " - "device-offset=%10lluKB, size=%10lluKB\n", - (unsigned long long)zone_start>>1, - (unsigned long long)conf->strip_zone[j].dev_start>>1, - (unsigned long long)zone_size>>1); - zone_start = conf->strip_zone[j].zone_end; - } - printk(KERN_INFO "\n"); -} - -static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) -{ - int i, c, err; - sector_t curr_zone_end, sectors; - struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev; - struct strip_zone *zone; - int cnt; - char b[BDEVNAME_SIZE]; - char b2[BDEVNAME_SIZE]; - struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - - if (!conf) - return -ENOMEM; - rdev_for_each(rdev1, mddev) { - pr_debug("md/raid0:%s: looking at %s\n", - mdname(mddev), - bdevname(rdev1->bdev, b)); - c = 0; - - /* round size to chunk_size */ - sectors = rdev1->sectors; - sector_div(sectors, mddev->chunk_sectors); - rdev1->sectors = sectors * mddev->chunk_sectors; - - rdev_for_each(rdev2, mddev) { - pr_debug("md/raid0:%s: comparing %s(%llu)" - " with %s(%llu)\n", - mdname(mddev), - bdevname(rdev1->bdev,b), - (unsigned long long)rdev1->sectors, - bdevname(rdev2->bdev,b2), - (unsigned long long)rdev2->sectors); - if (rdev2 == rdev1) { - pr_debug("md/raid0:%s: END\n", - mdname(mddev)); - break; - } - if (rdev2->sectors == rdev1->sectors) { - /* - * Not unique, don't count it as a new - * group - */ - pr_debug("md/raid0:%s: EQUAL\n", - mdname(mddev)); - c = 1; - break; - } - pr_debug("md/raid0:%s: NOT EQUAL\n", - mdname(mddev)); - } - if (!c) { - pr_debug("md/raid0:%s: ==> UNIQUE\n", - mdname(mddev)); - conf->nr_strip_zones++; - pr_debug("md/raid0:%s: %d zones\n", - mdname(mddev), conf->nr_strip_zones); - } - } - pr_debug("md/raid0:%s: FINAL %d zones\n", - mdname(mddev), conf->nr_strip_zones); - err = -ENOMEM; - conf->strip_zone = kzalloc(sizeof(struct strip_zone)* - conf->nr_strip_zones, GFP_KERNEL); - if (!conf->strip_zone) - goto abort; - conf->devlist = kzalloc(sizeof(struct md_rdev*)* - conf->nr_strip_zones*mddev->raid_disks, - GFP_KERNEL); - if (!conf->devlist) - goto abort; - - /* The first zone must contain all devices, so here we check that - * 
there is a proper alignment of slots to devices and find them all - */ - zone = &conf->strip_zone[0]; - cnt = 0; - smallest = NULL; - dev = conf->devlist; - err = -EINVAL; - rdev_for_each(rdev1, mddev) { - int j = rdev1->raid_disk; - - if (mddev->level == 10) { - /* taking over a raid10-n2 array */ - j /= 2; - rdev1->new_raid_disk = j; - } - - if (mddev->level == 1) { - /* taiking over a raid1 array- - * we have only one active disk - */ - j = 0; - rdev1->new_raid_disk = j; - } - - if (j < 0 || j >= mddev->raid_disks) { - printk(KERN_ERR "md/raid0:%s: bad disk number %d - " - "aborting!\n", mdname(mddev), j); - goto abort; - } - if (dev[j]) { - printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " - "aborting!\n", mdname(mddev), j); - goto abort; - } - dev[j] = rdev1; - - disk_stack_limits(mddev->gendisk, rdev1->bdev, - rdev1->data_offset << 9); - - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) - conf->has_merge_bvec = 1; - - if (!smallest || (rdev1->sectors < smallest->sectors)) - smallest = rdev1; - cnt++; - } - if (cnt != mddev->raid_disks) { - printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " - "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); - goto abort; - } - zone->nb_dev = cnt; - zone->zone_end = smallest->sectors * cnt; - - curr_zone_end = zone->zone_end; - - /* now do the other zones */ - for (i = 1; i < conf->nr_strip_zones; i++) - { - int j; - - zone = conf->strip_zone + i; - dev = conf->devlist + i * mddev->raid_disks; - - pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i); - zone->dev_start = smallest->sectors; - smallest = NULL; - c = 0; - - for (j=0; j<cnt; j++) { - rdev = conf->devlist[j]; - if (rdev->sectors <= zone->dev_start) { - pr_debug("md/raid0:%s: checking %s ... nope\n", - mdname(mddev), - bdevname(rdev->bdev, b)); - continue; - } - pr_debug("md/raid0:%s: checking %s ..." 
- " contained as device %d\n", - mdname(mddev), - bdevname(rdev->bdev, b), c); - dev[c] = rdev; - c++; - if (!smallest || rdev->sectors < smallest->sectors) { - smallest = rdev; - pr_debug("md/raid0:%s: (%llu) is smallest!.\n", - mdname(mddev), - (unsigned long long)rdev->sectors); - } - } - - zone->nb_dev = c; - sectors = (smallest->sectors - zone->dev_start) * c; - pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", - mdname(mddev), - zone->nb_dev, (unsigned long long)sectors); - - curr_zone_end += sectors; - zone->zone_end = curr_zone_end; - - pr_debug("md/raid0:%s: current zone start: %llu\n", - mdname(mddev), - (unsigned long long)smallest->sectors); - } - mddev->queue->backing_dev_info.congested_fn = raid0_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - - /* - * now since we have the hard sector sizes, we can make sure - * chunk size is a multiple of that sector size - */ - if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { - printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", - mdname(mddev), - mddev->chunk_sectors << 9); - goto abort; - } - - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - blk_queue_io_opt(mddev->queue, - (mddev->chunk_sectors << 9) * mddev->raid_disks); - - pr_debug("md/raid0:%s: done.\n", mdname(mddev)); - *private_conf = conf; - - return 0; -abort: - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); - *private_conf = NULL; - return err; -} - -/* Find the zone which holds a particular offset - * Update *sectorp to be an offset in that zone - */ -static struct strip_zone *find_zone(struct r0conf *conf, - sector_t *sectorp) -{ - int i; - struct strip_zone *z = conf->strip_zone; - sector_t sector = *sectorp; - - for (i = 0; i < conf->nr_strip_zones; i++) - if (sector < z[i].zone_end) { - if (i) - *sectorp = sector - z[i-1].zone_end; - return z + i; - } - BUG(); -} - -/* - * remaps the bio to the target device. we separate two flows. - * power 2 flow and a general flow for the sake of perfromance -*/ -static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, - sector_t sector, sector_t *sector_offset) -{ - unsigned int sect_in_chunk; - sector_t chunk; - struct r0conf *conf = mddev->private; - int raid_disks = conf->strip_zone[0].nb_dev; - unsigned int chunk_sects = mddev->chunk_sectors; - - if (is_power_of_2(chunk_sects)) { - int chunksect_bits = ffz(~chunk_sects); - /* find the sector offset inside the chunk */ - sect_in_chunk = sector & (chunk_sects - 1); - sector >>= chunksect_bits; - /* chunk in zone */ - chunk = *sector_offset; - /* quotient is the chunk in real device*/ - sector_div(chunk, zone->nb_dev << chunksect_bits); - } else{ - sect_in_chunk = sector_div(sector, chunk_sects); - chunk = *sector_offset; - sector_div(chunk, chunk_sects * zone->nb_dev); - } - /* - * position the bio over the real device - * real sector = chunk in device + starting of zone - * + the position in the chunk - */ - *sector_offset = (chunk * chunk_sects) + sect_in_chunk; - return conf->devlist[(zone - conf->strip_zone)*raid_disks - + sector_div(sector, zone->nb_dev)]; -} - -/** - * raid0_mergeable_bvec -- tell bio layer if two requests can be merged - * @q: request queue - * @bvm: properties of new bio - * @biovec: the request that could be merged to it. 
- * - * Return amount of bytes we can accept at this offset - */ -static int raid0_mergeable_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mddev *mddev = q->queuedata; - struct r0conf *conf = mddev->private; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - sector_t sector_offset = sector; - int max; - unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bvm->bi_size >> 9; - struct strip_zone *zone; - struct md_rdev *rdev; - struct request_queue *subq; - - if (is_power_of_2(chunk_sectors)) - max = (chunk_sectors - ((sector & (chunk_sectors-1)) - + bio_sectors)) << 9; - else - max = (chunk_sectors - (sector_div(sector, chunk_sectors) - + bio_sectors)) << 9; - if (max < 0) - max = 0; /* bio_add cannot handle a negative return */ - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - if (max < biovec->bv_len) - /* too small already, no need to check further */ - return max; - if (!conf->has_merge_bvec) - return max; - - /* May need to check subordinate device */ - sector = sector_offset; - zone = find_zone(mddev->private, §or_offset); - rdev = map_sector(mddev, zone, sector, §or_offset); - subq = bdev_get_queue(rdev->bdev); - if (subq->merge_bvec_fn) { - bvm->bi_bdev = rdev->bdev; - bvm->bi_sector = sector_offset + zone->dev_start + - rdev->data_offset; - return min(max, subq->merge_bvec_fn(subq, bvm, biovec)); - } else - return max; -} - -static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - sector_t array_sectors = 0; - struct md_rdev *rdev; - - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - - rdev_for_each(rdev, mddev) - array_sectors += rdev->sectors; - - return array_sectors; -} - -static int raid0_stop(struct mddev *mddev); - -static int raid0_run(struct mddev *mddev) -{ - struct r0conf *conf; - int ret; - - if (mddev->chunk_sectors == 0) { - printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", - mdname(mddev)); - return -EINVAL; - } - if (md_check_no_bitmap(mddev)) - return -EINVAL; - blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); - - /* if private is not null, we are here after takeover */ - if (mddev->private == NULL) { - ret = create_strip_zones(mddev, &conf); - if (ret < 0) - return ret; - mddev->private = conf; - } - conf = mddev->private; - - /* calculate array device size */ - md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); - - printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", - mdname(mddev), - (unsigned long long)mddev->array_sectors); - /* calculate the max read-ahead size. - * For read-ahead of large files to be effective, we need to - * readahead at least twice a whole stripe. i.e. number of devices - * multiplied by chunk size times 2. - * If an individual device has an ra_pages greater than the - * chunk size, then we will not drive that device as hard as it - * wants. We consider this a configuration error: a larger - * chunksize should be used in that case. 
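- * (For example, 4 disks with 512KB chunks want at least 4MB of read-ahead.)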
- */ - { - int stripe = mddev->raid_disks * - (mddev->chunk_sectors << 9) / PAGE_SIZE; - if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) - mddev->queue->backing_dev_info.ra_pages = 2* stripe; - } - - blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); - dump_zones(mddev); - - ret = md_integrity_register(mddev); - if (ret) - raid0_stop(mddev); - - return ret; -} - -static int raid0_stop(struct mddev *mddev) -{ - struct r0conf *conf = mddev->private; - - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); - mddev->private = NULL; - return 0; -} - -/* - * Is io distribute over 1 or more chunks ? -*/ -static inline int is_io_in_chunk_boundary(struct mddev *mddev, - unsigned int chunk_sects, struct bio *bio) -{ - if (likely(is_power_of_2(chunk_sects))) { - return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) - + (bio->bi_size >> 9)); - } else{ - sector_t sector = bio->bi_sector; - return chunk_sects >= (sector_div(sector, chunk_sects) - + (bio->bi_size >> 9)); - } -} - -static void raid0_make_request(struct mddev *mddev, struct bio *bio) -{ - unsigned int chunk_sects; - sector_t sector_offset; - struct strip_zone *zone; - struct md_rdev *tmp_dev; - - if (unlikely(bio->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bio); - return; - } - - chunk_sects = mddev->chunk_sectors; - if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { - sector_t sector = bio->bi_sector; - struct bio_pair *bp; - /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || - bio->bi_idx != 0) - goto bad_map; - /* This is a one page bio that upper layers - * refuse to split for us, so we need to split it. - */ - if (likely(is_power_of_2(chunk_sects))) - bp = bio_split(bio, chunk_sects - (sector & - (chunk_sects-1))); - else - bp = bio_split(bio, chunk_sects - - sector_div(sector, chunk_sects)); - raid0_make_request(mddev, &bp->bio1); - raid0_make_request(mddev, &bp->bio2); - bio_pair_release(bp); - return; - } - - sector_offset = bio->bi_sector; - zone = find_zone(mddev->private, §or_offset); - tmp_dev = map_sector(mddev, zone, bio->bi_sector, - §or_offset); - bio->bi_bdev = tmp_dev->bdev; - bio->bi_sector = sector_offset + zone->dev_start + - tmp_dev->data_offset; - - generic_make_request(bio); - return; - -bad_map: - printk("md/raid0:%s: make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", - mdname(mddev), chunk_sects / 2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); - - bio_io_error(bio); - return; -} - -static void raid0_status(struct seq_file *seq, struct mddev *mddev) -{ - seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); - return; -} - -static void *raid0_takeover_raid45(struct mddev *mddev) -{ - struct md_rdev *rdev; - struct r0conf *priv_conf; - - if (mddev->degraded != 1) { - printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! 
Degraded disks: %d\n", - mdname(mddev), - mddev->degraded); - return ERR_PTR(-EINVAL); - } - - rdev_for_each(rdev, mddev) { - /* check slot number for a disk */ - if (rdev->raid_disk == mddev->raid_disks-1) { - printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", - mdname(mddev)); - return ERR_PTR(-EINVAL); - } - } - - /* Set new parameters */ - mddev->new_level = 0; - mddev->new_layout = 0; - mddev->new_chunk_sectors = mddev->chunk_sectors; - mddev->raid_disks--; - mddev->delta_disks = -1; - /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; - - create_strip_zones(mddev, &priv_conf); - return priv_conf; -} - -static void *raid0_takeover_raid10(struct mddev *mddev) -{ - struct r0conf *priv_conf; - - /* Check layout: - * - far_copies must be 1 - * - near_copies must be 2 - * - disks number must be even - * - all mirrors must be already degraded - */ - if (mddev->layout != ((1 << 8) + 2)) { - printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", - mdname(mddev), - mddev->layout); - return ERR_PTR(-EINVAL); - } - if (mddev->raid_disks & 1) { - printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", - mdname(mddev)); - return ERR_PTR(-EINVAL); - } - if (mddev->degraded != (mddev->raid_disks>>1)) { - printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", - mdname(mddev)); - return ERR_PTR(-EINVAL); - } - - /* Set new parameters */ - mddev->new_level = 0; - mddev->new_layout = 0; - mddev->new_chunk_sectors = mddev->chunk_sectors; - mddev->delta_disks = - mddev->raid_disks / 2; - mddev->raid_disks += mddev->delta_disks; - mddev->degraded = 0; - /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; - - create_strip_zones(mddev, &priv_conf); - return priv_conf; -} - -static void *raid0_takeover_raid1(struct mddev *mddev) -{ - struct r0conf *priv_conf; - int chunksect; - - /* Check layout: - * - (N - 1) mirror drives must be already faulty - */ - if ((mddev->raid_disks - 1) != mddev->degraded) { - printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", - mdname(mddev)); - return ERR_PTR(-EINVAL); - } - - /* - * a raid1 doesn't have the notion of chunk size, so - * figure out the largest suitable size we can use. - */ - chunksect = 64 * 2; /* 64K by default */ - - /* The array must be an exact multiple of chunksize */ - while (chunksect && (mddev->array_sectors & (chunksect - 1))) - chunksect >>= 1; - - if ((chunksect << 9) < PAGE_SIZE) - /* array size does not allow a suitable chunk size */ - return ERR_PTR(-EINVAL); - - /* Set new parameters */ - mddev->new_level = 0; - mddev->new_layout = 0; - mddev->new_chunk_sectors = chunksect; - mddev->chunk_sectors = chunksect; - mddev->delta_disks = 1 - mddev->raid_disks; - mddev->raid_disks = 1; - /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; - - create_strip_zones(mddev, &priv_conf); - return priv_conf; -} - -static void *raid0_takeover(struct mddev *mddev) -{ - /* raid0 can take over: - * raid4 - if all data disks are active. 
- * raid5 - providing it is Raid4 layout and one disk is faulty - * raid10 - assuming we have all necessary active disks - * raid1 - with (N -1) mirror drives faulty - */ - if (mddev->level == 4) - return raid0_takeover_raid45(mddev); - - if (mddev->level == 5) { - if (mddev->layout == ALGORITHM_PARITY_N) - return raid0_takeover_raid45(mddev); - - printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", - mdname(mddev), ALGORITHM_PARITY_N); - } - - if (mddev->level == 10) - return raid0_takeover_raid10(mddev); - - if (mddev->level == 1) - return raid0_takeover_raid1(mddev); - - printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", - mddev->level); - - return ERR_PTR(-EINVAL); -} - -static void raid0_quiesce(struct mddev *mddev, int state) -{ -} - -static struct md_personality raid0_personality= -{ - .name = "raid0", - .level = 0, - .owner = THIS_MODULE, - .make_request = raid0_make_request, - .run = raid0_run, - .stop = raid0_stop, - .status = raid0_status, - .size = raid0_size, - .takeover = raid0_takeover, - .quiesce = raid0_quiesce, -}; - -static int __init raid0_init (void) -{ - return register_md_personality (&raid0_personality); -} - -static void raid0_exit (void) -{ - unregister_md_personality (&raid0_personality); -} - -module_init(raid0_init); -module_exit(raid0_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("RAID0 (striping) personality for MD"); -MODULE_ALIAS("md-personality-2"); /* RAID0 */ -MODULE_ALIAS("md-raid0"); -MODULE_ALIAS("md-level-0"); diff --git a/ANDROID_3.4.5/drivers/md/raid0.h b/ANDROID_3.4.5/drivers/md/raid0.h deleted file mode 100644 index 05539d9c..00000000 --- a/ANDROID_3.4.5/drivers/md/raid0.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -struct strip_zone { - sector_t zone_end; /* Start of the next zone (in sectors) */ - sector_t dev_start; /* Zone offset in real dev (in sectors) */ - int nb_dev; /* # of devices attached to the zone */ -}; - -struct r0conf { - struct strip_zone *strip_zone; - struct md_rdev **devlist; /* lists of rdevs, pointed to - * by strip_zone->dev */ - int nr_strip_zones; - int has_merge_bvec; /* at least one member has - * a merge_bvec_fn */ -}; - -#endif diff --git a/ANDROID_3.4.5/drivers/md/raid1.c b/ANDROID_3.4.5/drivers/md/raid1.c deleted file mode 100644 index d7e95772..00000000 --- a/ANDROID_3.4.5/drivers/md/raid1.c +++ /dev/null @@ -1,2953 +0,0 @@ -/* - * raid1.c : Multiple Devices driver for Linux - * - * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat - * - * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * - * RAID-1 management functions. - * - * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 - * - * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk> - * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> - * - * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support - * bitmapped intelligence in resync: - * - * - bitmap marked during normal i/o - * - bitmap used to skip nondirty blocks during sync - * - * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology: - * - persistent bitmap code - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. 
- * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/slab.h> -#include <linux/delay.h> -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/seq_file.h> -#include <linux/ratelimit.h> -#include "md.h" -#include "raid1.h" -#include "bitmap.h" - -/* - * Number of guaranteed r1bios in case of extreme VM load: - */ -#define NR_RAID1_BIOS 256 - -/* When there are this many requests queue to be written by - * the raid1 thread, we become 'congested' to provide back-pressure - * for writeback. - */ -static int max_queued_requests = 1024; - -static void allow_barrier(struct r1conf *conf); -static void lower_barrier(struct r1conf *conf); - -static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) -{ - struct pool_info *pi = data; - int size = offsetof(struct r1bio, bios[pi->raid_disks]); - - /* allocate a r1bio with room for raid_disks entries in the bios array */ - return kzalloc(size, gfp_flags); -} - -static void r1bio_pool_free(void *r1_bio, void *data) -{ - kfree(r1_bio); -} - -#define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE -#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) - -static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) -{ - struct pool_info *pi = data; - struct page *page; - struct r1bio *r1_bio; - struct bio *bio; - int i, j; - - r1_bio = r1bio_pool_alloc(gfp_flags, pi); - if (!r1_bio) - return NULL; - - /* - * Allocate bios : 1 for reading, n-1 for writing - */ - for (j = pi->raid_disks ; j-- ; ) { - bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); - if (!bio) - goto out_free_bio; - r1_bio->bios[j] = bio; - } - /* - * Allocate RESYNC_PAGES data pages and attach them to - * the first bio. - * If this is a user-requested check/repair, allocate - * RESYNC_PAGES for each bio. 
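- * That way the copies read from each disk can be compared; otherwise
- * the pages are shared between all the bios.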
- */ - if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) - j = pi->raid_disks; - else - j = 1; - while(j--) { - bio = r1_bio->bios[j]; - for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); - if (unlikely(!page)) - goto out_free_pages; - - bio->bi_io_vec[i].bv_page = page; - bio->bi_vcnt = i+1; - } - } - /* If not user-requests, copy the page pointers to all bios */ - if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { - for (i=0; i<RESYNC_PAGES ; i++) - for (j=1; j<pi->raid_disks; j++) - r1_bio->bios[j]->bi_io_vec[i].bv_page = - r1_bio->bios[0]->bi_io_vec[i].bv_page; - } - - r1_bio->master_bio = NULL; - - return r1_bio; - -out_free_pages: - for (j=0 ; j < pi->raid_disks; j++) - for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++) - put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); - j = -1; -out_free_bio: - while (++j < pi->raid_disks) - bio_put(r1_bio->bios[j]); - r1bio_pool_free(r1_bio, data); - return NULL; -} - -static void r1buf_pool_free(void *__r1_bio, void *data) -{ - struct pool_info *pi = data; - int i,j; - struct r1bio *r1bio = __r1_bio; - - for (i = 0; i < RESYNC_PAGES; i++) - for (j = pi->raid_disks; j-- ;) { - if (j == 0 || - r1bio->bios[j]->bi_io_vec[i].bv_page != - r1bio->bios[0]->bi_io_vec[i].bv_page) - safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); - } - for (i=0 ; i < pi->raid_disks; i++) - bio_put(r1bio->bios[i]); - - r1bio_pool_free(r1bio, data); -} - -static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) -{ - int i; - - for (i = 0; i < conf->raid_disks * 2; i++) { - struct bio **bio = r1_bio->bios + i; - if (!BIO_SPECIAL(*bio)) - bio_put(*bio); - *bio = NULL; - } -} - -static void free_r1bio(struct r1bio *r1_bio) -{ - struct r1conf *conf = r1_bio->mddev->private; - - put_all_bios(conf, r1_bio); - mempool_free(r1_bio, conf->r1bio_pool); -} - -static void put_buf(struct r1bio *r1_bio) -{ - struct r1conf *conf = r1_bio->mddev->private; - int i; - - for (i = 0; i < conf->raid_disks * 2; i++) { - struct bio *bio = r1_bio->bios[i]; - if (bio->bi_end_io) - rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); - } - - mempool_free(r1_bio, conf->r1buf_pool); - - lower_barrier(conf); -} - -static void reschedule_retry(struct r1bio *r1_bio) -{ - unsigned long flags; - struct mddev *mddev = r1_bio->mddev; - struct r1conf *conf = mddev->private; - - spin_lock_irqsave(&conf->device_lock, flags); - list_add(&r1_bio->retry_list, &conf->retry_list); - conf->nr_queued ++; - spin_unlock_irqrestore(&conf->device_lock, flags); - - wake_up(&conf->wait_barrier); - md_wakeup_thread(mddev->thread); -} - -/* - * raid_end_bio_io() is called when we have finished servicing a mirrored - * operation and are ready to return a success/failure code to the buffer - * cache layer. - */ -static void call_bio_endio(struct r1bio *r1_bio) -{ - struct bio *bio = r1_bio->master_bio; - int done; - struct r1conf *conf = r1_bio->mddev->private; - - if (bio->bi_phys_segments) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - bio->bi_phys_segments--; - done = (bio->bi_phys_segments == 0); - spin_unlock_irqrestore(&conf->device_lock, flags); - } else - done = 1; - - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (done) { - bio_endio(bio, 0); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. 
- */ - allow_barrier(conf); - } -} - -static void raid_end_bio_io(struct r1bio *r1_bio) -{ - struct bio *bio = r1_bio->master_bio; - - /* if nobody has done the final endio yet, do it now */ - if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { - pr_debug("raid1: sync end %s on sectors %llu-%llu\n", - (bio_data_dir(bio) == WRITE) ? "write" : "read", - (unsigned long long) bio->bi_sector, - (unsigned long long) bio->bi_sector + - (bio->bi_size >> 9) - 1); - - call_bio_endio(r1_bio); - } - free_r1bio(r1_bio); -} - -/* - * Update disk head position estimator based on IRQ completion info. - */ -static inline void update_head_pos(int disk, struct r1bio *r1_bio) -{ - struct r1conf *conf = r1_bio->mddev->private; - - conf->mirrors[disk].head_position = - r1_bio->sector + (r1_bio->sectors); -} - -/* - * Find the disk number which triggered given bio - */ -static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) -{ - int mirror; - struct r1conf *conf = r1_bio->mddev->private; - int raid_disks = conf->raid_disks; - - for (mirror = 0; mirror < raid_disks * 2; mirror++) - if (r1_bio->bios[mirror] == bio) - break; - - BUG_ON(mirror == raid_disks * 2); - update_head_pos(mirror, r1_bio); - - return mirror; -} - -static void raid1_end_read_request(struct bio *bio, int error) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct r1bio *r1_bio = bio->bi_private; - int mirror; - struct r1conf *conf = r1_bio->mddev->private; - - mirror = r1_bio->read_disk; - /* - * this branch is our 'one mirror IO has finished' event handler: - */ - update_head_pos(mirror, r1_bio); - - if (uptodate) - set_bit(R1BIO_Uptodate, &r1_bio->state); - else { - /* If all other devices have failed, we want to return - * the error upwards rather than fail the last device. 
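- * (Failing it would take the whole array down.)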
- * Here we redefine "uptodate" to mean "Don't want to retry" - */ - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - if (r1_bio->mddev->degraded == conf->raid_disks || - (r1_bio->mddev->degraded == conf->raid_disks-1 && - !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))) - uptodate = 1; - spin_unlock_irqrestore(&conf->device_lock, flags); - } - - if (uptodate) - raid_end_bio_io(r1_bio); - else { - /* - * oops, read error: - */ - char b[BDEVNAME_SIZE]; - printk_ratelimited( - KERN_ERR "md/raid1:%s: %s: " - "rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(conf->mirrors[mirror].rdev->bdev, - b), - (unsigned long long)r1_bio->sector); - set_bit(R1BIO_ReadError, &r1_bio->state); - reschedule_retry(r1_bio); - } - - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); -} - -static void close_write(struct r1bio *r1_bio) -{ - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = r1_bio->behind_page_count; - while (i--) - safe_put_page(r1_bio->behind_bvecs[i].bv_page); - kfree(r1_bio->behind_bvecs); - r1_bio->behind_bvecs = NULL; - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); - md_write_end(r1_bio->mddev); -} - -static void r1_bio_write_done(struct r1bio *r1_bio) -{ - if (!atomic_dec_and_test(&r1_bio->remaining)) - return; - - if (test_bit(R1BIO_WriteError, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - close_write(r1_bio); - if (test_bit(R1BIO_MadeGood, &r1_bio->state)) - reschedule_retry(r1_bio); - else - raid_end_bio_io(r1_bio); - } -} - -static void raid1_end_write_request(struct bio *bio, int error) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct r1bio *r1_bio = bio->bi_private; - int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); - struct r1conf *conf = r1_bio->mddev->private; - struct bio *to_put = NULL; - - mirror = find_bio_disk(r1_bio, bio); - - /* - * 'one mirror IO has finished' event handler: - */ - if (!uptodate) { - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - if (!test_and_set_bit(WantReplacement, - &conf->mirrors[mirror].rdev->flags)) - set_bit(MD_RECOVERY_NEEDED, & - conf->mddev->recovery); - - set_bit(R1BIO_WriteError, &r1_bio->state); - } else { - /* - * Set R1BIO_Uptodate in our master bio, so that we - * will return a good error code for to the higher - * levels even if IO on some other mirrored buffer - * fails. - * - * The 'master' represents the composite IO operation - * to user-side. So if something waits for IO, then it - * will wait for the 'master' bio. - */ - sector_t first_bad; - int bad_sectors; - - r1_bio->bios[mirror] = NULL; - to_put = bio; - set_bit(R1BIO_Uptodate, &r1_bio->state); - - /* Maybe we can clear some bad blocks. */ - if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors)) { - r1_bio->bios[mirror] = IO_MADE_GOOD; - set_bit(R1BIO_MadeGood, &r1_bio->state); - } - } - - if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) - atomic_dec(&r1_bio->behind_remaining); - - /* - * In behind mode, we ACK the master bio once the I/O - * has safely reached all non-writemostly - * disks. 
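- * (The write-mostly devices are allowed to lag behind.)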
Setting the Returned bit ensures that this - * gets done only once -- we don't ever want to return - * -EIO here, instead we'll wait - */ - if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && - test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* Maybe we can return now */ - if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { - struct bio *mbio = r1_bio->master_bio; - pr_debug("raid1: behind end write sectors" - " %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - (mbio->bi_size >> 9) - 1); - call_bio_endio(r1_bio); - } - } - } - if (r1_bio->bios[mirror] == NULL) - rdev_dec_pending(conf->mirrors[mirror].rdev, - conf->mddev); - - /* - * Let's see if all mirrored write operations have finished - * already. - */ - r1_bio_write_done(r1_bio); - - if (to_put) - bio_put(to_put); -} - - -/* - * This routine returns the disk from which the requested read should - * be done. There is a per-array 'next expected sequential IO' sector - * number - if this matches on the next IO then we use the last disk. - * There is also a per-disk 'last know head position' sector that is - * maintained from IRQ contexts, both the normal and the resync IO - * completion handlers update this position correctly. If there is no - * perfect sequential match then we pick the disk whose head is closest. - * - * If there are 2 mirrors in the same 2 devices, performance degrades - * because position is mirror, not device based. - * - * The rdev for the device selected will have nr_pending incremented. - */ -static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) -{ - const sector_t this_sector = r1_bio->sector; - int sectors; - int best_good_sectors; - int start_disk; - int best_disk; - int i; - sector_t best_dist; - struct md_rdev *rdev; - int choose_first; - - rcu_read_lock(); - /* - * Check if we can balance. We can balance on the whole - * device if no resync is going on, or below the resync window. - * We take the first readable disk when above the resync window. - */ - retry: - sectors = r1_bio->sectors; - best_disk = -1; - best_dist = MaxSector; - best_good_sectors = 0; - - if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) { - choose_first = 1; - start_disk = 0; - } else { - choose_first = 0; - start_disk = conf->last_used; - } - - for (i = 0 ; i < conf->raid_disks * 2 ; i++) { - sector_t dist; - sector_t first_bad; - int bad_sectors; - - int disk = start_disk + i; - if (disk >= conf->raid_disks) - disk -= conf->raid_disks; - - rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (r1_bio->bios[disk] == IO_BLOCKED - || rdev == NULL - || test_bit(Unmerged, &rdev->flags) - || test_bit(Faulty, &rdev->flags)) - continue; - if (!test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < this_sector + sectors) - continue; - if (test_bit(WriteMostly, &rdev->flags)) { - /* Don't balance among write-mostly, just - * use the first as a last resort */ - if (best_disk < 0) { - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (first_bad < this_sector) - /* Cannot use this */ - continue; - best_good_sectors = first_bad - this_sector; - } else - best_good_sectors = sectors; - best_disk = disk; - } - continue; - } - /* This is a reasonable device to use. It might - * even be best. 
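The selection loop in read_balance() above reduces to a simple heuristic: take a sequential or idle disk immediately, otherwise remember the disk whose last known head position is closest to the requested sector. A stripped-down userspace model of just that heuristic (bad blocks, write-mostly handling and the resync window are deliberately omitted; all names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct disk_model {
	long long head_position;   /* last known head position */
	int nr_pending;            /* outstanding I/O on this disk */
	int usable;                /* stands in for In_sync && !Faulty */
};

static int pick_read_disk(const struct disk_model *d, int ndisks,
			  long long this_sector, long long next_seq_sect)
{
	int best = -1;
	long long best_dist = -1;

	for (int i = 0; i < ndisks; i++) {
		if (!d[i].usable)
			continue;
		long long dist = llabs(this_sector - d[i].head_position);
		/* sequential continuation or an idle disk: take it now */
		if (next_seq_sect == this_sector || dist == 0 ||
		    d[i].nr_pending == 0)
			return i;
		if (best < 0 || dist < best_dist) {
			best_dist = dist;
			best = i;
		}
	}
	return best;               /* -1 means nothing readable */
}

int main(void)
{
	struct disk_model disks[2] = {
		{ .head_position = 5000, .nr_pending = 3, .usable = 1 },
		{ .head_position = 1200, .nr_pending = 2, .usable = 1 },
	};

	printf("read from disk %d\n", pick_read_disk(disks, 2, 1000, 0));
	return 0;
}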
- */ - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (best_dist < MaxSector) - /* already have a better device */ - continue; - if (first_bad <= this_sector) { - /* cannot read here. If this is the 'primary' - * device, then we must not read beyond - * bad_sectors from another device.. - */ - bad_sectors -= (this_sector - first_bad); - if (choose_first && sectors > bad_sectors) - sectors = bad_sectors; - if (best_good_sectors > sectors) - best_good_sectors = sectors; - - } else { - sector_t good_sectors = first_bad - this_sector; - if (good_sectors > best_good_sectors) { - best_good_sectors = good_sectors; - best_disk = disk; - } - if (choose_first) - break; - } - continue; - } else - best_good_sectors = sectors; - - dist = abs(this_sector - conf->mirrors[disk].head_position); - if (choose_first - /* Don't change to another disk for sequential reads */ - || conf->next_seq_sect == this_sector - || dist == 0 - /* If device is idle, use it */ - || atomic_read(&rdev->nr_pending) == 0) { - best_disk = disk; - break; - } - if (dist < best_dist) { - best_dist = dist; - best_disk = disk; - } - } - - if (best_disk >= 0) { - rdev = rcu_dereference(conf->mirrors[best_disk].rdev); - if (!rdev) - goto retry; - atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { - /* cannot risk returning a device that failed - * before we inc'ed nr_pending - */ - rdev_dec_pending(rdev, conf->mddev); - goto retry; - } - sectors = best_good_sectors; - conf->next_seq_sect = this_sector + sectors; - conf->last_used = best_disk; - } - rcu_read_unlock(); - *max_sectors = sectors; - - return best_disk; -} - -static int raid1_mergeable_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mddev *mddev = q->queuedata; - struct r1conf *conf = mddev->private; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max = biovec->bv_len; - - if (mddev->merge_check_needed) { - int disk; - rcu_read_lock(); - for (disk = 0; disk < conf->raid_disks * 2; disk++) { - struct md_rdev *rdev = rcu_dereference( - conf->mirrors[disk].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = - bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) { - bvm->bi_sector = sector + - rdev->data_offset; - bvm->bi_bdev = rdev->bdev; - max = min(max, q->merge_bvec_fn( - q, bvm, biovec)); - } - } - } - rcu_read_unlock(); - } - return max; - -} - -int md_raid1_congested(struct mddev *mddev, int bits) -{ - struct r1conf *conf = mddev->private; - int i, ret = 0; - - if ((bits & (1 << BDI_async_congested)) && - conf->pending_count >= max_queued_requests) - return 1; - - rcu_read_lock(); - for (i = 0; i < conf->raid_disks * 2; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = bdev_get_queue(rdev->bdev); - - BUG_ON(!q); - - /* Note the '|| 1' - when read_balance prefers - * non-congested targets, it can be removed - */ - if ((bits & (1<<BDI_async_congested)) || 1) - ret |= bdi_congested(&q->backing_dev_info, bits); - else - ret &= bdi_congested(&q->backing_dev_info, bits); - } - } - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(md_raid1_congested); - -static int raid1_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid1_congested(mddev, bits); -} - -static void flush_pending_writes(struct r1conf *conf) -{ - /* Any writes that have been queued but are 
awaiting - * bitmap updates get flushed here. - */ - spin_lock_irq(&conf->device_lock); - - if (conf->pending_bio_list.head) { - struct bio *bio; - bio = bio_list_get(&conf->pending_bio_list); - conf->pending_count = 0; - spin_unlock_irq(&conf->device_lock); - /* flush any pending bitmap writes to - * disk before proceeding w/ I/O */ - bitmap_unplug(conf->mddev->bitmap); - wake_up(&conf->wait_barrier); - - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - } else - spin_unlock_irq(&conf->device_lock); -} - -/* Barriers.... - * Sometimes we need to suspend IO while we do something else, - * either some resync/recovery, or reconfigure the array. - * To do this we raise a 'barrier'. - * The 'barrier' is a counter that can be raised multiple times - * to count how many activities are happening which preclude - * normal IO. - * We can only raise the barrier if there is no pending IO. - * i.e. if nr_pending == 0. - * We choose only to raise the barrier if no-one is waiting for the - * barrier to go down. This means that as soon as an IO request - * is ready, no other operations which require a barrier will start - * until the IO request has had a chance. - * - * So: regular IO calls 'wait_barrier'. When that returns there - * is no backgroup IO happening, It must arrange to call - * allow_barrier when it has finished its IO. - * backgroup IO calls must call raise_barrier. Once that returns - * there is no normal IO happeing. It must arrange to call - * lower_barrier when the particular background IO completes. - */ -#define RESYNC_DEPTH 32 - -static void raise_barrier(struct r1conf *conf) -{ - spin_lock_irq(&conf->resync_lock); - - /* Wait until no block IO is waiting */ - wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, - conf->resync_lock, ); - - /* block any new IO from starting */ - conf->barrier++; - - /* Now wait for all pending IO to complete */ - wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, ); - - spin_unlock_irq(&conf->resync_lock); -} - -static void lower_barrier(struct r1conf *conf) -{ - unsigned long flags; - BUG_ON(conf->barrier <= 0); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - spin_unlock_irqrestore(&conf->resync_lock, flags); - wake_up(&conf->wait_barrier); -} - -static void wait_barrier(struct r1conf *conf) -{ - spin_lock_irq(&conf->resync_lock); - if (conf->barrier) { - conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * pre-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to get the nr_pending - * count down. - */ - wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (conf->nr_pending && - current->bio_list && - !bio_list_empty(current->bio_list)), - conf->resync_lock, - ); - conf->nr_waiting--; - } - conf->nr_pending++; - spin_unlock_irq(&conf->resync_lock); -} - -static void allow_barrier(struct r1conf *conf) -{ - unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; - spin_unlock_irqrestore(&conf->resync_lock, flags); - wake_up(&conf->wait_barrier); -} - -static void freeze_array(struct r1conf *conf) -{ - /* stop syncio and normal IO and wait for everything to - * go quite. 
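The barrier scheme described above is essentially three counters behind one lock: 'barrier' counts resync-style activities that exclude normal I/O, 'nr_pending' counts normal I/O in flight, and 'nr_waiting' counts normal I/O parked behind a barrier. A rough pthread-based model of the four entry points; it drops the RESYNC_DEPTH cap and the current->bio_list escape hatch, and the field names only mimic struct r1conf:

#include <pthread.h>
#include <stdio.h>

struct barrier_model {
	pthread_mutex_t lock;      /* plays the role of resync_lock */
	pthread_cond_t  wait;      /* plays the role of wait_barrier */
	int barrier;
	int nr_pending;
	int nr_waiting;
};

void model_raise_barrier(struct barrier_model *c)        /* resync entry */
{
	pthread_mutex_lock(&c->lock);
	while (c->nr_waiting)              /* wait until no normal IO waits */
		pthread_cond_wait(&c->wait, &c->lock);
	c->barrier++;                      /* block any new normal IO */
	while (c->nr_pending)              /* let pending IO drain */
		pthread_cond_wait(&c->wait, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

void model_lower_barrier(struct barrier_model *c)        /* resync exit */
{
	pthread_mutex_lock(&c->lock);
	c->barrier--;
	pthread_mutex_unlock(&c->lock);
	pthread_cond_broadcast(&c->wait);
}

void model_wait_barrier(struct barrier_model *c)         /* normal IO entry */
{
	pthread_mutex_lock(&c->lock);
	if (c->barrier) {
		c->nr_waiting++;
		while (c->barrier)
			pthread_cond_wait(&c->wait, &c->lock);
		c->nr_waiting--;
	}
	c->nr_pending++;
	pthread_mutex_unlock(&c->lock);
	pthread_cond_broadcast(&c->wait);  /* let a raiser re-check nr_waiting */
}

void model_allow_barrier(struct barrier_model *c)        /* normal IO exit */
{
	pthread_mutex_lock(&c->lock);
	c->nr_pending--;
	pthread_mutex_unlock(&c->lock);
	pthread_cond_broadcast(&c->wait);
}

int main(void)
{
	struct barrier_model c = { PTHREAD_MUTEX_INITIALIZER,
				   PTHREAD_COND_INITIALIZER, 0, 0, 0 };

	model_wait_barrier(&c);   /* normal IO */
	model_allow_barrier(&c);
	model_raise_barrier(&c);  /* resync */
	model_lower_barrier(&c);
	printf("barrier=%d pending=%d waiting=%d\n",
	       c.barrier, c.nr_pending, c.nr_waiting);
	return 0;
}

freeze_array(), defined just below, layers one more condition on the same counters: it waits until nr_pending matches nr_queued+1, so everything else is either finished or queued for retry.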
- * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 - * This is called in the context of one normal IO request - * that has failed. Thus any sync request that might be pending - * will be blocked by nr_pending, and we need to wait for - * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) - * must match the number of pending IOs (nr_pending) before - * we continue. - */ - spin_lock_irq(&conf->resync_lock); - conf->barrier++; - conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, - conf->resync_lock, - flush_pending_writes(conf)); - spin_unlock_irq(&conf->resync_lock); -} -static void unfreeze_array(struct r1conf *conf) -{ - /* reverse the effect of the freeze */ - spin_lock_irq(&conf->resync_lock); - conf->barrier--; - conf->nr_waiting--; - wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); -} - - -/* duplicate the data pages for behind I/O - */ -static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) -{ - int i; - struct bio_vec *bvec; - struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), - GFP_NOIO); - if (unlikely(!bvecs)) - return; - - bio_for_each_segment(bvec, bio, i) { - bvecs[i] = *bvec; - bvecs[i].bv_page = alloc_page(GFP_NOIO); - if (unlikely(!bvecs[i].bv_page)) - goto do_sync_io; - memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, - kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); - kunmap(bvecs[i].bv_page); - kunmap(bvec->bv_page); - } - r1_bio->behind_bvecs = bvecs; - r1_bio->behind_page_count = bio->bi_vcnt; - set_bit(R1BIO_BehindIO, &r1_bio->state); - return; - -do_sync_io: - for (i = 0; i < bio->bi_vcnt; i++) - if (bvecs[i].bv_page) - put_page(bvecs[i].bv_page); - kfree(bvecs); - pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); -} - -static void make_request(struct mddev *mddev, struct bio * bio) -{ - struct r1conf *conf = mddev->private; - struct mirror_info *mirror; - struct r1bio *r1_bio; - struct bio *read_bio; - int i, disks; - struct bitmap *bitmap; - unsigned long flags; - const int rw = bio_data_dir(bio); - const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); - struct md_rdev *blocked_rdev; - int plugged; - int first_clone; - int sectors_handled; - int max_sectors; - - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - - md_write_start(mddev, bio); /* wait on superblock update early */ - - if (bio_data_dir(bio) == WRITE && - bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && - bio->bi_sector < mddev->suspend_hi) { - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - DEFINE_WAIT(w); - for (;;) { - flush_signals(current); - prepare_to_wait(&conf->wait_barrier, - &w, TASK_INTERRUPTIBLE); - if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || - bio->bi_sector >= mddev->suspend_hi) - break; - schedule(); - } - finish_wait(&conf->wait_barrier, &w); - } - - wait_barrier(conf); - - bitmap = mddev->bitmap; - - /* - * make_request() can abort the operation when READA is being - * used and no empty request is available. 
- * - */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio->bi_size >> 9; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector; - - /* We might need to issue multiple reads to different - * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. - * If this is 0, there is only one r1_bio and no locking - * will be needed when requests complete. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - - if (rw == READ) { - /* - * read balancing logic: - */ - int rdisk; - -read_again: - rdisk = read_balance(conf, r1_bio, &max_sectors); - - if (rdisk < 0) { - /* couldn't find anywhere to read from */ - raid_end_bio_io(r1_bio); - return; - } - mirror = conf->mirrors + rdisk; - - if (test_bit(WriteMostly, &mirror->rdev->flags) && - bitmap) { - /* Reading from a write-mostly device must - * take care not to over-take any writes - * that are 'behind' - */ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - r1_bio->read_disk = rdisk; - - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, - max_sectors); - - r1_bio->bios[rdisk] = read_bio; - - read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; - read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_rw = READ | do_sync; - read_bio->bi_private = r1_bio; - - if (max_sectors < r1_bio->sectors) { - /* could not read all from this device, so we will - * need another r1_bio. - */ - - sectors_handled = (r1_bio->sector + max_sectors - - bio->bi_sector); - r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - /* Cannot call generic_make_request directly - * as that will be queued in __make_request - * and subsequent mempool_alloc might block waiting - * for it. So hand bio over to raid1d. - */ - reschedule_retry(r1_bio); - - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); - return; - } - - /* - * WRITE: - */ - if (conf->pending_count >= max_queued_requests) { - md_wakeup_thread(mddev->thread); - wait_event(conf->wait_barrier, - conf->pending_count < max_queued_requests); - } - /* first select target devices under rcu_lock and - * inc refcount on their rdev. Record them by setting - * bios[x] to bio - * If there are known/acknowledged bad blocks on any device on - * which we have seen a write error, we want to avoid writing those - * blocks. - * This potentially requires several writes to write around - * the bad blocks. Each set of writes gets it's own r1bio - * with a set of bios attached. 
- */ - plugged = mddev_check_plugged(mddev); - - disks = conf->raid_disks * 2; - retry_write: - blocked_rdev = NULL; - rcu_read_lock(); - max_sectors = r1_bio->sectors; - for (i = 0; i < disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) - || test_bit(Unmerged, &rdev->flags)) { - if (i < conf->raid_disks) - set_bit(R1BIO_Degraded, &r1_bio->state); - continue; - } - - atomic_inc(&rdev->nr_pending); - if (test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - int bad_sectors; - int is_bad; - - is_bad = is_badblock(rdev, r1_bio->sector, - max_sectors, - &first_bad, &bad_sectors); - if (is_bad < 0) { - /* mustn't write here until the bad block is - * acknowledged*/ - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } - if (is_bad && first_bad <= r1_bio->sector) { - /* Cannot write here at all */ - bad_sectors -= (r1_bio->sector - first_bad); - if (bad_sectors < max_sectors) - /* mustn't write more than bad_sectors - * to other devices yet - */ - max_sectors = bad_sectors; - rdev_dec_pending(rdev, mddev); - /* We don't set R1BIO_Degraded as that - * only applies if the disk is - * missing, so it might be re-added, - * and we want to know to recover this - * chunk. - * In this case the device is here, - * and the fact that this chunk is not - * in-sync is recorded in the bad - * block log - */ - continue; - } - if (is_bad) { - int good_sectors = first_bad - r1_bio->sector; - if (good_sectors < max_sectors) - max_sectors = good_sectors; - } - } - r1_bio->bios[i] = bio; - } - rcu_read_unlock(); - - if (unlikely(blocked_rdev)) { - /* Wait for this device to become unblocked */ - int j; - - for (j = 0; j < i; j++) - if (r1_bio->bios[j]) - rdev_dec_pending(conf->mirrors[j].rdev, mddev); - r1_bio->state = 0; - allow_barrier(conf); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); - goto retry_write; - } - - if (max_sectors < r1_bio->sectors) { - /* We are splitting this write into multiple parts, so - * we need to prepare for allocating another r1_bio. - */ - r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - } - sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; - - atomic_set(&r1_bio->remaining, 1); - atomic_set(&r1_bio->behind_remaining, 0); - - first_clone = 1; - for (i = 0; i < disks; i++) { - struct bio *mbio; - if (!r1_bio->bios[i]) - continue; - - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); - - if (first_clone) { - /* do behind I/O ? - * Not if there are too many, or cannot - * allocate memory, or a reader on WriteMostly - * is waiting for behind writes to flush */ - if (bitmap && - (atomic_read(&bitmap->behind_writes) - < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) - alloc_behind_pages(mbio, r1_bio); - - bitmap_startwrite(bitmap, r1_bio->sector, - r1_bio->sectors, - test_bit(R1BIO_BehindIO, - &r1_bio->state)); - first_clone = 0; - } - if (r1_bio->behind_bvecs) { - struct bio_vec *bvec; - int j; - - /* Yes, I really want the '__' version so that - * we clear any unused pointer in the io_vec, rather - * than leave them unchanged. 
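The target-selection loop above trims max_sectors whenever a device reports a bad block inside the write range, so that a single r1bio never mixes writable and unwritable sectors; whatever is left over is picked up by a follow-up r1bio, counted via bio->bi_phys_segments. The clipping itself is plain interval arithmetic, sketched here with a hypothetical helper:

#include <stdbool.h>
#include <stdio.h>

/* For a write starting at 'sector', currently limited to 'max_sectors',
 * and a device with a known bad range [first_bad, first_bad + bad_sectors),
 * decide whether the device takes part in this pass and how far the whole
 * r1bio may extend.  Illustrative helper, not a kernel function. */
static long long clip_max_sectors(long long sector, long long max_sectors,
				  long long first_bad, long long bad_sectors,
				  bool *device_writable)
{
	if (first_bad <= sector) {
		/* Write begins inside the bad range: skip this device, and
		 * keep the other devices from writing past the end of the
		 * bad range so the skipped chunk stays one recoverable unit. */
		long long remaining_bad = bad_sectors - (sector - first_bad);
		*device_writable = false;
		return remaining_bad < max_sectors ? remaining_bad : max_sectors;
	}
	/* Bad range begins later: write up to, but not into, it. */
	long long good = first_bad - sector;
	*device_writable = true;
	return good < max_sectors ? good : max_sectors;
}

int main(void)
{
	bool ok;
	long long n;

	n = clip_max_sectors(1000, 64, 1016, 8, &ok);
	printf("max_sectors=%lld participates=%d\n", n, ok);   /* 16, 1 */
	n = clip_max_sectors(1018, 64, 1016, 8, &ok);
	printf("max_sectors=%lld participates=%d\n", n, ok);   /* 6, 0 */
	return 0;
}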
This is important - * because when we come to free the pages, we won't - * know the original bi_idx, so we just free - * them all - */ - __bio_for_each_segment(bvec, mbio, j, 0) - bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; - if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) - atomic_inc(&r1_bio->behind_remaining); - } - - r1_bio->bios[i] = mbio; - - mbio->bi_sector = (r1_bio->sector + - conf->mirrors[i].rdev->data_offset); - mbio->bi_bdev = conf->mirrors[i].rdev->bdev; - mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync; - mbio->bi_private = r1_bio; - - atomic_inc(&r1_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - } - /* Mustn't call r1_bio_write_done before this next test, - * as it could result in the bio being freed. - */ - if (sectors_handled < (bio->bi_size >> 9)) { - r1_bio_write_done(r1_bio); - /* We need another r1_bio. It has already been counted - * in bio->bi_phys_segments - */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - r1_bio->master_bio = bio; - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; - goto retry_write; - } - - r1_bio_write_done(r1_bio); - - /* In case raid1d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); - - if (do_sync || !bitmap || !plugged) - md_wakeup_thread(mddev->thread); -} - -static void status(struct seq_file *seq, struct mddev *mddev) -{ - struct r1conf *conf = mddev->private; - int i; - - seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->raid_disks - mddev->degraded); - rcu_read_lock(); - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - seq_printf(seq, "%s", - rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); - } - rcu_read_unlock(); - seq_printf(seq, "]"); -} - - -static void error(struct mddev *mddev, struct md_rdev *rdev) -{ - char b[BDEVNAME_SIZE]; - struct r1conf *conf = mddev->private; - - /* - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks, ignore the error, let the - * next level up know. - * else mark the drive as failed - */ - if (test_bit(In_sync, &rdev->flags) - && (conf->raid_disks - mddev->degraded) == 1) { - /* - * Don't fail the drive, act as though we were just a - * normal single drive. - * However don't try a recovery from this drive as - * it is very likely to fail. - */ - conf->recovery_disabled = mddev->recovery_disabled; - return; - } - set_bit(Blocked, &rdev->flags); - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - set_bit(Faulty, &rdev->flags); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* - * if recovery is running, make sure it aborts. 
- */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - } else - set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT - "md/raid1:%s: Disk failure on %s, disabling device.\n" - "md/raid1:%s: Operation continuing on %d devices.\n", - mdname(mddev), bdevname(rdev->bdev, b), - mdname(mddev), conf->raid_disks - mddev->degraded); -} - -static void print_conf(struct r1conf *conf) -{ - int i; - - printk(KERN_DEBUG "RAID1 conf printout:\n"); - if (!conf) { - printk(KERN_DEBUG "(!conf)\n"); - return; - } - printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); - - rcu_read_lock(); - for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev) - printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &rdev->flags), - !test_bit(Faulty, &rdev->flags), - bdevname(rdev->bdev,b)); - } - rcu_read_unlock(); -} - -static void close_sync(struct r1conf *conf) -{ - wait_barrier(conf); - allow_barrier(conf); - - mempool_destroy(conf->r1buf_pool); - conf->r1buf_pool = NULL; -} - -static int raid1_spare_active(struct mddev *mddev) -{ - int i; - struct r1conf *conf = mddev->private; - int count = 0; - unsigned long flags; - - /* - * Find all failed disks within the RAID1 configuration - * and mark them readable. - * Called under mddev lock, so rcu protection not needed. - */ - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = conf->mirrors[i].rdev; - struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; - if (repl - && repl->recovery_offset == MaxSector - && !test_bit(Faulty, &repl->flags) - && !test_and_set_bit(In_sync, &repl->flags)) { - /* replacement has just become active */ - if (!rdev || - !test_and_clear_bit(In_sync, &rdev->flags)) - count++; - if (rdev) { - /* Replaced device not technically - * faulty, but we need to be sure - * it gets removed and never re-added - */ - set_bit(Faulty, &rdev->flags); - sysfs_notify_dirent_safe( - rdev->sysfs_state); - } - } - if (rdev - && !test_bit(Faulty, &rdev->flags) - && !test_and_set_bit(In_sync, &rdev->flags)) { - count++; - sysfs_notify_dirent_safe(rdev->sysfs_state); - } - } - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded -= count; - spin_unlock_irqrestore(&conf->device_lock, flags); - - print_conf(conf); - return count; -} - - -static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct r1conf *conf = mddev->private; - int err = -EEXIST; - int mirror = 0; - struct mirror_info *p; - int first = 0; - int last = conf->raid_disks - 1; - struct request_queue *q = bdev_get_queue(rdev->bdev); - - if (mddev->recovery_disabled == conf->recovery_disabled) - return -EBUSY; - - if (rdev->raid_disk >= 0) - first = last = rdev->raid_disk; - - if (q->merge_bvec_fn) { - set_bit(Unmerged, &rdev->flags); - mddev->merge_check_needed = 1; - } - - for (mirror = first; mirror <= last; mirror++) { - p = conf->mirrors+mirror; - if (!p->rdev) { - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - p->head_position = 0; - rdev->raid_disk = mirror; - err = 0; - /* As all devices are equivalent, we don't need a full recovery - * if this was recently any drive of the array - */ - if (rdev->saved_raid_disk < 0) - conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); - break; - } - if (test_bit(WantReplacement, &p->rdev->flags) && - p[conf->raid_disks].rdev == NULL) { - /* Add this device as a replacement */ - 
clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - conf->fullsync = 1; - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); - break; - } - } - if (err == 0 && test_bit(Unmerged, &rdev->flags)) { - /* Some requests might not have seen this new - * merge_bvec_fn. We must wait for them to complete - * before merging the device fully. - * First we make sure any code which has tested - * our function has submitted the request, then - * we wait for all outstanding requests to complete. - */ - synchronize_sched(); - raise_barrier(conf); - lower_barrier(conf); - clear_bit(Unmerged, &rdev->flags); - } - md_integrity_add_rdev(rdev, mddev); - print_conf(conf); - return err; -} - -static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct r1conf *conf = mddev->private; - int err = 0; - int number = rdev->raid_disk; - struct mirror_info *p = conf->mirrors+ number; - - if (rdev != p->rdev) - p = conf->mirrors + conf->raid_disks + number; - - print_conf(conf); - if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - err = -EBUSY; - goto abort; - } - /* Only remove non-faulty devices if recovery - * is not possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && - mddev->degraded < conf->raid_disks) { - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } else if (conf->mirrors[conf->raid_disks + number].rdev) { - /* We just removed a device that is being replaced. - * Move down the replacement. We drain all IO before - * doing this to avoid confusion. - */ - struct md_rdev *repl = - conf->mirrors[conf->raid_disks + number].rdev; - raise_barrier(conf); - clear_bit(Replacement, &repl->flags); - p->rdev = repl; - conf->mirrors[conf->raid_disks + number].rdev = NULL; - lower_barrier(conf); - clear_bit(WantReplacement, &rdev->flags); - } else - clear_bit(WantReplacement, &rdev->flags); - err = md_integrity_register(mddev); - } -abort: - - print_conf(conf); - return err; -} - - -static void end_sync_read(struct bio *bio, int error) -{ - struct r1bio *r1_bio = bio->bi_private; - - update_head_pos(r1_bio->read_disk, r1_bio); - - /* - * we have read a block, now it needs to be re-written, - * or re-read if the read failed. - * We don't do much here, just schedule handling by raid1d - */ - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - set_bit(R1BIO_Uptodate, &r1_bio->state); - - if (atomic_dec_and_test(&r1_bio->remaining)) - reschedule_retry(r1_bio); -} - -static void end_sync_write(struct bio *bio, int error) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct r1bio *r1_bio = bio->bi_private; - struct mddev *mddev = r1_bio->mddev; - struct r1conf *conf = mddev->private; - int mirror=0; - sector_t first_bad; - int bad_sectors; - - mirror = find_bio_disk(r1_bio, bio); - - if (!uptodate) { - sector_t sync_blocks = 0; - sector_t s = r1_bio->sector; - long sectors_to_go = r1_bio->sectors; - /* make sure these bits doesn't get cleared. 
*/ - do { - bitmap_end_sync(mddev->bitmap, s, - &sync_blocks, 1); - s += sync_blocks; - sectors_to_go -= sync_blocks; - } while (sectors_to_go > 0); - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - if (!test_and_set_bit(WantReplacement, - &conf->mirrors[mirror].rdev->flags)) - set_bit(MD_RECOVERY_NEEDED, & - mddev->recovery); - set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, - r1_bio->sectors, - &first_bad, &bad_sectors) && - !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, - r1_bio->sector, - r1_bio->sectors, - &first_bad, &bad_sectors) - ) - set_bit(R1BIO_MadeGood, &r1_bio->state); - - if (atomic_dec_and_test(&r1_bio->remaining)) { - int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - put_buf(r1_bio); - md_done_sync(mddev, s, uptodate); - } - } -} - -static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, - int sectors, struct page *page, int rw) -{ - if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) - /* success */ - return 1; - if (rw == WRITE) { - set_bit(WriteErrorSeen, &rdev->flags); - if (!test_and_set_bit(WantReplacement, - &rdev->flags)) - set_bit(MD_RECOVERY_NEEDED, & - rdev->mddev->recovery); - } - /* need to record an error - either for the block or the device */ - if (!rdev_set_badblocks(rdev, sector, sectors, 0)) - md_error(rdev->mddev, rdev); - return 0; -} - -static int fix_sync_read_error(struct r1bio *r1_bio) -{ - /* Try some synchronous reads of other devices to get - * good data, much like with normal read errors. Only - * read into the pages we already have so we don't - * need to re-issue the read request. - * We don't need to freeze the array, because being in an - * active sync request, there is no normal IO, and - * no overlapping syncs. - * We don't need to check is_badblock() again as we - * made sure that anything with a bad block in range - * will have bi_end_io clear. - */ - struct mddev *mddev = r1_bio->mddev; - struct r1conf *conf = mddev->private; - struct bio *bio = r1_bio->bios[r1_bio->read_disk]; - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - int idx = 0; - - while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - struct md_rdev *rdev; - int start; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - do { - if (r1_bio->bios[d]->bi_end_io == end_sync_read) { - /* No rcu protection needed here devices - * can only be removed when no resync is - * active, and resync is currently active - */ - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, sect, s<<9, - bio->bi_io_vec[idx].bv_page, - READ, false)) { - success = 1; - break; - } - } - d++; - if (d == conf->raid_disks * 2) - d = 0; - } while (!success && d != r1_bio->read_disk); - - if (!success) { - char b[BDEVNAME_SIZE]; - int abort = 0; - /* Cannot read from anywhere, this block is lost. - * Record a bad block on each device. If that doesn't - * work just disable and interrupt the recovery. - * Don't fail devices as that won't really help. 
- */ - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" - " for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), - (unsigned long long)r1_bio->sector); - for (d = 0; d < conf->raid_disks * 2; d++) { - rdev = conf->mirrors[d].rdev; - if (!rdev || test_bit(Faulty, &rdev->flags)) - continue; - if (!rdev_set_badblocks(rdev, sect, s, 0)) - abort = 1; - } - if (abort) { - conf->recovery_disabled = - mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); - return 0; - } - /* Try next page */ - sectors -= s; - sect += s; - idx++; - continue; - } - - start = d; - /* write it back and re-read */ - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks * 2; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - if (r1_sync_page_io(rdev, sect, s, - bio->bi_io_vec[idx].bv_page, - WRITE) == 0) { - r1_bio->bios[d]->bi_end_io = NULL; - rdev_dec_pending(rdev, mddev); - } - } - d = start; - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks * 2; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - if (r1_sync_page_io(rdev, sect, s, - bio->bi_io_vec[idx].bv_page, - READ) != 0) - atomic_add(s, &rdev->corrected_errors); - } - sectors -= s; - sect += s; - idx ++; - } - set_bit(R1BIO_Uptodate, &r1_bio->state); - set_bit(BIO_UPTODATE, &bio->bi_flags); - return 1; -} - -static int process_checks(struct r1bio *r1_bio) -{ - /* We have read all readable devices. If we haven't - * got the block, then there is no hope left. - * If we have, then we want to do a comparison - * and skip the write if everything is the same. - * If any blocks failed to read, then we need to - * attempt an over-write - */ - struct mddev *mddev = r1_bio->mddev; - struct r1conf *conf = mddev->private; - int primary; - int i; - int vcnt; - - for (primary = 0; primary < conf->raid_disks * 2; primary++) - if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { - r1_bio->bios[primary]->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[primary].rdev, mddev); - break; - } - r1_bio->read_disk = primary; - vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); - for (i = 0; i < conf->raid_disks * 2; i++) { - int j; - struct bio *pbio = r1_bio->bios[primary]; - struct bio *sbio = r1_bio->bios[i]; - int size; - - if (r1_bio->bios[i]->bi_end_io != end_sync_read) - continue; - - if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { - for (j = vcnt; j-- ; ) { - struct page *p, *s; - p = pbio->bi_io_vec[j].bv_page; - s = sbio->bi_io_vec[j].bv_page; - if (memcmp(page_address(p), - page_address(s), - sbio->bi_io_vec[j].bv_len)) - break; - } - } else - j = 0; - if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; - if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { - /* No need to write to this device. 
*/ - sbio->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[i].rdev, mddev); - continue; - } - /* fixup the bio for reuse */ - sbio->bi_vcnt = vcnt; - sbio->bi_size = r1_bio->sectors << 9; - sbio->bi_idx = 0; - sbio->bi_phys_segments = 0; - sbio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bi_next = NULL; - sbio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - size = sbio->bi_size; - for (j = 0; j < vcnt ; j++) { - struct bio_vec *bi; - bi = &sbio->bi_io_vec[j]; - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - memcpy(page_address(bi->bv_page), - page_address(pbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - } - } - return 0; -} - -static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) -{ - struct r1conf *conf = mddev->private; - int i; - int disks = conf->raid_disks * 2; - struct bio *bio, *wbio; - - bio = r1_bio->bios[r1_bio->read_disk]; - - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - /* ouch - failed to read all of that. */ - if (!fix_sync_read_error(r1_bio)) - return; - - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - if (process_checks(r1_bio) < 0) - return; - /* - * schedule writes - */ - atomic_set(&r1_bio->remaining, 1); - for (i = 0; i < disks ; i++) { - wbio = r1_bio->bios[i]; - if (wbio->bi_end_io == NULL || - (wbio->bi_end_io == end_sync_read && - (i == r1_bio->read_disk || - !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) - continue; - - wbio->bi_rw = WRITE; - wbio->bi_end_io = end_sync_write; - atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); - - generic_make_request(wbio); - } - - if (atomic_dec_and_test(&r1_bio->remaining)) { - /* if we're here, all write(s) have completed, so clean up */ - md_done_sync(mddev, r1_bio->sectors, 1); - put_buf(r1_bio); - } -} - -/* - * This is a kernel thread which: - * - * 1. Retries failed read operations on working mirrors. - * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array synchronising. - */ - -static void fix_read_error(struct r1conf *conf, int read_disk, - sector_t sect, int sectors) -{ - struct mddev *mddev = conf->mddev; - while(sectors) { - int s = sectors; - int d = read_disk; - int success = 0; - int start; - struct md_rdev *rdev; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - do { - /* Note: no rcu protection needed here - * as this is synchronous in the raid1d thread - * which is the thread that might remove - * a device. If raid1d ever becomes multi-threaded.... 
- */ - sector_t first_bad; - int bad_sectors; - - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags) && - is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0 && - sync_page_io(rdev, sect, s<<9, - conf->tmppage, READ, false)) - success = 1; - else { - d++; - if (d == conf->raid_disks * 2) - d = 0; - } - } while (!success && d != read_disk); - - if (!success) { - /* Cannot read from anywhere - mark it bad */ - struct md_rdev *rdev = conf->mirrors[read_disk].rdev; - if (!rdev_set_badblocks(rdev, sect, s, 0)) - md_error(mddev, rdev); - break; - } - /* write it back and re-read */ - start = d; - while (d != read_disk) { - if (d==0) - d = conf->raid_disks * 2; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) - r1_sync_page_io(rdev, sect, s, - conf->tmppage, WRITE); - } - d = start; - while (d != read_disk) { - char b[BDEVNAME_SIZE]; - if (d==0) - d = conf->raid_disks * 2; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (r1_sync_page_io(rdev, sect, s, - conf->tmppage, READ)) { - atomic_add(s, &rdev->corrected_errors); - printk(KERN_INFO - "md/raid1:%s: read error corrected " - "(%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)(sect + - rdev->data_offset), - bdevname(rdev->bdev, b)); - } - } - } - sectors -= s; - sect += s; - } -} - -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - -static int narrow_write_error(struct r1bio *r1_bio, int i) -{ - struct mddev *mddev = r1_bio->mddev; - struct r1conf *conf = mddev->private; - struct md_rdev *rdev = conf->mirrors[i].rdev; - int vcnt, idx; - struct bio_vec *vec; - - /* bio has the data to be written to device 'i' where - * we just recently had a write error. - * We repeatedly clone the bio and trim down to one block, - * then try the write. Where the write fails we record - * a bad block. - * It is conceivable that the bio doesn't exactly align with - * blocks. We must handle this somehow. - * - * We currently own a reference on the rdev. 
- */ - - int block_sectors; - sector_t sector; - int sectors; - int sect_to_write = r1_bio->sectors; - int ok = 1; - - if (rdev->badblocks.shift < 0) - return 0; - - block_sectors = 1 << rdev->badblocks.shift; - sector = r1_bio->sector; - sectors = ((sector + block_sectors) - & ~(sector_t)(block_sectors - 1)) - - sector; - - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - vcnt = r1_bio->behind_page_count; - vec = r1_bio->behind_bvecs; - idx = 0; - while (vec[idx].bv_page == NULL) - idx++; - } else { - vcnt = r1_bio->master_bio->bi_vcnt; - vec = r1_bio->master_bio->bi_io_vec; - idx = r1_bio->master_bio->bi_idx; - } - while (sect_to_write) { - struct bio *wbio; - if (sectors > sect_to_write) - sectors = sect_to_write; - /* Write at 'sector' for 'sectors'*/ - - wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); - memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); - wbio->bi_sector = r1_bio->sector; - wbio->bi_rw = WRITE; - wbio->bi_vcnt = vcnt; - wbio->bi_size = r1_bio->sectors << 9; - wbio->bi_idx = idx; - - md_trim_bio(wbio, sector - r1_bio->sector, sectors); - wbio->bi_sector += rdev->data_offset; - wbio->bi_bdev = rdev->bdev; - if (submit_bio_wait(WRITE, wbio) == 0) - /* failure! */ - ok = rdev_set_badblocks(rdev, sector, - sectors, 0) - && ok; - - bio_put(wbio); - sect_to_write -= sectors; - sector += sectors; - sectors = block_sectors; - } - return ok; -} - -static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) -{ - int m; - int s = r1_bio->sectors; - for (m = 0; m < conf->raid_disks * 2 ; m++) { - struct md_rdev *rdev = conf->mirrors[m].rdev; - struct bio *bio = r1_bio->bios[m]; - if (bio->bi_end_io == NULL) - continue; - if (test_bit(BIO_UPTODATE, &bio->bi_flags) && - test_bit(R1BIO_MadeGood, &r1_bio->state)) { - rdev_clear_badblocks(rdev, r1_bio->sector, s); - } - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && - test_bit(R1BIO_WriteError, &r1_bio->state)) { - if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) - md_error(conf->mddev, rdev); - } - } - put_buf(r1_bio); - md_done_sync(conf->mddev, s, 1); -} - -static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) -{ - int m; - for (m = 0; m < conf->raid_disks * 2 ; m++) - if (r1_bio->bios[m] == IO_MADE_GOOD) { - struct md_rdev *rdev = conf->mirrors[m].rdev; - rdev_clear_badblocks(rdev, - r1_bio->sector, - r1_bio->sectors); - rdev_dec_pending(rdev, conf->mddev); - } else if (r1_bio->bios[m] != NULL) { - /* This drive got a write error. We need to - * narrow down and record precise write - * errors. - */ - if (!narrow_write_error(r1_bio, m)) { - md_error(conf->mddev, - conf->mirrors[m].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } - rdev_dec_pending(conf->mirrors[m].rdev, - conf->mddev); - } - if (test_bit(R1BIO_WriteError, &r1_bio->state)) - close_write(r1_bio); - raid_end_bio_io(r1_bio); -} - -static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) -{ - int disk; - int max_sectors; - struct mddev *mddev = conf->mddev; - struct bio *bio; - char b[BDEVNAME_SIZE]; - struct md_rdev *rdev; - - clear_bit(R1BIO_ReadError, &r1_bio->state); - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. When we find one, we re-write - * and check it that fixes the read error. 
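narrow_write_error() above retries the failed write one bad-block-sized piece at a time, so each piece that still fails can be recorded as a single bad block; the first piece is trimmed so that every later piece lands on a bad-block boundary. The chunking arithmetic in isolation (plain C, 'shift' standing in for rdev->badblocks.shift):

#include <stdio.h>

static void walk_chunks(long long sector, long long total, int shift)
{
	long long block_sectors = 1LL << shift;
	/* first (possibly short) chunk runs up to the next aligned boundary */
	long long chunk = ((sector + block_sectors) &
			   ~(block_sectors - 1)) - sector;

	while (total > 0) {
		if (chunk > total)
			chunk = total;
		printf("write %lld sectors at %lld\n", chunk, sector);
		sector += chunk;
		total  -= chunk;
		chunk = block_sectors;     /* all later chunks are aligned */
	}
}

int main(void)
{
	/* a 24-sector request at sector 1003 with 8-sector bad blocks is
	 * retried as chunks of 5, 8, 8 and 3 sectors */
	walk_chunks(1003, 24, 3);
	return 0;
}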
- * This is all done synchronously while the array is - * frozen - */ - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, r1_bio->read_disk, - r1_bio->sector, r1_bio->sectors); - unfreeze_array(conf); - } else - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - - bio = r1_bio->bios[r1_bio->read_disk]; - bdevname(bio->bi_bdev, b); -read_more: - disk = read_balance(conf, r1_bio, &max_sectors); - if (disk == -1) { - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), b, (unsigned long long)r1_bio->sector); - raid_end_bio_io(r1_bio); - } else { - const unsigned long do_sync - = r1_bio->master_bio->bi_rw & REQ_SYNC; - if (bio) { - r1_bio->bios[r1_bio->read_disk] = - mddev->ro ? IO_BLOCKED : NULL; - bio_put(bio); - } - r1_bio->read_disk = disk; - bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); - md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); - r1_bio->bios[r1_bio->read_disk] = bio; - rdev = conf->mirrors[disk].rdev; - printk_ratelimited(KERN_ERR - "md/raid1:%s: redirecting sector %llu" - " to other mirror: %s\n", - mdname(mddev), - (unsigned long long)r1_bio->sector, - bdevname(rdev->bdev, b)); - bio->bi_sector = r1_bio->sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ | do_sync; - bio->bi_private = r1_bio; - if (max_sectors < r1_bio->sectors) { - /* Drat - have to split this up more */ - struct bio *mbio = r1_bio->master_bio; - int sectors_handled = (r1_bio->sector + max_sectors - - mbio->bi_sector); - r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (mbio->bi_phys_segments == 0) - mbio->bi_phys_segments = 2; - else - mbio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - generic_make_request(bio); - bio = NULL; - - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = mbio; - r1_bio->sectors = (mbio->bi_size >> 9) - - sectors_handled; - r1_bio->state = 0; - set_bit(R1BIO_ReadError, &r1_bio->state); - r1_bio->mddev = mddev; - r1_bio->sector = mbio->bi_sector + sectors_handled; - - goto read_more; - } else - generic_make_request(bio); - } -} - -static void raid1d(struct mddev *mddev) -{ - struct r1bio *r1_bio; - unsigned long flags; - struct r1conf *conf = mddev->private; - struct list_head *head = &conf->retry_list; - struct blk_plug plug; - - md_check_recovery(mddev); - - blk_start_plug(&plug); - for (;;) { - - if (atomic_read(&mddev->plug_cnt) == 0) - flush_pending_writes(conf); - - spin_lock_irqsave(&conf->device_lock, flags); - if (list_empty(head)) { - spin_unlock_irqrestore(&conf->device_lock, flags); - break; - } - r1_bio = list_entry(head->prev, struct r1bio, retry_list); - list_del(head->prev); - conf->nr_queued--; - spin_unlock_irqrestore(&conf->device_lock, flags); - - mddev = r1_bio->mddev; - conf = mddev->private; - if (test_bit(R1BIO_IsSync, &r1_bio->state)) { - if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - handle_sync_write_finished(conf, r1_bio); - else - sync_request_write(mddev, r1_bio); - } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - handle_write_finished(conf, r1_bio); - else if (test_bit(R1BIO_ReadError, &r1_bio->state)) - handle_read_error(conf, r1_bio); - else - /* just a partial read to be scheduled from separate - * context - */ - generic_make_request(r1_bio->bios[r1_bio->read_disk]); - - cond_resched(); - if (mddev->flags & 
~(1<<MD_CHANGE_PENDING)) - md_check_recovery(mddev); - } - blk_finish_plug(&plug); -} - - -static int init_resync(struct r1conf *conf) -{ - int buffs; - - buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; - BUG_ON(conf->r1buf_pool); - conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, - conf->poolinfo); - if (!conf->r1buf_pool) - return -ENOMEM; - conf->next_resync = 0; - return 0; -} - -/* - * perform a "sync" on one "block" - * - * We need to make sure that no normal I/O request - particularly write - * requests - conflict with active sync requests. - * - * This is achieved by tracking pending requests and a 'barrier' concept - * that can be installed to exclude normal IO requests. - */ - -static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) -{ - struct r1conf *conf = mddev->private; - struct r1bio *r1_bio; - struct bio *bio; - sector_t max_sector, nr_sectors; - int disk = -1; - int i; - int wonly = -1; - int write_targets = 0, read_targets = 0; - sector_t sync_blocks; - int still_degraded = 0; - int good_sectors = RESYNC_SECTORS; - int min_bad = 0; /* number of sectors that are bad in all devices */ - - if (!conf->r1buf_pool) - if (init_resync(conf)) - return 0; - - max_sector = mddev->dev_sectors; - if (sector_nr >= max_sector) { - /* If we aborted, we need to abort the - * sync on the 'current' bitmap chunk (there will - * only be one in raid1 resync. - * We can find the current addess in mddev->curr_resync - */ - if (mddev->curr_resync < max_sector) /* aborted */ - bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); - else /* completed sync */ - conf->fullsync = 0; - - bitmap_close_sync(mddev->bitmap); - close_sync(conf); - return 0; - } - - if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && - conf->fullsync == 0) { - *skipped = 1; - return max_sector - sector_nr; - } - /* before building a request, check if we can skip these blocks.. - * This call the bitmap_start_sync doesn't actually record anything - */ - if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* We can skip this block, and probably several more */ - *skipped = 1; - return sync_blocks; - } - /* - * If there is non-resync activity waiting for a turn, - * and resync is going fast enough, - * then let it though before starting on this new sync request. - */ - if (!go_faster && conf->nr_waiting) - msleep_interruptible(1000); - - bitmap_cond_end_sync(mddev->bitmap, sector_nr); - r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); - raise_barrier(conf); - - conf->next_resync = sector_nr; - - rcu_read_lock(); - /* - * If we get a correctably read error during resync or recovery, - * we might want to read from a different device. So we - * flag all drives that could conceivably be read from for READ, - * and any others (which will be non-In_sync devices) for WRITE. - * If a read fails, we try reading from something else for which READ - * is OK. 
- */ - - r1_bio->mddev = mddev; - r1_bio->sector = sector_nr; - r1_bio->state = 0; - set_bit(R1BIO_IsSync, &r1_bio->state); - - for (i = 0; i < conf->raid_disks * 2; i++) { - struct md_rdev *rdev; - bio = r1_bio->bios[i]; - - /* take from bio_init */ - bio->bi_next = NULL; - bio->bi_flags &= ~(BIO_POOL_MASK-1); - bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_rw = READ; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_size = 0; - bio->bi_end_io = NULL; - bio->bi_private = NULL; - - rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev == NULL || - test_bit(Faulty, &rdev->flags)) { - if (i < conf->raid_disks) - still_degraded = 1; - } else if (!test_bit(In_sync, &rdev->flags)) { - bio->bi_rw = WRITE; - bio->bi_end_io = end_sync_write; - write_targets ++; - } else { - /* may need to read from here */ - sector_t first_bad = MaxSector; - int bad_sectors; - - if (is_badblock(rdev, sector_nr, good_sectors, - &first_bad, &bad_sectors)) { - if (first_bad > sector_nr) - good_sectors = first_bad - sector_nr; - else { - bad_sectors -= (sector_nr - first_bad); - if (min_bad == 0 || - min_bad > bad_sectors) - min_bad = bad_sectors; - } - } - if (sector_nr < first_bad) { - if (test_bit(WriteMostly, &rdev->flags)) { - if (wonly < 0) - wonly = i; - } else { - if (disk < 0) - disk = i; - } - bio->bi_rw = READ; - bio->bi_end_io = end_sync_read; - read_targets++; - } - } - if (bio->bi_end_io) { - atomic_inc(&rdev->nr_pending); - bio->bi_sector = sector_nr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_private = r1_bio; - } - } - rcu_read_unlock(); - if (disk < 0) - disk = wonly; - r1_bio->read_disk = disk; - - if (read_targets == 0 && min_bad > 0) { - /* These sectors are bad on all InSync devices, so we - * need to mark them bad on all write targets - */ - int ok = 1; - for (i = 0 ; i < conf->raid_disks * 2 ; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_write) { - struct md_rdev *rdev = conf->mirrors[i].rdev; - ok = rdev_set_badblocks(rdev, sector_nr, - min_bad, 0 - ) && ok; - } - set_bit(MD_CHANGE_DEVS, &mddev->flags); - *skipped = 1; - put_buf(r1_bio); - - if (!ok) { - /* Cannot record the badblocks, so need to - * abort the resync. - * If there are multiple read targets, could just - * fail the really bad ones ??? 
- */ - conf->recovery_disabled = mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - return 0; - } else - return min_bad; - - } - if (min_bad > 0 && min_bad < good_sectors) { - /* only resync enough to reach the next bad->good - * transition */ - good_sectors = min_bad; - } - - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) - /* extra read targets are also write targets */ - write_targets += read_targets-1; - - if (write_targets == 0 || read_targets == 0) { - /* There is nowhere to write, so all non-sync - * drives must be failed - so we are finished - */ - sector_t rv = max_sector - sector_nr; - *skipped = 1; - put_buf(r1_bio); - return rv; - } - - if (max_sector > mddev->resync_max) - max_sector = mddev->resync_max; /* Don't do IO beyond here */ - if (max_sector > sector_nr + good_sectors) - max_sector = sector_nr + good_sectors; - nr_sectors = 0; - sync_blocks = 0; - do { - struct page *page; - int len = PAGE_SIZE; - if (sector_nr + (len>>9) > max_sector) - len = (max_sector - sector_nr) << 9; - if (len == 0) - break; - if (sync_blocks == 0) { - if (!bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, still_degraded) && - !conf->fullsync && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - break; - BUG_ON(sync_blocks < (PAGE_SIZE>>9)); - if ((len >> 9) > sync_blocks) - len = sync_blocks<<9; - } - - for (i = 0 ; i < conf->raid_disks * 2; i++) { - bio = r1_bio->bios[i]; - if (bio->bi_end_io) { - page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0) == 0) { - /* stop here */ - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - while (i > 0) { - i--; - bio = r1_bio->bios[i]; - if (bio->bi_end_io==NULL) - continue; - /* remove last page from this bio */ - bio->bi_vcnt--; - bio->bi_size -= len; - bio->bi_flags &= ~(1<< BIO_SEG_VALID); - } - goto bio_full; - } - } - } - nr_sectors += len>>9; - sector_nr += len>>9; - sync_blocks -= (len>>9); - } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); - bio_full: - r1_bio->sectors = nr_sectors; - - /* For a user-requested sync, we read all readable devices and do a - * compare - */ - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - atomic_set(&r1_bio->remaining, read_targets); - for (i = 0; i < conf->raid_disks * 2; i++) { - bio = r1_bio->bios[i]; - if (bio->bi_end_io == end_sync_read) { - md_sync_acct(bio->bi_bdev, nr_sectors); - generic_make_request(bio); - } - } - } else { - atomic_set(&r1_bio->remaining, 1); - bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct(bio->bi_bdev, nr_sectors); - generic_make_request(bio); - - } - return nr_sectors; -} - -static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - if (sectors) - return sectors; - - return mddev->dev_sectors; -} - -static struct r1conf *setup_conf(struct mddev *mddev) -{ - struct r1conf *conf; - int i; - struct mirror_info *disk; - struct md_rdev *rdev; - int err = -ENOMEM; - - conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); - if (!conf) - goto abort; - - conf->mirrors = kzalloc(sizeof(struct mirror_info) - * mddev->raid_disks * 2, - GFP_KERNEL); - if (!conf->mirrors) - goto abort; - - conf->tmppage = alloc_page(GFP_KERNEL); - if (!conf->tmppage) - goto abort; - - conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); - if (!conf->poolinfo) - goto abort; - conf->poolinfo->raid_disks = mddev->raid_disks * 2; - conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, - r1bio_pool_free, - conf->poolinfo); - if (!conf->r1bio_pool) - goto abort; - - 
conf->poolinfo->mddev = mddev; - - err = -EINVAL; - spin_lock_init(&conf->device_lock); - rdev_for_each(rdev, mddev) { - struct request_queue *q; - int disk_idx = rdev->raid_disk; - if (disk_idx >= mddev->raid_disks - || disk_idx < 0) - continue; - if (test_bit(Replacement, &rdev->flags)) - disk = conf->mirrors + conf->raid_disks + disk_idx; - else - disk = conf->mirrors + disk_idx; - - if (disk->rdev) - goto abort; - disk->rdev = rdev; - q = bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) - mddev->merge_check_needed = 1; - - disk->head_position = 0; - } - conf->raid_disks = mddev->raid_disks; - conf->mddev = mddev; - INIT_LIST_HEAD(&conf->retry_list); - - spin_lock_init(&conf->resync_lock); - init_waitqueue_head(&conf->wait_barrier); - - bio_list_init(&conf->pending_bio_list); - conf->pending_count = 0; - conf->recovery_disabled = mddev->recovery_disabled - 1; - - err = -EIO; - conf->last_used = -1; - for (i = 0; i < conf->raid_disks * 2; i++) { - - disk = conf->mirrors + i; - - if (i < conf->raid_disks && - disk[conf->raid_disks].rdev) { - /* This slot has a replacement. */ - if (!disk->rdev) { - /* No original, just make the replacement - * a recovering spare - */ - disk->rdev = - disk[conf->raid_disks].rdev; - disk[conf->raid_disks].rdev = NULL; - } else if (!test_bit(In_sync, &disk->rdev->flags)) - /* Original is not in_sync - bad */ - goto abort; - } - - if (!disk->rdev || - !test_bit(In_sync, &disk->rdev->flags)) { - disk->head_position = 0; - if (disk->rdev) - conf->fullsync = 1; - } else if (conf->last_used < 0) - /* - * The first working device is used as a - * starting point to read balancing. - */ - conf->last_used = i; - } - - if (conf->last_used < 0) { - printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", - mdname(mddev)); - goto abort; - } - err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, NULL); - if (!conf->thread) { - printk(KERN_ERR - "md/raid1:%s: couldn't allocate thread\n", - mdname(mddev)); - goto abort; - } - - return conf; - - abort: - if (conf) { - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); - kfree(conf->mirrors); - safe_put_page(conf->tmppage); - kfree(conf->poolinfo); - kfree(conf); - } - return ERR_PTR(err); -} - -static int stop(struct mddev *mddev); -static int run(struct mddev *mddev) -{ - struct r1conf *conf; - int i; - struct md_rdev *rdev; - int ret; - - if (mddev->level != 1) { - printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", - mdname(mddev), mddev->level); - return -EIO; - } - if (mddev->reshape_position != MaxSector) { - printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", - mdname(mddev)); - return -EIO; - } - /* - * copy the already verified devices into our private RAID1 - * bookkeeping area. 
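setup_conf() above lays the mirrors array out as raid_disks original slots followed by raid_disks replacement slots, and promotes a replacement into the original slot when no original device was found. A minimal standalone model of just that promotion pass, with made-up device numbers (the real code also rejects an out-of-sync original that has a replacement):

#include <stdio.h>

#define RAID_DISKS 3

int main(void)
{
    /* index 0..2 = originals, 3..5 = replacements; 0 means "no device" */
    int slots[RAID_DISKS * 2] = { 11, 0, 13, 0, 22, 23 };
    int i;

    for (i = 0; i < RAID_DISKS; i++) {
        if (slots[RAID_DISKS + i] && !slots[i]) {
            /* no original: the replacement becomes a recovering spare */
            slots[i] = slots[RAID_DISKS + i];
            slots[RAID_DISKS + i] = 0;
        }
    }
    for (i = 0; i < RAID_DISKS * 2; i++)
        printf("slot %d: %d\n", i, slots[i]);
    return 0;
}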
[whatever we allocate in run(), - * should be freed in stop()] - */ - if (mddev->private == NULL) - conf = setup_conf(mddev); - else - conf = mddev->private; - - if (IS_ERR(conf)) - return PTR_ERR(conf); - - rdev_for_each(rdev, mddev) { - if (!mddev->gendisk) - continue; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - } - - mddev->degraded = 0; - for (i=0; i < conf->raid_disks; i++) - if (conf->mirrors[i].rdev == NULL || - !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || - test_bit(Faulty, &conf->mirrors[i].rdev->flags)) - mddev->degraded++; - - if (conf->raid_disks - mddev->degraded == 1) - mddev->recovery_cp = MaxSector; - - if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "md/raid1:%s: not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - printk(KERN_INFO - "md/raid1:%s: active with %d out of %d mirrors\n", - mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks); - - /* - * Ok, everything is just fine now - */ - mddev->thread = conf->thread; - conf->thread = NULL; - mddev->private = conf; - - md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); - - if (mddev->queue) { - mddev->queue->backing_dev_info.congested_fn = raid1_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); - } - - ret = md_integrity_register(mddev); - if (ret) - stop(mddev); - return ret; -} - -static int stop(struct mddev *mddev) -{ - struct r1conf *conf = mddev->private; - struct bitmap *bitmap = mddev->bitmap; - - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? */ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - - raise_barrier(conf); - lower_barrier(conf); - - md_unregister_thread(&mddev->thread); - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); - kfree(conf->mirrors); - kfree(conf->poolinfo); - kfree(conf); - mddev->private = NULL; - return 0; -} - -static int raid1_resize(struct mddev *mddev, sector_t sectors) -{ - /* no resync is happening, and there is enough space - * on all devices, so we can resize. - * We need to make sure resync covers any new space. - * If the array is shrinking we should possibly wait until - * any io in the removed space completes, but it hardly seems - * worth it. - */ - md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); - if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) - return -EINVAL; - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - } - mddev->dev_sectors = sectors; - mddev->resync_max_sectors = sectors; - return 0; -} - -static int raid1_reshape(struct mddev *mddev) -{ - /* We need to: - * 1/ resize the r1bio_pool - * 2/ resize conf->mirrors - * - * We allocate a new r1bio_pool if we can. - * Then raise a device barrier and wait until all IO stops. - * Then resize conf->mirrors and swap in the new r1bio pool. - * - * At the same time, we "pack" the devices so that all the missing - * devices have the higher raid_disk numbers. 
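raid1_resize() above only has real work to do when the array grows: if the resync checkpoint already sat at or beyond the old device size, it is pulled back to the old size so the newly exposed space gets resynced. A small standalone sketch of that decision, with illustrative sector counts and ~0 standing in for MaxSector:

#include <stdio.h>

int main(void)
{
    unsigned long long dev_sectors = 1ULL << 20;   /* old per-device size */
    unsigned long long new_sectors = 1ULL << 21;   /* grown per-device size */
    unsigned long long recovery_cp = ~0ULL;        /* "MaxSector": array was fully clean */
    int recovery_needed = 0;

    if (new_sectors > dev_sectors && recovery_cp > dev_sectors) {
        recovery_cp = dev_sectors;   /* resync must cover the new space */
        recovery_needed = 1;
    }
    dev_sectors = new_sectors;

    printf("recovery_cp=%llu, recovery needed=%d, dev_sectors=%llu\n",
           recovery_cp, recovery_needed, dev_sectors);
    return 0;
}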
- */ - mempool_t *newpool, *oldpool; - struct pool_info *newpoolinfo; - struct mirror_info *newmirrors; - struct r1conf *conf = mddev->private; - int cnt, raid_disks; - unsigned long flags; - int d, d2, err; - - /* Cannot change chunk_size, layout, or level */ - if (mddev->chunk_sectors != mddev->new_chunk_sectors || - mddev->layout != mddev->new_layout || - mddev->level != mddev->new_level) { - mddev->new_chunk_sectors = mddev->chunk_sectors; - mddev->new_layout = mddev->layout; - mddev->new_level = mddev->level; - return -EINVAL; - } - - err = md_allow_write(mddev); - if (err) - return err; - - raid_disks = mddev->raid_disks + mddev->delta_disks; - - if (raid_disks < conf->raid_disks) { - cnt=0; - for (d= 0; d < conf->raid_disks; d++) - if (conf->mirrors[d].rdev) - cnt++; - if (cnt > raid_disks) - return -EBUSY; - } - - newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); - if (!newpoolinfo) - return -ENOMEM; - newpoolinfo->mddev = mddev; - newpoolinfo->raid_disks = raid_disks * 2; - - newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, - r1bio_pool_free, newpoolinfo); - if (!newpool) { - kfree(newpoolinfo); - return -ENOMEM; - } - newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, - GFP_KERNEL); - if (!newmirrors) { - kfree(newpoolinfo); - mempool_destroy(newpool); - return -ENOMEM; - } - - raise_barrier(conf); - - /* ok, everything is stopped */ - oldpool = conf->r1bio_pool; - conf->r1bio_pool = newpool; - - for (d = d2 = 0; d < conf->raid_disks; d++) { - struct md_rdev *rdev = conf->mirrors[d].rdev; - if (rdev && rdev->raid_disk != d2) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = d2; - sysfs_unlink_rdev(mddev, rdev); - if (sysfs_link_rdev(mddev, rdev)) - printk(KERN_WARNING - "md/raid1:%s: cannot register rd%d\n", - mdname(mddev), rdev->raid_disk); - } - if (rdev) - newmirrors[d2++].rdev = rdev; - } - kfree(conf->mirrors); - conf->mirrors = newmirrors; - kfree(conf->poolinfo); - conf->poolinfo = newpoolinfo; - - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded += (raid_disks - conf->raid_disks); - spin_unlock_irqrestore(&conf->device_lock, flags); - conf->raid_disks = mddev->raid_disks = raid_disks; - mddev->delta_disks = 0; - - conf->last_used = 0; /* just make sure it is in-range */ - lower_barrier(conf); - - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - - mempool_destroy(oldpool); - return 0; -} - -static void raid1_quiesce(struct mddev *mddev, int state) -{ - struct r1conf *conf = mddev->private; - - switch(state) { - case 2: /* wake for suspend */ - wake_up(&conf->wait_barrier); - break; - case 1: - raise_barrier(conf); - break; - case 0: - lower_barrier(conf); - break; - } -} - -static void *raid1_takeover(struct mddev *mddev) -{ - /* raid1 can take over: - * raid5 with 2 devices, any layout or chunk size - */ - if (mddev->level == 5 && mddev->raid_disks == 2) { - struct r1conf *conf; - mddev->new_level = 1; - mddev->new_layout = 0; - mddev->new_chunk_sectors = 0; - conf = setup_conf(mddev); - if (!IS_ERR(conf)) - conf->barrier = 1; - return conf; - } - return ERR_PTR(-EINVAL); -} - -static struct md_personality raid1_personality = -{ - .name = "raid1", - .level = 1, - .owner = THIS_MODULE, - .make_request = make_request, - .run = run, - .stop = stop, - .status = status, - .error_handler = error, - .hot_add_disk = raid1_add_disk, - .hot_remove_disk= raid1_remove_disk, - .spare_active = raid1_spare_active, - .sync_request = sync_request, - .resize = raid1_resize, - .size = raid1_size, - 
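The d/d2 loop in raid1_reshape() above packs the surviving devices into the lowest slots of the new mirrors array, so every hole ends up at the high raid_disk numbers. A standalone model of that packing step with placeholder device names:

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *old[5] = { "sda", NULL, "sdc", NULL, "sde" };
    const char *new[5];
    int d, d2 = 0;

    memset(new, 0, sizeof(new));
    for (d = 0; d < 5; d++)
        if (old[d])
            new[d2++] = old[d];   /* present device keeps working, but moves to slot d2 */

    for (d = 0; d < 5; d++)
        printf("new slot %d: %s\n", d, new[d] ? new[d] : "(empty)");
    return 0;
}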
.check_reshape = raid1_reshape, - .quiesce = raid1_quiesce, - .takeover = raid1_takeover, -}; - -static int __init raid_init(void) -{ - return register_md_personality(&raid1_personality); -} - -static void raid_exit(void) -{ - unregister_md_personality(&raid1_personality); -} - -module_init(raid_init); -module_exit(raid_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); -MODULE_ALIAS("md-personality-3"); /* RAID1 */ -MODULE_ALIAS("md-raid1"); -MODULE_ALIAS("md-level-1"); - -module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/ANDROID_3.4.5/drivers/md/raid1.h b/ANDROID_3.4.5/drivers/md/raid1.h deleted file mode 100644 index 80ded139..00000000 --- a/ANDROID_3.4.5/drivers/md/raid1.h +++ /dev/null @@ -1,177 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -struct mirror_info { - struct md_rdev *rdev; - sector_t head_position; -}; - -/* - * memory pools need a pointer to the mddev, so they can force an unplug - * when memory is tight, and a count of the number of drives that the - * pool was allocated for, so they know how much to allocate and free. - * mddev->raid_disks cannot be used, as it can change while a pool is active - * These two datums are stored in a kmalloced struct. - * The 'raid_disks' here is twice the raid_disks in r1conf. - * This allows space for each 'real' device can have a replacement in the - * second half of the array. - */ - -struct pool_info { - struct mddev *mddev; - int raid_disks; -}; - -struct r1conf { - struct mddev *mddev; - struct mirror_info *mirrors; /* twice 'raid_disks' to - * allow for replacements. - */ - int raid_disks; - - /* When choose the best device for a read (read_balance()) - * we try to keep sequential reads one the same device - * using 'last_used' and 'next_seq_sect' - */ - int last_used; - sector_t next_seq_sect; - /* During resync, read_balancing is only allowed on the part - * of the array that has been resynced. 'next_resync' tells us - * where that is. - */ - sector_t next_resync; - - spinlock_t device_lock; - - /* list of 'struct r1bio' that need to be processed by raid1d, - * whether to retry a read, writeout a resync or recovery - * block, or anything else. - */ - struct list_head retry_list; - - /* queue pending writes to be submitted on unplug */ - struct bio_list pending_bio_list; - int pending_count; - - /* for use when syncing mirrors: - * We don't allow both normal IO and resync/recovery IO at - * the same time - resync/recovery can only happen when there - * is no other IO. So when either is active, the other has to wait. - * See more details description in raid1.c near raise_barrier(). - */ - wait_queue_head_t wait_barrier; - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - - /* Set to 1 if a full sync is needed, (fresh device added). - * Cleared when a sync completes. - */ - int fullsync; - - /* When the same as mddev->recovery_disabled we don't allow - * recovery to be attempted as we expect a read error. - */ - int recovery_disabled; - - - /* poolinfo contains information about the content of the - * mempools - it changes when the array grows or shrinks - */ - struct pool_info *poolinfo; - mempool_t *r1bio_pool; - mempool_t *r1buf_pool; - - /* temporary buffer to synchronous IO when attempting to repair - * a read error. - */ - struct page *tmppage; - - - /* When taking over an array from a different personality, we store - * the new thread here until we fully activate the array. 
- */ - struct md_thread *thread; -}; - -/* - * this is our 'private' RAID1 bio. - * - * it contains information about what kind of IO operations were started - * for this RAID1 operation, and about their status: - */ - -struct r1bio { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - atomic_t behind_remaining; /* number of write-behind ios remaining - * in this BehindIO request - */ - sector_t sector; - int sectors; - unsigned long state; - struct mddev *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_disk; - - struct list_head retry_list; - /* Next two are only valid when R1BIO_BehindIO is set */ - struct bio_vec *behind_bvecs; - int behind_page_count; - /* - * if the IO is in WRITE direction, then multiple bios are used. - * We choose the number when they are allocated. - */ - struct bio *bios[0]; - /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting bios[n] to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - -/* bits for r1bio.state */ -#define R1BIO_Uptodate 0 -#define R1BIO_IsSync 1 -#define R1BIO_Degraded 2 -#define R1BIO_BehindIO 3 -/* Set ReadError on bios that experience a readerror so that - * raid1d knows what to do with them. - */ -#define R1BIO_ReadError 4 -/* For write-behind requests, we call bi_end_io when - * the last non-write-behind device completes, providing - * any write was successful. Otherwise we call when - * any write-behind write succeeds, otherwise we call - * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... - */ -#define R1BIO_Returned 6 -/* If a write for this request means we can clear some - * known-bad-block records, we set this flag - */ -#define R1BIO_MadeGood 7 -#define R1BIO_WriteError 8 - -extern int md_raid1_congested(struct mddev *mddev, int bits); - -#endif diff --git a/ANDROID_3.4.5/drivers/md/raid10.c b/ANDROID_3.4.5/drivers/md/raid10.c deleted file mode 100644 index a954c95d..00000000 --- a/ANDROID_3.4.5/drivers/md/raid10.c +++ /dev/null @@ -1,3584 +0,0 @@ -/* - * raid10.c : Multiple Devices driver for Linux - * - * Copyright (C) 2000-2004 Neil Brown - * - * RAID-10 support for md. - * - * Base on code in raid1.c. See raid1.c for further copyright information. - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
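IO_BLOCKED and IO_MADE_GOOD above reuse the bios[] pointer slots themselves to carry per-device state: the integers 1 and 2 are stored where a real struct bio pointer would live, and BIO_SPECIAL() tells the markers apart from real pointers, which never have such small addresses. A standalone sketch of the same sentinel-pointer technique (assumes a pointer fits in an unsigned long, as the kernel does here):

#include <stdio.h>

struct bio { int dummy; };

#define IO_BLOCKED   ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

int main(void)
{
    struct bio real;
    struct bio *slots[3] = { &real, IO_BLOCKED, IO_MADE_GOOD };
    int i;

    for (i = 0; i < 3; i++) {
        if (BIO_SPECIAL(slots[i]))
            printf("slot %d: marker %lu\n", i, (unsigned long)slots[i]);
        else
            printf("slot %d: real bio\n", i);
    }
    return 0;
}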
- */ - -#include <linux/slab.h> -#include <linux/delay.h> -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/seq_file.h> -#include <linux/ratelimit.h> -#include "md.h" -#include "raid10.h" -#include "raid0.h" -#include "bitmap.h" - -/* - * RAID10 provides a combination of RAID0 and RAID1 functionality. - * The layout of data is defined by - * chunk_size - * raid_disks - * near_copies (stored in low byte of layout) - * far_copies (stored in second byte of layout) - * far_offset (stored in bit 16 of layout ) - * - * The data to be stored is divided into chunks using chunksize. - * Each device is divided into far_copies sections. - * In each section, chunks are laid out in a style similar to raid0, but - * near_copies copies of each chunk is stored (each on a different drive). - * The starting device for each section is offset near_copies from the starting - * device of the previous section. - * Thus they are (near_copies*far_copies) of each chunk, and each is on a different - * drive. - * near_copies and far_copies must be at least one, and their product is at most - * raid_disks. - * - * If far_offset is true, then the far_copies are handled a bit differently. - * The copies are still in different stripes, but instead of be very far apart - * on disk, there are adjacent stripes. - */ - -/* - * Number of guaranteed r10bios in case of extreme VM load: - */ -#define NR_RAID10_BIOS 256 - -/* When there are this many requests queue to be written by - * the raid10 thread, we become 'congested' to provide back-pressure - * for writeback. - */ -static int max_queued_requests = 1024; - -static void allow_barrier(struct r10conf *conf); -static void lower_barrier(struct r10conf *conf); -static int enough(struct r10conf *conf, int ignore); - -static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) -{ - struct r10conf *conf = data; - int size = offsetof(struct r10bio, devs[conf->copies]); - - /* allocate a r10bio with room for raid_disks entries in the - * bios array */ - return kzalloc(size, gfp_flags); -} - -static void r10bio_pool_free(void *r10_bio, void *data) -{ - kfree(r10_bio); -} - -/* Maximum size of each resync request */ -#define RESYNC_BLOCK_SIZE (64*1024) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -/* amount of memory to reserve for resync requests */ -#define RESYNC_WINDOW (1024*1024) -/* maximum number of concurrent requests, memory permitting */ -#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) - -/* - * When performing a resync, we need to read and compare, so - * we need as many pages are there are copies. - * When performing a recovery, we need 2 bios, one for read, - * one for write (we recover only one drive per r10buf) - * - */ -static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) -{ - struct r10conf *conf = data; - struct page *page; - struct r10bio *r10_bio; - struct bio *bio; - int i, j; - int nalloc; - - r10_bio = r10bio_pool_alloc(gfp_flags, conf); - if (!r10_bio) - return NULL; - - if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) - nalloc = conf->copies; /* resync */ - else - nalloc = 2; /* recovery */ - - /* - * Allocate bios. - */ - for (j = nalloc ; j-- ; ) { - bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); - if (!bio) - goto out_free_bio; - r10_bio->devs[j].bio = bio; - if (!conf->have_replacement) - continue; - bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); - if (!bio) - goto out_free_bio; - r10_bio->devs[j].repl_bio = bio; - } - /* - * Allocate RESYNC_PAGES data pages and attach them - * where needed. 
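r10bio_pool_alloc() above sizes each allocation as offsetof(struct r10bio, devs[conf->copies]), so the trailing devs[] array gets exactly one slot per copy. A standalone userspace sketch of the same flexible-array-member sizing, written with an equivalent offsetof-plus-multiply expression and invented struct names:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct dev_slot { int devnum; long addr; };

struct r10bio_like {
    long sector;
    int  sectors;
    struct dev_slot devs[];   /* flexible array member, sized at allocation time */
};

int main(void)
{
    int copies = 3;   /* hypothetical copies count */
    size_t size = offsetof(struct r10bio_like, devs)
                  + copies * sizeof(struct dev_slot);
    struct r10bio_like *r = calloc(1, size);

    if (!r)
        return 1;
    printf("allocated %zu bytes for %d copies\n", size, copies);
    r->devs[copies - 1].devnum = 42;   /* the last slot is within the allocation */
    free(r);
    return 0;
}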
- */ - for (j = 0 ; j < nalloc; j++) { - struct bio *rbio = r10_bio->devs[j].repl_bio; - bio = r10_bio->devs[j].bio; - for (i = 0; i < RESYNC_PAGES; i++) { - if (j == 1 && !test_bit(MD_RECOVERY_SYNC, - &conf->mddev->recovery)) { - /* we can share bv_page's during recovery */ - struct bio *rbio = r10_bio->devs[0].bio; - page = rbio->bi_io_vec[i].bv_page; - get_page(page); - } else - page = alloc_page(gfp_flags); - if (unlikely(!page)) - goto out_free_pages; - - bio->bi_io_vec[i].bv_page = page; - if (rbio) - rbio->bi_io_vec[i].bv_page = page; - } - } - - return r10_bio; - -out_free_pages: - for ( ; i > 0 ; i--) - safe_put_page(bio->bi_io_vec[i-1].bv_page); - while (j--) - for (i = 0; i < RESYNC_PAGES ; i++) - safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); - j = -1; -out_free_bio: - while (++j < nalloc) { - bio_put(r10_bio->devs[j].bio); - if (r10_bio->devs[j].repl_bio) - bio_put(r10_bio->devs[j].repl_bio); - } - r10bio_pool_free(r10_bio, conf); - return NULL; -} - -static void r10buf_pool_free(void *__r10_bio, void *data) -{ - int i; - struct r10conf *conf = data; - struct r10bio *r10bio = __r10_bio; - int j; - - for (j=0; j < conf->copies; j++) { - struct bio *bio = r10bio->devs[j].bio; - if (bio) { - for (i = 0; i < RESYNC_PAGES; i++) { - safe_put_page(bio->bi_io_vec[i].bv_page); - bio->bi_io_vec[i].bv_page = NULL; - } - bio_put(bio); - } - bio = r10bio->devs[j].repl_bio; - if (bio) - bio_put(bio); - } - r10bio_pool_free(r10bio, conf); -} - -static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) -{ - int i; - - for (i = 0; i < conf->copies; i++) { - struct bio **bio = & r10_bio->devs[i].bio; - if (!BIO_SPECIAL(*bio)) - bio_put(*bio); - *bio = NULL; - bio = &r10_bio->devs[i].repl_bio; - if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) - bio_put(*bio); - *bio = NULL; - } -} - -static void free_r10bio(struct r10bio *r10_bio) -{ - struct r10conf *conf = r10_bio->mddev->private; - - put_all_bios(conf, r10_bio); - mempool_free(r10_bio, conf->r10bio_pool); -} - -static void put_buf(struct r10bio *r10_bio) -{ - struct r10conf *conf = r10_bio->mddev->private; - - mempool_free(r10_bio, conf->r10buf_pool); - - lower_barrier(conf); -} - -static void reschedule_retry(struct r10bio *r10_bio) -{ - unsigned long flags; - struct mddev *mddev = r10_bio->mddev; - struct r10conf *conf = mddev->private; - - spin_lock_irqsave(&conf->device_lock, flags); - list_add(&r10_bio->retry_list, &conf->retry_list); - conf->nr_queued ++; - spin_unlock_irqrestore(&conf->device_lock, flags); - - /* wake up frozen array... */ - wake_up(&conf->wait_barrier); - - md_wakeup_thread(mddev->thread); -} - -/* - * raid_end_bio_io() is called when we have finished servicing a mirrored - * operation and are ready to return a success/failure code to the buffer - * cache layer. - */ -static void raid_end_bio_io(struct r10bio *r10_bio) -{ - struct bio *bio = r10_bio->master_bio; - int done; - struct r10conf *conf = r10_bio->mddev->private; - - if (bio->bi_phys_segments) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - bio->bi_phys_segments--; - done = (bio->bi_phys_segments == 0); - spin_unlock_irqrestore(&conf->device_lock, flags); - } else - done = 1; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (done) { - bio_endio(bio, 0); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. 
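The out_free_pages/out_free_bio labels above are the usual goto-based unwind: on failure, walk backwards and release only what was actually allocated. A stripped-down standalone illustration of the pattern, using plain malloc/free and nothing md-specific:

#include <stdio.h>
#include <stdlib.h>

#define NBUF 4

int main(void)
{
    void *buf[NBUF] = { NULL };
    int i;

    for (i = 0; i < NBUF; i++) {
        buf[i] = malloc(4096);
        if (!buf[i])
            goto out_free;   /* unwind only the earlier allocations */
    }
    printf("all %d buffers allocated\n", NBUF);
    /* ... use the buffers ... */
out_free:
    while (i--)              /* on either path, free whatever exists */
        free(buf[i]);
    return 0;
}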
- */ - allow_barrier(conf); - } - free_r10bio(r10_bio); -} - -/* - * Update disk head position estimator based on IRQ completion info. - */ -static inline void update_head_pos(int slot, struct r10bio *r10_bio) -{ - struct r10conf *conf = r10_bio->mddev->private; - - conf->mirrors[r10_bio->devs[slot].devnum].head_position = - r10_bio->devs[slot].addr + (r10_bio->sectors); -} - -/* - * Find the disk number which triggered given bio - */ -static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, - struct bio *bio, int *slotp, int *replp) -{ - int slot; - int repl = 0; - - for (slot = 0; slot < conf->copies; slot++) { - if (r10_bio->devs[slot].bio == bio) - break; - if (r10_bio->devs[slot].repl_bio == bio) { - repl = 1; - break; - } - } - - BUG_ON(slot == conf->copies); - update_head_pos(slot, r10_bio); - - if (slotp) - *slotp = slot; - if (replp) - *replp = repl; - return r10_bio->devs[slot].devnum; -} - -static void raid10_end_read_request(struct bio *bio, int error) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct r10bio *r10_bio = bio->bi_private; - int slot, dev; - struct md_rdev *rdev; - struct r10conf *conf = r10_bio->mddev->private; - - - slot = r10_bio->read_slot; - dev = r10_bio->devs[slot].devnum; - rdev = r10_bio->devs[slot].rdev; - /* - * this branch is our 'one mirror IO has finished' event handler: - */ - update_head_pos(slot, r10_bio); - - if (uptodate) { - /* - * Set R10BIO_Uptodate in our master bio, so that - * we will return a good error code to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. - */ - set_bit(R10BIO_Uptodate, &r10_bio->state); - } else { - /* If all other devices that store this block have - * failed, we want to return the error upwards rather - * than fail the last device. 
Here we redefine - * "uptodate" to mean "Don't want to retry" - */ - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - if (!enough(conf, rdev->raid_disk)) - uptodate = 1; - spin_unlock_irqrestore(&conf->device_lock, flags); - } - if (uptodate) { - raid_end_bio_io(r10_bio); - rdev_dec_pending(rdev, conf->mddev); - } else { - /* - * oops, read error - keep the refcount on the rdev - */ - char b[BDEVNAME_SIZE]; - printk_ratelimited(KERN_ERR - "md/raid10:%s: %s: rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(rdev->bdev, b), - (unsigned long long)r10_bio->sector); - set_bit(R10BIO_ReadError, &r10_bio->state); - reschedule_retry(r10_bio); - } -} - -static void close_write(struct r10bio *r10_bio) -{ - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(r10_bio->mddev); -} - -static void one_write_done(struct r10bio *r10_bio) -{ - if (atomic_dec_and_test(&r10_bio->remaining)) { - if (test_bit(R10BIO_WriteError, &r10_bio->state)) - reschedule_retry(r10_bio); - else { - close_write(r10_bio); - if (test_bit(R10BIO_MadeGood, &r10_bio->state)) - reschedule_retry(r10_bio); - else - raid_end_bio_io(r10_bio); - } - } -} - -static void raid10_end_write_request(struct bio *bio, int error) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct r10bio *r10_bio = bio->bi_private; - int dev; - int dec_rdev = 1; - struct r10conf *conf = r10_bio->mddev->private; - int slot, repl; - struct md_rdev *rdev = NULL; - - dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); - - if (repl) - rdev = conf->mirrors[dev].replacement; - if (!rdev) { - smp_rmb(); - repl = 0; - rdev = conf->mirrors[dev].rdev; - } - /* - * this branch is our 'one mirror IO has finished' event handler: - */ - if (!uptodate) { - if (repl) - /* Never record new bad blocks to replacement, - * just fail it. - */ - md_error(rdev->mddev, rdev); - else { - set_bit(WriteErrorSeen, &rdev->flags); - if (!test_and_set_bit(WantReplacement, &rdev->flags)) - set_bit(MD_RECOVERY_NEEDED, - &rdev->mddev->recovery); - set_bit(R10BIO_WriteError, &r10_bio->state); - dec_rdev = 0; - } - } else { - /* - * Set R10BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. - */ - sector_t first_bad; - int bad_sectors; - - set_bit(R10BIO_Uptodate, &r10_bio->state); - - /* Maybe we can clear some bad blocks. */ - if (is_badblock(rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors)) { - bio_put(bio); - if (repl) - r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; - else - r10_bio->devs[slot].bio = IO_MADE_GOOD; - dec_rdev = 0; - set_bit(R10BIO_MadeGood, &r10_bio->state); - } - } - - /* - * - * Let's see if all mirrored write operations have finished - * already. - */ - one_write_done(r10_bio); - if (dec_rdev) - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); -} - -/* - * RAID10 layout manager - * As well as the chunksize and raid_disks count, there are two - * parameters: near_copies and far_copies. - * near_copies * far_copies must be <= raid_disks. - * Normally one of these will be 1. - * If both are 1, we get raid0. - * If near_copies == raid_disks, we get raid1. 
- * - * Chunks are laid out in raid0 style with near_copies copies of the - * first chunk, followed by near_copies copies of the next chunk and - * so on. - * If far_copies > 1, then after 1/far_copies of the array has been assigned - * as described above, we start again with a device offset of near_copies. - * So we effectively have another copy of the whole array further down all - * the drives, but with blocks on different drives. - * With this layout, and block is never stored twice on the one device. - * - * raid10_find_phys finds the sector offset of a given virtual sector - * on each device that it is on. - * - * raid10_find_virt does the reverse mapping, from a device and a - * sector offset to a virtual address - */ - -static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) -{ - int n,f; - sector_t sector; - sector_t chunk; - sector_t stripe; - int dev; - - int slot = 0; - - /* now calculate first sector/dev */ - chunk = r10bio->sector >> conf->chunk_shift; - sector = r10bio->sector & conf->chunk_mask; - - chunk *= conf->near_copies; - stripe = chunk; - dev = sector_div(stripe, conf->raid_disks); - if (conf->far_offset) - stripe *= conf->far_copies; - - sector += stripe << conf->chunk_shift; - - /* and calculate all the others */ - for (n=0; n < conf->near_copies; n++) { - int d = dev; - sector_t s = sector; - r10bio->devs[slot].addr = sector; - r10bio->devs[slot].devnum = d; - slot++; - - for (f = 1; f < conf->far_copies; f++) { - d += conf->near_copies; - if (d >= conf->raid_disks) - d -= conf->raid_disks; - s += conf->stride; - r10bio->devs[slot].devnum = d; - r10bio->devs[slot].addr = s; - slot++; - } - dev++; - if (dev >= conf->raid_disks) { - dev = 0; - sector += (conf->chunk_mask + 1); - } - } - BUG_ON(slot != conf->copies); -} - -static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) -{ - sector_t offset, chunk, vchunk; - - offset = sector & conf->chunk_mask; - if (conf->far_offset) { - int fc; - chunk = sector >> conf->chunk_shift; - fc = sector_div(chunk, conf->far_copies); - dev -= fc * conf->near_copies; - if (dev < 0) - dev += conf->raid_disks; - } else { - while (sector >= conf->stride) { - sector -= conf->stride; - if (dev < conf->near_copies) - dev += conf->raid_disks - conf->near_copies; - else - dev -= conf->near_copies; - } - chunk = sector >> conf->chunk_shift; - } - vchunk = chunk * conf->raid_disks + dev; - sector_div(vchunk, conf->near_copies); - return (vchunk << conf->chunk_shift) + offset; -} - -/** - * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged - * @q: request queue - * @bvm: properties of new bio - * @biovec: the request that could be merged to it. - * - * Return amount of bytes we can accept at this offset - * This requires checking for end-of-chunk if near_copies != raid_disks, - * and for subordinate merge_bvec_fns if merge_check_needed. 
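raid10_find_phys() above turns one logical sector into conf->copies (device, device-sector) pairs: the near copies sit on adjacent devices in the same stripe, and each additional far copy moves near_copies devices along and one stride further into each disk. A standalone model of the far_offset == 0 case with invented geometry (4 disks, 2 near and 2 far copies, 64-sector chunks, a fixed 1024-sector stride; the real stride is derived from the device size):

#include <stdio.h>

#define RAID_DISKS   4
#define NEAR_COPIES  2
#define FAR_COPIES   2
#define CHUNK_SECTS  64                 /* power of two, as in md */
#define CHUNK_MASK   (CHUNK_SECTS - 1)
#define STRIDE       1024               /* sectors per far section on each disk */

static void find_phys(long long logical)
{
    long long chunk  = logical / CHUNK_SECTS;
    long long offset = logical & CHUNK_MASK;
    long long stripe;
    int dev, n, f;

    chunk *= NEAR_COPIES;               /* each stripe holds fewer distinct chunks */
    stripe = chunk / RAID_DISKS;
    dev    = chunk % RAID_DISKS;
    printf("logical %lld:", logical);
    for (n = 0; n < NEAR_COPIES; n++) {
        int d = dev;
        long long s = stripe * CHUNK_SECTS + offset;
        printf("  (dev %d, sect %lld)", d, s);
        for (f = 1; f < FAR_COPIES; f++) {
            d = (d + NEAR_COPIES) % RAID_DISKS;   /* far section starts NEAR_COPIES later */
            s += STRIDE;
            printf("  (dev %d, sect %lld)", d, s);
        }
        dev = (dev + 1) % RAID_DISKS;
    }
    printf("\n");
}

int main(void)
{
    long long sect;

    for (sect = 0; sect < 4 * CHUNK_SECTS; sect += CHUNK_SECTS)
        find_phys(sect);
    return 0;
}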
- */ -static int raid10_mergeable_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mddev *mddev = q->queuedata; - struct r10conf *conf = mddev->private; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max; - unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bvm->bi_size >> 9; - - if (conf->near_copies < conf->raid_disks) { - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) - + bio_sectors)) << 9; - if (max < 0) - /* bio_add cannot handle a negative return */ - max = 0; - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - } else - max = biovec->bv_len; - - if (mddev->merge_check_needed) { - struct r10bio r10_bio; - int s; - r10_bio.sector = sector; - raid10_find_phys(conf, &r10_bio); - rcu_read_lock(); - for (s = 0; s < conf->copies; s++) { - int disk = r10_bio.devs[s].devnum; - struct md_rdev *rdev = rcu_dereference( - conf->mirrors[disk].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = - bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) { - bvm->bi_sector = r10_bio.devs[s].addr - + rdev->data_offset; - bvm->bi_bdev = rdev->bdev; - max = min(max, q->merge_bvec_fn( - q, bvm, biovec)); - } - } - rdev = rcu_dereference(conf->mirrors[disk].replacement); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = - bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) { - bvm->bi_sector = r10_bio.devs[s].addr - + rdev->data_offset; - bvm->bi_bdev = rdev->bdev; - max = min(max, q->merge_bvec_fn( - q, bvm, biovec)); - } - } - } - rcu_read_unlock(); - } - return max; -} - -/* - * This routine returns the disk from which the requested read should - * be done. There is a per-array 'next expected sequential IO' sector - * number - if this matches on the next IO then we use the last disk. - * There is also a per-disk 'last know head position' sector that is - * maintained from IRQ contexts, both the normal and the resync IO - * completion handlers update this position correctly. If there is no - * perfect sequential match then we pick the disk whose head is closest. - * - * If there are 2 mirrors in the same 2 devices, performance degrades - * because position is mirror, not device based. - * - * The rdev for the device selected will have nr_pending incremented. - */ - -/* - * FIXME: possibly should rethink readbalancing and do it differently - * depending on near_copies / far_copies geometry. - */ -static struct md_rdev *read_balance(struct r10conf *conf, - struct r10bio *r10_bio, - int *max_sectors) -{ - const sector_t this_sector = r10_bio->sector; - int disk, slot; - int sectors = r10_bio->sectors; - int best_good_sectors; - sector_t new_distance, best_dist; - struct md_rdev *rdev, *best_rdev; - int do_balance; - int best_slot; - - raid10_find_phys(conf, r10_bio); - rcu_read_lock(); -retry: - sectors = r10_bio->sectors; - best_slot = -1; - best_rdev = NULL; - best_dist = MaxSector; - best_good_sectors = 0; - do_balance = 1; - /* - * Check if we can balance. We can balance on the whole - * device if no resync is going on (recovery is ok), or below - * the resync window. We take the first readable disk when - * above the resync window. 
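read_balance() above, as its comment says, prefers the copy whose last known head position is closest to the requested sector (the extra rules for sequential IO, resync windows and far layouts are omitted here). A standalone sketch of the core seek-distance heuristic with made-up head positions:

#include <stdio.h>
#include <stdlib.h>

struct copy { int devnum; long long addr; long long head_position; };

int main(void)
{
    struct copy copies[3] = {
        { 0, 5000, 200 },    /* head far away */
        { 2, 5000, 4990 },   /* head almost on top of the data */
        { 4, 9000, 9100 },   /* far copy at a different device address */
    };
    long long best_dist = -1;
    int best = -1, i;

    for (i = 0; i < 3; i++) {
        long long dist = llabs(copies[i].addr - copies[i].head_position);
        if (best < 0 || dist < best_dist) {
            best_dist = dist;
            best = i;
        }
    }
    printf("read from dev %d (seek distance %lld)\n",
           copies[best].devnum, best_dist);
    return 0;
}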
- */ - if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) - do_balance = 0; - - for (slot = 0; slot < conf->copies ; slot++) { - sector_t first_bad; - int bad_sectors; - sector_t dev_sector; - - if (r10_bio->devs[slot].bio == IO_BLOCKED) - continue; - disk = r10_bio->devs[slot].devnum; - rdev = rcu_dereference(conf->mirrors[disk].replacement); - if (rdev == NULL || test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags) || - r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) - rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (rdev == NULL || - test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags)) - continue; - if (!test_bit(In_sync, &rdev->flags) && - r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) - continue; - - dev_sector = r10_bio->devs[slot].addr; - if (is_badblock(rdev, dev_sector, sectors, - &first_bad, &bad_sectors)) { - if (best_dist < MaxSector) - /* Already have a better slot */ - continue; - if (first_bad <= dev_sector) { - /* Cannot read here. If this is the - * 'primary' device, then we must not read - * beyond 'bad_sectors' from another device. - */ - bad_sectors -= (dev_sector - first_bad); - if (!do_balance && sectors > bad_sectors) - sectors = bad_sectors; - if (best_good_sectors > sectors) - best_good_sectors = sectors; - } else { - sector_t good_sectors = - first_bad - dev_sector; - if (good_sectors > best_good_sectors) { - best_good_sectors = good_sectors; - best_slot = slot; - best_rdev = rdev; - } - if (!do_balance) - /* Must read from here */ - break; - } - continue; - } else - best_good_sectors = sectors; - - if (!do_balance) - break; - - /* This optimisation is debatable, and completely destroys - * sequential read speed for 'far copies' arrays. So only - * keep it for 'near' arrays, and review those later. - */ - if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) - break; - - /* for far > 1 always use the lowest address */ - if (conf->far_copies > 1) - new_distance = r10_bio->devs[slot].addr; - else - new_distance = abs(r10_bio->devs[slot].addr - - conf->mirrors[disk].head_position); - if (new_distance < best_dist) { - best_dist = new_distance; - best_slot = slot; - best_rdev = rdev; - } - } - if (slot >= conf->copies) { - slot = best_slot; - rdev = best_rdev; - } - - if (slot >= 0) { - atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { - /* Cannot risk returning a device that failed - * before we inc'ed nr_pending - */ - rdev_dec_pending(rdev, conf->mddev); - goto retry; - } - r10_bio->read_slot = slot; - } else - rdev = NULL; - rcu_read_unlock(); - *max_sectors = best_good_sectors; - - return rdev; -} - -static int raid10_congested(void *data, int bits) -{ - struct mddev *mddev = data; - struct r10conf *conf = mddev->private; - int i, ret = 0; - - if ((bits & (1 << BDI_async_congested)) && - conf->pending_count >= max_queued_requests) - return 1; - - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); - for (i = 0; i < conf->raid_disks && ret == 0; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = bdev_get_queue(rdev->bdev); - - ret |= bdi_congested(&q->backing_dev_info, bits); - } - } - rcu_read_unlock(); - return ret; -} - -static void flush_pending_writes(struct r10conf *conf) -{ - /* Any writes that have been queued but are awaiting - * bitmap updates get flushed here. 
- */ - spin_lock_irq(&conf->device_lock); - - if (conf->pending_bio_list.head) { - struct bio *bio; - bio = bio_list_get(&conf->pending_bio_list); - conf->pending_count = 0; - spin_unlock_irq(&conf->device_lock); - /* flush any pending bitmap writes to disk - * before proceeding w/ I/O */ - bitmap_unplug(conf->mddev->bitmap); - wake_up(&conf->wait_barrier); - - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - } else - spin_unlock_irq(&conf->device_lock); -} - -/* Barriers.... - * Sometimes we need to suspend IO while we do something else, - * either some resync/recovery, or reconfigure the array. - * To do this we raise a 'barrier'. - * The 'barrier' is a counter that can be raised multiple times - * to count how many activities are happening which preclude - * normal IO. - * We can only raise the barrier if there is no pending IO. - * i.e. if nr_pending == 0. - * We choose only to raise the barrier if no-one is waiting for the - * barrier to go down. This means that as soon as an IO request - * is ready, no other operations which require a barrier will start - * until the IO request has had a chance. - * - * So: regular IO calls 'wait_barrier'. When that returns there - * is no backgroup IO happening, It must arrange to call - * allow_barrier when it has finished its IO. - * backgroup IO calls must call raise_barrier. Once that returns - * there is no normal IO happeing. It must arrange to call - * lower_barrier when the particular background IO completes. - */ - -static void raise_barrier(struct r10conf *conf, int force) -{ - BUG_ON(force && !conf->barrier); - spin_lock_irq(&conf->resync_lock); - - /* Wait until no block IO is waiting (unless 'force') */ - wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock, ); - - /* block any new IO from starting */ - conf->barrier++; - - /* Now wait for all pending IO to complete */ - wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, ); - - spin_unlock_irq(&conf->resync_lock); -} - -static void lower_barrier(struct r10conf *conf) -{ - unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - spin_unlock_irqrestore(&conf->resync_lock, flags); - wake_up(&conf->wait_barrier); -} - -static void wait_barrier(struct r10conf *conf) -{ - spin_lock_irq(&conf->resync_lock); - if (conf->barrier) { - conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * pre-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to get the nr_pending - * count down. - */ - wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (conf->nr_pending && - current->bio_list && - !bio_list_empty(current->bio_list)), - conf->resync_lock, - ); - conf->nr_waiting--; - } - conf->nr_pending++; - spin_unlock_irq(&conf->resync_lock); -} - -static void allow_barrier(struct r10conf *conf) -{ - unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; - spin_unlock_irqrestore(&conf->resync_lock, flags); - wake_up(&conf->wait_barrier); -} - -static void freeze_array(struct r10conf *conf) -{ - /* stop syncio and normal IO and wait for everything to - * go quiet. 
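The barrier scheme described above boils down to two counters under one lock: nr_pending counts in-flight normal IO, barrier counts resync/recovery activities, and each side waits for the other to drain before proceeding. A minimal POSIX-threads model of that handshake (illustrative only: the nr_waiting fairness rule, the RESYNC_DEPTH limit and the spinlock/wait_event details of the real code are left out; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int barrier;      /* >0 while resync/recovery excludes normal IO */
static int nr_pending;   /* normal IO currently in flight */

static void wait_barrier(void)
{
    pthread_mutex_lock(&lock);
    while (barrier)                  /* normal IO waits out the barrier */
        pthread_cond_wait(&cond, &lock);
    nr_pending++;
    pthread_mutex_unlock(&lock);
}

static void allow_barrier(void)
{
    pthread_mutex_lock(&lock);
    nr_pending--;
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);
}

static void raise_barrier(void)
{
    pthread_mutex_lock(&lock);
    barrier++;
    while (nr_pending)               /* wait for in-flight IO to finish */
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
}

static void lower_barrier(void)
{
    pthread_mutex_lock(&lock);
    barrier--;
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);
}

static void *io_thread(void *arg)
{
    (void)arg;
    wait_barrier();
    printf("normal IO running\n");
    usleep(1000);
    allow_barrier();
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, io_thread, NULL);
    raise_barrier();
    printf("resync window: no normal IO in flight\n");
    lower_barrier();
    pthread_join(t, NULL);
    return 0;
}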
- * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 - * This is called in the context of one normal IO request - * that has failed. Thus any sync request that might be pending - * will be blocked by nr_pending, and we need to wait for - * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) - * must match the number of pending IOs (nr_pending) before - * we continue. - */ - spin_lock_irq(&conf->resync_lock); - conf->barrier++; - conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, - conf->resync_lock, - flush_pending_writes(conf)); - - spin_unlock_irq(&conf->resync_lock); -} - -static void unfreeze_array(struct r10conf *conf) -{ - /* reverse the effect of the freeze */ - spin_lock_irq(&conf->resync_lock); - conf->barrier--; - conf->nr_waiting--; - wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); -} - -static void make_request(struct mddev *mddev, struct bio * bio) -{ - struct r10conf *conf = mddev->private; - struct r10bio *r10_bio; - struct bio *read_bio; - int i; - int chunk_sects = conf->chunk_mask + 1; - const int rw = bio_data_dir(bio); - const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - const unsigned long do_fua = (bio->bi_rw & REQ_FUA); - unsigned long flags; - struct md_rdev *blocked_rdev; - int plugged; - int sectors_handled; - int max_sectors; - - if (unlikely(bio->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bio); - return; - } - - /* If this request crosses a chunk boundary, we need to - * split it. This will only happen for 1 PAGE (or less) requests. - */ - if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) - > chunk_sects && - conf->near_copies < conf->raid_disks)) { - struct bio_pair *bp; - /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || - bio->bi_idx != 0) - goto bad_map; - /* This is a one page bio that upper layers - * refuse to split for us, so we need to split it. - */ - bp = bio_split(bio, - chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); - - /* Each of these 'make_request' calls will call 'wait_barrier'. - * If the first succeeds but the second blocks due to the resync - * thread raising the barrier, we will deadlock because the - * IO to the underlying device will be queued in generic_make_request - * and will never complete, so will never reduce nr_pending. - * So increment nr_waiting here so no new raise_barriers will - * succeed, and so the second wait_barrier cannot block. - */ - spin_lock_irq(&conf->resync_lock); - conf->nr_waiting++; - spin_unlock_irq(&conf->resync_lock); - - make_request(mddev, &bp->bio1); - make_request(mddev, &bp->bio2); - - spin_lock_irq(&conf->resync_lock); - conf->nr_waiting--; - wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); - - bio_pair_release(bp); - return; - bad_map: - printk("md/raid10:%s: make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); - - bio_io_error(bio); - return; - } - - md_write_start(mddev, bio); - - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. 
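make_request() above splits a request that straddles a chunk boundary at chunk_sects - (bi_sector & (chunk_sects - 1)) sectors, i.e. exactly at the end of the current chunk. A standalone check of that arithmetic with illustrative numbers:

#include <stdio.h>

int main(void)
{
    unsigned long long chunk_sects = 128;   /* 64K chunks with 512-byte sectors */
    unsigned long long bi_sector = 1000;    /* request start */
    unsigned long long bi_sectors = 40;     /* request length */
    unsigned long long in_chunk = bi_sector & (chunk_sects - 1);

    if (in_chunk + bi_sectors > chunk_sects) {
        unsigned long long first = chunk_sects - in_chunk;
        printf("split: first part %llu sectors, second part %llu sectors\n",
               first, bi_sectors - first);
    } else {
        printf("no split needed\n");
    }
    return 0;
}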
- */ - wait_barrier(conf); - - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio->bi_size >> 9; - - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector; - r10_bio->state = 0; - - /* We might need to issue multiple reads to different - * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. - * If this is 0, there is only one r10_bio and no locking - * will be needed when the request completes. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - - if (rw == READ) { - /* - * read balancing logic: - */ - struct md_rdev *rdev; - int slot; - -read_again: - rdev = read_balance(conf, r10_bio, &max_sectors); - if (!rdev) { - raid_end_bio_io(r10_bio); - return; - } - slot = r10_bio->read_slot; - - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, - max_sectors); - - r10_bio->devs[slot].bio = read_bio; - r10_bio->devs[slot].rdev = rdev; - - read_bio->bi_sector = r10_bio->devs[slot].addr + - rdev->data_offset; - read_bio->bi_bdev = rdev->bdev; - read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_rw = READ | do_sync; - read_bio->bi_private = r10_bio; - - if (max_sectors < r10_bio->sectors) { - /* Could not read all from this device, so we will - * need another r10_bio. - */ - sectors_handled = (r10_bio->sectors + max_sectors - - bio->bi_sector); - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock(&conf->device_lock); - /* Cannot call generic_make_request directly - * as that will be queued in __generic_make_request - * and subsequent mempool_alloc might block - * waiting for it. so hand bio over to raid10d. - */ - reschedule_retry(r10_bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = ((bio->bi_size >> 9) - - sectors_handled); - r10_bio->state = 0; - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector + sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); - return; - } - - /* - * WRITE: - */ - if (conf->pending_count >= max_queued_requests) { - md_wakeup_thread(mddev->thread); - wait_event(conf->wait_barrier, - conf->pending_count < max_queued_requests); - } - /* first select target devices under rcu_lock and - * inc refcount on their rdev. Record them by setting - * bios[x] to bio - * If there are known/acknowledged bad blocks on any device - * on which we have seen a write error, we want to avoid - * writing to those blocks. This potentially requires several - * writes to write around the bad blocks. Each set of writes - * gets its own r10_bio with a set of bios attached. The number - * of r10_bios is recored in bio->bi_phys_segments just as with - * the read case. 
- */ - plugged = mddev_check_plugged(mddev); - - r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ - raid10_find_phys(conf, r10_bio); -retry_write: - blocked_rdev = NULL; - rcu_read_lock(); - max_sectors = r10_bio->sectors; - - for (i = 0; i < conf->copies; i++) { - int d = r10_bio->devs[i].devnum; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[d].replacement); - if (rdev == rrdev) - rrdev = NULL; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } - if (rrdev && (test_bit(Faulty, &rrdev->flags) - || test_bit(Unmerged, &rrdev->flags))) - rrdev = NULL; - - r10_bio->devs[i].bio = NULL; - r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags)) { - set_bit(R10BIO_Degraded, &r10_bio->state); - continue; - } - if (test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; - int is_bad; - - is_bad = is_badblock(rdev, dev_sector, - max_sectors, - &first_bad, &bad_sectors); - if (is_bad < 0) { - /* Mustn't write here until the bad block - * is acknowledged - */ - atomic_inc(&rdev->nr_pending); - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } - if (is_bad && first_bad <= dev_sector) { - /* Cannot write here at all */ - bad_sectors -= (dev_sector - first_bad); - if (bad_sectors < max_sectors) - /* Mustn't write more than bad_sectors - * to other devices yet - */ - max_sectors = bad_sectors; - /* We don't set R10BIO_Degraded as that - * only applies if the disk is missing, - * so it might be re-added, and we want to - * know to recover this chunk. - * In this case the device is here, and the - * fact that this chunk is not in-sync is - * recorded in the bad block log. - */ - continue; - } - if (is_bad) { - int good_sectors = first_bad - dev_sector; - if (good_sectors < max_sectors) - max_sectors = good_sectors; - } - } - r10_bio->devs[i].bio = bio; - atomic_inc(&rdev->nr_pending); - if (rrdev) { - r10_bio->devs[i].repl_bio = bio; - atomic_inc(&rrdev->nr_pending); - } - } - rcu_read_unlock(); - - if (unlikely(blocked_rdev)) { - /* Have to wait for this device to get unblocked, then retry */ - int j; - int d; - - for (j = 0; j < i; j++) { - if (r10_bio->devs[j].bio) { - d = r10_bio->devs[j].devnum; - rdev_dec_pending(conf->mirrors[d].rdev, mddev); - } - if (r10_bio->devs[j].repl_bio) { - struct md_rdev *rdev; - d = r10_bio->devs[j].devnum; - rdev = conf->mirrors[d].replacement; - if (!rdev) { - /* Race with remove_disk */ - smp_mb(); - rdev = conf->mirrors[d].rdev; - } - rdev_dec_pending(rdev, mddev); - } - } - allow_barrier(conf); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); - goto retry_write; - } - - if (max_sectors < r10_bio->sectors) { - /* We are splitting this into multiple parts, so - * we need to prepare for allocating another r10_bio. 
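The write loop above clips max_sectors around known bad blocks: if the bad range starts at or before this device's target sector, the device is skipped and the write is limited to the remaining length of the bad range; otherwise the write is shortened so it stops just before the bad range. A standalone sketch of that decision with one hypothetical bad range:

#include <stdio.h>

struct badrange { long long first_bad; int bad_sectors; };

int main(void)
{
    long long dev_sector = 1000;      /* where this write would start */
    long long max_sectors = 64;       /* how much we want to write */
    struct badrange bb = { 1040, 8 }; /* known bad blocks on this device */

    if (bb.first_bad <= dev_sector) {
        /* cannot write here at all; the other devices may only be written
         * up to the end of the bad range for now */
        long long remaining_bad = bb.bad_sectors - (dev_sector - bb.first_bad);
        if (remaining_bad < max_sectors)
            max_sectors = remaining_bad;
        printf("skip this device, clip write to %lld sectors\n", max_sectors);
    } else if (bb.first_bad < dev_sector + max_sectors) {
        max_sectors = bb.first_bad - dev_sector;   /* stop before the bad range */
        printf("write only %lld sectors on every device\n", max_sectors);
    } else {
        printf("bad range not hit, write all %lld sectors\n", max_sectors);
    }
    return 0;
}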
- */ - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - } - sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; - - atomic_set(&r10_bio->remaining, 1); - bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); - - for (i = 0; i < conf->copies; i++) { - struct bio *mbio; - int d = r10_bio->devs[i].devnum; - if (!r10_bio->devs[i].bio) - continue; - - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; - - mbio->bi_sector = (r10_bio->devs[i].addr+ - conf->mirrors[d].rdev->data_offset); - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; - mbio->bi_private = r10_bio; - - atomic_inc(&r10_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - - if (!r10_bio->devs[i].repl_bio) - continue; - - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; - - /* We are actively writing to the original device - * so it cannot disappear, so the replacement cannot - * become NULL here - */ - mbio->bi_sector = (r10_bio->devs[i].addr+ - conf->mirrors[d].replacement->data_offset); - mbio->bi_bdev = conf->mirrors[d].replacement->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; - mbio->bi_private = r10_bio; - - atomic_inc(&r10_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - } - - /* Don't remove the bias on 'remaining' (one_write_done) until - * after checking if we need to go around again. - */ - - if (sectors_handled < (bio->bi_size >> 9)) { - one_write_done(r10_bio); - /* We need another r10_bio. It has already been counted - * in bio->bi_phys_segments. - */ - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; - - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector + sectors_handled; - r10_bio->state = 0; - goto retry_write; - } - one_write_done(r10_bio); - - /* In case raid10d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); - - if (do_sync || !mddev->bitmap || !plugged) - md_wakeup_thread(mddev->thread); -} - -static void status(struct seq_file *seq, struct mddev *mddev) -{ - struct r10conf *conf = mddev->private; - int i; - - if (conf->near_copies < conf->raid_disks) - seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); - if (conf->near_copies > 1) - seq_printf(seq, " %d near-copies", conf->near_copies); - if (conf->far_copies > 1) { - if (conf->far_offset) - seq_printf(seq, " %d offset-copies", conf->far_copies); - else - seq_printf(seq, " %d far-copies", conf->far_copies); - } - seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) - seq_printf(seq, "%s", - conf->mirrors[i].rdev && - test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? 
"U" : "_"); - seq_printf(seq, "]"); -} - -/* check if there are enough drives for - * every block to appear on atleast one. - * Don't consider the device numbered 'ignore' - * as we might be about to remove it. - */ -static int enough(struct r10conf *conf, int ignore) -{ - int first = 0; - - do { - int n = conf->copies; - int cnt = 0; - while (n--) { - if (conf->mirrors[first].rdev && - first != ignore) - cnt++; - first = (first+1) % conf->raid_disks; - } - if (cnt == 0) - return 0; - } while (first != 0); - return 1; -} - -static void error(struct mddev *mddev, struct md_rdev *rdev) -{ - char b[BDEVNAME_SIZE]; - struct r10conf *conf = mddev->private; - - /* - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks, ignore the error, let the - * next level up know. - * else mark the drive as failed - */ - if (test_bit(In_sync, &rdev->flags) - && !enough(conf, rdev->raid_disk)) - /* - * Don't fail the drive, just return an IO error. - */ - return; - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - /* - * if recovery is running, make sure it aborts. - */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - } - set_bit(Blocked, &rdev->flags); - set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT - "md/raid10:%s: Disk failure on %s, disabling device.\n" - "md/raid10:%s: Operation continuing on %d devices.\n", - mdname(mddev), bdevname(rdev->bdev, b), - mdname(mddev), conf->raid_disks - mddev->degraded); -} - -static void print_conf(struct r10conf *conf) -{ - int i; - struct mirror_info *tmp; - - printk(KERN_DEBUG "RAID10 conf printout:\n"); - if (!conf) { - printk(KERN_DEBUG "(!conf)\n"); - return; - } - printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); - - for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; - tmp = conf->mirrors + i; - if (tmp->rdev) - printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &tmp->rdev->flags), - !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); - } -} - -static void close_sync(struct r10conf *conf) -{ - wait_barrier(conf); - allow_barrier(conf); - - mempool_destroy(conf->r10buf_pool); - conf->r10buf_pool = NULL; -} - -static int raid10_spare_active(struct mddev *mddev) -{ - int i; - struct r10conf *conf = mddev->private; - struct mirror_info *tmp; - int count = 0; - unsigned long flags; - - /* - * Find all non-in_sync disks within the RAID10 configuration - * and mark them in_sync - */ - for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->mirrors + i; - if (tmp->replacement - && tmp->replacement->recovery_offset == MaxSector - && !test_bit(Faulty, &tmp->replacement->flags) - && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { - /* Replacement has just become active */ - if (!tmp->rdev - || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) - count++; - if (tmp->rdev) { - /* Replaced device not technically faulty, - * but we need to be sure it gets removed - * and never re-added. 
- */ - set_bit(Faulty, &tmp->rdev->flags); - sysfs_notify_dirent_safe( - tmp->rdev->sysfs_state); - } - sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); - } else if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { - count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); - } - } - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded -= count; - spin_unlock_irqrestore(&conf->device_lock, flags); - - print_conf(conf); - return count; -} - - -static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct r10conf *conf = mddev->private; - int err = -EEXIST; - int mirror; - int first = 0; - int last = conf->raid_disks - 1; - struct request_queue *q = bdev_get_queue(rdev->bdev); - - if (mddev->recovery_cp < MaxSector) - /* only hot-add to in-sync arrays, as recovery is - * very different from resync - */ - return -EBUSY; - if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) - return -EINVAL; - - if (rdev->raid_disk >= 0) - first = last = rdev->raid_disk; - - if (q->merge_bvec_fn) { - set_bit(Unmerged, &rdev->flags); - mddev->merge_check_needed = 1; - } - - if (rdev->saved_raid_disk >= first && - conf->mirrors[rdev->saved_raid_disk].rdev == NULL) - mirror = rdev->saved_raid_disk; - else - mirror = first; - for ( ; mirror <= last ; mirror++) { - struct mirror_info *p = &conf->mirrors[mirror]; - if (p->recovery_disabled == mddev->recovery_disabled) - continue; - if (p->rdev) { - if (!test_bit(WantReplacement, &p->rdev->flags) || - p->replacement != NULL) - continue; - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); - break; - } - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - p->head_position = 0; - p->recovery_disabled = mddev->recovery_disabled - 1; - rdev->raid_disk = mirror; - err = 0; - if (rdev->saved_raid_disk != mirror) - conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); - break; - } - if (err == 0 && test_bit(Unmerged, &rdev->flags)) { - /* Some requests might not have seen this new - * merge_bvec_fn. We must wait for them to complete - * before merging the device fully. - * First we make sure any code which has tested - * our function has submitted the request, then - * we wait for all outstanding requests to complete. - */ - synchronize_sched(); - raise_barrier(conf, 0); - lower_barrier(conf); - clear_bit(Unmerged, &rdev->flags); - } - md_integrity_add_rdev(rdev, mddev); - print_conf(conf); - return err; -} - -static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct r10conf *conf = mddev->private; - int err = 0; - int number = rdev->raid_disk; - struct md_rdev **rdevp; - struct mirror_info *p = conf->mirrors + number; - - print_conf(conf); - if (rdev == p->rdev) - rdevp = &p->rdev; - else if (rdev == p->replacement) - rdevp = &p->replacement; - else - return 0; - - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - err = -EBUSY; - goto abort; - } - /* Only remove faulty devices if recovery - * is not possible. 
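/*
 * Editor's note: a simplified model of the slot-selection policy in
 * raid10_add_disk() above: prefer the disk's previous slot if it is free,
 * otherwise take the first slot that is empty or has asked for a replacement.
 * The enum and pick_slot() are illustrative stand-ins, not kernel API.
 */
#include <stdio.h>

enum placement { SLOT_EMPTY, SLOT_WANTS_REPLACEMENT, SLOT_FULL };

static int pick_slot(const enum placement *slot, int nslots, int saved)
{
	if (saved >= 0 && saved < nslots && slot[saved] == SLOT_EMPTY)
		return saved;				/* reuse old position */
	for (int i = 0; i < nslots; i++)
		if (slot[i] == SLOT_EMPTY || slot[i] == SLOT_WANTS_REPLACEMENT)
			return i;			/* first usable slot */
	return -1;					/* nothing to do: -EEXIST */
}

int main(void)
{
	enum placement slots[3] = { SLOT_FULL, SLOT_EMPTY, SLOT_WANTS_REPLACEMENT };
	printf("%d\n", pick_slot(slots, 3, 1));		/* prints 1 */
	return 0;
}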
- */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != p->recovery_disabled && - (!p->replacement || p->replacement == rdev) && - enough(conf, -1)) { - err = -EBUSY; - goto abort; - } - *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } else if (p->replacement) { - /* We must have just cleared 'rdev' */ - p->rdev = p->replacement; - clear_bit(Replacement, &p->replacement->flags); - smp_mb(); /* Make sure other CPUs may see both as identical - * but will never see neither -- if they are careful. - */ - p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); - } else - /* We might have just remove the Replacement as faulty - * Clear the flag just in case - */ - clear_bit(WantReplacement, &rdev->flags); - - err = md_integrity_register(mddev); - -abort: - - print_conf(conf); - return err; -} - - -static void end_sync_read(struct bio *bio, int error) -{ - struct r10bio *r10_bio = bio->bi_private; - struct r10conf *conf = r10_bio->mddev->private; - int d; - - d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); - - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - set_bit(R10BIO_Uptodate, &r10_bio->state); - else - /* The write handler will notice the lack of - * R10BIO_Uptodate and record any errors etc - */ - atomic_add(r10_bio->sectors, - &conf->mirrors[d].rdev->corrected_errors); - - /* for reconstruct, we always reschedule after a read. - * for resync, only after all reads - */ - rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); - if (test_bit(R10BIO_IsRecover, &r10_bio->state) || - atomic_dec_and_test(&r10_bio->remaining)) { - /* we have read all the blocks, - * do the comparison in process context in raid10d - */ - reschedule_retry(r10_bio); - } -} - -static void end_sync_request(struct r10bio *r10_bio) -{ - struct mddev *mddev = r10_bio->mddev; - - while (atomic_dec_and_test(&r10_bio->remaining)) { - if (r10_bio->master_bio == NULL) { - /* the primary of several recovery bios */ - sector_t s = r10_bio->sectors; - if (test_bit(R10BIO_MadeGood, &r10_bio->state) || - test_bit(R10BIO_WriteError, &r10_bio->state)) - reschedule_retry(r10_bio); - else - put_buf(r10_bio); - md_done_sync(mddev, s, 1); - break; - } else { - struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; - if (test_bit(R10BIO_MadeGood, &r10_bio->state) || - test_bit(R10BIO_WriteError, &r10_bio->state)) - reschedule_retry(r10_bio); - else - put_buf(r10_bio); - r10_bio = r10_bio2; - } - } -} - -static void end_sync_write(struct bio *bio, int error) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct r10bio *r10_bio = bio->bi_private; - struct mddev *mddev = r10_bio->mddev; - struct r10conf *conf = mddev->private; - int d; - sector_t first_bad; - int bad_sectors; - int slot; - int repl; - struct md_rdev *rdev = NULL; - - d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); - if (repl) - rdev = conf->mirrors[d].replacement; - else - rdev = conf->mirrors[d].rdev; - - if (!uptodate) { - if (repl) - md_error(mddev, rdev); - else { - set_bit(WriteErrorSeen, &rdev->flags); - if (!test_and_set_bit(WantReplacement, &rdev->flags)) - set_bit(MD_RECOVERY_NEEDED, - &rdev->mddev->recovery); - set_bit(R10BIO_WriteError, &r10_bio->state); - } - } else if (is_badblock(rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors)) - set_bit(R10BIO_MadeGood, &r10_bio->state); - - rdev_dec_pending(rdev, mddev); - - end_sync_request(r10_bio); -} - -/* - * Note: 
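/*
 * Editor's note: a userspace model of the removal dance in raid10_remove_disk()
 * above: publish NULL, wait for old readers to drain (synchronize_rcu() in the
 * kernel, modeled here as a comment only), then check whether an in-flight
 * request raced with us and back out if so.  'device', 'slot' and 'nr_pending'
 * are illustrative stand-ins.
 */
#include <stdio.h>
#include <stdatomic.h>

struct device { const char *name; };

static struct device disk = { "rdev0" };
static struct device *_Atomic slot = &disk;
static atomic_int nr_pending;              /* requests still holding a reference */

static int try_remove(void)
{
	struct device *d = atomic_exchange(&slot, NULL);  /* publish NULL */
	/* synchronize_rcu() would go here: wait for existing readers to finish */
	if (atomic_load(&nr_pending)) {
		atomic_store(&slot, d);    /* lost the race, try again later */
		return -1;                 /* models -EBUSY */
	}
	printf("%s removed\n", d->name);
	return 0;
}

int main(void)
{
	atomic_store(&nr_pending, 1);
	printf("%d\n", try_remove());      /* -1: still busy */
	atomic_store(&nr_pending, 0);
	printf("%d\n", try_remove());      /* 0: removed */
	return 0;
}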
sync and recover and handled very differently for raid10 - * This code is for resync. - * For resync, we read through virtual addresses and read all blocks. - * If there is any error, we schedule a write. The lowest numbered - * drive is authoritative. - * However requests come for physical address, so we need to map. - * For every physical address there are raid_disks/copies virtual addresses, - * which is always are least one, but is not necessarly an integer. - * This means that a physical address can span multiple chunks, so we may - * have to submit multiple io requests for a single sync request. - */ -/* - * We check if all blocks are in-sync and only write to blocks that - * aren't in sync - */ -static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) -{ - struct r10conf *conf = mddev->private; - int i, first; - struct bio *tbio, *fbio; - int vcnt; - - atomic_set(&r10_bio->remaining, 1); - - /* find the first device with a block */ - for (i=0; i<conf->copies; i++) - if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) - break; - - if (i == conf->copies) - goto done; - - first = i; - fbio = r10_bio->devs[i].bio; - - vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); - /* now find blocks with errors */ - for (i=0 ; i < conf->copies ; i++) { - int j, d; - - tbio = r10_bio->devs[i].bio; - - if (tbio->bi_end_io != end_sync_read) - continue; - if (i == first) - continue; - if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) { - /* We know that the bi_io_vec layout is the same for - * both 'first' and 'i', so we just compare them. - * All vec entries are PAGE_SIZE; - */ - for (j = 0; j < vcnt; j++) - if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), - page_address(tbio->bi_io_vec[j].bv_page), - fbio->bi_io_vec[j].bv_len)) - break; - if (j == vcnt) - continue; - mddev->resync_mismatches += r10_bio->sectors; - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) - /* Don't fix anything. */ - continue; - } - /* Ok, we need to write this bio, either to correct an - * inconsistency or to correct an unreadable block. 
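/*
 * Editor's note: a minimal model of the comparison step in sync_request_write()
 * above: the first readable copy is authoritative, every other copy is memcmp'd
 * page by page, and only copies that differ get rewritten.  Buffer contents and
 * the toy page size are made up for illustration.
 */
#include <stdio.h>
#include <string.h>

#define NPAGES 4
#define PAGE   16			/* toy page size */

static int copy_needs_rewrite(char first[NPAGES][PAGE], char other[NPAGES][PAGE])
{
	for (int j = 0; j < NPAGES; j++)
		if (memcmp(first[j], other[j], PAGE))
			return 1;	/* mismatch: schedule a write */
	return 0;			/* identical: nothing to do */
}

int main(void)
{
	char a[NPAGES][PAGE] = { "data0", "data1", "data2", "data3" };
	char b[NPAGES][PAGE] = { "data0", "dataX", "data2", "data3" };
	printf("%d\n", copy_needs_rewrite(a, a));	/* prints 0 */
	printf("%d\n", copy_needs_rewrite(a, b));	/* prints 1 */
	return 0;
}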
- * First we need to fixup bv_offset, bv_len and - * bi_vecs, as the read request might have corrupted these - */ - tbio->bi_vcnt = vcnt; - tbio->bi_size = r10_bio->sectors << 9; - tbio->bi_idx = 0; - tbio->bi_phys_segments = 0; - tbio->bi_flags &= ~(BIO_POOL_MASK - 1); - tbio->bi_flags |= 1 << BIO_UPTODATE; - tbio->bi_next = NULL; - tbio->bi_rw = WRITE; - tbio->bi_private = r10_bio; - tbio->bi_sector = r10_bio->devs[i].addr; - - for (j=0; j < vcnt ; j++) { - tbio->bi_io_vec[j].bv_offset = 0; - tbio->bi_io_vec[j].bv_len = PAGE_SIZE; - - memcpy(page_address(tbio->bi_io_vec[j].bv_page), - page_address(fbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - } - tbio->bi_end_io = end_sync_write; - - d = r10_bio->devs[i].devnum; - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); - - tbio->bi_sector += conf->mirrors[d].rdev->data_offset; - tbio->bi_bdev = conf->mirrors[d].rdev->bdev; - generic_make_request(tbio); - } - - /* Now write out to any replacement devices - * that are active - */ - for (i = 0; i < conf->copies; i++) { - int j, d; - - tbio = r10_bio->devs[i].repl_bio; - if (!tbio || !tbio->bi_end_io) - continue; - if (r10_bio->devs[i].bio->bi_end_io != end_sync_write - && r10_bio->devs[i].bio != fbio) - for (j = 0; j < vcnt; j++) - memcpy(page_address(tbio->bi_io_vec[j].bv_page), - page_address(fbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - d = r10_bio->devs[i].devnum; - atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].replacement->bdev, - tbio->bi_size >> 9); - generic_make_request(tbio); - } - -done: - if (atomic_dec_and_test(&r10_bio->remaining)) { - md_done_sync(mddev, r10_bio->sectors, 1); - put_buf(r10_bio); - } -} - -/* - * Now for the recovery code. - * Recovery happens across physical sectors. - * We recover all non-is_sync drives by finding the virtual address of - * each, and then choose a working drive that also has that virt address. - * There is a separate r10_bio for each non-in_sync drive. - * Only the first two slots are in use. The first for reading, - * The second for writing. - * - */ -static void fix_recovery_read_error(struct r10bio *r10_bio) -{ - /* We got a read error during recovery. - * We repeat the read in smaller page-sized sections. - * If a read succeeds, write it to the new device or record - * a bad block if we cannot. - * If a read fails, record a bad block on both old and - * new devices. 
- */ - struct mddev *mddev = r10_bio->mddev; - struct r10conf *conf = mddev->private; - struct bio *bio = r10_bio->devs[0].bio; - sector_t sect = 0; - int sectors = r10_bio->sectors; - int idx = 0; - int dr = r10_bio->devs[0].devnum; - int dw = r10_bio->devs[1].devnum; - - while (sectors) { - int s = sectors; - struct md_rdev *rdev; - sector_t addr; - int ok; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - rdev = conf->mirrors[dr].rdev; - addr = r10_bio->devs[0].addr + sect, - ok = sync_page_io(rdev, - addr, - s << 9, - bio->bi_io_vec[idx].bv_page, - READ, false); - if (ok) { - rdev = conf->mirrors[dw].rdev; - addr = r10_bio->devs[1].addr + sect; - ok = sync_page_io(rdev, - addr, - s << 9, - bio->bi_io_vec[idx].bv_page, - WRITE, false); - if (!ok) { - set_bit(WriteErrorSeen, &rdev->flags); - if (!test_and_set_bit(WantReplacement, - &rdev->flags)) - set_bit(MD_RECOVERY_NEEDED, - &rdev->mddev->recovery); - } - } - if (!ok) { - /* We don't worry if we cannot set a bad block - - * it really is bad so there is no loss in not - * recording it yet - */ - rdev_set_badblocks(rdev, addr, s, 0); - - if (rdev != conf->mirrors[dw].rdev) { - /* need bad block on destination too */ - struct md_rdev *rdev2 = conf->mirrors[dw].rdev; - addr = r10_bio->devs[1].addr + sect; - ok = rdev_set_badblocks(rdev2, addr, s, 0); - if (!ok) { - /* just abort the recovery */ - printk(KERN_NOTICE - "md/raid10:%s: recovery aborted" - " due to read error\n", - mdname(mddev)); - - conf->mirrors[dw].recovery_disabled - = mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, - &mddev->recovery); - break; - } - } - } - - sectors -= s; - sect += s; - idx++; - } -} - -static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) -{ - struct r10conf *conf = mddev->private; - int d; - struct bio *wbio, *wbio2; - - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { - fix_recovery_read_error(r10_bio); - end_sync_request(r10_bio); - return; - } - - /* - * share the pages with the first bio - * and submit the write request - */ - d = r10_bio->devs[1].devnum; - wbio = r10_bio->devs[1].bio; - wbio2 = r10_bio->devs[1].repl_bio; - if (wbio->bi_end_io) { - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); - generic_make_request(wbio); - } - if (wbio2 && wbio2->bi_end_io) { - atomic_inc(&conf->mirrors[d].replacement->nr_pending); - md_sync_acct(conf->mirrors[d].replacement->bdev, - wbio2->bi_size >> 9); - generic_make_request(wbio2); - } -} - - -/* - * Used by fix_read_error() to decay the per rdev read_errors. - * We halve the read error count for every hour that has elapsed - * since the last recorded read error. - * - */ -static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) -{ - struct timespec cur_time_mon; - unsigned long hours_since_last; - unsigned int read_errors = atomic_read(&rdev->read_errors); - - ktime_get_ts(&cur_time_mon); - - if (rdev->last_read_error.tv_sec == 0 && - rdev->last_read_error.tv_nsec == 0) { - /* first time we've seen a read error */ - rdev->last_read_error = cur_time_mon; - return; - } - - hours_since_last = (cur_time_mon.tv_sec - - rdev->last_read_error.tv_sec) / 3600; - - rdev->last_read_error = cur_time_mon; - - /* - * if hours_since_last is > the number of bits in read_errors - * just set read errors to 0. We do this to avoid - * overflowing the shift of read_errors by hours_since_last. 
- */ - if (hours_since_last >= 8 * sizeof(read_errors)) - atomic_set(&rdev->read_errors, 0); - else - atomic_set(&rdev->read_errors, read_errors >> hours_since_last); -} - -static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, - int sectors, struct page *page, int rw) -{ - sector_t first_bad; - int bad_sectors; - - if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) - && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) - return -1; - if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) - /* success */ - return 1; - if (rw == WRITE) { - set_bit(WriteErrorSeen, &rdev->flags); - if (!test_and_set_bit(WantReplacement, &rdev->flags)) - set_bit(MD_RECOVERY_NEEDED, - &rdev->mddev->recovery); - } - /* need to record an error - either for the block or the device */ - if (!rdev_set_badblocks(rdev, sector, sectors, 0)) - md_error(rdev->mddev, rdev); - return 0; -} - -/* - * This is a kernel thread which: - * - * 1. Retries failed read operations on working mirrors. - * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array synchronising. - */ - -static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) -{ - int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; - struct md_rdev*rdev; - int max_read_errors = atomic_read(&mddev->max_corr_read_errors); - int d = r10_bio->devs[r10_bio->read_slot].devnum; - - /* still own a reference to this rdev, so it cannot - * have been cleared recently. - */ - rdev = conf->mirrors[d].rdev; - - if (test_bit(Faulty, &rdev->flags)) - /* drive has already been failed, just ignore any - more fix_read_error() attempts */ - return; - - check_decay_read_errors(mddev, rdev); - atomic_inc(&rdev->read_errors); - if (atomic_read(&rdev->read_errors) > max_read_errors) { - char b[BDEVNAME_SIZE]; - bdevname(rdev->bdev, b); - - printk(KERN_NOTICE - "md/raid10:%s: %s: Raid device exceeded " - "read_error threshold [cur %d:max %d]\n", - mdname(mddev), b, - atomic_read(&rdev->read_errors), max_read_errors); - printk(KERN_NOTICE - "md/raid10:%s: %s: Failing raid device\n", - mdname(mddev), b); - md_error(mddev, conf->mirrors[d].rdev); - r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; - return; - } - - while(sectors) { - int s = sectors; - int sl = r10_bio->read_slot; - int success = 0; - int start; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - rcu_read_lock(); - do { - sector_t first_bad; - int bad_sectors; - - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - !test_bit(Unmerged, &rdev->flags) && - test_bit(In_sync, &rdev->flags) && - is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, - &first_bad, &bad_sectors) == 0) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - success = sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, - conf->tmppage, READ, false); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - if (success) - break; - } - sl++; - if (sl == conf->copies) - sl = 0; - } while (!success && sl != r10_bio->read_slot); - rcu_read_unlock(); - - if (!success) { - /* Cannot read from anywhere, just mark the block - * as bad on the first device to discourage future - * reads. 
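/*
 * Editor's note: a userspace model of the read-error accounting above: the
 * per-device counter is halved for every elapsed hour (with a guard against
 * shifting by more than the counter width) and the device is failed once the
 * decayed count crosses the threshold.  Plain integers stand in for the
 * kernel's atomics and timespecs; the numbers are made up.
 */
#include <stdio.h>

static unsigned int decay(unsigned int read_errors, unsigned long hours)
{
	if (hours >= 8 * sizeof(read_errors))
		return 0;			/* would shift out every bit */
	return read_errors >> hours;		/* halve once per hour */
}

int main(void)
{
	unsigned int max_read_errors = 20;
	unsigned int errors = 40;

	errors = decay(errors, 3);		/* 40 >> 3 = 5 */
	errors += 1;				/* one new read error */
	printf("decayed+1 = %u, fail device: %s\n",
	       errors, errors > max_read_errors ? "yes" : "no");
	return 0;
}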
- */ - int dn = r10_bio->devs[r10_bio->read_slot].devnum; - rdev = conf->mirrors[dn].rdev; - - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[r10_bio->read_slot].addr - + sect, - s, 0)) { - md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio - = IO_BLOCKED; - } - break; - } - - start = sl; - /* write it back and re-read */ - rcu_read_lock(); - while (sl != r10_bio->read_slot) { - char b[BDEVNAME_SIZE]; - - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (!rdev || - test_bit(Unmerged, &rdev->flags) || - !test_bit(In_sync, &rdev->flags)) - continue; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (r10_sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s, conf->tmppage, WRITE) - == 0) { - /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: read correction " - "write failed" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing " - "drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); - } - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - sl = start; - while (sl != r10_bio->read_slot) { - char b[BDEVNAME_SIZE]; - - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (!rdev || - !test_bit(In_sync, &rdev->flags)) - continue; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - switch (r10_sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s, conf->tmppage, - READ)) { - case 0: - /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: unable to read back " - "corrected sectors" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing " - "drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); - break; - case 1: - printk(KERN_INFO - "md/raid10:%s: read error corrected" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - atomic_add(s, &rdev->corrected_errors); - } - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - rcu_read_unlock(); - - sectors -= s; - sect += s; - } -} - -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - -static int narrow_write_error(struct r10bio *r10_bio, int i) -{ - struct bio *bio = r10_bio->master_bio; - struct mddev *mddev = r10_bio->mddev; - struct r10conf *conf = mddev->private; - struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; - /* bio has the data to be written to slot 'i' where - * we just recently had a write error. - * We repeatedly clone the bio and trim down to one block, - * then try the write. Where the write fails we record - * a bad block. - * It is conceivable that the bio doesn't exactly align with - * blocks. We must handle this. - * - * We currently own a reference to the rdev. 
- */ - - int block_sectors; - sector_t sector; - int sectors; - int sect_to_write = r10_bio->sectors; - int ok = 1; - - if (rdev->badblocks.shift < 0) - return 0; - - block_sectors = 1 << rdev->badblocks.shift; - sector = r10_bio->sector; - sectors = ((r10_bio->sector + block_sectors) - & ~(sector_t)(block_sectors - 1)) - - sector; - - while (sect_to_write) { - struct bio *wbio; - if (sectors > sect_to_write) - sectors = sect_to_write; - /* Write at 'sector' for 'sectors' */ - wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(wbio, sector - bio->bi_sector, sectors); - wbio->bi_sector = (r10_bio->devs[i].addr+ - rdev->data_offset+ - (sector - r10_bio->sector)); - wbio->bi_bdev = rdev->bdev; - if (submit_bio_wait(WRITE, wbio) == 0) - /* Failure! */ - ok = rdev_set_badblocks(rdev, sector, - sectors, 0) - && ok; - - bio_put(wbio); - sect_to_write -= sectors; - sector += sectors; - sectors = block_sectors; - } - return ok; -} - -static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) -{ - int slot = r10_bio->read_slot; - struct bio *bio; - struct r10conf *conf = mddev->private; - struct md_rdev *rdev = r10_bio->devs[slot].rdev; - char b[BDEVNAME_SIZE]; - unsigned long do_sync; - int max_sectors; - - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. When we find one, we re-write - * and check it that fixes the read error. - * This is all done synchronously while the array is - * frozen. - */ - bio = r10_bio->devs[slot].bio; - bdevname(bio->bi_bdev, b); - bio_put(bio); - r10_bio->devs[slot].bio = NULL; - - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, mddev, r10_bio); - unfreeze_array(conf); - } else - r10_bio->devs[slot].bio = IO_BLOCKED; - - rdev_dec_pending(rdev, mddev); - -read_more: - rdev = read_balance(conf, r10_bio, &max_sectors); - if (rdev == NULL) { - printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), b, - (unsigned long long)r10_bio->sector); - raid_end_bio_io(r10_bio); - return; - } - - do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); - slot = r10_bio->read_slot; - printk_ratelimited( - KERN_ERR - "md/raid10:%s: %s: redirecting " - "sector %llu to another mirror\n", - mdname(mddev), - bdevname(rdev->bdev, b), - (unsigned long long)r10_bio->sector); - bio = bio_clone_mddev(r10_bio->master_bio, - GFP_NOIO, mddev); - md_trim_bio(bio, - r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[slot].bio = bio; - r10_bio->devs[slot].rdev = rdev; - bio->bi_sector = r10_bio->devs[slot].addr - + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ | do_sync; - bio->bi_private = r10_bio; - bio->bi_end_io = raid10_end_read_request; - if (max_sectors < r10_bio->sectors) { - /* Drat - have to split this up more */ - struct bio *mbio = r10_bio->master_bio; - int sectors_handled = - r10_bio->sector + max_sectors - - mbio->bi_sector; - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (mbio->bi_phys_segments == 0) - mbio->bi_phys_segments = 2; - else - mbio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - generic_make_request(bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, - GFP_NOIO); - r10_bio->master_bio = mbio; - r10_bio->sectors = (mbio->bi_size >> 9) - - sectors_handled; - r10_bio->state = 0; - set_bit(R10BIO_ReadError, - &r10_bio->state); - r10_bio->mddev = mddev; - r10_bio->sector = mbio->bi_sector - + 
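/*
 * Editor's note: a worked example of the bad-block alignment done in
 * narrow_write_error() above.  With a badblock granularity of 2^shift sectors,
 * the first chunk is trimmed so that every later chunk starts on a granule
 * boundary.  The sector number and shift are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long sector = 1003;	/* start of the failed write */
	int shift = 3;				/* bad blocks recorded in 8-sector units */
	unsigned long long block_sectors = 1ULL << shift;

	/* distance from 'sector' up to the next 8-sector boundary */
	unsigned long long first_chunk =
		((sector + block_sectors) & ~(block_sectors - 1)) - sector;

	printf("first chunk: %llu sectors (%llu..%llu), then full %llu-sector chunks\n",
	       first_chunk, sector, sector + first_chunk - 1, block_sectors);
	/* prints: first chunk: 5 sectors (1003..1007), then full 8-sector chunks */
	return 0;
}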
sectors_handled; - - goto read_more; - } else - generic_make_request(bio); -} - -static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) -{ - /* Some sort of write request has finished and it - * succeeded in writing where we thought there was a - * bad block. So forget the bad block. - * Or possibly if failed and we need to record - * a bad block. - */ - int m; - struct md_rdev *rdev; - - if (test_bit(R10BIO_IsSync, &r10_bio->state) || - test_bit(R10BIO_IsRecover, &r10_bio->state)) { - for (m = 0; m < conf->copies; m++) { - int dev = r10_bio->devs[m].devnum; - rdev = conf->mirrors[dev].rdev; - if (r10_bio->devs[m].bio == NULL) - continue; - if (test_bit(BIO_UPTODATE, - &r10_bio->devs[m].bio->bi_flags)) { - rdev_clear_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors); - } else { - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors, 0)) - md_error(conf->mddev, rdev); - } - rdev = conf->mirrors[dev].replacement; - if (r10_bio->devs[m].repl_bio == NULL) - continue; - if (test_bit(BIO_UPTODATE, - &r10_bio->devs[m].repl_bio->bi_flags)) { - rdev_clear_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors); - } else { - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors, 0)) - md_error(conf->mddev, rdev); - } - } - put_buf(r10_bio); - } else { - for (m = 0; m < conf->copies; m++) { - int dev = r10_bio->devs[m].devnum; - struct bio *bio = r10_bio->devs[m].bio; - rdev = conf->mirrors[dev].rdev; - if (bio == IO_MADE_GOOD) { - rdev_clear_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors); - rdev_dec_pending(rdev, conf->mddev); - } else if (bio != NULL && - !test_bit(BIO_UPTODATE, &bio->bi_flags)) { - if (!narrow_write_error(r10_bio, m)) { - md_error(conf->mddev, rdev); - set_bit(R10BIO_Degraded, - &r10_bio->state); - } - rdev_dec_pending(rdev, conf->mddev); - } - bio = r10_bio->devs[m].repl_bio; - rdev = conf->mirrors[dev].replacement; - if (rdev && bio == IO_MADE_GOOD) { - rdev_clear_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors); - rdev_dec_pending(rdev, conf->mddev); - } - } - if (test_bit(R10BIO_WriteError, - &r10_bio->state)) - close_write(r10_bio); - raid_end_bio_io(r10_bio); - } -} - -static void raid10d(struct mddev *mddev) -{ - struct r10bio *r10_bio; - unsigned long flags; - struct r10conf *conf = mddev->private; - struct list_head *head = &conf->retry_list; - struct blk_plug plug; - - md_check_recovery(mddev); - - blk_start_plug(&plug); - for (;;) { - - flush_pending_writes(conf); - - spin_lock_irqsave(&conf->device_lock, flags); - if (list_empty(head)) { - spin_unlock_irqrestore(&conf->device_lock, flags); - break; - } - r10_bio = list_entry(head->prev, struct r10bio, retry_list); - list_del(head->prev); - conf->nr_queued--; - spin_unlock_irqrestore(&conf->device_lock, flags); - - mddev = r10_bio->mddev; - conf = mddev->private; - if (test_bit(R10BIO_MadeGood, &r10_bio->state) || - test_bit(R10BIO_WriteError, &r10_bio->state)) - handle_write_completed(conf, r10_bio); - else if (test_bit(R10BIO_IsSync, &r10_bio->state)) - sync_request_write(mddev, r10_bio); - else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) - recovery_request_write(mddev, r10_bio); - else if (test_bit(R10BIO_ReadError, &r10_bio->state)) - handle_read_error(mddev, r10_bio); - else { - /* just a partial read to be scheduled from a - * separate context - */ - int slot = r10_bio->read_slot; - generic_make_request(r10_bio->devs[slot].bio); - } - - cond_resched(); - if (mddev->flags & 
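/*
 * Editor's note: a sketch of the dispatch policy in raid10d() above: each queued
 * r10bio is routed by its state bits, checked in the same order as the kernel
 * code.  The bit names and handler strings are illustrative stand-ins for the
 * real R10BIO_* flags and functions.
 */
#include <stdio.h>

enum { BIT_MADE_GOOD = 1, BIT_WRITE_ERR = 2, BIT_SYNC = 4,
       BIT_RECOVER = 8, BIT_READ_ERR = 16 };

static const char *dispatch(unsigned state)
{
	if (state & (BIT_MADE_GOOD | BIT_WRITE_ERR))
		return "handle_write_completed";
	if (state & BIT_SYNC)
		return "sync_request_write";
	if (state & BIT_RECOVER)
		return "recovery_request_write";
	if (state & BIT_READ_ERR)
		return "handle_read_error";
	return "resubmit partial read";
}

int main(void)
{
	printf("%s\n", dispatch(BIT_SYNC));		/* sync_request_write */
	printf("%s\n", dispatch(BIT_READ_ERR));		/* handle_read_error */
	printf("%s\n", dispatch(0));			/* resubmit partial read */
	return 0;
}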
~(1<<MD_CHANGE_PENDING)) - md_check_recovery(mddev); - } - blk_finish_plug(&plug); -} - - -static int init_resync(struct r10conf *conf) -{ - int buffs; - int i; - - buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; - BUG_ON(conf->r10buf_pool); - conf->have_replacement = 0; - for (i = 0; i < conf->raid_disks; i++) - if (conf->mirrors[i].replacement) - conf->have_replacement = 1; - conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); - if (!conf->r10buf_pool) - return -ENOMEM; - conf->next_resync = 0; - return 0; -} - -/* - * perform a "sync" on one "block" - * - * We need to make sure that no normal I/O request - particularly write - * requests - conflict with active sync requests. - * - * This is achieved by tracking pending requests and a 'barrier' concept - * that can be installed to exclude normal IO requests. - * - * Resync and recovery are handled very differently. - * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. - * - * For resync, we iterate over virtual addresses, read all copies, - * and update if there are differences. If only one copy is live, - * skip it. - * For recovery, we iterate over physical addresses, read a good - * value for each non-in_sync drive, and over-write. - * - * So, for recovery we may have several outstanding complex requests for a - * given address, one for each out-of-sync device. We model this by allocating - * a number of r10_bio structures, one for each out-of-sync device. - * As we setup these structures, we collect all bio's together into a list - * which we then process collectively to add pages, and then process again - * to pass to generic_make_request. - * - * The r10_bio structures are linked using a borrowed master_bio pointer. - * This link is counted in ->remaining. When the r10_bio that points to NULL - * has its remaining count decremented to 0, the whole complex operation - * is complete. - * - */ - -static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped, int go_faster) -{ - struct r10conf *conf = mddev->private; - struct r10bio *r10_bio; - struct bio *biolist = NULL, *bio; - sector_t max_sector, nr_sectors; - int i; - int max_sync; - sector_t sync_blocks; - sector_t sectors_skipped = 0; - int chunks_skipped = 0; - - if (!conf->r10buf_pool) - if (init_resync(conf)) - return 0; - - skipped: - max_sector = mddev->dev_sectors; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_sector = mddev->resync_max_sectors; - if (sector_nr >= max_sector) { - /* If we aborted, we need to abort the - * sync on the 'current' bitmap chucks (there can - * be several when recovering multiple devices). - * as we may have started syncing it but not finished. - * We can find the current address in - * mddev->curr_resync, but for recovery, - * we need to convert that to several - * virtual addresses. - */ - if (mddev->curr_resync < max_sector) { /* aborted */ - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); - else for (i=0; i<conf->raid_disks; i++) { - sector_t sect = - raid10_find_virt(conf, mddev->curr_resync, i); - bitmap_end_sync(mddev->bitmap, sect, - &sync_blocks, 1); - } - } else { - /* completed sync */ - if ((!mddev->bitmap || conf->fullsync) - && conf->have_replacement - && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* Completed a full sync so the replacements - * are now fully recovered. 
- */ - for (i = 0; i < conf->raid_disks; i++) - if (conf->mirrors[i].replacement) - conf->mirrors[i].replacement - ->recovery_offset - = MaxSector; - } - conf->fullsync = 0; - } - bitmap_close_sync(mddev->bitmap); - close_sync(conf); - *skipped = 1; - return sectors_skipped; - } - if (chunks_skipped >= conf->raid_disks) { - /* if there has been nothing to do on any drive, - * then there is nothing to do at all.. - */ - *skipped = 1; - return (max_sector - sector_nr) + sectors_skipped; - } - - if (max_sector > mddev->resync_max) - max_sector = mddev->resync_max; /* Don't do IO beyond here */ - - /* make sure whole request will fit in a chunk - if chunks - * are meaningful - */ - if (conf->near_copies < conf->raid_disks && - max_sector > (sector_nr | conf->chunk_mask)) - max_sector = (sector_nr | conf->chunk_mask) + 1; - /* - * If there is non-resync activity waiting for us then - * put in a delay to throttle resync. - */ - if (!go_faster && conf->nr_waiting) - msleep_interruptible(1000); - - /* Again, very different code for resync and recovery. - * Both must result in an r10bio with a list of bios that - * have bi_end_io, bi_sector, bi_bdev set, - * and bi_private set to the r10bio. - * For recovery, we may actually create several r10bios - * with 2 bios in each, that correspond to the bios in the main one. - * In this case, the subordinate r10bios link back through a - * borrowed master_bio pointer, and the counter in the master - * includes a ref from each subordinate. - */ - /* First, we decide what to do and set ->bi_end_io - * To end_sync_read if we want to read, and - * end_sync_write if we will want to write. - */ - - max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); - if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* recovery... the complicated one */ - int j; - r10_bio = NULL; - - for (i=0 ; i<conf->raid_disks; i++) { - int still_degraded; - struct r10bio *rb2; - sector_t sect; - int must_sync; - int any_working; - struct mirror_info *mirror = &conf->mirrors[i]; - - if ((mirror->rdev == NULL || - test_bit(In_sync, &mirror->rdev->flags)) - && - (mirror->replacement == NULL || - test_bit(Faulty, - &mirror->replacement->flags))) - continue; - - still_degraded = 0; - /* want to reconstruct this device */ - rb2 = r10_bio; - sect = raid10_find_virt(conf, sector_nr, i); - if (sect >= mddev->resync_max_sectors) { - /* last stripe is not complete - don't - * try to recover this sector. 
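/*
 * Editor's note: a worked example of the chunk clamp in sync_request() above:
 * OR-ing the current sector with chunk_mask and adding one gives the first
 * sector of the next chunk, so a single sync pass never crosses a chunk
 * boundary.  The numbers are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long sector_nr = 1000;
	unsigned long long chunk_mask = 127;	/* 64KiB chunks = 128 sectors */

	unsigned long long end = (sector_nr | chunk_mask) + 1;
	printf("sync %llu..%llu, next chunk starts at %llu\n",
	       sector_nr, end - 1, end);
	/* 1000 | 127 = 1023, so this pass covers 1000..1023; 1024 is chunk-aligned */
	return 0;
}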
- */ - continue; - } - /* Unless we are doing a full sync, or a replacement - * we only need to recover the block if it is set in - * the bitmap - */ - must_sync = bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, 1); - if (sync_blocks < max_sync) - max_sync = sync_blocks; - if (!must_sync && - mirror->replacement == NULL && - !conf->fullsync) { - /* yep, skip the sync_blocks here, but don't assume - * that there will never be anything to do here - */ - chunks_skipped = -1; - continue; - } - - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); - raise_barrier(conf, rb2 != NULL); - atomic_set(&r10_bio->remaining, 0); - - r10_bio->master_bio = (struct bio*)rb2; - if (rb2) - atomic_inc(&rb2->remaining); - r10_bio->mddev = mddev; - set_bit(R10BIO_IsRecover, &r10_bio->state); - r10_bio->sector = sect; - - raid10_find_phys(conf, r10_bio); - - /* Need to check if the array will still be - * degraded - */ - for (j=0; j<conf->raid_disks; j++) - if (conf->mirrors[j].rdev == NULL || - test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { - still_degraded = 1; - break; - } - - must_sync = bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, still_degraded); - - any_working = 0; - for (j=0; j<conf->copies;j++) { - int k; - int d = r10_bio->devs[j].devnum; - sector_t from_addr, to_addr; - struct md_rdev *rdev; - sector_t sector, first_bad; - int bad_sectors; - if (!conf->mirrors[d].rdev || - !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) - continue; - /* This is where we read from */ - any_working = 1; - rdev = conf->mirrors[d].rdev; - sector = r10_bio->devs[j].addr; - - if (is_badblock(rdev, sector, max_sync, - &first_bad, &bad_sectors)) { - if (first_bad > sector) - max_sync = first_bad - sector; - else { - bad_sectors -= (sector - - first_bad); - if (max_sync > bad_sectors) - max_sync = bad_sectors; - continue; - } - } - bio = r10_bio->devs[0].bio; - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_read; - bio->bi_rw = READ; - from_addr = r10_bio->devs[j].addr; - bio->bi_sector = from_addr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - atomic_inc(&rdev->nr_pending); - /* and we write to 'i' (if not in_sync) */ - - for (k=0; k<conf->copies; k++) - if (r10_bio->devs[k].devnum == i) - break; - BUG_ON(k == conf->copies); - to_addr = r10_bio->devs[k].addr; - r10_bio->devs[0].devnum = d; - r10_bio->devs[0].addr = from_addr; - r10_bio->devs[1].devnum = i; - r10_bio->devs[1].addr = to_addr; - - rdev = mirror->rdev; - if (!test_bit(In_sync, &rdev->flags)) { - bio = r10_bio->devs[1].bio; - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_write; - bio->bi_rw = WRITE; - bio->bi_sector = to_addr - + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - atomic_inc(&r10_bio->remaining); - } else - r10_bio->devs[1].bio->bi_end_io = NULL; - - /* and maybe write to replacement */ - bio = r10_bio->devs[1].repl_bio; - if (bio) - bio->bi_end_io = NULL; - rdev = mirror->replacement; - /* Note: if rdev != NULL, then bio - * cannot be NULL as r10buf_pool_alloc will - * have allocated it. - * So the second test here is pointless. - * But it keeps semantic-checkers happy, and - * this comment keeps human reviewers - * happy. 
- */ - if (rdev == NULL || bio == NULL || - test_bit(Faulty, &rdev->flags)) - break; - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_write; - bio->bi_rw = WRITE; - bio->bi_sector = to_addr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - atomic_inc(&r10_bio->remaining); - break; - } - if (j == conf->copies) { - /* Cannot recover, so abort the recovery or - * record a bad block */ - put_buf(r10_bio); - if (rb2) - atomic_dec(&rb2->remaining); - r10_bio = rb2; - if (any_working) { - /* problem is that there are bad blocks - * on other device(s) - */ - int k; - for (k = 0; k < conf->copies; k++) - if (r10_bio->devs[k].devnum == i) - break; - if (!test_bit(In_sync, - &mirror->rdev->flags) - && !rdev_set_badblocks( - mirror->rdev, - r10_bio->devs[k].addr, - max_sync, 0)) - any_working = 0; - if (mirror->replacement && - !rdev_set_badblocks( - mirror->replacement, - r10_bio->devs[k].addr, - max_sync, 0)) - any_working = 0; - } - if (!any_working) { - if (!test_and_set_bit(MD_RECOVERY_INTR, - &mddev->recovery)) - printk(KERN_INFO "md/raid10:%s: insufficient " - "working devices for recovery.\n", - mdname(mddev)); - mirror->recovery_disabled - = mddev->recovery_disabled; - } - break; - } - } - if (biolist == NULL) { - while (r10_bio) { - struct r10bio *rb2 = r10_bio; - r10_bio = (struct r10bio*) rb2->master_bio; - rb2->master_bio = NULL; - put_buf(rb2); - } - goto giveup; - } - } else { - /* resync. Schedule a read for every block at this virt offset */ - int count = 0; - - bitmap_cond_end_sync(mddev->bitmap, sector_nr); - - if (!bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, mddev->degraded) && - !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, - &mddev->recovery)) { - /* We can skip this block */ - *skipped = 1; - return sync_blocks + sectors_skipped; - } - if (sync_blocks < max_sync) - max_sync = sync_blocks; - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); - - r10_bio->mddev = mddev; - atomic_set(&r10_bio->remaining, 0); - raise_barrier(conf, 0); - conf->next_resync = sector_nr; - - r10_bio->master_bio = NULL; - r10_bio->sector = sector_nr; - set_bit(R10BIO_IsSync, &r10_bio->state); - raid10_find_phys(conf, r10_bio); - r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; - - for (i=0; i<conf->copies; i++) { - int d = r10_bio->devs[i].devnum; - sector_t first_bad, sector; - int bad_sectors; - - if (r10_bio->devs[i].repl_bio) - r10_bio->devs[i].repl_bio->bi_end_io = NULL; - - bio = r10_bio->devs[i].bio; - bio->bi_end_io = NULL; - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (conf->mirrors[d].rdev == NULL || - test_bit(Faulty, &conf->mirrors[d].rdev->flags)) - continue; - sector = r10_bio->devs[i].addr; - if (is_badblock(conf->mirrors[d].rdev, - sector, max_sync, - &first_bad, &bad_sectors)) { - if (first_bad > sector) - max_sync = first_bad - sector; - else { - bad_sectors -= (sector - first_bad); - if (max_sync > bad_sectors) - max_sync = max_sync; - continue; - } - } - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - atomic_inc(&r10_bio->remaining); - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_read; - bio->bi_rw = READ; - bio->bi_sector = sector + - conf->mirrors[d].rdev->data_offset; - bio->bi_bdev = conf->mirrors[d].rdev->bdev; - count++; - - if (conf->mirrors[d].replacement == NULL || - test_bit(Faulty, - &conf->mirrors[d].replacement->flags)) - continue; - - /* Need to set up for writing to the replacement */ - bio = r10_bio->devs[i].repl_bio; - 
clear_bit(BIO_UPTODATE, &bio->bi_flags); - - sector = r10_bio->devs[i].addr; - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_write; - bio->bi_rw = WRITE; - bio->bi_sector = sector + - conf->mirrors[d].replacement->data_offset; - bio->bi_bdev = conf->mirrors[d].replacement->bdev; - count++; - } - - if (count < 2) { - for (i=0; i<conf->copies; i++) { - int d = r10_bio->devs[i].devnum; - if (r10_bio->devs[i].bio->bi_end_io) - rdev_dec_pending(conf->mirrors[d].rdev, - mddev); - if (r10_bio->devs[i].repl_bio && - r10_bio->devs[i].repl_bio->bi_end_io) - rdev_dec_pending( - conf->mirrors[d].replacement, - mddev); - } - put_buf(r10_bio); - biolist = NULL; - goto giveup; - } - } - - for (bio = biolist; bio ; bio=bio->bi_next) { - - bio->bi_flags &= ~(BIO_POOL_MASK - 1); - if (bio->bi_end_io) - bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_size = 0; - } - - nr_sectors = 0; - if (sector_nr + max_sync < max_sector) - max_sector = sector_nr + max_sync; - do { - struct page *page; - int len = PAGE_SIZE; - if (sector_nr + (len>>9) > max_sector) - len = (max_sector - sector_nr) << 9; - if (len == 0) - break; - for (bio= biolist ; bio ; bio=bio->bi_next) { - struct bio *bio2; - page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0)) - continue; - - /* stop here */ - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - for (bio2 = biolist; - bio2 && bio2 != bio; - bio2 = bio2->bi_next) { - /* remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_size -= len; - bio2->bi_flags &= ~(1<< BIO_SEG_VALID); - } - goto bio_full; - } - nr_sectors += len>>9; - sector_nr += len>>9; - } while (biolist->bi_vcnt < RESYNC_PAGES); - bio_full: - r10_bio->sectors = nr_sectors; - - while (biolist) { - bio = biolist; - biolist = biolist->bi_next; - - bio->bi_next = NULL; - r10_bio = bio->bi_private; - r10_bio->sectors = nr_sectors; - - if (bio->bi_end_io == end_sync_read) { - md_sync_acct(bio->bi_bdev, nr_sectors); - generic_make_request(bio); - } - } - - if (sectors_skipped) - /* pretend they weren't skipped, it makes - * no important difference in this case - */ - md_done_sync(mddev, sectors_skipped, 1); - - return sectors_skipped + nr_sectors; - giveup: - /* There is nowhere to write, so all non-sync - * drives must be failed or in resync, all drives - * have a bad block, so try the next chunk... 
- */ - if (sector_nr + max_sync < max_sector) - max_sector = sector_nr + max_sync; - - sectors_skipped += (max_sector - sector_nr); - chunks_skipped ++; - sector_nr = max_sector; - goto skipped; -} - -static sector_t -raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - sector_t size; - struct r10conf *conf = mddev->private; - - if (!raid_disks) - raid_disks = conf->raid_disks; - if (!sectors) - sectors = conf->dev_sectors; - - size = sectors >> conf->chunk_shift; - sector_div(size, conf->far_copies); - size = size * raid_disks; - sector_div(size, conf->near_copies); - - return size << conf->chunk_shift; -} - -static void calc_sectors(struct r10conf *conf, sector_t size) -{ - /* Calculate the number of sectors-per-device that will - * actually be used, and set conf->dev_sectors and - * conf->stride - */ - - size = size >> conf->chunk_shift; - sector_div(size, conf->far_copies); - size = size * conf->raid_disks; - sector_div(size, conf->near_copies); - /* 'size' is now the number of chunks in the array */ - /* calculate "used chunks per device" */ - size = size * conf->copies; - - /* We need to round up when dividing by raid_disks to - * get the stride size. - */ - size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); - - conf->dev_sectors = size << conf->chunk_shift; - - if (conf->far_offset) - conf->stride = 1 << conf->chunk_shift; - else { - sector_div(size, conf->far_copies); - conf->stride = size << conf->chunk_shift; - } -} - -static struct r10conf *setup_conf(struct mddev *mddev) -{ - struct r10conf *conf = NULL; - int nc, fc, fo; - int err = -EINVAL; - - if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || - !is_power_of_2(mddev->new_chunk_sectors)) { - printk(KERN_ERR "md/raid10:%s: chunk size must be " - "at least PAGE_SIZE(%ld) and be a power of 2.\n", - mdname(mddev), PAGE_SIZE); - goto out; - } - - nc = mddev->new_layout & 255; - fc = (mddev->new_layout >> 8) & 255; - fo = mddev->new_layout & (1<<16); - - if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || - (mddev->new_layout >> 17)) { - printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", - mdname(mddev), mddev->new_layout); - goto out; - } - - err = -ENOMEM; - conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); - if (!conf) - goto out; - - conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, - GFP_KERNEL); - if (!conf->mirrors) - goto out; - - conf->tmppage = alloc_page(GFP_KERNEL); - if (!conf->tmppage) - goto out; - - - conf->raid_disks = mddev->raid_disks; - conf->near_copies = nc; - conf->far_copies = fc; - conf->copies = nc*fc; - conf->far_offset = fo; - conf->chunk_mask = mddev->new_chunk_sectors - 1; - conf->chunk_shift = ffz(~mddev->new_chunk_sectors); - - conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, - r10bio_pool_free, conf); - if (!conf->r10bio_pool) - goto out; - - calc_sectors(conf, mddev->dev_sectors); - - spin_lock_init(&conf->device_lock); - INIT_LIST_HEAD(&conf->retry_list); - - spin_lock_init(&conf->resync_lock); - init_waitqueue_head(&conf->wait_barrier); - - conf->thread = md_register_thread(raid10d, mddev, NULL); - if (!conf->thread) - goto out; - - conf->mddev = mddev; - return conf; - - out: - printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", - mdname(mddev)); - if (conf) { - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); - kfree(conf->mirrors); - safe_put_page(conf->tmppage); - kfree(conf); - } - return ERR_PTR(err); -} - -static int run(struct mddev *mddev) -{ - struct r10conf *conf; - int i, disk_idx, 
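/*
 * Editor's note: a worked example of the layout-word decoding in setup_conf()
 * above: bits 0-7 hold near_copies, bits 8-15 far_copies, bit 16 the far-offset
 * flag.  0x102 is the "near 2" layout that the raid0 takeover path further down
 * also installs.
 */
#include <stdio.h>

int main(void)
{
	int layout = 0x102;			/* (1 << 8) + 2 */
	int nc = layout & 255;			/* near copies  = 2 */
	int fc = (layout >> 8) & 255;		/* far copies   = 1 */
	int fo = layout & (1 << 16);		/* far offset   = 0 */

	printf("near=%d far=%d offset=%d copies=%d\n", nc, fc, !!fo, nc * fc);
	return 0;
}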
chunk_size; - struct mirror_info *disk; - struct md_rdev *rdev; - sector_t size; - - /* - * copy the already verified devices into our private RAID10 - * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] - */ - - if (mddev->private == NULL) { - conf = setup_conf(mddev); - if (IS_ERR(conf)) - return PTR_ERR(conf); - mddev->private = conf; - } - conf = mddev->private; - if (!conf) - goto out; - - mddev->thread = conf->thread; - conf->thread = NULL; - - chunk_size = mddev->chunk_sectors << 9; - blk_queue_io_min(mddev->queue, chunk_size); - if (conf->raid_disks % conf->near_copies) - blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); - else - blk_queue_io_opt(mddev->queue, chunk_size * - (conf->raid_disks / conf->near_copies)); - - rdev_for_each(rdev, mddev) { - struct request_queue *q; - disk_idx = rdev->raid_disk; - if (disk_idx >= conf->raid_disks - || disk_idx < 0) - continue; - disk = conf->mirrors + disk_idx; - - if (test_bit(Replacement, &rdev->flags)) { - if (disk->replacement) - goto out_free_conf; - disk->replacement = rdev; - } else { - if (disk->rdev) - goto out_free_conf; - disk->rdev = rdev; - } - q = bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) - mddev->merge_check_needed = 1; - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - disk->head_position = 0; - } - /* need to check that every block has at least one working mirror */ - if (!enough(conf, -1)) { - printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", - mdname(mddev)); - goto out_free_conf; - } - - mddev->degraded = 0; - for (i = 0; i < conf->raid_disks; i++) { - - disk = conf->mirrors + i; - - if (!disk->rdev && disk->replacement) { - /* The replacement is all we have - use it */ - disk->rdev = disk->replacement; - disk->replacement = NULL; - clear_bit(Replacement, &disk->rdev->flags); - } - - if (!disk->rdev || - !test_bit(In_sync, &disk->rdev->flags)) { - disk->head_position = 0; - mddev->degraded++; - if (disk->rdev) - conf->fullsync = 1; - } - disk->recovery_disabled = mddev->recovery_disabled - 1; - } - - if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "md/raid10:%s: not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - printk(KERN_INFO - "md/raid10:%s: active with %d out of %d devices\n", - mdname(mddev), conf->raid_disks - mddev->degraded, - conf->raid_disks); - /* - * Ok, everything is just fine now - */ - mddev->dev_sectors = conf->dev_sectors; - size = raid10_size(mddev, 0, 0); - md_set_array_sectors(mddev, size); - mddev->resync_max_sectors = size; - - mddev->queue->backing_dev_info.congested_fn = raid10_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - - /* Calculate max read-ahead size. - * We need to readahead at least twice a whole stripe.... - * maybe... 
- */ - { - int stripe = conf->raid_disks * - ((mddev->chunk_sectors << 9) / PAGE_SIZE); - stripe /= conf->near_copies; - if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) - mddev->queue->backing_dev_info.ra_pages = 2* stripe; - } - - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); - - if (md_integrity_register(mddev)) - goto out_free_conf; - - return 0; - -out_free_conf: - md_unregister_thread(&mddev->thread); - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); - safe_put_page(conf->tmppage); - kfree(conf->mirrors); - kfree(conf); - mddev->private = NULL; -out: - return -EIO; -} - -static int stop(struct mddev *mddev) -{ - struct r10conf *conf = mddev->private; - - raise_barrier(conf, 0); - lower_barrier(conf); - - md_unregister_thread(&mddev->thread); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - if (conf->r10bio_pool) - mempool_destroy(conf->r10bio_pool); - kfree(conf->mirrors); - kfree(conf); - mddev->private = NULL; - return 0; -} - -static void raid10_quiesce(struct mddev *mddev, int state) -{ - struct r10conf *conf = mddev->private; - - switch(state) { - case 1: - raise_barrier(conf, 0); - break; - case 0: - lower_barrier(conf); - break; - } -} - -static int raid10_resize(struct mddev *mddev, sector_t sectors) -{ - /* Resize of 'far' arrays is not supported. - * For 'near' and 'offset' arrays we can set the - * number of sectors used to be an appropriate multiple - * of the chunk size. - * For 'offset', this is far_copies*chunksize. - * For 'near' the multiplier is the LCM of - * near_copies and raid_disks. - * So if far_copies > 1 && !far_offset, fail. - * Else find LCM(raid_disks, near_copy)*far_copies and - * multiply by chunk_size. Then round to this number. - * This is mostly done by raid10_size() - */ - struct r10conf *conf = mddev->private; - sector_t oldsize, size; - - if (conf->far_copies > 1 && !conf->far_offset) - return -EINVAL; - - oldsize = raid10_size(mddev, 0, 0); - size = raid10_size(mddev, sectors, 0); - md_set_array_sectors(mddev, size); - if (mddev->array_sectors > size) - return -EINVAL; - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - if (sectors > mddev->dev_sectors && - mddev->recovery_cp > oldsize) { - mddev->recovery_cp = oldsize; - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - } - calc_sectors(conf, sectors); - mddev->dev_sectors = conf->dev_sectors; - mddev->resync_max_sectors = size; - return 0; -} - -static void *raid10_takeover_raid0(struct mddev *mddev) -{ - struct md_rdev *rdev; - struct r10conf *conf; - - if (mddev->degraded > 0) { - printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", - mdname(mddev)); - return ERR_PTR(-EINVAL); - } - - /* Set new parameters */ - mddev->new_level = 10; - /* new layout: far_copies = 1, near_copies = 2 */ - mddev->new_layout = (1<<8) + 2; - mddev->new_chunk_sectors = mddev->chunk_sectors; - mddev->delta_disks = mddev->raid_disks; - mddev->raid_disks *= 2; - /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; - - conf = setup_conf(mddev); - if (!IS_ERR(conf)) { - rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0) - rdev->new_raid_disk = rdev->raid_disk * 2; - conf->barrier = 1; - } - - return conf; -} - -static void *raid10_takeover(struct mddev *mddev) -{ - struct r0conf *raid0_conf; - - /* raid10 can take over: - * raid0 - providing it has only two drives - */ - if (mddev->level == 0) { - /* for raid0 takeover only one zone is supported */ - raid0_conf = mddev->private; - if 
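/*
 * Editor's note: a worked example of the read-ahead sizing in run() above: the
 * target is two full stripes' worth of pages, where a stripe spans
 * raid_disks/near_copies distinct chunks of data.  Example values assume a
 * 4-disk near-2 array with 512KiB chunks and 4KiB pages.
 */
#include <stdio.h>

int main(void)
{
	int raid_disks = 4, near_copies = 2;
	int chunk_sectors = 1024;			/* 512 KiB chunks */
	long page_size = 4096;

	long stripe = raid_disks * ((long)chunk_sectors * 512 / page_size);
	stripe /= near_copies;				/* unique data per stripe */
	printf("read-ahead target: %ld pages (%ld KiB)\n",
	       2 * stripe, 2 * stripe * page_size / 1024);
	/* 4 disks * 128 pages / 2 copies = 256 pages per stripe -> 512 pages (2048 KiB) */
	return 0;
}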
(raid0_conf->nr_strip_zones > 1) { - printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" - " with more than one zone.\n", - mdname(mddev)); - return ERR_PTR(-EINVAL); - } - return raid10_takeover_raid0(mddev); - } - return ERR_PTR(-EINVAL); -} - -static struct md_personality raid10_personality = -{ - .name = "raid10", - .level = 10, - .owner = THIS_MODULE, - .make_request = make_request, - .run = run, - .stop = stop, - .status = status, - .error_handler = error, - .hot_add_disk = raid10_add_disk, - .hot_remove_disk= raid10_remove_disk, - .spare_active = raid10_spare_active, - .sync_request = sync_request, - .quiesce = raid10_quiesce, - .size = raid10_size, - .resize = raid10_resize, - .takeover = raid10_takeover, -}; - -static int __init raid_init(void) -{ - return register_md_personality(&raid10_personality); -} - -static void raid_exit(void) -{ - unregister_md_personality(&raid10_personality); -} - -module_init(raid_init); -module_exit(raid_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); -MODULE_ALIAS("md-personality-9"); /* RAID10 */ -MODULE_ALIAS("md-raid10"); -MODULE_ALIAS("md-level-10"); - -module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/ANDROID_3.4.5/drivers/md/raid10.h b/ANDROID_3.4.5/drivers/md/raid10.h deleted file mode 100644 index 7c615613..00000000 --- a/ANDROID_3.4.5/drivers/md/raid10.h +++ /dev/null @@ -1,150 +0,0 @@ -#ifndef _RAID10_H -#define _RAID10_H - -struct mirror_info { - struct md_rdev *rdev, *replacement; - sector_t head_position; - int recovery_disabled; /* matches - * mddev->recovery_disabled - * when we shouldn't try - * recovering this device. - */ -}; - -struct r10conf { - struct mddev *mddev; - struct mirror_info *mirrors; - int raid_disks; - spinlock_t device_lock; - - /* geometry */ - int near_copies; /* number of copies laid out - * raid0 style */ - int far_copies; /* number of copies laid out - * at large strides across drives - */ - int far_offset; /* far_copies are offset by 1 - * stripe instead of many - */ - int copies; /* near_copies * far_copies. - * must be <= raid_disks - */ - sector_t stride; /* distance between far copies. - * This is size / far_copies unless - * far_offset, in which case it is - * 1 stripe. - */ - - sector_t dev_sectors; /* temp copy of - * mddev->dev_sectors */ - - int chunk_shift; /* shift from chunks to sectors */ - sector_t chunk_mask; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - int pending_count; - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - int have_replacement; /* There is at least one - * replacement device. - */ - wait_queue_head_t wait_barrier; - - mempool_t *r10bio_pool; - mempool_t *r10buf_pool; - struct page *tmppage; - - /* When taking over an array from a different personality, we store - * the new thread here until we fully activate the array. - */ - struct md_thread *thread; -}; - -/* - * this is our 'private' RAID10 bio. 
- * - * it contains information about what kind of IO operations were started - * for this RAID10 operation, and about their status: - */ - -struct r10bio { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - sector_t sector; /* virtual sector number */ - int sectors; - unsigned long state; - struct mddev *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_slot; - - struct list_head retry_list; - /* - * if the IO is in WRITE direction, then multiple bios are used, - * one for each copy. - * When resyncing we also use one for each copy. - * When reconstructing, we use 2 bios, one for read, one for write. - * We choose the number when they are allocated. - * We sometimes need an extra bio to write to the replacement. - */ - struct { - struct bio *bio; - union { - struct bio *repl_bio; /* used for resync and - * writes */ - struct md_rdev *rdev; /* used for reads - * (read_slot >= 0) */ - }; - sector_t addr; - int devnum; - } devs[0]; -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - -/* bits for r10bio.state */ -enum r10bio_state { - R10BIO_Uptodate, - R10BIO_IsSync, - R10BIO_IsRecover, - R10BIO_Degraded, -/* Set ReadError on bios that experience a read error - * so that raid10d knows what to do with them. - */ - R10BIO_ReadError, -/* If a write for this request means we can clear some - * known-bad-block records, we set this flag. - */ - R10BIO_MadeGood, - R10BIO_WriteError, -}; -#endif diff --git a/ANDROID_3.4.5/drivers/md/raid5.c b/ANDROID_3.4.5/drivers/md/raid5.c deleted file mode 100644 index 73a58007..00000000 --- a/ANDROID_3.4.5/drivers/md/raid5.c +++ /dev/null @@ -1,6050 +0,0 @@ -/* - * raid5.c : Multiple Devices driver for Linux - * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * Copyright (C) 1999, 2000 Ingo Molnar - * Copyright (C) 2002, 2003 H. Peter Anvin - * - * RAID-4/5/6 management functions. - * Thanks to Penguin Computing for making the RAID-6 development possible - * by donating a test server! - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* - * BITMAP UNPLUGGING: - * - * The sequencing for updating the bitmap reliably is a little - * subtle (and I got it wrong the first time) so it deserves some - * explanation. - * - * We group bitmap updates into batches. Each batch has a number. - * We may write out several batches at once, but that isn't very important. 
- * conf->seq_write is the number of the last batch successfully written. - * conf->seq_flush is the number of the last batch that was closed to - * new additions. - * When we discover that we will need to write to any block in a stripe - * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq - * the number of the batch it will be in. This is seq_flush+1. - * When we are ready to do a write, if that batch hasn't been written yet, - * we plug the array and queue the stripe for later. - * When an unplug happens, we increment bm_flush, thus closing the current - * batch. - * When we notice that bm_flush > bm_write, we write out all pending updates - * to the bitmap, and advance bm_write to where bm_flush was. - * This may occasionally write a bit out twice, but is sure never to - * miss any bits. - */ - -#include <linux/blkdev.h> -#include <linux/kthread.h> -#include <linux/raid/pq.h> -#include <linux/async_tx.h> -#include <linux/module.h> -#include <linux/async.h> -#include <linux/seq_file.h> -#include <linux/cpu.h> -#include <linux/slab.h> -#include <linux/ratelimit.h> -#include "md.h" -#include "raid5.h" -#include "raid0.h" -#include "bitmap.h" - -/* - * Stripe cache - */ - -#define NR_STRIPES 256 -#define STRIPE_SIZE PAGE_SIZE -#define STRIPE_SHIFT (PAGE_SHIFT - 9) -#define STRIPE_SECTORS (STRIPE_SIZE>>9) -#define IO_THRESHOLD 1 -#define BYPASS_THRESHOLD 1 -#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) -#define HASH_MASK (NR_HASH - 1) - -static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) -{ - int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; - return &conf->stripe_hashtbl[hash]; -} - -/* bio's attached to a stripe+device for I/O are linked together in bi_sector - * order without overlap. There may be several bio's per stripe+device, and - * a bio could span several devices. - * When walking this list for a particular stripe+device, we must never proceed - * beyond a bio that extends past this device, as the next bio might no longer - * be valid. 
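For a concrete feel of the stripe-cache hashing above: assuming 4 KiB pages and 64-bit pointers (the real values come from PAGE_SIZE and sizeof(struct hlist_head)), STRIPE_SHIFT is 3, STRIPE_SECTORS is 8, NR_HASH is 512 and HASH_MASK is 511, so stripe_hash() selects bits 3..11 of the sector number and all sectors of the same stripe land in the same bucket. A minimal user-space sketch of that arithmetic, with the assumed constants spelled out:

#include <stdio.h>
#include <stdint.h>

/* Assumed values for a 64-bit machine with 4 KiB pages (not taken from
 * kernel headers): PAGE_SIZE 4096, sizeof(struct hlist_head) 8. */
#define STRIPE_SHIFT    3                 /* PAGE_SHIFT - 9 */
#define NR_HASH         512               /* PAGE_SIZE / sizeof(struct hlist_head) */
#define HASH_MASK       (NR_HASH - 1)

static unsigned int stripe_hash_index(uint64_t sect)
{
        return (sect >> STRIPE_SHIFT) & HASH_MASK;   /* same arithmetic as stripe_hash() */
}

int main(void)
{
        /* sectors 123456 and 123463 share a stripe, 123464 starts the next one */
        printf("%u %u %u\n", stripe_hash_index(123456),
               stripe_hash_index(123463), stripe_hash_index(123464));
        return 0;
}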
- * This function is used to determine the 'next' bio in the list, given the sector - * of the current stripe+device - */ -static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) -{ - int sectors = bio->bi_size >> 9; - if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) - return bio->bi_next; - else - return NULL; -} - -/* - * We maintain a biased count of active stripes in the bottom 16 bits of - * bi_phys_segments, and a count of processed stripes in the upper 16 bits - */ -static inline int raid5_bi_phys_segments(struct bio *bio) -{ - return bio->bi_phys_segments & 0xffff; -} - -static inline int raid5_bi_hw_segments(struct bio *bio) -{ - return (bio->bi_phys_segments >> 16) & 0xffff; -} - -static inline int raid5_dec_bi_phys_segments(struct bio *bio) -{ - --bio->bi_phys_segments; - return raid5_bi_phys_segments(bio); -} - -static inline int raid5_dec_bi_hw_segments(struct bio *bio) -{ - unsigned short val = raid5_bi_hw_segments(bio); - - --val; - bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); - return val; -} - -static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) -{ - bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); -} - -/* Find first data disk in a raid6 stripe */ -static inline int raid6_d0(struct stripe_head *sh) -{ - if (sh->ddf_layout) - /* ddf always starts from the first device */ - return 0; - /* md starts just after Q block */ - if (sh->qd_idx == sh->disks - 1) - return 0; - else - return sh->qd_idx + 1; -} -static inline int raid6_next_disk(int disk, int raid_disks) -{ - disk++; - return (disk < raid_disks) ? disk : 0; -} - -/* When walking through the disks in a raid5, starting at raid6_d0, - * we need to map each disk to a 'slot', where the data disks are slot - * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk - * is raid_disks-1. This helper does that mapping. 
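To make the mapping just described concrete, the stand-alone sketch below replays it for a hypothetical 5-device stripe with pd_idx == 1 and qd_idx == 2 (non-DDF layout only, plain ints instead of struct stripe_head; purely illustrative): data devices fill slots 0..syndrome_disks-1 in the order the walk visits them, P always lands in slot syndrome_disks and Q in slot syndrome_disks+1.

#include <stdio.h>

/* Illustrative restatement of raid6_d0()/raid6_next_disk()/raid6_idx_to_slot()
 * for the non-DDF (md native) case. */
static int idx_to_slot(int idx, int pd_idx, int qd_idx, int *count, int syndrome_disks)
{
        int slot = *count;

        if (idx == pd_idx)
                return syndrome_disks;          /* P -> second-last slot */
        if (idx == qd_idx)
                return syndrome_disks + 1;      /* Q -> last slot */
        (*count)++;
        return slot;                            /* data -> slots 0..syndrome_disks-1 */
}

int main(void)
{
        int disks = 5, pd_idx = 1, qd_idx = 2;                  /* hypothetical stripe */
        int syndrome_disks = disks - 2;
        int d0 = (qd_idx == disks - 1) ? 0 : qd_idx + 1;        /* as in raid6_d0() */
        int count = 0, i = d0;

        do {
                printf("device %d -> slot %d\n",
                       i, idx_to_slot(i, pd_idx, qd_idx, &count, syndrome_disks));
                i = (i + 1 < disks) ? i + 1 : 0;                /* as in raid6_next_disk() */
        } while (i != d0);
        return 0;
}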
- */ -static int raid6_idx_to_slot(int idx, struct stripe_head *sh, - int *count, int syndrome_disks) -{ - int slot = *count; - - if (sh->ddf_layout) - (*count)++; - if (idx == sh->pd_idx) - return syndrome_disks; - if (idx == sh->qd_idx) - return syndrome_disks + 1; - if (!sh->ddf_layout) - (*count)++; - return slot; -} - -static void return_io(struct bio *return_bi) -{ - struct bio *bi = return_bi; - while (bi) { - - return_bi = bi->bi_next; - bi->bi_next = NULL; - bi->bi_size = 0; - bio_endio(bi, 0); - bi = return_bi; - } -} - -static void print_raid5_conf (struct r5conf *conf); - -static int stripe_operations_active(struct stripe_head *sh) -{ - return sh->check_state || sh->reconstruct_state || - test_bit(STRIPE_BIOFILL_RUN, &sh->state) || - test_bit(STRIPE_COMPUTE_RUN, &sh->state); -} - -static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) -{ - if (atomic_dec_and_test(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); - BUG_ON(atomic_read(&conf->active_stripes)==0); - if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - list_add_tail(&sh->lru, &conf->delayed_list); - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && - sh->bm_seq - conf->seq_write > 0) - list_add_tail(&sh->lru, &conf->bitmap_list); - else { - clear_bit(STRIPE_DELAYED, &sh->state); - clear_bit(STRIPE_BIT_DELAY, &sh->state); - list_add_tail(&sh->lru, &conf->handle_list); - } - md_wakeup_thread(conf->mddev->thread); - } else { - BUG_ON(stripe_operations_active(sh)); - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - if (atomic_dec_return(&conf->preread_active_stripes) - < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { - list_add_tail(&sh->lru, &conf->inactive_list); - wake_up(&conf->wait_for_stripe); - if (conf->retry_read_aligned) - md_wakeup_thread(conf->mddev->thread); - } - } - } -} - -static void release_stripe(struct stripe_head *sh) -{ - struct r5conf *conf = sh->raid_conf; - unsigned long flags; - - spin_lock_irqsave(&conf->device_lock, flags); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); -} - -static inline void remove_hash(struct stripe_head *sh) -{ - pr_debug("remove_hash(), stripe %llu\n", - (unsigned long long)sh->sector); - - hlist_del_init(&sh->hash); -} - -static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) -{ - struct hlist_head *hp = stripe_hash(conf, sh->sector); - - pr_debug("insert_hash(), stripe %llu\n", - (unsigned long long)sh->sector); - - hlist_add_head(&sh->hash, hp); -} - - -/* find an idle stripe, make sure it is unhashed, and return it. 
*/ -static struct stripe_head *get_free_stripe(struct r5conf *conf) -{ - struct stripe_head *sh = NULL; - struct list_head *first; - - if (list_empty(&conf->inactive_list)) - goto out; - first = conf->inactive_list.next; - sh = list_entry(first, struct stripe_head, lru); - list_del_init(first); - remove_hash(sh); - atomic_inc(&conf->active_stripes); -out: - return sh; -} - -static void shrink_buffers(struct stripe_head *sh) -{ - struct page *p; - int i; - int num = sh->raid_conf->pool_size; - - for (i = 0; i < num ; i++) { - p = sh->dev[i].page; - if (!p) - continue; - sh->dev[i].page = NULL; - put_page(p); - } -} - -static int grow_buffers(struct stripe_head *sh) -{ - int i; - int num = sh->raid_conf->pool_size; - - for (i = 0; i < num; i++) { - struct page *page; - - if (!(page = alloc_page(GFP_KERNEL))) { - return 1; - } - sh->dev[i].page = page; - } - return 0; -} - -static void raid5_build_block(struct stripe_head *sh, int i, int previous); -static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, - struct stripe_head *sh); - -static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) -{ - struct r5conf *conf = sh->raid_conf; - int i; - - BUG_ON(atomic_read(&sh->count) != 0); - BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - BUG_ON(stripe_operations_active(sh)); - - pr_debug("init_stripe called, stripe %llu\n", - (unsigned long long)sh->sector); - - remove_hash(sh); - - sh->generation = conf->generation - previous; - sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; - sh->sector = sector; - stripe_set_idx(sector, conf, previous, sh); - sh->state = 0; - - - for (i = sh->disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - - if (dev->toread || dev->read || dev->towrite || dev->written || - test_bit(R5_LOCKED, &dev->flags)) { - printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", - (unsigned long long)sh->sector, i, dev->toread, - dev->read, dev->towrite, dev->written, - test_bit(R5_LOCKED, &dev->flags)); - WARN_ON(1); - } - dev->flags = 0; - raid5_build_block(sh, i, previous); - } - insert_hash(conf, sh); -} - -static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, - short generation) -{ - struct stripe_head *sh; - struct hlist_node *hn; - - pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); - hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector && sh->generation == generation) - return sh; - pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); - return NULL; -} - -/* - * Need to check if array has failed when deciding whether to: - * - start an array - * - remove non-faulty devices - * - add a spare - * - allow a reshape - * This determination is simple when no reshape is happening. - * However if there is a reshape, we need to carefully check - * both the before and after sections. - * This is because some failed devices may only affect one - * of the two sections, and some non-in_sync devices may - * be insync in the section most affected by failed devices. - */ -static int calc_degraded(struct r5conf *conf) -{ - int degraded, degraded2; - int i; - - rcu_read_lock(); - degraded = 0; - for (i = 0; i < conf->previous_raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); - if (!rdev || test_bit(Faulty, &rdev->flags)) - degraded++; - else if (test_bit(In_sync, &rdev->flags)) - ; - else - /* not in-sync or faulty. 
- * If the reshape increases the number of devices, - * this is being recovered by the reshape, so - * this 'previous' section is not in_sync. - * If the number of devices is being reduced however, - * the device can only be part of the array if - * we are reverting a reshape, so this section will - * be in-sync. - */ - if (conf->raid_disks >= conf->previous_raid_disks) - degraded++; - } - rcu_read_unlock(); - if (conf->raid_disks == conf->previous_raid_disks) - return degraded; - rcu_read_lock(); - degraded2 = 0; - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); - if (!rdev || test_bit(Faulty, &rdev->flags)) - degraded2++; - else if (test_bit(In_sync, &rdev->flags)) - ; - else - /* not in-sync or faulty. - * If reshape increases the number of devices, this - * section has already been recovered, else it - * almost certainly hasn't. - */ - if (conf->raid_disks <= conf->previous_raid_disks) - degraded2++; - } - rcu_read_unlock(); - if (degraded2 > degraded) - return degraded2; - return degraded; -} - -static int has_failed(struct r5conf *conf) -{ - int degraded; - - if (conf->mddev->reshape_position == MaxSector) - return conf->mddev->degraded > conf->max_degraded; - - degraded = calc_degraded(conf); - if (degraded > conf->max_degraded) - return 1; - return 0; -} - -static struct stripe_head * -get_active_stripe(struct r5conf *conf, sector_t sector, - int previous, int noblock, int noquiesce) -{ - struct stripe_head *sh; - - pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); - - spin_lock_irq(&conf->device_lock); - - do { - wait_event_lock_irq(conf->wait_for_stripe, - conf->quiesce == 0 || noquiesce, - conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector, conf->generation - previous); - if (!sh) { - if (!conf->inactive_blocked) - sh = get_free_stripe(conf); - if (noblock && sh == NULL) - break; - if (!sh) { - conf->inactive_blocked = 1; - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes *3/4) - || !conf->inactive_blocked), - conf->device_lock, - ); - conf->inactive_blocked = 0; - } else - init_stripe(sh, sector, previous); - } else { - if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru) - && !test_bit(STRIPE_EXPANDING, &sh->state)); - } else { - if (!test_bit(STRIPE_HANDLE, &sh->state)) - atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)) - BUG(); - list_del_init(&sh->lru); - } - } - } while (sh == NULL); - - if (sh) - atomic_inc(&sh->count); - - spin_unlock_irq(&conf->device_lock); - return sh; -} - -static void -raid5_end_read_request(struct bio *bi, int error); -static void -raid5_end_write_request(struct bio *bi, int error); - -static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) -{ - struct r5conf *conf = sh->raid_conf; - int i, disks = sh->disks; - - might_sleep(); - - for (i = disks; i--; ) { - int rw; - int replace_only = 0; - struct bio *bi, *rbi; - struct md_rdev *rdev, *rrdev = NULL; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { - if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) - rw = WRITE_FUA; - else - rw = WRITE; - } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else if (test_and_clear_bit(R5_WantReplace, - &sh->dev[i].flags)) { - rw = WRITE; - replace_only = 1; - } else - continue; - - bi = &sh->dev[i].req; - rbi = &sh->dev[i].rreq; /* For writing to 
replacement */ - - bi->bi_rw = rw; - rbi->bi_rw = rw; - if (rw & WRITE) { - bi->bi_end_io = raid5_end_write_request; - rbi->bi_end_io = raid5_end_write_request; - } else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rrdev = rcu_dereference(conf->disks[i].replacement); - smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ - rdev = rcu_dereference(conf->disks[i].rdev); - if (!rdev) { - rdev = rrdev; - rrdev = NULL; - } - if (rw & WRITE) { - if (replace_only) - rdev = NULL; - if (rdev == rrdev) - /* We raced and saw duplicates */ - rrdev = NULL; - } else { - if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) - rdev = rrdev; - rrdev = NULL; - } - - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - if (rrdev && test_bit(Faulty, &rrdev->flags)) - rrdev = NULL; - if (rrdev) - atomic_inc(&rrdev->nr_pending); - rcu_read_unlock(); - - /* We have already checked bad blocks for reads. Now - * need to check for writes. We never accept write errors - * on the replacement, so we don't to check rrdev. - */ - while ((rw & WRITE) && rdev && - test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - int bad_sectors; - int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors); - if (!bad) - break; - - if (bad < 0) { - set_bit(BlockedBadBlocks, &rdev->flags); - if (!conf->mddev->external && - conf->mddev->flags) { - /* It is very unlikely, but we might - * still need to write out the - * bad block log - better give it - * a chance*/ - md_check_recovery(conf->mddev); - } - /* - * Because md_wait_for_blocked_rdev - * will dec nr_pending, we must - * increment it first. - */ - atomic_inc(&rdev->nr_pending); - md_wait_for_blocked_rdev(rdev, conf->mddev); - } else { - /* Acknowledged bad block - skip the write */ - rdev_dec_pending(rdev, conf->mddev); - rdev = NULL; - } - } - - if (rdev) { - if (s->syncing || s->expanding || s->expanded - || s->replacing) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - set_bit(STRIPE_IO_STARTED, &sh->state); - - bi->bi_bdev = rdev->bdev; - pr_debug("%s: for %llu schedule op %ld on disc %d\n", - __func__, (unsigned long long)sh->sector, - bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_idx = 0; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rrdev) - set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); - generic_make_request(bi); - } - if (rrdev) { - if (s->syncing || s->expanding || s->expanded - || s->replacing) - md_sync_acct(rrdev->bdev, STRIPE_SECTORS); - - set_bit(STRIPE_IO_STARTED, &sh->state); - - rbi->bi_bdev = rrdev->bdev; - pr_debug("%s: for %llu schedule op %ld on " - "replacement disc %d\n", - __func__, (unsigned long long)sh->sector, - rbi->bi_rw, i); - atomic_inc(&sh->count); - rbi->bi_sector = sh->sector + rrdev->data_offset; - rbi->bi_flags = 1 << BIO_UPTODATE; - rbi->bi_idx = 0; - rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; - rbi->bi_io_vec[0].bv_offset = 0; - rbi->bi_size = STRIPE_SIZE; - rbi->bi_next = NULL; - generic_make_request(rbi); - } - if (!rdev && !rrdev) { - if (rw & WRITE) - set_bit(STRIPE_DEGRADED, &sh->state); - pr_debug("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } - } -} - -static struct dma_async_tx_descriptor * -async_copy_data(int frombio, 
struct bio *bio, struct page *page, - sector_t sector, struct dma_async_tx_descriptor *tx) -{ - struct bio_vec *bvl; - struct page *bio_page; - int i; - int page_offset; - struct async_submit_ctl submit; - enum async_tx_flags flags = 0; - - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; - else - page_offset = (signed)(sector - bio->bi_sector) * -512; - - if (frombio) - flags |= ASYNC_TX_FENCE; - init_async_submit(&submit, flags, tx, NULL, NULL, NULL); - - bio_for_each_segment(bvl, bio, i) { - int len = bvl->bv_len; - int clen; - int b_offset = 0; - - if (page_offset < 0) { - b_offset = -page_offset; - page_offset += b_offset; - len -= b_offset; - } - - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; - else - clen = len; - - if (clen > 0) { - b_offset += bvl->bv_offset; - bio_page = bvl->bv_page; - if (frombio) - tx = async_memcpy(page, bio_page, page_offset, - b_offset, clen, &submit); - else - tx = async_memcpy(bio_page, page, b_offset, - page_offset, clen, &submit); - } - /* chain the operations */ - submit.depend_tx = tx; - - if (clen < len) /* hit end of page */ - break; - page_offset += len; - } - - return tx; -} - -static void ops_complete_biofill(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; - struct bio *return_bi = NULL; - struct r5conf *conf = sh->raid_conf; - int i; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - /* clear completed biofills */ - spin_lock_irq(&conf->device_lock); - for (i = sh->disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - - /* acknowledge completion of a biofill operation */ - /* and check if we need to reply to a read request, - * new R5_Wantfill requests are held off until - * !STRIPE_BIOFILL_RUN - */ - if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { - struct bio *rbi, *rbi2; - - BUG_ON(!dev->read); - rbi = dev->read; - dev->read = NULL; - while (rbi && rbi->bi_sector < - dev->sector + STRIPE_SECTORS) { - rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_phys_segments(rbi)) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - rbi = rbi2; - } - } - } - spin_unlock_irq(&conf->device_lock); - clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - - return_io(return_bi); - - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void ops_run_biofill(struct stripe_head *sh) -{ - struct dma_async_tx_descriptor *tx = NULL; - struct r5conf *conf = sh->raid_conf; - struct async_submit_ctl submit; - int i; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - for (i = sh->disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (test_bit(R5_Wantfill, &dev->flags)) { - struct bio *rbi; - spin_lock_irq(&conf->device_lock); - dev->read = rbi = dev->toread; - dev->toread = NULL; - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < - dev->sector + STRIPE_SECTORS) { - tx = async_copy_data(0, rbi, dev->page, - dev->sector, tx); - rbi = r5_next_bio(rbi, dev->sector); - } - } - } - - atomic_inc(&sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); - async_trigger_callback(&submit); -} - -static void mark_target_uptodate(struct stripe_head *sh, int target) -{ - struct r5dev *tgt; - - if (target < 0) - return; - - tgt = &sh->dev[target]; - set_bit(R5_UPTODATE, &tgt->flags); - BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - clear_bit(R5_Wantcompute, &tgt->flags); -} - -static void ops_complete_compute(void *stripe_head_ref) -{ - struct 
stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - /* mark the computed target(s) as uptodate */ - mark_target_uptodate(sh, sh->ops.target); - mark_target_uptodate(sh, sh->ops.target2); - - clear_bit(STRIPE_COMPUTE_RUN, &sh->state); - if (sh->check_state == check_state_compute_run) - sh->check_state = check_state_compute_result; - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -/* return a pointer to the address conversion region of the scribble buffer */ -static addr_conv_t *to_addr_conv(struct stripe_head *sh, - struct raid5_percpu *percpu) -{ - return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); -} - -static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) -{ - int disks = sh->disks; - struct page **xor_srcs = percpu->scribble; - int target = sh->ops.target; - struct r5dev *tgt = &sh->dev[target]; - struct page *xor_dest = tgt->page; - int count = 0; - struct dma_async_tx_descriptor *tx; - struct async_submit_ctl submit; - int i; - - pr_debug("%s: stripe %llu block: %d\n", - __func__, (unsigned long long)sh->sector, target); - BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - - for (i = disks; i--; ) - if (i != target) - xor_srcs[count++] = sh->dev[i].page; - - atomic_inc(&sh->count); - - init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, - ops_complete_compute, sh, to_addr_conv(sh, percpu)); - if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); - else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); - - return tx; -} - -/* set_syndrome_sources - populate source buffers for gen_syndrome - * @srcs - (struct page *) array of size sh->disks - * @sh - stripe_head to parse - * - * Populates srcs in proper layout order for the stripe and returns the - * 'count' of sources to be used in a call to async_gen_syndrome. The P - * destination buffer is recorded in srcs[count] and the Q destination - * is recorded in srcs[count+1]]. - */ -static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) -{ - int disks = sh->disks; - int syndrome_disks = sh->ddf_layout ? 
disks : (disks - 2); - int d0_idx = raid6_d0(sh); - int count; - int i; - - for (i = 0; i < disks; i++) - srcs[i] = NULL; - - count = 0; - i = d0_idx; - do { - int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); - - srcs[slot] = sh->dev[i].page; - i = raid6_next_disk(i, disks); - } while (i != d0_idx); - - return syndrome_disks; -} - -static struct dma_async_tx_descriptor * -ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) -{ - int disks = sh->disks; - struct page **blocks = percpu->scribble; - int target; - int qd_idx = sh->qd_idx; - struct dma_async_tx_descriptor *tx; - struct async_submit_ctl submit; - struct r5dev *tgt; - struct page *dest; - int i; - int count; - - if (sh->ops.target < 0) - target = sh->ops.target2; - else if (sh->ops.target2 < 0) - target = sh->ops.target; - else - /* we should only have one valid target */ - BUG(); - BUG_ON(target < 0); - pr_debug("%s: stripe %llu block: %d\n", - __func__, (unsigned long long)sh->sector, target); - - tgt = &sh->dev[target]; - BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - dest = tgt->page; - - atomic_inc(&sh->count); - - if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh); - blocks[count] = NULL; /* regenerating p is not necessary */ - BUG_ON(blocks[count+1] != dest); /* q should already be set */ - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, - ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); - } else { - /* Compute any data- or p-drive using XOR */ - count = 0; - for (i = disks; i-- ; ) { - if (i == target || i == qd_idx) - continue; - blocks[count++] = sh->dev[i].page; - } - - init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, - NULL, ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); - } - - return tx; -} - -static struct dma_async_tx_descriptor * -ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) -{ - int i, count, disks = sh->disks; - int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; - int d0_idx = raid6_d0(sh); - int faila = -1, failb = -1; - int target = sh->ops.target; - int target2 = sh->ops.target2; - struct r5dev *tgt = &sh->dev[target]; - struct r5dev *tgt2 = &sh->dev[target2]; - struct dma_async_tx_descriptor *tx; - struct page **blocks = percpu->scribble; - struct async_submit_ctl submit; - - pr_debug("%s: stripe %llu block1: %d block2: %d\n", - __func__, (unsigned long long)sh->sector, target, target2); - BUG_ON(target < 0 || target2 < 0); - BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); - - /* we need to open-code set_syndrome_sources to handle the - * slot number conversion for 'faila' and 'failb' - */ - for (i = 0; i < disks ; i++) - blocks[i] = NULL; - count = 0; - i = d0_idx; - do { - int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); - - blocks[slot] = sh->dev[i].page; - - if (i == target) - faila = slot; - if (i == target2) - failb = slot; - i = raid6_next_disk(i, disks); - } while (i != d0_idx); - - BUG_ON(faila == failb); - if (failb < faila) - swap(faila, failb); - pr_debug("%s: stripe: %llu faila: %d failb: %d\n", - __func__, (unsigned long long)sh->sector, faila, failb); - - atomic_inc(&sh->count); - - if (failb == syndrome_disks+1) { - /* Q disk is one of the missing disks */ - if (faila == syndrome_disks) { - /* Missing P+Q, just recompute */ - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, - ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - return async_gen_syndrome(blocks, 0, syndrome_disks+2, - STRIPE_SIZE, &submit); - } else { - struct page *dest; - int data_target; - int qd_idx = sh->qd_idx; - - /* Missing D+Q: recompute D from P, then recompute Q */ - if (target == qd_idx) - data_target = target2; - else - data_target = target; - - count = 0; - for (i = disks; i-- ; ) { - if (i == data_target || i == qd_idx) - continue; - blocks[count++] = sh->dev[i].page; - } - dest = sh->dev[data_target].page; - init_async_submit(&submit, - ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, - NULL, NULL, NULL, - to_addr_conv(sh, percpu)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, - &submit); - - count = set_syndrome_sources(blocks, sh); - init_async_submit(&submit, ASYNC_TX_FENCE, tx, - ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - return async_gen_syndrome(blocks, 0, count+2, - STRIPE_SIZE, &submit); - } - } else { - init_async_submit(&submit, ASYNC_TX_FENCE, NULL, - ops_complete_compute, sh, - to_addr_conv(sh, percpu)); - if (failb == syndrome_disks) { - /* We're missing D+P. */ - return async_raid6_datap_recov(syndrome_disks+2, - STRIPE_SIZE, faila, - blocks, &submit); - } else { - /* We're missing D+D. 
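The branch structure here picks one of four recovery paths depending on which slots failed. A compact, purely illustrative restatement (stand-alone; faila < failb assumed exactly as arranged by the swap() above, and slot numbering as produced by set_syndrome_sources(): data 0..syndrome_disks-1, P at syndrome_disks, Q at syndrome_disks+1):

#include <stdio.h>

/* Illustrative restatement of the case analysis in ops_run_compute6_2();
 * assumes faila < failb, as the code above guarantees. */
static const char *raid6_recovery_case(int faila, int failb, int syndrome_disks)
{
        if (failb == syndrome_disks + 1) {      /* Q is one of the missing slots */
                if (faila == syndrome_disks)
                        return "P+Q missing: regenerate both via gen_syndrome";
                return "D+Q missing: rebuild D by XOR, then regenerate Q";
        }
        if (failb == syndrome_disks)            /* P is the higher missing slot */
                return "D+P missing: datap recovery";
        return "D+D missing: two-data recovery";
}

int main(void)
{
        int syndrome_disks = 3;                 /* e.g. a 5-device RAID6 stripe */

        printf("%s\n", raid6_recovery_case(0, 1, syndrome_disks));  /* D+D */
        printf("%s\n", raid6_recovery_case(1, 3, syndrome_disks));  /* D+P */
        printf("%s\n", raid6_recovery_case(2, 4, syndrome_disks));  /* D+Q */
        printf("%s\n", raid6_recovery_case(3, 4, syndrome_disks));  /* P+Q */
        return 0;
}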
*/ - return async_raid6_2data_recov(syndrome_disks+2, - STRIPE_SIZE, faila, failb, - blocks, &submit); - } - } -} - - -static void ops_complete_prexor(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); -} - -static struct dma_async_tx_descriptor * -ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) -{ - int disks = sh->disks; - struct page **xor_srcs = percpu->scribble; - int count = 0, pd_idx = sh->pd_idx, i; - struct async_submit_ctl submit; - - /* existing parity data subtracted */ - struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - /* Only process blocks that are known to be uptodate */ - if (test_bit(R5_Wantdrain, &dev->flags)) - xor_srcs[count++] = dev->page; - } - - init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, - ops_complete_prexor, sh, to_addr_conv(sh, percpu)); - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); - - return tx; -} - -static struct dma_async_tx_descriptor * -ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) -{ - int disks = sh->disks; - int i; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - struct bio *chosen; - - if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { - struct bio *wbi; - - spin_lock_irq(&sh->raid_conf->device_lock); - chosen = dev->towrite; - dev->towrite = NULL; - BUG_ON(dev->written); - wbi = dev->written = chosen; - spin_unlock_irq(&sh->raid_conf->device_lock); - - while (wbi && wbi->bi_sector < - dev->sector + STRIPE_SECTORS) { - if (wbi->bi_rw & REQ_FUA) - set_bit(R5_WantFUA, &dev->flags); - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); - wbi = r5_next_bio(wbi, dev->sector); - } - } - } - - return tx; -} - -static void ops_complete_reconstruct(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; - int disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = sh->qd_idx; - int i; - bool fua = false; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - for (i = disks; i--; ) - fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); - - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - - if (dev->written || i == pd_idx || i == qd_idx) { - set_bit(R5_UPTODATE, &dev->flags); - if (fua) - set_bit(R5_WantFUA, &dev->flags); - } - } - - if (sh->reconstruct_state == reconstruct_state_drain_run) - sh->reconstruct_state = reconstruct_state_drain_result; - else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) - sh->reconstruct_state = reconstruct_state_prexor_drain_result; - else { - BUG_ON(sh->reconstruct_state != reconstruct_state_run); - sh->reconstruct_state = reconstruct_state_result; - } - - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void -ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) -{ - int disks = sh->disks; - struct page **xor_srcs = percpu->scribble; - struct async_submit_ctl submit; - int count = 0, pd_idx = sh->pd_idx, i; - struct page *xor_dest; - int prexor = 0; - unsigned long flags; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - /* check if prexor is active 
which means only process blocks - * that are part of a read-modify-write (written) - */ - if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { - prexor = 1; - xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (dev->written) - xor_srcs[count++] = dev->page; - } - } else { - xor_dest = sh->dev[pd_idx].page; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (i != pd_idx) - xor_srcs[count++] = dev->page; - } - } - - /* 1/ if we prexor'd then the dest is reused as a source - * 2/ if we did not prexor then we are redoing the parity - * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST - * for the synchronous xor case - */ - flags = ASYNC_TX_ACK | - (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); - - atomic_inc(&sh->count); - - init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, - to_addr_conv(sh, percpu)); - if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); - else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); -} - -static void -ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) -{ - struct async_submit_ctl submit; - struct page **blocks = percpu->scribble; - int count; - - pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - - count = set_syndrome_sources(blocks, sh); - - atomic_inc(&sh->count); - - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, - sh, to_addr_conv(sh, percpu)); - async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); -} - -static void ops_complete_check(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - sh->check_state = check_state_check_result; - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) -{ - int disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = sh->qd_idx; - struct page *xor_dest; - struct page **xor_srcs = percpu->scribble; - struct dma_async_tx_descriptor *tx; - struct async_submit_ctl submit; - int count; - int i; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - count = 0; - xor_dest = sh->dev[pd_idx].page; - xor_srcs[count++] = xor_dest; - for (i = disks; i--; ) { - if (i == pd_idx || i == qd_idx) - continue; - xor_srcs[count++] = sh->dev[i].page; - } - - init_async_submit(&submit, 0, NULL, NULL, NULL, - to_addr_conv(sh, percpu)); - tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - &sh->ops.zero_sum_result, &submit); - - atomic_inc(&sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); - tx = async_trigger_callback(&submit); -} - -static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) -{ - struct page **srcs = percpu->scribble; - struct async_submit_ctl submit; - int count; - - pr_debug("%s: stripe %llu checkp: %d\n", __func__, - (unsigned long long)sh->sector, checkp); - - count = set_syndrome_sources(srcs, sh); - if (!checkp) - srcs[count] = NULL; - - atomic_inc(&sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, - sh, to_addr_conv(sh, percpu)); - async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, - &sh->ops.zero_sum_result, percpu->spare_page, &submit); -} - -static void __raid_run_ops(struct 
stripe_head *sh, unsigned long ops_request) -{ - int overlap_clear = 0, i, disks = sh->disks; - struct dma_async_tx_descriptor *tx = NULL; - struct r5conf *conf = sh->raid_conf; - int level = conf->level; - struct raid5_percpu *percpu; - unsigned long cpu; - - cpu = get_cpu(); - percpu = per_cpu_ptr(conf->percpu, cpu); - if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { - ops_run_biofill(sh); - overlap_clear++; - } - - if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { - if (level < 6) - tx = ops_run_compute5(sh, percpu); - else { - if (sh->ops.target2 < 0 || sh->ops.target < 0) - tx = ops_run_compute6_1(sh, percpu); - else - tx = ops_run_compute6_2(sh, percpu); - } - /* terminate the chain if reconstruct is not set to be run */ - if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) - async_tx_ack(tx); - } - - if (test_bit(STRIPE_OP_PREXOR, &ops_request)) - tx = ops_run_prexor(sh, percpu, tx); - - if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { - tx = ops_run_biodrain(sh, tx); - overlap_clear++; - } - - if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { - if (level < 6) - ops_run_reconstruct5(sh, percpu, tx); - else - ops_run_reconstruct6(sh, percpu, tx); - } - - if (test_bit(STRIPE_OP_CHECK, &ops_request)) { - if (sh->check_state == check_state_run) - ops_run_check_p(sh, percpu); - else if (sh->check_state == check_state_run_q) - ops_run_check_pq(sh, percpu, 0); - else if (sh->check_state == check_state_run_pq) - ops_run_check_pq(sh, percpu, 1); - else - BUG(); - } - - if (overlap_clear) - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&sh->raid_conf->wait_for_overlap); - } - put_cpu(); -} - -#ifdef CONFIG_MULTICORE_RAID456 -static void async_run_ops(void *param, async_cookie_t cookie) -{ - struct stripe_head *sh = param; - unsigned long ops_request = sh->ops.request; - - clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); - wake_up(&sh->ops.wait_for_ops); - - __raid_run_ops(sh, ops_request); - release_stripe(sh); -} - -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) -{ - /* since handle_stripe can be called outside of raid5d context - * we need to ensure sh->ops.request is de-staged before another - * request arrives - */ - wait_event(sh->ops.wait_for_ops, - !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); - sh->ops.request = ops_request; - - atomic_inc(&sh->count); - async_schedule(async_run_ops, sh); -} -#else -#define raid_run_ops __raid_run_ops -#endif - -static int grow_one_stripe(struct r5conf *conf) -{ - struct stripe_head *sh; - sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); - if (!sh) - return 0; - - sh->raid_conf = conf; - #ifdef CONFIG_MULTICORE_RAID456 - init_waitqueue_head(&sh->ops.wait_for_ops); - #endif - - if (grow_buffers(sh)) { - shrink_buffers(sh); - kmem_cache_free(conf->slab_cache, sh); - return 0; - } - /* we just created an active stripe so... 
*/ - atomic_set(&sh->count, 1); - atomic_inc(&conf->active_stripes); - INIT_LIST_HEAD(&sh->lru); - release_stripe(sh); - return 1; -} - -static int grow_stripes(struct r5conf *conf, int num) -{ - struct kmem_cache *sc; - int devs = max(conf->raid_disks, conf->previous_raid_disks); - - if (conf->mddev->gendisk) - sprintf(conf->cache_name[0], - "raid%d-%s", conf->level, mdname(conf->mddev)); - else - sprintf(conf->cache_name[0], - "raid%d-%p", conf->level, conf->mddev); - sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); - - conf->active_name = 0; - sc = kmem_cache_create(conf->cache_name[conf->active_name], - sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), - 0, 0, NULL); - if (!sc) - return 1; - conf->slab_cache = sc; - conf->pool_size = devs; - while (num--) - if (!grow_one_stripe(conf)) - return 1; - return 0; -} - -/** - * scribble_len - return the required size of the scribble region - * @num - total number of disks in the array - * - * The size must be enough to contain: - * 1/ a struct page pointer for each device in the array +2 - * 2/ room to convert each entry in (1) to its corresponding dma - * (dma_map_page()) or page (page_address()) address. - * - * Note: the +2 is for the destination buffers of the ddf/raid6 case where we - * calculate over all devices (not just the data blocks), using zeros in place - * of the P and Q blocks. - */ -static size_t scribble_len(int num) -{ - size_t len; - - len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); - - return len; -} - -static int resize_stripes(struct r5conf *conf, int newsize) -{ - /* Make all the stripes able to hold 'newsize' devices. - * New slots in each stripe get 'page' set to a new page. - * - * This happens in stages: - * 1/ create a new kmem_cache and allocate the required number of - * stripe_heads. - * 2/ gather all the old stripe_heads and transfer the pages across - * to the new stripe_heads. This will have the side effect of - * freezing the array as once all stripe_heads have been collected, - * no IO will be possible. Old stripe heads are freed once their - * pages have been transferred over, and the old kmem_cache is - * freed when all stripes are done. - * 3/ reallocate conf->disks to be suitably bigger. If this fails, - * we simply return a failure status - no need to clean anything up. - * 4/ allocate new pages for the new slots in the new stripe_heads. - * If this fails, we don't bother trying to shrink the - * stripe_heads down again, we just leave them as they are. - * As each stripe_head is processed the new one is released into - * active service. - * - * Once step 2 is started, we cannot afford to wait for a write, - * so we use GFP_NOIO allocations. 
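As a quick sanity check of the scribble_len() sizing above, assuming a 64-bit build where both struct page * and addr_conv_t occupy 8 bytes (the size of addr_conv_t is an assumption here; it depends on the kernel configuration): a 10-device array reserves (10+2)*8 bytes of page pointers plus (10+2)*8 bytes of address-conversion space, i.e. 192 bytes per CPU. A trivial stand-alone version of the same arithmetic:

#include <stdio.h>

/* Worked example of the scribble_len() formula above; the 8-byte
 * addr_conv_t is an assumption, not a kernel fact. */
int main(void)
{
        int num = 10;                           /* hypothetical 10-device array */
        size_t page_ptrs = sizeof(void *) * (num + 2);
        size_t addr_conv = 8 * (num + 2);

        printf("scribble region: %zu bytes per CPU\n", page_ptrs + addr_conv);  /* 192 on LP64 */
        return 0;
}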
- */ - struct stripe_head *osh, *nsh; - LIST_HEAD(newstripes); - struct disk_info *ndisks; - unsigned long cpu; - int err; - struct kmem_cache *sc; - int i; - - if (newsize <= conf->pool_size) - return 0; /* never bother to shrink */ - - err = md_allow_write(conf->mddev); - if (err) - return err; - - /* Step 1 */ - sc = kmem_cache_create(conf->cache_name[1-conf->active_name], - sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), - 0, 0, NULL); - if (!sc) - return -ENOMEM; - - for (i = conf->max_nr_stripes; i; i--) { - nsh = kmem_cache_zalloc(sc, GFP_KERNEL); - if (!nsh) - break; - - nsh->raid_conf = conf; - #ifdef CONFIG_MULTICORE_RAID456 - init_waitqueue_head(&nsh->ops.wait_for_ops); - #endif - - list_add(&nsh->lru, &newstripes); - } - if (i) { - /* didn't get enough, give up */ - while (!list_empty(&newstripes)) { - nsh = list_entry(newstripes.next, struct stripe_head, lru); - list_del(&nsh->lru); - kmem_cache_free(sc, nsh); - } - kmem_cache_destroy(sc); - return -ENOMEM; - } - /* Step 2 - Must use GFP_NOIO now. - * OK, we have enough stripes, start collecting inactive - * stripes and copying them over - */ - list_for_each_entry(nsh, &newstripes, lru) { - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list), - conf->device_lock, - ); - osh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); - atomic_set(&nsh->count, 1); - for(i=0; i<conf->pool_size; i++) - nsh->dev[i].page = osh->dev[i].page; - for( ; i<newsize; i++) - nsh->dev[i].page = NULL; - kmem_cache_free(conf->slab_cache, osh); - } - kmem_cache_destroy(conf->slab_cache); - - /* Step 3. - * At this point, we are holding all the stripes so the array - * is completely stalled, so now is a good time to resize - * conf->disks and the scribble region - */ - ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); - if (ndisks) { - for (i=0; i<conf->raid_disks; i++) - ndisks[i] = conf->disks[i]; - kfree(conf->disks); - conf->disks = ndisks; - } else - err = -ENOMEM; - - get_online_cpus(); - conf->scribble_len = scribble_len(newsize); - for_each_present_cpu(cpu) { - struct raid5_percpu *percpu; - void *scribble; - - percpu = per_cpu_ptr(conf->percpu, cpu); - scribble = kmalloc(conf->scribble_len, GFP_NOIO); - - if (scribble) { - kfree(percpu->scribble); - percpu->scribble = scribble; - } else { - err = -ENOMEM; - break; - } - } - put_online_cpus(); - - /* Step 4, return new stripes to service */ - while(!list_empty(&newstripes)) { - nsh = list_entry(newstripes.next, struct stripe_head, lru); - list_del_init(&nsh->lru); - - for (i=conf->raid_disks; i < newsize; i++) - if (nsh->dev[i].page == NULL) { - struct page *p = alloc_page(GFP_NOIO); - nsh->dev[i].page = p; - if (!p) - err = -ENOMEM; - } - release_stripe(nsh); - } - /* critical section pass, GFP_NOIO no longer needed */ - - conf->slab_cache = sc; - conf->active_name = 1-conf->active_name; - conf->pool_size = newsize; - return err; -} - -static int drop_one_stripe(struct r5conf *conf) -{ - struct stripe_head *sh; - - spin_lock_irq(&conf->device_lock); - sh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); - if (!sh) - return 0; - BUG_ON(atomic_read(&sh->count)); - shrink_buffers(sh); - kmem_cache_free(conf->slab_cache, sh); - atomic_dec(&conf->active_stripes); - return 1; -} - -static void shrink_stripes(struct r5conf *conf) -{ - while (drop_one_stripe(conf)) - ; - - if (conf->slab_cache) - kmem_cache_destroy(conf->slab_cache); - conf->slab_cache = NULL; -} - -static void 
raid5_end_read_request(struct bio * bi, int error) -{ - struct stripe_head *sh = bi->bi_private; - struct r5conf *conf = sh->raid_conf; - int disks = sh->disks, i; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); - char b[BDEVNAME_SIZE]; - struct md_rdev *rdev = NULL; - - - for (i=0 ; i<disks; i++) - if (bi == &sh->dev[i].req) - break; - - pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", - (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); - if (i == disks) { - BUG(); - return; - } - if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) - /* If replacement finished while this request was outstanding, - * 'replacement' might be NULL already. - * In that case it moved down to 'rdev'. - * rdev is not removed until all requests are finished. - */ - rdev = conf->disks[i].replacement; - if (!rdev) - rdev = conf->disks[i].rdev; - - if (uptodate) { - set_bit(R5_UPTODATE, &sh->dev[i].flags); - if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - /* Note that this cannot happen on a - * replacement device. We just fail those on - * any error - */ - printk_ratelimited( - KERN_INFO - "md/raid:%s: read error corrected" - " (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector - + rdev->data_offset), - bdevname(rdev->bdev, b)); - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - clear_bit(R5_ReadError, &sh->dev[i].flags); - clear_bit(R5_ReWrite, &sh->dev[i].flags); - } - if (atomic_read(&rdev->read_errors)) - atomic_set(&rdev->read_errors, 0); - } else { - const char *bdn = bdevname(rdev->bdev, b); - int retry = 0; - - clear_bit(R5_UPTODATE, &sh->dev[i].flags); - atomic_inc(&rdev->read_errors); - if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error on replacement device " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); - else if (conf->mddev->degraded >= conf->max_degraded) - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error not correctable " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); - else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) - /* Oh, no!!! */ - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error NOT corrected!! 
" - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); - else if (atomic_read(&rdev->read_errors) - > conf->max_nr_stripes) - printk(KERN_WARNING - "md/raid:%s: Too many read errors, failing device %s.\n", - mdname(conf->mddev), bdn); - else - retry = 1; - if (retry) - set_bit(R5_ReadError, &sh->dev[i].flags); - else { - clear_bit(R5_ReadError, &sh->dev[i].flags); - clear_bit(R5_ReWrite, &sh->dev[i].flags); - md_error(conf->mddev, rdev); - } - } - rdev_dec_pending(rdev, conf->mddev); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void raid5_end_write_request(struct bio *bi, int error) -{ - struct stripe_head *sh = bi->bi_private; - struct r5conf *conf = sh->raid_conf; - int disks = sh->disks, i; - struct md_rdev *uninitialized_var(rdev); - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); - sector_t first_bad; - int bad_sectors; - int replacement = 0; - - for (i = 0 ; i < disks; i++) { - if (bi == &sh->dev[i].req) { - rdev = conf->disks[i].rdev; - break; - } - if (bi == &sh->dev[i].rreq) { - rdev = conf->disks[i].replacement; - if (rdev) - replacement = 1; - else - /* rdev was removed and 'replacement' - * replaced it. rdev is not removed - * until all requests are finished. - */ - rdev = conf->disks[i].rdev; - break; - } - } - pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", - (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); - if (i == disks) { - BUG(); - return; - } - - if (replacement) { - if (!uptodate) - md_error(conf->mddev, rdev); - else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, - &first_bad, &bad_sectors)) - set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); - } else { - if (!uptodate) { - set_bit(WriteErrorSeen, &rdev->flags); - set_bit(R5_WriteError, &sh->dev[i].flags); - if (!test_and_set_bit(WantReplacement, &rdev->flags)) - set_bit(MD_RECOVERY_NEEDED, - &rdev->mddev->recovery); - } else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, - &first_bad, &bad_sectors)) - set_bit(R5_MadeGood, &sh->dev[i].flags); - } - rdev_dec_pending(rdev, conf->mddev); - - if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); - -static void raid5_build_block(struct stripe_head *sh, int i, int previous) -{ - struct r5dev *dev = &sh->dev[i]; - - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; - dev->req.bi_max_vecs++; - dev->req.bi_private = sh; - dev->vec.bv_page = dev->page; - - bio_init(&dev->rreq); - dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_vcnt++; - dev->rreq.bi_max_vecs++; - dev->rreq.bi_private = sh; - dev->rvec.bv_page = dev->page; - - dev->flags = 0; - dev->sector = compute_blocknr(sh, i, previous); -} - -static void error(struct mddev *mddev, struct md_rdev *rdev) -{ - char b[BDEVNAME_SIZE]; - struct r5conf *conf = mddev->private; - unsigned long flags; - pr_debug("raid456: error called\n"); - - spin_lock_irqsave(&conf->device_lock, flags); - clear_bit(In_sync, &rdev->flags); - mddev->degraded = calc_degraded(conf); - spin_unlock_irqrestore(&conf->device_lock, flags); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - - set_bit(Blocked, &rdev->flags); - set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT - "md/raid:%s: Disk failure on %s, 
disabling device.\n" - "md/raid:%s: Operation continuing on %d devices.\n", - mdname(mddev), - bdevname(rdev->bdev, b), - mdname(mddev), - conf->raid_disks - mddev->degraded); -} - -/* - * Input: a 'big' sector number, - * Output: index of the data and parity disk, and the sector # in them. - */ -static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, - int previous, int *dd_idx, - struct stripe_head *sh) -{ - sector_t stripe, stripe2; - sector_t chunk_number; - unsigned int chunk_offset; - int pd_idx, qd_idx; - int ddf_layout = 0; - sector_t new_sector; - int algorithm = previous ? conf->prev_algo - : conf->algorithm; - int sectors_per_chunk = previous ? conf->prev_chunk_sectors - : conf->chunk_sectors; - int raid_disks = previous ? conf->previous_raid_disks - : conf->raid_disks; - int data_disks = raid_disks - conf->max_degraded; - - /* First compute the information on this sector */ - - /* - * Compute the chunk number and the sector offset inside the chunk - */ - chunk_offset = sector_div(r_sector, sectors_per_chunk); - chunk_number = r_sector; - - /* - * Compute the stripe number - */ - stripe = chunk_number; - *dd_idx = sector_div(stripe, data_disks); - stripe2 = stripe; - /* - * Select the parity disk based on the user selected algorithm. - */ - pd_idx = qd_idx = -1; - switch(conf->level) { - case 4: - pd_idx = data_disks; - break; - case 5: - switch (algorithm) { - case ALGORITHM_LEFT_ASYMMETRIC: - pd_idx = data_disks - sector_div(stripe2, raid_disks); - if (*dd_idx >= pd_idx) - (*dd_idx)++; - break; - case ALGORITHM_RIGHT_ASYMMETRIC: - pd_idx = sector_div(stripe2, raid_disks); - if (*dd_idx >= pd_idx) - (*dd_idx)++; - break; - case ALGORITHM_LEFT_SYMMETRIC: - pd_idx = data_disks - sector_div(stripe2, raid_disks); - *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; - break; - case ALGORITHM_RIGHT_SYMMETRIC: - pd_idx = sector_div(stripe2, raid_disks); - *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; - break; - case ALGORITHM_PARITY_0: - pd_idx = 0; - (*dd_idx)++; - break; - case ALGORITHM_PARITY_N: - pd_idx = data_disks; - break; - default: - BUG(); - } - break; - case 6: - - switch (algorithm) { - case ALGORITHM_LEFT_ASYMMETRIC: - pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); - qd_idx = pd_idx + 1; - if (pd_idx == raid_disks-1) { - (*dd_idx)++; /* Q D D D P */ - qd_idx = 0; - } else if (*dd_idx >= pd_idx) - (*dd_idx) += 2; /* D D P Q D */ - break; - case ALGORITHM_RIGHT_ASYMMETRIC: - pd_idx = sector_div(stripe2, raid_disks); - qd_idx = pd_idx + 1; - if (pd_idx == raid_disks-1) { - (*dd_idx)++; /* Q D D D P */ - qd_idx = 0; - } else if (*dd_idx >= pd_idx) - (*dd_idx) += 2; /* D D P Q D */ - break; - case ALGORITHM_LEFT_SYMMETRIC: - pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); - qd_idx = (pd_idx + 1) % raid_disks; - *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; - break; - case ALGORITHM_RIGHT_SYMMETRIC: - pd_idx = sector_div(stripe2, raid_disks); - qd_idx = (pd_idx + 1) % raid_disks; - *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; - break; - - case ALGORITHM_PARITY_0: - pd_idx = 0; - qd_idx = 1; - (*dd_idx) += 2; - break; - case ALGORITHM_PARITY_N: - pd_idx = data_disks; - qd_idx = data_disks + 1; - break; - - case ALGORITHM_ROTATING_ZERO_RESTART: - /* Exactly the same as RIGHT_ASYMMETRIC, but or - * of blocks for computing Q is different. 
- */ - pd_idx = sector_div(stripe2, raid_disks); - qd_idx = pd_idx + 1; - if (pd_idx == raid_disks-1) { - (*dd_idx)++; /* Q D D D P */ - qd_idx = 0; - } else if (*dd_idx >= pd_idx) - (*dd_idx) += 2; /* D D P Q D */ - ddf_layout = 1; - break; - - case ALGORITHM_ROTATING_N_RESTART: - /* Same as left_asymmetric, but the first stripe is - * D D D P Q rather than - * Q D D D P - */ - stripe2 += 1; - pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); - qd_idx = pd_idx + 1; - if (pd_idx == raid_disks-1) { - (*dd_idx)++; /* Q D D D P */ - qd_idx = 0; - } else if (*dd_idx >= pd_idx) - (*dd_idx) += 2; /* D D P Q D */ - ddf_layout = 1; - break; - - case ALGORITHM_ROTATING_N_CONTINUE: - /* Same as left_symmetric but Q is before P */ - pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); - qd_idx = (pd_idx + raid_disks - 1) % raid_disks; - *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; - ddf_layout = 1; - break; - - case ALGORITHM_LEFT_ASYMMETRIC_6: - /* RAID5 left_asymmetric, with Q on last device */ - pd_idx = data_disks - sector_div(stripe2, raid_disks-1); - if (*dd_idx >= pd_idx) - (*dd_idx)++; - qd_idx = raid_disks - 1; - break; - - case ALGORITHM_RIGHT_ASYMMETRIC_6: - pd_idx = sector_div(stripe2, raid_disks-1); - if (*dd_idx >= pd_idx) - (*dd_idx)++; - qd_idx = raid_disks - 1; - break; - - case ALGORITHM_LEFT_SYMMETRIC_6: - pd_idx = data_disks - sector_div(stripe2, raid_disks-1); - *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); - qd_idx = raid_disks - 1; - break; - - case ALGORITHM_RIGHT_SYMMETRIC_6: - pd_idx = sector_div(stripe2, raid_disks-1); - *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); - qd_idx = raid_disks - 1; - break; - - case ALGORITHM_PARITY_0_6: - pd_idx = 0; - (*dd_idx)++; - qd_idx = raid_disks - 1; - break; - - default: - BUG(); - } - break; - } - - if (sh) { - sh->pd_idx = pd_idx; - sh->qd_idx = qd_idx; - sh->ddf_layout = ddf_layout; - } - /* - * Finally, compute the new sector number - */ - new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; - return new_sector; -} - - -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) -{ - struct r5conf *conf = sh->raid_conf; - int raid_disks = sh->disks; - int data_disks = raid_disks - conf->max_degraded; - sector_t new_sector = sh->sector, check; - int sectors_per_chunk = previous ? conf->prev_chunk_sectors - : conf->chunk_sectors; - int algorithm = previous ?
conf->prev_algo - : conf->algorithm; - sector_t stripe; - int chunk_offset; - sector_t chunk_number; - int dummy1, dd_idx = i; - sector_t r_sector; - struct stripe_head sh2; - - - chunk_offset = sector_div(new_sector, sectors_per_chunk); - stripe = new_sector; - - if (i == sh->pd_idx) - return 0; - switch(conf->level) { - case 4: break; - case 5: - switch (algorithm) { - case ALGORITHM_LEFT_ASYMMETRIC: - case ALGORITHM_RIGHT_ASYMMETRIC: - if (i > sh->pd_idx) - i--; - break; - case ALGORITHM_LEFT_SYMMETRIC: - case ALGORITHM_RIGHT_SYMMETRIC: - if (i < sh->pd_idx) - i += raid_disks; - i -= (sh->pd_idx + 1); - break; - case ALGORITHM_PARITY_0: - i -= 1; - break; - case ALGORITHM_PARITY_N: - break; - default: - BUG(); - } - break; - case 6: - if (i == sh->qd_idx) - return 0; /* It is the Q disk */ - switch (algorithm) { - case ALGORITHM_LEFT_ASYMMETRIC: - case ALGORITHM_RIGHT_ASYMMETRIC: - case ALGORITHM_ROTATING_ZERO_RESTART: - case ALGORITHM_ROTATING_N_RESTART: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ - else if (i > sh->pd_idx) - i -= 2; /* D D P Q D */ - break; - case ALGORITHM_LEFT_SYMMETRIC: - case ALGORITHM_RIGHT_SYMMETRIC: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ - else { - /* D D P Q D */ - if (i < sh->pd_idx) - i += raid_disks; - i -= (sh->pd_idx + 2); - } - break; - case ALGORITHM_PARITY_0: - i -= 2; - break; - case ALGORITHM_PARITY_N: - break; - case ALGORITHM_ROTATING_N_CONTINUE: - /* Like left_symmetric, but P is before Q */ - if (sh->pd_idx == 0) - i--; /* P D D D Q */ - else { - /* D D Q P D */ - if (i < sh->pd_idx) - i += raid_disks; - i -= (sh->pd_idx + 1); - } - break; - case ALGORITHM_LEFT_ASYMMETRIC_6: - case ALGORITHM_RIGHT_ASYMMETRIC_6: - if (i > sh->pd_idx) - i--; - break; - case ALGORITHM_LEFT_SYMMETRIC_6: - case ALGORITHM_RIGHT_SYMMETRIC_6: - if (i < sh->pd_idx) - i += data_disks + 1; - i -= (sh->pd_idx + 1); - break; - case ALGORITHM_PARITY_0_6: - i -= 1; - break; - default: - BUG(); - } - break; - } - - chunk_number = stripe * data_disks + i; - r_sector = chunk_number * sectors_per_chunk + chunk_offset; - - check = raid5_compute_sector(conf, r_sector, - previous, &dummy1, &sh2); - if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx - || sh2.qd_idx != sh->qd_idx) { - printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", - mdname(conf->mddev)); - return 0; - } - return r_sector; -} - - -static void -schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, - int rcw, int expand) -{ - int i, pd_idx = sh->pd_idx, disks = sh->disks; - struct r5conf *conf = sh->raid_conf; - int level = conf->level; - - if (rcw) { - /* if we are not expanding this is a proper write request, and - * there will be bios with new data to be drained into the - * stripe cache - */ - if (!expand) { - sh->reconstruct_state = reconstruct_state_drain_run; - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - } else - sh->reconstruct_state = reconstruct_state_run; - - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); - - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - - if (dev->towrite) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantdrain, &dev->flags); - if (!expand) - clear_bit(R5_UPTODATE, &dev->flags); - s->locked++; - } - } - if (s->locked + conf->max_degraded == disks) - if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) - atomic_inc(&conf->pending_full_writes); - } else { - BUG_ON(level == 6); - BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || - test_bit(R5_Wantcompute, 
&sh->dev[pd_idx].flags))); - - sh->reconstruct_state = reconstruct_state_prexor_drain_run; - set_bit(STRIPE_OP_PREXOR, &s->ops_request); - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); - - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (i == pd_idx) - continue; - - if (dev->towrite && - (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - set_bit(R5_Wantdrain, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - clear_bit(R5_UPTODATE, &dev->flags); - s->locked++; - } - } - } - - /* keep the parity disk(s) locked while asynchronous operations - * are in flight - */ - set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); - clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - s->locked++; - - if (level == 6) { - int qd_idx = sh->qd_idx; - struct r5dev *dev = &sh->dev[qd_idx]; - - set_bit(R5_LOCKED, &dev->flags); - clear_bit(R5_UPTODATE, &dev->flags); - s->locked++; - } - - pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", - __func__, (unsigned long long)sh->sector, - s->locked, s->ops_request); -} - -/* - * Each stripe/dev can have one or more bion attached. - * toread/towrite point to the first in a chain. - * The bi_next chain must be in order. - */ -static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) -{ - struct bio **bip; - struct r5conf *conf = sh->raid_conf; - int firstwrite=0; - - pr_debug("adding bi b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_sector, - (unsigned long long)sh->sector); - - - spin_lock_irq(&conf->device_lock); - if (forwrite) { - bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL && sh->dev[dd_idx].written == NULL) - firstwrite = 1; - } else - bip = &sh->dev[dd_idx].toread; - while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) - goto overlap; - bip = & (*bip)->bi_next; - } - if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) - goto overlap; - - BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); - if (*bip) - bi->bi_next = *bip; - *bip = bi; - bi->bi_phys_segments++; - - if (forwrite) { - /* check if page is covered */ - sector_t sector = sh->dev[dd_idx].sector; - for (bi=sh->dev[dd_idx].towrite; - sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && - bi && bi->bi_sector <= sector; - bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { - if (bi->bi_sector + (bi->bi_size>>9) >= sector) - sector = bi->bi_sector + (bi->bi_size>>9); - } - if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) - set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); - } - spin_unlock_irq(&conf->device_lock); - - pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_sector, - (unsigned long long)sh->sector, dd_idx); - - if (conf->mddev->bitmap && firstwrite) { - bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); - sh->bm_seq = conf->seq_flush+1; - set_bit(STRIPE_BIT_DELAY, &sh->state); - } - return 1; - - overlap: - set_bit(R5_Overlap, &sh->dev[dd_idx].flags); - spin_unlock_irq(&conf->device_lock); - return 0; -} - -static void end_reshape(struct r5conf *conf); - -static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, - struct stripe_head *sh) -{ - int sectors_per_chunk = - previous ? conf->prev_chunk_sectors : conf->chunk_sectors; - int dd_idx; - int chunk_offset = sector_div(stripe, sectors_per_chunk); - int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; - - raid5_compute_sector(conf, - stripe * (disks - conf->max_degraded) - *sectors_per_chunk + chunk_offset, - previous, - &dd_idx, sh); -} - -static void -handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks, - struct bio **return_bi) -{ - int i; - for (i = disks; i--; ) { - struct bio *bi; - int bitmap_end = 0; - - if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - struct md_rdev *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) - atomic_inc(&rdev->nr_pending); - else - rdev = NULL; - rcu_read_unlock(); - if (rdev) { - if (!rdev_set_badblocks( - rdev, - sh->sector, - STRIPE_SECTORS, 0)) - md_error(conf->mddev, rdev); - rdev_dec_pending(rdev, conf->mddev); - } - } - spin_lock_irq(&conf->device_lock); - /* fail all writes first */ - bi = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - if (bi) { - s->to_write--; - bitmap_end = 1; - } - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - while (bi && bi->bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { - md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; - } - bi = nextbi; - } - /* and fail all 'written' */ - bi = sh->dev[i].written; - sh->dev[i].written = NULL; - if (bi) bitmap_end = 1; - while (bi && bi->bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { - md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; - } - bi = bi2; - } - - /* fail any reads if this device is non-operational and - * the data has not reached the cache yet. - */ - if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && - (!test_bit(R5_Insync, &sh->dev[i].flags) || - test_bit(R5_ReadError, &sh->dev[i].flags))) { - bi = sh->dev[i].toread; - sh->dev[i].toread = NULL; - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - if (bi) s->to_read--; - while (bi && bi->bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *nextbi = - r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { - bi->bi_next = *return_bi; - *return_bi = bi; - } - bi = nextbi; - } - } - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); - /* If we were in the middle of a write the parity block might - * still be locked - so just clear all R5_LOCKED flags - */ - clear_bit(R5_LOCKED, &sh->dev[i].flags); - } - - if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) - if (atomic_dec_and_test(&conf->pending_full_writes)) - md_wakeup_thread(conf->mddev->thread); -} - -static void -handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s) -{ - int abort = 0; - int i; - - clear_bit(STRIPE_SYNCING, &sh->state); - s->syncing = 0; - s->replacing = 0; - /* There is nothing more to do for sync/check/repair. - * Don't even need to abort as that is handled elsewhere - * if needed, and not always wanted e.g. if there is a known - * bad block here. 
- * For recover/replace we need to record a bad block on all - * non-sync devices, or abort the recovery - */ - if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { - /* During recovery devices cannot be removed, so - * locking and refcounting of rdevs is not needed - */ - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = conf->disks[i].rdev; - if (rdev - && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags) - && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - abort = 1; - rdev = conf->disks[i].replacement; - if (rdev - && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags) - && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - abort = 1; - } - if (abort) - conf->recovery_disabled = - conf->mddev->recovery_disabled; - } - md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); -} - -static int want_replace(struct stripe_head *sh, int disk_idx) -{ - struct md_rdev *rdev; - int rv = 0; - /* Doing recovery so rcu locking not required */ - rdev = sh->raid_conf->disks[disk_idx].replacement; - if (rdev - && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags) - && (rdev->recovery_offset <= sh->sector - || rdev->mddev->recovery_cp <= sh->sector)) - rv = 1; - - return rv; -} - -/* fetch_block - checks the given member device to see if its data needs - * to be read or computed to satisfy a request. - * - * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill to continue - */ -static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) -{ - struct r5dev *dev = &sh->dev[disk_idx]; - struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], - &sh->dev[s->failed_num[1]] }; - - /* is the data in this block needed, and can we get it? */ - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->replacing && want_replace(sh, disk_idx)) || - (s->failed >= 1 && fdev[0]->toread) || - (s->failed >= 2 && fdev[1]->toread) || - (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && - !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - (sh->raid_conf->level == 6 && s->failed && s->to_write))) { - /* we would like to get this block, possibly by computing it, - * otherwise read it if the backing disk is insync - */ - BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); - BUG_ON(test_bit(R5_Wantread, &dev->flags)); - if ((s->uptodate == disks - 1) && - (s->failed && (disk_idx == s->failed_num[0] || - disk_idx == s->failed_num[1]))) { - /* have disk failed, and we're requested to fetch it; - * do compute it - */ - pr_debug("Computing stripe %llu block %d\n", - (unsigned long long)sh->sector, disk_idx); - set_bit(STRIPE_COMPUTE_RUN, &sh->state); - set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); - set_bit(R5_Wantcompute, &dev->flags); - sh->ops.target = disk_idx; - sh->ops.target2 = -1; /* no 2nd target */ - s->req_compute = 1; - /* Careful: from this point on 'uptodate' is in the eye - * of raid_run_ops which services 'compute' operations - * before writes. R5_Wantcompute flags a block that will - * be R5_UPTODATE by the time it is needed for a - * subsequent operation. 
- */ - s->uptodate++; - return 1; - } else if (s->uptodate == disks-2 && s->failed >= 2) { - /* Computing 2-failure is *very* expensive; only - * do it if failed >= 2 - */ - int other; - for (other = disks; other--; ) { - if (other == disk_idx) - continue; - if (!test_bit(R5_UPTODATE, - &sh->dev[other].flags)) - break; - } - BUG_ON(other < 0); - pr_debug("Computing stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, - disk_idx, other); - set_bit(STRIPE_COMPUTE_RUN, &sh->state); - set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); - set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); - set_bit(R5_Wantcompute, &sh->dev[other].flags); - sh->ops.target = disk_idx; - sh->ops.target2 = other; - s->uptodate += 2; - s->req_compute = 1; - return 1; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", - disk_idx, s->syncing); - } - } - - return 0; -} - -/** - * handle_stripe_fill - read or compute data to satisfy pending requests. - */ -static void handle_stripe_fill(struct stripe_head *sh, - struct stripe_head_state *s, - int disks) -{ - int i; - - /* look for blocks to read/compute, skip this if a compute - * is already in flight, or if the stripe contents are in the - * midst of changing due to a write - */ - if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !sh->reconstruct_state) - for (i = disks; i--; ) - if (fetch_block(sh, s, i, disks)) - break; - set_bit(STRIPE_HANDLE, &sh->state); -} - - -/* handle_stripe_clean_event - * any written block on an uptodate or failed drive can be returned. - * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but - * never LOCKED, so we don't need to test 'failed' directly. 
- */ -static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio **return_bi) -{ - int i; - struct r5dev *dev; - - for (i = disks; i--; ) - if (sh->dev[i].written) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) { - /* We can return any write requests */ - struct bio *wbi, *wbi2; - int bitmap_end = 0; - pr_debug("Return write for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - wbi = dev->written; - dev->written = NULL; - while (wbi && wbi->bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); - if (!raid5_dec_bi_phys_segments(wbi)) { - md_write_end(conf->mddev); - wbi->bi_next = *return_bi; - *return_bi = wbi; - } - wbi = wbi2; - } - if (dev->towrite == NULL) - bitmap_end = 1; - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, - sh->sector, - STRIPE_SECTORS, - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); - } - } - - if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) - if (atomic_dec_and_test(&conf->pending_full_writes)) - md_wakeup_thread(conf->mddev->thread); -} - -static void handle_stripe_dirtying(struct r5conf *conf, - struct stripe_head *sh, - struct stripe_head_state *s, - int disks) -{ - int rmw = 0, rcw = 0, i; - if (conf->max_degraded == 2) { - /* RAID6 requires 'rcw' in current implementation - * Calculate the real rcw later - for now fake it - * look like rcw is cheaper - */ - rcw = 1; rmw = 2; - } else for (i = disks; i--; ) { - /* would I have to read this buffer for read_modify_write */ - struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && - !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - if (test_bit(R5_Insync, &dev->flags)) - rmw++; - else - rmw += 2*disks; /* cannot read it */ - } - /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && - !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; - else - rcw += 2*disks; - } - } - pr_debug("for sector %llu, rmw=%d rcw=%d\n", - (unsigned long long)sh->sector, rmw, rcw); - set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) - /* prefer read-modify-write, but need to get some data */ - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && - !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags)) && - test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old block " - "%d for r-m-w\n", i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - } else { - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } - if (rcw <= rmw && rcw > 0) { - /* want reconstruct write, but need to get some data */ - rcw = 0; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_OVERWRITE, &dev->flags) && - i != sh->pd_idx && i != sh->qd_idx && - !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - rcw++; - if (!test_bit(R5_Insync, &dev->flags)) - continue; /* it's a failed drive */ - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - 
pr_debug("Read_old block " - "%d for Reconstruct\n", i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - } else { - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } - } - /* now if nothing is locked, and if we have enough data, - * we can start a write request - */ - /* since handle_stripe can be called at any time we need to handle the - * case where a compute block operation has been submitted and then a - * subsequent call wants to start a write request. raid_run_ops only - * handles the case where compute block and reconstruct are requested - * simultaneously. If this is not the case then new writes need to be - * held off until the compute completes. - */ - if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && - (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) - schedule_reconstruction(sh, s, rcw == 0, 0); -} - -static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks) -{ - struct r5dev *dev = NULL; - - set_bit(STRIPE_HANDLE, &sh->state); - - switch (sh->check_state) { - case check_state_idle: - /* start a new check operation if there are no failures */ - if (s->failed == 0) { - BUG_ON(s->uptodate != disks); - sh->check_state = check_state_run; - set_bit(STRIPE_OP_CHECK, &s->ops_request); - clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - s->uptodate--; - break; - } - dev = &sh->dev[s->failed_num[0]]; - /* fall through */ - case check_state_compute_result: - sh->check_state = check_state_idle; - if (!dev) - dev = &sh->dev[sh->pd_idx]; - - /* check that a write has not made the stripe insync */ - if (test_bit(STRIPE_INSYNC, &sh->state)) - break; - - /* either failed parity check, or recovery is happening */ - BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); - BUG_ON(s->uptodate != disks); - - set_bit(R5_LOCKED, &dev->flags); - s->locked++; - set_bit(R5_Wantwrite, &dev->flags); - - clear_bit(STRIPE_DEGRADED, &sh->state); - set_bit(STRIPE_INSYNC, &sh->state); - break; - case check_state_run: - break; /* we will be called again upon completion */ - case check_state_check_result: - sh->check_state = check_state_idle; - - /* if a failure occurred during the check operation, leave - * STRIPE_INSYNC not set and let the stripe be handled again - */ - if (s->failed) - break; - - /* handle a successful check operation, if parity is correct - * we are done. Otherwise update the mismatch count and repair - * parity if !MD_RECOVERY_CHECK - */ - if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! 
*/ - set_bit(STRIPE_INSYNC, &sh->state); - else { - sh->check_state = check_state_compute_run; - set_bit(STRIPE_COMPUTE_RUN, &sh->state); - set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.target2 = -1; - s->uptodate++; - } - } - break; - case check_state_compute_run: - break; - default: - printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", - __func__, sh->check_state, - (unsigned long long) sh->sector); - BUG(); - } -} - - -static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, - int disks) -{ - int pd_idx = sh->pd_idx; - int qd_idx = sh->qd_idx; - struct r5dev *dev; - - set_bit(STRIPE_HANDLE, &sh->state); - - BUG_ON(s->failed > 2); - - /* Want to check and possibly repair P and Q. - * However there could be one 'failed' device, in which - * case we can only check one of them, possibly using the - * other to generate missing data - */ - - switch (sh->check_state) { - case check_state_idle: - /* start a new check operation if there are < 2 failures */ - if (s->failed == s->q_failed) { - /* The only possible failed device holds Q, so it - * makes sense to check P (If anything else were failed, - * we would have used P to recreate it). - */ - sh->check_state = check_state_run; - } - if (!s->q_failed && s->failed < 2) { - /* Q is not failed, and we didn't use it to generate - * anything, so it makes sense to check it - */ - if (sh->check_state == check_state_run) - sh->check_state = check_state_run_pq; - else - sh->check_state = check_state_run_q; - } - - /* discard potentially stale zero_sum_result */ - sh->ops.zero_sum_result = 0; - - if (sh->check_state == check_state_run) { - /* async_xor_zero_sum destroys the contents of P */ - clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - s->uptodate--; - } - if (sh->check_state >= check_state_run && - sh->check_state <= check_state_run_pq) { - /* async_syndrome_zero_sum preserves P and Q, so - * no need to mark them !uptodate here - */ - set_bit(STRIPE_OP_CHECK, &s->ops_request); - break; - } - - /* we have 2-disk failure */ - BUG_ON(s->failed != 2); - /* fall through */ - case check_state_compute_result: - sh->check_state = check_state_idle; - - /* check that a write has not made the stripe insync */ - if (test_bit(STRIPE_INSYNC, &sh->state)) - break; - - /* now write out any block on a failed drive, - * or P or Q if they were recomputed - */ - BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ - if (s->failed == 2) { - dev = &sh->dev[s->failed_num[1]]; - s->locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (s->failed >= 1) { - dev = &sh->dev[s->failed_num[0]]; - s->locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { - dev = &sh->dev[pd_idx]; - s->locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { - dev = &sh->dev[qd_idx]; - s->locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - clear_bit(STRIPE_DEGRADED, &sh->state); - - set_bit(STRIPE_INSYNC, &sh->state); - break; - case check_state_run: - case check_state_run_q: - case check_state_run_pq: - break; /* we will be called again upon completion */ - case check_state_check_result: - sh->check_state = check_state_idle; - - /* handle a successful check operation, if parity 
is correct - * we are done. Otherwise update the mismatch count and repair - * parity if !MD_RECOVERY_CHECK - */ - if (sh->ops.zero_sum_result == 0) { - /* both parities are correct */ - if (!s->failed) - set_bit(STRIPE_INSYNC, &sh->state); - else { - /* in contrast to the raid5 case we can validate - * parity, but still have a failure to write - * back - */ - sh->check_state = check_state_compute_result; - /* Returning at this point means that we may go - * off and bring p and/or q uptodate again so - * we make sure to check zero_sum_result again - * to verify if p or q need writeback - */ - } - } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - int *target = &sh->ops.target; - - sh->ops.target = -1; - sh->ops.target2 = -1; - sh->check_state = check_state_compute_run; - set_bit(STRIPE_COMPUTE_RUN, &sh->state); - set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); - if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { - set_bit(R5_Wantcompute, - &sh->dev[pd_idx].flags); - *target = pd_idx; - target = &sh->ops.target2; - s->uptodate++; - } - if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { - set_bit(R5_Wantcompute, - &sh->dev[qd_idx].flags); - *target = qd_idx; - s->uptodate++; - } - } - } - break; - case check_state_compute_run: - break; - default: - printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", - __func__, sh->check_state, - (unsigned long long) sh->sector); - BUG(); - } -} - -static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) -{ - int i; - - /* We have read all the blocks in this stripe and now we need to - * copy some of them into a target stripe for expand. - */ - struct dma_async_tx_descriptor *tx = NULL; - clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); - for (i = 0; i < sh->disks; i++) - if (i != sh->pd_idx && i != sh->qd_idx) { - int dd_idx, j; - struct stripe_head *sh2; - struct async_submit_ctl submit; - - sector_t bn = compute_blocknr(sh, i, 1); - sector_t s = raid5_compute_sector(conf, bn, 0, - &dd_idx, NULL); - sh2 = get_active_stripe(conf, s, 0, 1, 1); - if (sh2 == NULL) - /* so far only the early blocks of this stripe - * have been requested. When later blocks - * get requested, we will try again - */ - continue; - if (!test_bit(STRIPE_EXPANDING, &sh2->state) || - test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { - /* must have already done this block */ - release_stripe(sh2); - continue; - } - - /* place all the copies on one channel */ - init_async_submit(&submit, 0, tx, NULL, NULL, NULL); - tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, - &submit); - - set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); - set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); - for (j = 0; j < conf->raid_disks; j++) - if (j != sh2->pd_idx && - j != sh2->qd_idx && - !test_bit(R5_Expanded, &sh2->dev[j].flags)) - break; - if (j == conf->raid_disks) { - set_bit(STRIPE_EXPAND_READY, &sh2->state); - set_bit(STRIPE_HANDLE, &sh2->state); - } - release_stripe(sh2); - - } - /* done submitting copies, wait for them to complete */ - if (tx) { - async_tx_ack(tx); - dma_wait_for_async_tx(tx); - } -} - -/* - * handle_stripe - do things to a stripe. - * - * We lock the stripe by setting STRIPE_ACTIVE and then examine the - * state of various bits to see what needs to be done. 
- * Possible results: - * return some read requests which now have data - * return some write requests which are safely on storage - * schedule a read on some buffers - * schedule a write of some buffers - * return confirmation of parity correctness - * - */ - -static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) -{ - struct r5conf *conf = sh->raid_conf; - int disks = sh->disks; - struct r5dev *dev; - int i; - int do_recovery = 0; - - memset(s, 0, sizeof(*s)); - - s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - s->failed_num[0] = -1; - s->failed_num[1] = -1; - - /* Now to look around and see what can be done */ - rcu_read_lock(); - spin_lock_irq(&conf->device_lock); - for (i=disks; i--; ) { - struct md_rdev *rdev; - sector_t first_bad; - int bad_sectors; - int is_bad = 0; - - dev = &sh->dev[i]; - - pr_debug("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, - dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read - * - * new wantfill requests are only permitted while - * ops_complete_biofill is guaranteed to be inactive - */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) - set_bit(R5_Wantfill, &dev->flags); - - /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) - s->locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) - s->uptodate++; - if (test_bit(R5_Wantcompute, &dev->flags)) { - s->compute++; - BUG_ON(s->compute > 2); - } - - if (test_bit(R5_Wantfill, &dev->flags)) - s->to_fill++; - else if (dev->toread) - s->to_read++; - if (dev->towrite) { - s->to_write++; - if (!test_bit(R5_OVERWRITE, &dev->flags)) - s->non_overwrite++; - } - if (dev->written) - s->written++; - /* Prefer to use the replacement for reads, but only - * if it is recovered enough and has no bad blocks. - */ - rdev = rcu_dereference(conf->disks[i].replacement); - if (rdev && !test_bit(Faulty, &rdev->flags) && - rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && - !is_badblock(rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors)) - set_bit(R5_ReadRepl, &dev->flags); - else { - if (rdev) - set_bit(R5_NeedReplace, &dev->flags); - rdev = rcu_dereference(conf->disks[i].rdev); - clear_bit(R5_ReadRepl, &dev->flags); - } - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) { - is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors); - if (s->blocked_rdev == NULL - && (test_bit(Blocked, &rdev->flags) - || is_bad < 0)) { - if (is_bad < 0) - set_bit(BlockedBadBlocks, - &rdev->flags); - s->blocked_rdev = rdev; - atomic_inc(&rdev->nr_pending); - } - } - clear_bit(R5_Insync, &dev->flags); - if (!rdev) - /* Not in-sync */; - else if (is_bad) { - /* also not in-sync */ - if (!test_bit(WriteErrorSeen, &rdev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) { - /* treat as in-sync, but with a read error - * which we can now try to correct - */ - set_bit(R5_Insync, &dev->flags); - set_bit(R5_ReadError, &dev->flags); - } - } else if (test_bit(In_sync, &rdev->flags)) - set_bit(R5_Insync, &dev->flags); - else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) - /* in sync if before recovery_offset */ - set_bit(R5_Insync, &dev->flags); - else if (test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Expanded, &dev->flags)) - /* If we've reshaped into here, we assume it is Insync. - * We will shortly update recovery_offset to make - * it official. 
- */ - set_bit(R5_Insync, &dev->flags); - - if (rdev && test_bit(R5_WriteError, &dev->flags)) { - /* This flag does not apply to '.replacement' - * only to .rdev, so make sure to check that*/ - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].rdev); - if (rdev2 == rdev) - clear_bit(R5_Insync, &dev->flags); - if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { - s->handle_bad_blocks = 1; - atomic_inc(&rdev2->nr_pending); - } else - clear_bit(R5_WriteError, &dev->flags); - } - if (rdev && test_bit(R5_MadeGood, &dev->flags)) { - /* This flag does not apply to '.replacement' - * only to .rdev, so make sure to check that*/ - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].rdev); - if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { - s->handle_bad_blocks = 1; - atomic_inc(&rdev2->nr_pending); - } else - clear_bit(R5_MadeGood, &dev->flags); - } - if (test_bit(R5_MadeGoodRepl, &dev->flags)) { - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].replacement); - if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { - s->handle_bad_blocks = 1; - atomic_inc(&rdev2->nr_pending); - } else - clear_bit(R5_MadeGoodRepl, &dev->flags); - } - if (!test_bit(R5_Insync, &dev->flags)) { - /* The ReadError flag will just be confusing now */ - clear_bit(R5_ReadError, &dev->flags); - clear_bit(R5_ReWrite, &dev->flags); - } - if (test_bit(R5_ReadError, &dev->flags)) - clear_bit(R5_Insync, &dev->flags); - if (!test_bit(R5_Insync, &dev->flags)) { - if (s->failed < 2) - s->failed_num[s->failed] = i; - s->failed++; - if (rdev && !test_bit(Faulty, &rdev->flags)) - do_recovery = 1; - } - } - spin_unlock_irq(&conf->device_lock); - if (test_bit(STRIPE_SYNCING, &sh->state)) { - /* If there is a failed device being replaced, - * we must be recovering. - * else if we are after recovery_cp, we must be syncing - * else if MD_RECOVERY_REQUESTED is set, we also are syncing. - * else we can only be replacing - * sync and recovery both need to read all devices, and so - * use the same flag. 
- */ - if (do_recovery || - sh->sector >= conf->mddev->recovery_cp || - test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) - s->syncing = 1; - else - s->replacing = 1; - } - rcu_read_unlock(); -} - -static void handle_stripe(struct stripe_head *sh) -{ - struct stripe_head_state s; - struct r5conf *conf = sh->raid_conf; - int i; - int prexor; - int disks = sh->disks; - struct r5dev *pdev, *qdev; - - clear_bit(STRIPE_HANDLE, &sh->state); - if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { - /* already being handled, ensure it gets handled - * again when current action finishes */ - set_bit(STRIPE_HANDLE, &sh->state); - return; - } - - if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { - set_bit(STRIPE_SYNCING, &sh->state); - clear_bit(STRIPE_INSYNC, &sh->state); - } - clear_bit(STRIPE_DELAYED, &sh->state); - - pr_debug("handling stripe %llu, state=%#lx cnt=%d, " - "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", - (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, - sh->check_state, sh->reconstruct_state); - - analyse_stripe(sh, &s); - - if (s.handle_bad_blocks) { - set_bit(STRIPE_HANDLE, &sh->state); - goto finish; - } - - if (unlikely(s.blocked_rdev)) { - if (s.syncing || s.expanding || s.expanded || - s.replacing || s.to_write || s.written) { - set_bit(STRIPE_HANDLE, &sh->state); - goto finish; - } - /* There is nothing for the blocked_rdev to block */ - rdev_dec_pending(s.blocked_rdev, conf->mddev); - s.blocked_rdev = NULL; - } - - if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { - set_bit(STRIPE_OP_BIOFILL, &s.ops_request); - set_bit(STRIPE_BIOFILL_RUN, &sh->state); - } - - pr_debug("locked=%d uptodate=%d to_read=%d" - " to_write=%d failed=%d failed_num=%d,%d\n", - s.locked, s.uptodate, s.to_read, s.to_write, s.failed, - s.failed_num[0], s.failed_num[1]); - /* check if the array has lost more than max_degraded devices and, - * if so, some requests might need to be failed. - */ - if (s.failed > conf->max_degraded) { - sh->check_state = 0; - sh->reconstruct_state = 0; - if (s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); - if (s.syncing + s.replacing) - handle_failed_sync(conf, sh, &s); - } - - /* - * might be able to return some write requests if the parity blocks - * are safe, or on a failed drive - */ - pdev = &sh->dev[sh->pd_idx]; - s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); - qdev = &sh->dev[sh->qd_idx]; - s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) - || conf->level < 6; - - if (s.written && - (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) - && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags)))) && - (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) - && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); - - /* Now we might consider reading some blocks, either to check/generate - * parity, or to satisfy requests - * or to load a block that is being partially written. 
- */ - if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) - || (s.syncing && (s.uptodate + s.compute < disks)) - || s.replacing - || s.expanding) - handle_stripe_fill(sh, &s, disks); - - /* Now we check to see if any write operations have recently - * completed - */ - prexor = 0; - if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) - prexor = 1; - if (sh->reconstruct_state == reconstruct_state_drain_result || - sh->reconstruct_state == reconstruct_state_prexor_drain_result) { - sh->reconstruct_state = reconstruct_state_idle; - - /* All the 'written' buffers and the parity block are ready to - * be written back to disk - */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); - BUG_ON(sh->qd_idx >= 0 && - !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (test_bit(R5_LOCKED, &dev->flags) && - (i == sh->pd_idx || i == sh->qd_idx || - dev->written)) { - pr_debug("Writing block %d\n", i); - set_bit(R5_Wantwrite, &dev->flags); - if (prexor) - continue; - if (!test_bit(R5_Insync, &dev->flags) || - ((i == sh->pd_idx || i == sh->qd_idx) && - s.failed == 0)) - set_bit(STRIPE_INSYNC, &sh->state); - } - } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - s.dec_preread_active = 1; - } - - /* Now to consider new write requests and what else, if anything - * should be read. We do not handle new writes when: - * 1/ A 'write' operation (copy+xor) is already in flight. - * 2/ A 'check' operation is in flight, as it may clobber the parity - * block. - */ - if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_stripe_dirtying(conf, sh, &s, disks); - - /* maybe we need to check and possibly fix the parity for this stripe - * Any reads will already have been scheduled, so we just see if enough - * data is available. The parity check is held off while parity - * dependent operations are in flight. 
- */ - if (sh->check_state || - (s.syncing && s.locked == 0 && - !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && - !test_bit(STRIPE_INSYNC, &sh->state))) { - if (conf->level == 6) - handle_parity_checks6(conf, sh, &s, disks); - else - handle_parity_checks5(conf, sh, &s, disks); - } - - if (s.replacing && s.locked == 0 - && !test_bit(STRIPE_INSYNC, &sh->state)) { - /* Write out to replacement devices where possible */ - for (i = 0; i < conf->raid_disks; i++) - if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && - test_bit(R5_NeedReplace, &sh->dev[i].flags)) { - set_bit(R5_WantReplace, &sh->dev[i].flags); - set_bit(R5_LOCKED, &sh->dev[i].flags); - s.locked++; - } - set_bit(STRIPE_INSYNC, &sh->state); - } - if ((s.syncing || s.replacing) && s.locked == 0 && - test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); - clear_bit(STRIPE_SYNCING, &sh->state); - } - - /* If the failed drives are just a ReadError, then we might need - * to progress the repair/check process - */ - if (s.failed <= conf->max_degraded && !conf->mddev->ro) - for (i = 0; i < s.failed; i++) { - struct r5dev *dev = &sh->dev[s.failed_num[i]]; - if (test_bit(R5_ReadError, &dev->flags) - && !test_bit(R5_LOCKED, &dev->flags) - && test_bit(R5_UPTODATE, &dev->flags) - ) { - if (!test_bit(R5_ReWrite, &dev->flags)) { - set_bit(R5_Wantwrite, &dev->flags); - set_bit(R5_ReWrite, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } else { - /* let's read it back */ - set_bit(R5_Wantread, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } - } - } - - - /* Finish reconstruct operations initiated by the expansion process */ - if (sh->reconstruct_state == reconstruct_state_result) { - struct stripe_head *sh_src - = get_active_stripe(conf, sh->sector, 1, 1, 1); - if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { - /* sh cannot be written until sh_src has been read. 
- * so arrange for sh to be delayed a little - */ - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, - &sh_src->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe(sh_src); - goto finish; - } - if (sh_src) - release_stripe(sh_src); - - sh->reconstruct_state = reconstruct_state_idle; - clear_bit(STRIPE_EXPANDING, &sh->state); - for (i = conf->raid_disks; i--; ) { - set_bit(R5_Wantwrite, &sh->dev[i].flags); - set_bit(R5_LOCKED, &sh->dev[i].flags); - s.locked++; - } - } - - if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !sh->reconstruct_state) { - /* Need to write out all blocks after computing parity */ - sh->disks = conf->raid_disks; - stripe_set_idx(sh->sector, conf, 0, sh); - schedule_reconstruction(sh, &s, 1, 1); - } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { - clear_bit(STRIPE_EXPAND_READY, &sh->state); - atomic_dec(&conf->reshape_stripes); - wake_up(&conf->wait_for_overlap); - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); - } - - if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) - handle_stripe_expansion(conf, sh); - -finish: - /* wait for this device to become unblocked */ - if (conf->mddev->external && unlikely(s.blocked_rdev)) - md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); - - if (s.handle_bad_blocks) - for (i = disks; i--; ) { - struct md_rdev *rdev; - struct r5dev *dev = &sh->dev[i]; - if (test_and_clear_bit(R5_WriteError, &dev->flags)) { - /* We own a safe reference to the rdev */ - rdev = conf->disks[i].rdev; - if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - md_error(conf->mddev, rdev); - rdev_dec_pending(rdev, conf->mddev); - } - if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { - rdev = conf->disks[i].rdev; - rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); - rdev_dec_pending(rdev, conf->mddev); - } - if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { - rdev = conf->disks[i].replacement; - if (!rdev) - /* rdev have been moved down */ - rdev = conf->disks[i].rdev; - rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); - rdev_dec_pending(rdev, conf->mddev); - } - } - - if (s.ops_request) - raid_run_ops(sh, s.ops_request); - - ops_run_io(sh, &s); - - if (s.dec_preread_active) { - /* We delay this until after ops_run_io so that if make_request - * is waiting on a flush, it won't continue until the writes - * have actually been submitted. 
- */ - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - - return_io(s.return_bi); - - clear_bit_unlock(STRIPE_ACTIVE, &sh->state); -} - -static void raid5_activate_delayed(struct r5conf *conf) -{ - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { - while (!list_empty(&conf->delayed_list)) { - struct list_head *l = conf->delayed_list.next; - struct stripe_head *sh; - sh = list_entry(l, struct stripe_head, lru); - list_del_init(l); - clear_bit(STRIPE_DELAYED, &sh->state); - if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - list_add_tail(&sh->lru, &conf->hold_list); - } - } -} - -static void activate_bit_delay(struct r5conf *conf) -{ - /* device_lock is held */ - struct list_head head; - list_add(&head, &conf->bitmap_list); - list_del_init(&conf->bitmap_list); - while (!list_empty(&head)) { - struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); - list_del_init(&sh->lru); - atomic_inc(&sh->count); - __release_stripe(conf, sh); - } -} - -int md_raid5_congested(struct mddev *mddev, int bits) -{ - struct r5conf *conf = mddev->private; - - /* No difference between reads and writes. Just check - * how busy the stripe_cache is - */ - - if (conf->inactive_blocked) - return 1; - if (conf->quiesce) - return 1; - if (list_empty_careful(&conf->inactive_list)) - return 1; - - return 0; -} -EXPORT_SYMBOL_GPL(md_raid5_congested); - -static int raid5_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid5_congested(mddev, bits); -} - -/* We want read requests to align with chunks where possible, - * but write requests don't need to. - */ -static int raid5_mergeable_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mddev *mddev = q->queuedata; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max; - unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bvm->bi_size >> 9; - - if ((bvm->bi_rw & 1) == WRITE) - return biovec->bv_len; /* always allow writes to be mergeable */ - - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - else - return max; -} - - -static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) -{ - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bio->bi_size >> 9; - - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; - return chunk_sectors >= - ((sector & (chunk_sectors - 1)) + bio_sectors); -} - -/* - * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) - * later sampled by raid5d. 
- */ -static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) -{ - unsigned long flags; - - spin_lock_irqsave(&conf->device_lock, flags); - - bi->bi_next = conf->retry_read_aligned_list; - conf->retry_read_aligned_list = bi; - - spin_unlock_irqrestore(&conf->device_lock, flags); - md_wakeup_thread(conf->mddev->thread); -} - - -static struct bio *remove_bio_from_retry(struct r5conf *conf) -{ - struct bio *bi; - - bi = conf->retry_read_aligned; - if (bi) { - conf->retry_read_aligned = NULL; - return bi; - } - bi = conf->retry_read_aligned_list; - if(bi) { - conf->retry_read_aligned_list = bi->bi_next; - bi->bi_next = NULL; - /* - * this sets the active strip count to 1 and the processed - * strip count to zero (upper 8 bits) - */ - bi->bi_phys_segments = 1; /* biased count of active stripes */ - } - - return bi; -} - - -/* - * The "raid5_align_endio" should check if the read succeeded and if it - * did, call bio_endio on the original bio (having bio_put the new bio - * first). - * If the read failed.. - */ -static void raid5_align_endio(struct bio *bi, int error) -{ - struct bio* raid_bi = bi->bi_private; - struct mddev *mddev; - struct r5conf *conf; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); - struct md_rdev *rdev; - - bio_put(bi); - - rdev = (void*)raid_bi->bi_next; - raid_bi->bi_next = NULL; - mddev = rdev->mddev; - conf = mddev->private; - - rdev_dec_pending(rdev, conf->mddev); - - if (!error && uptodate) { - bio_endio(raid_bi, 0); - if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); - return; - } - - - pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); - - add_bio_to_retry(raid_bi, conf); -} - -static int bio_fits_rdev(struct bio *bi) -{ - struct request_queue *q = bdev_get_queue(bi->bi_bdev); - - if ((bi->bi_size>>9) > queue_max_sectors(q)) - return 0; - blk_recount_segments(q, bi); - if (bi->bi_phys_segments > queue_max_segments(q)) - return 0; - - if (q->merge_bvec_fn) - /* it's too hard to apply the merge_bvec_fn at this stage, - * just just give up - */ - return 0; - - return 1; -} - - -static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) -{ - struct r5conf *conf = mddev->private; - int dd_idx; - struct bio* align_bi; - struct md_rdev *rdev; - sector_t end_sector; - - if (!in_chunk_boundary(mddev, raid_bio)) { - pr_debug("chunk_aligned_read : non aligned\n"); - return 0; - } - /* - * use bio_clone_mddev to make a copy of the bio - */ - align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); - if (!align_bi) - return 0; - /* - * set bi_end_io to a new function, and set bi_private to the - * original bio. 
- */ - align_bi->bi_end_io = raid5_align_endio; - align_bi->bi_private = raid_bio; - /* - * compute position - */ - align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, - 0, - &dd_idx, NULL); - - end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[dd_idx].replacement); - if (!rdev || test_bit(Faulty, &rdev->flags) || - rdev->recovery_offset < end_sector) { - rdev = rcu_dereference(conf->disks[dd_idx].rdev); - if (rdev && - (test_bit(Faulty, &rdev->flags) || - !(test_bit(In_sync, &rdev->flags) || - rdev->recovery_offset >= end_sector))) - rdev = NULL; - } - if (rdev) { - sector_t first_bad; - int bad_sectors; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - raid_bio->bi_next = (void*)rdev; - align_bi->bi_bdev = rdev->bdev; - align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); - - if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, - &first_bad, &bad_sectors)) { - /* too big in some way, or has a known bad block */ - bio_put(align_bi); - rdev_dec_pending(rdev, mddev); - return 0; - } - - /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_sector += rdev->data_offset; - - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, - conf->quiesce == 0, - conf->device_lock, /* nothing */); - atomic_inc(&conf->active_aligned_reads); - spin_unlock_irq(&conf->device_lock); - - generic_make_request(align_bi); - return 1; - } else { - rcu_read_unlock(); - bio_put(align_bi); - return 0; - } -} - -/* __get_priority_stripe - get the next stripe to process - * - * Full stripe writes are allowed to pass preread active stripes up until - * the bypass_threshold is exceeded. In general the bypass_count - * increments when the handle_list is handled before the hold_list; however, it - * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a - * stripe with in flight i/o. The bypass_count will be reset when the - * head of the hold_list has changed, i.e. the head was promoted to the - * handle_list. - */ -static struct stripe_head *__get_priority_stripe(struct r5conf *conf) -{ - struct stripe_head *sh; - - pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", - __func__, - list_empty(&conf->handle_list) ? "empty" : "busy", - list_empty(&conf->hold_list) ? 
"empty" : "busy", - atomic_read(&conf->pending_full_writes), conf->bypass_count); - - if (!list_empty(&conf->handle_list)) { - sh = list_entry(conf->handle_list.next, typeof(*sh), lru); - - if (list_empty(&conf->hold_list)) - conf->bypass_count = 0; - else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { - if (conf->hold_list.next == conf->last_hold) - conf->bypass_count++; - else { - conf->last_hold = conf->hold_list.next; - conf->bypass_count -= conf->bypass_threshold; - if (conf->bypass_count < 0) - conf->bypass_count = 0; - } - } - } else if (!list_empty(&conf->hold_list) && - ((conf->bypass_threshold && - conf->bypass_count > conf->bypass_threshold) || - atomic_read(&conf->pending_full_writes) == 0)) { - sh = list_entry(conf->hold_list.next, - typeof(*sh), lru); - conf->bypass_count -= conf->bypass_threshold; - if (conf->bypass_count < 0) - conf->bypass_count = 0; - } else - return NULL; - - list_del_init(&sh->lru); - atomic_inc(&sh->count); - BUG_ON(atomic_read(&sh->count) != 1); - return sh; -} - -static void make_request(struct mddev *mddev, struct bio * bi) -{ - struct r5conf *conf = mddev->private; - int dd_idx; - sector_t new_sector; - sector_t logical_sector, last_sector; - struct stripe_head *sh; - const int rw = bio_data_dir(bi); - int remaining; - int plugged; - - if (unlikely(bi->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bi); - return; - } - - md_write_start(mddev, bi); - - if (rw == READ && - mddev->reshape_position == MaxSector && - chunk_aligned_read(mddev,bi)) - return; - - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); - bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - - plugged = mddev_check_plugged(mddev); - for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { - DEFINE_WAIT(w); - int disks, data_disks; - int previous; - - retry: - previous = 0; - disks = conf->raid_disks; - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (unlikely(conf->reshape_progress != MaxSector)) { - /* spinlock is needed as reshape_progress may be - * 64bit on a 32bit platform, and so it might be - * possible to see a half-updated value - * Of course reshape_progress could change after - * the lock is dropped, so once we get a reference - * to the stripe that we think it is, we will have - * to check again. - */ - spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0 - ? logical_sector < conf->reshape_progress - : logical_sector >= conf->reshape_progress) { - disks = conf->previous_raid_disks; - previous = 1; - } else { - if (mddev->delta_disks < 0 - ? logical_sector < conf->reshape_safe - : logical_sector >= conf->reshape_safe) { - spin_unlock_irq(&conf->device_lock); - schedule(); - goto retry; - } - } - spin_unlock_irq(&conf->device_lock); - } - data_disks = disks - conf->max_degraded; - - new_sector = raid5_compute_sector(conf, logical_sector, - previous, - &dd_idx, NULL); - pr_debug("raid456: make_request, sector %llu logical %llu\n", - (unsigned long long)new_sector, - (unsigned long long)logical_sector); - - sh = get_active_stripe(conf, new_sector, previous, - (bi->bi_rw&RWA_MASK), 0); - if (sh) { - if (unlikely(previous)) { - /* expansion might have moved on while waiting for a - * stripe, so we must do the range check again. 
- * Expansion could still move past after this - * test, but as we are holding a reference to - * 'sh', we know that if that happens, - * STRIPE_EXPANDING will get set and the expansion - * won't proceed until we finish with the stripe. - */ - int must_retry = 0; - spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0 - ? logical_sector >= conf->reshape_progress - : logical_sector < conf->reshape_progress) - /* mismatch, need to try again */ - must_retry = 1; - spin_unlock_irq(&conf->device_lock); - if (must_retry) { - release_stripe(sh); - schedule(); - goto retry; - } - } - - if (rw == WRITE && - logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - release_stripe(sh); - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - flush_signals(current); - prepare_to_wait(&conf->wait_for_overlap, - &w, TASK_INTERRUPTIBLE); - if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) - schedule(); - goto retry; - } - - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw)) { - /* Stripe is busy expanding or - * add failed due to overlap. Flush everything - * and wait a while - */ - md_wakeup_thread(mddev->thread); - release_stripe(sh); - schedule(); - goto retry; - } - finish_wait(&conf->wait_for_overlap, &w); - set_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - if ((bi->bi_rw & REQ_SYNC) && - !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe(sh); - } else { - /* cannot get stripe for read-ahead, just give-up */ - clear_bit(BIO_UPTODATE, &bi->bi_flags); - finish_wait(&conf->wait_for_overlap, &w); - break; - } - - } - if (!plugged) - md_wakeup_thread(mddev->thread); - - spin_lock_irq(&conf->device_lock); - remaining = raid5_dec_bi_phys_segments(bi); - spin_unlock_irq(&conf->device_lock); - if (remaining == 0) { - - if ( rw == WRITE ) - md_write_end(mddev); - - bio_endio(bi, 0); - } -} - -static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); - -static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) -{ - /* reshaping is quite different to recovery/resync so it is - * handled quite separately ... here. - * - * On each call to sync_request, we gather one chunk worth of - * destination stripes and flag them as expanding. - * Then we find all the source stripes and request reads. - * As the reads complete, handle_stripe will copy the data - * into the destination stripe and release that stripe. 
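To make the reshape description above concrete: each call works on one chunk's worth of destination stripes, so with an assumed 64K chunk and 4K stripe pages a pass expands 16 stripe_heads, and reshape_progress then advances by reshape_sectors multiplied by the new data-disk count. The following standalone sketch only restates that arithmetic; all geometry values are assumptions chosen for illustration.

#include <stdio.h>

int main(void)
{
    unsigned int stripe_sectors = 8;        /* one 4K stripe page in 512B sectors */
    unsigned int chunk_sectors = 128;       /* assumed 64K chunk                   */
    int new_data_disks = 5;                 /* raid_disks - max_degraded           */

    /* reshape works on the larger of the old and new chunk sizes;
     * here both are 64K, so one pass covers one chunk per device. */
    unsigned int reshape_sectors = chunk_sectors;

    printf("destination stripe_heads per pass: %u\n",
           reshape_sectors / stripe_sectors);
    printf("array sectors reshaped per pass:  %u\n",
           reshape_sectors * new_data_disks);
    return 0;
}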
- */ - struct r5conf *conf = mddev->private; - struct stripe_head *sh; - sector_t first_sector, last_sector; - int raid_disks = conf->previous_raid_disks; - int data_disks = raid_disks - conf->max_degraded; - int new_data_disks = conf->raid_disks - conf->max_degraded; - int i; - int dd_idx; - sector_t writepos, readpos, safepos; - sector_t stripe_addr; - int reshape_sectors; - struct list_head stripes; - - if (sector_nr == 0) { - /* If restarting in the middle, skip the initial sectors */ - if (mddev->delta_disks < 0 && - conf->reshape_progress < raid5_size(mddev, 0, 0)) { - sector_nr = raid5_size(mddev, 0, 0) - - conf->reshape_progress; - } else if (mddev->delta_disks >= 0 && - conf->reshape_progress > 0) - sector_nr = conf->reshape_progress; - sector_div(sector_nr, new_data_disks); - if (sector_nr) { - mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - *skipped = 1; - return sector_nr; - } - } - - /* We need to process a full chunk at a time. - * If old and new chunk sizes differ, we need to process the - * largest of these - */ - if (mddev->new_chunk_sectors > mddev->chunk_sectors) - reshape_sectors = mddev->new_chunk_sectors; - else - reshape_sectors = mddev->chunk_sectors; - - /* we update the metadata when there is more than 3Meg - * in the block range (that is rather arbitrary, should - * probably be time based) or when the data about to be - * copied would over-write the source of the data at - * the front of the range. - * i.e. one new_stripe along from reshape_progress new_maps - * to after where reshape_safe old_maps to - */ - writepos = conf->reshape_progress; - sector_div(writepos, new_data_disks); - readpos = conf->reshape_progress; - sector_div(readpos, data_disks); - safepos = conf->reshape_safe; - sector_div(safepos, data_disks); - if (mddev->delta_disks < 0) { - writepos -= min_t(sector_t, reshape_sectors, writepos); - readpos += reshape_sectors; - safepos += reshape_sectors; - } else { - writepos += reshape_sectors; - readpos -= min_t(sector_t, reshape_sectors, readpos); - safepos -= min_t(sector_t, reshape_sectors, safepos); - } - - /* 'writepos' is the most advanced device address we might write. - * 'readpos' is the least advanced device address we might read. - * 'safepos' is the least address recorded in the metadata as having - * been reshaped. - * If 'readpos' is behind 'writepos', then there is no way that we can - * ensure safety in the face of a crash - that must be done by userspace - * making a backup of the data. So in that case there is no particular - * rush to update metadata. - * Otherwise if 'safepos' is behind 'writepos', then we really need to - * update the metadata to advance 'safepos' to match 'readpos' so that - * we can be safe in the event of a crash. - * So we insist on updating metadata if safepos is behind writepos and - * readpos is beyond writepos. - * In any case, update the metadata every 10 seconds. - * Maybe that number should be configurable, but I'm not sure it is - * worth it.... maybe it could be a multiple of safemode_delay??? - */ - if ((mddev->delta_disks < 0 - ? (safepos > writepos && readpos < writepos) - : (safepos < writepos && readpos > writepos)) || - time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { - /* Cannot proceed until we've updated the superblock... 
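The checkpoint rule reasoned through above can be read in isolation: the superblock is updated (by the code that follows) when safepos has fallen behind writepos while readpos has moved past it, with the comparison direction depending on whether the array is growing or shrinking, or simply on a roughly 10-second timer. Below is a minimal sketch of just that predicate; plain integers stand in for the sector_t fields, the names mirror the code, and the jiffies timeout is reduced to a flag.

#include <stdio.h>

/* Decide whether reshape_request() must checkpoint the metadata before
 * copying more data. 'growing' corresponds to delta_disks >= 0; the
 * 10-second jiffies test is represented by 'timer_expired'. */
static int need_checkpoint(int growing,
                           long long writepos, long long readpos,
                           long long safepos, int timer_expired)
{
    int unsafe;

    if (growing)
        /* growing: data is copied towards higher device addresses */
        unsafe = (safepos < writepos && readpos > writepos);
    else
        /* shrinking: data is copied towards lower device addresses */
        unsafe = (safepos > writepos && readpos < writepos);

    return unsafe || timer_expired;
}

int main(void)
{
    /* About to overwrite data the recorded 'safe' position does not yet
     * cover, and the read side is already past it: must checkpoint. */
    printf("%d\n", need_checkpoint(1, 2048, 2304, 1792, 0)); /* prints 1 */
    /* readpos still behind writepos: a crash is covered by the
     * user-space backup, so no urgent metadata update. */
    printf("%d\n", need_checkpoint(1, 2048, 1792, 1792, 0)); /* prints 0 */
    return 0;
}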
*/ - wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = sector_nr; - conf->reshape_checkpoint = jiffies; - set_bit(MD_CHANGE_DEVS, &mddev->flags); - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); - spin_lock_irq(&conf->device_lock); - conf->reshape_safe = mddev->reshape_position; - spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - } - - if (mddev->delta_disks < 0) { - BUG_ON(conf->reshape_progress == 0); - stripe_addr = writepos; - BUG_ON((mddev->dev_sectors & - ~((sector_t)reshape_sectors - 1)) - - reshape_sectors - stripe_addr - != sector_nr); - } else { - BUG_ON(writepos != sector_nr + reshape_sectors); - stripe_addr = sector_nr; - } - INIT_LIST_HEAD(&stripes); - for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { - int j; - int skipped_disk = 0; - sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); - set_bit(STRIPE_EXPANDING, &sh->state); - atomic_inc(&conf->reshape_stripes); - /* If any of this stripe is beyond the end of the old - * array, then we need to zero those blocks - */ - for (j=sh->disks; j--;) { - sector_t s; - if (j == sh->pd_idx) - continue; - if (conf->level == 6 && - j == sh->qd_idx) - continue; - s = compute_blocknr(sh, j, 0); - if (s < raid5_size(mddev, 0, 0)) { - skipped_disk = 1; - continue; - } - memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); - set_bit(R5_Expanded, &sh->dev[j].flags); - set_bit(R5_UPTODATE, &sh->dev[j].flags); - } - if (!skipped_disk) { - set_bit(STRIPE_EXPAND_READY, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - list_add(&sh->lru, &stripes); - } - spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0) - conf->reshape_progress -= reshape_sectors * new_data_disks; - else - conf->reshape_progress += reshape_sectors * new_data_disks; - spin_unlock_irq(&conf->device_lock); - /* Ok, those stripe are ready. We can start scheduling - * reads on the source stripes. - * The source stripes are determined by mapping the first and last - * block on the destination stripes. - */ - first_sector = - raid5_compute_sector(conf, stripe_addr*(new_data_disks), - 1, &dd_idx, NULL); - last_sector = - raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) - * new_data_disks - 1), - 1, &dd_idx, NULL); - if (last_sector >= mddev->dev_sectors) - last_sector = mddev->dev_sectors - 1; - while (first_sector <= last_sector) { - sh = get_active_stripe(conf, first_sector, 1, 0, 1); - set_bit(STRIPE_EXPAND_SOURCE, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); - first_sector += STRIPE_SECTORS; - } - /* Now that the sources are clearly marked, we can release - * the destination stripes - */ - while (!list_empty(&stripes)) { - sh = list_entry(stripes.next, struct stripe_head, lru); - list_del_init(&sh->lru); - release_stripe(sh); - } - /* If this takes us to the resync_max point where we have to pause, - * then we need to write out the superblock. - */ - sector_nr += reshape_sectors; - if ((sector_nr - mddev->curr_resync_completed) * 2 - >= mddev->resync_max - mddev->curr_resync_completed) { - /* Cannot proceed until we've updated the superblock... 
*/ - wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes) == 0); - mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = sector_nr; - conf->reshape_checkpoint = jiffies; - set_bit(MD_CHANGE_DEVS, &mddev->flags); - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_DEVS, &mddev->flags) - || kthread_should_stop()); - spin_lock_irq(&conf->device_lock); - conf->reshape_safe = mddev->reshape_position; - spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - } - return reshape_sectors; -} - -/* FIXME go_faster isn't used */ -static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) -{ - struct r5conf *conf = mddev->private; - struct stripe_head *sh; - sector_t max_sector = mddev->dev_sectors; - sector_t sync_blocks; - int still_degraded = 0; - int i; - - if (sector_nr >= max_sector) { - /* just being told to finish up .. nothing much to do */ - - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { - end_reshape(conf); - return 0; - } - - if (mddev->curr_resync < max_sector) /* aborted */ - bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); - else /* completed sync */ - conf->fullsync = 0; - bitmap_close_sync(mddev->bitmap); - - return 0; - } - - /* Allow raid5_quiesce to complete */ - wait_event(conf->wait_for_overlap, conf->quiesce != 2); - - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - return reshape_request(mddev, sector_nr, skipped); - - /* No need to check resync_max as we never do more than one - * stripe, and as resync_max will always be on a chunk boundary, - * if the check in md_do_sync didn't fire, there is no chance - * of overstepping resync_max here - */ - - /* if there is too many failed drives and we are trying - * to resync, then assert that we are finished, because there is - * nothing we can do. - */ - if (mddev->degraded >= conf->max_degraded && - test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - sector_t rv = mddev->dev_sectors - sector_nr; - *skipped = 1; - return rv; - } - if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && - !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { - /* we can skip this block, and probably more */ - sync_blocks /= STRIPE_SECTORS; - *skipped = 1; - return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ - } - - bitmap_cond_end_sync(mddev->bitmap, sector_nr); - - sh = get_active_stripe(conf, sector_nr, 0, 1, 0); - if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, 0, 0, 0); - /* make sure we don't swamp the stripe cache if someone else - * is trying to get access - */ - schedule_timeout_uninterruptible(1); - } - /* Need to check if array will still be degraded after recovery/resync - * We don't need to check the 'failed' flag as when that gets set, - * recovery aborts. - */ - for (i = 0; i < conf->raid_disks; i++) - if (conf->disks[i].rdev == NULL) - still_degraded = 1; - - bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); - - set_bit(STRIPE_SYNC_REQUESTED, &sh->state); - - handle_stripe(sh); - release_stripe(sh); - - return STRIPE_SECTORS; -} - -static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) -{ - /* We may not be able to submit a whole bio at once as there - * may not be enough stripe_heads available. 
- * We cannot pre-allocate enough stripe_heads as we may need - * more than exist in the cache (if we allow ever large chunks). - * So we do one stripe head at a time and record in - * ->bi_hw_segments how many have been done. - * - * We *know* that this entire raid_bio is in one chunk, so - * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. - */ - struct stripe_head *sh; - int dd_idx; - sector_t sector, logical_sector, last_sector; - int scnt = 0; - int remaining; - int handled = 0; - - logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - sector = raid5_compute_sector(conf, logical_sector, - 0, &dd_idx, NULL); - last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); - - for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS, - sector += STRIPE_SECTORS, - scnt++) { - - if (scnt < raid5_bi_hw_segments(raid_bio)) - /* already done this stripe */ - continue; - - sh = get_active_stripe(conf, sector, 0, 1, 0); - - if (!sh) { - /* failed to get a stripe - must wait */ - raid5_set_bi_hw_segments(raid_bio, scnt); - conf->retry_read_aligned = raid_bio; - return handled; - } - - if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { - release_stripe(sh); - raid5_set_bi_hw_segments(raid_bio, scnt); - conf->retry_read_aligned = raid_bio; - return handled; - } - - handle_stripe(sh); - release_stripe(sh); - handled++; - } - spin_lock_irq(&conf->device_lock); - remaining = raid5_dec_bi_phys_segments(raid_bio); - spin_unlock_irq(&conf->device_lock); - if (remaining == 0) - bio_endio(raid_bio, 0); - if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); - return handled; -} - - -/* - * This is our raid5 kernel thread. - * - * We scan the hash table for stripes which can be handled now. - * During the scan, completed stripes are saved for us by the interrupt - * handler, so that they will not have to wait for our next wakeup. 
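retry_aligned_read() above resumes an interrupted bio by recording, in the over-loaded ->bi_hw_segments field, how many stripes have already been handled, and returning early when a stripe_head cannot be obtained. The toy sketch below shows only that cursor-plus-early-return pattern; try_handle_unit, process and the budget counter are invented stand-ins, not part of the driver.

#include <stdio.h>

#define NUNITS 6

/* A resource shortage stops us after 'budget' units, the way a missing
 * stripe_head stops retry_aligned_read(). */
static int try_handle_unit(int unit, int *budget)
{
    if (*budget == 0)
        return 0;               /* "failed to get a stripe - must wait" */
    (*budget)--;
    printf("handled unit %d\n", unit);
    return 1;
}

/* Process units starting from *cursor; on failure, record where to
 * resume (the ->bi_hw_segments role) and report how many were done. */
static int process(int *cursor, int *budget)
{
    int handled = 0;
    int i;

    for (i = 0; i < NUNITS; i++) {
        if (i < *cursor)
            continue;           /* already done on an earlier pass */
        if (!try_handle_unit(i, budget)) {
            *cursor = i;        /* resume point for the next pass */
            return handled;
        }
        handled++;
    }
    *cursor = NUNITS;
    return handled;
}

int main(void)
{
    int cursor = 0, budget = 4;

    process(&cursor, &budget);  /* handles units 0-3, stops at 4 */
    budget = 4;                 /* resources available again      */
    process(&cursor, &budget);  /* resumes at unit 4              */
    return 0;
}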
- */ -static void raid5d(struct mddev *mddev) -{ - struct stripe_head *sh; - struct r5conf *conf = mddev->private; - int handled; - struct blk_plug plug; - - pr_debug("+++ raid5d active\n"); - - md_check_recovery(mddev); - - blk_start_plug(&plug); - handled = 0; - spin_lock_irq(&conf->device_lock); - while (1) { - struct bio *bio; - - if (atomic_read(&mddev->plug_cnt) == 0 && - !list_empty(&conf->bitmap_list)) { - /* Now is a good time to flush some bitmap updates */ - conf->seq_flush++; - spin_unlock_irq(&conf->device_lock); - bitmap_unplug(mddev->bitmap); - spin_lock_irq(&conf->device_lock); - conf->seq_write = conf->seq_flush; - activate_bit_delay(conf); - } - if (atomic_read(&mddev->plug_cnt) == 0) - raid5_activate_delayed(conf); - - while ((bio = remove_bio_from_retry(conf))) { - int ok; - spin_unlock_irq(&conf->device_lock); - ok = retry_aligned_read(conf, bio); - spin_lock_irq(&conf->device_lock); - if (!ok) - break; - handled++; - } - - sh = __get_priority_stripe(conf); - - if (!sh) - break; - spin_unlock_irq(&conf->device_lock); - - handled++; - handle_stripe(sh); - release_stripe(sh); - cond_resched(); - - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) - md_check_recovery(mddev); - - spin_lock_irq(&conf->device_lock); - } - pr_debug("%d stripes handled\n", handled); - - spin_unlock_irq(&conf->device_lock); - - async_tx_issue_pending_all(); - blk_finish_plug(&plug); - - pr_debug("--- raid5d inactive\n"); -} - -static ssize_t -raid5_show_stripe_cache_size(struct mddev *mddev, char *page) -{ - struct r5conf *conf = mddev->private; - if (conf) - return sprintf(page, "%d\n", conf->max_nr_stripes); - else - return 0; -} - -int -raid5_set_cache_size(struct mddev *mddev, int size) -{ - struct r5conf *conf = mddev->private; - int err; - - if (size <= 16 || size > 32768) - return -EINVAL; - while (size < conf->max_nr_stripes) { - if (drop_one_stripe(conf)) - conf->max_nr_stripes--; - else - break; - } - err = md_allow_write(mddev); - if (err) - return err; - while (size > conf->max_nr_stripes) { - if (grow_one_stripe(conf)) - conf->max_nr_stripes++; - else break; - } - return 0; -} -EXPORT_SYMBOL(raid5_set_cache_size); - -static ssize_t -raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) -{ - struct r5conf *conf = mddev->private; - unsigned long new; - int err; - - if (len >= PAGE_SIZE) - return -EINVAL; - if (!conf) - return -ENODEV; - - if (strict_strtoul(page, 10, &new)) - return -EINVAL; - err = raid5_set_cache_size(mddev, new); - if (err) - return err; - return len; -} - -static struct md_sysfs_entry -raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, - raid5_show_stripe_cache_size, - raid5_store_stripe_cache_size); - -static ssize_t -raid5_show_preread_threshold(struct mddev *mddev, char *page) -{ - struct r5conf *conf = mddev->private; - if (conf) - return sprintf(page, "%d\n", conf->bypass_threshold); - else - return 0; -} - -static ssize_t -raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) -{ - struct r5conf *conf = mddev->private; - unsigned long new; - if (len >= PAGE_SIZE) - return -EINVAL; - if (!conf) - return -ENODEV; - - if (strict_strtoul(page, 10, &new)) - return -EINVAL; - if (new > conf->max_nr_stripes) - return -EINVAL; - conf->bypass_threshold = new; - return len; -} - -static struct md_sysfs_entry -raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, - S_IRUGO | S_IWUSR, - raid5_show_preread_threshold, - raid5_store_preread_threshold); - -static ssize_t 
-stripe_cache_active_show(struct mddev *mddev, char *page) -{ - struct r5conf *conf = mddev->private; - if (conf) - return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); - else - return 0; -} - -static struct md_sysfs_entry -raid5_stripecache_active = __ATTR_RO(stripe_cache_active); - -static struct attribute *raid5_attrs[] = { - &raid5_stripecache_size.attr, - &raid5_stripecache_active.attr, - &raid5_preread_bypass_threshold.attr, - NULL, -}; -static struct attribute_group raid5_attrs_group = { - .name = NULL, - .attrs = raid5_attrs, -}; - -static sector_t -raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - struct r5conf *conf = mddev->private; - - if (!sectors) - sectors = mddev->dev_sectors; - if (!raid_disks) - /* size is defined by the smallest of previous and new size */ - raid_disks = min(conf->raid_disks, conf->previous_raid_disks); - - sectors &= ~((sector_t)mddev->chunk_sectors - 1); - sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); - return sectors * (raid_disks - conf->max_degraded); -} - -static void raid5_free_percpu(struct r5conf *conf) -{ - struct raid5_percpu *percpu; - unsigned long cpu; - - if (!conf->percpu) - return; - - get_online_cpus(); - for_each_possible_cpu(cpu) { - percpu = per_cpu_ptr(conf->percpu, cpu); - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - } -#ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&conf->cpu_notify); -#endif - put_online_cpus(); - - free_percpu(conf->percpu); -} - -static void free_conf(struct r5conf *conf) -{ - shrink_stripes(conf); - raid5_free_percpu(conf); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); -} - -#ifdef CONFIG_HOTPLUG_CPU -static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); - long cpu = (long)hcpu; - struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (conf->level == 6 && !percpu->spare_page) - percpu->spare_page = alloc_page(GFP_KERNEL); - if (!percpu->scribble) - percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); - - if (!percpu->scribble || - (conf->level == 6 && !percpu->spare_page)) { - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - pr_err("%s: failed memory allocation for cpu%ld\n", - __func__, cpu); - return notifier_from_errno(-ENOMEM); - } - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - percpu->spare_page = NULL; - percpu->scribble = NULL; - break; - default: - break; - } - return NOTIFY_OK; -} -#endif - -static int raid5_alloc_percpu(struct r5conf *conf) -{ - unsigned long cpu; - struct page *spare_page; - struct raid5_percpu __percpu *allcpus; - void *scribble; - int err; - - allcpus = alloc_percpu(struct raid5_percpu); - if (!allcpus) - return -ENOMEM; - conf->percpu = allcpus; - - get_online_cpus(); - err = 0; - for_each_present_cpu(cpu) { - if (conf->level == 6) { - spare_page = alloc_page(GFP_KERNEL); - if (!spare_page) { - err = -ENOMEM; - break; - } - per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; - } - scribble = kmalloc(conf->scribble_len, GFP_KERNEL); - if (!scribble) { - err = -ENOMEM; - break; - } - per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; - } -#ifdef CONFIG_HOTPLUG_CPU - conf->cpu_notify.notifier_call = raid456_cpu_notify; - conf->cpu_notify.priority = 0; - if (err == 0) - err = 
register_cpu_notifier(&conf->cpu_notify); -#endif - put_online_cpus(); - - return err; -} - -static struct r5conf *setup_conf(struct mddev *mddev) -{ - struct r5conf *conf; - int raid_disk, memory, max_disks; - struct md_rdev *rdev; - struct disk_info *disk; - - if (mddev->new_level != 5 - && mddev->new_level != 4 - && mddev->new_level != 6) { - printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->new_level); - return ERR_PTR(-EIO); - } - if ((mddev->new_level == 5 - && !algorithm_valid_raid5(mddev->new_layout)) || - (mddev->new_level == 6 - && !algorithm_valid_raid6(mddev->new_layout))) { - printk(KERN_ERR "md/raid:%s: layout %d not supported\n", - mdname(mddev), mddev->new_layout); - return ERR_PTR(-EIO); - } - if (mddev->new_level == 6 && mddev->raid_disks < 4) { - printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", - mdname(mddev), mddev->raid_disks); - return ERR_PTR(-EINVAL); - } - - if (!mddev->new_chunk_sectors || - (mddev->new_chunk_sectors << 9) % PAGE_SIZE || - !is_power_of_2(mddev->new_chunk_sectors)) { - printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", - mdname(mddev), mddev->new_chunk_sectors << 9); - return ERR_PTR(-EINVAL); - } - - conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); - if (conf == NULL) - goto abort; - spin_lock_init(&conf->device_lock); - init_waitqueue_head(&conf->wait_for_stripe); - init_waitqueue_head(&conf->wait_for_overlap); - INIT_LIST_HEAD(&conf->handle_list); - INIT_LIST_HEAD(&conf->hold_list); - INIT_LIST_HEAD(&conf->delayed_list); - INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); - atomic_set(&conf->active_stripes, 0); - atomic_set(&conf->preread_active_stripes, 0); - atomic_set(&conf->active_aligned_reads, 0); - conf->bypass_threshold = BYPASS_THRESHOLD; - conf->recovery_disabled = mddev->recovery_disabled - 1; - - conf->raid_disks = mddev->raid_disks; - if (mddev->reshape_position == MaxSector) - conf->previous_raid_disks = mddev->raid_disks; - else - conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; - max_disks = max(conf->raid_disks, conf->previous_raid_disks); - conf->scribble_len = scribble_len(max_disks); - - conf->disks = kzalloc(max_disks * sizeof(struct disk_info), - GFP_KERNEL); - if (!conf->disks) - goto abort; - - conf->mddev = mddev; - - if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) - goto abort; - - conf->level = mddev->new_level; - if (raid5_alloc_percpu(conf) != 0) - goto abort; - - pr_debug("raid456: run(%s) called.\n", mdname(mddev)); - - rdev_for_each(rdev, mddev) { - raid_disk = rdev->raid_disk; - if (raid_disk >= max_disks - || raid_disk < 0) - continue; - disk = conf->disks + raid_disk; - - if (test_bit(Replacement, &rdev->flags)) { - if (disk->replacement) - goto abort; - disk->replacement = rdev; - } else { - if (disk->rdev) - goto abort; - disk->rdev = rdev; - } - - if (test_bit(In_sync, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md/raid:%s: device %s operational as raid" - " disk %d\n", - mdname(mddev), bdevname(rdev->bdev, b), raid_disk); - } else if (rdev->saved_raid_disk != raid_disk) - /* Cannot rely on bitmap to complete recovery */ - conf->fullsync = 1; - } - - conf->chunk_sectors = mddev->new_chunk_sectors; - conf->level = mddev->new_level; - if (conf->level == 6) - conf->max_degraded = 2; - else - conf->max_degraded = 1; - conf->algorithm = mddev->new_layout; - conf->max_nr_stripes = NR_STRIPES; - conf->reshape_progress = mddev->reshape_position; - if 
(conf->reshape_progress != MaxSector) { - conf->prev_chunk_sectors = mddev->chunk_sectors; - conf->prev_algo = mddev->layout; - } - - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { - printk(KERN_ERR - "md/raid:%s: couldn't allocate %dkB for buffers\n", - mdname(mddev), memory); - goto abort; - } else - printk(KERN_INFO "md/raid:%s: allocated %dkB\n", - mdname(mddev), memory); - - conf->thread = md_register_thread(raid5d, mddev, NULL); - if (!conf->thread) { - printk(KERN_ERR - "md/raid:%s: couldn't allocate thread.\n", - mdname(mddev)); - goto abort; - } - - return conf; - - abort: - if (conf) { - free_conf(conf); - return ERR_PTR(-EIO); - } else - return ERR_PTR(-ENOMEM); -} - - -static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) -{ - switch (algo) { - case ALGORITHM_PARITY_0: - if (raid_disk < max_degraded) - return 1; - break; - case ALGORITHM_PARITY_N: - if (raid_disk >= raid_disks - max_degraded) - return 1; - break; - case ALGORITHM_PARITY_0_6: - if (raid_disk == 0 || - raid_disk == raid_disks - 1) - return 1; - break; - case ALGORITHM_LEFT_ASYMMETRIC_6: - case ALGORITHM_RIGHT_ASYMMETRIC_6: - case ALGORITHM_LEFT_SYMMETRIC_6: - case ALGORITHM_RIGHT_SYMMETRIC_6: - if (raid_disk == raid_disks - 1) - return 1; - } - return 0; -} - -static int run(struct mddev *mddev) -{ - struct r5conf *conf; - int working_disks = 0; - int dirty_parity_disks = 0; - struct md_rdev *rdev; - sector_t reshape_offset = 0; - int i; - - if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "md/raid:%s: not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - if (mddev->reshape_position != MaxSector) { - /* Check that we can continue the reshape. - * Currently only disks can change, it must - * increase, and we must be past the point where - * a stripe over-writes itself - */ - sector_t here_new, here_old; - int old_disks; - int max_degraded = (mddev->level == 6 ? 2 : 1); - - if (mddev->new_level != mddev->level) { - printk(KERN_ERR "md/raid:%s: unsupported reshape " - "required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - old_disks = mddev->raid_disks - mddev->delta_disks; - /* reshape_position must be on a new-stripe boundary, and one - * further up in new geometry must map after here in old - * geometry. - */ - here_new = mddev->reshape_position; - if (sector_div(here_new, mddev->new_chunk_sectors * - (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "md/raid:%s: reshape_position not " - "on a stripe boundary\n", mdname(mddev)); - return -EINVAL; - } - reshape_offset = here_new * mddev->new_chunk_sectors; - /* here_new is the stripe we will write to */ - here_old = mddev->reshape_position; - sector_div(here_old, mddev->chunk_sectors * - (old_disks-max_degraded)); - /* here_old is the first stripe that we might need to read - * from */ - if (mddev->delta_disks == 0) { - /* We cannot be sure it is safe to start an in-place - * reshape. It is only safe if user-space if monitoring - * and taking constant backups. - * mdadm always starts a situation like this in - * readonly mode so it can take control before - * allowing any writes. So just check for that. 
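The boundary arithmetic run() applies just above can be worked through with made-up geometry. The sketch below assumes a 5-disk raid5 growing to 6 disks with 64K chunks; all numbers are illustrative, and the safety comparison restates the "reading from the same stripe as writing to" check that follows in the code.

#include <stdio.h>

int main(void)
{
    /* Hypothetical reshape: 5-disk raid5 growing to 6 disks, 128-sector
     * (64K) chunks, chunk size unchanged. */
    unsigned long long reshape_position = 7680;   /* from the superblock */
    unsigned int chunk_sectors = 128, new_chunk_sectors = 128;
    int raid_disks = 6, old_disks = 5, max_degraded = 1;

    unsigned long long new_stripe = new_chunk_sectors *
                    (unsigned long long)(raid_disks - max_degraded);
    unsigned long long old_stripe = chunk_sectors *
                    (unsigned long long)(old_disks - max_degraded);

    /* reshape_position must sit exactly on a new-geometry stripe boundary */
    if (reshape_position % new_stripe) {
        printf("not on a stripe boundary - cannot continue reshape\n");
        return 1;
    }
    unsigned long long here_new = reshape_position / new_stripe; /* stripe we write to   */
    unsigned long long here_old = reshape_position / old_stripe; /* first stripe we read */

    printf("here_new = %llu stripes (new layout), here_old = %llu stripes (old layout)\n",
           here_new, here_old);
    /* Growing: the new-layout write offset must still be below the
     * old-layout read offset, or writes would clobber unread data. */
    printf("safe to auto-continue: %s\n",
           here_new * new_chunk_sectors < here_old * chunk_sectors ? "yes" : "no");
    return 0;
}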
- */ - if ((here_new * mddev->new_chunk_sectors != - here_old * mddev->chunk_sectors) || - mddev->ro == 0) { - printk(KERN_ERR "md/raid:%s: in-place reshape must be started" - " in read-only mode - aborting\n", - mdname(mddev)); - return -EINVAL; - } - } else if (mddev->delta_disks < 0 - ? (here_new * mddev->new_chunk_sectors <= - here_old * mddev->chunk_sectors) - : (here_new * mddev->new_chunk_sectors >= - here_old * mddev->chunk_sectors)) { - /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "md/raid:%s: reshape_position too early for " - "auto-recovery - aborting.\n", - mdname(mddev)); - return -EINVAL; - } - printk(KERN_INFO "md/raid:%s: reshape will continue\n", - mdname(mddev)); - /* OK, we should be able to continue; */ - } else { - BUG_ON(mddev->level != mddev->new_level); - BUG_ON(mddev->layout != mddev->new_layout); - BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); - BUG_ON(mddev->delta_disks != 0); - } - - if (mddev->private == NULL) - conf = setup_conf(mddev); - else - conf = mddev->private; - - if (IS_ERR(conf)) - return PTR_ERR(conf); - - mddev->thread = conf->thread; - conf->thread = NULL; - mddev->private = conf; - - for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; - i++) { - rdev = conf->disks[i].rdev; - if (!rdev && conf->disks[i].replacement) { - /* The replacement is all we have yet */ - rdev = conf->disks[i].replacement; - conf->disks[i].replacement = NULL; - clear_bit(Replacement, &rdev->flags); - conf->disks[i].rdev = rdev; - } - if (!rdev) - continue; - if (conf->disks[i].replacement && - conf->reshape_progress != MaxSector) { - /* replacements and reshape simply do not mix. */ - printk(KERN_ERR "md: cannot handle concurrent " - "replacement and reshape.\n"); - goto abort; - } - if (test_bit(In_sync, &rdev->flags)) { - working_disks++; - continue; - } - /* This disc is not fully in-sync. However if it - * just stored parity (beyond the recovery_offset), - * when we don't need to be concerned about the - * array being dirty. - * When reshape goes 'backwards', we never have - * partially completed devices, so we only need - * to worry about reshape going forwards. - */ - /* Hack because v0.91 doesn't store recovery_offset properly. */ - if (mddev->major_version == 0 && - mddev->minor_version > 90) - rdev->recovery_offset = reshape_offset; - - if (rdev->recovery_offset < reshape_offset) { - /* We need to check old and new layout */ - if (!only_parity(rdev->raid_disk, - conf->algorithm, - conf->raid_disks, - conf->max_degraded)) - continue; - } - if (!only_parity(rdev->raid_disk, - conf->prev_algo, - conf->previous_raid_disks, - conf->max_degraded)) - continue; - dirty_parity_disks++; - } - - /* - * 0 for a fully functional array, 1 or 2 for a degraded array. 
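The comment above ("0 for a fully functional array, 1 or 2 for a degraded array"), together with the max_degraded values set in setup_conf(), gives the start-up rule checked just below: the set can run only while the number of missing or out-of-sync members stays within max_degraded. The toy restatement that follows is not the kernel's calc_degraded()/has_failed(), which additionally account for an in-progress reshape.

#include <stdio.h>

/* Simplified start-up rule: raid4/5 survive one missing member, raid6
 * survives two. The real calc_degraded()/has_failed() also consider
 * both geometries while a reshape is running. */
static int can_run(int level, int degraded)
{
    int max_degraded = (level == 6) ? 2 : 1;
    return degraded <= max_degraded;
}

int main(void)
{
    printf("raid5, 1 missing disk:  %s\n", can_run(5, 1) ? "runs (degraded)" : "fails");
    printf("raid5, 2 missing disks: %s\n", can_run(5, 2) ? "runs (degraded)" : "fails");
    printf("raid6, 2 missing disks: %s\n", can_run(6, 2) ? "runs (degraded)" : "fails");
    return 0;
}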
- */ - mddev->degraded = calc_degraded(conf); - - if (has_failed(conf)) { - printk(KERN_ERR "md/raid:%s: not enough operational devices" - " (%d/%d failed)\n", - mdname(mddev), mddev->degraded, conf->raid_disks); - goto abort; - } - - /* device size must be a multiple of chunk size */ - mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); - mddev->resync_max_sectors = mddev->dev_sectors; - - if (mddev->degraded > dirty_parity_disks && - mddev->recovery_cp != MaxSector) { - if (mddev->ok_start_degraded) - printk(KERN_WARNING - "md/raid:%s: starting dirty degraded array" - " - data corruption possible.\n", - mdname(mddev)); - else { - printk(KERN_ERR - "md/raid:%s: cannot start dirty degraded array.\n", - mdname(mddev)); - goto abort; - } - } - - if (mddev->degraded == 0) - printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" - " devices, algorithm %d\n", mdname(mddev), conf->level, - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - mddev->new_layout); - else - printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" - " out of %d devices, algorithm %d\n", - mdname(mddev), conf->level, - mddev->raid_disks - mddev->degraded, - mddev->raid_disks, mddev->new_layout); - - print_raid5_conf(conf); - - if (conf->reshape_progress != MaxSector) { - conf->reshape_safe = conf->reshape_progress; - atomic_set(&conf->reshape_stripes, 0); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); - } - - - /* Ok, everything is just fine now */ - if (mddev->to_remove == &raid5_attrs_group) - mddev->to_remove = NULL; - else if (mddev->kobj.sd && - sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) - printk(KERN_WARNING - "raid5: failed to create sysfs attributes for %s\n", - mdname(mddev)); - md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - - if (mddev->queue) { - int chunk_size; - /* read-ahead size must cover two whole stripes, which - * is 2 * (datadisks) * chunksize where 'n' is the - * number of raid devices - */ - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; - - chunk_size = mddev->chunk_sectors << 9; - blk_queue_io_min(mddev->queue, chunk_size); - blk_queue_io_opt(mddev->queue, chunk_size * - (conf->raid_disks - conf->max_degraded)); - - rdev_for_each(rdev, mddev) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - } - - return 0; -abort: - md_unregister_thread(&mddev->thread); - print_raid5_conf(conf); - free_conf(conf); - mddev->private = NULL; - printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); - return -EIO; -} - -static int stop(struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - - md_unregister_thread(&mddev->thread); - if (mddev->queue) - mddev->queue->backing_dev_info.congested_fn = NULL; - free_conf(conf); - mddev->private = NULL; - mddev->to_remove = &raid5_attrs_group; - return 0; -} - -static void status(struct seq_file *seq, struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - 
int i; - - seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, - mddev->chunk_sectors / 2, mddev->layout); - seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) - seq_printf (seq, "%s", - conf->disks[i].rdev && - test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); - seq_printf (seq, "]"); -} - -static void print_raid5_conf (struct r5conf *conf) -{ - int i; - struct disk_info *tmp; - - printk(KERN_DEBUG "RAID conf printout:\n"); - if (!conf) { - printk("(conf==NULL)\n"); - return; - } - printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, - conf->raid_disks, - conf->raid_disks - conf->mddev->degraded); - - for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; - tmp = conf->disks + i; - if (tmp->rdev) - printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", - i, !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev, b)); - } -} - -static int raid5_spare_active(struct mddev *mddev) -{ - int i; - struct r5conf *conf = mddev->private; - struct disk_info *tmp; - int count = 0; - unsigned long flags; - - for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->disks + i; - if (tmp->replacement - && tmp->replacement->recovery_offset == MaxSector - && !test_bit(Faulty, &tmp->replacement->flags) - && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { - /* Replacement has just become active. */ - if (!tmp->rdev - || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) - count++; - if (tmp->rdev) { - /* Replaced device not technically faulty, - * but we need to be sure it gets removed - * and never re-added. - */ - set_bit(Faulty, &tmp->rdev->flags); - sysfs_notify_dirent_safe( - tmp->rdev->sysfs_state); - } - sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); - } else if (tmp->rdev - && tmp->rdev->recovery_offset == MaxSector - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { - count++; - sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); - } - } - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = calc_degraded(conf); - spin_unlock_irqrestore(&conf->device_lock, flags); - print_raid5_conf(conf); - return count; -} - -static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct r5conf *conf = mddev->private; - int err = 0; - int number = rdev->raid_disk; - struct md_rdev **rdevp; - struct disk_info *p = conf->disks + number; - - print_raid5_conf(conf); - if (rdev == p->rdev) - rdevp = &p->rdev; - else if (rdev == p->replacement) - rdevp = &p->replacement; - else - return 0; - - if (number >= conf->raid_disks && - conf->reshape_progress == MaxSector) - clear_bit(In_sync, &rdev->flags); - - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - err = -EBUSY; - goto abort; - } - /* Only remove non-faulty devices if recovery - * isn't possible. 
- */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && - !has_failed(conf) && - (!p->replacement || p->replacement == rdev) && - number < conf->raid_disks) { - err = -EBUSY; - goto abort; - } - *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - } else if (p->replacement) { - /* We must have just cleared 'rdev' */ - p->rdev = p->replacement; - clear_bit(Replacement, &p->replacement->flags); - smp_mb(); /* Make sure other CPUs may see both as identical - * but will never see neither - if they are careful - */ - p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); - } else - /* We might have just removed the Replacement as faulty- - * clear the bit just in case - */ - clear_bit(WantReplacement, &rdev->flags); -abort: - - print_raid5_conf(conf); - return err; -} - -static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct r5conf *conf = mddev->private; - int err = -EEXIST; - int disk; - struct disk_info *p; - int first = 0; - int last = conf->raid_disks - 1; - - if (mddev->recovery_disabled == conf->recovery_disabled) - return -EBUSY; - - if (rdev->saved_raid_disk < 0 && has_failed(conf)) - /* no point adding a device */ - return -EINVAL; - - if (rdev->raid_disk >= 0) - first = last = rdev->raid_disk; - - /* - * find the disk ... but prefer rdev->saved_raid_disk - * if possible. - */ - if (rdev->saved_raid_disk >= 0 && - rdev->saved_raid_disk >= first && - conf->disks[rdev->saved_raid_disk].rdev == NULL) - disk = rdev->saved_raid_disk; - else - disk = first; - for ( ; disk <= last ; disk++) { - p = conf->disks + disk; - if (p->rdev == NULL) { - clear_bit(In_sync, &rdev->flags); - rdev->raid_disk = disk; - err = 0; - if (rdev->saved_raid_disk != disk) - conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); - break; - } - if (test_bit(WantReplacement, &p->rdev->flags) && - p->replacement == NULL) { - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = disk; - err = 0; - conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); - break; - } - } - print_raid5_conf(conf); - return err; -} - -static int raid5_resize(struct mddev *mddev, sector_t sectors) -{ - /* no resync is happening, and there is enough space - * on all devices, so we can resize. - * We need to make sure resync covers any new space. - * If the array is shrinking we should possibly wait until - * any io in the removed space completes, but it hardly seems - * worth it. - */ - sectors &= ~((sector_t)mddev->chunk_sectors - 1); - md_set_array_sectors(mddev, raid5_size(mddev, sectors, - mddev->raid_disks)); - if (mddev->array_sectors > - raid5_size(mddev, sectors, mddev->raid_disks)) - return -EINVAL; - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - } - mddev->dev_sectors = sectors; - mddev->resync_max_sectors = sectors; - return 0; -} - -static int check_stripe_cache(struct mddev *mddev) -{ - /* Can only proceed if there are plenty of stripe_heads. - * We need a minimum of one full stripe,, and for sensible progress - * it is best to have about 4 times that. - * If we require 4 times, then the default 256 4K stripe_heads will - * allow for chunk sizes up to 256K, which is probably OK. 
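The "about 4 times one full stripe" requirement above works out as follows for the defaults the comment itself mentions (256 stripe_heads, 4K STRIPE_SIZE). The sketch only restates that arithmetic; the 256K candidate chunk is the comment's own example.

#include <stdio.h>

int main(void)
{
    unsigned int stripe_size = 4096;     /* STRIPE_SIZE: one page     */
    unsigned int max_nr_stripes = 256;   /* default stripe cache size */
    unsigned int chunk_sectors = 512;    /* candidate chunk: 256K     */

    /* stripe_heads needed = 4 * (chunk bytes / STRIPE_SIZE) */
    unsigned int needed = ((chunk_sectors << 9) / stripe_size) * 4;

    printf("a %uK chunk needs %u stripe_heads (cache holds %u): %s\n",
           (chunk_sectors << 9) / 1024, needed, max_nr_stripes,
           needed <= max_nr_stripes ? "ok" : "grow stripe_cache_size first");
    return 0;
}

With these defaults the 256K chunk needs exactly 256 stripe_heads, which is why the comment calls 256K the largest chunk size the default cache comfortably supports.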
- * If the chunk size is greater, user-space should request more - * stripe_heads first. - */ - struct r5conf *conf = mddev->private; - if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 - > conf->max_nr_stripes || - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 - > conf->max_nr_stripes) { - printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", - mdname(mddev), - ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) - / STRIPE_SIZE)*4); - return 0; - } - return 1; -} - -static int check_reshape(struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - - if (mddev->delta_disks == 0 && - mddev->new_layout == mddev->layout && - mddev->new_chunk_sectors == mddev->chunk_sectors) - return 0; /* nothing to do */ - if (mddev->bitmap) - /* Cannot grow a bitmap yet */ - return -EBUSY; - if (has_failed(conf)) - return -EINVAL; - if (mddev->delta_disks < 0) { - /* We might be able to shrink, but the devices must - * be made bigger first. - * For raid6, 4 is the minimum size. - * Otherwise 2 is the minimum - */ - int min = 2; - if (mddev->level == 6) - min = 4; - if (mddev->raid_disks + mddev->delta_disks < min) - return -EINVAL; - } - - if (!check_stripe_cache(mddev)) - return -ENOSPC; - - return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); -} - -static int raid5_start_reshape(struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - struct md_rdev *rdev; - int spares = 0; - unsigned long flags; - - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - - if (!check_stripe_cache(mddev)) - return -ENOSPC; - - rdev_for_each(rdev, mddev) - if (!test_bit(In_sync, &rdev->flags) - && !test_bit(Faulty, &rdev->flags)) - spares++; - - if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) - /* Not enough devices even to make a degraded array - * of that size - */ - return -EINVAL; - - /* Refuse to reduce size of the array. Any reductions in - * array size must be through explicit setting of array_size - * attribute. - */ - if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) - < mddev->array_sectors) { - printk(KERN_ERR "md/raid:%s: array size must be reduced " - "before number of disks\n", mdname(mddev)); - return -EINVAL; - } - - atomic_set(&conf->reshape_stripes, 0); - spin_lock_irq(&conf->device_lock); - conf->previous_raid_disks = conf->raid_disks; - conf->raid_disks += mddev->delta_disks; - conf->prev_chunk_sectors = conf->chunk_sectors; - conf->chunk_sectors = mddev->new_chunk_sectors; - conf->prev_algo = conf->algorithm; - conf->algorithm = mddev->new_layout; - if (mddev->delta_disks < 0) - conf->reshape_progress = raid5_size(mddev, 0, 0); - else - conf->reshape_progress = 0; - conf->reshape_safe = conf->reshape_progress; - conf->generation++; - spin_unlock_irq(&conf->device_lock); - - /* Add some new drives, as many as will fit. - * We know there are enough to make the newly sized array work. - * Don't add devices if we are reducing the number of - * devices in the array. This is because it is not possible - * to correctly record the "partially reconstructed" state of - * such devices during the reshape and confusion could result. 
- */ - if (mddev->delta_disks >= 0) { - rdev_for_each(rdev, mddev) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev) == 0) { - if (rdev->raid_disk - >= conf->previous_raid_disks) - set_bit(In_sync, &rdev->flags); - else - rdev->recovery_offset = 0; - - if (sysfs_link_rdev(mddev, rdev)) - /* Failure here is OK */; - } - } else if (rdev->raid_disk >= conf->previous_raid_disks - && !test_bit(Faulty, &rdev->flags)) { - /* This is a spare that was manually added */ - set_bit(In_sync, &rdev->flags); - } - - /* When a reshape changes the number of devices, - * ->degraded is measured against the larger of the - * pre and post number of devices. - */ - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = calc_degraded(conf); - spin_unlock_irqrestore(&conf->device_lock, flags); - } - mddev->raid_disks = conf->raid_disks; - mddev->reshape_position = conf->reshape_progress; - set_bit(MD_CHANGE_DEVS, &mddev->flags); - - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); - if (!mddev->sync_thread) { - mddev->recovery = 0; - spin_lock_irq(&conf->device_lock); - mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - conf->reshape_progress = MaxSector; - mddev->reshape_position = MaxSector; - spin_unlock_irq(&conf->device_lock); - return -EAGAIN; - } - conf->reshape_checkpoint = jiffies; - md_wakeup_thread(mddev->sync_thread); - md_new_event(mddev); - return 0; -} - -/* This is called from the reshape thread and should make any - * changes needed in 'conf' - */ -static void end_reshape(struct r5conf *conf) -{ - - if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - - spin_lock_irq(&conf->device_lock); - conf->previous_raid_disks = conf->raid_disks; - conf->reshape_progress = MaxSector; - spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); - - /* read-ahead size must cover two whole stripes, which is - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices - */ - if (conf->mddev->queue) { - int data_disks = conf->raid_disks - conf->max_degraded; - int stripe = data_disks * ((conf->chunk_sectors << 9) - / PAGE_SIZE); - if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - } - } -} - -/* This is called from the raid5d thread with mddev_lock held. - * It makes config changes to the device. 
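Both run() and end_reshape() above raise the queue's read-ahead window so it covers two whole stripes of data. As a worked example with assumed geometry (6 data disks, 64K chunks, 4K pages):

#include <stdio.h>

int main(void)
{
    unsigned int page_size = 4096;
    unsigned int chunk_sectors = 128;   /* assumed 64K chunk          */
    int data_disks = 6;                 /* raid_disks - max_degraded  */

    /* pages covered by one full stripe of data */
    unsigned int stripe_pages = data_disks * ((chunk_sectors << 9) / page_size);

    /* ra_pages is raised so read-ahead spans two whole stripes */
    printf("stripe = %u pages, read-ahead >= %u pages (%u KiB)\n",
           stripe_pages, 2 * stripe_pages,
           2 * stripe_pages * (page_size / 1024));
    return 0;
}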
- */ -static void raid5_finish_reshape(struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - - if (mddev->delta_disks > 0) { - md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - } else { - int d; - spin_lock_irq(&conf->device_lock); - mddev->degraded = calc_degraded(conf); - spin_unlock_irq(&conf->device_lock); - for (d = conf->raid_disks ; - d < conf->raid_disks - mddev->delta_disks; - d++) { - struct md_rdev *rdev = conf->disks[d].rdev; - if (rdev && - raid5_remove_disk(mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - } - } - } - mddev->layout = conf->algorithm; - mddev->chunk_sectors = conf->chunk_sectors; - mddev->reshape_position = MaxSector; - mddev->delta_disks = 0; - } -} - -static void raid5_quiesce(struct mddev *mddev, int state) -{ - struct r5conf *conf = mddev->private; - - switch(state) { - case 2: /* resume for a suspend */ - wake_up(&conf->wait_for_overlap); - break; - - case 1: /* stop all writes */ - spin_lock_irq(&conf->device_lock); - /* '2' tells resync/reshape to pause so that all - * active stripes can drain - */ - conf->quiesce = 2; - wait_event_lock_irq(conf->wait_for_stripe, - atomic_read(&conf->active_stripes) == 0 && - atomic_read(&conf->active_aligned_reads) == 0, - conf->device_lock, /* nothing */); - conf->quiesce = 1; - spin_unlock_irq(&conf->device_lock); - /* allow reshape to continue */ - wake_up(&conf->wait_for_overlap); - break; - - case 0: /* re-enable writes */ - spin_lock_irq(&conf->device_lock); - conf->quiesce = 0; - wake_up(&conf->wait_for_stripe); - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - break; - } -} - - -static void *raid45_takeover_raid0(struct mddev *mddev, int level) -{ - struct r0conf *raid0_conf = mddev->private; - sector_t sectors; - - /* for raid0 takeover only one zone is supported */ - if (raid0_conf->nr_strip_zones > 1) { - printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", - mdname(mddev)); - return ERR_PTR(-EINVAL); - } - - sectors = raid0_conf->strip_zone[0].zone_end; - sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); - mddev->dev_sectors = sectors; - mddev->new_level = level; - mddev->new_layout = ALGORITHM_PARITY_N; - mddev->new_chunk_sectors = mddev->chunk_sectors; - mddev->raid_disks += 1; - mddev->delta_disks = 1; - /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; - - return setup_conf(mddev); -} - - -static void *raid5_takeover_raid1(struct mddev *mddev) -{ - int chunksect; - - if (mddev->raid_disks != 2 || - mddev->degraded > 1) - return ERR_PTR(-EINVAL); - - /* Should check if there are write-behind devices? 
*/ - - chunksect = 64*2; /* 64K by default */ - - /* The array must be an exact multiple of chunksize */ - while (chunksect && (mddev->array_sectors & (chunksect-1))) - chunksect >>= 1; - - if ((chunksect<<9) < STRIPE_SIZE) - /* array size does not allow a suitable chunk size */ - return ERR_PTR(-EINVAL); - - mddev->new_level = 5; - mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; - mddev->new_chunk_sectors = chunksect; - - return setup_conf(mddev); -} - -static void *raid5_takeover_raid6(struct mddev *mddev) -{ - int new_layout; - - switch (mddev->layout) { - case ALGORITHM_LEFT_ASYMMETRIC_6: - new_layout = ALGORITHM_LEFT_ASYMMETRIC; - break; - case ALGORITHM_RIGHT_ASYMMETRIC_6: - new_layout = ALGORITHM_RIGHT_ASYMMETRIC; - break; - case ALGORITHM_LEFT_SYMMETRIC_6: - new_layout = ALGORITHM_LEFT_SYMMETRIC; - break; - case ALGORITHM_RIGHT_SYMMETRIC_6: - new_layout = ALGORITHM_RIGHT_SYMMETRIC; - break; - case ALGORITHM_PARITY_0_6: - new_layout = ALGORITHM_PARITY_0; - break; - case ALGORITHM_PARITY_N: - new_layout = ALGORITHM_PARITY_N; - break; - default: - return ERR_PTR(-EINVAL); - } - mddev->new_level = 5; - mddev->new_layout = new_layout; - mddev->delta_disks = -1; - mddev->raid_disks -= 1; - return setup_conf(mddev); -} - - -static int raid5_check_reshape(struct mddev *mddev) -{ - /* For a 2-drive array, the layout and chunk size can be changed - * immediately as not restriping is needed. - * For larger arrays we record the new value - after validation - * to be used by a reshape pass. - */ - struct r5conf *conf = mddev->private; - int new_chunk = mddev->new_chunk_sectors; - - if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) - return -EINVAL; - if (new_chunk > 0) { - if (!is_power_of_2(new_chunk)) - return -EINVAL; - if (new_chunk < (PAGE_SIZE>>9)) - return -EINVAL; - if (mddev->array_sectors & (new_chunk-1)) - /* not factor of array size */ - return -EINVAL; - } - - /* They look valid */ - - if (mddev->raid_disks == 2) { - /* can make the change immediately */ - if (mddev->new_layout >= 0) { - conf->algorithm = mddev->new_layout; - mddev->layout = mddev->new_layout; - } - if (new_chunk > 0) { - conf->chunk_sectors = new_chunk ; - mddev->chunk_sectors = new_chunk; - } - set_bit(MD_CHANGE_DEVS, &mddev->flags); - md_wakeup_thread(mddev->thread); - } - return check_reshape(mddev); -} - -static int raid6_check_reshape(struct mddev *mddev) -{ - int new_chunk = mddev->new_chunk_sectors; - - if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) - return -EINVAL; - if (new_chunk > 0) { - if (!is_power_of_2(new_chunk)) - return -EINVAL; - if (new_chunk < (PAGE_SIZE >> 9)) - return -EINVAL; - if (mddev->array_sectors & (new_chunk-1)) - /* not factor of array size */ - return -EINVAL; - } - - /* They look valid */ - return check_reshape(mddev); -} - -static void *raid5_takeover(struct mddev *mddev) -{ - /* raid5 can take over: - * raid0 - if there is only one strip zone - make it a raid4 layout - * raid1 - if there are two drives. We need to know the chunk size - * raid4 - trivial - just use a raid4 layout. 
- * raid6 - Providing it is a *_6 layout - */ - if (mddev->level == 0) - return raid45_takeover_raid0(mddev, 5); - if (mddev->level == 1) - return raid5_takeover_raid1(mddev); - if (mddev->level == 4) { - mddev->new_layout = ALGORITHM_PARITY_N; - mddev->new_level = 5; - return setup_conf(mddev); - } - if (mddev->level == 6) - return raid5_takeover_raid6(mddev); - - return ERR_PTR(-EINVAL); -} - -static void *raid4_takeover(struct mddev *mddev) -{ - /* raid4 can take over: - * raid0 - if there is only one strip zone - * raid5 - if layout is right - */ - if (mddev->level == 0) - return raid45_takeover_raid0(mddev, 4); - if (mddev->level == 5 && - mddev->layout == ALGORITHM_PARITY_N) { - mddev->new_layout = 0; - mddev->new_level = 4; - return setup_conf(mddev); - } - return ERR_PTR(-EINVAL); -} - -static struct md_personality raid5_personality; - -static void *raid6_takeover(struct mddev *mddev) -{ - /* Currently can only take over a raid5. We map the - * personality to an equivalent raid6 personality - * with the Q block at the end. - */ - int new_layout; - - if (mddev->pers != &raid5_personality) - return ERR_PTR(-EINVAL); - if (mddev->degraded > 1) - return ERR_PTR(-EINVAL); - if (mddev->raid_disks > 253) - return ERR_PTR(-EINVAL); - if (mddev->raid_disks < 3) - return ERR_PTR(-EINVAL); - - switch (mddev->layout) { - case ALGORITHM_LEFT_ASYMMETRIC: - new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; - break; - case ALGORITHM_RIGHT_ASYMMETRIC: - new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; - break; - case ALGORITHM_LEFT_SYMMETRIC: - new_layout = ALGORITHM_LEFT_SYMMETRIC_6; - break; - case ALGORITHM_RIGHT_SYMMETRIC: - new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; - break; - case ALGORITHM_PARITY_0: - new_layout = ALGORITHM_PARITY_0_6; - break; - case ALGORITHM_PARITY_N: - new_layout = ALGORITHM_PARITY_N; - break; - default: - return ERR_PTR(-EINVAL); - } - mddev->new_level = 6; - mddev->new_layout = new_layout; - mddev->delta_disks = 1; - mddev->raid_disks += 1; - return setup_conf(mddev); -} - - -static struct md_personality raid6_personality = -{ - .name = "raid6", - .level = 6, - .owner = THIS_MODULE, - .make_request = make_request, - .run = run, - .stop = stop, - .status = status, - .error_handler = error, - .hot_add_disk = raid5_add_disk, - .hot_remove_disk= raid5_remove_disk, - .spare_active = raid5_spare_active, - .sync_request = sync_request, - .resize = raid5_resize, - .size = raid5_size, - .check_reshape = raid6_check_reshape, - .start_reshape = raid5_start_reshape, - .finish_reshape = raid5_finish_reshape, - .quiesce = raid5_quiesce, - .takeover = raid6_takeover, -}; -static struct md_personality raid5_personality = -{ - .name = "raid5", - .level = 5, - .owner = THIS_MODULE, - .make_request = make_request, - .run = run, - .stop = stop, - .status = status, - .error_handler = error, - .hot_add_disk = raid5_add_disk, - .hot_remove_disk= raid5_remove_disk, - .spare_active = raid5_spare_active, - .sync_request = sync_request, - .resize = raid5_resize, - .size = raid5_size, - .check_reshape = raid5_check_reshape, - .start_reshape = raid5_start_reshape, - .finish_reshape = raid5_finish_reshape, - .quiesce = raid5_quiesce, - .takeover = raid5_takeover, -}; - -static struct md_personality raid4_personality = -{ - .name = "raid4", - .level = 4, - .owner = THIS_MODULE, - .make_request = make_request, - .run = run, - .stop = stop, - .status = status, - .error_handler = error, - .hot_add_disk = raid5_add_disk, - .hot_remove_disk= raid5_remove_disk, - .spare_active = raid5_spare_active, - .sync_request = 
sync_request, - .resize = raid5_resize, - .size = raid5_size, - .check_reshape = raid5_check_reshape, - .start_reshape = raid5_start_reshape, - .finish_reshape = raid5_finish_reshape, - .quiesce = raid5_quiesce, - .takeover = raid4_takeover, -}; - -static int __init raid5_init(void) -{ - register_md_personality(&raid6_personality); - register_md_personality(&raid5_personality); - register_md_personality(&raid4_personality); - return 0; -} - -static void raid5_exit(void) -{ - unregister_md_personality(&raid6_personality); - unregister_md_personality(&raid5_personality); - unregister_md_personality(&raid4_personality); -} - -module_init(raid5_init); -module_exit(raid5_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); -MODULE_ALIAS("md-personality-4"); /* RAID5 */ -MODULE_ALIAS("md-raid5"); -MODULE_ALIAS("md-raid4"); -MODULE_ALIAS("md-level-5"); -MODULE_ALIAS("md-level-4"); -MODULE_ALIAS("md-personality-8"); /* RAID6 */ -MODULE_ALIAS("md-raid6"); -MODULE_ALIAS("md-level-6"); - -/* This used to be two separate modules, they were: */ -MODULE_ALIAS("raid5"); -MODULE_ALIAS("raid6"); diff --git a/ANDROID_3.4.5/drivers/md/raid5.h b/ANDROID_3.4.5/drivers/md/raid5.h deleted file mode 100644 index 8d8e1393..00000000 --- a/ANDROID_3.4.5/drivers/md/raid5.h +++ /dev/null @@ -1,519 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#include <linux/raid/xor.h> -#include <linux/dmaengine.h> - -/* - * - * Each stripe contains one buffer per device. Each buffer can be in - * one of a number of states stored in "flags". Changes between - * these states happen *almost* exclusively under the protection of the - * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and - * these are not protected by STRIPE_ACTIVE. - * - * The flag bits that are used to represent these states are: - * R5_UPTODATE and R5_LOCKED - * - * State Empty == !UPTODATE, !LOCK - * We have no data, and there is no active request - * State Want == !UPTODATE, LOCK - * A read request is being submitted for this block - * State Dirty == UPTODATE, LOCK - * Some new data is in this buffer, and it is being written out - * State Clean == UPTODATE, !LOCK - * We have valid data which is the same as on disc - * - * The possible state transitions are: - * - * Empty -> Want - on read or write to get old data for parity calc - * Empty -> Dirty - on compute_parity to satisfy write/sync request. - * Empty -> Clean - on compute_block when computing a block for failed drive - * Want -> Empty - on failed read - * Want -> Clean - on successful completion of read request - * Dirty -> Clean - on successful completion of write request - * Dirty -> Clean - on failed write - * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) - * - * The Want->Empty, Want->Clean, Dirty->Clean, transitions - * all happen in b_end_io at interrupt time. - * Each sets the Uptodate bit before releasing the Lock bit. - * This leaves one multi-stage transition: - * Want->Dirty->Clean - * This is safe because thinking that a Clean buffer is actually dirty - * will at worst delay some action, and the stripe will be scheduled - * for attention after the transition is complete. - * - * There is one possibility that is not covered by these states. That - * is if one drive has failed and there is a spare being rebuilt. 
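The four buffer states named above are simply the two flag bits read together. The small decoder below makes the mapping explicit; the flag values are stand-ins, not the kernel's enum constants.

#include <stdio.h>

#define R5_UPTODATE (1u << 0)   /* stand-in bit values for illustration */
#define R5_LOCKED   (1u << 1)

/* Map the two flag bits onto the states named in the comment above. */
static const char *buffer_state(unsigned int flags)
{
    int uptodate = !!(flags & R5_UPTODATE);
    int locked   = !!(flags & R5_LOCKED);

    if (!uptodate && !locked) return "Empty";  /* no data, no request        */
    if (!uptodate &&  locked) return "Want";   /* read has been submitted    */
    if ( uptodate &&  locked) return "Dirty";  /* new data being written out */
    return "Clean";                            /* matches what is on disc    */
}

int main(void)
{
    unsigned int states[] = { 0, R5_LOCKED, R5_UPTODATE | R5_LOCKED, R5_UPTODATE };
    for (int i = 0; i < 4; i++)
        printf("flags=%u -> %s\n", states[i], buffer_state(states[i]));
    return 0;
}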
We - * can't distinguish between a clean block that has been generated - * from parity calculations, and a clean block that has been - * successfully written to the spare (or to parity when resyncing). - * To distinguish these states we have a stripe bit STRIPE_INSYNC that - * is set whenever a write is scheduled to the spare, or to the parity - * disc if there is no spare. A sync request clears this bit, and - * when we find it set with no buffers locked, we know the sync is - * complete. - * - * Buffers for the md device that arrive via make_request are attached - * to the appropriate stripe in one of two lists linked on b_reqnext. - * One list (bh_read) for read requests, one (bh_write) for write. - * There should never be more than one buffer on the two lists - * together, but we are not guaranteed of that, so we allow for more. - * - * If a buffer is on the read list when the associated cache buffer is - * Uptodate, the data is copied into the read buffer and its b_end_io - * routine is called. This may happen in the end_request routine only - * if the buffer has just successfully been read. end_request should - * remove the buffers from the list and then set the Uptodate bit on - * the buffer. Other threads may do this only if they first check - * that the Uptodate bit is set. Once they have checked that, they may - * take buffers off the read queue. - * - * When a buffer on the write list is committed for write, it is copied - * into the cache buffer, which is then marked dirty, and moved onto a - * third list, the written list (bh_written). Once both the parity - * block and the cached buffer are successfully written, any buffer on - * a written list can be returned with b_end_io. - * - * The write list and read list both act as fifos. The read list, - * write list and written list are protected by the device_lock. - * The device_lock is only for list manipulations and will only be - * held for a very short time. It can be claimed from interrupts. - * - * - * Stripes in the stripe cache can be on one of two lists (or on - * neither). The "inactive_list" contains stripes which are not - * currently being used for any request. They can freely be reused - * for another stripe. The "handle_list" contains stripes that need - * to be handled in some way. Both of these are fifo queues. Each - * stripe is also (potentially) linked to a hash bucket in the hash - * table so that it can be found by sector number. Stripes that are - * not hashed must be on the inactive_list, and will normally be at - * the front. All stripes start life this way. - * - * The inactive_list, handle_list and hash bucket lists are all protected by the - * device_lock. - * - stripes have a reference counter. If count==0, they are on a list. - * - If a stripe might need handling, STRIPE_HANDLE is set. - * - When refcount reaches zero, then if STRIPE_HANDLE it is put on - * handle_list else inactive_list - * - * This, combined with the fact that STRIPE_HANDLE is only ever - * cleared while a stripe has a non-zero count means that if the - * refcount is 0 and STRIPE_HANDLE is set, then it is on the - * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then - * the stripe is on inactive_list. 
- * - * The possible transitions are: - * activate an unhashed/inactive stripe (get_active_stripe()) - * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev - * activate a hashed, possibly active stripe (get_active_stripe()) - * lockdev check-hash if(!cnt++)unlink-stripe unlockdev - * attach a request to an active stripe (add_stripe_bh()) - * lockdev attach-buffer unlockdev - * handle a stripe (handle_stripe()) - * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... - * (lockdev check-buffers unlockdev) .. - * change-state .. - * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops - * release an active stripe (release_stripe()) - * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev - * - * The refcount counts each thread that has activated the stripe, - * plus raid5d if it is handling it, plus one for each active request - * on a cached buffer, and plus one if the stripe is undergoing stripe - * operations. - * - * The stripe operations are: - * -copying data between the stripe cache and user application buffers - * -computing blocks to save a disk access, or to recover a missing block - * -updating the parity on a write operation (reconstruct write and - * read-modify-write) - * -checking parity correctness - * -running i/o to disk - * These operations are carried out by raid5_run_ops which uses the async_tx - * api to (optionally) offload operations to dedicated hardware engines. - * When requesting an operation, handle_stripe sets the pending bit for the - * operation and increments the count. raid5_run_ops is then run whenever - * the count is non-zero. - * There are some critical dependencies between the operations that prevent some - * from being requested while another is in flight. - * 1/ Parity check operations destroy the in cache version of the parity block, - * so we prevent parity dependent operations like writes and compute_blocks - * from starting while a check is in progress. Some dma engines can perform - * the check without damaging the parity block; in these cases the parity - * block is re-marked up to date (assuming the check was successful) and is - * not re-read from disk. - * 2/ When a write operation is requested we immediately lock the affected - * blocks, and mark them as not up to date. This causes new read requests - * to be held off, as well as parity checks and compute block operations. - * 3/ Once a compute block operation has been requested, handle_stripe treats - * that block as if it is up to date. raid5_run_ops guarantees that any - * operation that is dependent on the compute block result is initiated after - * the compute block completes. - */ - -/* - * Operations state - intermediate states that are visible outside of - * STRIPE_ACTIVE. - * In general _idle indicates nothing is running, _run indicates a data - * processing operation is active, and _result means the data processing result - * is stable and can be acted upon. 
For simple operations like biofill and - * compute that only have an _idle and _run state, they are indicated with - * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) - */ -/** - * enum check_states - handles syncing / repairing a stripe - * @check_state_idle - check operations are quiesced - * @check_state_run - check operation is running - * @check_state_check_result - set outside lock when check result is valid - * @check_state_compute_run - check failed and we are repairing - * @check_state_compute_result - set outside lock when compute result is valid - */ -enum check_states { - check_state_idle = 0, - check_state_run, /* xor parity check */ - check_state_run_q, /* q-parity check */ - check_state_run_pq, /* pq dual parity check */ - check_state_check_result, - check_state_compute_run, /* parity repair */ - check_state_compute_result, -}; - -/** - * enum reconstruct_states - handles writing or expanding a stripe - */ -enum reconstruct_states { - reconstruct_state_idle = 0, - reconstruct_state_prexor_drain_run, /* prexor-write */ - reconstruct_state_drain_run, /* write */ - reconstruct_state_run, /* expand */ - reconstruct_state_prexor_drain_result, - reconstruct_state_drain_result, - reconstruct_state_result, -}; - -struct stripe_head { - struct hlist_node hash; - struct list_head lru; /* inactive_list or handle_list */ - struct r5conf *raid_conf; - short generation; /* increments with every - * reshape */ - sector_t sector; /* sector of this row */ - short pd_idx; /* parity disk index */ - short qd_idx; /* 'Q' disk index for raid6 */ - short ddf_layout;/* use DDF ordering to calculate Q */ - unsigned long state; /* state flags */ - atomic_t count; /* nr of active thread/requests */ - int bm_seq; /* sequence number for bitmap flushes */ - int disks; /* disks in stripe */ - enum check_states check_state; - enum reconstruct_states reconstruct_state; - /** - * struct stripe_operations - * @target - STRIPE_OP_COMPUTE_BLK target - * @target2 - 2nd compute target in the raid6 case - * @zero_sum_result - P and Q verification flags - * @request - async service request flags for raid_run_ops - */ - struct stripe_operations { - int target, target2; - enum sum_check_flags zero_sum_result; - #ifdef CONFIG_MULTICORE_RAID456 - unsigned long request; - wait_queue_head_t wait_for_ops; - #endif - } ops; - struct r5dev { - /* rreq and rvec are used for the replacement device when - * writing data to both devices. - */ - struct bio req, rreq; - struct bio_vec vec, rvec; - struct page *page; - struct bio *toread, *read, *towrite, *written; - sector_t sector; /* sector of this page */ - unsigned long flags; - } dev[1]; /* allocated with extra space depending on RAID geometry */ -}; - -/* stripe_head_state - collects and tracks the dynamic state of a stripe_head - * for handle_stripe. - */ -struct stripe_head_state { - /* 'syncing' means that we need to read all devices, either - * to check/correct parity, or to reconstruct a missing device. - * 'replacing' means we are replacing one or more drives and - * the source is valid at this point so we don't need to - * read all devices, just the replacement targets. 
- */ - int syncing, expanding, expanded, replacing; - int locked, uptodate, to_read, to_write, failed, written; - int to_fill, compute, req_compute, non_overwrite; - int failed_num[2]; - int p_failed, q_failed; - int dec_preread_active; - unsigned long ops_request; - - struct bio *return_bi; - struct md_rdev *blocked_rdev; - int handle_bad_blocks; -}; - -/* Flags for struct r5dev.flags */ -enum r5dev_flags { - R5_UPTODATE, /* page contains current data */ - R5_LOCKED, /* IO has been submitted on "req" */ - R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */ - R5_OVERWRITE, /* towrite covers whole page */ -/* and some that are internal to handle_stripe */ - R5_Insync, /* rdev && rdev->in_sync at start */ - R5_Wantread, /* want to schedule a read */ - R5_Wantwrite, - R5_Overlap, /* There is a pending overlapping request - * on this block */ - R5_ReadError, /* seen a read error here recently */ - R5_ReWrite, /* have tried to over-write the readerror */ - - R5_Expanded, /* This block now has post-expand data */ - R5_Wantcompute, /* compute_block in progress treat as - * uptodate - */ - R5_Wantfill, /* dev->toread contains a bio that needs - * filling - */ - R5_Wantdrain, /* dev->towrite needs to be drained */ - R5_WantFUA, /* Write should be FUA */ - R5_WriteError, /* got a write error - need to record it */ - R5_MadeGood, /* A bad block has been fixed by writing to it */ - R5_ReadRepl, /* Will/did read from replacement rather than orig */ - R5_MadeGoodRepl,/* A bad block on the replacement device has been - * fixed by writing to it */ - R5_NeedReplace, /* This device has a replacement which is not - * up-to-date at this stripe. */ - R5_WantReplace, /* We need to update the replacement, we have read - * data in, and now is a good time to write it out. - */ -}; - -/* - * Stripe state - */ -enum { - STRIPE_ACTIVE, - STRIPE_HANDLE, - STRIPE_SYNC_REQUESTED, - STRIPE_SYNCING, - STRIPE_INSYNC, - STRIPE_PREREAD_ACTIVE, - STRIPE_DELAYED, - STRIPE_DEGRADED, - STRIPE_BIT_DELAY, - STRIPE_EXPANDING, - STRIPE_EXPAND_SOURCE, - STRIPE_EXPAND_READY, - STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */ - STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ - STRIPE_BIOFILL_RUN, - STRIPE_COMPUTE_RUN, - STRIPE_OPS_REQ_PENDING, -}; - -/* - * Operation request flags - */ -enum { - STRIPE_OP_BIOFILL, - STRIPE_OP_COMPUTE_BLK, - STRIPE_OP_PREXOR, - STRIPE_OP_BIODRAIN, - STRIPE_OP_RECONSTRUCT, - STRIPE_OP_CHECK, -}; -/* - * Plugging: - * - * To improve write throughput, we need to delay the handling of some - * stripes until there has been a chance that several write requests - * for the one stripe have all been collected. - * In particular, any write request that would require pre-reading - * is put on a "delayed" queue until there are no stripes currently - * in a pre-read phase. Further, if the "delayed" queue is empty when - * a stripe is put on it then we "plug" the queue and do not process it - * until an unplug call is made (the unplug_io_fn() is called). - * - * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add - * it to the count of prereading stripes. - * When write is initiated, or the stripe refcnt == 0 (just in case), we - * clear the PREREAD_ACTIVE flag and decrement the count. - * Whenever the 'handle' queue is empty and the device is not plugged, we - * move any stripes from delayed to handle and clear the DELAYED flag and set - * PREREAD_ACTIVE. 
- * In stripe_handle, if we find pre-reading is necessary, we do it if - * PREREAD_ACTIVE is set, else we set DELAYED, which will send it to the delayed queue. - * HANDLE gets cleared if stripe_handle leaves nothing locked. - */ - - -struct disk_info { - struct md_rdev *rdev, *replacement; -}; - -struct r5conf { - struct hlist_head *stripe_hashtbl; - struct mddev *mddev; - int chunk_sectors; - int level, algorithm; - int max_degraded; - int raid_disks; - int max_nr_stripes; - - /* reshape_progress is the leading edge of a 'reshape' - * It has value MaxSector when no reshape is happening - * If delta_disks < 0, it is the last sector we started work on, - * else it is the next sector to work on. - */ - sector_t reshape_progress; - /* reshape_safe is the trailing edge of a reshape. We know that - * before (or after) this address, all reshape has completed. - */ - sector_t reshape_safe; - int previous_raid_disks; - int prev_chunk_sectors; - int prev_algo; - short generation; /* increments with every reshape */ - unsigned long reshape_checkpoint; /* Time we last updated - * metadata */ - - struct list_head handle_list; /* stripes needing handling */ - struct list_head hold_list; /* preread ready stripes */ - struct list_head delayed_list; /* stripes that have plugged requests */ - struct list_head bitmap_list; /* stripes delayed awaiting bitmap update */ - struct bio *retry_read_aligned; /* currently retrying aligned bios */ - struct bio *retry_read_aligned_list; /* aligned bios retry list */ - atomic_t preread_active_stripes; /* stripes with scheduled io */ - atomic_t active_aligned_reads; - atomic_t pending_full_writes; /* full write backlog */ - int bypass_count; /* bypassed prereads */ - int bypass_threshold; /* preread nice */ - struct list_head *last_hold; /* detect hold_list promotions */ - - atomic_t reshape_stripes; /* stripes with pending writes for reshape */ - /* unfortunately we need two cache names as we temporarily have - * two caches. - */ - int active_name; - char cache_name[2][32]; - struct kmem_cache *slab_cache; /* for allocating stripes */ - - int seq_flush, seq_write; - int quiesce; - - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - int recovery_disabled; - /* per cpu variables */ - struct raid5_percpu { - struct page *spare_page; /* Used when checking P/Q in raid6 */ - void *scribble; /* space for constructing buffer - * lists and performing address - * conversions - */ - } __percpu *percpu; - size_t scribble_len; /* size of scribble region must be - * associated with conf to handle - * cpu hotplug while reshaping - */ -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif - - /* - * Free stripes pool - */ - atomic_t active_stripes; - struct list_head inactive_list; - wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; - int inactive_blocked; /* release of inactive stripes blocked, - * waiting for 25% to be free - */ - int pool_size; /* number of disks in stripeheads in pool */ - spinlock_t device_lock; - struct disk_info *disks; - - /* When taking over an array from a different personality, we store - * the new thread here until we fully activate the array. 
- */ - struct md_thread *thread; -}; - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */ -#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */ -#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */ -#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */ - -/* Define non-rotating (raid4) algorithms. These allow - * conversion of raid4 to raid5. - */ -#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ -#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ - -/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. - * Firstly, the exact positioning of the parity block is slightly - * different between the 'LEFT_*' modes of md and the "_N_*" modes - * of DDF. - * Secondly, the order of data blocks over which the Q syndrome is computed - * is different. - * Consequently we have different layouts for DDF/raid6 than md/raid6. - * These layouts are from the DDFv1.2 spec. - * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but - * leaves RLQ=3 as 'Vendor Specific' - */ - -#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ -#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ -#define ALGORITHM_ROTATING_N_CONTINUE 10 /* DDF PRL=6 RLQ=3 */ - - -/* For every RAID5 algorithm we define a RAID6 algorithm - * with exactly the same layout for data and parity, and - * with the Q block always on the last device (N-1). - * This allows trivial conversion from RAID5 to RAID6. - */ -#define ALGORITHM_LEFT_ASYMMETRIC_6 16 -#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 -#define ALGORITHM_LEFT_SYMMETRIC_6 18 -#define ALGORITHM_RIGHT_SYMMETRIC_6 19 -#define ALGORITHM_PARITY_0_6 20 -#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N - -static inline int algorithm_valid_raid5(int layout) -{ - return (layout >= 0) && - (layout <= 5); -} -static inline int algorithm_valid_raid6(int layout) -{ - return (layout >= 0 && layout <= 5) - || - (layout >= 8 && layout <= 10) - || - (layout >= 16 && layout <= 20); -} - -static inline int algorithm_is_DDF(int layout) -{ - return layout >= 8 && layout <= 10; -} - -extern int md_raid5_congested(struct mddev *mddev, int bits); -extern void md_raid5_kick_device(struct r5conf *conf); -extern int raid5_set_cache_size(struct mddev *mddev, int size);
-#endif
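
Editorial note: the ALGORITHM_* numbering above and the raid5-to-raid6 layout mapping applied by raid6_takeover() can be exercised outside the kernel. The sketch below is illustrative only and is not part of the driver: the raid5_to_raid6_layout() helper, main() and printf scaffolding are additions for this note, while the constants, the switch mapping and the algorithm_valid_raid6() ranges are copied from the code shown in this diff.

/*
 * Standalone user-space illustration (not kernel code).
 * Mirrors the ALGORITHM_* constants from raid5.h and the raid5 -> raid6
 * layout mapping used by raid6_takeover(), so the numbering can be
 * sanity-checked in isolation.
 */
#include <stdio.h>

#define ALGORITHM_LEFT_ASYMMETRIC	0
#define ALGORITHM_RIGHT_ASYMMETRIC	1
#define ALGORITHM_LEFT_SYMMETRIC	2
#define ALGORITHM_RIGHT_SYMMETRIC	3
#define ALGORITHM_PARITY_0		4
#define ALGORITHM_PARITY_N		5

#define ALGORITHM_LEFT_ASYMMETRIC_6	16
#define ALGORITHM_RIGHT_ASYMMETRIC_6	17
#define ALGORITHM_LEFT_SYMMETRIC_6	18
#define ALGORITHM_RIGHT_SYMMETRIC_6	19
#define ALGORITHM_PARITY_0_6		20
#define ALGORITHM_PARITY_N_6		ALGORITHM_PARITY_N

/* Same mapping as the switch in raid6_takeover(); -1 means "no takeover". */
static int raid5_to_raid6_layout(int layout)
{
	switch (layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:	 return ALGORITHM_LEFT_ASYMMETRIC_6;
	case ALGORITHM_RIGHT_ASYMMETRIC: return ALGORITHM_RIGHT_ASYMMETRIC_6;
	case ALGORITHM_LEFT_SYMMETRIC:	 return ALGORITHM_LEFT_SYMMETRIC_6;
	case ALGORITHM_RIGHT_SYMMETRIC:	 return ALGORITHM_RIGHT_SYMMETRIC_6;
	case ALGORITHM_PARITY_0:	 return ALGORITHM_PARITY_0_6;
	case ALGORITHM_PARITY_N:	 return ALGORITHM_PARITY_N;
	default:			 return -1;
	}
}

/* Same ranges as algorithm_valid_raid6() in raid5.h. */
static int layout_valid_raid6(int layout)
{
	return (layout >= 0 && layout <= 5) ||
	       (layout >= 8 && layout <= 10) ||
	       (layout >= 16 && layout <= 20);
}

int main(void)
{
	int layout;

	for (layout = 0; layout <= 5; layout++) {
		int l6 = raid5_to_raid6_layout(layout);

		printf("raid5 layout %d -> raid6 layout %d (valid for raid6: %s)\n",
		       layout, l6, layout_valid_raid6(l6) ? "yes" : "no");
	}
	return 0;
}

Each *_6 layout keeps the raid5 data and parity placement and puts the Q block on the last device, which is why raid6_takeover() only needs to pick the corresponding layout number, set new_level to 6 and grow raid_disks by one.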