diff options
Diffstat (limited to 'drivers/block/aoe')
-rw-r--r-- | drivers/block/aoe/Makefile | 6 | ||||
-rw-r--r-- | drivers/block/aoe/aoe.h | 203 | ||||
-rw-r--r-- | drivers/block/aoe/aoeblk.c | 331 | ||||
-rw-r--r-- | drivers/block/aoe/aoechr.c | 315 | ||||
-rw-r--r-- | drivers/block/aoe/aoecmd.c | 1083 | ||||
-rw-r--r-- | drivers/block/aoe/aoedev.c | 277 | ||||
-rw-r--r-- | drivers/block/aoe/aoemain.c | 111 | ||||
-rw-r--r-- | drivers/block/aoe/aoenet.c | 172 |
8 files changed, 2498 insertions, 0 deletions
diff --git a/drivers/block/aoe/Makefile b/drivers/block/aoe/Makefile new file mode 100644 index 00000000..06ea82cd --- /dev/null +++ b/drivers/block/aoe/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for ATA over Ethernet +# + +obj-$(CONFIG_ATA_OVER_ETH) += aoe.o +aoe-y := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h new file mode 100644 index 00000000..db195aba --- /dev/null +++ b/drivers/block/aoe/aoe.h @@ -0,0 +1,203 @@ +/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +#define VERSION "47" +#define AOE_MAJOR 152 +#define DEVICE_NAME "aoe" + +/* set AOE_PARTITIONS to 1 to use whole-disks only + * default is 16, which is 15 partitions plus the whole disk + */ +#ifndef AOE_PARTITIONS +#define AOE_PARTITIONS (16) +#endif + +#define SYSMINOR(aoemajor, aoeminor) ((aoemajor) * NPERSHELF + (aoeminor)) +#define AOEMAJOR(sysminor) ((sysminor) / NPERSHELF) +#define AOEMINOR(sysminor) ((sysminor) % NPERSHELF) +#define WHITESPACE " \t\v\f\n" + +enum { + AOECMD_ATA, + AOECMD_CFG, + AOECMD_VEND_MIN = 0xf0, + + AOEFL_RSP = (1<<3), + AOEFL_ERR = (1<<2), + + AOEAFL_EXT = (1<<6), + AOEAFL_DEV = (1<<4), + AOEAFL_ASYNC = (1<<1), + AOEAFL_WRITE = (1<<0), + + AOECCMD_READ = 0, + AOECCMD_TEST, + AOECCMD_PTEST, + AOECCMD_SET, + AOECCMD_FSET, + + AOE_HVER = 0x10, +}; + +struct aoe_hdr { + unsigned char dst[6]; + unsigned char src[6]; + __be16 type; + unsigned char verfl; + unsigned char err; + __be16 major; + unsigned char minor; + unsigned char cmd; + __be32 tag; +}; + +struct aoe_atahdr { + unsigned char aflags; + unsigned char errfeat; + unsigned char scnt; + unsigned char cmdstat; + unsigned char lba0; + unsigned char lba1; + unsigned char lba2; + unsigned char lba3; + unsigned char lba4; + unsigned char lba5; + unsigned char res[2]; +}; + +struct aoe_cfghdr { + __be16 bufcnt; + __be16 fwver; + unsigned char scnt; + unsigned char aoeccmd; + unsigned char cslen[2]; +}; + +enum { + DEVFL_UP = 1, /* 
device is installed in system and ready for AoE->ATA commands */ + DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */ + DEVFL_EXT = (1<<2), /* device accepts lba48 commands */ + DEVFL_CLOSEWAIT = (1<<3), /* device is waiting for all closes to revalidate */ + DEVFL_GDALLOC = (1<<4), /* need to alloc gendisk */ + DEVFL_KICKME = (1<<5), /* slow polling network card catch */ + DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */ + + BUFFL_FAIL = 1, +}; + +enum { + DEFAULTBCNT = 2 * 512, /* 2 sectors */ + NPERSHELF = 16, /* number of slots per shelf address */ + FREETAG = -1, + MIN_BUFS = 16, + NTARGETS = 8, + NAOEIFS = 8, + NSKBPOOLMAX = 128, + + TIMERTICK = HZ / 10, + MINTIMER = HZ >> 2, + MAXTIMER = HZ << 1, + HELPWAIT = 20, +}; + +struct buf { + struct list_head bufs; + ulong stime; /* for disk stats */ + ulong flags; + ulong nframesout; + ulong resid; + ulong bv_resid; + ulong bv_off; + sector_t sector; + struct bio *bio; + struct bio_vec *bv; +}; + +struct frame { + int tag; + ulong waited; + struct buf *buf; + char *bufaddr; + ulong bcnt; + sector_t lba; + struct sk_buff *skb; +}; + +struct aoeif { + struct net_device *nd; + unsigned char lost; + unsigned char lostjumbo; + ushort maxbcnt; +}; + +struct aoetgt { + unsigned char addr[6]; + ushort nframes; + struct frame *frames; + struct aoeif ifs[NAOEIFS]; + struct aoeif *ifp; /* current aoeif in use */ + ushort nout; + ushort maxout; + u16 lasttag; /* last tag sent */ + u16 useme; + ulong lastwadj; /* last window adjustment */ + int wpkts, rpkts; + int dataref; +}; + +struct aoedev { + struct aoedev *next; + ulong sysminor; + ulong aoemajor; + u16 aoeminor; + u16 flags; + u16 nopen; /* (bd_openers isn't available without sleeping) */ + u16 rttavg; /* round trip average of requests/responses */ + u16 mintimer; + u16 fw_ver; /* version of blade's firmware */ + struct work_struct work;/* disk create work struct */ + struct gendisk *gd; + struct request_queue *blkq; + struct 
hd_geometry geo; + sector_t ssize; + struct timer_list timer; + spinlock_t lock; + struct sk_buff_head sendq; + struct sk_buff_head skbpool; + mempool_t *bufpool; /* for deadlock-free Buf allocation */ + struct list_head bufq; /* queue of bios to work on */ + struct buf *inprocess; /* the one we're currently working on */ + struct aoetgt *targets[NTARGETS]; + struct aoetgt **tgt; /* target in use when working */ + struct aoetgt **htgt; /* target needing rexmit assistance */ +}; + + +int aoeblk_init(void); +void aoeblk_exit(void); +void aoeblk_gdalloc(void *); +void aoedisk_rm_sysfs(struct aoedev *d); + +int aoechr_init(void); +void aoechr_exit(void); +void aoechr_error(char *); + +void aoecmd_work(struct aoedev *d); +void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor); +void aoecmd_ata_rsp(struct sk_buff *); +void aoecmd_cfg_rsp(struct sk_buff *); +void aoecmd_sleepwork(struct work_struct *); +void aoecmd_cleanslate(struct aoedev *); +struct sk_buff *aoecmd_ata_id(struct aoedev *); + +int aoedev_init(void); +void aoedev_exit(void); +struct aoedev *aoedev_by_aoeaddr(int maj, int min); +struct aoedev *aoedev_by_sysminor_m(ulong sysminor); +void aoedev_downdev(struct aoedev *d); +int aoedev_flush(const char __user *str, size_t size); + +int aoenet_init(void); +void aoenet_exit(void); +void aoenet_xmit(struct sk_buff_head *); +int is_aoe_netif(struct net_device *ifp); +int set_aoe_iflist(const char __user *str, size_t size); + diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c new file mode 100644 index 00000000..321de7b6 --- /dev/null +++ b/drivers/block/aoe/aoeblk.c @@ -0,0 +1,331 @@ +/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. 
*/ +/* + * aoeblk.c + * block device routines + */ + +#include <linux/kernel.h> +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/fs.h> +#include <linux/ioctl.h> +#include <linux/slab.h> +#include <linux/ratelimit.h> +#include <linux/genhd.h> +#include <linux/netdevice.h> +#include <linux/mutex.h> +#include <linux/export.h> +#include "aoe.h" + +static DEFINE_MUTEX(aoeblk_mutex); +static struct kmem_cache *buf_pool_cache; + +static ssize_t aoedisk_show_state(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + + return snprintf(page, PAGE_SIZE, + "%s%s\n", + (d->flags & DEVFL_UP) ? "up" : "down", + (d->flags & DEVFL_KICKME) ? ",kickme" : + (d->nopen && !(d->flags & DEVFL_UP)) ? ",closewait" : ""); + /* I'd rather see nopen exported so we can ditch closewait */ +} +static ssize_t aoedisk_show_mac(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + struct aoetgt *t = d->targets[0]; + + if (t == NULL) + return snprintf(page, PAGE_SIZE, "none\n"); + return snprintf(page, PAGE_SIZE, "%pm\n", t->addr); +} +static ssize_t aoedisk_show_netif(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + struct net_device *nds[8], **nd, **nnd, **ne; + struct aoetgt **t, **te; + struct aoeif *ifp, *e; + char *p; + + memset(nds, 0, sizeof nds); + nd = nds; + ne = nd + ARRAY_SIZE(nds); + t = d->targets; + te = t + NTARGETS; + for (; t < te && *t; t++) { + ifp = (*t)->ifs; + e = ifp + NAOEIFS; + for (; ifp < e && ifp->nd; ifp++) { + for (nnd = nds; nnd < nd; nnd++) + if (*nnd == ifp->nd) + break; + if (nnd == nd && nd != ne) + *nd++ = ifp->nd; + } + } + + ne = nd; + nd = nds; + if (*nd == NULL) + return snprintf(page, PAGE_SIZE, "none\n"); 
+ for (p = page; nd < ne; nd++) + p += snprintf(p, PAGE_SIZE - (p-page), "%s%s", + p == page ? "" : ",", (*nd)->name); + p += snprintf(p, PAGE_SIZE - (p-page), "\n"); + return p-page; +} +/* firmware version */ +static ssize_t aoedisk_show_fwver(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + + return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver); +} + +static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); +static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); +static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL); +static struct device_attribute dev_attr_firmware_version = { + .attr = { .name = "firmware-version", .mode = S_IRUGO }, + .show = aoedisk_show_fwver, +}; + +static struct attribute *aoe_attrs[] = { + &dev_attr_state.attr, + &dev_attr_mac.attr, + &dev_attr_netif.attr, + &dev_attr_firmware_version.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = aoe_attrs, +}; + +static int +aoedisk_add_sysfs(struct aoedev *d) +{ + return sysfs_create_group(&disk_to_dev(d->gd)->kobj, &attr_group); +} +void +aoedisk_rm_sysfs(struct aoedev *d) +{ + sysfs_remove_group(&disk_to_dev(d->gd)->kobj, &attr_group); +} + +static int +aoeblk_open(struct block_device *bdev, fmode_t mode) +{ + struct aoedev *d = bdev->bd_disk->private_data; + ulong flags; + + mutex_lock(&aoeblk_mutex); + spin_lock_irqsave(&d->lock, flags); + if (d->flags & DEVFL_UP) { + d->nopen++; + spin_unlock_irqrestore(&d->lock, flags); + mutex_unlock(&aoeblk_mutex); + return 0; + } + spin_unlock_irqrestore(&d->lock, flags); + mutex_unlock(&aoeblk_mutex); + return -ENODEV; +} + +static int +aoeblk_release(struct gendisk *disk, fmode_t mode) +{ + struct aoedev *d = disk->private_data; + ulong flags; + + spin_lock_irqsave(&d->lock, flags); + + if (--d->nopen == 0) { + spin_unlock_irqrestore(&d->lock, flags); + aoecmd_cfg(d->aoemajor, 
d->aoeminor); + return 0; + } + spin_unlock_irqrestore(&d->lock, flags); + + return 0; +} + +static void +aoeblk_make_request(struct request_queue *q, struct bio *bio) +{ + struct sk_buff_head queue; + struct aoedev *d; + struct buf *buf; + ulong flags; + + blk_queue_bounce(q, &bio); + + if (bio == NULL) { + printk(KERN_ERR "aoe: bio is NULL\n"); + BUG(); + return; + } + d = bio->bi_bdev->bd_disk->private_data; + if (d == NULL) { + printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n"); + BUG(); + bio_endio(bio, -ENXIO); + return; + } else if (bio->bi_io_vec == NULL) { + printk(KERN_ERR "aoe: bi_io_vec is NULL\n"); + BUG(); + bio_endio(bio, -ENXIO); + return; + } + buf = mempool_alloc(d->bufpool, GFP_NOIO); + if (buf == NULL) { + printk(KERN_INFO "aoe: buf allocation failure\n"); + bio_endio(bio, -ENOMEM); + return; + } + memset(buf, 0, sizeof(*buf)); + INIT_LIST_HEAD(&buf->bufs); + buf->stime = jiffies; + buf->bio = bio; + buf->resid = bio->bi_size; + buf->sector = bio->bi_sector; + buf->bv = &bio->bi_io_vec[bio->bi_idx]; + buf->bv_resid = buf->bv->bv_len; + WARN_ON(buf->bv_resid == 0); + buf->bv_off = buf->bv->bv_offset; + + spin_lock_irqsave(&d->lock, flags); + + if ((d->flags & DEVFL_UP) == 0) { + pr_info_ratelimited("aoe: device %ld.%d is not up\n", + d->aoemajor, d->aoeminor); + spin_unlock_irqrestore(&d->lock, flags); + mempool_free(buf, d->bufpool); + bio_endio(bio, -ENXIO); + return; + } + + list_add_tail(&buf->bufs, &d->bufq); + + aoecmd_work(d); + __skb_queue_head_init(&queue); + skb_queue_splice_init(&d->sendq, &queue); + + spin_unlock_irqrestore(&d->lock, flags); + aoenet_xmit(&queue); +} + +static int +aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct aoedev *d = bdev->bd_disk->private_data; + + if ((d->flags & DEVFL_UP) == 0) { + printk(KERN_ERR "aoe: disk not up\n"); + return -ENODEV; + } + + geo->cylinders = d->geo.cylinders; + geo->heads = d->geo.heads; + geo->sectors = d->geo.sectors; + return 0; +} + +static const 
struct block_device_operations aoe_bdops = { + .open = aoeblk_open, + .release = aoeblk_release, + .getgeo = aoeblk_getgeo, + .owner = THIS_MODULE, +}; + +/* alloc_disk and add_disk can sleep */ +void +aoeblk_gdalloc(void *vp) +{ + struct aoedev *d = vp; + struct gendisk *gd; + ulong flags; + + gd = alloc_disk(AOE_PARTITIONS); + if (gd == NULL) { + printk(KERN_ERR + "aoe: cannot allocate disk structure for %ld.%d\n", + d->aoemajor, d->aoeminor); + goto err; + } + + d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache); + if (d->bufpool == NULL) { + printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", + d->aoemajor, d->aoeminor); + goto err_disk; + } + + d->blkq = blk_alloc_queue(GFP_KERNEL); + if (!d->blkq) + goto err_mempool; + blk_queue_make_request(d->blkq, aoeblk_make_request); + d->blkq->backing_dev_info.name = "aoe"; + if (bdi_init(&d->blkq->backing_dev_info)) + goto err_blkq; + spin_lock_irqsave(&d->lock, flags); + gd->major = AOE_MAJOR; + gd->first_minor = d->sysminor * AOE_PARTITIONS; + gd->fops = &aoe_bdops; + gd->private_data = d; + set_capacity(gd, d->ssize); + snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d", + d->aoemajor, d->aoeminor); + + gd->queue = d->blkq; + d->gd = gd; + d->flags &= ~DEVFL_GDALLOC; + d->flags |= DEVFL_UP; + + spin_unlock_irqrestore(&d->lock, flags); + + add_disk(gd); + aoedisk_add_sysfs(d); + return; + +err_blkq: + blk_cleanup_queue(d->blkq); + d->blkq = NULL; +err_mempool: + mempool_destroy(d->bufpool); +err_disk: + put_disk(gd); +err: + spin_lock_irqsave(&d->lock, flags); + d->flags &= ~DEVFL_GDALLOC; + spin_unlock_irqrestore(&d->lock, flags); +} + +void +aoeblk_exit(void) +{ + kmem_cache_destroy(buf_pool_cache); +} + +int __init +aoeblk_init(void) +{ + buf_pool_cache = kmem_cache_create("aoe_bufs", + sizeof(struct buf), + 0, 0, NULL); + if (buf_pool_cache == NULL) + return -ENOMEM; + + return 0; +} + diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c new file mode 100644 index 
00000000..e86d2062 --- /dev/null +++ b/drivers/block/aoe/aoechr.c @@ -0,0 +1,315 @@ +/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoechr.c + * AoE character device driver + */ + +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/completion.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <linux/export.h> +#include "aoe.h" + +enum { + //MINOR_STAT = 1, (moved to sysfs) + MINOR_ERR = 2, + MINOR_DISCOVER, + MINOR_INTERFACES, + MINOR_REVALIDATE, + MINOR_FLUSH, + MSGSZ = 2048, + NMSG = 100, /* message backlog to retain */ +}; + +struct aoe_chardev { + ulong minor; + char name[32]; +}; + +enum { EMFL_VALID = 1 }; + +struct ErrMsg { + short flags; + short len; + char *msg; +}; + +static DEFINE_MUTEX(aoechr_mutex); +static struct ErrMsg emsgs[NMSG]; +static int emsgs_head_idx, emsgs_tail_idx; +static struct completion emsgs_comp; +static spinlock_t emsgs_lock; +static int nblocked_emsgs_readers; +static struct class *aoe_class; +static struct aoe_chardev chardevs[] = { + { MINOR_ERR, "err" }, + { MINOR_DISCOVER, "discover" }, + { MINOR_INTERFACES, "interfaces" }, + { MINOR_REVALIDATE, "revalidate" }, + { MINOR_FLUSH, "flush" }, +}; + +static int +discover(void) +{ + aoecmd_cfg(0xffff, 0xff); + return 0; +} + +static int +interfaces(const char __user *str, size_t size) +{ + if (set_aoe_iflist(str, size)) { + printk(KERN_ERR + "aoe: could not set interface list: too many interfaces\n"); + return -EINVAL; + } + return 0; +} + +static int +revalidate(const char __user *str, size_t size) +{ + int major, minor, n; + ulong flags; + struct aoedev *d; + struct sk_buff *skb; + char buf[16]; + + if (size >= sizeof buf) + return -EINVAL; + buf[sizeof buf - 1] = '\0'; + if (copy_from_user(buf, str, size)) + return -EFAULT; + + /* should be e%d.%d format */ + n = sscanf(buf, "e%d.%d", &major, &minor); + if (n != 2) { + printk(KERN_ERR "aoe: invalid device specification\n"); 
+ return -EINVAL; + } + d = aoedev_by_aoeaddr(major, minor); + if (!d) + return -EINVAL; + spin_lock_irqsave(&d->lock, flags); + aoecmd_cleanslate(d); +loop: + skb = aoecmd_ata_id(d); + spin_unlock_irqrestore(&d->lock, flags); + /* try again if we are able to sleep a bit, + * otherwise give up this revalidation + */ + if (!skb && !msleep_interruptible(200)) { + spin_lock_irqsave(&d->lock, flags); + goto loop; + } + if (skb) { + struct sk_buff_head queue; + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, skb); + aoenet_xmit(&queue); + } + aoecmd_cfg(major, minor); + return 0; +} + +void +aoechr_error(char *msg) +{ + struct ErrMsg *em; + char *mp; + ulong flags, n; + + n = strlen(msg); + + spin_lock_irqsave(&emsgs_lock, flags); + + em = emsgs + emsgs_tail_idx; + if ((em->flags & EMFL_VALID)) { +bail: spin_unlock_irqrestore(&emsgs_lock, flags); + return; + } + + mp = kmalloc(n, GFP_ATOMIC); + if (mp == NULL) { + printk(KERN_ERR "aoe: allocation failure, len=%ld\n", n); + goto bail; + } + + memcpy(mp, msg, n); + em->msg = mp; + em->flags |= EMFL_VALID; + em->len = n; + + emsgs_tail_idx++; + emsgs_tail_idx %= ARRAY_SIZE(emsgs); + + spin_unlock_irqrestore(&emsgs_lock, flags); + + if (nblocked_emsgs_readers) + complete(&emsgs_comp); +} + +static ssize_t +aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp) +{ + int ret = -EINVAL; + + switch ((unsigned long) filp->private_data) { + default: + printk(KERN_INFO "aoe: can't write to that file.\n"); + break; + case MINOR_DISCOVER: + ret = discover(); + break; + case MINOR_INTERFACES: + ret = interfaces(buf, cnt); + break; + case MINOR_REVALIDATE: + ret = revalidate(buf, cnt); + break; + case MINOR_FLUSH: + ret = aoedev_flush(buf, cnt); + } + if (ret == 0) + ret = cnt; + return ret; +} + +static int +aoechr_open(struct inode *inode, struct file *filp) +{ + int n, i; + + mutex_lock(&aoechr_mutex); + n = iminor(inode); + filp->private_data = (void *) (unsigned long) n; + + for (i = 0; i < 
ARRAY_SIZE(chardevs); ++i) + if (chardevs[i].minor == n) { + mutex_unlock(&aoechr_mutex); + return 0; + } + mutex_unlock(&aoechr_mutex); + return -EINVAL; +} + +static int +aoechr_rel(struct inode *inode, struct file *filp) +{ + return 0; +} + +static ssize_t +aoechr_read(struct file *filp, char __user *buf, size_t cnt, loff_t *off) +{ + unsigned long n; + char *mp; + struct ErrMsg *em; + ssize_t len; + ulong flags; + + n = (unsigned long) filp->private_data; + if (n != MINOR_ERR) + return -EFAULT; + + spin_lock_irqsave(&emsgs_lock, flags); + + for (;;) { + em = emsgs + emsgs_head_idx; + if ((em->flags & EMFL_VALID) != 0) + break; + if (filp->f_flags & O_NDELAY) { + spin_unlock_irqrestore(&emsgs_lock, flags); + return -EAGAIN; + } + nblocked_emsgs_readers++; + + spin_unlock_irqrestore(&emsgs_lock, flags); + + n = wait_for_completion_interruptible(&emsgs_comp); + + spin_lock_irqsave(&emsgs_lock, flags); + + nblocked_emsgs_readers--; + + if (n) { + spin_unlock_irqrestore(&emsgs_lock, flags); + return -ERESTARTSYS; + } + } + if (em->len > cnt) { + spin_unlock_irqrestore(&emsgs_lock, flags); + return -EAGAIN; + } + mp = em->msg; + len = em->len; + em->msg = NULL; + em->flags &= ~EMFL_VALID; + + emsgs_head_idx++; + emsgs_head_idx %= ARRAY_SIZE(emsgs); + + spin_unlock_irqrestore(&emsgs_lock, flags); + + n = copy_to_user(buf, mp, len); + kfree(mp); + return n == 0 ? 
len : -EFAULT; +} + +static const struct file_operations aoe_fops = { + .write = aoechr_write, + .read = aoechr_read, + .open = aoechr_open, + .release = aoechr_rel, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static char *aoe_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev)); +} + +int __init +aoechr_init(void) +{ + int n, i; + + n = register_chrdev(AOE_MAJOR, "aoechr", &aoe_fops); + if (n < 0) { + printk(KERN_ERR "aoe: can't register char device\n"); + return n; + } + init_completion(&emsgs_comp); + spin_lock_init(&emsgs_lock); + aoe_class = class_create(THIS_MODULE, "aoe"); + if (IS_ERR(aoe_class)) { + unregister_chrdev(AOE_MAJOR, "aoechr"); + return PTR_ERR(aoe_class); + } + aoe_class->devnode = aoe_devnode; + + for (i = 0; i < ARRAY_SIZE(chardevs); ++i) + device_create(aoe_class, NULL, + MKDEV(AOE_MAJOR, chardevs[i].minor), NULL, + chardevs[i].name); + + return 0; +} + +void +aoechr_exit(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(chardevs); ++i) + device_destroy(aoe_class, MKDEV(AOE_MAJOR, chardevs[i].minor)); + class_destroy(aoe_class); + unregister_chrdev(AOE_MAJOR, "aoechr"); +} + diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c new file mode 100644 index 00000000..de0435e6 --- /dev/null +++ b/drivers/block/aoe/aoecmd.c @@ -0,0 +1,1083 @@ +/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. 
*/ +/* + * aoecmd.c + * Filesystem request handling methods + */ + +#include <linux/ata.h> +#include <linux/slab.h> +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/genhd.h> +#include <linux/moduleparam.h> +#include <net/net_namespace.h> +#include <asm/unaligned.h> +#include "aoe.h" + +static int aoe_deadsecs = 60 * 3; +module_param(aoe_deadsecs, int, 0644); +MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); + +static int aoe_maxout = 16; +module_param(aoe_maxout, int, 0644); +MODULE_PARM_DESC(aoe_maxout, + "Only aoe_maxout outstanding packets for every MAC on eX.Y."); + +static struct sk_buff * +new_skb(ulong len) +{ + struct sk_buff *skb; + + skb = alloc_skb(len, GFP_ATOMIC); + if (skb) { + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb->protocol = __constant_htons(ETH_P_AOE); + } + return skb; +} + +static struct frame * +getframe(struct aoetgt *t, int tag) +{ + struct frame *f, *e; + + f = t->frames; + e = f + t->nframes; + for (; f<e; f++) + if (f->tag == tag) + return f; + return NULL; +} + +/* + * Leave the top bit clear so we have tagspace for userland. + * The bottom 16 bits are the xmit tick for rexmit/rttavg processing. + * This driver reserves tag -1 to mean "unused frame." 
+ */ +static int +newtag(struct aoetgt *t) +{ + register ulong n; + + n = jiffies & 0xffff; + return n |= (++t->lasttag & 0x7fff) << 16; +} + +static int +aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h) +{ + u32 host_tag = newtag(t); + + memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); + memcpy(h->dst, t->addr, sizeof h->dst); + h->type = __constant_cpu_to_be16(ETH_P_AOE); + h->verfl = AOE_HVER; + h->major = cpu_to_be16(d->aoemajor); + h->minor = d->aoeminor; + h->cmd = AOECMD_ATA; + h->tag = cpu_to_be32(host_tag); + + return host_tag; +} + +static inline void +put_lba(struct aoe_atahdr *ah, sector_t lba) +{ + ah->lba0 = lba; + ah->lba1 = lba >>= 8; + ah->lba2 = lba >>= 8; + ah->lba3 = lba >>= 8; + ah->lba4 = lba >>= 8; + ah->lba5 = lba >>= 8; +} + +static void +ifrotate(struct aoetgt *t) +{ + t->ifp++; + if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL) + t->ifp = t->ifs; + if (t->ifp->nd == NULL) { + printk(KERN_INFO "aoe: no interface to rotate to\n"); + BUG(); + } +} + +static void +skb_pool_put(struct aoedev *d, struct sk_buff *skb) +{ + __skb_queue_tail(&d->skbpool, skb); +} + +static struct sk_buff * +skb_pool_get(struct aoedev *d) +{ + struct sk_buff *skb = skb_peek(&d->skbpool); + + if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) { + __skb_unlink(skb, &d->skbpool); + return skb; + } + if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX && + (skb = new_skb(ETH_ZLEN))) + return skb; + + return NULL; +} + +/* freeframe is where we do our load balancing so it's a little hairy. 
*/ +static struct frame * +freeframe(struct aoedev *d) +{ + struct frame *f, *e, *rf; + struct aoetgt **t; + struct sk_buff *skb; + + if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */ + printk(KERN_ERR "aoe: NULL TARGETS!\n"); + return NULL; + } + t = d->tgt; + t++; + if (t >= &d->targets[NTARGETS] || !*t) + t = d->targets; + for (;;) { + if ((*t)->nout < (*t)->maxout + && t != d->htgt + && (*t)->ifp->nd) { + rf = NULL; + f = (*t)->frames; + e = f + (*t)->nframes; + for (; f < e; f++) { + if (f->tag != FREETAG) + continue; + skb = f->skb; + if (!skb + && !(f->skb = skb = new_skb(ETH_ZLEN))) + continue; + if (atomic_read(&skb_shinfo(skb)->dataref) + != 1) { + if (!rf) + rf = f; + continue; + } +gotone: skb_shinfo(skb)->nr_frags = skb->data_len = 0; + skb_trim(skb, 0); + d->tgt = t; + ifrotate(*t); + return f; + } + /* Work can be done, but the network layer is + holding our precious packets. Try to grab + one from the pool. */ + f = rf; + if (f == NULL) { /* more paranoia */ + printk(KERN_ERR + "aoe: freeframe: %s.\n", + "unexpected null rf"); + d->flags |= DEVFL_KICKME; + return NULL; + } + skb = skb_pool_get(d); + if (skb) { + skb_pool_put(d, f->skb); + f->skb = skb; + goto gotone; + } + (*t)->dataref++; + if ((*t)->nout == 0) + d->flags |= DEVFL_KICKME; + } + if (t == d->tgt) /* we've looped and found nada */ + break; + t++; + if (t >= &d->targets[NTARGETS] || !*t) + t = d->targets; + } + return NULL; +} + +static int +aoecmd_ata_rw(struct aoedev *d) +{ + struct frame *f; + struct aoe_hdr *h; + struct aoe_atahdr *ah; + struct buf *buf; + struct bio_vec *bv; + struct aoetgt *t; + struct sk_buff *skb; + ulong bcnt; + char writebit, extbit; + + writebit = 0x10; + extbit = 0x4; + + f = freeframe(d); + if (f == NULL) + return 0; + t = *d->tgt; + buf = d->inprocess; + bv = buf->bv; + bcnt = t->ifp->maxbcnt; + if (bcnt == 0) + bcnt = DEFAULTBCNT; + if (bcnt > buf->bv_resid) + bcnt = buf->bv_resid; + /* initialize the headers & frame */ + skb = 
f->skb; + h = (struct aoe_hdr *) skb_mac_header(skb); + ah = (struct aoe_atahdr *) (h+1); + skb_put(skb, sizeof *h + sizeof *ah); + memset(h, 0, skb->len); + f->tag = aoehdr_atainit(d, t, h); + t->nout++; + f->waited = 0; + f->buf = buf; + f->bufaddr = page_address(bv->bv_page) + buf->bv_off; + f->bcnt = bcnt; + f->lba = buf->sector; + + /* set up ata header */ + ah->scnt = bcnt >> 9; + put_lba(ah, buf->sector); + if (d->flags & DEVFL_EXT) { + ah->aflags |= AOEAFL_EXT; + } else { + extbit = 0; + ah->lba3 &= 0x0f; + ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */ + } + if (bio_data_dir(buf->bio) == WRITE) { + skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt); + ah->aflags |= AOEAFL_WRITE; + skb->len += bcnt; + skb->data_len = bcnt; + t->wpkts++; + } else { + t->rpkts++; + writebit = 0; + } + + ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit; + + /* mark all tracking fields and load out */ + buf->nframesout += 1; + buf->bv_off += bcnt; + buf->bv_resid -= bcnt; + buf->resid -= bcnt; + buf->sector += bcnt >> 9; + if (buf->resid == 0) { + d->inprocess = NULL; + } else if (buf->bv_resid == 0) { + buf->bv = ++bv; + buf->bv_resid = bv->bv_len; + WARN_ON(buf->bv_resid == 0); + buf->bv_off = bv->bv_offset; + } + + skb->dev = t->ifp->nd; + skb = skb_clone(skb, GFP_ATOMIC); + if (skb) + __skb_queue_tail(&d->sendq, skb); + return 1; +} + +/* some callers cannot sleep, and they can call this function, + * transmitting the packets later, when interrupts are on + */ +static void +aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue) +{ + struct aoe_hdr *h; + struct aoe_cfghdr *ch; + struct sk_buff *skb; + struct net_device *ifp; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, ifp) { + dev_hold(ifp); + if (!is_aoe_netif(ifp)) + goto cont; + + skb = new_skb(sizeof *h + sizeof *ch); + if (skb == NULL) { + printk(KERN_INFO "aoe: skb alloc failure\n"); + goto cont; + } + skb_put(skb, sizeof *h + sizeof *ch); + skb->dev = ifp; + 
__skb_queue_tail(queue, skb); + h = (struct aoe_hdr *) skb_mac_header(skb); + memset(h, 0, sizeof *h + sizeof *ch); + + memset(h->dst, 0xff, sizeof h->dst); + memcpy(h->src, ifp->dev_addr, sizeof h->src); + h->type = __constant_cpu_to_be16(ETH_P_AOE); + h->verfl = AOE_HVER; + h->major = cpu_to_be16(aoemajor); + h->minor = aoeminor; + h->cmd = AOECMD_CFG; + +cont: + dev_put(ifp); + } + rcu_read_unlock(); +} + +static void +resend(struct aoedev *d, struct aoetgt *t, struct frame *f) +{ + struct sk_buff *skb; + struct aoe_hdr *h; + struct aoe_atahdr *ah; + char buf[128]; + u32 n; + + ifrotate(t); + n = newtag(t); + skb = f->skb; + h = (struct aoe_hdr *) skb_mac_header(skb); + ah = (struct aoe_atahdr *) (h+1); + + snprintf(buf, sizeof buf, + "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n", + "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n, + h->src, h->dst, t->nout); + aoechr_error(buf); + + f->tag = n; + h->tag = cpu_to_be32(n); + memcpy(h->dst, t->addr, sizeof h->dst); + memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); + + switch (ah->cmdstat) { + default: + break; + case ATA_CMD_PIO_READ: + case ATA_CMD_PIO_READ_EXT: + case ATA_CMD_PIO_WRITE: + case ATA_CMD_PIO_WRITE_EXT: + put_lba(ah, f->lba); + + n = f->bcnt; + if (n > DEFAULTBCNT) + n = DEFAULTBCNT; + ah->scnt = n >> 9; + if (ah->aflags & AOEAFL_WRITE) { + skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr), + offset_in_page(f->bufaddr), n); + skb->len = sizeof *h + sizeof *ah + n; + skb->data_len = n; + } + } + skb->dev = t->ifp->nd; + skb = skb_clone(skb, GFP_ATOMIC); + if (skb == NULL) + return; + __skb_queue_tail(&d->sendq, skb); +} + +static int +tsince(int tag) +{ + int n; + + n = jiffies & 0xffff; + n -= tag & 0xffff; + if (n < 0) + n += 1<<16; + return n; +} + +static struct aoeif * +getif(struct aoetgt *t, struct net_device *nd) +{ + struct aoeif *p, *e; + + p = t->ifs; + e = p + NAOEIFS; + for (; p < e; p++) + if (p->nd == nd) + return p; + return NULL; +} + +static 
struct aoeif * +addif(struct aoetgt *t, struct net_device *nd) +{ /* take the first t->ifs slot whose nd is NULL, bind it to nd and reset its counters; NULL if no slot is free */ + struct aoeif *p; + + p = getif(t, NULL); + if (!p) + return NULL; + p->nd = nd; + p->maxbcnt = DEFAULTBCNT; + p->lost = 0; + p->lostjumbo = 0; + return p; +} +/* Drop ifp from t->ifs: the entries after it slide down one slot and + * the last slot gets nd = NULL. + */ +static void +ejectif(struct aoetgt *t, struct aoeif *ifp) +{ + struct aoeif *e; + ulong n; + + e = t->ifs + NAOEIFS - 1; + n = (e - ifp) * sizeof *ifp; + memmove(ifp, ifp+1, n); + e->nd = NULL; +} +/* Move the outstanding frames of the target in *d->htgt onto the + * target in *d->tgt and resend them there. Each moved frame trades + * skbs with a frame obtained from freeframe(), so both frames keep one + * skb. Returns 0, leaving d->htgt set, when freeframe() has nothing + * left; returns 1 after all frames moved, with the old target + * interface table zeroed and d->htgt cleared. + */ +static int +sthtith(struct aoedev *d) +{ + struct frame *f, *e, *nf; + struct sk_buff *skb; + struct aoetgt *ht = *d->htgt; + + f = ht->frames; + e = f + ht->nframes; + for (; f < e; f++) { + if (f->tag == FREETAG) + continue; + nf = freeframe(d); + if (!nf) + return 0; + skb = nf->skb; + *nf = *f; + f->skb = skb; + f->tag = FREETAG; + nf->waited = 0; + ht->nout--; + (*d->tgt)->nout++; + resend(d, *d->tgt, nf); + } + /* he's clean, he's useless. take away his interfaces */ + memset(ht->ifs, 0, sizeof ht->ifs); + d->htgt = NULL; + return 1; +} +/* Sector count byte of the ATA header that follows the AoE header at packet. */ +static inline unsigned char +ata_scnt(unsigned char *packet) { + struct aoe_hdr *h; + struct aoe_atahdr *ah; + + h = (struct aoe_hdr *) packet; + ah = (struct aoe_atahdr *) (h+1); + return ah->scnt; +} +/* Timer callback; vp is the struct aoedev pointer. Under d->lock, every + * in-use frame whose tag is at least about 1.5 * d->rttavg ticks old is + * resent. A frame whose accumulated wait exceeds aoe_deadsecs seconds + * takes the device down via aoedev_downdev(). Also shrinks or regrows + * each target window (maxout), ejects interfaces with too many losses, + * doubles d->rttavg (capped at MAXTIMER) when anything was queued, and + * re-arms the timer TIMERTICK ticks ahead. Does nothing and does not + * re-arm once DEVFL_TKILL is set. Queued packets are transmitted + * after the lock is released. + */ +static void +rexmit_timer(ulong vp) +{ + struct sk_buff_head queue; + struct aoedev *d; + struct aoetgt *t, **tt, **te; + struct aoeif *ifp; + struct frame *f, *e; + register long timeout; + ulong flags, n; + + d = (struct aoedev *) vp; + + /* timeout is always ~150% of the moving average */ + timeout = d->rttavg; + timeout += timeout >> 1; + + spin_lock_irqsave(&d->lock, flags); + + if (d->flags & DEVFL_TKILL) { + spin_unlock_irqrestore(&d->lock, flags); + return; + } + tt = d->targets; + te = tt + NTARGETS; + for (; tt < te && *tt; tt++) { + t = *tt; + f = t->frames; + e = f + t->nframes; + for (; f < e; f++) { + if (f->tag == FREETAG + || tsince(f->tag) < timeout) + continue; + n = f->waited += timeout; + n /= HZ; + if (n > aoe_deadsecs) { + /* waited too long. device failure. 
*/ + aoedev_downdev(d); + break; + } + + if (n > HELPWAIT /* see if another target can help */ + && (tt != d->targets || d->targets[1])) + d->htgt = tt; + + if (t->nout == t->maxout) { + if (t->maxout > 1) + t->maxout--; + t->lastwadj = jiffies; + } + + ifp = getif(t, f->skb->dev); + if (ifp && ++ifp->lost > (t->nframes << 1) + && (ifp != t->ifs || t->ifs[1].nd)) { + ejectif(t, ifp); + ifp = NULL; + } + + if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512 + && ifp && ++ifp->lostjumbo > (t->nframes << 1) + && ifp->maxbcnt != DEFAULTBCNT) { + printk(KERN_INFO + "aoe: e%ld.%d: " + "too many lost jumbo on " + "%s:%pm - " + "falling back to %d frames.\n", + d->aoemajor, d->aoeminor, + ifp->nd->name, t->addr, + DEFAULTBCNT); + ifp->maxbcnt = 0; /* zero makes aoecmd_cfg_rsp leave maxbcnt alone for this interface */ + } + resend(d, t, f); + } + + /* window check */ + if (t->nout == t->maxout + && t->maxout < t->nframes + && (jiffies - t->lastwadj)/HZ > 10) { + t->maxout++; + t->lastwadj = jiffies; + } + } + + if (!skb_queue_empty(&d->sendq)) { + n = d->rttavg <<= 1; + if (n > MAXTIMER) + d->rttavg = MAXTIMER; + } + + if (d->flags & DEVFL_KICKME || d->htgt) { + d->flags &= ~DEVFL_KICKME; + aoecmd_work(d); + } + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&d->sendq, &queue); + + d->timer.expires = jiffies + TIMERTICK; + add_timer(&d->timer); + + spin_unlock_irqrestore(&d->lock, flags); + + aoenet_xmit(&queue); +} + +/* enters with d->lock held + * + * Keeps calling aoecmd_ata_rw() until it returns 0, pulling the next + * buf off d->bufq whenever d->inprocess is NULL. Stops early when + * bufq is empty or when sthtith() returns 0. + */ +void +aoecmd_work(struct aoedev *d) +{ + struct buf *buf; +loop: + if (d->htgt && !sthtith(d)) + return; + if (d->inprocess == NULL) { + if (list_empty(&d->bufq)) + return; + buf = container_of(d->bufq.next, struct buf, bufs); + list_del(d->bufq.next); + d->inprocess = buf; + } + if (aoecmd_ata_rw(d)) + goto loop; +} + +/* this function performs work that has been deferred until sleeping is OK + * + * DEVFL_GDALLOC: calls aoeblk_gdalloc(). DEVFL_NEWSIZE: copies the + * gendisk capacity into the block device inode size, then sets + * DEVFL_UP and clears DEVFL_NEWSIZE under d->lock. + */ +void +aoecmd_sleepwork(struct work_struct *work) +{ + struct aoedev *d = container_of(work, struct aoedev, work); + + if (d->flags & DEVFL_GDALLOC) + aoeblk_gdalloc(d); + + if (d->flags 
& DEVFL_NEWSIZE) { + struct block_device *bd; + unsigned long flags; + u64 ssize; + + ssize = get_capacity(d->gd); + bd = bdget_disk(d->gd, 0); + + if (bd) { + mutex_lock(&bd->bd_inode->i_mutex); + i_size_write(bd->bd_inode, (loff_t)ssize<<9); + mutex_unlock(&bd->bd_inode->i_mutex); + bdput(bd); + } + spin_lock_irqsave(&d->lock, flags); + d->flags |= DEVFL_UP; + d->flags &= ~DEVFL_NEWSIZE; + spin_unlock_irqrestore(&d->lock, flags); + } +} +/* Digest the ATA identify data in id (read as little-endian 16-bit + * words). Sets or clears DEVFL_EXT, fills d->geo and d->ssize, and logs + * a size change. Unless a gendisk allocation or resize is already + * pending, flags one (DEVFL_NEWSIZE if d->gd exists, else + * DEVFL_GDALLOC) and schedules d->work. + */ +static void +ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id) +{ + u64 ssize; + u16 n; + + /* word 83: command set supported */ + n = get_unaligned_le16(&id[83 << 1]); + + /* word 86: command set/feature enabled */ + n |= get_unaligned_le16(&id[86 << 1]); + + if (n & (1<<10)) { /* bit 10: LBA 48 */ + d->flags |= DEVFL_EXT; + + /* word 100: number lba48 sectors */ + ssize = get_unaligned_le64(&id[100 << 1]); + + /* set as in ide-disk.c:init_idedisk_capacity */ + d->geo.cylinders = ssize; + d->geo.cylinders /= (255 * 63); + d->geo.heads = 255; + d->geo.sectors = 63; + } else { + d->flags &= ~DEVFL_EXT; + + /* number lba28 sectors */ + ssize = get_unaligned_le32(&id[60 << 1]); + + /* NOTE: obsolete in ATA 6 */ + d->geo.cylinders = get_unaligned_le16(&id[54 << 1]); + d->geo.heads = get_unaligned_le16(&id[55 << 1]); + d->geo.sectors = get_unaligned_le16(&id[56 << 1]); + } + + if (d->ssize != ssize) + printk(KERN_INFO + "aoe: %pm e%ld.%d v%04x has %llu sectors\n", + t->addr, + d->aoemajor, d->aoeminor, + d->fw_ver, (long long)ssize); + d->ssize = ssize; + d->geo.start = 0; + if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) + return; + if (d->gd != NULL) { + set_capacity(d->gd, ssize); + d->flags |= DEVFL_NEWSIZE; + } else + d->flags |= DEVFL_GDALLOC; + schedule_work(&d->work); +} +/* Fold one round-trip sample into d->rttavg with gain 1/4. A + * non-negative rtt is clamped to [d->mintimer, MAXTIMER] first. A + * negative rtt is negated, clamped to [MINTIMER, MAXTIMER], and + * additionally pulls d->mintimer halfway toward that value. + */ +static void +calc_rttavg(struct aoedev *d, int rtt) +{ + register long n; + + n = rtt; + if (n < 0) { + n = -rtt; + if (n < MINTIMER) + n = MINTIMER; + else if (n > MAXTIMER) + n = MAXTIMER; + d->mintimer += (n - d->mintimer) >> 1; + } 
else if (n < d->mintimer) + n = d->mintimer; + else if (n > MAXTIMER) + n = MAXTIMER; + + /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */ + n -= d->rttavg; + d->rttavg += n >> 2; +} +/* Target in d->targets whose addr field matches addr, or NULL if there + * is none. The scan stops at the first empty slot. + */ +static struct aoetgt * +gettgt(struct aoedev *d, char *addr) +{ + struct aoetgt **t, **e; + + t = d->targets; + e = t + NTARGETS; + for (; t < e && *t; t++) + if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0) + return *t; + return NULL; +} +/* Charge one bio to the partition of disk that contains sector: one + * I/O, bio->bi_size >> 9 sectors, and duration ticks, in the read or + * write bucket chosen by bio_data_dir(). + */ +static inline void +diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector) +{ + unsigned long n_sect = bio->bi_size >> 9; + const int rw = bio_data_dir(bio); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(disk, sector); + + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, ticks[rw], duration); + part_stat_add(cpu, part, sectors[rw], n_sect); + part_stat_add(cpu, part, io_ticks, duration); + + part_stat_unlock(); +} +/* Process an ATA response frame. Looks up device, target and frame from + * the AoE header; a response with no matching frame only feeds + * calc_rttavg() and is reported via aoechr_error(). For reads the + * payload is copied to f->bufaddr; identify responses go to + * ataid_complete(). If a read/write frame still has bytes to transfer + * it is resent for the remainder; otherwise the frame is freed, the bio + * is completed once its last frame is in and buf->resid is 0, and + * aoecmd_work() is run. Packets on d->sendq are sent after d->lock is + * dropped. + */ +void +aoecmd_ata_rsp(struct sk_buff *skb) +{ + struct sk_buff_head queue; + struct aoedev *d; + struct aoe_hdr *hin, *hout; + struct aoe_atahdr *ahin, *ahout; + struct frame *f; + struct buf *buf; + struct aoetgt *t; + struct aoeif *ifp; + register long n; + ulong flags; + char ebuf[128]; + u16 aoemajor; + + hin = (struct aoe_hdr *) skb_mac_header(skb); + aoemajor = get_unaligned_be16(&hin->major); + d = aoedev_by_aoeaddr(aoemajor, hin->minor); + if (d == NULL) { + snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response " + "for unknown device %d.%d\n", + aoemajor, hin->minor); + aoechr_error(ebuf); + return; + } + + spin_lock_irqsave(&d->lock, flags); + + n = get_unaligned_be32(&hin->tag); + t = gettgt(d, hin->src); + if (t == NULL) { + printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n", + d->aoemajor, d->aoeminor, hin->src); + spin_unlock_irqrestore(&d->lock, flags); + return; + } + f = getframe(t, n); + if (f == NULL) { + calc_rttavg(d, -tsince(n)); + spin_unlock_irqrestore(&d->lock, flags); + 
snprintf(ebuf, sizeof ebuf, + "%15s e%d.%d tag=%08x@%08lx\n", + "unexpected rsp", + get_unaligned_be16(&hin->major), + hin->minor, + get_unaligned_be32(&hin->tag), + jiffies); + aoechr_error(ebuf); + return; + } + + calc_rttavg(d, tsince(f->tag)); + + ahin = (struct aoe_atahdr *) (hin+1); + hout = (struct aoe_hdr *) skb_mac_header(f->skb); + ahout = (struct aoe_atahdr *) (hout+1); + buf = f->buf; + + if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */ + printk(KERN_ERR + "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n", + ahout->cmdstat, ahin->cmdstat, + d->aoemajor, d->aoeminor); + if (buf) + buf->flags |= BUFFL_FAIL; + } else { + if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */ + d->htgt = NULL; + n = ahout->scnt << 9; + switch (ahout->cmdstat) { + case ATA_CMD_PIO_READ: + case ATA_CMD_PIO_READ_EXT: + if (skb->len - sizeof *hin - sizeof *ahin < n) { + printk(KERN_ERR + "aoe: %s. skb->len=%d need=%ld\n", + "runt data size in read", skb->len, n); + /* fail frame f? just returning will rexmit. */ + spin_unlock_irqrestore(&d->lock, flags); + return; + } + memcpy(f->bufaddr, ahin+1, n); /* fall through: reads share the accounting below */ + case ATA_CMD_PIO_WRITE: + case ATA_CMD_PIO_WRITE_EXT: + ifp = getif(t, skb->dev); + if (ifp) { + ifp->lost = 0; + if (n > DEFAULTBCNT) + ifp->lostjumbo = 0; + } + if (f->bcnt -= n) { /* bytes still outstanding for this frame */ + f->lba += n >> 9; + f->bufaddr += n; + resend(d, t, f); + goto xmit; + } + break; + case ATA_CMD_ID_ATA: + if (skb->len - sizeof *hin - sizeof *ahin < 512) { + printk(KERN_INFO + "aoe: runt data size in ataid. 
skb->len=%d\n", + skb->len); + spin_unlock_irqrestore(&d->lock, flags); + return; + } + ataid_complete(d, t, (char *) (ahin+1)); + break; + default: + printk(KERN_INFO + "aoe: unrecognized ata command %2.2Xh for %d.%d\n", + ahout->cmdstat, + get_unaligned_be16(&hin->major), + hin->minor); + } + } + + if (buf && --buf->nframesout == 0 && buf->resid == 0) { + diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector); + if (buf->flags & BUFFL_FAIL) + bio_endio(buf->bio, -EIO); + else { + bio_flush_dcache_pages(buf->bio); + bio_endio(buf->bio, 0); + } + mempool_free(buf, d->bufpool); + } + + f->buf = NULL; + f->tag = FREETAG; + t->nout--; + + aoecmd_work(d); +xmit: + __skb_queue_head_init(&queue); + skb_queue_splice_init(&d->sendq, &queue); + + spin_unlock_irqrestore(&d->lock, flags); + aoenet_xmit(&queue); +} +/* Build config queries with aoecmd_cfg_pkts() and transmit them at once. */ +void +aoecmd_cfg(ushort aoemajor, unsigned char aoeminor) +{ + struct sk_buff_head queue; + + __skb_queue_head_init(&queue); + aoecmd_cfg_pkts(aoemajor, aoeminor, &queue); + aoenet_xmit(&queue); +} +/* Build an ATA identify command (ATA_CMD_ID_ATA, one sector) in a free + * frame of the current target and return a clone of its skb for the + * caller to send, or NULL if no frame is free or the clone fails. Also + * sets d->rttavg to MAXTIMER and installs rexmit_timer as the timer + * function. + */ +struct sk_buff * +aoecmd_ata_id(struct aoedev *d) +{ + struct aoe_hdr *h; + struct aoe_atahdr *ah; + struct frame *f; + struct sk_buff *skb; + struct aoetgt *t; + + f = freeframe(d); + if (f == NULL) + return NULL; + + t = *d->tgt; + + /* initialize the headers & frame */ + skb = f->skb; + h = (struct aoe_hdr *) skb_mac_header(skb); + ah = (struct aoe_atahdr *) (h+1); + skb_put(skb, sizeof *h + sizeof *ah); + memset(h, 0, skb->len); + f->tag = aoehdr_atainit(d, t, h); + t->nout++; + f->waited = 0; + + /* set up ata header */ + ah->scnt = 1; + ah->cmdstat = ATA_CMD_ID_ATA; + ah->lba3 = 0xa0; + + skb->dev = t->ifp->nd; + + d->rttavg = MAXTIMER; + d->timer.function = rexmit_timer; + + return skb_clone(skb, GFP_ATOMIC); +} +/* Create a target for addr with nframes frames (all tagged FREETAG) and + * store it in the first empty slot of d->targets. Returns the target, + * or NULL when the table is full or an allocation fails. + */ +static struct aoetgt * +addtgt(struct aoedev *d, char *addr, ulong nframes) +{ + struct aoetgt *t, **tt, **te; + struct frame *f, *e; + + tt = d->targets; + te = tt + NTARGETS; + for (; tt < te && *tt; tt++) + ; + + if (tt == te) { + 
printk(KERN_INFO + "aoe: device addtgt failure; too many targets\n"); + return NULL; + } + t = kcalloc(1, sizeof *t, GFP_ATOMIC); + f = kcalloc(nframes, sizeof *f, GFP_ATOMIC); + if (!t || !f) { + kfree(f); + kfree(t); + printk(KERN_INFO "aoe: cannot allocate memory to add target\n"); + return NULL; + } + + t->nframes = nframes; + t->frames = f; + e = f + nframes; + for (; f < e; f++) + f->tag = FREETAG; + memcpy(t->addr, addr, sizeof t->addr); + t->ifp = t->ifs; + t->maxout = t->nframes; + return *tt = t; +} +/* Process a config response: find or create the device, target and + * interface it arrived on, refresh the interface maxbcnt from the MTU + * and the sector count in the response, and, if the device is not open, + * record the firmware version and send an ATA identify. + */ +void +aoecmd_cfg_rsp(struct sk_buff *skb) +{ + struct aoedev *d; + struct aoe_hdr *h; + struct aoe_cfghdr *ch; + struct aoetgt *t; + struct aoeif *ifp; + ulong flags, sysminor, aoemajor; + struct sk_buff *sl; + u16 n; + + h = (struct aoe_hdr *) skb_mac_header(skb); + ch = (struct aoe_cfghdr *) (h+1); + + /* + * Enough people have their dip switches set backwards to + * warrant a loud message for this special case. + */ + aoemajor = get_unaligned_be16(&h->major); + if (aoemajor == 0xfff) { + printk(KERN_ERR "aoe: Warning: shelf address is all ones. 
" + "Check shelf dip switches.\n"); + return; + } + + sysminor = SYSMINOR(aoemajor, h->minor); + if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) { + printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n", + aoemajor, (int) h->minor); + return; + } + + n = be16_to_cpu(ch->bufcnt); + if (n > aoe_maxout) /* keep it reasonable */ + n = aoe_maxout; + + d = aoedev_by_sysminor_m(sysminor); + if (d == NULL) { + printk(KERN_INFO "aoe: device sysminor_m failure\n"); + return; + } + + spin_lock_irqsave(&d->lock, flags); + + t = gettgt(d, h->src); + if (!t) { + t = addtgt(d, h->src, n); + if (!t) { + spin_unlock_irqrestore(&d->lock, flags); + return; + } + } + ifp = getif(t, skb->dev); + if (!ifp) { + ifp = addif(t, skb->dev); + if (!ifp) { + printk(KERN_INFO + "aoe: device addif failure; " + "too many interfaces?\n"); + spin_unlock_irqrestore(&d->lock, flags); + return; + } + } + if (ifp->maxbcnt) { + n = ifp->nd->mtu; + n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr); + n /= 512; + if (n > ch->scnt) + n = ch->scnt; + n = n ? 
n * 512 : DEFAULTBCNT; + if (n != ifp->maxbcnt) { + printk(KERN_INFO + "aoe: e%ld.%d: setting %d%s%s:%pm\n", + d->aoemajor, d->aoeminor, n, + " byte data frames on ", ifp->nd->name, + t->addr); + ifp->maxbcnt = n; + } + } + + /* don't change users' perspective */ + if (d->nopen) { + spin_unlock_irqrestore(&d->lock, flags); + return; + } + d->fw_ver = be16_to_cpu(ch->fwver); + + sl = aoecmd_ata_id(d); + + spin_unlock_irqrestore(&d->lock, flags); + + if (sl) { + struct sk_buff_head queue; + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, sl); + aoenet_xmit(&queue); + } +} +/* Restore defaults: d->mintimer, each target maxout, and the loss + * counters and maxbcnt of every interface slot. + */ +void +aoecmd_cleanslate(struct aoedev *d) +{ + struct aoetgt **t, **te; + struct aoeif *p, *e; + + d->mintimer = MINTIMER; + + t = d->targets; + te = t + NTARGETS; + for (; t < te && *t; t++) { + (*t)->maxout = (*t)->nframes; + p = (*t)->ifs; + e = p + NAOEIFS; + for (; p < e; p++) { + p->lostjumbo = 0; + p->lost = 0; + p->maxbcnt = DEFAULTBCNT; + } + } +} diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c new file mode 100644 index 00000000..6b5110a4 --- /dev/null +++ b/drivers/block/aoe/aoedev.c @@ -0,0 +1,277 @@ +/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoedev.c + * AoE device utility functions; maintains device list. 
+ */ + +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/netdevice.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include "aoe.h" + +static void dummy_timer(ulong); +static void aoedev_freedev(struct aoedev *); +static void freetgt(struct aoedev *d, struct aoetgt *t); +static void skbpoolfree(struct aoedev *d); + +static struct aoedev *devlist; +static DEFINE_SPINLOCK(devlist_lock); +/* Look up a device by AoE major/minor under devlist_lock; NULL if absent. */ +struct aoedev * +aoedev_by_aoeaddr(int maj, int min) +{ + struct aoedev *d; + ulong flags; + + spin_lock_irqsave(&devlist_lock, flags); + + for (d=devlist; d; d=d->next) + if (d->aoemajor == maj && d->aoeminor == min) + break; + + spin_unlock_irqrestore(&devlist_lock, flags); + return d; +} +/* Placeholder timer callback: re-arms itself HZ ticks ahead until + * DEVFL_TKILL is set. vp is the struct aoedev pointer. + */ +static void +dummy_timer(ulong vp) +{ + struct aoedev *d; + + d = (struct aoedev *)vp; + if (d->flags & DEVFL_TKILL) + return; + d->timer.expires = jiffies + HZ; + add_timer(&d->timer); +} +/* Fail all I/O on d with -EIO: bufs referenced by outstanding frames, + * the buf in process, and everything queued on d->bufq. Every frame is + * returned to FREETAG, target windows are reset, capacity is set to 0 + * and DEVFL_UP is cleared. The callers in this file hold d->lock. + */ +void +aoedev_downdev(struct aoedev *d) +{ + struct aoetgt **t, **te; + struct frame *f, *e; + struct buf *buf; + struct bio *bio; + + t = d->targets; + te = t + NTARGETS; + for (; t < te && *t; t++) { + f = (*t)->frames; + e = f + (*t)->nframes; + for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) { + if (f->tag == FREETAG || f->buf == NULL) + continue; + buf = f->buf; + bio = buf->bio; + if (--buf->nframesout == 0 + && buf != d->inprocess) { + mempool_free(buf, d->bufpool); + bio_endio(bio, -EIO); + } + } + (*t)->maxout = (*t)->nframes; + (*t)->nout = 0; + } + buf = d->inprocess; + if (buf) { + bio = buf->bio; + mempool_free(buf, d->bufpool); + bio_endio(bio, -EIO); + } + d->inprocess = NULL; + d->htgt = NULL; + + while (!list_empty(&d->bufq)) { + buf = container_of(d->bufq.next, struct buf, bufs); + list_del(d->bufq.next); + bio = buf->bio; + mempool_free(buf, d->bufpool); + bio_endio(bio, -EIO); + } + + if (d->gd) + set_capacity(d->gd, 0); + + d->flags &= ~DEVFL_UP; +} +/* Release everything owned by d (pending work, gendisk, targets, buf + * pool, skb pool, request queue) and free d itself. + */ +static void +aoedev_freedev(struct aoedev *d) +{ + struct aoetgt **t, **e; + + 
cancel_work_sync(&d->work); + if (d->gd) { + aoedisk_rm_sysfs(d); + del_gendisk(d->gd); + put_disk(d->gd); + } + t = d->targets; + e = t + NTARGETS; + for (; t < e && *t; t++) + freetgt(d, *t); + if (d->bufpool) + mempool_destroy(d->bufpool); + skbpoolfree(d); + blk_cleanup_queue(d->blkq); + kfree(d); +} +/* Remove devices that are not open and have no gendisk work pending. + * Devices that are up are kept unless the user string starts with the + * word all. Victims are unlinked and downed under the locks, then + * their timers are stopped and they are freed outside the locks. + * Returns 0, or -EFAULT if the user string cannot be copied. + */ +int +aoedev_flush(const char __user *str, size_t cnt) +{ + ulong flags; + struct aoedev *d, **dd; + struct aoedev *rmd = NULL; + char buf[16]; + int all = 0; + + if (cnt >= 3) { + if (cnt > sizeof buf) + cnt = sizeof buf; + if (copy_from_user(buf, str, cnt)) + return -EFAULT; + all = !strncmp(buf, "all", 3); + } + + spin_lock_irqsave(&devlist_lock, flags); + dd = &devlist; + while ((d = *dd)) { + spin_lock(&d->lock); + if ((!all && (d->flags & DEVFL_UP)) + || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) + || d->nopen) { + spin_unlock(&d->lock); + dd = &d->next; + continue; + } + *dd = d->next; + aoedev_downdev(d); + d->flags |= DEVFL_TKILL; + spin_unlock(&d->lock); + d->next = rmd; + rmd = d; + } + spin_unlock_irqrestore(&devlist_lock, flags); + while ((d = rmd)) { + rmd = d->next; + del_timer_sync(&d->timer); + aoedev_freedev(d); /* must be able to sleep */ + } + return 0; +} + +/* I'm not really sure that this is a realistic problem, but if the +network driver goes gonzo let's just leak memory after complaining. */ +static void +skbfree(struct sk_buff *skb) +{ + enum { Sms = 100, Tms = 3*1000}; + int i = Tms / Sms; + + if (skb == NULL) + return; + while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0) + msleep(Sms); /* poll in Sms ms steps, Tms ms at most, until dataref drops to 1 */ + if (i < 0) { + printk(KERN_ERR + "aoe: %s holds ref: %s\n", + skb->dev ? 
skb->dev->name : "netif", + "cannot free skb -- memory leaked."); + return; + } + skb_shinfo(skb)->nr_frags = skb->data_len = 0; + skb_trim(skb, 0); + dev_kfree_skb(skb); +} +/* Free every skb on d->skbpool via skbfree() and reset the list head. */ +static void +skbpoolfree(struct aoedev *d) +{ + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&d->skbpool, skb, tmp) + skbfree(skb); + + __skb_queue_head_init(&d->skbpool); +} + +/* find it or malloc it + * + * New devices are zero-allocated with GFP_ATOMIC, get a running + * dummy_timer, and are pushed on the front of devlist. Returns NULL + * only if the allocation fails. + */ +struct aoedev * +aoedev_by_sysminor_m(ulong sysminor) +{ + struct aoedev *d; + ulong flags; + + spin_lock_irqsave(&devlist_lock, flags); + + for (d=devlist; d; d=d->next) + if (d->sysminor == sysminor) + break; + if (d) + goto out; + d = kcalloc(1, sizeof *d, GFP_ATOMIC); + if (!d) + goto out; + INIT_WORK(&d->work, aoecmd_sleepwork); + spin_lock_init(&d->lock); + skb_queue_head_init(&d->sendq); + skb_queue_head_init(&d->skbpool); + init_timer(&d->timer); + d->timer.data = (ulong) d; + d->timer.function = dummy_timer; + d->timer.expires = jiffies + HZ; + add_timer(&d->timer); + d->bufpool = NULL; /* defer to aoeblk_gdalloc */ + d->tgt = d->targets; + INIT_LIST_HEAD(&d->bufq); + d->sysminor = sysminor; + d->aoemajor = AOEMAJOR(sysminor); + d->aoeminor = AOEMINOR(sysminor); + d->mintimer = MINTIMER; + d->next = devlist; + devlist = d; + out: + spin_unlock_irqrestore(&devlist_lock, flags); + return d; +} +/* Free the skb of every frame of t, then the frame array and t itself. */ +static void +freetgt(struct aoedev *d, struct aoetgt *t) +{ + struct frame *f, *e; + + f = t->frames; + e = f + t->nframes; + for (; f < e; f++) + skbfree(f->skb); + kfree(t->frames); + kfree(t); +} +/* Pop every device off devlist, take it down, stop its timer and free it. */ +void +aoedev_exit(void) +{ + struct aoedev *d; + ulong flags; + + while ((d = devlist)) { + devlist = d->next; + + spin_lock_irqsave(&d->lock, flags); + aoedev_downdev(d); + d->flags |= DEVFL_TKILL; + spin_unlock_irqrestore(&d->lock, flags); + + del_timer_sync(&d->timer); + aoedev_freedev(d); + } +} +/* Nothing to set up; always returns 0. */ +int __init +aoedev_init(void) +{ + return 0; +} diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c new file mode 100644 index 00000000..7f83ad90 --- /dev/null +++ 
b/drivers/block/aoe/aoemain.c @@ -0,0 +1,111 @@ +/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoemain.c + * Module initialization routines, discover timer + */ + +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include "aoe.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sam Hopkins <sah@coraid.com>"); +MODULE_DESCRIPTION("AoE block/char driver for 2.6.2 and newer 2.6 kernels"); +MODULE_VERSION(VERSION); + +enum { TINIT, TRUN, TKILL }; +/* Periodic discovery, driven by vp. TINIT sets up the timer and then + * behaves like TRUN; TRUN re-arms the timer one minute ahead (unless + * die is set) and sends a config query for major 0xffff, minor 0xff; + * TKILL sets die and waits for the timer with del_timer_sync(). + */ +static void +discover_timer(ulong vp) +{ + static struct timer_list t; + static volatile ulong die; + static spinlock_t lock; + ulong flags; + enum { DTIMERTICK = HZ * 60 }; /* one minute */ + + switch (vp) { + case TINIT: + init_timer(&t); + spin_lock_init(&lock); + t.data = TRUN; + t.function = discover_timer; + die = 0; /* fall through: arm the timer and send the first query */ + case TRUN: + spin_lock_irqsave(&lock, flags); + if (!die) { + t.expires = jiffies + DTIMERTICK; + add_timer(&t); + } + spin_unlock_irqrestore(&lock, flags); + + aoecmd_cfg(0xffff, 0xff); + return; + case TKILL: + spin_lock_irqsave(&lock, flags); + die = 1; + spin_unlock_irqrestore(&lock, flags); + + del_timer_sync(&t); /* fall through to the plain return */ + default: + return; + } +} +/* Module unload: stop discovery, then tear the subsystems down. */ +static void +aoe_exit(void) +{ + discover_timer(TKILL); + + aoenet_exit(); + unregister_blkdev(AOE_MAJOR, DEVICE_NAME); + aoechr_exit(); + aoedev_exit(); + aoeblk_exit(); /* free cache after de-allocating bufs */ +} +/* Module load: bring up the dev, chr, blk and net parts, register the + * block major, then start discovery. On failure the parts already + * started are torn down in reverse order and the error is returned. + */ +static int __init +aoe_init(void) +{ + int ret; + + ret = aoedev_init(); + if (ret) + return ret; + ret = aoechr_init(); + if (ret) + goto chr_fail; + ret = aoeblk_init(); + if (ret) + goto blk_fail; + ret = aoenet_init(); + if (ret) + goto net_fail; + ret = register_blkdev(AOE_MAJOR, DEVICE_NAME); + if (ret < 0) { + printk(KERN_ERR "aoe: can't register major\n"); + goto blkreg_fail; + } + + printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION); + discover_timer(TINIT); + return 0; + + blkreg_fail: + aoenet_exit(); + net_fail: + aoeblk_exit(); + blk_fail: + 
aoechr_exit(); + chr_fail: + aoedev_exit(); + + printk(KERN_INFO "aoe: initialisation failure.\n"); + return ret; +} + +module_init(aoe_init); +module_exit(aoe_exit); + diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c new file mode 100644 index 00000000..4d3bc0d4 --- /dev/null +++ b/drivers/block/aoe/aoenet.c @@ -0,0 +1,172 @@ +/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoenet.c + * Ethernet portion of AoE driver + */ + +#include <linux/gfp.h> +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/netdevice.h> +#include <linux/moduleparam.h> +#include <net/net_namespace.h> +#include <asm/unaligned.h> +#include "aoe.h" + +#define NECODES 5 +/* Indexed by AoE error code; aoenet_rcv maps codes above NECODES to entry 0. */ +static char *aoe_errlist[] = +{ + "no such error", + "unrecognized command code", + "bad argument parameter", + "device unavailable", + "config string present", + "unsupported version" +}; + +enum { + IFLISTSZ = 1024, +}; + +static char aoe_iflist[IFLISTSZ]; +module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); +MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=\"dev1 [dev2 ...]\""); + +#ifndef MODULE +static int __init aoe_iflist_setup(char *str) +{ /* __setup handler: copy str into aoe_iflist and force NUL termination */ + strncpy(aoe_iflist, str, IFLISTSZ); + aoe_iflist[IFLISTSZ - 1] = '\0'; + return 1; +} + +__setup("aoe_iflist=", aoe_iflist_setup); +#endif +/* Returns 1 if ifp may be used for AoE: either aoe_iflist is empty, or + * ifp->name equals one of its whitespace-separated tokens. Returns 0 + * otherwise. + */ +int +is_aoe_netif(struct net_device *ifp) +{ + register char *p, *q; + register int len; + + if (aoe_iflist[0] == '\0') + return 1; + + p = aoe_iflist + strspn(aoe_iflist, WHITESPACE); + for (; *p; p = q + strspn(q, WHITESPACE)) { + q = p + strcspn(p, WHITESPACE); + if (q != p) + len = q - p; + else + len = strlen(p); /* last token in aoe_iflist */ + + if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len)) + return 1; + if (q == p) + break; + } + + return 0; +} +/* Replace aoe_iflist with size bytes from user space, NUL-terminated. + * Returns 0, -EINVAL if size leaves no room for the terminator, or + * -EFAULT if the copy fails. + */ +int +set_aoe_iflist(const char __user *user_str, size_t size) +{ + if (size >= IFLISTSZ) + return -EINVAL; + + if (copy_from_user(aoe_iflist, user_str, size)) { + printk(KERN_INFO "aoe: copy from user 
failed\n"); + return -EFAULT; + } + aoe_iflist[size] = 0x00; + return 0; +} +/* Unlink every skb from queue and hand it to dev_queue_xmit(). */ +void +aoenet_xmit(struct sk_buff_head *queue) +{ + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(queue, skb, tmp) { + __skb_unlink(skb, queue); + dev_queue_xmit(skb); + } +} + +/* + * (1) len doesn't include the header by default. I want this. + * + * Packet handler for ETH_P_AOE. Drops frames from other namespaces, + * from interfaces is_aoe_netif() rejects, frames without the response + * flag or with bit 31 of the tag set, and (after logging) error + * responses. ATA and config responses are dispatched to + * aoecmd_ata_rsp() and aoecmd_cfg_rsp(). The skb is freed here before + * returning; the return value is always 0. + */ +static int +aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) +{ + struct aoe_hdr *h; + u32 n; + + if (dev_net(ifp) != &init_net) + goto exit; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) + return 0; + if (skb_linearize(skb)) + goto exit; + if (!is_aoe_netif(ifp)) + goto exit; + skb_push(skb, ETH_HLEN); /* (1) */ + + h = (struct aoe_hdr *) skb_mac_header(skb); + n = get_unaligned_be32(&h->tag); + if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) + goto exit; + + if (h->verfl & AOEFL_ERR) { + n = h->err; + if (n > NECODES) + n = 0; + if (net_ratelimit()) + printk(KERN_ERR + "%s%d.%d@%s; ecode=%d '%s'\n", + "aoe: error packet from ", + get_unaligned_be16(&h->major), + h->minor, skb->dev->name, + h->err, aoe_errlist[n]); + goto exit; + } + + switch (h->cmd) { + case AOECMD_ATA: + aoecmd_ata_rsp(skb); + break; + case AOECMD_CFG: + aoecmd_cfg_rsp(skb); + break; + default: + if (h->cmd >= AOECMD_VEND_MIN) + break; /* don't complain about vendor commands */ + printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd); + } +exit: + dev_kfree_skb(skb); + return 0; +} +/* Packet type that routes ETH_P_AOE frames to aoenet_rcv(). */ +static struct packet_type aoe_pt __read_mostly = { + .type = __constant_htons(ETH_P_AOE), + .func = aoenet_rcv, +}; +/* Register the AoE packet handler; always returns 0. */ +int __init +aoenet_init(void) +{ + dev_add_pack(&aoe_pt); + return 0; +} +/* Unregister the AoE packet handler. */ +void +aoenet_exit(void) +{ + dev_remove_pack(&aoe_pt); +} + |