diff options
Diffstat (limited to 'drivers/virtio')
-rw-r--r-- | drivers/virtio/Kconfig | 49 | ||||
-rw-r--r-- | drivers/virtio/Makefile | 5 | ||||
-rw-r--r-- | drivers/virtio/config.c | 12 | ||||
-rw-r--r-- | drivers/virtio/virtio.c | 237 | ||||
-rw-r--r-- | drivers/virtio/virtio_balloon.c | 476 | ||||
-rw-r--r-- | drivers/virtio/virtio_mmio.c | 485 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci.c | 797 | ||||
-rw-r--r-- | drivers/virtio/virtio_ring.c | 718 |
8 files changed, 2779 insertions, 0 deletions
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig new file mode 100644 index 00000000..1a61939b --- /dev/null +++ b/drivers/virtio/Kconfig @@ -0,0 +1,49 @@ +# Virtio always gets selected by whoever wants it. +config VIRTIO + tristate + +# Similarly the virtio ring implementation. +config VIRTIO_RING + tristate + depends on VIRTIO + +menu "Virtio drivers" + +config VIRTIO_PCI + tristate "PCI driver for virtio devices (EXPERIMENTAL)" + depends on PCI && EXPERIMENTAL + select VIRTIO + select VIRTIO_RING + ---help--- + This drivers provides support for virtio based paravirtual device + drivers over PCI. This requires that your VMM has appropriate PCI + virtio backends. Most QEMU based VMMs should support these devices + (like KVM or Xen). + + Currently, the ABI is not considered stable so there is no guarantee + that this version of the driver will work with your VMM. + + If unsure, say M. + +config VIRTIO_BALLOON + tristate "Virtio balloon driver (EXPERIMENTAL)" + select VIRTIO + select VIRTIO_RING + ---help--- + This driver supports increasing and decreasing the amount + of memory within a KVM guest. + + If unsure, say M. + + config VIRTIO_MMIO + tristate "Platform bus driver for memory mapped virtio devices (EXPERIMENTAL)" + depends on HAS_IOMEM && EXPERIMENTAL + select VIRTIO + select VIRTIO_RING + ---help--- + This drivers provides support for memory mapped virtio + platform device driver. + + If unsure, say N. + +endmenu diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile new file mode 100644 index 00000000..5a4c63cf --- /dev/null +++ b/drivers/virtio/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_VIRTIO) += virtio.o +obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o +obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o +obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o +obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o diff --git a/drivers/virtio/config.c b/drivers/virtio/config.c new file mode 100644 index 00000000..f70bcd2f --- /dev/null +++ b/drivers/virtio/config.c @@ -0,0 +1,12 @@ +/* Configuration space parsing helpers for virtio. + * + * The configuration is [type][len][... len bytes ...] fields. + * + * Copyright 2007 Rusty Russell, IBM Corporation. + * GPL v2 or later. + */ +#include <linux/err.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/bug.h> + diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c new file mode 100644 index 00000000..984c501c --- /dev/null +++ b/drivers/virtio/virtio.c @@ -0,0 +1,237 @@ +#include <linux/virtio.h> +#include <linux/spinlock.h> +#include <linux/virtio_config.h> +#include <linux/module.h> + +/* Unique numbering for virtio devices. */ +static unsigned int dev_index; + +static ssize_t device_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + return sprintf(buf, "0x%04x\n", dev->id.device); +} +static ssize_t vendor_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + return sprintf(buf, "0x%04x\n", dev->id.vendor); +} +static ssize_t status_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + return sprintf(buf, "0x%08x\n", dev->config->get_status(dev)); +} +static ssize_t modalias_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + + return sprintf(buf, "virtio:d%08Xv%08X\n", + dev->id.device, dev->id.vendor); +} +static ssize_t features_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d, struct virtio_device, dev); + unsigned int i; + ssize_t len = 0; + + /* We actually represent this as a bitstring, as it could be + * arbitrary length in future. */ + for (i = 0; i < ARRAY_SIZE(dev->features)*BITS_PER_LONG; i++) + len += sprintf(buf+len, "%c", + test_bit(i, dev->features) ? '1' : '0'); + len += sprintf(buf+len, "\n"); + return len; +} +static struct device_attribute virtio_dev_attrs[] = { + __ATTR_RO(device), + __ATTR_RO(vendor), + __ATTR_RO(status), + __ATTR_RO(modalias), + __ATTR_RO(features), + __ATTR_NULL +}; + +static inline int virtio_id_match(const struct virtio_device *dev, + const struct virtio_device_id *id) +{ + if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID) + return 0; + + return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor; +} + +/* This looks through all the IDs a driver claims to support. If any of them + * match, we return 1 and the kernel will call virtio_dev_probe(). */ +static int virtio_dev_match(struct device *_dv, struct device_driver *_dr) +{ + unsigned int i; + struct virtio_device *dev = container_of(_dv,struct virtio_device,dev); + const struct virtio_device_id *ids; + + ids = container_of(_dr, struct virtio_driver, driver)->id_table; + for (i = 0; ids[i].device; i++) + if (virtio_id_match(dev, &ids[i])) + return 1; + return 0; +} + +static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env) +{ + struct virtio_device *dev = container_of(_dv,struct virtio_device,dev); + + return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X", + dev->id.device, dev->id.vendor); +} + +static void add_status(struct virtio_device *dev, unsigned status) +{ + dev->config->set_status(dev, dev->config->get_status(dev) | status); +} + +void virtio_check_driver_offered_feature(const struct virtio_device *vdev, + unsigned int fbit) +{ + unsigned int i; + struct virtio_driver *drv = container_of(vdev->dev.driver, + struct virtio_driver, driver); + + for (i = 0; i < drv->feature_table_size; i++) + if (drv->feature_table[i] == fbit) + return; + BUG(); +} +EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature); + +static int virtio_dev_probe(struct device *_d) +{ + int err, i; + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + struct virtio_driver *drv = container_of(dev->dev.driver, + struct virtio_driver, driver); + u32 device_features; + + /* We have a driver! */ + add_status(dev, VIRTIO_CONFIG_S_DRIVER); + + /* Figure out what features the device supports. */ + device_features = dev->config->get_features(dev); + + /* Features supported by both device and driver into dev->features. */ + memset(dev->features, 0, sizeof(dev->features)); + for (i = 0; i < drv->feature_table_size; i++) { + unsigned int f = drv->feature_table[i]; + BUG_ON(f >= 32); + if (device_features & (1 << f)) + set_bit(f, dev->features); + } + + /* Transport features always preserved to pass to finalize_features. */ + for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) + if (device_features & (1 << i)) + set_bit(i, dev->features); + + dev->config->finalize_features(dev); + + err = drv->probe(dev); + if (err) + add_status(dev, VIRTIO_CONFIG_S_FAILED); + else + add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); + + return err; +} + +static int virtio_dev_remove(struct device *_d) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + struct virtio_driver *drv = container_of(dev->dev.driver, + struct virtio_driver, driver); + + drv->remove(dev); + + /* Driver should have reset device. */ + BUG_ON(dev->config->get_status(dev)); + + /* Acknowledge the device's existence again. */ + add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + return 0; +} + +static struct bus_type virtio_bus = { + .name = "virtio", + .match = virtio_dev_match, + .dev_attrs = virtio_dev_attrs, + .uevent = virtio_uevent, + .probe = virtio_dev_probe, + .remove = virtio_dev_remove, +}; + +int register_virtio_driver(struct virtio_driver *driver) +{ + /* Catch this early. */ + BUG_ON(driver->feature_table_size && !driver->feature_table); + driver->driver.bus = &virtio_bus; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL_GPL(register_virtio_driver); + +void unregister_virtio_driver(struct virtio_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL_GPL(unregister_virtio_driver); + +int register_virtio_device(struct virtio_device *dev) +{ + int err; + + dev->dev.bus = &virtio_bus; + + /* Assign a unique device index and hence name. */ + dev->index = dev_index++; + dev_set_name(&dev->dev, "virtio%u", dev->index); + + /* We always start by resetting the device, in case a previous + * driver messed it up. This also tests that code path a little. */ + dev->config->reset(dev); + + /* Acknowledge that we've seen the device. */ + add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + + INIT_LIST_HEAD(&dev->vqs); + + /* device_register() causes the bus infrastructure to look for a + * matching driver. */ + err = device_register(&dev->dev); + if (err) + add_status(dev, VIRTIO_CONFIG_S_FAILED); + return err; +} +EXPORT_SYMBOL_GPL(register_virtio_device); + +void unregister_virtio_device(struct virtio_device *dev) +{ + device_unregister(&dev->dev); +} +EXPORT_SYMBOL_GPL(unregister_virtio_device); + +static int virtio_init(void) +{ + if (bus_register(&virtio_bus) != 0) + panic("virtio bus registration failed"); + return 0; +} + +static void __exit virtio_exit(void) +{ + bus_unregister(&virtio_bus); +} +core_initcall(virtio_init); +module_exit(virtio_exit); + +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c new file mode 100644 index 00000000..8807fe50 --- /dev/null +++ b/drivers/virtio/virtio_balloon.c @@ -0,0 +1,476 @@ +/* + * Virtio balloon implementation, inspired by Dor Laor and Marcelo + * Tosatti's implementations. + * + * Copyright 2008 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/virtio.h> +#include <linux/virtio_balloon.h> +#include <linux/swap.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/module.h> + +/* + * Balloon device works in 4K page units. So each page is pointed to by + * multiple balloon pages. All memory counters in this driver are in balloon + * page units. + */ +#define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) + +struct virtio_balloon +{ + struct virtio_device *vdev; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + + /* Where the ballooning thread waits for config to change. */ + wait_queue_head_t config_change; + + /* The thread servicing the balloon. */ + struct task_struct *thread; + + /* Waiting for host to ack the pages we released. */ + struct completion acked; + + /* Number of balloon pages we've told the Host we're not using. */ + unsigned int num_pages; + /* + * The pages we've told the Host we're not using. + * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE + * to num_pages above. + */ + struct list_head pages; + + /* The array of pfns we tell the Host about. */ + unsigned int num_pfns; + u32 pfns[256]; + + /* Memory statistics */ + int need_stats_update; + struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static u32 page_to_balloon_pfn(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + BUILD_BUG_ON(PAGE_SHIFT < VIRTIO_BALLOON_PFN_SHIFT); + /* Convert pfn from Linux page size to balloon page size. */ + return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE; +} + +static struct page *balloon_pfn_to_page(u32 pfn) +{ + BUG_ON(pfn % VIRTIO_BALLOON_PAGES_PER_PAGE); + return pfn_to_page(pfn / VIRTIO_BALLOON_PAGES_PER_PAGE); +} + +static void balloon_ack(struct virtqueue *vq) +{ + struct virtio_balloon *vb; + unsigned int len; + + vb = virtqueue_get_buf(vq, &len); + if (vb) + complete(&vb->acked); +} + +static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) +{ + struct scatterlist sg; + + sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + + init_completion(&vb->acked); + + /* We should always be able to add one buffer to an empty queue. */ + if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) + BUG(); + virtqueue_kick(vq); + + /* When host has read buffer, this completes via balloon_ack */ + wait_for_completion(&vb->acked); +} + +static void set_page_pfns(u32 pfns[], struct page *page) +{ + unsigned int i; + + /* Set balloon pfns pointing at this page. + * Note that the first pfn points at start of the page. */ + for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++) + pfns[i] = page_to_balloon_pfn(page) + i; +} + +static void fill_balloon(struct virtio_balloon *vb, size_t num) +{ + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + for (vb->num_pfns = 0; vb->num_pfns < num; + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { + struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY | + __GFP_NOMEMALLOC | __GFP_NOWARN); + if (!page) { + if (printk_ratelimit()) + dev_printk(KERN_INFO, &vb->vdev->dev, + "Out of puff! Can't get %zu pages\n", + num); + /* Sleep for at least 1/5 of a second before retry. */ + msleep(200); + break; + } + set_page_pfns(vb->pfns + vb->num_pfns, page); + vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; + totalram_pages--; + list_add(&page->lru, &vb->pages); + } + + /* Didn't get any? Oh well. */ + if (vb->num_pfns == 0) + return; + + tell_host(vb, vb->inflate_vq); +} + +static void release_pages_by_pfn(const u32 pfns[], unsigned int num) +{ + unsigned int i; + + /* Find pfns pointing at start of each page, get pages and free them. */ + for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { + __free_page(balloon_pfn_to_page(pfns[i])); + totalram_pages++; + } +} + +static void leak_balloon(struct virtio_balloon *vb, size_t num) +{ + struct page *page; + + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + for (vb->num_pfns = 0; vb->num_pfns < num; + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { + page = list_first_entry(&vb->pages, struct page, lru); + list_del(&page->lru); + set_page_pfns(vb->pfns + vb->num_pfns, page); + vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; + } + + /* + * Note that if + * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); + * is true, we *have* to do it in this order + */ + tell_host(vb, vb->deflate_vq); + release_pages_by_pfn(vb->pfns, vb->num_pfns); +} + +static inline void update_stat(struct virtio_balloon *vb, int idx, + u16 tag, u64 val) +{ + BUG_ON(idx >= VIRTIO_BALLOON_S_NR); + vb->stats[idx].tag = tag; + vb->stats[idx].val = val; +} + +#define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) + +static void update_balloon_stats(struct virtio_balloon *vb) +{ + unsigned long events[NR_VM_EVENT_ITEMS]; + struct sysinfo i; + int idx = 0; + + all_vm_events(events); + si_meminfo(&i); + + update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, + pages_to_bytes(events[PSWPIN])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, + pages_to_bytes(events[PSWPOUT])); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, + pages_to_bytes(i.freeram)); + update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, + pages_to_bytes(i.totalram)); +} + +/* + * While most virtqueues communicate guest-initiated requests to the hypervisor, + * the stats queue operates in reverse. The driver initializes the virtqueue + * with a single buffer. From that point forward, all conversations consist of + * a hypervisor request (a call to this function) which directs us to refill + * the virtqueue with a fresh stats buffer. Since stats collection can sleep, + * we notify our kthread which does the actual work via stats_handle_request(). + */ +static void stats_request(struct virtqueue *vq) +{ + struct virtio_balloon *vb; + unsigned int len; + + vb = virtqueue_get_buf(vq, &len); + if (!vb) + return; + vb->need_stats_update = 1; + wake_up(&vb->config_change); +} + +static void stats_handle_request(struct virtio_balloon *vb) +{ + struct virtqueue *vq; + struct scatterlist sg; + + vb->need_stats_update = 0; + update_balloon_stats(vb); + + vq = vb->stats_vq; + sg_init_one(&sg, vb->stats, sizeof(vb->stats)); + if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) + BUG(); + virtqueue_kick(vq); +} + +static void virtballoon_changed(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + wake_up(&vb->config_change); +} + +static inline s64 towards_target(struct virtio_balloon *vb) +{ + __le32 v; + s64 target; + + vb->vdev->config->get(vb->vdev, + offsetof(struct virtio_balloon_config, num_pages), + &v, sizeof(v)); + target = le32_to_cpu(v); + return target - vb->num_pages; +} + +static void update_balloon_size(struct virtio_balloon *vb) +{ + __le32 actual = cpu_to_le32(vb->num_pages); + + vb->vdev->config->set(vb->vdev, + offsetof(struct virtio_balloon_config, actual), + &actual, sizeof(actual)); +} + +static int balloon(void *_vballoon) +{ + struct virtio_balloon *vb = _vballoon; + + set_freezable(); + while (!kthread_should_stop()) { + s64 diff; + + try_to_freeze(); + wait_event_interruptible(vb->config_change, + (diff = towards_target(vb)) != 0 + || vb->need_stats_update + || kthread_should_stop() + || freezing(current)); + if (vb->need_stats_update) + stats_handle_request(vb); + if (diff > 0) + fill_balloon(vb, diff); + else if (diff < 0) + leak_balloon(vb, -diff); + update_balloon_size(vb); + } + return 0; +} + +static int init_vqs(struct virtio_balloon *vb) +{ + struct virtqueue *vqs[3]; + vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request }; + const char *names[] = { "inflate", "deflate", "stats" }; + int err, nvqs; + + /* + * We expect two virtqueues: inflate and deflate, and + * optionally stat. + */ + nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2; + err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names); + if (err) + return err; + + vb->inflate_vq = vqs[0]; + vb->deflate_vq = vqs[1]; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { + struct scatterlist sg; + vb->stats_vq = vqs[2]; + + /* + * Prime this virtqueue with one buffer so the hypervisor can + * use it to signal us later. + */ + sg_init_one(&sg, vb->stats, sizeof vb->stats); + if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) + < 0) + BUG(); + virtqueue_kick(vb->stats_vq); + } + return 0; +} + +static int virtballoon_probe(struct virtio_device *vdev) +{ + struct virtio_balloon *vb; + int err; + + vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); + if (!vb) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&vb->pages); + vb->num_pages = 0; + init_waitqueue_head(&vb->config_change); + vb->vdev = vdev; + vb->need_stats_update = 0; + + err = init_vqs(vb); + if (err) + goto out_free_vb; + + vb->thread = kthread_run(balloon, vb, "vballoon"); + if (IS_ERR(vb->thread)) { + err = PTR_ERR(vb->thread); + goto out_del_vqs; + } + + return 0; + +out_del_vqs: + vdev->config->del_vqs(vdev); +out_free_vb: + kfree(vb); +out: + return err; +} + +static void __devexit virtballoon_remove(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + kthread_stop(vb->thread); + + /* There might be pages left in the balloon: free them. */ + while (vb->num_pages) + leak_balloon(vb, vb->num_pages); + update_balloon_size(vb); + + /* Now we reset the device so we can clean up the queues. */ + vdev->config->reset(vdev); + + vdev->config->del_vqs(vdev); + kfree(vb); +} + +#ifdef CONFIG_PM +static int virtballoon_freeze(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + /* + * The kthread is already frozen by the PM core before this + * function is called. + */ + + while (vb->num_pages) + leak_balloon(vb, vb->num_pages); + update_balloon_size(vb); + + /* Ensure we don't get any more requests from the host */ + vdev->config->reset(vdev); + vdev->config->del_vqs(vdev); + return 0; +} + +static int restore_common(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + int ret; + + ret = init_vqs(vdev->priv); + if (ret) + return ret; + + fill_balloon(vb, towards_target(vb)); + update_balloon_size(vb); + return 0; +} + +static int virtballoon_restore(struct virtio_device *vdev) +{ + return restore_common(vdev); +} +#endif + +static unsigned int features[] = { + VIRTIO_BALLOON_F_MUST_TELL_HOST, + VIRTIO_BALLOON_F_STATS_VQ, +}; + +static struct virtio_driver virtio_balloon_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtballoon_probe, + .remove = __devexit_p(virtballoon_remove), + .config_changed = virtballoon_changed, +#ifdef CONFIG_PM + .freeze = virtballoon_freeze, + .restore = virtballoon_restore, +#endif +}; + +static int __init init(void) +{ + return register_virtio_driver(&virtio_balloon_driver); +} + +static void __exit fini(void) +{ + unregister_virtio_driver(&virtio_balloon_driver); +} +module_init(init); +module_exit(fini); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio balloon driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c new file mode 100644 index 00000000..01d6dc25 --- /dev/null +++ b/drivers/virtio/virtio_mmio.c @@ -0,0 +1,485 @@ +/* + * Virtio memory mapped device driver + * + * Copyright 2011, ARM Ltd. + * + * This module allows virtio devices to be used over a virtual, memory mapped + * platform device. + * + * Registers layout (all 32-bit wide): + * + * offset d. name description + * ------ -- ---------------- ----------------- + * + * 0x000 R MagicValue Magic value "virt" + * 0x004 R Version Device version (current max. 1) + * 0x008 R DeviceID Virtio device ID + * 0x00c R VendorID Virtio vendor ID + * + * 0x010 R HostFeatures Features supported by the host + * 0x014 W HostFeaturesSel Set of host features to access via HostFeatures + * + * 0x020 W GuestFeatures Features activated by the guest + * 0x024 W GuestFeaturesSel Set of activated features to set via GuestFeatures + * 0x028 W GuestPageSize Size of guest's memory page in bytes + * + * 0x030 W QueueSel Queue selector + * 0x034 R QueueNumMax Maximum size of the currently selected queue + * 0x038 W QueueNum Queue size for the currently selected queue + * 0x03c W QueueAlign Used Ring alignment for the current queue + * 0x040 RW QueuePFN PFN for the currently selected queue + * + * 0x050 W QueueNotify Queue notifier + * 0x060 R InterruptStatus Interrupt status register + * 0x060 W InterruptACK Interrupt acknowledge register + * 0x070 RW Status Device status register + * + * 0x100+ RW Device-specific configuration space + * + * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include <linux/highmem.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_mmio.h> +#include <linux/virtio_ring.h> + + + +/* The alignment to use between consumer and producer parts of vring. + * Currently hardcoded to the page size. */ +#define VIRTIO_MMIO_VRING_ALIGN PAGE_SIZE + + + +#define to_virtio_mmio_device(_plat_dev) \ + container_of(_plat_dev, struct virtio_mmio_device, vdev) + +struct virtio_mmio_device { + struct virtio_device vdev; + struct platform_device *pdev; + + void __iomem *base; + unsigned long version; + + /* a list of queues so we can dispatch IRQs */ + spinlock_t lock; + struct list_head virtqueues; +}; + +struct virtio_mmio_vq_info { + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the number of entries in the queue */ + unsigned int num; + + /* the index of the queue */ + int queue_index; + + /* the virtual address of the ring queue */ + void *queue; + + /* the list node for the virtqueues list */ + struct list_head node; +}; + + + +/* Configuration interface */ + +static u32 vm_get_features(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* TODO: Features > 32 bits */ + writel(0, vm_dev->base + VIRTIO_MMIO_HOST_FEATURES_SEL); + + return readl(vm_dev->base + VIRTIO_MMIO_HOST_FEATURES); +} + +static void vm_finalize_features(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + int i; + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + for (i = 0; i < ARRAY_SIZE(vdev->features); i++) { + writel(i, vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES_SEL); + writel(vdev->features[i], + vm_dev->base + VIRTIO_MMIO_GUEST_FEATURES); + } +} + +static void vm_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + ptr[i] = readb(vm_dev->base + VIRTIO_MMIO_CONFIG + offset + i); +} + +static void vm_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + const u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + writeb(ptr[i], vm_dev->base + VIRTIO_MMIO_CONFIG + offset + i); +} + +static u8 vm_get_status(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + return readl(vm_dev->base + VIRTIO_MMIO_STATUS) & 0xff; +} + +static void vm_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + + writel(status, vm_dev->base + VIRTIO_MMIO_STATUS); +} + +static void vm_reset(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + /* 0 status means a reset. */ + writel(0, vm_dev->base + VIRTIO_MMIO_STATUS); +} + + + +/* Transport interface */ + +/* the notify function used when creating a virt queue */ +static void vm_notify(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + struct virtio_mmio_vq_info *info = vq->priv; + + /* We write the queue's selector into the notification register to + * signal the other end */ + writel(info->queue_index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); +} + +/* Notify all virtqueues on an interrupt. */ +static irqreturn_t vm_interrupt(int irq, void *opaque) +{ + struct virtio_mmio_device *vm_dev = opaque; + struct virtio_mmio_vq_info *info; + struct virtio_driver *vdrv = container_of(vm_dev->vdev.dev.driver, + struct virtio_driver, driver); + unsigned long status; + unsigned long flags; + irqreturn_t ret = IRQ_NONE; + + /* Read and acknowledge interrupts */ + status = readl(vm_dev->base + VIRTIO_MMIO_INTERRUPT_STATUS); + writel(status, vm_dev->base + VIRTIO_MMIO_INTERRUPT_ACK); + + if (unlikely(status & VIRTIO_MMIO_INT_CONFIG) + && vdrv && vdrv->config_changed) { + vdrv->config_changed(&vm_dev->vdev); + ret = IRQ_HANDLED; + } + + if (likely(status & VIRTIO_MMIO_INT_VRING)) { + spin_lock_irqsave(&vm_dev->lock, flags); + list_for_each_entry(info, &vm_dev->virtqueues, node) + ret |= vring_interrupt(irq, info->vq); + spin_unlock_irqrestore(&vm_dev->lock, flags); + } + + return ret; +} + + + +static void vm_del_vq(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + struct virtio_mmio_vq_info *info = vq->priv; + unsigned long flags, size; + + spin_lock_irqsave(&vm_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vm_dev->lock, flags); + + vring_del_virtqueue(vq); + + /* Select and deactivate the queue */ + writel(info->queue_index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + + size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); + free_pages_exact(info->queue, size); + kfree(info); +} + +static void vm_del_vqs(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + struct virtqueue *vq, *n; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + vm_del_vq(vq); + + free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev); +} + + + +static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + struct virtio_mmio_vq_info *info; + struct virtqueue *vq; + unsigned long flags, size; + int err; + + /* Select the queue we're interested in */ + writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); + + /* Queue shouldn't already be set up. */ + if (readl(vm_dev->base + VIRTIO_MMIO_QUEUE_PFN)) { + err = -ENOENT; + goto error_available; + } + + /* Allocate and fill out our active queue description */ + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto error_kmalloc; + } + info->queue_index = index; + + /* Allocate pages for the queue - start with a queue as big as + * possible (limited by maximum size allowed by device), drop down + * to a minimal size, just big enough to fit descriptor table + * and two rings (which makes it "alignment_size * 2") + */ + info->num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); + while (1) { + size = PAGE_ALIGN(vring_size(info->num, + VIRTIO_MMIO_VRING_ALIGN)); + /* Already smallest possible allocation? */ + if (size <= VIRTIO_MMIO_VRING_ALIGN * 2) { + err = -ENOMEM; + goto error_alloc_pages; + } + + info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); + if (info->queue) + break; + + info->num /= 2; + } + + /* Activate the queue */ + writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); + writel(VIRTIO_MMIO_VRING_ALIGN, + vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); + writel(virt_to_phys(info->queue) >> PAGE_SHIFT, + vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + + /* Create the vring */ + vq = vring_new_virtqueue(info->num, VIRTIO_MMIO_VRING_ALIGN, vdev, + true, info->queue, vm_notify, callback, name); + if (!vq) { + err = -ENOMEM; + goto error_new_virtqueue; + } + + vq->priv = info; + info->vq = vq; + + spin_lock_irqsave(&vm_dev->lock, flags); + list_add(&info->node, &vm_dev->virtqueues); + spin_unlock_irqrestore(&vm_dev->lock, flags); + + return vq; + +error_new_virtqueue: + writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); + free_pages_exact(info->queue, size); +error_alloc_pages: + kfree(info); +error_kmalloc: +error_available: + return ERR_PTR(err); +} + +static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + unsigned int irq = platform_get_irq(vm_dev->pdev, 0); + int i, err; + + err = request_irq(irq, vm_interrupt, IRQF_SHARED, + dev_name(&vdev->dev), vm_dev); + if (err) + return err; + + for (i = 0; i < nvqs; ++i) { + vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i]); + if (IS_ERR(vqs[i])) { + vm_del_vqs(vdev); + return PTR_ERR(vqs[i]); + } + } + + return 0; +} + +static const char *vm_bus_name(struct virtio_device *vdev) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + + return vm_dev->pdev->name; +} + +static struct virtio_config_ops virtio_mmio_config_ops = { + .get = vm_get, + .set = vm_set, + .get_status = vm_get_status, + .set_status = vm_set_status, + .reset = vm_reset, + .find_vqs = vm_find_vqs, + .del_vqs = vm_del_vqs, + .get_features = vm_get_features, + .finalize_features = vm_finalize_features, + .bus_name = vm_bus_name, +}; + + + +/* Platform device */ + +static int __devinit virtio_mmio_probe(struct platform_device *pdev) +{ + struct virtio_mmio_device *vm_dev; + struct resource *mem; + unsigned long magic; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) + return -EINVAL; + + if (!devm_request_mem_region(&pdev->dev, mem->start, + resource_size(mem), pdev->name)) + return -EBUSY; + + vm_dev = devm_kzalloc(&pdev->dev, sizeof(*vm_dev), GFP_KERNEL); + if (!vm_dev) + return -ENOMEM; + + vm_dev->vdev.dev.parent = &pdev->dev; + vm_dev->vdev.config = &virtio_mmio_config_ops; + vm_dev->pdev = pdev; + INIT_LIST_HEAD(&vm_dev->virtqueues); + spin_lock_init(&vm_dev->lock); + + vm_dev->base = devm_ioremap(&pdev->dev, mem->start, resource_size(mem)); + if (vm_dev->base == NULL) + return -EFAULT; + + /* Check magic value */ + magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE); + if (memcmp(&magic, "virt", 4) != 0) { + dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic); + return -ENODEV; + } + + /* Check device version */ + vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION); + if (vm_dev->version != 1) { + dev_err(&pdev->dev, "Version %ld not supported!\n", + vm_dev->version); + return -ENXIO; + } + + vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID); + vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); + + writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); + + platform_set_drvdata(pdev, vm_dev); + + return register_virtio_device(&vm_dev->vdev); +} + +static int __devexit virtio_mmio_remove(struct platform_device *pdev) +{ + struct virtio_mmio_device *vm_dev = platform_get_drvdata(pdev); + + unregister_virtio_device(&vm_dev->vdev); + + return 0; +} + + + +/* Platform driver */ + +static struct of_device_id virtio_mmio_match[] = { + { .compatible = "virtio,mmio", }, + {}, +}; +MODULE_DEVICE_TABLE(of, virtio_mmio_match); + +static struct platform_driver virtio_mmio_driver = { + .probe = virtio_mmio_probe, + .remove = __devexit_p(virtio_mmio_remove), + .driver = { + .name = "virtio-mmio", + .owner = THIS_MODULE, + .of_match_table = virtio_mmio_match, + }, +}; + +static int __init virtio_mmio_init(void) +{ + return platform_driver_register(&virtio_mmio_driver); +} + +static void __exit virtio_mmio_exit(void) +{ + platform_driver_unregister(&virtio_mmio_driver); +} + +module_init(virtio_mmio_init); +module_exit(virtio_mmio_exit); + +MODULE_AUTHOR("Pawel Moll <pawel.moll@arm.com>"); +MODULE_DESCRIPTION("Platform bus driver for memory mapped virtio devices"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c new file mode 100644 index 00000000..2e03d416 --- /dev/null +++ b/drivers/virtio/virtio_pci.c @@ -0,0 +1,797 @@ +/* + * Virtio PCI driver + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_pci.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> + +MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); +MODULE_DESCRIPTION("virtio-pci"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1"); + +/* Our device structure */ +struct virtio_pci_device +{ + struct virtio_device vdev; + struct pci_dev *pci_dev; + + /* the IO mapping for the PCI config space */ + void __iomem *ioaddr; + + /* a list of queues so we can dispatch IRQs */ + spinlock_t lock; + struct list_head virtqueues; + + /* MSI-X support */ + int msix_enabled; + int intx_enabled; + struct msix_entry *msix_entries; + /* Name strings for interrupts. This size should be enough, + * and I'm too lazy to allocate each name separately. */ + char (*msix_names)[256]; + /* Number of available vectors */ + unsigned msix_vectors; + /* Vectors allocated, excluding per-vq vectors if any */ + unsigned msix_used_vectors; + + /* Status saved during hibernate/restore */ + u8 saved_status; + + /* Whether we have vector per vq */ + bool per_vq_vectors; +}; + +/* Constants for MSI-X */ +/* Use first vector for configuration changes, second and the rest for + * virtqueues Thus, we need at least 2 vectors for MSI. */ +enum { + VP_MSIX_CONFIG_VECTOR = 0, + VP_MSIX_VQ_VECTOR = 1, +}; + +struct virtio_pci_vq_info +{ + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the number of entries in the queue */ + int num; + + /* the index of the queue */ + int queue_index; + + /* the virtual address of the ring queue */ + void *queue; + + /* the list node for the virtqueues list */ + struct list_head node; + + /* MSI-X vector (or none) */ + unsigned msix_vector; +}; + +/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ +static struct pci_device_id virtio_pci_id_table[] = { + { 0x1af4, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, + { 0 }, +}; + +MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); + +/* Convert a generic virtio device to our structure */ +static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev) +{ + return container_of(vdev, struct virtio_pci_device, vdev); +} + +/* virtio config->get_features() implementation */ +static u32 vp_get_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* When someone needs more than 32 feature bits, we'll need to + * steal a bit to indicate that the rest are somewhere else. */ + return ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES); +} + +/* virtio config->finalize_features() implementation */ +static void vp_finalize_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + /* We only support 32 feature bits. */ + BUILD_BUG_ON(ARRAY_SIZE(vdev->features) != 1); + iowrite32(vdev->features[0], vp_dev->ioaddr+VIRTIO_PCI_GUEST_FEATURES); +} + +/* virtio config->get() implementation */ +static void vp_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *ioaddr = vp_dev->ioaddr + + VIRTIO_PCI_CONFIG(vp_dev) + offset; + u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + ptr[i] = ioread8(ioaddr + i); +} + +/* the config->set() implementation. it's symmetric to the config->get() + * implementation */ +static void vp_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *ioaddr = vp_dev->ioaddr + + VIRTIO_PCI_CONFIG(vp_dev) + offset; + const u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + iowrite8(ptr[i], ioaddr + i); +} + +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +static void vp_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +/* wait for pending irq handlers */ +static void vp_synchronize_vectors(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + int i; + + if (vp_dev->intx_enabled) + synchronize_irq(vp_dev->pci_dev->irq); + + for (i = 0; i < vp_dev->msix_vectors; ++i) + synchronize_irq(vp_dev->msix_entries[i].vector); +} + +static void vp_reset(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* 0 status means a reset. */ + iowrite8(0, vp_dev->ioaddr + VIRTIO_PCI_STATUS); + /* Flush out the status write, and flush in device writes, + * including MSi-X interrupts, if any. */ + ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS); + /* Flush pending VQ/configuration callbacks. */ + vp_synchronize_vectors(vdev); +} + +/* the notify function used when creating a virt queue */ +static void vp_notify(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_vq_info *info = vq->priv; + + /* we write the queue's selector into the notification register to + * signal the other end */ + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); +} + +/* Handle a configuration change: Tell driver if it wants to know. */ +static irqreturn_t vp_config_changed(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + struct virtio_driver *drv; + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); + + if (drv && drv->config_changed) + drv->config_changed(&vp_dev->vdev); + return IRQ_HANDLED; +} + +/* Notify all virtqueues on an interrupt. */ +static irqreturn_t vp_vring_interrupt(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + struct virtio_pci_vq_info *info; + irqreturn_t ret = IRQ_NONE; + unsigned long flags; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_for_each_entry(info, &vp_dev->virtqueues, node) { + if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) + ret = IRQ_HANDLED; + } + spin_unlock_irqrestore(&vp_dev->lock, flags); + + return ret; +} + +/* A small wrapper to also acknowledge the interrupt when it's handled. + * I really need an EIO hook for the vring so I can ack the interrupt once we + * know that we'll be handling the IRQ but before we invoke the callback since + * the callback may notify the host which results in the host attempting to + * raise an interrupt that we would then mask once we acknowledged the + * interrupt. */ +static irqreturn_t vp_interrupt(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + u8 isr; + + /* reading the ISR has the effect of also clearing it so it's very + * important to save off the value. */ + isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); + + /* It's definitely not us if the ISR was not high */ + if (!isr) + return IRQ_NONE; + + /* Configuration change? Tell driver if it wants to know. */ + if (isr & VIRTIO_PCI_ISR_CONFIG) + vp_config_changed(irq, opaque); + + return vp_vring_interrupt(irq, opaque); +} + +static void vp_free_vectors(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + int i; + + if (vp_dev->intx_enabled) { + free_irq(vp_dev->pci_dev->irq, vp_dev); + vp_dev->intx_enabled = 0; + } + + for (i = 0; i < vp_dev->msix_used_vectors; ++i) + free_irq(vp_dev->msix_entries[i].vector, vp_dev); + + if (vp_dev->msix_enabled) { + /* Disable the vector used for configuration */ + iowrite16(VIRTIO_MSI_NO_VECTOR, + vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Flush the write out to device */ + ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + + pci_disable_msix(vp_dev->pci_dev); + vp_dev->msix_enabled = 0; + vp_dev->msix_vectors = 0; + } + + vp_dev->msix_used_vectors = 0; + kfree(vp_dev->msix_names); + vp_dev->msix_names = NULL; + kfree(vp_dev->msix_entries); + vp_dev->msix_entries = NULL; +} + +static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, + bool per_vq_vectors) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + const char *name = dev_name(&vp_dev->vdev.dev); + unsigned i, v; + int err = -ENOMEM; + + vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, + GFP_KERNEL); + if (!vp_dev->msix_entries) + goto error; + vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, + GFP_KERNEL); + if (!vp_dev->msix_names) + goto error; + + for (i = 0; i < nvectors; ++i) + vp_dev->msix_entries[i].entry = i; + + /* pci_enable_msix returns positive if we can't get this many. */ + err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors); + if (err > 0) + err = -ENOSPC; + if (err) + goto error; + vp_dev->msix_vectors = nvectors; + vp_dev->msix_enabled = 1; + + /* Set the vector used for configuration */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-config", name); + err = request_irq(vp_dev->msix_entries[v].vector, + vp_config_changed, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; + + iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Verify we had enough resources to assign the vector */ + v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + if (v == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto error; + } + + if (!per_vq_vectors) { + /* Shared vector for all VQs */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-virtqueues", name); + err = request_irq(vp_dev->msix_entries[v].vector, + vp_vring_interrupt, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; + } + return 0; +error: + vp_free_vectors(vdev); + return err; +} + +static int vp_request_intx(struct virtio_device *vdev) +{ + int err; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, + IRQF_SHARED, dev_name(&vdev->dev), vp_dev); + if (!err) + vp_dev->intx_enabled = 1; + return err; +} + +static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name, + u16 msix_vec) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_vq_info *info; + struct virtqueue *vq; + unsigned long flags, size; + u16 num; + int err; + + /* Select the queue we're interested in */ + iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + + /* Check if queue is either not available or already active. */ + num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM); + if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN)) + return ERR_PTR(-ENOENT); + + /* allocate and fill out our structure the represents an active + * queue */ + info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->queue_index = index; + info->num = num; + info->msix_vector = msix_vec; + + size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); + info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); + if (info->queue == NULL) { + err = -ENOMEM; + goto out_info; + } + + /* activate the queue */ + iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, + vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + + /* create the vring */ + vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, vdev, + true, info->queue, vp_notify, callback, name); + if (!vq) { + err = -ENOMEM; + goto out_activate_queue; + } + + vq->priv = info; + info->vq = vq; + + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { + iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + if (msix_vec == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto out_assign; + } + } + + if (callback) { + spin_lock_irqsave(&vp_dev->lock, flags); + list_add(&info->node, &vp_dev->virtqueues); + spin_unlock_irqrestore(&vp_dev->lock, flags); + } else { + INIT_LIST_HEAD(&info->node); + } + + return vq; + +out_assign: + vring_del_virtqueue(vq); +out_activate_queue: + iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + free_pages_exact(info->queue, size); +out_info: + kfree(info); + return ERR_PTR(err); +} + +static void vp_del_vq(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_vq_info *info = vq->priv; + unsigned long flags, size; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); + + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + + if (vp_dev->msix_enabled) { + iowrite16(VIRTIO_MSI_NO_VECTOR, + vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + /* Flush the write out to device */ + ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); + } + + vring_del_virtqueue(vq); + + /* Select and deactivate the queue */ + iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + + size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN)); + free_pages_exact(info->queue, size); + kfree(info); +} + +/* the config->del_vqs() implementation */ +static void vp_del_vqs(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtqueue *vq, *n; + struct virtio_pci_vq_info *info; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) { + info = vq->priv; + if (vp_dev->per_vq_vectors && + info->msix_vector != VIRTIO_MSI_NO_VECTOR) + free_irq(vp_dev->msix_entries[info->msix_vector].vector, + vq); + vp_del_vq(vq); + } + vp_dev->per_vq_vectors = false; + + vp_free_vectors(vdev); +} + +static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[], + bool use_msix, + bool per_vq_vectors) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u16 msix_vec; + int i, err, nvectors, allocated_vectors; + + if (!use_msix) { + /* Old style: one normal interrupt for change and all vqs. */ + err = vp_request_intx(vdev); + if (err) + goto error_request; + } else { + if (per_vq_vectors) { + /* Best option: one for change interrupt, one per vq. */ + nvectors = 1; + for (i = 0; i < nvqs; ++i) + if (callbacks[i]) + ++nvectors; + } else { + /* Second best: one for change, shared for all vqs. */ + nvectors = 2; + } + + err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors); + if (err) + goto error_request; + } + + vp_dev->per_vq_vectors = per_vq_vectors; + allocated_vectors = vp_dev->msix_used_vectors; + for (i = 0; i < nvqs; ++i) { + if (!callbacks[i] || !vp_dev->msix_enabled) + msix_vec = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + msix_vec = allocated_vectors++; + else + msix_vec = VP_MSIX_VQ_VECTOR; + vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); + goto error_find; + } + + if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) + continue; + + /* allocate per-vq irq if available and necessary */ + snprintf(vp_dev->msix_names[msix_vec], + sizeof *vp_dev->msix_names, + "%s-%s", + dev_name(&vp_dev->vdev.dev), names[i]); + err = request_irq(vp_dev->msix_entries[msix_vec].vector, + vring_interrupt, 0, + vp_dev->msix_names[msix_vec], + vqs[i]); + if (err) { + vp_del_vq(vqs[i]); + goto error_find; + } + } + return 0; + +error_find: + vp_del_vqs(vdev); + +error_request: + return err; +} + +/* the config->find_vqs() implementation */ +static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + int err; + + /* Try MSI-X with one vector per queue. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true, true); + if (!err) + return 0; + /* Fallback: MSI-X with one vector for config, one shared for queues. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + true, false); + if (!err) + return 0; + /* Finally fall back to regular interrupts. */ + return vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + false, false); +} + +static const char *vp_bus_name(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return pci_name(vp_dev->pci_dev); +} + +static struct virtio_config_ops virtio_pci_config_ops = { + .get = vp_get, + .set = vp_set, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vqs = vp_find_vqs, + .del_vqs = vp_del_vqs, + .get_features = vp_get_features, + .finalize_features = vp_finalize_features, + .bus_name = vp_bus_name, +}; + +static void virtio_pci_release_dev(struct device *_d) +{ + /* + * No need for a release method as we allocate/free + * all devices together with the pci devices. + * Provide an empty one to avoid getting a warning from core. + */ +} + +/* the PCI probing function */ +static int __devinit virtio_pci_probe(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + struct virtio_pci_device *vp_dev; + int err; + + /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f) + return -ENODEV; + + if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) { + printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n", + VIRTIO_PCI_ABI_VERSION, pci_dev->revision); + return -ENODEV; + } + + /* allocate our structure and fill it out */ + vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); + if (vp_dev == NULL) + return -ENOMEM; + + vp_dev->vdev.dev.parent = &pci_dev->dev; + vp_dev->vdev.dev.release = virtio_pci_release_dev; + vp_dev->vdev.config = &virtio_pci_config_ops; + vp_dev->pci_dev = pci_dev; + INIT_LIST_HEAD(&vp_dev->virtqueues); + spin_lock_init(&vp_dev->lock); + + /* Disable MSI/MSIX to bring device to a known good state. */ + pci_msi_off(pci_dev); + + /* enable the device */ + err = pci_enable_device(pci_dev); + if (err) + goto out; + + err = pci_request_regions(pci_dev, "virtio-pci"); + if (err) + goto out_enable_device; + + vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0); + if (vp_dev->ioaddr == NULL) + goto out_req_regions; + + pci_set_drvdata(pci_dev, vp_dev); + pci_set_master(pci_dev); + + /* we use the subsystem vendor/device id as the virtio vendor/device + * id. this allows us to use the same PCI vendor/device id for all + * virtio devices and to identify the particular virtio driver by + * the subsystem ids */ + vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; + vp_dev->vdev.id.device = pci_dev->subsystem_device; + + /* finally register the virtio device */ + err = register_virtio_device(&vp_dev->vdev); + if (err) + goto out_set_drvdata; + + return 0; + +out_set_drvdata: + pci_set_drvdata(pci_dev, NULL); + pci_iounmap(pci_dev, vp_dev->ioaddr); +out_req_regions: + pci_release_regions(pci_dev); +out_enable_device: + pci_disable_device(pci_dev); +out: + kfree(vp_dev); + return err; +} + +static void __devexit virtio_pci_remove(struct pci_dev *pci_dev) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + + unregister_virtio_device(&vp_dev->vdev); + + vp_del_vqs(&vp_dev->vdev); + pci_set_drvdata(pci_dev, NULL); + pci_iounmap(pci_dev, vp_dev->ioaddr); + pci_release_regions(pci_dev); + pci_disable_device(pci_dev); + kfree(vp_dev); +} + +#ifdef CONFIG_PM +static int virtio_pci_freeze(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct virtio_driver *drv; + int ret; + + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); + + ret = 0; + vp_dev->saved_status = vp_get_status(&vp_dev->vdev); + if (drv && drv->freeze) + ret = drv->freeze(&vp_dev->vdev); + + if (!ret) + pci_disable_device(pci_dev); + return ret; +} + +static int virtio_pci_restore(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct virtio_driver *drv; + int ret; + + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); + + ret = pci_enable_device(pci_dev); + if (ret) + return ret; + + pci_set_master(pci_dev); + vp_finalize_features(&vp_dev->vdev); + + if (drv && drv->restore) + ret = drv->restore(&vp_dev->vdev); + + /* Finally, tell the device we're all set */ + if (!ret) + vp_set_status(&vp_dev->vdev, vp_dev->saved_status); + + return ret; +} + +static const struct dev_pm_ops virtio_pci_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(virtio_pci_freeze, virtio_pci_restore) +}; +#endif + +static struct pci_driver virtio_pci_driver = { + .name = "virtio-pci", + .id_table = virtio_pci_id_table, + .probe = virtio_pci_probe, + .remove = __devexit_p(virtio_pci_remove), +#ifdef CONFIG_PM + .driver.pm = &virtio_pci_pm_ops, +#endif +}; + +static int __init virtio_pci_init(void) +{ + return pci_register_driver(&virtio_pci_driver); +} + +module_init(virtio_pci_init); + +static void __exit virtio_pci_exit(void) +{ + pci_unregister_driver(&virtio_pci_driver); +} + +module_exit(virtio_pci_exit); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c new file mode 100644 index 00000000..5aa43c33 --- /dev/null +++ b/drivers/virtio/virtio_ring.c @@ -0,0 +1,718 @@ +/* Virtio ring implementation. + * + * Copyright 2007 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/virtio.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_config.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/hrtimer.h> + +/* virtio guest is communicating with a virtual "device" that actually runs on + * a host processor. Memory barriers are used to control SMP effects. */ +#ifdef CONFIG_SMP +/* Where possible, use SMP barriers which are more lightweight than mandatory + * barriers, because mandatory barriers control MMIO effects on accesses + * through relaxed memory I/O windows (which virtio-pci does not use). */ +#define virtio_mb(vq) \ + do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0) +#define virtio_rmb(vq) \ + do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0) +#define virtio_wmb(vq) \ + do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0) +#else +/* We must force memory ordering even if guest is UP since host could be + * running on another CPU, but SMP barriers are defined to barrier() in that + * configuration. So fall back to mandatory barriers instead. */ +#define virtio_mb(vq) mb() +#define virtio_rmb(vq) rmb() +#define virtio_wmb(vq) wmb() +#endif + +#ifdef DEBUG +/* For development, we want to crash whenever the ring is screwed. */ +#define BAD_RING(_vq, fmt, args...) \ + do { \ + dev_err(&(_vq)->vq.vdev->dev, \ + "%s:"fmt, (_vq)->vq.name, ##args); \ + BUG(); \ + } while (0) +/* Caller is supposed to guarantee no reentry. */ +#define START_USE(_vq) \ + do { \ + if ((_vq)->in_use) \ + panic("%s:in_use = %i\n", \ + (_vq)->vq.name, (_vq)->in_use); \ + (_vq)->in_use = __LINE__; \ + } while (0) +#define END_USE(_vq) \ + do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0) +#else +#define BAD_RING(_vq, fmt, args...) \ + do { \ + dev_err(&_vq->vq.vdev->dev, \ + "%s:"fmt, (_vq)->vq.name, ##args); \ + (_vq)->broken = true; \ + } while (0) +#define START_USE(vq) +#define END_USE(vq) +#endif + +struct vring_virtqueue +{ + struct virtqueue vq; + + /* Actual memory layout for this queue */ + struct vring vring; + + /* Can we use weak barriers? */ + bool weak_barriers; + + /* Other side has made a mess, don't try any more. */ + bool broken; + + /* Host supports indirect buffers */ + bool indirect; + + /* Host publishes avail event idx */ + bool event; + + /* Number of free buffers */ + unsigned int num_free; + /* Head of free buffer list. */ + unsigned int free_head; + /* Number we've added since last sync. */ + unsigned int num_added; + + /* Last used index we've seen. */ + u16 last_used_idx; + + /* How to notify other side. FIXME: commonalize hcalls! */ + void (*notify)(struct virtqueue *vq); + +#ifdef DEBUG + /* They're supposed to lock for us. */ + unsigned int in_use; + + /* Figure out if their kicks are too delayed. */ + bool last_add_time_valid; + ktime_t last_add_time; +#endif + + /* Tokens for callbacks. */ + void *data[]; +}; + +#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) + +/* Set up an indirect table of descriptors and add it to the queue. */ +static int vring_add_indirect(struct vring_virtqueue *vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in, + gfp_t gfp) +{ + struct vring_desc *desc; + unsigned head; + int i; + + desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); + if (!desc) + return -ENOMEM; + + /* Transfer entries from the sg list into the indirect page */ + for (i = 0; i < out; i++) { + desc[i].flags = VRING_DESC_F_NEXT; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + sg++; + } + for (; i < (out + in); i++) { + desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + sg++; + } + + /* Last one doesn't continue. */ + desc[i-1].flags &= ~VRING_DESC_F_NEXT; + desc[i-1].next = 0; + + /* We're about to use a buffer */ + vq->num_free--; + + /* Use a single buffer which doesn't continue */ + head = vq->free_head; + vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT; + vq->vring.desc[head].addr = virt_to_phys(desc); + vq->vring.desc[head].len = i * sizeof(struct vring_desc); + + /* Update free pointer */ + vq->free_head = vq->vring.desc[head].next; + + return head; +} + +/** + * virtqueue_add_buf - expose buffer to other end + * @vq: the struct virtqueue we're talking about. + * @sg: the description of the buffer(s). + * @out_num: the number of sg readable by other side + * @in_num: the number of sg which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns remaining capacity of queue or a negative error + * (ie. ENOSPC). Note that it only really makes sense to treat all + * positive return values as "available": indirect buffers mean that + * we can put an entire sg[] array inside a single queue entry. + */ +int virtqueue_add_buf(struct virtqueue *_vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in, + void *data, + gfp_t gfp) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i, avail, uninitialized_var(prev); + int head; + + START_USE(vq); + + BUG_ON(data == NULL); + +#ifdef DEBUG + { + ktime_t now = ktime_get(); + + /* No kick or get, with .1 second between? Warn. */ + if (vq->last_add_time_valid) + WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time)) + > 100); + vq->last_add_time = now; + vq->last_add_time_valid = true; + } +#endif + + /* If the host supports indirect descriptor tables, and we have multiple + * buffers, then go indirect. FIXME: tune this threshold */ + if (vq->indirect && (out + in) > 1 && vq->num_free) { + head = vring_add_indirect(vq, sg, out, in, gfp); + if (likely(head >= 0)) + goto add_head; + } + + BUG_ON(out + in > vq->vring.num); + BUG_ON(out + in == 0); + + if (vq->num_free < out + in) { + pr_debug("Can't add buf len %i - avail = %i\n", + out + in, vq->num_free); + /* FIXME: for historical reasons, we force a notify here if + * there are outgoing parts to the buffer. Presumably the + * host should service the ring ASAP. */ + if (out) + vq->notify(&vq->vq); + END_USE(vq); + return -ENOSPC; + } + + /* We're about to use some buffers from the free list. */ + vq->num_free -= out + in; + + head = vq->free_head; + for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + sg++; + } + for (; in; i = vq->vring.desc[i].next, in--) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + sg++; + } + /* Last one doesn't continue. */ + vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; + + /* Update free pointer */ + vq->free_head = i; + +add_head: + /* Set token. */ + vq->data[head] = data; + + /* Put entry in available array (but don't update avail->idx until they + * do sync). */ + avail = (vq->vring.avail->idx & (vq->vring.num-1)); + vq->vring.avail->ring[avail] = head; + + /* Descriptors and available array need to be set before we expose the + * new available array entries. */ + virtio_wmb(vq); + vq->vring.avail->idx++; + vq->num_added++; + + /* This is very unlikely, but theoretically possible. Kick + * just in case. */ + if (unlikely(vq->num_added == (1 << 16) - 1)) + virtqueue_kick(_vq); + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + return vq->num_free; +} +EXPORT_SYMBOL_GPL(virtqueue_add_buf); + +/** + * virtqueue_kick_prepare - first half of split virtqueue_kick call. + * @vq: the struct virtqueue + * + * Instead of virtqueue_kick(), you can do: + * if (virtqueue_kick_prepare(vq)) + * virtqueue_notify(vq); + * + * This is sometimes useful because the virtqueue_kick_prepare() needs + * to be serialized, but the actual virtqueue_notify() call does not. + */ +bool virtqueue_kick_prepare(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 new, old; + bool needs_kick; + + START_USE(vq); + /* We need to expose available array entries before checking avail + * event. */ + virtio_mb(vq); + + old = vq->vring.avail->idx - vq->num_added; + new = vq->vring.avail->idx; + vq->num_added = 0; + +#ifdef DEBUG + if (vq->last_add_time_valid) { + WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), + vq->last_add_time)) > 100); + } + vq->last_add_time_valid = false; +#endif + + if (vq->event) { + needs_kick = vring_need_event(vring_avail_event(&vq->vring), + new, old); + } else { + needs_kick = !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY); + } + END_USE(vq); + return needs_kick; +} +EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); + +/** + * virtqueue_notify - second half of split virtqueue_kick call. + * @vq: the struct virtqueue + * + * This does not need to be serialized. + */ +void virtqueue_notify(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + /* Prod other side to tell it about changes. */ + vq->notify(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_notify); + +/** + * virtqueue_kick - update after add_buf + * @vq: the struct virtqueue + * + * After one or more virtqueue_add_buf calls, invoke this to kick + * the other side. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +void virtqueue_kick(struct virtqueue *vq) +{ + if (virtqueue_kick_prepare(vq)) + virtqueue_notify(vq); +} +EXPORT_SYMBOL_GPL(virtqueue_kick); + +static void detach_buf(struct vring_virtqueue *vq, unsigned int head) +{ + unsigned int i; + + /* Clear data ptr. */ + vq->data[head] = NULL; + + /* Put back on free list: find end */ + i = head; + + /* Free the indirect table */ + if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) + kfree(phys_to_virt(vq->vring.desc[i].addr)); + + while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) { + i = vq->vring.desc[i].next; + vq->num_free++; + } + + vq->vring.desc[i].next = vq->free_head; + vq->free_head = head; + /* Plus final descriptor */ + vq->num_free++; +} + +static inline bool more_used(const struct vring_virtqueue *vq) +{ + return vq->last_used_idx != vq->vring.used->idx; +} + +/** + * virtqueue_get_buf - get the next used buffer + * @vq: the struct virtqueue we're talking about. + * @len: the length written into the buffer + * + * If the driver wrote data into the buffer, @len will be set to the + * amount written. This means you don't need to clear the buffer + * beforehand to ensure there's no data leakage in the case of short + * writes. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + * + * Returns NULL if there are no used buffers, or the "data" token + * handed to virtqueue_add_buf(). + */ +void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + void *ret; + unsigned int i; + u16 last_used; + + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + if (!more_used(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + + /* Only get used array entries after they have been exposed by host. */ + virtio_rmb(vq); + + last_used = (vq->last_used_idx & (vq->vring.num - 1)); + i = vq->vring.used->ring[last_used].id; + *len = vq->vring.used->ring[last_used].len; + + if (unlikely(i >= vq->vring.num)) { + BAD_RING(vq, "id %u out of range\n", i); + return NULL; + } + if (unlikely(!vq->data[i])) { + BAD_RING(vq, "id %u is not a head!\n", i); + return NULL; + } + + /* detach_buf clears data, so grab it now. */ + ret = vq->data[i]; + detach_buf(vq, i); + vq->last_used_idx++; + /* If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. */ + if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + vring_used_event(&vq->vring) = vq->last_used_idx; + virtio_mb(vq); + } + +#ifdef DEBUG + vq->last_add_time_valid = false; +#endif + + END_USE(vq); + return ret; +} +EXPORT_SYMBOL_GPL(virtqueue_get_buf); + +/** + * virtqueue_disable_cb - disable callbacks + * @vq: the struct virtqueue we're talking about. + * + * Note that this is not necessarily synchronous, hence unreliable and only + * useful as an optimization. + * + * Unlike other operations, this need not be serialized. + */ +void virtqueue_disable_cb(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; +} +EXPORT_SYMBOL_GPL(virtqueue_disable_cb); + +/** + * virtqueue_enable_cb - restart callbacks after disable_cb. + * @vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks; it returns "false" if there are pending + * buffers in the queue, to detect a possible race between the driver + * checking for more work, and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +bool virtqueue_enable_cb(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + START_USE(vq); + + /* We optimistically turn back on interrupts, then check if there was + * more to do. */ + /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to + * either clear the flags bit or point the event index at the next + * entry. Always do both to keep code simple. */ + vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; + vring_used_event(&vq->vring) = vq->last_used_idx; + virtio_mb(vq); + if (unlikely(more_used(vq))) { + END_USE(vq); + return false; + } + + END_USE(vq); + return true; +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb); + +/** + * virtqueue_enable_cb_delayed - restart callbacks after disable_cb. + * @vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks but hints to the other side to delay + * interrupts until most of the available buffers have been processed; + * it returns "false" if there are many pending buffers in the queue, + * to detect a possible race between the driver checking for more work, + * and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 bufs; + + START_USE(vq); + + /* We optimistically turn back on interrupts, then check if there was + * more to do. */ + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to + * either clear the flags bit or point the event index at the next + * entry. Always do both to keep code simple. */ + vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; + /* TODO: tune this threshold */ + bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; + vring_used_event(&vq->vring) = vq->last_used_idx + bufs; + virtio_mb(vq); + if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { + END_USE(vq); + return false; + } + + END_USE(vq); + return true; +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); + +/** + * virtqueue_detach_unused_buf - detach first unused buffer + * @vq: the struct virtqueue we're talking about. + * + * Returns NULL or the "data" token handed to virtqueue_add_buf(). + * This is not valid on an active queue; it is useful only for device + * shutdown. + */ +void *virtqueue_detach_unused_buf(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i; + void *buf; + + START_USE(vq); + + for (i = 0; i < vq->vring.num; i++) { + if (!vq->data[i]) + continue; + /* detach_buf clears data, so grab it now. */ + buf = vq->data[i]; + detach_buf(vq, i); + vq->vring.avail->idx--; + END_USE(vq); + return buf; + } + /* That should have freed everything. */ + BUG_ON(vq->num_free != vq->vring.num); + + END_USE(vq); + return NULL; +} +EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf); + +irqreturn_t vring_interrupt(int irq, void *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!more_used(vq)) { + pr_debug("virtqueue interrupt with no work for %p\n", vq); + return IRQ_NONE; + } + + if (unlikely(vq->broken)) + return IRQ_HANDLED; + + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); + if (vq->vq.callback) + vq->vq.callback(&vq->vq); + + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(vring_interrupt); + +struct virtqueue *vring_new_virtqueue(unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + void *pages, + void (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) +{ + struct vring_virtqueue *vq; + unsigned int i; + + /* We assume num is a power of 2. */ + if (num & (num - 1)) { + dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); + return NULL; + } + + vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL); + if (!vq) + return NULL; + + vring_init(&vq->vring, num, pages, vring_align); + vq->vq.callback = callback; + vq->vq.vdev = vdev; + vq->vq.name = name; + vq->notify = notify; + vq->weak_barriers = weak_barriers; + vq->broken = false; + vq->last_used_idx = 0; + vq->num_added = 0; + list_add_tail(&vq->vq.list, &vdev->vqs); +#ifdef DEBUG + vq->in_use = false; + vq->last_add_time_valid = false; +#endif + + vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); + vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + + /* No callback? Tell other side not to bother us. */ + if (!callback) + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + + /* Put everything in free lists. */ + vq->num_free = num; + vq->free_head = 0; + for (i = 0; i < num-1; i++) { + vq->vring.desc[i].next = i+1; + vq->data[i] = NULL; + } + vq->data[i] = NULL; + + return &vq->vq; +} +EXPORT_SYMBOL_GPL(vring_new_virtqueue); + +void vring_del_virtqueue(struct virtqueue *vq) +{ + list_del(&vq->list); + kfree(to_vvq(vq)); +} +EXPORT_SYMBOL_GPL(vring_del_virtqueue); + +/* Manipulates transport-specific feature bits. */ +void vring_transport_features(struct virtio_device *vdev) +{ + unsigned int i; + + for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) { + switch (i) { + case VIRTIO_RING_F_INDIRECT_DESC: + break; + case VIRTIO_RING_F_EVENT_IDX: + break; + default: + /* We don't understand this bit. */ + clear_bit(i, vdev->features); + } + } +} +EXPORT_SYMBOL_GPL(vring_transport_features); + +/** + * virtqueue_get_vring_size - return the size of the virtqueue's vring + * @vq: the struct virtqueue containing the vring of interest. + * + * Returns the size of the vring. This is mainly used for boasting to + * userspace. Unlike other operations, this need not be serialized. + */ +unsigned int virtqueue_get_vring_size(struct virtqueue *_vq) +{ + + struct vring_virtqueue *vq = to_vvq(_vq); + + return vq->vring.num; +} +EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); + +MODULE_LICENSE("GPL"); |