diff options
Diffstat (limited to 'fs/fuse')
-rw-r--r-- | fs/fuse/Kconfig | 15 | ||||
-rw-r--r-- | fs/fuse/Makefile | 8 | ||||
-rw-r--r-- | fs/fuse/control.c | 359 | ||||
-rw-r--r-- | fs/fuse/cuse.c | 621 | ||||
-rw-r--r-- | fs/fuse/dev.c | 2104 | ||||
-rw-r--r-- | fs/fuse/dir.c | 1697 | ||||
-rw-r--r-- | fs/fuse/file.c | 2224 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 784 | ||||
-rw-r--r-- | fs/fuse/inode.c | 1267 |
9 files changed, 9079 insertions, 0 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig new file mode 100644 index 00000000..0cf160a9 --- /dev/null +++ b/fs/fuse/Kconfig @@ -0,0 +1,15 @@ +config FUSE_FS + tristate "FUSE (Filesystem in Userspace) support" + help + With FUSE it is possible to implement a fully functional filesystem + in a userspace program. + + There's also companion library: libfuse. This library along with + utilities is available from the FUSE homepage: + <http://fuse.sourceforge.net/> + + See <file:Documentation/filesystems/fuse.txt> for more information. + See <file:Documentation/Changes> for needed library/utility version. + + If you want to develop a userspace FS, or if you want to use + a filesystem based on FUSE, answer Y or M. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile new file mode 100644 index 00000000..e95eeb44 --- /dev/null +++ b/fs/fuse/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the FUSE filesystem. +# + +obj-$(CONFIG_FUSE_FS) += fuse.o +obj-$(CONFIG_CUSE) += cuse.o + +fuse-objs := dev.o dir.o file.o inode.o control.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c new file mode 100644 index 00000000..42593c58 --- /dev/null +++ b/fs/fuse/control.c @@ -0,0 +1,359 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include <linux/init.h> +#include <linux/module.h> + +#define FUSE_CTL_SUPER_MAGIC 0x65735543 + +/* + * This is non-NULL when the single instance of the control filesystem + * exists. Protected by fuse_mutex + */ +static struct super_block *fuse_control_sb; + +static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) +{ + struct fuse_conn *fc; + mutex_lock(&fuse_mutex); + fc = file->f_path.dentry->d_inode->i_private; + if (fc) + fc = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + return fc; +} + +static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fuse_abort_conn(fc); + fuse_conn_put(fc); + } + return count; +} + +static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[32]; + size_t size; + + if (!*ppos) { + long value; + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + value = atomic_read(&fc->num_waiting); + file->private_data = (void *)value; + fuse_conn_put(fc); + } + size = sprintf(tmp, "%ld\n", (long)file->private_data); + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos, unsigned val) +{ + char tmp[32]; + size_t size = sprintf(tmp, "%u\n", val); + + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, unsigned *val, + unsigned global_limit) +{ + unsigned long t; + char tmp[32]; + unsigned limit = (1 << 16) - 1; + int err; + + if (*ppos || count >= sizeof(tmp) - 1) + return -EINVAL; + + if (copy_from_user(tmp, buf, count)) + return -EINVAL; + + tmp[count] = '\0'; + + err = strict_strtoul(tmp, 0, &t); + if (err) + return err; + + if (!capable(CAP_SYS_ADMIN)) + limit = min(limit, global_limit); + + if (t > limit) + return -EINVAL; + + *val = t; + + return count; +} + +static ssize_t fuse_conn_max_background_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->max_background; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_max_background_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_bgreq); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->max_background = val; + fuse_conn_put(fc); + } + } + + return ret; +} + +static ssize_t fuse_conn_congestion_threshold_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->congestion_threshold; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_congestion_threshold_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_congthresh); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->congestion_threshold = val; + fuse_conn_put(fc); + } + } + + return ret; +} + +static const struct file_operations fuse_ctl_abort_ops = { + .open = nonseekable_open, + .write = fuse_conn_abort_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_ctl_waiting_ops = { + .open = nonseekable_open, + .read = fuse_conn_waiting_read, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_max_background_ops = { + .open = nonseekable_open, + .read = fuse_conn_max_background_read, + .write = fuse_conn_max_background_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_congestion_threshold_ops = { + .open = nonseekable_open, + .read = fuse_conn_congestion_threshold_read, + .write = fuse_conn_congestion_threshold_write, + .llseek = no_llseek, +}; + +static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, + struct fuse_conn *fc, + const char *name, + int mode, int nlink, + const struct inode_operations *iop, + const struct file_operations *fop) +{ + struct dentry *dentry; + struct inode *inode; + + BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + inode = new_inode(fuse_control_sb); + if (!inode) + return NULL; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = fc->user_id; + inode->i_gid = fc->group_id; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* setting ->i_op to NULL is not allowed */ + if (iop) + inode->i_op = iop; + inode->i_fop = fop; + set_nlink(inode, nlink); + inode->i_private = fc; + d_add(dentry, inode); + return dentry; +} + +/* + * Add a connection to the control filesystem (if it exists). Caller + * must hold fuse_mutex + */ +int fuse_ctl_add_conn(struct fuse_conn *fc) +{ + struct dentry *parent; + char name[32]; + + if (!fuse_control_sb) + return 0; + + parent = fuse_control_sb->s_root; + inc_nlink(parent->d_inode); + sprintf(name, "%u", fc->dev); + parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, + &simple_dir_inode_operations, + &simple_dir_operations); + if (!parent) + goto err; + + if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, + NULL, &fuse_ctl_waiting_ops) || + !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, + NULL, &fuse_ctl_abort_ops) || + !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, + 1, NULL, &fuse_conn_max_background_ops) || + !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", + S_IFREG | 0600, 1, NULL, + &fuse_conn_congestion_threshold_ops)) + goto err; + + return 0; + + err: + fuse_ctl_remove_conn(fc); + return -ENOMEM; +} + +/* + * Remove a connection from the control filesystem (if it exists). + * Caller must hold fuse_mutex + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc) +{ + int i; + + if (!fuse_control_sb) + return; + + for (i = fc->ctl_ndents - 1; i >= 0; i--) { + struct dentry *dentry = fc->ctl_dentry[i]; + dentry->d_inode->i_private = NULL; + d_drop(dentry); + dput(dentry); + } + drop_nlink(fuse_control_sb->s_root->d_inode); +} + +static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct tree_descr empty_descr = {""}; + struct fuse_conn *fc; + int err; + + err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr); + if (err) + return err; + + mutex_lock(&fuse_mutex); + BUG_ON(fuse_control_sb); + fuse_control_sb = sb; + list_for_each_entry(fc, &fuse_conn_list, entry) { + err = fuse_ctl_add_conn(fc); + if (err) { + fuse_control_sb = NULL; + mutex_unlock(&fuse_mutex); + return err; + } + } + mutex_unlock(&fuse_mutex); + + return 0; +} + +static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super); +} + +static void fuse_ctl_kill_sb(struct super_block *sb) +{ + struct fuse_conn *fc; + + mutex_lock(&fuse_mutex); + fuse_control_sb = NULL; + list_for_each_entry(fc, &fuse_conn_list, entry) + fc->ctl_ndents = 0; + mutex_unlock(&fuse_mutex); + + kill_litter_super(sb); +} + +static struct file_system_type fuse_ctl_fs_type = { + .owner = THIS_MODULE, + .name = "fusectl", + .mount = fuse_ctl_mount, + .kill_sb = fuse_ctl_kill_sb, +}; + +int __init fuse_ctl_init(void) +{ + return register_filesystem(&fuse_ctl_fs_type); +} + +void fuse_ctl_cleanup(void) +{ + unregister_filesystem(&fuse_ctl_fs_type); +} diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c new file mode 100644 index 00000000..3426521f --- /dev/null +++ b/fs/fuse/cuse.c @@ -0,0 +1,621 @@ +/* + * CUSE: Character device in Userspace + * + * Copyright (C) 2008-2009 SUSE Linux Products GmbH + * Copyright (C) 2008-2009 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + * + * CUSE enables character devices to be implemented from userland much + * like FUSE allows filesystems. On initialization /dev/cuse is + * created. By opening the file and replying to the CUSE_INIT request + * userland CUSE server can create a character device. After that the + * operation is very similar to FUSE. + * + * A CUSE instance involves the following objects. + * + * cuse_conn : contains fuse_conn and serves as bonding structure + * channel : file handle connected to the userland CUSE server + * cdev : the implemented character device + * dev : generic device for cdev + * + * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with + * devices, it's called 'channel' to reduce confusion. + * + * channel determines when the character device dies. When channel is + * closed, everything begins to destruct. The cuse_conn is taken off + * the lookup table preventing further access from cdev, cdev and + * generic device are removed and the base reference of cuse_conn is + * put. + * + * On each open, the matching cuse_conn is looked up and if found an + * additional reference is taken which is released when the file is + * closed. + */ + +#include <linux/fuse.h> +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/kdev_t.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/magic.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stat.h> +#include <linux/module.h> + +#include "fuse_i.h" + +#define CUSE_CONNTBL_LEN 64 + +struct cuse_conn { + struct list_head list; /* linked on cuse_conntbl */ + struct fuse_conn fc; /* fuse connection */ + struct cdev *cdev; /* associated character device */ + struct device *dev; /* device representing @cdev */ + + /* init parameters, set once during initialization */ + bool unrestricted_ioctl; +}; + +static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; +static struct class *cuse_class; + +static struct cuse_conn *fc_to_cc(struct fuse_conn *fc) +{ + return container_of(fc, struct cuse_conn, fc); +} + +static struct list_head *cuse_conntbl_head(dev_t devt) +{ + return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN]; +} + + +/************************************************************************** + * CUSE frontend operations + * + * These are file operations for the character device. + * + * On open, CUSE opens a file from the FUSE mnt and stores it to + * private_data of the open file. All other ops call FUSE ops on the + * FUSE file. + */ + +static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + loff_t pos = 0; + + return fuse_direct_io(file, buf, count, &pos, 0); +} + +static ssize_t cuse_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos = 0; + /* + * No locking or generic_write_checks(), the server is + * responsible for locking and sanity checks. + */ + return fuse_direct_io(file, buf, count, &pos, 1); +} + +static int cuse_open(struct inode *inode, struct file *file) +{ + dev_t devt = inode->i_cdev->dev; + struct cuse_conn *cc = NULL, *pos; + int rc; + + /* look up and get the connection */ + spin_lock(&cuse_lock); + list_for_each_entry(pos, cuse_conntbl_head(devt), list) + if (pos->dev->devt == devt) { + fuse_conn_get(&pos->fc); + cc = pos; + break; + } + spin_unlock(&cuse_lock); + + /* dead? */ + if (!cc) + return -ENODEV; + + /* + * Generic permission check is already done against the chrdev + * file, proceed to open. + */ + rc = fuse_do_open(&cc->fc, 0, file, 0); + if (rc) + fuse_conn_put(&cc->fc); + return rc; +} + +static int cuse_release(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_sync_release(ff, file->f_flags); + fuse_conn_put(fc); + + return 0; +} + +static long cuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = 0; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fc); + unsigned int flags = FUSE_IOCTL_COMPAT; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static const struct file_operations cuse_frontend_fops = { + .owner = THIS_MODULE, + .read = cuse_read, + .write = cuse_write, + .open = cuse_open, + .release = cuse_release, + .unlocked_ioctl = cuse_file_ioctl, + .compat_ioctl = cuse_file_compat_ioctl, + .poll = fuse_file_poll, + .llseek = noop_llseek, +}; + + +/************************************************************************** + * CUSE channel initialization and destruction + */ + +struct cuse_devinfo { + const char *name; +}; + +/** + * cuse_parse_one - parse one key=value pair + * @pp: i/o parameter for the current position + * @end: points to one past the end of the packed string + * @keyp: out parameter for key + * @valp: out parameter for value + * + * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends + * at @end - 1. This function parses one pair and set *@keyp to the + * start of the key and *@valp to the start of the value. Note that + * the original string is modified such that the key string is + * terminated with '\0'. *@pp is updated to point to the next string. + * + * RETURNS: + * 1 on successful parse, 0 on EOF, -errno on failure. + */ +static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) +{ + char *p = *pp; + char *key, *val; + + while (p < end && *p == '\0') + p++; + if (p == end) + return 0; + + if (end[-1] != '\0') { + printk(KERN_ERR "CUSE: info not properly terminated\n"); + return -EINVAL; + } + + key = val = p; + p += strlen(p); + + if (valp) { + strsep(&val, "="); + if (!val) + val = key + strlen(key); + key = strstrip(key); + val = strstrip(val); + } else + key = strstrip(key); + + if (!strlen(key)) { + printk(KERN_ERR "CUSE: zero length info key specified\n"); + return -EINVAL; + } + + *pp = p; + *keyp = key; + if (valp) + *valp = val; + + return 1; +} + +/** + * cuse_parse_dev_info - parse device info + * @p: device info string + * @len: length of device info string + * @devinfo: out parameter for parsed device info + * + * Parse @p to extract device info and store it into @devinfo. String + * pointed to by @p is modified by parsing and @devinfo points into + * them, so @p shouldn't be freed while @devinfo is in use. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) +{ + char *end = p + len; + char *key, *val; + int rc; + + while (true) { + rc = cuse_parse_one(&p, end, &key, &val); + if (rc < 0) + return rc; + if (!rc) + break; + if (strcmp(key, "DEVNAME") == 0) + devinfo->name = val; + else + printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n", + key); + } + + if (!devinfo->name || !strlen(devinfo->name)) { + printk(KERN_ERR "CUSE: DEVNAME unspecified\n"); + return -EINVAL; + } + + return 0; +} + +static void cuse_gendev_release(struct device *dev) +{ + kfree(dev); +} + +/** + * cuse_process_init_reply - finish initializing CUSE channel + * + * This function creates the character device and sets up all the + * required data structures for it. Please read the comment at the + * top of this file for high level overview. + */ +static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_init_out *arg = req->out.args[0].value; + struct page *page = req->pages[0]; + struct cuse_devinfo devinfo = { }; + struct device *dev; + struct cdev *cdev; + dev_t devt; + int rc; + + if (req->out.h.error || + arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + goto err; + } + + fc->minor = arg->minor; + fc->max_read = max_t(unsigned, arg->max_read, 4096); + fc->max_write = max_t(unsigned, arg->max_write, 4096); + + /* parse init reply */ + cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; + + rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + &devinfo); + if (rc) + goto err; + + /* determine and reserve devt */ + devt = MKDEV(arg->dev_major, arg->dev_minor); + if (!MAJOR(devt)) + rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name); + else + rc = register_chrdev_region(devt, 1, devinfo.name); + if (rc) { + printk(KERN_ERR "CUSE: failed to register chrdev region\n"); + goto err; + } + + /* devt determined, create device */ + rc = -ENOMEM; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + goto err_region; + + device_initialize(dev); + dev_set_uevent_suppress(dev, 1); + dev->class = cuse_class; + dev->devt = devt; + dev->release = cuse_gendev_release; + dev_set_drvdata(dev, cc); + dev_set_name(dev, "%s", devinfo.name); + + rc = device_add(dev); + if (rc) + goto err_device; + + /* register cdev */ + rc = -ENOMEM; + cdev = cdev_alloc(); + if (!cdev) + goto err_device; + + cdev->owner = THIS_MODULE; + cdev->ops = &cuse_frontend_fops; + + rc = cdev_add(cdev, devt, 1); + if (rc) + goto err_cdev; + + cc->dev = dev; + cc->cdev = cdev; + + /* make the device available */ + spin_lock(&cuse_lock); + list_add(&cc->list, cuse_conntbl_head(devt)); + spin_unlock(&cuse_lock); + + /* announce device availability */ + dev_set_uevent_suppress(dev, 0); + kobject_uevent(&dev->kobj, KOBJ_ADD); +out: + kfree(arg); + __free_page(page); + return; + +err_cdev: + cdev_del(cdev); +err_device: + put_device(dev); +err_region: + unregister_chrdev_region(devt, 1); +err: + fc->conn_error = 1; + goto out; +} + +static int cuse_send_init(struct cuse_conn *cc) +{ + int rc; + struct fuse_req *req; + struct page *page; + struct fuse_conn *fc = &cc->fc; + struct cuse_init_in *arg; + void *outarg; + + BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + goto err; + } + + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto err_put_req; + + outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); + if (!outarg) + goto err_free_page; + + arg = &req->misc.cuse_init_in; + arg->major = FUSE_KERNEL_VERSION; + arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->flags |= CUSE_UNRESTRICTED_IOCTL; + req->in.h.opcode = CUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct cuse_init_in); + req->in.args[0].value = arg; + req->out.numargs = 2; + req->out.args[0].size = sizeof(struct cuse_init_out); + req->out.args[0].value = outarg; + req->out.args[1].size = CUSE_INIT_INFO_MAX; + req->out.argvar = 1; + req->out.argpages = 1; + req->pages[0] = page; + req->num_pages = 1; + req->end = cuse_process_init_reply; + fuse_request_send_background(fc, req); + + return 0; + +err_free_page: + __free_page(page); +err_put_req: + fuse_put_request(fc, req); +err: + return rc; +} + +static void cuse_fc_release(struct fuse_conn *fc) +{ + struct cuse_conn *cc = fc_to_cc(fc); + kfree(cc); +} + +/** + * cuse_channel_open - open method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being opened + * + * Userland CUSE server can create a CUSE device by opening /dev/cuse + * and replying to the initialization request kernel sends. This + * function is responsible for handling CUSE device initialization. + * Because the fd opened by this function is used during + * initialization, this function only creates cuse_conn and sends + * init. The rest is delegated to a kthread. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_open(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc; + int rc; + + /* set up cuse_conn */ + cc = kzalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + + fuse_conn_init(&cc->fc); + + INIT_LIST_HEAD(&cc->list); + cc->fc.release = cuse_fc_release; + + cc->fc.connected = 1; + cc->fc.blocked = 0; + rc = cuse_send_init(cc); + if (rc) { + fuse_conn_put(&cc->fc); + return rc; + } + file->private_data = &cc->fc; /* channel owns base reference to cc */ + + return 0; +} + +/** + * cuse_channel_release - release method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being closed + * + * Disconnect the channel, deregister CUSE device and initiate + * destruction by putting the default reference. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_release(struct inode *inode, struct file *file) +{ + struct cuse_conn *cc = fc_to_cc(file->private_data); + int rc; + + /* remove from the conntbl, no more access from this point on */ + spin_lock(&cuse_lock); + list_del_init(&cc->list); + spin_unlock(&cuse_lock); + + /* remove device */ + if (cc->dev) + device_unregister(cc->dev); + if (cc->cdev) { + unregister_chrdev_region(cc->cdev->dev, 1); + cdev_del(cc->cdev); + } + + /* kill connection and shutdown channel */ + fuse_conn_kill(&cc->fc); + rc = fuse_dev_release(inode, file); /* puts the base reference */ + + return rc; +} + +static struct file_operations cuse_channel_fops; /* initialized during init */ + + +/************************************************************************** + * Misc stuff and module initializatiion + * + * CUSE exports the same set of attributes to sysfs as fusectl. + */ + +static ssize_t cuse_class_waiting_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); +} + +static ssize_t cuse_class_abort_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + fuse_abort_conn(&cc->fc); + return count; +} + +static struct device_attribute cuse_class_dev_attrs[] = { + __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), + __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), + { } +}; + +static struct miscdevice cuse_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cuse", + .fops = &cuse_channel_fops, +}; + +static int __init cuse_init(void) +{ + int i, rc; + + /* init conntbl */ + for (i = 0; i < CUSE_CONNTBL_LEN; i++) + INIT_LIST_HEAD(&cuse_conntbl[i]); + + /* inherit and extend fuse_dev_operations */ + cuse_channel_fops = fuse_dev_operations; + cuse_channel_fops.owner = THIS_MODULE; + cuse_channel_fops.open = cuse_channel_open; + cuse_channel_fops.release = cuse_channel_release; + + cuse_class = class_create(THIS_MODULE, "cuse"); + if (IS_ERR(cuse_class)) + return PTR_ERR(cuse_class); + + cuse_class->dev_attrs = cuse_class_dev_attrs; + + rc = misc_register(&cuse_miscdev); + if (rc) { + class_destroy(cuse_class); + return rc; + } + + return 0; +} + +static void __exit cuse_exit(void) +{ + misc_deregister(&cuse_miscdev); + class_destroy(cuse_class); +} + +module_init(cuse_init); +module_exit(cuse_exit); + +MODULE_AUTHOR("Tejun Heo <tj@kernel.org>"); +MODULE_DESCRIPTION("Character device in Userspace"); +MODULE_LICENSE("GPL"); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c new file mode 100644 index 00000000..ceb3b29c --- /dev/null +++ b/fs/fuse/dev.c @@ -0,0 +1,2104 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/poll.h> +#include <linux/uio.h> +#include <linux/miscdevice.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/pipe_fs_i.h> +#include <linux/swap.h> +#include <linux/splice.h> +#include <linux/freezer.h> + +MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); + +static struct kmem_cache *fuse_req_cachep; + +static struct fuse_conn *fuse_get_conn(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return file->private_data; +} + +static void fuse_request_init(struct fuse_req *req) +{ + memset(req, 0, sizeof(*req)); + INIT_LIST_HEAD(&req->list); + INIT_LIST_HEAD(&req->intr_entry); + init_waitqueue_head(&req->waitq); + atomic_set(&req->count, 1); +} + +struct fuse_req *fuse_request_alloc(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); + if (req) + fuse_request_init(req); + return req; +} +EXPORT_SYMBOL_GPL(fuse_request_alloc); + +struct fuse_req *fuse_request_alloc_nofs(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); + if (req) + fuse_request_init(req); + return req; +} + +void fuse_request_free(struct fuse_req *req) +{ + kmem_cache_free(fuse_req_cachep, req); +} + +static void block_sigs(sigset_t *oldset) +{ + sigset_t mask; + + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} + +static void __fuse_get_request(struct fuse_req *req) +{ + atomic_inc(&req->count); +} + +/* Must be called with > 1 refcount */ +static void __fuse_put_request(struct fuse_req *req) +{ + BUG_ON(atomic_read(&req->count) < 2); + atomic_dec(&req->count); +} + +static void fuse_req_init_context(struct fuse_req *req) +{ + req->in.h.uid = current_fsuid(); + req->in.h.gid = current_fsgid(); + req->in.h.pid = current->pid; +} + +struct fuse_req *fuse_get_req(struct fuse_conn *fc) +{ + struct fuse_req *req; + sigset_t oldset; + int intr; + int err; + + atomic_inc(&fc->num_waiting); + block_sigs(&oldset); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + restore_sigs(&oldset); + err = -EINTR; + if (intr) + goto out; + + err = -ENOTCONN; + if (!fc->connected) + goto out; + + req = fuse_request_alloc(); + err = -ENOMEM; + if (!req) + goto out; + + fuse_req_init_context(req); + req->waiting = 1; + return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(fuse_get_req); + +/* + * Return request in fuse_file->reserved_req. However that may + * currently be in use. If that is the case, wait for it to become + * available. + */ +static struct fuse_req *get_reserved_req(struct fuse_conn *fc, + struct file *file) +{ + struct fuse_req *req = NULL; + struct fuse_file *ff = file->private_data; + + do { + wait_event(fc->reserved_req_waitq, ff->reserved_req); + spin_lock(&fc->lock); + if (ff->reserved_req) { + req = ff->reserved_req; + ff->reserved_req = NULL; + get_file(file); + req->stolen_file = file; + } + spin_unlock(&fc->lock); + } while (!req); + + return req; +} + +/* + * Put stolen request back into fuse_file->reserved_req + */ +static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) +{ + struct file *file = req->stolen_file; + struct fuse_file *ff = file->private_data; + + spin_lock(&fc->lock); + fuse_request_init(req); + BUG_ON(ff->reserved_req); + ff->reserved_req = req; + wake_up_all(&fc->reserved_req_waitq); + spin_unlock(&fc->lock); + fput(file); +} + +/* + * Gets a requests for a file operation, always succeeds + * + * This is used for sending the FLUSH request, which must get to + * userspace, due to POSIX locks which may need to be unlocked. + * + * If allocation fails due to OOM, use the reserved request in + * fuse_file. + * + * This is very unlikely to deadlock accidentally, since the + * filesystem should not have it's own file open. If deadlock is + * intentional, it can still be broken by "aborting" the filesystem. + */ +struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +{ + struct fuse_req *req; + + atomic_inc(&fc->num_waiting); + wait_event(fc->blocked_waitq, !fc->blocked); + req = fuse_request_alloc(); + if (!req) + req = get_reserved_req(fc, file); + + fuse_req_init_context(req); + req->waiting = 1; + return req; +} + +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (atomic_dec_and_test(&req->count)) { + if (req->waiting) + atomic_dec(&fc->num_waiting); + + if (req->stolen_file) + put_reserved_req(fc, req); + else + fuse_request_free(req); + } +} +EXPORT_SYMBOL_GPL(fuse_put_request); + +static unsigned len_args(unsigned numargs, struct fuse_arg *args) +{ + unsigned nbytes = 0; + unsigned i; + + for (i = 0; i < numargs; i++) + nbytes += args[i].size; + + return nbytes; +} + +static u64 fuse_get_unique(struct fuse_conn *fc) +{ + fc->reqctr++; + /* zero is special */ + if (fc->reqctr == 0) + fc->reqctr = 1; + + return fc->reqctr; +} + +static void queue_request(struct fuse_conn *fc, struct fuse_req *req) +{ + req->in.h.len = sizeof(struct fuse_in_header) + + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); + list_add_tail(&req->list, &fc->pending); + req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup) +{ + forget->forget_one.nodeid = nodeid; + forget->forget_one.nlookup = nlookup; + + spin_lock(&fc->lock); + if (fc->connected) { + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } else { + kfree(forget); + } + spin_unlock(&fc->lock); +} + +static void flush_bg_queue(struct fuse_conn *fc) +{ + while (fc->active_background < fc->max_background && + !list_empty(&fc->bg_queue)) { + struct fuse_req *req; + + req = list_entry(fc->bg_queue.next, struct fuse_req, list); + list_del(&req->list); + fc->active_background++; + req->in.h.unique = fuse_get_unique(fc); + queue_request(fc, req); + } +} + +/* + * This function is called when a request is finished. Either a reply + * has arrived or it was aborted (and not yet sent) or some error + * occurred during communication with userspace, or the device file + * was closed. The requester thread is woken up (if still waiting), + * the 'end' callback is called if given, else the reference to the + * request is released + * + * Called with fc->lock, unlocks it + */ +static void request_end(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +{ + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + req->end = NULL; + list_del(&req->list); + list_del(&req->intr_entry); + req->state = FUSE_REQ_FINISHED; + if (req->background) { + if (fc->num_background == fc->max_background) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + if (fc->num_background == fc->congestion_threshold && + fc->connected && fc->bdi_initialized) { + clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); + clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + } + fc->num_background--; + fc->active_background--; + flush_bg_queue(fc); + } + spin_unlock(&fc->lock); + wake_up(&req->waitq); + if (end) + end(fc, req); + fuse_put_request(fc, req); +} + +static void wait_answer_interruptible(struct fuse_conn *fc, + struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + if (signal_pending(current)) + return; + + spin_unlock(&fc->lock); + wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); +} + +static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +{ + list_add_tail(&req->intr_entry, &fc->interrupts); + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + if (!fc->no_interrupt) { + /* Any signal may interrupt this */ + wait_answer_interruptible(fc, req); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + req->interrupted = 1; + if (req->state == FUSE_REQ_SENT) + queue_interrupt(fc, req); + } + + if (!req->force) { + sigset_t oldset; + + /* Only fatal signals may interrupt this */ + block_sigs(&oldset); + wait_answer_interruptible(fc, req); + restore_sigs(&oldset); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + /* Request is not yet in userspace, bail out */ + if (req->state == FUSE_REQ_PENDING) { + list_del(&req->list); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return; + } + } + + /* + * Either request is already in userspace, or it was forced. + * Wait it out. + */ + spin_unlock(&fc->lock); + + while (req->state != FUSE_REQ_FINISHED) + wait_event_freezable(req->waitq, + req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); + + if (!req->aborted) + return; + + aborted: + BUG_ON(req->state != FUSE_REQ_FINISHED); + if (req->locked) { + /* This is uninterruptible sleep, because data is + being copied to/from the buffers of req. During + locked state, there mustn't be any filesystem + operation (e.g. page fault), since that could lead + to deadlock */ + spin_unlock(&fc->lock); + wait_event(req->waitq, !req->locked); + spin_lock(&fc->lock); + } +} + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + spin_lock(&fc->lock); + if (!fc->connected) + req->out.h.error = -ENOTCONN; + else if (fc->conn_error) + req->out.h.error = -ECONNREFUSED; + else { + req->in.h.unique = fuse_get_unique(fc); + queue_request(fc, req); + /* acquire extra reference, since request is still needed + after request_end() */ + __fuse_get_request(req); + + request_wait_answer(fc, req); + } + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_request_send); + +static void fuse_request_send_nowait_locked(struct fuse_conn *fc, + struct fuse_req *req) +{ + req->background = 1; + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + if (fc->num_background == fc->congestion_threshold && + fc->bdi_initialized) { + set_bdi_congested(&fc->bdi, BLK_RW_SYNC); + set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + } + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); +} + +static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +{ + spin_lock(&fc->lock); + if (fc->connected) { + fuse_request_send_nowait_locked(fc, req); + spin_unlock(&fc->lock); + } else { + req->out.h.error = -ENOTCONN; + request_end(fc, req); + } +} + +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + fuse_request_send_nowait(fc, req); +} +EXPORT_SYMBOL_GPL(fuse_request_send_background); + +static int fuse_request_send_notify_reply(struct fuse_conn *fc, + struct fuse_req *req, u64 unique) +{ + int err = -ENODEV; + + req->isreply = 0; + req->in.h.unique = unique; + spin_lock(&fc->lock); + if (fc->connected) { + queue_request(fc, req); + err = 0; + } + spin_unlock(&fc->lock); + + return err; +} + +/* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) +{ + req->isreply = 1; + fuse_request_send_nowait_locked(fc, req); +} + +/* + * Lock the request. Up to the next unlock_request() there mustn't be + * anything that could cause a page-fault. If the request was already + * aborted bail out. + */ +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) +{ + int err = 0; + if (req) { + spin_lock(&fc->lock); + if (req->aborted) + err = -ENOENT; + else + req->locked = 1; + spin_unlock(&fc->lock); + } + return err; +} + +/* + * Unlock request. If it was aborted during being locked, the + * requester thread is currently waiting for it to be unlocked, so + * wake it up. + */ +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (req) { + spin_lock(&fc->lock); + req->locked = 0; + if (req->aborted) + wake_up(&req->waitq); + spin_unlock(&fc->lock); + } +} + +struct fuse_copy_state { + struct fuse_conn *fc; + int write; + struct fuse_req *req; + const struct iovec *iov; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + unsigned long seglen; + unsigned long addr; + struct page *pg; + void *mapaddr; + void *buf; + unsigned len; + unsigned move_pages:1; +}; + +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, + const struct iovec *iov, unsigned long nr_segs) +{ + memset(cs, 0, sizeof(*cs)); + cs->fc = fc; + cs->write = write; + cs->iov = iov; + cs->nr_segs = nr_segs; +} + +/* Unmap and put previous page of userspace buffer */ +static void fuse_copy_finish(struct fuse_copy_state *cs) +{ + if (cs->currbuf) { + struct pipe_buffer *buf = cs->currbuf; + + if (!cs->write) { + buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + } else { + kunmap(buf->page); + buf->len = PAGE_SIZE - cs->len; + } + cs->currbuf = NULL; + cs->mapaddr = NULL; + } else if (cs->mapaddr) { + kunmap(cs->pg); + if (cs->write) { + flush_dcache_page(cs->pg); + set_page_dirty_lock(cs->pg); + } + put_page(cs->pg); + cs->mapaddr = NULL; + } +} + +/* + * Get another pagefull of userspace buffer, and map it to kernel + * address space, and lock request + */ +static int fuse_copy_fill(struct fuse_copy_state *cs) +{ + unsigned long offset; + int err; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + if (cs->pipebufs) { + struct pipe_buffer *buf = cs->pipebufs; + + if (!cs->write) { + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); + cs->len = buf->len; + cs->buf = cs->mapaddr + buf->offset; + cs->pipebufs++; + cs->nr_segs--; + } else { + struct page *page; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + buf->page = page; + buf->offset = 0; + buf->len = 0; + + cs->currbuf = buf; + cs->mapaddr = kmap(page); + cs->buf = cs->mapaddr; + cs->len = PAGE_SIZE; + cs->pipebufs++; + cs->nr_segs++; + } + } else { + if (!cs->seglen) { + BUG_ON(!cs->nr_segs); + cs->seglen = cs->iov[0].iov_len; + cs->addr = (unsigned long) cs->iov[0].iov_base; + cs->iov++; + cs->nr_segs--; + } + err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + if (err < 0) + return err; + BUG_ON(err != 1); + offset = cs->addr % PAGE_SIZE; + cs->mapaddr = kmap(cs->pg); + cs->buf = cs->mapaddr + offset; + cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->seglen -= cs->len; + cs->addr += cs->len; + } + + return lock_request(cs->fc, cs->req); +} + +/* Do as much copy to/from userspace buffer as we can */ +static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) +{ + unsigned ncpy = min(*size, cs->len); + if (val) { + if (cs->write) + memcpy(cs->buf, *val, ncpy); + else + memcpy(*val, cs->buf, ncpy); + *val += ncpy; + } + *size -= ncpy; + cs->len -= ncpy; + cs->buf += ncpy; + return ncpy; +} + +static int fuse_check_page(struct page *page) +{ + if (page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 1 || + (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + ~(1 << PG_locked | + 1 << PG_referenced | + 1 << PG_uptodate | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_reclaim))) { + printk(KERN_WARNING "fuse: trying to steal weird page\n"); + printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + return 1; + } + return 0; +} + +static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +{ + int err; + struct page *oldpage = *pagep; + struct page *newpage; + struct pipe_buffer *buf = cs->pipebufs; + struct address_space *mapping; + pgoff_t index; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + + if (cs->len != PAGE_SIZE) + goto out_fallback; + + if (buf->ops->steal(cs->pipe, buf) != 0) + goto out_fallback; + + newpage = buf->page; + + if (WARN_ON(!PageUptodate(newpage))) + return -EIO; + + ClearPageMappedToDisk(newpage); + + if (fuse_check_page(newpage) != 0) + goto out_fallback_unlock; + + mapping = oldpage->mapping; + index = oldpage->index; + + /* + * This is a new and locked page, it shouldn't be mapped or + * have any special flags on it + */ + if (WARN_ON(page_mapped(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(page_has_private(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageMlocked(oldpage))) + goto out_fallback_unlock; + + err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); + if (err) { + unlock_page(newpage); + return err; + } + + page_cache_get(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + + err = 0; + spin_lock(&cs->fc->lock); + if (cs->req->aborted) + err = -ENOENT; + else + *pagep = newpage; + spin_unlock(&cs->fc->lock); + + if (err) { + unlock_page(newpage); + page_cache_release(newpage); + return err; + } + + unlock_page(oldpage); + page_cache_release(oldpage); + cs->len = 0; + + return 0; + +out_fallback_unlock: + unlock_page(newpage); +out_fallback: + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->buf = cs->mapaddr + buf->offset; + + err = lock_request(cs->fc, cs->req); + if (err) + return err; + + return 1; +} + +static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, + unsigned offset, unsigned count) +{ + struct pipe_buffer *buf; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + buf = cs->pipebufs; + page_cache_get(page); + buf->page = page; + buf->offset = offset; + buf->len = count; + + cs->pipebufs++; + cs->nr_segs++; + cs->len = 0; + + return 0; +} + +/* + * Copy a page in the request to/from the userspace buffer. Must be + * done atomically + */ +static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, + unsigned offset, unsigned count, int zeroing) +{ + int err; + struct page *page = *pagep; + + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + + while (count) { + if (cs->write && cs->pipebufs && page) { + return fuse_ref_page(cs, page, offset, count); + } else if (!cs->len) { + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { + err = fuse_try_move_page(cs, pagep); + if (err <= 0) + return err; + } else { + err = fuse_copy_fill(cs); + if (err) + return err; + } + } + if (page) { + void *mapaddr = kmap_atomic(page); + void *buf = mapaddr + offset; + offset += fuse_copy_do(cs, &buf, &count); + kunmap_atomic(mapaddr); + } else + offset += fuse_copy_do(cs, NULL, &count); + } + if (page && !cs->write) + flush_dcache_page(page); + return 0; +} + +/* Copy pages in the request to/from userspace buffer */ +static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) +{ + unsigned i; + struct fuse_req *req = cs->req; + unsigned offset = req->page_offset; + unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); + + for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { + int err; + + err = fuse_copy_page(cs, &req->pages[i], offset, count, + zeroing); + if (err) + return err; + + nbytes -= count; + count = min(nbytes, (unsigned) PAGE_SIZE); + offset = 0; + } + return 0; +} + +/* Copy a single argument in the request to/from userspace buffer */ +static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) +{ + while (size) { + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } + fuse_copy_do(cs, &val, &size); + } + return 0; +} + +/* Copy request arguments to/from userspace buffer */ +static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) +{ + int err = 0; + unsigned i; + + for (i = 0; !err && i < numargs; i++) { + struct fuse_arg *arg = &args[i]; + if (i == numargs - 1 && argpages) + err = fuse_copy_pages(cs, arg->size, zeroing); + else + err = fuse_copy_one(cs, arg->value, arg->size); + } + return err; +} + +static int forget_pending(struct fuse_conn *fc) +{ + return fc->forget_list_head.next != NULL; +} + +static int request_pending(struct fuse_conn *fc) +{ + return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || + forget_pending(fc); +} + +/* Wait until a request is available on the pending list */ +static void request_wait(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&fc->waitq, &wait); + while (fc->connected && !request_pending(fc)) { + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; + + spin_unlock(&fc->lock); + schedule(); + spin_lock(&fc->lock); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&fc->waitq, &wait); +} + +/* + * Transfer an interrupt request to userspace + * + * Unlike other requests this is assembled on demand, without a need + * to allocate a separate fuse_req structure. + * + * Called with fc->lock held, releases it + */ +static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes, struct fuse_req *req) +__releases(fc->lock) +{ + struct fuse_in_header ih; + struct fuse_interrupt_in arg; + unsigned reqsize = sizeof(ih) + sizeof(arg); + int err; + + list_del_init(&req->intr_entry); + req->intr_unique = fuse_get_unique(fc); + memset(&ih, 0, sizeof(ih)); + memset(&arg, 0, sizeof(arg)); + ih.len = reqsize; + ih.opcode = FUSE_INTERRUPT; + ih.unique = req->intr_unique; + arg.unique = req->in.h.unique; + + spin_unlock(&fc->lock); + if (nbytes < reqsize) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + return err ? err : reqsize; +} + +static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, + unsigned max, + unsigned *countp) +{ + struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link **newhead = &head; + unsigned count; + + for (count = 0; *newhead != NULL && count < max; count++) + newhead = &(*newhead)->next; + + fc->forget_list_head.next = *newhead; + *newhead = NULL; + if (fc->forget_list_head.next == NULL) + fc->forget_list_tail = &fc->forget_list_head; + + if (countp != NULL) + *countp = count; + + return head; +} + +static int fuse_read_single_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + int err; + struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_in arg = { + .nlookup = forget->forget_one.nlookup, + }; + struct fuse_in_header ih = { + .opcode = FUSE_FORGET, + .nodeid = forget->forget_one.nodeid, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + spin_unlock(&fc->lock); + kfree(forget); + if (nbytes < ih.len) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_batch_forget(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +__releases(fc->lock) +{ + int err; + unsigned max_forgets; + unsigned count; + struct fuse_forget_link *head; + struct fuse_batch_forget_in arg = { .count = 0 }; + struct fuse_in_header ih = { + .opcode = FUSE_BATCH_FORGET, + .unique = fuse_get_unique(fc), + .len = sizeof(ih) + sizeof(arg), + }; + + if (nbytes < ih.len) { + spin_unlock(&fc->lock); + return -EINVAL; + } + + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); + head = dequeue_forget(fc, max_forgets, &count); + spin_unlock(&fc->lock); + + arg.count = count; + ih.len += count * sizeof(struct fuse_forget_one); + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + + while (head) { + struct fuse_forget_link *forget = head; + + if (!err) { + err = fuse_copy_one(cs, &forget->forget_one, + sizeof(forget->forget_one)); + } + head = forget->next; + kfree(forget); + } + + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes) +__releases(fc->lock) +{ + if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fc, cs, nbytes); + else + return fuse_read_batch_forget(fc, cs, nbytes); +} + +/* + * Read a single request into the userspace filesystem's buffer. This + * function waits until a request is available, then removes it from + * the pending list and copies request data to userspace buffer. If + * no reply is needed (FORGET) or request has been aborted or there + * was an error during the copying then it's finished by calling + * request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. + */ +static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_req *req; + struct fuse_in *in; + unsigned reqsize; + + restart: + spin_lock(&fc->lock); + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + !request_pending(fc)) + goto err_unlock; + + request_wait(fc); + err = -ENODEV; + if (!fc->connected) + goto err_unlock; + err = -ERESTARTSYS; + if (!request_pending(fc)) + goto err_unlock; + + if (!list_empty(&fc->interrupts)) { + req = list_entry(fc->interrupts.next, struct fuse_req, + intr_entry); + return fuse_read_interrupt(fc, cs, nbytes, req); + } + + if (forget_pending(fc)) { + if (list_empty(&fc->pending) || fc->forget_batch-- > 0) + return fuse_read_forget(fc, cs, nbytes); + + if (fc->forget_batch <= -8) + fc->forget_batch = 16; + } + + req = list_entry(fc->pending.next, struct fuse_req, list); + req->state = FUSE_REQ_READING; + list_move(&req->list, &fc->io); + + in = &req->in; + reqsize = in->h.len; + /* If request is too large, reply with an error and restart the read */ + if (nbytes < reqsize) { + req->out.h.error = -EIO; + /* SETXATTR is special, since it may contain too large data */ + if (in->h.opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; + request_end(fc, req); + goto restart; + } + spin_unlock(&fc->lock); + cs->req = req; + err = fuse_copy_one(cs, &in->h, sizeof(in->h)); + if (!err) + err = fuse_copy_args(cs, in->numargs, in->argpages, + (struct fuse_arg *) in->args, 0); + fuse_copy_finish(cs); + spin_lock(&fc->lock); + req->locked = 0; + if (req->aborted) { + request_end(fc, req); + return -ENODEV; + } + if (err) { + req->out.h.error = -EIO; + request_end(fc, req); + return err; + } + if (!req->isreply) + request_end(fc, req); + else { + req->state = FUSE_REQ_SENT; + list_move_tail(&req->list, &fc->processing); + if (req->interrupted) + queue_interrupt(fc, req); + spin_unlock(&fc->lock); + } + return reqsize; + + err_unlock: + spin_unlock(&fc->lock); + return err; +} + +static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct file *file = iocb->ki_filp; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 1, iov, nr_segs); + + return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); +} + +static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = fuse_dev_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + int ret; + int page_nr = 0; + int do_wakeup = 0; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(in); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + fuse_copy_init(&cs, fc, 1, NULL, 0); + cs.pipebufs = bufs; + cs.pipe = pipe; + ret = fuse_dev_do_read(fc, in, &cs, len); + if (ret < 0) + goto out; + + ret = 0; + pipe_lock(pipe); + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + goto out_unlock; + } + + if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + ret = -EIO; + goto out_unlock; + } + + while (page_nr < cs.nr_segs) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; + buf->ops = &fuse_dev_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + } + +out_unlock: + pipe_unlock(pipe); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + +out: + for (; page_nr < cs.nr_segs; page_nr++) + page_cache_release(bufs[page_nr].page); + + kfree(bufs); + return ret; +} + +static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_poll_wakeup_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + fuse_copy_finish(cs); + return fuse_notify_poll_wakeup(fc, &outarg); + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_inode_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + err = fuse_reverse_inval_inode(fc->sb, outarg.ino, + outarg.off, outarg.len); + } + up_read(&fc->killsb); + return err; + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_entry_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_delete_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, + outarg.child, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size - sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + if (!fc->sb) + goto out_up_killsb; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_CACHE_SHIFT; + offset = outarg.offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_size(inode, file_size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && (num != 0 || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +{ + release_pages(req->pages, req->num_pages, 0); +} + +static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + struct fuse_req *req; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len = 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + offset = outarg->offset & ~PAGE_CACHE_MASK; + + req->in.h.opcode = FUSE_NOTIFY_REPLY; + req->in.h.nodeid = outarg->nodeid; + req->in.numargs = 2; + req->in.argpages = 1; + req->page_offset = offset; + req->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_CACHE_SHIFT; + file_size = i_size_read(inode); + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + req->pages[req->num_pages] = page; + req->num_pages++; + + num -= this_num; + total_len += this_num; + index++; + } + req->misc.retrieve_in.offset = outarg->offset; + req->misc.retrieve_in.size = total_len; + req->in.args[0].size = sizeof(req->misc.retrieve_in); + req->in.args[0].value = &req->misc.retrieve_in; + req->in.args[1].size = total_len; + + err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, req); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct inode *inode; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + u64 nodeid = outarg.nodeid; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + err = fuse_retrieve(fc, inode, &outarg); + iput(inode); + } + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, + unsigned int size, struct fuse_copy_state *cs) +{ + switch (code) { + case FUSE_NOTIFY_POLL: + return fuse_notify_poll(fc, size, cs); + + case FUSE_NOTIFY_INVAL_INODE: + return fuse_notify_inval_inode(fc, size, cs); + + case FUSE_NOTIFY_INVAL_ENTRY: + return fuse_notify_inval_entry(fc, size, cs); + + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + + case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + + case FUSE_NOTIFY_DELETE: + return fuse_notify_delete(fc, size, cs); + + default: + fuse_copy_finish(cs); + return -EINVAL; + } +} + +/* Look up request on processing list by unique ID */ +static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) +{ + struct list_head *entry; + + list_for_each(entry, &fc->processing) { + struct fuse_req *req; + req = list_entry(entry, struct fuse_req, list); + if (req->in.h.unique == unique || req->intr_unique == unique) + return req; + } + return NULL; +} + +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, + unsigned nbytes) +{ + unsigned reqsize = sizeof(struct fuse_out_header); + + if (out->h.error) + return nbytes != reqsize ? -EINVAL : 0; + + reqsize += len_args(out->numargs, out->args); + + if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + return -EINVAL; + else if (reqsize > nbytes) { + struct fuse_arg *lastarg = &out->args[out->numargs-1]; + unsigned diffsize = reqsize - nbytes; + if (diffsize > lastarg->size) + return -EINVAL; + lastarg->size -= diffsize; + } + return fuse_copy_args(cs, out->numargs, out->argpages, out->args, + out->page_zeroing); +} + +/* + * Write a single reply to a request. First the header is copied from + * the write buffer. The request is then searched on the processing + * list by the unique ID found in the header. If found, then remove + * it from the list and copy the rest of the buffer to the request. + * The request is finished by calling request_end() + */ +static ssize_t fuse_dev_do_write(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_req *req; + struct fuse_out_header oh; + + if (nbytes < sizeof(struct fuse_out_header)) + return -EINVAL; + + err = fuse_copy_one(cs, &oh, sizeof(oh)); + if (err) + goto err_finish; + + err = -EINVAL; + if (oh.len != nbytes) + goto err_finish; + + /* + * Zero oh.unique indicates unsolicited notification message + * and error contains notification code. + */ + if (!oh.unique) { + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); + return err ? err : nbytes; + } + + err = -EINVAL; + if (oh.error <= -1000 || oh.error > 0) + goto err_finish; + + spin_lock(&fc->lock); + err = -ENOENT; + if (!fc->connected) + goto err_unlock; + + req = request_find(fc, oh.unique); + if (!req) + goto err_unlock; + + if (req->aborted) { + spin_unlock(&fc->lock); + fuse_copy_finish(cs); + spin_lock(&fc->lock); + request_end(fc, req); + return -ENOENT; + } + /* Is it an interrupt reply? */ + if (req->intr_unique == oh.unique) { + err = -EINVAL; + if (nbytes != sizeof(struct fuse_out_header)) + goto err_unlock; + + if (oh.error == -ENOSYS) + fc->no_interrupt = 1; + else if (oh.error == -EAGAIN) + queue_interrupt(fc, req); + + spin_unlock(&fc->lock); + fuse_copy_finish(cs); + return nbytes; + } + + req->state = FUSE_REQ_WRITING; + list_move(&req->list, &fc->io); + req->out.h = oh; + req->locked = 1; + cs->req = req; + if (!req->out.page_replace) + cs->move_pages = 0; + spin_unlock(&fc->lock); + + err = copy_out_args(cs, &req->out, nbytes); + fuse_copy_finish(cs); + + spin_lock(&fc->lock); + req->locked = 0; + if (!err) { + if (req->aborted) + err = -ENOENT; + } else if (!req->aborted) + req->out.h.error = -EIO; + request_end(fc, req); + + return err ? err : nbytes; + + err_unlock: + spin_unlock(&fc->lock); + err_finish: + fuse_copy_finish(cs); + return err; +} + +static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 0, iov, nr_segs); + + return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs)); +} + +static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + unsigned nbuf; + unsigned idx; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc; + size_t rem; + ssize_t ret; + + fc = fuse_get_conn(out); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + pipe_lock(pipe); + nbuf = 0; + rem = 0; + for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) + rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + + ret = -EINVAL; + if (rem < len) { + pipe_unlock(pipe); + goto out; + } + + rem = len; + while (rem) { + struct pipe_buffer *ibuf; + struct pipe_buffer *obuf; + + BUG_ON(nbuf >= pipe->buffers); + BUG_ON(!pipe->nrbufs); + ibuf = &pipe->bufs[pipe->curbuf]; + obuf = &bufs[nbuf]; + + if (rem >= ibuf->len) { + *obuf = *ibuf; + ibuf->ops = NULL; + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + } else { + ibuf->ops->get(pipe, ibuf); + *obuf = *ibuf; + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + obuf->len = rem; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + nbuf++; + rem -= obuf->len; + } + pipe_unlock(pipe); + + fuse_copy_init(&cs, fc, 0, NULL, nbuf); + cs.pipebufs = bufs; + cs.pipe = pipe; + + if (flags & SPLICE_F_MOVE) + cs.move_pages = 1; + + ret = fuse_dev_do_write(fc, &cs, len); + + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + buf->ops->release(pipe, buf); + } +out: + kfree(bufs); + return ret; +} + +static unsigned fuse_dev_poll(struct file *file, poll_table *wait) +{ + unsigned mask = POLLOUT | POLLWRNORM; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return POLLERR; + + poll_wait(file, &fc->waitq, wait); + + spin_lock(&fc->lock); + if (!fc->connected) + mask = POLLERR; + else if (request_pending(fc)) + mask |= POLLIN | POLLRDNORM; + spin_unlock(&fc->lock); + + return mask; +} + +/* + * Abort all requests on the given list (pending or processing) + * + * This function releases and reacquires fc->lock + */ +static void end_requests(struct fuse_conn *fc, struct list_head *head) +__releases(fc->lock) +__acquires(fc->lock) +{ + while (!list_empty(head)) { + struct fuse_req *req; + req = list_entry(head->next, struct fuse_req, list); + req->out.h.error = -ECONNABORTED; + request_end(fc, req); + spin_lock(&fc->lock); + } +} + +/* + * Abort requests under I/O + * + * The requests are set to aborted and finished, and the request + * waiter is woken up. This will make request_wait_answer() wait + * until the request is unlocked and then return. + * + * If the request is asynchronous, then the end function needs to be + * called after waiting for the request to be unlocked (if it was + * locked). + */ +static void end_io_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + while (!list_empty(&fc->io)) { + struct fuse_req *req = + list_entry(fc->io.next, struct fuse_req, list); + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + + req->aborted = 1; + req->out.h.error = -ECONNABORTED; + req->state = FUSE_REQ_FINISHED; + list_del_init(&req->list); + wake_up(&req->waitq); + if (end) { + req->end = NULL; + __fuse_get_request(req); + spin_unlock(&fc->lock); + wait_event(req->waitq, !req->locked); + end(fc, req); + fuse_put_request(fc, req); + spin_lock(&fc->lock); + } + } +} + +static void end_queued_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); + while (forget_pending(fc)) + kfree(dequeue_forget(fc, 1, NULL)); +} + +static void end_polls(struct fuse_conn *fc) +{ + struct rb_node *p; + + p = rb_first(&fc->polled_files); + + while (p) { + struct fuse_file *ff; + ff = rb_entry(p, struct fuse_file, polled_node); + wake_up_interruptible_all(&ff->poll_wait); + + p = rb_next(p); + } +} + +/* + * Abort all requests. + * + * Emergency exit in case of a malicious or accidental deadlock, or + * just a hung filesystem. + * + * The same effect is usually achievable through killing the + * filesystem daemon and all users of the filesystem. The exception + * is the combination of an asynchronous request and the tricky + * deadlock (see Documentation/filesystems/fuse.txt). + * + * During the aborting, progression of requests from the pending and + * processing lists onto the io list, and progression of new requests + * onto the pending list is prevented by req->connected being false. + * + * Progression of requests under I/O to the processing list is + * prevented by the req->aborted flag being true for these requests. + * For this reason requests on the io list must be aborted first. + */ +void fuse_abort_conn(struct fuse_conn *fc) +{ + spin_lock(&fc->lock); + if (fc->connected) { + fc->connected = 0; + fc->blocked = 0; + end_io_requests(fc); + end_queued_requests(fc); + end_polls(fc); + wake_up_all(&fc->waitq); + wake_up_all(&fc->blocked_waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_abort_conn); + +int fuse_dev_release(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (fc) { + spin_lock(&fc->lock); + fc->connected = 0; + fc->blocked = 0; + end_queued_requests(fc); + end_polls(fc); + wake_up_all(&fc->blocked_waitq); + spin_unlock(&fc->lock); + fuse_conn_put(fc); + } + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_dev_release); + +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + +const struct file_operations fuse_dev_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = fuse_dev_read, + .splice_read = fuse_dev_splice_read, + .write = do_sync_write, + .aio_write = fuse_dev_write, + .splice_write = fuse_dev_splice_write, + .poll = fuse_dev_poll, + .release = fuse_dev_release, + .fasync = fuse_dev_fasync, +}; +EXPORT_SYMBOL_GPL(fuse_dev_operations); + +static struct miscdevice fuse_miscdevice = { + .minor = FUSE_MINOR, + .name = "fuse", + .fops = &fuse_dev_operations, +}; + +int __init fuse_dev_init(void) +{ + int err = -ENOMEM; + fuse_req_cachep = kmem_cache_create("fuse_request", + sizeof(struct fuse_req), + 0, 0, NULL); + if (!fuse_req_cachep) + goto out; + + err = misc_register(&fuse_miscdevice); + if (err) + goto out_cache_clean; + + return 0; + + out_cache_clean: + kmem_cache_destroy(fuse_req_cachep); + out: + return err; +} + +void fuse_dev_cleanup(void) +{ + misc_deregister(&fuse_miscdevice); + kmem_cache_destroy(fuse_req_cachep); +} diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c new file mode 100644 index 00000000..bc438320 --- /dev/null +++ b/fs/fuse/dir.c @@ -0,0 +1,1697 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/sched.h> +#include <linux/namei.h> +#include <linux/slab.h> + +#if BITS_PER_LONG >= 64 +static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; +} + +static inline u64 fuse_dentry_time(struct dentry *entry) +{ + return entry->d_time; +} +#else +/* + * On 32 bit archs store the high 32 bits of time in d_fsdata + */ +static void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_time = time; + entry->d_fsdata = (void *) (unsigned long) (time >> 32); +} + +static u64 fuse_dentry_time(struct dentry *entry) +{ + return (u64) entry->d_time + + ((u64) (unsigned long) entry->d_fsdata << 32); +} +#endif + +/* + * FUSE caches dentries and attributes with separate timeout. The + * time in jiffies until the dentry/attributes are valid is stored in + * dentry->d_time and fuse_inode->i_time respectively. + */ + +/* + * Calculate the time in jiffies until a dentry/attributes are valid + */ +static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) +{ + if (sec || nsec) { + struct timespec ts = {sec, nsec}; + return get_jiffies_64() + timespec_to_jiffies(&ts); + } else + return 0; +} + +/* + * Set dentry and possibly attribute timeouts from the lookup/mk* + * replies + */ +static void fuse_change_entry_timeout(struct dentry *entry, + struct fuse_entry_out *o) +{ + fuse_dentry_settime(entry, + time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); +} + +static u64 attr_timeout(struct fuse_attr_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +static u64 entry_attr_timeout(struct fuse_entry_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +/* + * Mark the attributes as stale, so that at the next call to + * ->getattr() they will be fetched from userspace + */ +void fuse_invalidate_attr(struct inode *inode) +{ + get_fuse_inode(inode)->i_time = 0; +} + +/* + * Just mark the entry as stale, so that a next attempt to look it up + * will result in a new lookup call to userspace + * + * This is called when a dentry is about to become negative and the + * timeout is unknown (unlink, rmdir, rename and in some cases + * lookup) + */ +void fuse_invalidate_entry_cache(struct dentry *entry) +{ + fuse_dentry_settime(entry, 0); +} + +/* + * Same as fuse_invalidate_entry_cache(), but also try to remove the + * dentry from the hash + */ +static void fuse_invalidate_entry(struct dentry *entry) +{ + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); +} + +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, + u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg) +{ + memset(outarg, 0, sizeof(struct fuse_entry_out)); + req->in.h.opcode = FUSE_LOOKUP; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = name->len + 1; + req->in.args[0].value = name->name; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(struct fuse_entry_out); + req->out.args[0].value = outarg; +} + +u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + u64 curr_version; + + /* + * The spin lock isn't actually needed on 64bit archs, but we + * don't yet care too much about such optimizations. + */ + spin_lock(&fc->lock); + curr_version = fc->attr_version; + spin_unlock(&fc->lock); + + return curr_version; +} + +/* + * Check whether the dentry is still valid + * + * If the entry validity timeout has expired and the dentry is + * positive, try to redo the lookup. If the lookup results in a + * different inode, then let the VFS invalidate the dentry and redo + * the lookup once more. If the lookup results in the same inode, + * then refresh the attributes, timeouts and mark the dentry valid. + */ +static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) +{ + struct inode *inode; + + inode = ACCESS_ONCE(entry->d_inode); + if (inode && is_bad_inode(inode)) + return 0; + else if (fuse_dentry_time(entry) < get_jiffies_64()) { + int err; + struct fuse_entry_out outarg; + struct fuse_conn *fc; + struct fuse_req *req; + struct fuse_forget_link *forget; + struct dentry *parent; + u64 attr_version; + + /* For negative dentries, always do a fresh lookup */ + if (!inode) + return 0; + + if (nd && (nd->flags & LOOKUP_RCU)) + return -ECHILD; + + fc = get_fuse_conn(inode); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return 0; + + forget = fuse_alloc_forget(); + if (!forget) { + fuse_put_request(fc, req); + return 0; + } + + attr_version = fuse_get_attr_version(fc); + + parent = dget_parent(entry); + fuse_lookup_init(fc, req, get_node_id(parent->d_inode), + &entry->d_name, &outarg); + fuse_request_send(fc, req); + dput(parent); + err = req->out.h.error; + fuse_put_request(fc, req); + /* Zero nodeid is same as -ENOENT */ + if (!err && !outarg.nodeid) + err = -ENOENT; + if (!err) { + struct fuse_inode *fi = get_fuse_inode(inode); + if (outarg.nodeid != get_node_id(inode)) { + fuse_queue_forget(fc, forget, outarg.nodeid, 1); + return 0; + } + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + } + kfree(forget); + if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + return 0; + + fuse_change_attributes(inode, &outarg.attr, + entry_attr_timeout(&outarg), + attr_version); + fuse_change_entry_timeout(entry, &outarg); + } + return 1; +} + +static int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + +const struct dentry_operations fuse_dentry_operations = { + .d_revalidate = fuse_dentry_revalidate, +}; + +int fuse_valid_type(int m) +{ + return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || + S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); +} + +/* + * Add a directory inode to a dentry, ensuring that no other dentry + * refers to this inode. Called with fc->inst_mutex. + */ +static struct dentry *fuse_d_add_directory(struct dentry *entry, + struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { + /* This tries to shrink the subtree below alias */ + fuse_invalidate_entry(alias); + dput(alias); + if (!list_empty(&inode->i_dentry)) + return ERR_PTR(-EBUSY); + } else { + dput(alias); + } + return d_splice_alias(inode, entry); +} + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_req *req; + struct fuse_forget_link *forget; + u64 attr_version; + int err; + + *inode = NULL; + err = -ENAMETOOLONG; + if (name->len > FUSE_NAME_MAX) + goto out; + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) { + fuse_put_request(fc, req); + goto out; + } + + attr_version = fuse_get_attr_version(fc); + + fuse_lookup_init(fc, req, nodeid, name, outarg); + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (!fuse_valid_type(outarg->attr.mode)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + attr_version); + err = -ENOMEM; + if (!*inode) { + fuse_queue_forget(fc, forget, outarg->nodeid, 1); + goto out; + } + err = 0; + + out_put_forget: + kfree(forget); + out: + return err; +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + struct nameidata *nd) +{ + int err; + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *newent; + struct fuse_conn *fc = get_fuse_conn(dir); + bool outarg_valid = true; + + err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, + &outarg, &inode); + if (err == -ENOENT) { + outarg_valid = false; + err = 0; + } + if (err) + goto out_err; + + err = -EIO; + if (inode && get_node_id(inode) == FUSE_ROOT_ID) + goto out_iput; + + if (inode && S_ISDIR(inode->i_mode)) { + mutex_lock(&fc->inst_mutex); + newent = fuse_d_add_directory(entry, inode); + mutex_unlock(&fc->inst_mutex); + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_iput; + } else { + newent = d_splice_alias(inode, entry); + } + + entry = newent ? newent : entry; + if (outarg_valid) + fuse_change_entry_timeout(entry, &outarg); + else + fuse_invalidate_entry_cache(entry); + + return newent; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +/* + * Atomic create+open operation + * + * If the filesystem doesn't support this, then fall back to separate + * 'mknod' + 'open' requests. + */ +static int fuse_create_open(struct inode *dir, struct dentry *entry, + umode_t mode, struct nameidata *nd) +{ + int err; + struct inode *inode; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req; + struct fuse_forget_link *forget; + struct fuse_create_in inarg; + struct fuse_open_out outopen; + struct fuse_entry_out outentry; + struct fuse_file *ff; + struct file *file; + int flags = nd->intent.open.flags; + + if (fc->no_create) + return -ENOSYS; + + forget = fuse_alloc_forget(); + if (!forget) + return -ENOMEM; + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out_put_forget_req; + + err = -ENOMEM; + ff = fuse_file_alloc(fc); + if (!ff) + goto out_put_request; + + if (!fc->dont_mask) + mode &= ~current_umask(); + + flags &= ~O_NOCTTY; + memset(&inarg, 0, sizeof(inarg)); + memset(&outentry, 0, sizeof(outentry)); + inarg.flags = flags; + inarg.mode = mode; + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_CREATE; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 2; + req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : + sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + req->out.numargs = 2; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(outentry); + req->out.args[0].value = &outentry; + req->out.args[1].size = sizeof(outopen); + req->out.args[1].value = &outopen; + fuse_request_send(fc, req); + err = req->out.h.error; + if (err) { + if (err == -ENOSYS) + fc->no_create = 1; + goto out_free_ff; + } + + err = -EIO; + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) + goto out_free_ff; + + fuse_put_request(fc, req); + ff->fh = outopen.fh; + ff->nodeid = outentry.nodeid; + ff->open_flags = outopen.open_flags; + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, + &outentry.attr, entry_attr_timeout(&outentry), 0); + if (!inode) { + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); + fuse_sync_release(ff, flags); + fuse_queue_forget(fc, forget, outentry.nodeid, 1); + return -ENOMEM; + } + kfree(forget); + d_instantiate(entry, inode); + fuse_change_entry_timeout(entry, &outentry); + fuse_invalidate_attr(dir); + file = lookup_instantiate_filp(nd, entry, generic_file_open); + if (IS_ERR(file)) { + fuse_sync_release(ff, flags); + return PTR_ERR(file); + } + file->private_data = fuse_file_get(ff); + fuse_finish_open(inode, file); + return 0; + + out_free_ff: + fuse_file_free(ff); + out_put_request: + fuse_put_request(fc, req); + out_put_forget_req: + kfree(forget); + return err; +} + +/* + * Code shared between mknod, mkdir, symlink and link + */ +static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, + struct inode *dir, struct dentry *entry, + umode_t mode) +{ + struct fuse_entry_out outarg; + struct inode *inode; + int err; + struct fuse_forget_link *forget; + + forget = fuse_alloc_forget(); + if (!forget) { + fuse_put_request(fc, req); + return -ENOMEM; + } + + memset(&outarg, 0, sizeof(outarg)); + req->in.h.nodeid = get_node_id(dir); + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err) + goto out_put_forget_req; + + err = -EIO; + if (invalid_nodeid(outarg.nodeid)) + goto out_put_forget_req; + + if ((outarg.attr.mode ^ mode) & S_IFMT) + goto out_put_forget_req; + + inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, + &outarg.attr, entry_attr_timeout(&outarg), 0); + if (!inode) { + fuse_queue_forget(fc, forget, outarg.nodeid, 1); + return -ENOMEM; + } + kfree(forget); + + if (S_ISDIR(inode->i_mode)) { + struct dentry *alias; + mutex_lock(&fc->inst_mutex); + alias = d_find_alias(inode); + if (alias) { + /* New directory must have moved since mkdir */ + mutex_unlock(&fc->inst_mutex); + dput(alias); + iput(inode); + return -EBUSY; + } + d_instantiate(entry, inode); + mutex_unlock(&fc->inst_mutex); + } else + d_instantiate(entry, inode); + + fuse_change_entry_timeout(entry, &outarg); + fuse_invalidate_attr(dir); + return 0; + + out_put_forget_req: + kfree(forget); + return err; +} + +static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, + dev_t rdev) +{ + struct fuse_mknod_in inarg; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (!fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.rdev = new_encode_dev(rdev); + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_MKNOD; + req->in.numargs = 2; + req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : + sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + return create_new_entry(fc, req, dir, entry, mode); +} + +static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, + struct nameidata *nd) +{ + if (nd) { + int err = fuse_create_open(dir, entry, mode, nd); + if (err != -ENOSYS) + return err; + /* Fall back on mknod */ + } + return fuse_mknod(dir, entry, mode, 0); +} + +static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) +{ + struct fuse_mkdir_in inarg; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (!fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.umask = current_umask(); + req->in.h.opcode = FUSE_MKDIR; + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = entry->d_name.len + 1; + req->in.args[1].value = entry->d_name.name; + return create_new_entry(fc, req, dir, entry, S_IFDIR); +} + +static int fuse_symlink(struct inode *dir, struct dentry *entry, + const char *link) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + unsigned len = strlen(link) + 1; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_SYMLINK; + req->in.numargs = 2; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + req->in.args[1].size = len; + req->in.args[1].value = link; + return create_new_entry(fc, req, dir, entry, S_IFLNK); +} + +static int fuse_unlink(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_UNLINK; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 1; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + struct inode *inode = entry->d_inode; + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + drop_nlink(inode); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rmdir(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_RMDIR; + req->in.h.nodeid = get_node_id(dir); + req->in.numargs = 1; + req->in.args[0].size = entry->d_name.len + 1; + req->in.args[0].value = entry->d_name.name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + clear_nlink(entry->d_inode); + fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + int err; + struct fuse_rename_in inarg; + struct fuse_conn *fc = get_fuse_conn(olddir); + struct fuse_req *req = fuse_get_req(fc); + + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.newdir = get_node_id(newdir); + req->in.h.opcode = FUSE_RENAME; + req->in.h.nodeid = get_node_id(olddir); + req->in.numargs = 3; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = oldent->d_name.len + 1; + req->in.args[1].value = oldent->d_name.name; + req->in.args[2].size = newent->d_name.len + 1; + req->in.args[2].value = newent->d_name.name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + /* ctime changes */ + fuse_invalidate_attr(oldent->d_inode); + + fuse_invalidate_attr(olddir); + if (olddir != newdir) + fuse_invalidate_attr(newdir); + + /* newent will end up negative */ + if (newent->d_inode) { + fuse_invalidate_attr(newent->d_inode); + fuse_invalidate_entry_cache(newent); + } + } else if (err == -EINTR) { + /* If request was interrupted, DEITY only knows if the + rename actually took place. If the invalidation + fails (e.g. some process has CWD under the renamed + directory), then there can be inconsistency between + the dcache and the real filesystem. Tough luck. */ + fuse_invalidate_entry(oldent); + if (newent->d_inode) + fuse_invalidate_entry(newent); + } + + return err; +} + +static int fuse_link(struct dentry *entry, struct inode *newdir, + struct dentry *newent) +{ + int err; + struct fuse_link_in inarg; + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.oldnodeid = get_node_id(inode); + req->in.h.opcode = FUSE_LINK; + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = newent->d_name.len + 1; + req->in.args[1].value = newent->d_name.name; + err = create_new_entry(fc, req, newdir, newent, inode->i_mode); + /* Contrary to "normal" filesystems it can happen that link + makes two "logical" inodes point to the same "physical" + inode. We invalidate the attributes of the old one, so it + will reflect changes in the backing inode (link count, + etc.) + */ + if (!err) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + inc_nlink(inode); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } else if (err == -EINTR) { + fuse_invalidate_attr(inode); + } + return err; +} + +static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, + struct kstat *stat) +{ + stat->dev = inode->i_sb->s_dev; + stat->ino = attr->ino; + stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + stat->nlink = attr->nlink; + stat->uid = attr->uid; + stat->gid = attr->gid; + stat->rdev = inode->i_rdev; + stat->atime.tv_sec = attr->atime; + stat->atime.tv_nsec = attr->atimensec; + stat->mtime.tv_sec = attr->mtime; + stat->mtime.tv_nsec = attr->mtimensec; + stat->ctime.tv_sec = attr->ctime; + stat->ctime.tv_nsec = attr->ctimensec; + stat->size = attr->size; + stat->blocks = attr->blocks; + stat->blksize = (1 << inode->i_blkbits); +} + +static int fuse_do_getattr(struct inode *inode, struct kstat *stat, + struct file *file) +{ + int err; + struct fuse_getattr_in inarg; + struct fuse_attr_out outarg; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + u64 attr_version; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + attr_version = fuse_get_attr_version(fc); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + /* Directories have separate file-handle space */ + if (file && S_ISREG(inode->i_mode)) { + struct fuse_file *ff = file->private_data; + + inarg.getattr_flags |= FUSE_GETATTR_FH; + inarg.fh = ff->fh; + } + req->in.h.opcode = FUSE_GETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + make_bad_inode(inode); + err = -EIO; + } else { + fuse_change_attributes(inode, &outarg.attr, + attr_timeout(&outarg), + attr_version); + if (stat) + fuse_fillattr(inode, &outarg.attr, stat); + } + } + return err; +} + +int fuse_update_attributes(struct inode *inode, struct kstat *stat, + struct file *file, bool *refreshed) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + bool r; + + if (fi->i_time < get_jiffies_64()) { + r = true; + err = fuse_do_getattr(inode, stat, file); + } else { + r = false; + err = 0; + if (stat) { + generic_fillattr(inode, stat); + stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; + } + } + + if (refreshed != NULL) + *refreshed = r; + + return err; +} + +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + u64 child_nodeid, struct qstr *name) +{ + int err = -ENOTDIR; + struct inode *parent; + struct dentry *dir; + struct dentry *entry; + + parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + if (!parent) + return -ENOENT; + + mutex_lock(&parent->i_mutex); + if (!S_ISDIR(parent->i_mode)) + goto unlock; + + err = -ENOENT; + dir = d_find_alias(parent); + if (!dir) + goto unlock; + + entry = d_lookup(dir, name); + dput(dir); + if (!entry) + goto unlock; + + fuse_invalidate_attr(parent); + fuse_invalidate_entry(entry); + + if (child_nodeid != 0 && entry->d_inode) { + mutex_lock(&entry->d_inode->i_mutex); + if (get_node_id(entry->d_inode) != child_nodeid) { + err = -ENOENT; + goto badentry; + } + if (d_mountpoint(entry)) { + err = -EBUSY; + goto badentry; + } + if (S_ISDIR(entry->d_inode->i_mode)) { + shrink_dcache_parent(entry); + if (!simple_empty(entry)) { + err = -ENOTEMPTY; + goto badentry; + } + entry->d_inode->i_flags |= S_DEAD; + } + dont_mount(entry); + clear_nlink(entry->d_inode); + err = 0; + badentry: + mutex_unlock(&entry->d_inode->i_mutex); + if (!err) + d_delete(entry); + } else { + err = 0; + } + dput(entry); + + unlock: + mutex_unlock(&parent->i_mutex); + iput(parent); + return err; +} + +/* + * Calling into a user-controlled filesystem gives the filesystem + * daemon ptrace-like capabilities over the requester process. This + * means, that the filesystem daemon is able to record the exact + * filesystem operations performed, and can also control the behavior + * of the requester process in otherwise impossible ways. For example + * it can delay the operation for arbitrary length of time allowing + * DoS against the requester. + * + * For this reason only those processes can call into the filesystem, + * for which the owner of the mount has ptrace privilege. This + * excludes processes started by other users, suid or sgid processes. + */ +int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +{ + const struct cred *cred; + int ret; + + if (fc->flags & FUSE_ALLOW_OTHER) + return 1; + + rcu_read_lock(); + ret = 0; + cred = __task_cred(task); + if (cred->euid == fc->user_id && + cred->suid == fc->user_id && + cred->uid == fc->user_id && + cred->egid == fc->group_id && + cred->sgid == fc->group_id && + cred->gid == fc->group_id) + ret = 1; + rcu_read_unlock(); + + return ret; +} + +static int fuse_access(struct inode *inode, int mask) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_access_in inarg; + int err; + + if (fc->no_access) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); + req->in.h.opcode = FUSE_ACCESS; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_access = 1; + err = 0; + } + return err; +} + +static int fuse_perm_getattr(struct inode *inode, int mask) +{ + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + return fuse_do_getattr(inode, NULL, NULL); +} + +/* + * Check permission. The two basic access models of FUSE are: + * + * 1) Local access checking ('default_permissions' mount option) based + * on file mode. This is the plain old disk filesystem permission + * modell. + * + * 2) "Remote" access checking, where server is responsible for + * checking permission in each inode operation. An exception to this + * is if ->permission() was invoked from sys_access() in which case an + * access request is sent. Execute permission is still checked + * locally based on file mode. + */ +static int fuse_permission(struct inode *inode, int mask) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + bool refreshed = false; + int err = 0; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + /* + * If attributes are needed, refresh them before proceeding + */ + if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || + ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { + struct fuse_inode *fi = get_fuse_inode(inode); + + if (fi->i_time < get_jiffies_64()) { + refreshed = true; + + err = fuse_perm_getattr(inode, mask); + if (err) + return err; + } + } + + if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { + err = generic_permission(inode, mask); + + /* If permission is denied, try to refresh file + attributes. This is also needed, because the root + node will at first have no permissions */ + if (err == -EACCES && !refreshed) { + err = fuse_perm_getattr(inode, mask); + if (!err) + err = generic_permission(inode, mask); + } + + /* Note: the opposite of the above test does not + exist. So if permissions are revoked this won't be + noticed immediately, only after the attribute + timeout has expired */ + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + err = fuse_access(inode, mask); + } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { + if (!(inode->i_mode & S_IXUGO)) { + if (refreshed) + return -EACCES; + + err = fuse_perm_getattr(inode, mask); + if (!err && !(inode->i_mode & S_IXUGO)) + return -EACCES; + } + } + return err; +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + void *dstbuf, filldir_t filldir) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + int over; + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + + over = filldir(dstbuf, dirent->name, dirent->namelen, + file->f_pos, dirent->ino, dirent->type); + if (over) + break; + + buf += reclen; + nbytes -= reclen; + file->f_pos = dirent->off; + } + + return 0; +} + +static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +{ + int err; + size_t nbytes; + struct page *page; + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + page = alloc_page(GFP_KERNEL); + if (!page) { + fuse_put_request(fc, req); + return -ENOMEM; + } + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + fuse_request_send(fc, req); + nbytes = req->out.args[0].size; + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) + err = parse_dirfile(page_address(page), nbytes, file, dstbuf, + filldir); + + __free_page(page); + fuse_invalidate_attr(inode); /* atime changed */ + return err; +} + +static char *read_link(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = fuse_get_req(fc); + char *link; + + if (IS_ERR(req)) + return ERR_CAST(req); + + link = (char *) __get_free_page(GFP_KERNEL); + if (!link) { + link = ERR_PTR(-ENOMEM); + goto out; + } + req->in.h.opcode = FUSE_READLINK; + req->in.h.nodeid = get_node_id(inode); + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = PAGE_SIZE - 1; + req->out.args[0].value = link; + fuse_request_send(fc, req); + if (req->out.h.error) { + free_page((unsigned long) link); + link = ERR_PTR(req->out.h.error); + } else + link[req->out.args[0].size] = '\0'; + out: + fuse_put_request(fc, req); + fuse_invalidate_attr(inode); /* atime changed */ + return link; +} + +static void free_link(char *link) +{ + if (!IS_ERR(link)) + free_page((unsigned long) link); +} + +static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, read_link(dentry)); + return NULL; +} + +static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + free_link(nd_get_link(nd)); +} + +static int fuse_dir_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, true); +} + +static int fuse_dir_release(struct inode *inode, struct file *file) +{ + fuse_release_common(file, FUSE_RELEASEDIR); + + return 0; +} + +static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + return fuse_fsync_common(file, start, end, datasync, 1); +} + +static long fuse_dir_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); +} + +static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, + FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); +} + +static bool update_mtime(unsigned ivalid) +{ + /* Always update if mtime is explicitly set */ + if (ivalid & ATTR_MTIME_SET) + return true; + + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ + if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) + return false; + + /* In all other cases update */ + return true; +} + +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) +{ + unsigned ivalid = iattr->ia_valid; + + if (ivalid & ATTR_MODE) + arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; + if (ivalid & ATTR_UID) + arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; + if (ivalid & ATTR_GID) + arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; + if (ivalid & ATTR_SIZE) + arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; + if (ivalid & ATTR_ATIME) { + arg->valid |= FATTR_ATIME; + arg->atime = iattr->ia_atime.tv_sec; + arg->atimensec = iattr->ia_atime.tv_nsec; + if (!(ivalid & ATTR_ATIME_SET)) + arg->valid |= FATTR_ATIME_NOW; + } + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { + arg->valid |= FATTR_MTIME; + arg->mtime = iattr->ia_mtime.tv_sec; + arg->mtimensec = iattr->ia_mtime.tv_nsec; + if (!(ivalid & ATTR_MTIME_SET)) + arg->valid |= FATTR_MTIME_NOW; + } +} + +/* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(!mutex_is_locked(&inode->i_mutex)); + + spin_lock(&fc->lock); + BUG_ON(fi->writectr < 0); + fi->writectr += FUSE_NOWRITE; + spin_unlock(&fc->lock); + wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); +} + +/* + * Allow writepages on inode + * + * Remove the bias from the writecounter and send any queued + * writepages. + */ +static void __fuse_release_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(fi->writectr != FUSE_NOWRITE); + fi->writectr = 0; + fuse_flush_writepages(inode); +} + +void fuse_release_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + spin_lock(&fc->lock); + __fuse_release_nowrite(inode); + spin_unlock(&fc->lock); +} + +/* + * Set attributes, and at the same time refresh them. + * + * Truncation is slightly complicated, because the 'truncate' request + * may fail, in which case we don't want to touch the mapping. + * vmtruncate() doesn't allow for this case, so do the rlimit checking + * and the actual truncation by hand. + */ +static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, + struct file *file) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + bool is_truncate = false; + loff_t oldsize; + int err; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + attr->ia_valid |= ATTR_FORCE; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_OPEN) { + if (fc->atomic_o_trunc) + return 0; + file = NULL; + } + + if (attr->ia_valid & ATTR_SIZE) + is_truncate = true; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (is_truncate) + fuse_set_nowrite(inode); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + iattr_to_fattr(attr, &inarg); + if (file) { + struct fuse_file *ff = file->private_data; + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + if (attr->ia_valid & ATTR_SIZE) { + /* For mandatory locking in truncate */ + inarg.valid |= FATTR_LOCKOWNER; + inarg.lock_owner = fuse_lock_owner_id(fc, current->files); + } + req->in.h.opcode = FUSE_SETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err) { + if (err == -EINTR) + fuse_invalidate_attr(inode); + goto error; + } + + if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + make_bad_inode(inode); + err = -EIO; + goto error; + } + + spin_lock(&fc->lock); + fuse_change_attributes_common(inode, &outarg.attr, + attr_timeout(&outarg)); + oldsize = inode->i_size; + i_size_write(inode, outarg.attr.size); + + if (is_truncate) { + /* NOTE: this may release/reacquire fc->lock */ + __fuse_release_nowrite(inode); + } + spin_unlock(&fc->lock); + + /* + * Only call invalidate_inode_pages2() after removing + * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. + */ + if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + truncate_pagecache(inode, oldsize, outarg.attr.size); + invalidate_inode_pages2(inode->i_mapping); + } + + return 0; + +error: + if (is_truncate) + fuse_release_nowrite(inode); + + return err; +} + +static int fuse_setattr(struct dentry *entry, struct iattr *attr) +{ + if (attr->ia_valid & ATTR_FILE) + return fuse_do_setattr(entry, attr, attr->ia_file); + else + return fuse_do_setattr(entry, attr, NULL); +} + +static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, + struct kstat *stat) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + return fuse_update_attributes(inode, stat, NULL, NULL); +} + +static int fuse_setxattr(struct dentry *entry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_setxattr_in inarg; + int err; + + if (fc->no_setxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + inarg.flags = flags; + req->in.h.opcode = FUSE_SETXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 3; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = strlen(name) + 1; + req->in.args[1].value = name; + req->in.args[2].size = size; + req->in.args[2].value = value; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_setxattr = 1; + err = -EOPNOTSUPP; + } + return err; +} + +static ssize_t fuse_getxattr(struct dentry *entry, const char *name, + void *value, size_t size) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fc->no_getxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + req->in.h.opcode = FUSE_GETXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 2; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->in.args[1].size = strlen(name) + 1; + req->in.args[1].value = name; + /* This is really two different operations rolled into one */ + req->out.numargs = 1; + if (size) { + req->out.argvar = 1; + req->out.args[0].size = size; + req->out.args[0].value = value; + } else { + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + } + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret) + ret = size ? req->out.args[0].size : outarg.size; + else { + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; + } + } + fuse_put_request(fc, req); + return ret; +} + +static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (fc->no_listxattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + req->in.h.opcode = FUSE_LISTXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + /* This is really two different operations rolled into one */ + req->out.numargs = 1; + if (size) { + req->out.argvar = 1; + req->out.args[0].size = size; + req->out.args[0].value = list; + } else { + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + } + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret) + ret = size ? req->out.args[0].size : outarg.size; + else { + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; + } + } + fuse_put_request(fc, req); + return ret; +} + +static int fuse_removexattr(struct dentry *entry, const char *name) +{ + struct inode *inode = entry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int err; + + if (fc->no_removexattr) + return -EOPNOTSUPP; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_REMOVEXATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = strlen(name) + 1; + req->in.args[0].value = name; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_removexattr = 1; + err = -EOPNOTSUPP; + } + return err; +} + +static const struct inode_operations fuse_dir_inode_operations = { + .lookup = fuse_lookup, + .mkdir = fuse_mkdir, + .symlink = fuse_symlink, + .unlink = fuse_unlink, + .rmdir = fuse_rmdir, + .rename = fuse_rename, + .link = fuse_link, + .setattr = fuse_setattr, + .create = fuse_create, + .mknod = fuse_mknod, + .permission = fuse_permission, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +static const struct file_operations fuse_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = fuse_readdir, + .open = fuse_dir_open, + .release = fuse_dir_release, + .fsync = fuse_dir_fsync, + .unlocked_ioctl = fuse_dir_ioctl, + .compat_ioctl = fuse_dir_compat_ioctl, +}; + +static const struct inode_operations fuse_common_inode_operations = { + .setattr = fuse_setattr, + .permission = fuse_permission, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +static const struct inode_operations fuse_symlink_inode_operations = { + .setattr = fuse_setattr, + .follow_link = fuse_follow_link, + .put_link = fuse_put_link, + .readlink = generic_readlink, + .getattr = fuse_getattr, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + .listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, +}; + +void fuse_init_common(struct inode *inode) +{ + inode->i_op = &fuse_common_inode_operations; +} + +void fuse_init_dir(struct inode *inode) +{ + inode->i_op = &fuse_dir_inode_operations; + inode->i_fop = &fuse_dir_operations; +} + +void fuse_init_symlink(struct inode *inode) +{ + inode->i_op = &fuse_symlink_inode_operations; +} diff --git a/fs/fuse/file.c b/fs/fuse/file.c new file mode 100644 index 00000000..504e61b7 --- /dev/null +++ b/fs/fuse/file.c @@ -0,0 +1,2224 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/compat.h> +#include <linux/swap.h> + +static const struct file_operations fuse_direct_io_file_operations; + +static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + int opcode, struct fuse_open_out *outargp) +{ + struct fuse_open_in inarg; + struct fuse_req *req; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fc->atomic_o_trunc) + inarg.flags &= ~O_TRUNC; + req->in.h.opcode = opcode; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(*outargp); + req->out.args[0].value = outargp; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + return err; +} + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +{ + struct fuse_file *ff; + + ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + if (unlikely(!ff)) + return NULL; + + ff->fc = fc; + ff->reserved_req = fuse_request_alloc(); + if (unlikely(!ff->reserved_req)) { + kfree(ff); + return NULL; + } + + INIT_LIST_HEAD(&ff->write_entry); + atomic_set(&ff->count, 0); + RB_CLEAR_NODE(&ff->polled_node); + init_waitqueue_head(&ff->poll_wait); + + spin_lock(&fc->lock); + ff->kh = ++fc->khctr; + spin_unlock(&fc->lock); + + return ff; +} + +void fuse_file_free(struct fuse_file *ff) +{ + fuse_request_free(ff->reserved_req); + kfree(ff); +} + +struct fuse_file *fuse_file_get(struct fuse_file *ff) +{ + atomic_inc(&ff->count); + return ff; +} + +static void fuse_release_async(struct work_struct *work) +{ + struct fuse_req *req; + struct fuse_conn *fc; + struct path path; + + req = container_of(work, struct fuse_req, misc.release.work); + path = req->misc.release.path; + fc = get_fuse_conn(path.dentry->d_inode); + + fuse_put_request(fc, req); + path_put(&path); +} + +static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +{ + if (fc->destroy_req) { + /* + * If this is a fuseblk mount, then it's possible that + * releasing the path will result in releasing the + * super block and sending the DESTROY request. If + * the server is single threaded, this would hang. + * For this reason do the path_put() in a separate + * thread. + */ + atomic_inc(&req->count); + INIT_WORK(&req->misc.release.work, fuse_release_async); + schedule_work(&req->misc.release.work); + } else { + path_put(&req->misc.release.path); + } +} + +static void fuse_file_put(struct fuse_file *ff, bool sync) +{ + if (atomic_dec_and_test(&ff->count)) { + struct fuse_req *req = ff->reserved_req; + + if (sync) { + fuse_request_send(ff->fc, req); + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + fuse_request_send_background(ff->fc, req); + } + kfree(ff); + } +} + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir) +{ + struct fuse_open_out outarg; + struct fuse_file *ff; + int err; + int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + + ff = fuse_file_alloc(fc); + if (!ff) + return -ENOMEM; + + err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + if (err) { + fuse_file_free(ff); + return err; + } + + if (isdir) + outarg.open_flags &= ~FOPEN_DIRECT_IO; + + ff->fh = outarg.fh; + ff->nodeid = nodeid; + ff->open_flags = outarg.open_flags; + file->private_data = fuse_file_get(ff); + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_do_open); + +void fuse_finish_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (ff->open_flags & FOPEN_DIRECT_IO) + file->f_op = &fuse_direct_io_file_operations; + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + if (ff->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } +} + +int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fc, get_node_id(inode), file, isdir); + if (err) + return err; + + fuse_finish_open(inode, file); + + return 0; +} + +static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +{ + struct fuse_conn *fc = ff->fc; + struct fuse_req *req = ff->reserved_req; + struct fuse_release_in *inarg = &req->misc.release.in; + + spin_lock(&fc->lock); + list_del(&ff->write_entry); + if (!RB_EMPTY_NODE(&ff->polled_node)) + rb_erase(&ff->polled_node, &fc->polled_files); + spin_unlock(&fc->lock); + + wake_up_interruptible_all(&ff->poll_wait); + + inarg->fh = ff->fh; + inarg->flags = flags; + req->in.h.opcode = opcode; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct fuse_release_in); + req->in.args[0].value = inarg; +} + +void fuse_release_common(struct file *file, int opcode) +{ + struct fuse_file *ff; + struct fuse_req *req; + + ff = file->private_data; + if (unlikely(!ff)) + return; + + req = ff->reserved_req; + fuse_prepare_release(ff, file->f_flags, opcode); + + if (ff->flock) { + struct fuse_release_in *inarg = &req->misc.release.in; + inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + inarg->lock_owner = fuse_lock_owner_id(ff->fc, + (fl_owner_t) file); + } + /* Hold vfsmount and dentry until release is finished */ + path_get(&file->f_path); + req->misc.release.path = file->f_path; + + /* + * Normally this will send the RELEASE request, however if + * some asynchronous READ or WRITE requests are outstanding, + * the sending will be delayed. + * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. + */ + fuse_file_put(ff, ff->fc->destroy_req != NULL); +} + +static int fuse_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, false); +} + +static int fuse_release(struct inode *inode, struct file *file) +{ + fuse_release_common(file, FUSE_RELEASE); + + /* return value is ignored by VFS */ + return 0; +} + +void fuse_sync_release(struct fuse_file *ff, int flags) +{ + WARN_ON(atomic_read(&ff->count) > 1); + fuse_prepare_release(ff, flags, FUSE_RELEASE); + ff->reserved_req->force = 1; + fuse_request_send(ff->fc, ff->reserved_req); + fuse_put_request(ff->fc, ff->reserved_req); + kfree(ff); +} +EXPORT_SYMBOL_GPL(fuse_sync_release); + +/* + * Scramble the ID space with XTEA, so that the value of the files_struct + * pointer is not exposed to userspace. + */ +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) +{ + u32 *k = fc->scramble_key; + u64 v = (unsigned long) id; + u32 v0 = v; + u32 v1 = v >> 32; + u32 sum = 0; + int i; + + for (i = 0; i < 32; i++) { + v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); + sum += 0x9E3779B9; + v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); + } + + return (u64) v0 + ((u64) v1 << 32); +} + +/* + * Check if page is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + bool found = false; + + spin_lock(&fc->lock); + list_for_each_entry(req, &fi->writepages, writepages_entry) { + pgoff_t curr_index; + + BUG_ON(req->inode != inode); + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (curr_index == index) { + found = true; + break; + } + } + spin_unlock(&fc->lock); + + return found; +} + +/* + * Wait for page writeback to be completed. + * + * Since fuse doesn't rely on the VM writeback tracking, this has to + * use some other means. + */ +static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); + return 0; +} + +static int fuse_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_req *req; + struct fuse_flush_in inarg; + int err; + + if (is_bad_inode(inode)) + return -EIO; + + if (fc->no_flush) + return 0; + + req = fuse_get_req_nofail(fc, file); + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fc, id); + req->in.h.opcode = FUSE_FLUSH; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->force = 1; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + fc->no_flush = 1; + err = 0; + } + return err; +} + +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int isdir) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_req *req; + struct fuse_fsync_in inarg; + int err; + + if (is_bad_inode(inode)) + return -EIO; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + return 0; + + mutex_lock(&inode->i_mutex); + + /* + * Start writeback against all dirty pages of the inode, then + * wait for all outstanding writes, before sending the FSYNC + * request. + */ + err = write_inode_now(inode, 0); + if (err) + goto out; + + fuse_sync_writes(inode); + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.fsync_flags = datasync ? 1 : 0; + req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) { + if (isdir) + fc->no_fsyncdir = 1; + else + fc->no_fsync = 1; + err = 0; + } +out: + mutex_unlock(&inode->i_mutex); + return err; +} + +static int fuse_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + return fuse_fsync_common(file, start, end, datasync, 0); +} + +void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, + size_t count, int opcode) +{ + struct fuse_read_in *inarg = &req->misc.read.in; + struct fuse_file *ff = file->private_data; + + inarg->fh = ff->fh; + inarg->offset = pos; + inarg->size = count; + inarg->flags = file->f_flags; + req->in.h.opcode = opcode; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(struct fuse_read_in); + req->in.args[0].value = inarg; + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = count; +} + +static size_t fuse_send_read(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, fl_owner_t owner) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + + fuse_read_fill(req, file, pos, count, FUSE_READ); + if (owner != NULL) { + struct fuse_read_in *inarg = &req->misc.read.in; + + inarg->read_flags |= FUSE_READ_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fc, owner); + } + fuse_request_send(fc, req); + return req->out.args[0].size; +} + +static void fuse_read_update_size(struct inode *inode, loff_t size, + u64 attr_ver) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + if (attr_ver == fi->attr_version && size < inode->i_size) { + fi->attr_version = ++fc->attr_version; + i_size_write(inode, size); + } + spin_unlock(&fc->lock); +} + +static int fuse_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + size_t num_read; + loff_t pos = page_offset(page); + size_t count = PAGE_CACHE_SIZE; + u64 attr_ver; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + /* + * Page writeback can extend beyond the lifetime of the + * page-cache page, so make sure we read a properly synced + * page. + */ + fuse_wait_on_page_writeback(inode, page->index); + + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + attr_ver = fuse_get_attr_version(fc); + + req->out.page_zeroing = 1; + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + num_read = fuse_send_read(req, file, pos, count, NULL); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) { + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (num_read < count) + fuse_read_update_size(inode, pos + num_read, attr_ver); + + SetPageUptodate(page); + } + + fuse_invalidate_attr(inode); /* atime changed */ + out: + unlock_page(page); + return err; +} + +static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + size_t count = req->misc.read.in.size; + size_t num_read = req->out.args[0].size; + struct address_space *mapping = NULL; + + for (i = 0; mapping == NULL && i < req->num_pages; i++) + mapping = req->pages[i]->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) { + loff_t pos; + + pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, + req->misc.read.attr_ver); + } + fuse_invalidate_attr(inode); /* atime changed */ + } + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (!req->out.h.error) + SetPageUptodate(page); + else + SetPageError(page); + unlock_page(page); + page_cache_release(page); + } + if (req->ff) + fuse_file_put(req->ff, false); +} + +static void fuse_send_readpages(struct fuse_req *req, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + loff_t pos = page_offset(req->pages[0]); + size_t count = req->num_pages << PAGE_CACHE_SHIFT; + + req->out.argpages = 1; + req->out.page_zeroing = 1; + req->out.page_replace = 1; + fuse_read_fill(req, file, pos, count, FUSE_READ); + req->misc.read.attr_ver = fuse_get_attr_version(fc); + if (fc->async_read) { + req->ff = fuse_file_get(ff); + req->end = fuse_readpages_end; + fuse_request_send_background(fc, req); + } else { + fuse_request_send(fc, req); + fuse_readpages_end(fc, req); + fuse_put_request(fc, req); + } +} + +struct fuse_fill_data { + struct fuse_req *req; + struct file *file; + struct inode *inode; +}; + +static int fuse_readpages_fill(void *_data, struct page *page) +{ + struct fuse_fill_data *data = _data; + struct fuse_req *req = data->req; + struct inode *inode = data->inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + fuse_wait_on_page_writeback(inode, page->index); + + if (req->num_pages && + (req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || + req->pages[req->num_pages - 1]->index + 1 != page->index)) { + fuse_send_readpages(req, data->file); + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { + unlock_page(page); + return PTR_ERR(req); + } + } + page_cache_get(page); + req->pages[req->num_pages] = page; + req->num_pages++; + return 0; +} + +static int fuse_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_fill_data data; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + data.file = file; + data.inode = inode; + data.req = fuse_get_req(fc); + err = PTR_ERR(data.req); + if (IS_ERR(data.req)) + goto out; + + err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file); + else + fuse_put_request(fc, data.req); + } +out: + return err; +} + +static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + + if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) { + int err; + /* + * If trying to read past EOF, make sure the i_size + * attribute is up-to-date. + */ + err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); + if (err) + return err; + } + + return generic_file_aio_read(iocb, iov, nr_segs, pos); +} + +static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, + loff_t pos, size_t count) +{ + struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_write_out *outarg = &req->misc.write.out; + + inarg->fh = ff->fh; + inarg->offset = pos; + inarg->size = count; + req->in.h.opcode = FUSE_WRITE; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 2; + if (ff->fc->minor < 9) + req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + else + req->in.args[0].size = sizeof(struct fuse_write_in); + req->in.args[0].value = inarg; + req->in.args[1].size = count; + req->out.numargs = 1; + req->out.args[0].size = sizeof(struct fuse_write_out); + req->out.args[0].value = outarg; +} + +static size_t fuse_send_write(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, fl_owner_t owner) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_write_in *inarg = &req->misc.write.in; + + fuse_write_fill(req, ff, pos, count); + inarg->flags = file->f_flags; + if (owner != NULL) { + inarg->write_flags |= FUSE_WRITE_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fc, owner); + } + fuse_request_send(fc, req); + return req->misc.write.out.size; +} + +void fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&fc->lock); +} + +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + + for (i = 0; i < req->num_pages; i++) + fuse_wait_on_page_writeback(inode, req->pages[i]->index); + + res = fuse_send_write(req, file, pos, count, NULL); + + offset = req->page_offset; + count = res; + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + + if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (count > PAGE_CACHE_SIZE - offset) + count -= PAGE_CACHE_SIZE - offset; + else + count = 0; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req->in.argpages = 1; + req->page_offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) + break; + + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + mark_page_accessed(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req->pages[req->num_pages] = page; + req->num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + if (!fc->big_writes) + break; + } while (iov_iter_count(ii) && count < fc->max_write && + req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + + return count > 0 ? count : err; +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + do { + struct fuse_req *req; + ssize_t count; + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count <= 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req->out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count) + err = -EIO; + } + } + fuse_put_request(fc, req); + } while (!err && iov_iter_count(ii)); + + if (res > 0) + fuse_write_update_size(inode, pos); + + fuse_invalidate_attr(inode); + + return res > 0 ? res : err; +} + +static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count = 0; + size_t ocount = 0; + ssize_t written = 0; + ssize_t written_buffered = 0; + struct inode *inode = mapping->host; + ssize_t err; + struct iov_iter i; + loff_t endbyte = 0; + + WARN_ON(iocb->ki_pos != pos); + + ocount = 0; + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + return err; + + count = ocount; + + mutex_lock(&inode->i_mutex); + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + file_update_time(file); + + if (file->f_flags & O_DIRECT) { + written = generic_file_direct_write(iocb, iov, &nr_segs, + pos, &iocb->ki_pos, + count, ocount); + if (written < 0 || written == count) + goto out; + + pos += written; + count -= written; + + iov_iter_init(&i, iov, nr_segs, count, written); + written_buffered = fuse_perform_write(file, mapping, &i, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + endbyte = pos + written_buffered - 1; + + err = filemap_write_and_wait_range(file->f_mapping, pos, + endbyte); + if (err) + goto out; + + invalidate_mapping_pages(file->f_mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + + written += written_buffered; + iocb->ki_pos = pos + written_buffered; + } else { + iov_iter_init(&i, iov, nr_segs, count, 0); + written = fuse_perform_write(file, mapping, &i, pos); + if (written >= 0) + iocb->ki_pos = pos + written; + } +out: + current->backing_dev_info = NULL; + mutex_unlock(&inode->i_mutex); + + return written ? written : err; +} + +static void fuse_release_user_pages(struct fuse_req *req, int write) +{ + unsigned i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (write) + set_page_dirty_lock(page); + put_page(page); + } +} + +static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, + size_t *nbytesp, int write) +{ + size_t nbytes = *nbytesp; + unsigned long user_addr = (unsigned long) buf; + unsigned offset = user_addr & ~PAGE_MASK; + int npages; + + /* Special case for kernel I/O: can copy directly into the buffer */ + if (segment_eq(get_fs(), KERNEL_DS)) { + if (write) + req->in.args[1].value = (void *) user_addr; + else + req->out.args[0].value = (void *) user_addr; + + return 0; + } + + nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); + npages = get_user_pages_fast(user_addr, npages, !write, req->pages); + if (npages < 0) + return npages; + + req->num_pages = npages; + req->page_offset = offset; + + if (write) + req->in.argpages = 1; + else + req->out.argpages = 1; + + nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; + *nbytesp = min(*nbytesp, nbytes); + + return 0; +} + +ssize_t fuse_direct_io(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + size_t nmax = write ? fc->max_write : fc->max_read; + loff_t pos = *ppos; + ssize_t res = 0; + struct fuse_req *req; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + while (count) { + size_t nres; + fl_owner_t owner = current->files; + size_t nbytes = min(count, nmax); + int err = fuse_get_user_pages(req, buf, &nbytes, write); + if (err) { + res = err; + break; + } + + if (write) + nres = fuse_send_write(req, file, pos, nbytes, owner); + else + nres = fuse_send_read(req, file, pos, nbytes, owner); + + fuse_release_user_pages(req, !write); + if (req->out.h.error) { + if (!res) + res = req->out.h.error; + break; + } else if (nres > nbytes) { + res = -EIO; + break; + } + count -= nres; + res += nres; + pos += nres; + buf += nres; + if (nres != nbytes) + break; + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } + } + if (!IS_ERR(req)) + fuse_put_request(fc, req); + if (res > 0) + *ppos = pos; + + return res; +} +EXPORT_SYMBOL_GPL(fuse_direct_io); + +static ssize_t fuse_direct_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t res; + struct inode *inode = file->f_path.dentry->d_inode; + + if (is_bad_inode(inode)) + return -EIO; + + res = fuse_direct_io(file, buf, count, ppos, 0); + + fuse_invalidate_attr(inode); + + return res; +} + +static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + ssize_t res; + + res = generic_write_checks(file, ppos, &count, 0); + if (!res) { + res = fuse_direct_io(file, buf, count, ppos, 1); + if (res > 0) + fuse_write_update_size(inode, *ppos); + } + + fuse_invalidate_attr(inode); + + return res; +} + +static ssize_t fuse_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + ssize_t res; + + if (is_bad_inode(inode)) + return -EIO; + + /* Don't allow parallel writes to the same file */ + mutex_lock(&inode->i_mutex); + res = __fuse_direct_write(file, buf, count, ppos); + mutex_unlock(&inode->i_mutex); + + return res; +} + +static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +{ + __free_page(req->pages[0]); + fuse_file_put(req->ff, false); +} + +static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + + list_del(&req->writepages_entry); + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + wake_up(&fi->page_waitq); +} + +/* Called under fc->lock, may release and reacquire it */ +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +__releases(fc->lock) +__acquires(fc->lock) +{ + struct fuse_inode *fi = get_fuse_inode(req->inode); + loff_t size = i_size_read(req->inode); + struct fuse_write_in *inarg = &req->misc.write.in; + + if (!fc->connected) + goto out_free; + + if (inarg->offset + PAGE_CACHE_SIZE <= size) { + inarg->size = PAGE_CACHE_SIZE; + } else if (inarg->offset < size) { + inarg->size = size & (PAGE_CACHE_SIZE - 1); + } else { + /* Got truncated off completely */ + goto out_free; + } + + req->in.args[1].size = inarg->size; + fi->writectr++; + fuse_request_send_background_locked(fc, req); + return; + + out_free: + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); + fuse_put_request(fc, req); + spin_lock(&fc->lock); +} + +/* + * If fi->writectr is positive (no truncate or fsync going on) send + * all queued writepage requests. + * + * Called with fc->lock + */ +void fuse_flush_writepages(struct inode *inode) +__releases(fc->lock) +__acquires(fc->lock) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + + while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { + req = list_entry(fi->queued_writes.next, struct fuse_req, list); + list_del_init(&req->list); + fuse_send_writepage(fc, req); + } +} + +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + + mapping_set_error(inode->i_mapping, req->out.h.error); + spin_lock(&fc->lock); + fi->writectr--; + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); +} + +static int fuse_writepage_locked(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + struct fuse_file *ff; + struct page *tmp_page; + + set_page_writeback(page); + + req = fuse_request_alloc_nofs(); + if (!req) + goto err; + + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto err_free; + + spin_lock(&fc->lock); + BUG_ON(list_empty(&fi->write_files)); + ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); + req->ff = fuse_file_get(ff); + spin_unlock(&fc->lock); + + fuse_write_fill(req, ff, page_offset(page), 0); + + copy_highpage(tmp_page, page); + req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; + req->in.argpages = 1; + req->num_pages = 1; + req->pages[0] = tmp_page; + req->page_offset = 0; + req->end = fuse_writepage_end; + req->inode = inode; + + inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + end_page_writeback(page); + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + return 0; + +err_free: + fuse_request_free(req); +err: + end_page_writeback(page); + return -ENOMEM; +} + +static int fuse_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + + err = fuse_writepage_locked(page); + unlock_page(page); + + return err; +} + +static int fuse_launder_page(struct page *page) +{ + int err = 0; + if (clear_page_dirty_for_io(page)) { + struct inode *inode = page->mapping->host; + err = fuse_writepage_locked(page); + if (!err) + fuse_wait_on_page_writeback(inode, page->index); + } + return err; +} + +/* + * Write back dirty pages now, because there may not be any suitable + * open files later + */ +static void fuse_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +/* + * Wait for writeback against this page to complete before allowing it + * to be marked dirty again, and hence written back again, possibly + * before the previous writepage completed. + * + * Block here, instead of in ->writepage(), so that the userspace fs + * can only block processes actually operating on the filesystem. + * + * Otherwise unprivileged userspace fs would be able to block + * unrelated: + * + * - page migration + * - sync(2) + * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER + */ +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + /* + * Don't use page->mapping as it may become NULL from a + * concurrent truncate. + */ + struct inode *inode = vma->vm_file->f_mapping->host; + + fuse_wait_on_page_writeback(inode, page->index); + return 0; +} + +static const struct vm_operations_struct fuse_file_vm_ops = { + .close = fuse_vma_close, + .fault = filemap_fault, + .page_mkwrite = fuse_page_mkwrite, +}; + +static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); + } + file_accessed(file); + vma->vm_ops = &fuse_file_vm_ops; + return 0; +} + +static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + return generic_file_mmap(file, vma); +} + +static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, + struct file_lock *fl) +{ + switch (ffl->type) { + case F_UNLCK: + break; + + case F_RDLCK: + case F_WRLCK: + if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || + ffl->end < ffl->start) + return -EIO; + + fl->fl_start = ffl->start; + fl->fl_end = ffl->end; + fl->fl_pid = ffl->pid; + break; + + default: + return -EIO; + } + fl->fl_type = ffl->type; + return 0; +} + +static void fuse_lk_fill(struct fuse_req *req, struct file *file, + const struct file_lock *fl, int opcode, pid_t pid, + int flock) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_lk_in *arg = &req->misc.lk_in; + + arg->fh = ff->fh; + arg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + arg->lk.start = fl->fl_start; + arg->lk.end = fl->fl_end; + arg->lk.type = fl->fl_type; + arg->lk.pid = pid; + if (flock) + arg->lk_flags |= FUSE_LK_FLOCK; + req->in.h.opcode = opcode; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(*arg); + req->in.args[0].value = arg; +} + +static int fuse_getlk(struct file *file, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_lk_out outarg; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0); + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) + err = convert_fuse_file_lock(&outarg.lk, fl); + + return err; +} + +static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; + int err; + + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { + /* NLM needs asynchronous locks, which we don't support yet */ + return -ENOLCK; + } + + /* Unlock on close is handled by the flush method */ + if (fl->fl_flags & FL_CLOSE) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_lk_fill(req, file, fl, opcode, pid, flock); + fuse_request_send(fc, req); + err = req->out.h.error; + /* locking is restartable */ + if (err == -EINTR) + err = -ERESTARTSYS; + fuse_put_request(fc, req); + return err; +} + +static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (cmd == F_CANCELLK) { + err = 0; + } else if (cmd == F_GETLK) { + if (fc->no_lock) { + posix_test_lock(file, fl); + err = 0; + } else + err = fuse_getlk(file, fl); + } else { + if (fc->no_lock) + err = posix_lock_file(file, fl, NULL); + else + err = fuse_setlk(file, fl, 0); + } + return err; +} + +static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (fc->no_flock) { + err = flock_lock_file_wait(file, fl); + } else { + struct fuse_file *ff = file->private_data; + + /* emulate flock with POSIX locks */ + fl->fl_owner = (fl_owner_t) file; + ff->flock = true; + err = fuse_setlk(file, fl, 1); + } + + return err; +} + +static sector_t fuse_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_bmap_in inarg; + struct fuse_bmap_out outarg; + int err; + + if (!inode->i_sb->s_bdev || fc->no_bmap) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + inarg.block = block; + inarg.blocksize = inode->i_sb->s_blocksize; + req->in.h.opcode = FUSE_BMAP; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (err == -ENOSYS) + fc->no_bmap = 1; + + return err ? 0 : outarg.block; +} + +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t retval; + struct inode *inode = file->f_path.dentry->d_inode; + + /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ + if (origin == SEEK_CUR || origin == SEEK_SET) + return generic_file_llseek(file, offset, origin); + + mutex_lock(&inode->i_mutex); + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (!retval) + retval = generic_file_llseek(file, offset, origin); + mutex_unlock(&inode->i_mutex); + + return retval; +} + +static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, + unsigned int nr_segs, size_t bytes, bool to_user) +{ + struct iov_iter ii; + int page_idx = 0; + + if (!bytes) + return 0; + + iov_iter_init(&ii, iov, nr_segs, bytes, 0); + + while (iov_iter_count(&ii)) { + struct page *page = pages[page_idx++]; + size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); + void *kaddr; + + kaddr = kmap(page); + + while (todo) { + char __user *uaddr = ii.iov->iov_base + ii.iov_offset; + size_t iov_len = ii.iov->iov_len - ii.iov_offset; + size_t copy = min(todo, iov_len); + size_t left; + + if (!to_user) + left = copy_from_user(kaddr, uaddr, copy); + else + left = copy_to_user(uaddr, kaddr, copy); + + if (unlikely(left)) + return -EFAULT; + + iov_iter_advance(&ii, copy); + todo -= copy; + kaddr += copy; + } + + kunmap(page); + } + + return 0; +} + +/* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + + for (n = 0; n < count; n++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? */ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. + * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct fuse_req *req = NULL; + struct page **pages = NULL; + struct iovec *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; + size_t in_size, out_size, transferred; + int err; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + err = -ENOMEM; + pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); + if (!pages || !iov_page) + goto out; + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. + */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = iov_page; + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. + */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > FUSE_MAX_PAGES_PER_REQ) + goto out; + while (num_pages < max_pages) { + pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!pages[num_pages]) + goto out; + num_pages++; + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); + req->num_pages = num_pages; + + /* okay, let's send it to the client */ + req->in.h.opcode = FUSE_IOCTL; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + if (in_size) { + req->in.numargs++; + req->in.args[1].size = in_size; + req->in.argpages = 1; + + err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, + false); + if (err) + goto out; + } + + req->out.numargs = 2; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + req->out.args[1].size = out_size; + req->out.argpages = 1; + req->out.argvar = 1; + + fuse_request_send(fc, req); + err = req->out.h.error; + transferred = req->out.args[1].size; + fuse_put_request(fc, req); + req = NULL; + if (err) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + void *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + vaddr = kmap_atomic(pages[0]); + err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); + kunmap_atomic(vaddr); + if (err) + goto out; + + in_iov = iov_page; + out_iov = in_iov + in_iovs; + + err = fuse_verify_ioctl_iov(in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(out_iov, out_iovs); + if (err) + goto out; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + out: + if (req) + fuse_put_request(fc, req); + free_page((unsigned long) iov_page); + while (num_pages) + __free_page(pages[--num_pages]); + kfree(pages); + + return err ? err : outarg.result; +} +EXPORT_SYMBOL_GPL(fuse_do_ioctl); + +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + if (is_bad_inode(inode)) + return -EIO; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long fuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, 0); +} + +static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +/* + * All files which have been polled are linked to RB tree + * fuse_conn->polled_files which is indexed by kh. Walk the tree and + * find the matching one. + */ +static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, + struct rb_node **parent_out) +{ + struct rb_node **link = &fc->polled_files.rb_node; + struct rb_node *last = NULL; + + while (*link) { + struct fuse_file *ff; + + last = *link; + ff = rb_entry(last, struct fuse_file, polled_node); + + if (kh < ff->kh) + link = &last->rb_left; + else if (kh > ff->kh) + link = &last->rb_right; + else + return link; + } + + if (parent_out) + *parent_out = last; + return link; +} + +/* + * The file is about to be polled. Make sure it's on the polled_files + * RB tree. Note that files once added to the polled_files tree are + * not removed before the file is released. This is because a file + * polled once is likely to be polled again. + */ +static void fuse_register_polled_file(struct fuse_conn *fc, + struct fuse_file *ff) +{ + spin_lock(&fc->lock); + if (RB_EMPTY_NODE(&ff->polled_node)) { + struct rb_node **link, *parent; + + link = fuse_find_polled_node(fc, ff->kh, &parent); + BUG_ON(*link); + rb_link_node(&ff->polled_node, parent, link); + rb_insert_color(&ff->polled_node, &fc->polled_files); + } + spin_unlock(&fc->lock); +} + +unsigned fuse_file_poll(struct file *file, poll_table *wait) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; + struct fuse_poll_out outarg; + struct fuse_req *req; + int err; + + if (fc->no_poll) + return DEFAULT_POLLMASK; + + poll_wait(file, &ff->poll_wait, wait); + + /* + * Ask for notification iff there's someone waiting for it. + * The client may ignore the flag and always notify. + */ + if (waitqueue_active(&ff->poll_wait)) { + inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; + fuse_register_polled_file(fc, ff); + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return POLLERR; + + req->in.h.opcode = FUSE_POLL; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) + return outarg.revents; + if (err == -ENOSYS) { + fc->no_poll = 1; + return DEFAULT_POLLMASK; + } + return POLLERR; +} +EXPORT_SYMBOL_GPL(fuse_file_poll); + +/* + * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and + * wakes up the poll waiters. + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg) +{ + u64 kh = outarg->kh; + struct rb_node **link; + + spin_lock(&fc->lock); + + link = fuse_find_polled_node(fc, kh, NULL); + if (*link) { + struct fuse_file *ff; + + ff = rb_entry(*link, struct fuse_file, polled_node); + wake_up_interruptible_sync(&ff->poll_wait); + } + + spin_unlock(&fc->lock); + return 0; +} + +static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos, int rw) +{ + const struct iovec *vector = iov; + ssize_t ret = 0; + + while (nr_segs > 0) { + void __user *base; + size_t len; + ssize_t nr; + + base = vector->iov_base; + len = vector->iov_len; + vector++; + nr_segs--; + + if (rw == WRITE) + nr = __fuse_direct_write(filp, base, len, ppos); + else + nr = fuse_direct_read(filp, base, len, ppos); + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } + + return ret; +} + + +static ssize_t +fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + ssize_t ret = 0; + struct file *file = NULL; + loff_t pos = 0; + + file = iocb->ki_filp; + pos = offset; + + ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); + + return ret; +} + +static const struct file_operations fuse_file_operations = { + .llseek = fuse_file_llseek, + .read = do_sync_read, + .aio_read = fuse_file_aio_read, + .write = do_sync_write, + .aio_write = fuse_file_aio_write, + .mmap = fuse_file_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .flock = fuse_file_flock, + .splice_read = generic_file_splice_read, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, +}; + +static const struct file_operations fuse_direct_io_file_operations = { + .llseek = fuse_file_llseek, + .read = fuse_direct_read, + .write = fuse_direct_write, + .mmap = fuse_direct_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .flock = fuse_file_flock, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, + /* no splice_read */ +}; + +static const struct address_space_operations fuse_file_aops = { + .readpage = fuse_readpage, + .writepage = fuse_writepage, + .launder_page = fuse_launder_page, + .readpages = fuse_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .bmap = fuse_bmap, + .direct_IO = fuse_direct_IO, +}; + +void fuse_init_file_inode(struct inode *inode) +{ + inode->i_fop = &fuse_file_operations; + inode->i_data.a_ops = &fuse_file_aops; +} diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h new file mode 100644 index 00000000..d1819269 --- /dev/null +++ b/fs/fuse/fuse_i.h @@ -0,0 +1,784 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#ifndef _FS_FUSE_I_H +#define _FS_FUSE_I_H + +#include <linux/fuse.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/wait.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/backing-dev.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/poll.h> +#include <linux/workqueue.h> + +/** Max number of pages that can be used in a single read request */ +#define FUSE_MAX_PAGES_PER_REQ 32 + +/** Bias for fi->writectr, meaning new writepages must not be sent */ +#define FUSE_NOWRITE INT_MIN + +/** It could be as large as PATH_MAX, but would that have any uses? */ +#define FUSE_NAME_MAX 1024 + +/** Number of dentries for each connection in the control filesystem */ +#define FUSE_CTL_NUM_DENTRIES 5 + +/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem + module will check permissions based on the file mode. Otherwise no + permission checking is done in the kernel */ +#define FUSE_DEFAULT_PERMISSIONS (1 << 0) + +/** If the FUSE_ALLOW_OTHER flag is given, then not only the user + doing the mount will be allowed to access the filesystem */ +#define FUSE_ALLOW_OTHER (1 << 1) + +/** List of active connections */ +extern struct list_head fuse_conn_list; + +/** Global mutex protecting fuse_conn_list and the control filesystem */ +extern struct mutex fuse_mutex; + +/** Module parameters */ +extern unsigned max_user_bgreq; +extern unsigned max_user_congthresh; + +/* One forget request */ +struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; +}; + +/** FUSE inode */ +struct fuse_inode { + /** Inode data */ + struct inode inode; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** Number of lookups on this inode */ + u64 nlookup; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; + + /** Time in jiffies until the file attributes are valid */ + u64 i_time; + + /** The sticky bit in inode->i_mode may have been removed, so + preserve the original mode */ + umode_t orig_i_mode; + + /** 64 bit inode number */ + u64 orig_ino; + + /** Version of last attribute change */ + u64 attr_version; + + /** Files usable in writepage. Protected by fc->lock */ + struct list_head write_files; + + /** Writepages pending on truncate or fsync */ + struct list_head queued_writes; + + /** Number of sent writes, a negative bias (FUSE_NOWRITE) + * means more writes are blocked */ + int writectr; + + /** Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /** List of writepage requestst (pending or sent) */ + struct list_head writepages; +}; + +struct fuse_conn; + +/** FUSE specific file data */ +struct fuse_file { + /** Fuse connection for this file */ + struct fuse_conn *fc; + + /** Request reserved for flush and release */ + struct fuse_req *reserved_req; + + /** Kernel file handle guaranteed to be unique */ + u64 kh; + + /** File handle used by userspace */ + u64 fh; + + /** Node id of this file */ + u64 nodeid; + + /** Refcount */ + atomic_t count; + + /** FOPEN_* flags returned by open */ + u32 open_flags; + + /** Entry on inode's write_files list */ + struct list_head write_entry; + + /** RB node to be linked on fuse_conn->polled_files */ + struct rb_node polled_node; + + /** Wait queue head for poll */ + wait_queue_head_t poll_wait; + + /** Has flock been performed on this file? */ + bool flock:1; +}; + +/** One input argument of a request */ +struct fuse_in_arg { + unsigned size; + const void *value; +}; + +/** The request input */ +struct fuse_in { + /** The request header */ + struct fuse_in_header h; + + /** True if the data for the last argument is in req->pages */ + unsigned argpages:1; + + /** Number of arguments */ + unsigned numargs; + + /** Array of arguments */ + struct fuse_in_arg args[3]; +}; + +/** One output argument of a request */ +struct fuse_arg { + unsigned size; + void *value; +}; + +/** The request output */ +struct fuse_out { + /** Header returned from userspace */ + struct fuse_out_header h; + + /* + * The following bitfields are not changed during the request + * processing + */ + + /** Last argument is variable length (can be shorter than + arg->size) */ + unsigned argvar:1; + + /** Last argument is a list of pages to copy data to */ + unsigned argpages:1; + + /** Zero partially or not copied pages */ + unsigned page_zeroing:1; + + /** Pages may be replaced with new ones */ + unsigned page_replace:1; + + /** Number or arguments */ + unsigned numargs; + + /** Array of arguments */ + struct fuse_arg args[3]; +}; + +/** The request state */ +enum fuse_req_state { + FUSE_REQ_INIT = 0, + FUSE_REQ_PENDING, + FUSE_REQ_READING, + FUSE_REQ_SENT, + FUSE_REQ_WRITING, + FUSE_REQ_FINISHED +}; + +/** + * A request to the client + */ +struct fuse_req { + /** This can be on either pending processing or io lists in + fuse_conn */ + struct list_head list; + + /** Entry on the interrupts list */ + struct list_head intr_entry; + + /** refcount */ + atomic_t count; + + /** Unique ID for the interrupt request */ + u64 intr_unique; + + /* + * The following bitfields are either set once before the + * request is queued or setting/clearing them is protected by + * fuse_conn->lock + */ + + /** True if the request has reply */ + unsigned isreply:1; + + /** Force sending of the request even if interrupted */ + unsigned force:1; + + /** The request was aborted */ + unsigned aborted:1; + + /** Request is sent in the background */ + unsigned background:1; + + /** The request has been interrupted */ + unsigned interrupted:1; + + /** Data is being copied to/from the request */ + unsigned locked:1; + + /** Request is counted as "waiting" */ + unsigned waiting:1; + + /** State of the request */ + enum fuse_req_state state; + + /** The request input */ + struct fuse_in in; + + /** The request output */ + struct fuse_out out; + + /** Used to wake up the task waiting for completion of request*/ + wait_queue_head_t waitq; + + /** Data for asynchronous requests */ + union { + struct { + union { + struct fuse_release_in in; + struct work_struct work; + }; + struct path path; + } release; + struct fuse_init_in init_in; + struct fuse_init_out init_out; + struct cuse_init_in cuse_init_in; + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + struct fuse_notify_retrieve_in retrieve_in; + struct fuse_lk_in lk_in; + } misc; + + /** page vector */ + struct page *pages[FUSE_MAX_PAGES_PER_REQ]; + + /** number of pages in vector */ + unsigned num_pages; + + /** offset of data on first page */ + unsigned page_offset; + + /** File used in the request (or NULL) */ + struct fuse_file *ff; + + /** Inode used in the request or NULL */ + struct inode *inode; + + /** Link on fi->writepages */ + struct list_head writepages_entry; + + /** Request completion callback */ + void (*end)(struct fuse_conn *, struct fuse_req *); + + /** Request is stolen from fuse_file->reserved_req */ + struct file *stolen_file; +}; + +/** + * A Fuse connection. + * + * This structure is created, when the filesystem is mounted, and is + * destroyed, when the client device is closed and the filesystem is + * unmounted. + */ +struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** Mutex protecting against directory alias creation */ + struct mutex inst_mutex; + + /** Refcount */ + atomic_t count; + + /** The user id for this mount */ + uid_t user_id; + + /** The group id for this mount */ + gid_t group_id; + + /** The fuse mount flags for this mount */ + unsigned flags; + + /** Maximum read size */ + unsigned max_read; + + /** Maximum write size */ + unsigned max_write; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The list of pending requests */ + struct list_head pending; + + /** The list of requests being processed */ + struct list_head processing; + + /** The list of requests under I/O */ + struct list_head io; + + /** The next unique kernel file handle */ + u64 khctr; + + /** rbtree of fuse_files waiting for poll events indexed by ph */ + struct rb_root polled_files; + + /** Maximum number of outstanding background requests */ + unsigned max_background; + + /** Number of background requests at which congestion starts */ + unsigned congestion_threshold; + + /** Number of requests currently in the background */ + unsigned num_background; + + /** Number of background requests currently queued for userspace */ + unsigned active_background; + + /** The list of background requests set aside for later queuing */ + struct list_head bg_queue; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** Flag indicating if connection is blocked. This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + + /** waitq for reserved requests */ + wait_queue_head_t reserved_req_waitq; + + /** The next unique request id */ + u64 reqctr; + + /** Connection established, cleared on umount, connection + abort and device release */ + unsigned connected; + + /** Connection failed (version mismatch). Cannot race with + setting other bitfields since it is only set once in INIT + reply, before any other request, and never cleared */ + unsigned conn_error:1; + + /** Connection successful. Only set in INIT */ + unsigned conn_init:1; + + /** Do readpages asynchronously? Only set in INIT */ + unsigned async_read:1; + + /** Do not send separate SETATTR request before open(O_TRUNC) */ + unsigned atomic_o_trunc:1; + + /** Filesystem supports NFS exporting. Only set in INIT */ + unsigned export_support:1; + + /** Set if bdi is valid */ + unsigned bdi_initialized:1; + + /* + * The following bitfields are only for optimization purposes + * and hence races in setting them will not cause malfunction + */ + + /** Is fsync not implemented by fs? */ + unsigned no_fsync:1; + + /** Is fsyncdir not implemented by fs? */ + unsigned no_fsyncdir:1; + + /** Is flush not implemented by fs? */ + unsigned no_flush:1; + + /** Is setxattr not implemented by fs? */ + unsigned no_setxattr:1; + + /** Is getxattr not implemented by fs? */ + unsigned no_getxattr:1; + + /** Is listxattr not implemented by fs? */ + unsigned no_listxattr:1; + + /** Is removexattr not implemented by fs? */ + unsigned no_removexattr:1; + + /** Are posix file locking primitives not implemented by fs? */ + unsigned no_lock:1; + + /** Is access not implemented by fs? */ + unsigned no_access:1; + + /** Is create not implemented by fs? */ + unsigned no_create:1; + + /** Is interrupt not implemented by fs? */ + unsigned no_interrupt:1; + + /** Is bmap not implemented by fs? */ + unsigned no_bmap:1; + + /** Is poll not implemented by fs? */ + unsigned no_poll:1; + + /** Do multi-page cached writes */ + unsigned big_writes:1; + + /** Don't apply umask to creation modes */ + unsigned dont_mask:1; + + /** Are BSD file locking primitives not implemented by fs? */ + unsigned no_flock:1; + + /** The number of requests waiting for completion */ + atomic_t num_waiting; + + /** Negotiated minor version */ + unsigned minor; + + /** Backing dev info */ + struct backing_dev_info bdi; + + /** Entry on the fuse_conn_list */ + struct list_head entry; + + /** Device ID from super block */ + dev_t dev; + + /** Dentries in the control filesystem */ + struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; + + /** number of dentries used in the above array */ + int ctl_ndents; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; + + /** Key for lock owner ID scrambling */ + u32 scramble_key[4]; + + /** Reserved request for the DESTROY message */ + struct fuse_req *destroy_req; + + /** Version counter for attribute changes */ + u64 attr_version; + + /** Called on final put */ + void (*release)(struct fuse_conn *); + + /** Super block for this connection. */ + struct super_block *sb; + + /** Read/write semaphore to hold when accessing sb. */ + struct rw_semaphore killsb; +}; + +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct fuse_conn *get_fuse_conn(struct inode *inode) +{ + return get_fuse_conn_super(inode->i_sb); +} + +static inline struct fuse_inode *get_fuse_inode(struct inode *inode) +{ + return container_of(inode, struct fuse_inode, inode); +} + +static inline u64 get_node_id(struct inode *inode) +{ + return get_fuse_inode(inode)->nodeid; +} + +/** Device operations */ +extern const struct file_operations fuse_dev_operations; + +extern const struct dentry_operations fuse_dentry_operations; + +/** + * Inode to nodeid comparison. + */ +int fuse_inode_eq(struct inode *inode, void *_nodeidp); + +/** + * Get a filled in inode + */ +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + +/** + * Send FORGET command + */ +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); + +/** + * Initialize READ or READDIR request + */ +void fuse_read_fill(struct fuse_req *req, struct file *file, + loff_t pos, size_t count, int opcode); + +/** + * Send OPEN or OPENDIR request + */ +int fuse_open_common(struct inode *inode, struct file *file, bool isdir); + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +struct fuse_file *fuse_file_get(struct fuse_file *ff); +void fuse_file_free(struct fuse_file *ff); +void fuse_finish_open(struct inode *inode, struct file *file); + +void fuse_sync_release(struct fuse_file *ff, int flags); + +/** + * Send RELEASE or RELEASEDIR request + */ +void fuse_release_common(struct file *file, int opcode); + +/** + * Send FSYNC or FSYNCDIR request + */ +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int isdir); + +/** + * Notify poll wakeup + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg); + +/** + * Initialize file operations on a regular file + */ +void fuse_init_file_inode(struct inode *inode); + +/** + * Initialize inode operations on regular files and special files + */ +void fuse_init_common(struct inode *inode); + +/** + * Initialize inode and file operations on a directory + */ +void fuse_init_dir(struct inode *inode); + +/** + * Initialize inode operations on a symlink + */ +void fuse_init_symlink(struct inode *inode); + +/** + * Change attributes of an inode + */ +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid); + +/** + * Initialize the client device + */ +int fuse_dev_init(void); + +/** + * Cleanup the client device + */ +void fuse_dev_cleanup(void); + +int fuse_ctl_init(void); +void fuse_ctl_cleanup(void); + +/** + * Allocate a request + */ +struct fuse_req *fuse_request_alloc(void); + +struct fuse_req *fuse_request_alloc_nofs(void); + +/** + * Free a request + */ +void fuse_request_free(struct fuse_req *req); + +/** + * Get a request, may fail with -ENOMEM + */ +struct fuse_req *fuse_get_req(struct fuse_conn *fc); + +/** + * Gets a requests for a file operation, always succeeds + */ +struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); + +/** + * Decrement reference count of a request. If count goes to zero free + * the request. + */ +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); + +/** + * Send a request (synchronous) + */ +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); + +/** + * Send a request in the background + */ +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); + +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req); + +/* Abort all requests */ +void fuse_abort_conn(struct fuse_conn *fc); + +/** + * Invalidate inode attributes + */ +void fuse_invalidate_attr(struct inode *inode); + +void fuse_invalidate_entry_cache(struct dentry *entry); + +/** + * Acquire reference to fuse_conn + */ +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); + +void fuse_conn_kill(struct fuse_conn *fc); + +/** + * Initialize fuse_conn + */ +void fuse_conn_init(struct fuse_conn *fc); + +/** + * Release reference to fuse_conn + */ +void fuse_conn_put(struct fuse_conn *fc); + +/** + * Add connection to control filesystem + */ +int fuse_ctl_add_conn(struct fuse_conn *fc); + +/** + * Remove connection from control filesystem + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc); + +/** + * Is file type valid? + */ +int fuse_valid_type(int m); + +/** + * Is task allowed to perform filesystem operation? + */ +int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); + +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); + +int fuse_update_attributes(struct inode *inode, struct kstat *stat, + struct file *file, bool *refreshed); + +void fuse_flush_writepages(struct inode *inode); + +void fuse_set_nowrite(struct inode *inode); +void fuse_release_nowrite(struct inode *inode); + +u64 fuse_get_attr_version(struct fuse_conn *fc); + +/** + * File-system tells the kernel to invalidate cache for the given node id. + */ +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len); + +/** + * File-system tells the kernel to invalidate parent attributes and + * the dentry matching parent/name. + * + * If the child_nodeid is non-zero and: + * - matches the inode number for the dentry matching parent/name, + * - is not a mount point + * - is a file or oan empty directory + * then the dentry is unhashed (d_delete()). + */ +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + u64 child_nodeid, struct qstr *name); + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir); +ssize_t fuse_direct_io(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, int write); +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags); +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags); +unsigned fuse_file_poll(struct file *file, poll_table *wait); +int fuse_dev_release(struct inode *inode, struct file *file); + +void fuse_write_update_size(struct inode *inode, loff_t pos); + +#endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c new file mode 100644 index 00000000..a59cf5e6 --- /dev/null +++ b/fs/fuse/inode.c @@ -0,0 +1,1267 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/parser.h> +#include <linux/statfs.h> +#include <linux/random.h> +#include <linux/sched.h> +#include <linux/exportfs.h> + +MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); +MODULE_DESCRIPTION("Filesystem in Userspace"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *fuse_inode_cachep; +struct list_head fuse_conn_list; +DEFINE_MUTEX(fuse_mutex); + +static int set_global_limit(const char *val, struct kernel_param *kp); + +unsigned max_user_bgreq; +module_param_call(max_user_bgreq, set_global_limit, param_get_uint, + &max_user_bgreq, 0644); +__MODULE_PARM_TYPE(max_user_bgreq, "uint"); +MODULE_PARM_DESC(max_user_bgreq, + "Global limit for the maximum number of backgrounded requests an " + "unprivileged user can set"); + +unsigned max_user_congthresh; +module_param_call(max_user_congthresh, set_global_limit, param_get_uint, + &max_user_congthresh, 0644); +__MODULE_PARM_TYPE(max_user_congthresh, "uint"); +MODULE_PARM_DESC(max_user_congthresh, + "Global limit for the maximum congestion threshold an " + "unprivileged user can set"); + +#define FUSE_SUPER_MAGIC 0x65735546 + +#define FUSE_DEFAULT_BLKSIZE 512 + +/** Maximum number of outstanding background requests */ +#define FUSE_DEFAULT_MAX_BACKGROUND 12 + +/** Congestion starts at 75% of maximum */ +#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + +struct fuse_mount_data { + int fd; + unsigned rootmode; + unsigned user_id; + unsigned group_id; + unsigned fd_present:1; + unsigned rootmode_present:1; + unsigned user_id_present:1; + unsigned group_id_present:1; + unsigned flags; + unsigned max_read; + unsigned blksize; +}; + +struct fuse_forget_link *fuse_alloc_forget(void) +{ + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +} + +static struct inode *fuse_alloc_inode(struct super_block *sb) +{ + struct inode *inode; + struct fuse_inode *fi; + + inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); + if (!inode) + return NULL; + + fi = get_fuse_inode(inode); + fi->i_time = 0; + fi->nodeid = 0; + fi->nlookup = 0; + fi->attr_version = 0; + fi->writectr = 0; + fi->orig_ino = 0; + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + INIT_LIST_HEAD(&fi->writepages); + init_waitqueue_head(&fi->page_waitq); + fi->forget = fuse_alloc_forget(); + if (!fi->forget) { + kmem_cache_free(fuse_inode_cachep, inode); + return NULL; + } + + return inode; +} + +static void fuse_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(fuse_inode_cachep, inode); +} + +static void fuse_destroy_inode(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + BUG_ON(!list_empty(&fi->write_files)); + BUG_ON(!list_empty(&fi->queued_writes)); + kfree(fi->forget); + call_rcu(&inode->i_rcu, fuse_i_callback); +} + +static void fuse_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + if (inode->i_sb->s_flags & MS_ACTIVE) { + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); + fi->forget = NULL; + } +} + +static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) +{ + if (*flags & MS_MANDLOCK) + return -EINVAL; + + return 0; +} + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. + */ +static ino_t fuse_squash_ino(u64 ino64) +{ + ino_t ino = (ino_t) ino64; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; + return ino; +} + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->attr_version = ++fc->attr_version; + fi->i_time = attr_valid; + + inode->i_ino = fuse_squash_ino(attr->ino); + inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + set_nlink(inode, attr->nlink); + inode->i_uid = attr->uid; + inode->i_gid = attr->gid; + inode->i_blocks = attr->blocks; + inode->i_atime.tv_sec = attr->atime; + inode->i_atime.tv_nsec = attr->atimensec; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + + if (attr->blksize != 0) + inode->i_blkbits = ilog2(attr->blksize); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + + /* + * Don't set the sticky bit in i_mode, unless we want the VFS + * to check permissions. This prevents failures due to the + * check in may_delete(). + */ + fi->orig_i_mode = inode->i_mode; + if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + inode->i_mode &= ~S_ISVTX; + + fi->orig_ino = attr->ino; +} + +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t oldsize; + + spin_lock(&fc->lock); + if (attr_version != 0 && fi->attr_version > attr_version) { + spin_unlock(&fc->lock); + return; + } + + fuse_change_attributes_common(inode, attr, attr_valid); + + oldsize = inode->i_size; + i_size_write(inode, attr->size); + spin_unlock(&fc->lock); + + if (S_ISREG(inode->i_mode) && oldsize != attr->size) { + truncate_pagecache(inode, oldsize, attr->size); + invalidate_inode_pages2(inode->i_mapping); + } +} + +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +{ + inode->i_mode = attr->mode & S_IFMT; + inode->i_size = attr->size; + if (S_ISREG(inode->i_mode)) { + fuse_init_common(inode); + fuse_init_file_inode(inode); + } else if (S_ISDIR(inode->i_mode)) + fuse_init_dir(inode); + else if (S_ISLNK(inode->i_mode)) + fuse_init_symlink(inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + fuse_init_common(inode); + init_special_inode(inode, inode->i_mode, + new_decode_dev(attr->rdev)); + } else + BUG(); +} + +int fuse_inode_eq(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + if (get_node_id(inode) == nodeid) + return 1; + else + return 0; +} + +static int fuse_inode_set(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + get_fuse_inode(inode)->nodeid = nodeid; + return 0; +} + +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct inode *inode; + struct fuse_inode *fi; + struct fuse_conn *fc = get_fuse_conn_super(sb); + + retry: + inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); + if (!inode) + return NULL; + + if ((inode->i_state & I_NEW)) { + inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_generation = generation; + inode->i_data.backing_dev_info = &fc->bdi; + fuse_init_inode(inode, attr); + unlock_new_inode(inode); + } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { + /* Inode has changed type, any I/O on the old should fail */ + make_bad_inode(inode); + iput(inode); + goto retry; + } + + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + fuse_change_attributes(inode, attr, attr_valid, attr_version); + + return inode; +} + +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len) +{ + struct inode *inode; + pgoff_t pg_start; + pgoff_t pg_end; + + inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + return -ENOENT; + + fuse_invalidate_attr(inode); + if (offset >= 0) { + pg_start = offset >> PAGE_CACHE_SHIFT; + if (len <= 0) + pg_end = -1; + else + pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + } + iput(inode); + return 0; +} + +static void fuse_umount_begin(struct super_block *sb) +{ + fuse_abort_conn(get_fuse_conn_super(sb)); +} + +static void fuse_send_destroy(struct fuse_conn *fc) +{ + struct fuse_req *req = fc->destroy_req; + if (req && fc->conn_init) { + fc->destroy_req = NULL; + req->in.h.opcode = FUSE_DESTROY; + req->force = 1; + fuse_request_send(fc, req); + fuse_put_request(fc, req); + } +} + +static void fuse_bdi_destroy(struct fuse_conn *fc) +{ + if (fc->bdi_initialized) + bdi_destroy(&fc->bdi); +} + +void fuse_conn_kill(struct fuse_conn *fc) +{ + spin_lock(&fc->lock); + fc->connected = 0; + fc->blocked = 0; + spin_unlock(&fc->lock); + /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + wake_up_all(&fc->waitq); + wake_up_all(&fc->blocked_waitq); + wake_up_all(&fc->reserved_req_waitq); + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); + fuse_bdi_destroy(fc); +} +EXPORT_SYMBOL_GPL(fuse_conn_kill); + +static void fuse_put_super(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fuse_send_destroy(fc); + fuse_conn_kill(fc); + fuse_conn_put(fc); +} + +static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) +{ + stbuf->f_type = FUSE_SUPER_MAGIC; + stbuf->f_bsize = attr->bsize; + stbuf->f_frsize = attr->frsize; + stbuf->f_blocks = attr->blocks; + stbuf->f_bfree = attr->bfree; + stbuf->f_bavail = attr->bavail; + stbuf->f_files = attr->files; + stbuf->f_ffree = attr->ffree; + stbuf->f_namelen = attr->namelen; + /* fsid is left zero */ +} + +static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_req *req; + struct fuse_statfs_out outarg; + int err; + + if (!fuse_allow_task(fc, current)) { + buf->f_type = FUSE_SUPER_MAGIC; + return 0; + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&outarg, 0, sizeof(outarg)); + req->in.numargs = 0; + req->in.h.opcode = FUSE_STATFS; + req->in.h.nodeid = get_node_id(dentry->d_inode); + req->out.numargs = 1; + req->out.args[0].size = + fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + if (!err) + convert_fuse_statfs(buf, &outarg.st); + fuse_put_request(fc, req); + return err; +} + +enum { + OPT_FD, + OPT_ROOTMODE, + OPT_USER_ID, + OPT_GROUP_ID, + OPT_DEFAULT_PERMISSIONS, + OPT_ALLOW_OTHER, + OPT_MAX_READ, + OPT_BLKSIZE, + OPT_ERR +}; + +static const match_table_t tokens = { + {OPT_FD, "fd=%u"}, + {OPT_ROOTMODE, "rootmode=%o"}, + {OPT_USER_ID, "user_id=%u"}, + {OPT_GROUP_ID, "group_id=%u"}, + {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, + {OPT_ALLOW_OTHER, "allow_other"}, + {OPT_MAX_READ, "max_read=%u"}, + {OPT_BLKSIZE, "blksize=%u"}, + {OPT_ERR, NULL} +}; + +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) +{ + char *p; + memset(d, 0, sizeof(struct fuse_mount_data)); + d->max_read = ~0; + d->blksize = FUSE_DEFAULT_BLKSIZE; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + int value; + substring_t args[MAX_OPT_ARGS]; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case OPT_FD: + if (match_int(&args[0], &value)) + return 0; + d->fd = value; + d->fd_present = 1; + break; + + case OPT_ROOTMODE: + if (match_octal(&args[0], &value)) + return 0; + if (!fuse_valid_type(value)) + return 0; + d->rootmode = value; + d->rootmode_present = 1; + break; + + case OPT_USER_ID: + if (match_int(&args[0], &value)) + return 0; + d->user_id = value; + d->user_id_present = 1; + break; + + case OPT_GROUP_ID: + if (match_int(&args[0], &value)) + return 0; + d->group_id = value; + d->group_id_present = 1; + break; + + case OPT_DEFAULT_PERMISSIONS: + d->flags |= FUSE_DEFAULT_PERMISSIONS; + break; + + case OPT_ALLOW_OTHER: + d->flags |= FUSE_ALLOW_OTHER; + break; + + case OPT_MAX_READ: + if (match_int(&args[0], &value)) + return 0; + d->max_read = value; + break; + + case OPT_BLKSIZE: + if (!is_bdev || match_int(&args[0], &value)) + return 0; + d->blksize = value; + break; + + default: + return 0; + } + } + + if (!d->fd_present || !d->rootmode_present || + !d->user_id_present || !d->group_id_present) + return 0; + + return 1; +} + +static int fuse_show_options(struct seq_file *m, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); + + seq_printf(m, ",user_id=%u", fc->user_id); + seq_printf(m, ",group_id=%u", fc->group_id); + if (fc->flags & FUSE_DEFAULT_PERMISSIONS) + seq_puts(m, ",default_permissions"); + if (fc->flags & FUSE_ALLOW_OTHER) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); + return 0; +} + +void fuse_conn_init(struct fuse_conn *fc) +{ + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); + mutex_init(&fc->inst_mutex); + init_rwsem(&fc->killsb); + atomic_set(&fc->count, 1); + init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->reserved_req_waitq); + INIT_LIST_HEAD(&fc->pending); + INIT_LIST_HEAD(&fc->processing); + INIT_LIST_HEAD(&fc->io); + INIT_LIST_HEAD(&fc->interrupts); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + fc->forget_list_tail = &fc->forget_list_head; + atomic_set(&fc->num_waiting, 0); + fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; + fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; + fc->khctr = 0; + fc->polled_files = RB_ROOT; + fc->reqctr = 0; + fc->blocked = 1; + fc->attr_version = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); +} +EXPORT_SYMBOL_GPL(fuse_conn_init); + +void fuse_conn_put(struct fuse_conn *fc) +{ + if (atomic_dec_and_test(&fc->count)) { + if (fc->destroy_req) + fuse_request_free(fc->destroy_req); + mutex_destroy(&fc->inst_mutex); + fc->release(fc); + } +} +EXPORT_SYMBOL_GPL(fuse_conn_put); + +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) +{ + atomic_inc(&fc->count); + return fc; +} +EXPORT_SYMBOL_GPL(fuse_conn_get); + +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +{ + struct fuse_attr attr; + memset(&attr, 0, sizeof(attr)); + + attr.mode = mode; + attr.ino = FUSE_ROOT_ID; + attr.nlink = 1; + return fuse_iget(sb, 1, 0, &attr, 0, 0); +} + +struct fuse_inode_handle { + u64 nodeid; + u32 generation; +}; + +static struct dentry *fuse_get_dentry(struct super_block *sb, + struct fuse_inode_handle *handle) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct inode *inode; + struct dentry *entry; + int err = -ESTALE; + + if (handle->nodeid == 0) + goto out_err; + + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + if (!inode) { + struct fuse_entry_out outarg; + struct qstr name; + + if (!fc->export_support) + goto out_err; + + name.len = 1; + name.name = "."; + err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, + &inode); + if (err && err != -ENOENT) + goto out_err; + if (err || !inode) { + err = -ESTALE; + goto out_err; + } + err = -EIO; + if (get_node_id(inode) != handle->nodeid) + goto out_iput; + } + err = -ESTALE; + if (inode->i_generation != handle->generation) + goto out_iput; + + entry = d_obtain_alias(inode); + if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(entry); + + return entry; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + bool encode_parent = connectable && !S_ISDIR(inode->i_mode); + int len = encode_parent ? 6 : 3; + u64 nodeid; + u32 generation; + + if (*max_len < len) { + *max_len = len; + return 255; + } + + nodeid = get_fuse_inode(inode)->nodeid; + generation = inode->i_generation; + + fh[0] = (u32)(nodeid >> 32); + fh[1] = (u32)(nodeid & 0xffffffff); + fh[2] = generation; + + if (encode_parent) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + nodeid = get_fuse_inode(parent)->nodeid; + generation = parent->i_generation; + spin_unlock(&dentry->d_lock); + + fh[3] = (u32)(nodeid >> 32); + fh[4] = (u32)(nodeid & 0xffffffff); + fh[5] = generation; + } + + *max_len = len; + return encode_parent ? 0x82 : 0x81; +} + +static struct dentry *fuse_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle handle; + + if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + return NULL; + + handle.nodeid = (u64) fid->raw[0] << 32; + handle.nodeid |= (u64) fid->raw[1]; + handle.generation = fid->raw[2]; + return fuse_get_dentry(sb, &handle); +} + +static struct dentry *fuse_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle parent; + + if (fh_type != 0x82 || fh_len < 6) + return NULL; + + parent.nodeid = (u64) fid->raw[3] << 32; + parent.nodeid |= (u64) fid->raw[4]; + parent.generation = fid->raw[5]; + return fuse_get_dentry(sb, &parent); +} + +static struct dentry *fuse_get_parent(struct dentry *child) +{ + struct inode *child_inode = child->d_inode; + struct fuse_conn *fc = get_fuse_conn(child_inode); + struct inode *inode; + struct dentry *parent; + struct fuse_entry_out outarg; + struct qstr name; + int err; + + if (!fc->export_support) + return ERR_PTR(-ESTALE); + + name.len = 2; + name.name = ".."; + err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), + &name, &outarg, &inode); + if (err) { + if (err == -ENOENT) + return ERR_PTR(-ESTALE); + return ERR_PTR(err); + } + + parent = d_obtain_alias(inode); + if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(parent); + + return parent; +} + +static const struct export_operations fuse_export_operations = { + .fh_to_dentry = fuse_fh_to_dentry, + .fh_to_parent = fuse_fh_to_parent, + .encode_fh = fuse_encode_fh, + .get_parent = fuse_get_parent, +}; + +static const struct super_operations fuse_super_operations = { + .alloc_inode = fuse_alloc_inode, + .destroy_inode = fuse_destroy_inode, + .evict_inode = fuse_evict_inode, + .drop_inode = generic_delete_inode, + .remount_fs = fuse_remount_fs, + .put_super = fuse_put_super, + .umount_begin = fuse_umount_begin, + .statfs = fuse_statfs, + .show_options = fuse_show_options, +}; + +static void sanitize_global_limit(unsigned *limit) +{ + if (*limit == 0) + *limit = ((num_physpages << PAGE_SHIFT) >> 13) / + sizeof(struct fuse_req); + + if (*limit >= 1 << 16) + *limit = (1 << 16) - 1; +} + +static int set_global_limit(const char *val, struct kernel_param *kp) +{ + int rv; + + rv = param_set_uint(val, kp); + if (rv) + return rv; + + sanitize_global_limit((unsigned *)kp->arg); + + return 0; +} + +static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) +{ + int cap_sys_admin = capable(CAP_SYS_ADMIN); + + if (arg->minor < 13) + return; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + if (arg->max_background) { + fc->max_background = arg->max_background; + + if (!cap_sys_admin && fc->max_background > max_user_bgreq) + fc->max_background = max_user_bgreq; + } + if (arg->congestion_threshold) { + fc->congestion_threshold = arg->congestion_threshold; + + if (!cap_sys_admin && + fc->congestion_threshold > max_user_congthresh) + fc->congestion_threshold = max_user_congthresh; + } +} + +static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_init_out *arg = &req->misc.init_out; + + if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) + fc->conn_error = 1; + else { + unsigned long ra_pages; + + process_init_limits(fc, arg); + + if (arg->minor >= 6) { + ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; + if (arg->flags & FUSE_ASYNC_READ) + fc->async_read = 1; + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_lock = 1; + if (arg->minor >= 17) { + if (!(arg->flags & FUSE_FLOCK_LOCKS)) + fc->no_flock = 1; + } else { + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_flock = 1; + } + if (arg->flags & FUSE_ATOMIC_O_TRUNC) + fc->atomic_o_trunc = 1; + if (arg->minor >= 9) { + /* LOOKUP has dependency on proto version */ + if (arg->flags & FUSE_EXPORT_SUPPORT) + fc->export_support = 1; + } + if (arg->flags & FUSE_BIG_WRITES) + fc->big_writes = 1; + if (arg->flags & FUSE_DONT_MASK) + fc->dont_mask = 1; + } else { + ra_pages = fc->max_read / PAGE_CACHE_SIZE; + fc->no_lock = 1; + fc->no_flock = 1; + } + + fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); + fc->minor = arg->minor; + fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + fc->max_write = max_t(unsigned, 4096, fc->max_write); + fc->conn_init = 1; + } + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); +} + +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_init_in *arg = &req->misc.init_in; + + arg->major = FUSE_KERNEL_VERSION; + arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; + arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | + FUSE_FLOCK_LOCKS; + req->in.h.opcode = FUSE_INIT; + req->in.numargs = 1; + req->in.args[0].size = sizeof(*arg); + req->in.args[0].value = arg; + req->out.numargs = 1; + /* Variable length argument used for backward compatibility + with interface version < 7.5. Rest of init_out is zeroed + by do_get_request(), so a short reply is not a problem */ + req->out.argvar = 1; + req->out.args[0].size = sizeof(struct fuse_init_out); + req->out.args[0].value = &req->misc.init_out; + req->end = process_init_reply; + fuse_request_send_background(fc, req); +} + +static void fuse_free_conn(struct fuse_conn *fc) +{ + kfree(fc); +} + +static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) +{ + int err; + + fc->bdi.name = "fuse"; + fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + /* fuse does it's own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + + err = bdi_init(&fc->bdi); + if (err) + return err; + + fc->bdi_initialized = 1; + + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); + } + + if (err) + return err; + + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. + * + * Privileged users can raise it by writing to + * + * /sys/class/bdi/<bdi>/max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); + + return 0; +} + +static int fuse_fill_super(struct super_block *sb, void *data, int silent) +{ + struct fuse_conn *fc; + struct inode *root; + struct fuse_mount_data d; + struct file *file; + struct dentry *root_dentry; + struct fuse_req *init_req; + int err; + int is_bdev = sb->s_bdev != NULL; + + err = -EINVAL; + if (sb->s_flags & MS_MANDLOCK) + goto err; + + sb->s_flags &= ~MS_NOSEC; + + if (!parse_fuse_opt((char *) data, &d, is_bdev)) + goto err; + + if (is_bdev) { +#ifdef CONFIG_BLOCK + err = -EINVAL; + if (!sb_set_blocksize(sb, d.blksize)) + goto err; +#endif + } else { + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + } + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + sb->s_export_op = &fuse_export_operations; + + file = fget(d.fd); + err = -EINVAL; + if (!file) + goto err; + + if (file->f_op != &fuse_dev_operations) + goto err_fput; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) + goto err_fput; + + fuse_conn_init(fc); + + fc->dev = sb->s_dev; + fc->sb = sb; + err = fuse_bdi_init(fc, sb); + if (err) + goto err_put_conn; + + sb->s_bdi = &fc->bdi; + + /* Handle umasking inside the fuse code */ + if (sb->s_flags & MS_POSIXACL) + fc->dont_mask = 1; + sb->s_flags |= MS_POSIXACL; + + fc->release = fuse_free_conn; + fc->flags = d.flags; + fc->user_id = d.user_id; + fc->group_id = d.group_id; + fc->max_read = max_t(unsigned, 4096, d.max_read); + + /* Used by get_root_inode() */ + sb->s_fs_info = fc; + + err = -ENOMEM; + root = fuse_get_root_inode(sb, d.rootmode); + root_dentry = d_make_root(root); + if (!root_dentry) + goto err_put_conn; + /* only now - we want root dentry with NULL ->d_op */ + sb->s_d_op = &fuse_dentry_operations; + + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + + if (is_bdev) { + fc->destroy_req = fuse_request_alloc(); + if (!fc->destroy_req) + goto err_free_init_req; + } + + mutex_lock(&fuse_mutex); + err = -EINVAL; + if (file->private_data) + goto err_unlock; + + err = fuse_ctl_add_conn(fc); + if (err) + goto err_unlock; + + list_add_tail(&fc->entry, &fuse_conn_list); + sb->s_root = root_dentry; + fc->connected = 1; + file->private_data = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); + + fuse_send_init(fc, init_req); + + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); + err_free_init_req: + fuse_request_free(init_req); + err_put_root: + dput(root_dentry); + err_put_conn: + fuse_bdi_destroy(fc); + fuse_conn_put(fc); + err_fput: + fput(file); + err: + return err; +} + +static struct dentry *fuse_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); +} + +static void fuse_kill_sb_anon(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_anon_super(sb); +} + +static struct file_system_type fuse_fs_type = { + .owner = THIS_MODULE, + .name = "fuse", + .fs_flags = FS_HAS_SUBTYPE, + .mount = fuse_mount, + .kill_sb = fuse_kill_sb_anon, +}; + +#ifdef CONFIG_BLOCK +static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data) +{ + return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); +} + +static void fuse_kill_sb_blk(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_block_super(sb); +} + +static struct file_system_type fuseblk_fs_type = { + .owner = THIS_MODULE, + .name = "fuseblk", + .mount = fuse_mount_blk, + .kill_sb = fuse_kill_sb_blk, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, +}; + +static inline int register_fuseblk(void) +{ + return register_filesystem(&fuseblk_fs_type); +} + +static inline void unregister_fuseblk(void) +{ + unregister_filesystem(&fuseblk_fs_type); +} +#else +static inline int register_fuseblk(void) +{ + return 0; +} + +static inline void unregister_fuseblk(void) +{ +} +#endif + +static void fuse_inode_init_once(void *foo) +{ + struct inode *inode = foo; + + inode_init_once(inode); +} + +static int __init fuse_fs_init(void) +{ + int err; + + fuse_inode_cachep = kmem_cache_create("fuse_inode", + sizeof(struct fuse_inode), + 0, SLAB_HWCACHE_ALIGN, + fuse_inode_init_once); + err = -ENOMEM; + if (!fuse_inode_cachep) + goto out; + + err = register_fuseblk(); + if (err) + goto out2; + + err = register_filesystem(&fuse_fs_type); + if (err) + goto out3; + + return 0; + + out3: + unregister_fuseblk(); + out2: + kmem_cache_destroy(fuse_inode_cachep); + out: + return err; +} + +static void fuse_fs_cleanup(void) +{ + unregister_filesystem(&fuse_fs_type); + unregister_fuseblk(); + kmem_cache_destroy(fuse_inode_cachep); +} + +static struct kobject *fuse_kobj; +static struct kobject *connections_kobj; + +static int fuse_sysfs_init(void) +{ + int err; + + fuse_kobj = kobject_create_and_add("fuse", fs_kobj); + if (!fuse_kobj) { + err = -ENOMEM; + goto out_err; + } + + connections_kobj = kobject_create_and_add("connections", fuse_kobj); + if (!connections_kobj) { + err = -ENOMEM; + goto out_fuse_unregister; + } + + return 0; + + out_fuse_unregister: + kobject_put(fuse_kobj); + out_err: + return err; +} + +static void fuse_sysfs_cleanup(void) +{ + kobject_put(connections_kobj); + kobject_put(fuse_kobj); +} + +static int __init fuse_init(void) +{ + int res; + + printk(KERN_INFO "fuse init (API version %i.%i)\n", + FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); + + INIT_LIST_HEAD(&fuse_conn_list); + res = fuse_fs_init(); + if (res) + goto err; + + res = fuse_dev_init(); + if (res) + goto err_fs_cleanup; + + res = fuse_sysfs_init(); + if (res) + goto err_dev_cleanup; + + res = fuse_ctl_init(); + if (res) + goto err_sysfs_cleanup; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + return 0; + + err_sysfs_cleanup: + fuse_sysfs_cleanup(); + err_dev_cleanup: + fuse_dev_cleanup(); + err_fs_cleanup: + fuse_fs_cleanup(); + err: + return res; +} + +static void __exit fuse_exit(void) +{ + printk(KERN_DEBUG "fuse exit\n"); + + fuse_ctl_cleanup(); + fuse_sysfs_cleanup(); + fuse_fs_cleanup(); + fuse_dev_cleanup(); +} + +module_init(fuse_init); +module_exit(fuse_exit); |