summaryrefslogtreecommitdiff
path: root/ANDROID_3.4.5/fs/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'ANDROID_3.4.5/fs/ceph')
-rw-r--r--ANDROID_3.4.5/fs/ceph/Kconfig18
-rw-r--r--ANDROID_3.4.5/fs/ceph/Makefile11
-rw-r--r--ANDROID_3.4.5/fs/ceph/addr.c1234
-rw-r--r--ANDROID_3.4.5/fs/ceph/caps.c3088
-rw-r--r--ANDROID_3.4.5/fs/ceph/ceph_frag.c22
-rw-r--r--ANDROID_3.4.5/fs/ceph/debugfs.c273
-rw-r--r--ANDROID_3.4.5/fs/ceph/dir.c1376
-rw-r--r--ANDROID_3.4.5/fs/ceph/export.c253
-rw-r--r--ANDROID_3.4.5/fs/ceph/file.c874
-rw-r--r--ANDROID_3.4.5/fs/ceph/inode.c1811
-rw-r--r--ANDROID_3.4.5/fs/ceph/ioctl.c290
-rw-r--r--ANDROID_3.4.5/fs/ceph/ioctl.h98
-rw-r--r--ANDROID_3.4.5/fs/ceph/locks.c286
-rw-r--r--ANDROID_3.4.5/fs/ceph/mds_client.c3465
-rw-r--r--ANDROID_3.4.5/fs/ceph/mds_client.h383
-rw-r--r--ANDROID_3.4.5/fs/ceph/mdsmap.c179
-rw-r--r--ANDROID_3.4.5/fs/ceph/snap.c931
-rw-r--r--ANDROID_3.4.5/fs/ceph/strings.c117
-rw-r--r--ANDROID_3.4.5/fs/ceph/super.c972
-rw-r--r--ANDROID_3.4.5/fs/ceph/super.h858
-rw-r--r--ANDROID_3.4.5/fs/ceph/xattr.c946
21 files changed, 0 insertions, 17485 deletions
diff --git a/ANDROID_3.4.5/fs/ceph/Kconfig b/ANDROID_3.4.5/fs/ceph/Kconfig
deleted file mode 100644
index 9eb134ea..00000000
--- a/ANDROID_3.4.5/fs/ceph/Kconfig
+++ /dev/null
@@ -1,18 +0,0 @@
-config CEPH_FS
- tristate "Ceph distributed file system (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
- select CEPH_LIB
- select LIBCRC32C
- select CRYPTO_AES
- select CRYPTO
- default n
- help
- Choose Y or M here to include support for mounting the
- experimental Ceph distributed file system. Ceph is an extremely
- scalable file system designed to provide high performance,
- reliable access to petabytes of storage.
-
- More information at http://ceph.newdream.net/.
-
- If unsure, say N.
-
diff --git a/ANDROID_3.4.5/fs/ceph/Makefile b/ANDROID_3.4.5/fs/ceph/Makefile
deleted file mode 100644
index bd352125..00000000
--- a/ANDROID_3.4.5/fs/ceph/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# Makefile for CEPH filesystem.
-#
-
-obj-$(CONFIG_CEPH_FS) += ceph.o
-
-ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
- export.o caps.o snap.o xattr.o \
- mds_client.o mdsmap.o strings.o ceph_frag.o \
- debugfs.o
-
diff --git a/ANDROID_3.4.5/fs/ceph/addr.c b/ANDROID_3.4.5/fs/ceph/addr.c
deleted file mode 100644
index 173b1d22..00000000
--- a/ANDROID_3.4.5/fs/ceph/addr.c
+++ /dev/null
@@ -1,1234 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/backing-dev.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h> /* generic_writepages */
-#include <linux/slab.h>
-#include <linux/pagevec.h>
-#include <linux/task_io_accounting_ops.h>
-
-#include "super.h"
-#include "mds_client.h"
-#include <linux/ceph/osd_client.h>
-
-/*
- * Ceph address space ops.
- *
- * There are a few funny things going on here.
- *
- * The page->private field is used to reference a struct
- * ceph_snap_context for _every_ dirty page. This indicates which
- * snapshot the page was logically dirtied in, and thus which snap
- * context needs to be associated with the osd write during writeback.
- *
- * Similarly, struct ceph_inode_info maintains a set of counters to
- * count dirty pages on the inode. In the absence of snapshots,
- * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
- *
- * When a snapshot is taken (that is, when the client receives
- * notification that a snapshot was taken), each inode with caps and
- * with dirty pages (dirty pages implies there is a cap) gets a new
- * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
- * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
- * moved to capsnap->dirty. (Unless a sync write is currently in
- * progress. In that case, the capsnap is said to be "pending", new
- * writes cannot start, and the capsnap isn't "finalized" until the
- * write completes (or fails) and a final size/mtime for the inode for
- * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
- *
- * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
- * we look for the first capsnap in i_cap_snaps and write out pages in
- * that snap context _only_. Then we move on to the next capsnap,
- * eventually reaching the "live" or "head" context (i.e., pages that
- * are not yet snapped) and are writing the most recently dirtied
- * pages.
- *
- * Invalidate and so forth must take care to ensure the dirty page
- * accounting is preserved.
- */
-
-#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
-#define CONGESTION_OFF_THRESH(congestion_kb) \
- (CONGESTION_ON_THRESH(congestion_kb) - \
- (CONGESTION_ON_THRESH(congestion_kb) >> 2))
-
-
-
-/*
- * Dirty a page. Optimistically adjust accounting, on the assumption
- * that we won't race with invalidate. If we do, readjust.
- */
-static int ceph_set_page_dirty(struct page *page)
-{
- struct address_space *mapping = page->mapping;
- struct inode *inode;
- struct ceph_inode_info *ci;
- int undo = 0;
- struct ceph_snap_context *snapc;
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
-
- if (TestSetPageDirty(page)) {
- dout("%p set_page_dirty %p idx %lu -- already dirty\n",
- mapping->host, page, page->index);
- return 0;
- }
-
- inode = mapping->host;
- ci = ceph_inode(inode);
-
- /*
- * Note that we're grabbing a snapc ref here without holding
- * any locks!
- */
- snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
-
- /* dirty the head */
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_head_snapc == NULL)
- ci->i_head_snapc = ceph_get_snap_context(snapc);
- ++ci->i_wrbuffer_ref_head;
- if (ci->i_wrbuffer_ref == 0)
- ihold(inode);
- ++ci->i_wrbuffer_ref;
- dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
- "snapc %p seq %lld (%d snaps)\n",
- mapping->host, page, page->index,
- ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- snapc, snapc->seq, snapc->num_snaps);
- spin_unlock(&ci->i_ceph_lock);
-
- /* now adjust page */
- spin_lock_irq(&mapping->tree_lock);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, page->mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
-
- /*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
- */
- page->private = (unsigned long)snapc;
- SetPagePrivate(page);
- } else {
- dout("ANON set_page_dirty %p (raced truncate?)\n", page);
- undo = 1;
- }
-
- spin_unlock_irq(&mapping->tree_lock);
-
- if (undo)
- /* whoops, we failed to dirty the page */
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
-
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
- BUG_ON(!PageDirty(page));
- return 1;
-}
-
-/*
- * If we are truncating the full page (i.e. offset == 0), adjust the
- * dirty page counters appropriately. Only called if there is private
- * data on the page.
- */
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
-{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_snap_context *snapc = (void *)page->private;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(!page->private);
- BUG_ON(!PagePrivate(page));
- BUG_ON(!page->mapping);
-
- inode = page->mapping->host;
-
- /*
- * We can get non-dirty pages here due to races between
- * set_page_dirty and truncate_complete_page; just spit out a
- * warning, in case we end up with accounting problems later.
- */
- if (!PageDirty(page))
- pr_err("%p invalidatepage %p page not dirty\n", inode, page);
-
- if (offset == 0)
- ClearPageChecked(page);
-
- ci = ceph_inode(inode);
- if (offset == 0) {
- dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
- inode, page, page->index, offset);
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc);
- page->private = 0;
- ClearPagePrivate(page);
- } else {
- dout("%p invalidatepage %p idx %lu partial dirty page\n",
- inode, page, page->index);
- }
-}
-
-/* just a sanity check */
-static int ceph_releasepage(struct page *page, gfp_t g)
-{
- struct inode *inode = page->mapping ? page->mapping->host : NULL;
- dout("%p releasepage %p idx %lu\n", inode, page, page->index);
- WARN_ON(PageDirty(page));
- WARN_ON(page->private);
- WARN_ON(PagePrivate(page));
- return 0;
-}
-
-/*
- * read a single page, without unlocking it.
- */
-static int readpage_nounlock(struct file *filp, struct page *page)
-{
- struct inode *inode = filp->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc =
- &ceph_inode_to_client(inode)->client->osdc;
- int err = 0;
- u64 len = PAGE_CACHE_SIZE;
-
- dout("readpage inode %p file %p page %p index %lu\n",
- inode, filp, page, page->index);
- err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- page->index << PAGE_CACHE_SHIFT, &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1, 0);
- if (err == -ENOENT)
- err = 0;
- if (err < 0) {
- SetPageError(page);
- goto out;
- } else if (err < PAGE_CACHE_SIZE) {
- /* zero fill remainder of page */
- zero_user_segment(page, err, PAGE_CACHE_SIZE);
- }
- SetPageUptodate(page);
-
-out:
- return err < 0 ? err : 0;
-}
-
-static int ceph_readpage(struct file *filp, struct page *page)
-{
- int r = readpage_nounlock(filp, page);
- unlock_page(page);
- return r;
-}
-
-/*
- * Finish an async read(ahead) op.
- */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
-{
- struct inode *inode = req->r_inode;
- struct ceph_osd_reply_head *replyhead;
- int rc, bytes;
- int i;
-
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- rc = le32_to_cpu(replyhead->result);
- bytes = le32_to_cpu(msg->hdr.data_len);
-
- dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
-
- /* unlock all pages, zeroing any data we didn't read */
- for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
- struct page *page = req->r_pages[i];
-
- if (bytes < (int)PAGE_CACHE_SIZE) {
- /* zero (remainder of) page */
- int s = bytes < 0 ? 0 : bytes;
- zero_user_segment(page, s, PAGE_CACHE_SIZE);
- }
- dout("finish_read %p uptodate %p idx %lu\n", inode, page,
- page->index);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- page_cache_release(page);
- }
- kfree(req->r_pages);
-}
-
-/*
- * start an async read(ahead) operation. return nr_pages we submitted
- * a read for on success, or negative error code.
- */
-static int start_read(struct inode *inode, struct list_head *page_list, int max)
-{
- struct ceph_osd_client *osdc =
- &ceph_inode_to_client(inode)->client->osdc;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *page = list_entry(page_list->prev, struct page, lru);
- struct ceph_osd_request *req;
- u64 off;
- u64 len;
- int i;
- struct page **pages;
- pgoff_t next_index;
- int nr_pages = 0;
- int ret;
-
- off = page->index << PAGE_CACHE_SHIFT;
-
- /* count pages */
- next_index = page->index;
- list_for_each_entry_reverse(page, page_list, lru) {
- if (page->index != next_index)
- break;
- nr_pages++;
- next_index++;
- if (max && nr_pages == max)
- break;
- }
- len = nr_pages << PAGE_CACHE_SHIFT;
- dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
- off, len);
-
- req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
- off, &len,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, 0,
- ci->i_truncate_seq, ci->i_truncate_size,
- NULL, false, 1, 0);
- if (!req)
- return -ENOMEM;
-
- /* build page vector */
- nr_pages = len >> PAGE_CACHE_SHIFT;
- pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
- ret = -ENOMEM;
- if (!pages)
- goto out;
- for (i = 0; i < nr_pages; ++i) {
- page = list_entry(page_list->prev, struct page, lru);
- BUG_ON(PageLocked(page));
- list_del(&page->lru);
-
- dout("start_read %p adding %p idx %lu\n", inode, page,
- page->index);
- if (add_to_page_cache_lru(page, &inode->i_data, page->index,
- GFP_NOFS)) {
- page_cache_release(page);
- dout("start_read %p add_to_page_cache failed %p\n",
- inode, page);
- nr_pages = i;
- goto out_pages;
- }
- pages[i] = page;
- }
- req->r_pages = pages;
- req->r_num_pages = nr_pages;
- req->r_callback = finish_read;
- req->r_inode = inode;
-
- dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
- ret = ceph_osdc_start_request(osdc, req, false);
- if (ret < 0)
- goto out_pages;
- ceph_osdc_put_request(req);
- return nr_pages;
-
-out_pages:
- ceph_release_page_vector(pages, nr_pages);
-out:
- ceph_osdc_put_request(req);
- return ret;
-}
-
-
-/*
- * Read multiple pages. Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned nr_pages)
-{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- int rc = 0;
- int max = 0;
-
- if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
- max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
- >> PAGE_SHIFT;
-
- dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
- max);
- while (!list_empty(page_list)) {
- rc = start_read(inode, page_list, max);
- if (rc < 0)
- goto out;
- BUG_ON(rc == 0);
- }
-out:
- dout("readpages %p file %p ret %d\n", inode, file, rc);
- return rc;
-}
-
-/*
- * Get ref for the oldest snapc for an inode with dirty data... that is, the
- * only snap context we are allowed to write back.
- */
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
- u64 *snap_size)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_snap_context *snapc = NULL;
- struct ceph_cap_snap *capsnap = NULL;
-
- spin_lock(&ci->i_ceph_lock);
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
- capsnap->context, capsnap->dirty_pages);
- if (capsnap->dirty_pages) {
- snapc = ceph_get_snap_context(capsnap->context);
- if (snap_size)
- *snap_size = capsnap->size;
- break;
- }
- }
- if (!snapc && ci->i_wrbuffer_ref_head) {
- snapc = ceph_get_snap_context(ci->i_head_snapc);
- dout(" head snapc %p has %d dirty pages\n",
- snapc, ci->i_wrbuffer_ref_head);
- }
- spin_unlock(&ci->i_ceph_lock);
- return snapc;
-}
-
-/*
- * Write a single page, but leave the page locked.
- *
- * If we get a write error, set the page error bit, but still adjust the
- * dirty page accounting (i.e., page is no longer dirty).
- */
-static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
-{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_fs_client *fsc;
- struct ceph_osd_client *osdc;
- loff_t page_off = page->index << PAGE_CACHE_SHIFT;
- int len = PAGE_CACHE_SIZE;
- loff_t i_size;
- int err = 0;
- struct ceph_snap_context *snapc, *oldest;
- u64 snap_size = 0;
- long writeback_stat;
-
- dout("writepage %p idx %lu\n", page, page->index);
-
- if (!page->mapping || !page->mapping->host) {
- dout("writepage %p - no mapping\n", page);
- return -EFAULT;
- }
- inode = page->mapping->host;
- ci = ceph_inode(inode);
- fsc = ceph_inode_to_client(inode);
- osdc = &fsc->client->osdc;
-
- /* verify this is a writeable snap context */
- snapc = (void *)page->private;
- if (snapc == NULL) {
- dout("writepage %p page %p not dirty?\n", inode, page);
- goto out;
- }
- oldest = get_oldest_context(inode, &snap_size);
- if (snapc->seq > oldest->seq) {
- dout("writepage %p page %p snapc %p not writeable - noop\n",
- inode, page, (void *)page->private);
- /* we should only noop if called by kswapd */
- WARN_ON((current->flags & PF_MEMALLOC) == 0);
- ceph_put_snap_context(oldest);
- goto out;
- }
- ceph_put_snap_context(oldest);
-
- /* is this a partial page at end of file? */
- if (snap_size)
- i_size = snap_size;
- else
- i_size = i_size_read(inode);
- if (i_size < page_off + len)
- len = i_size - page_off;
-
- dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
- inode, page, page->index, page_off, len, snapc);
-
- writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
- if (writeback_stat >
- CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
-
- set_page_writeback(page);
- err = ceph_osdc_writepages(osdc, ceph_vino(inode),
- &ci->i_layout, snapc,
- page_off, len,
- ci->i_truncate_seq, ci->i_truncate_size,
- &inode->i_mtime,
- &page, 1, 0, 0, true);
- if (err < 0) {
- dout("writepage setting page/mapping error %d %p\n", err, page);
- SetPageError(page);
- mapping_set_error(&inode->i_data, err);
- if (wbc)
- wbc->pages_skipped++;
- } else {
- dout("writepage cleaned page %p\n", page);
- err = 0; /* vfs expects us to return 0 */
- }
- page->private = 0;
- ClearPagePrivate(page);
- end_page_writeback(page);
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc); /* page's reference */
-out:
- return err;
-}
-
-static int ceph_writepage(struct page *page, struct writeback_control *wbc)
-{
- int err;
- struct inode *inode = page->mapping->host;
- BUG_ON(!inode);
- ihold(inode);
- err = writepage_nounlock(page, wbc);
- unlock_page(page);
- iput(inode);
- return err;
-}
-
-
-/*
- * lame release_pages helper. release_pages() isn't exported to
- * modules.
- */
-static void ceph_release_pages(struct page **pages, int num)
-{
- struct pagevec pvec;
- int i;
-
- pagevec_init(&pvec, 0);
- for (i = 0; i < num; i++) {
- if (pagevec_add(&pvec, pages[i]) == 0)
- pagevec_release(&pvec);
- }
- pagevec_release(&pvec);
-}
-
-
-/*
- * async writeback completion handler.
- *
- * If we get an error, set the mapping error bit, but not the individual
- * page error bits.
- */
-static void writepages_finish(struct ceph_osd_request *req,
- struct ceph_msg *msg)
-{
- struct inode *inode = req->r_inode;
- struct ceph_osd_reply_head *replyhead;
- struct ceph_osd_op *op;
- struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned wrote;
- struct page *page;
- int i;
- struct ceph_snap_context *snapc = req->r_snapc;
- struct address_space *mapping = inode->i_mapping;
- __s32 rc = -EIO;
- u64 bytes = 0;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- long writeback_stat;
- unsigned issued = ceph_caps_issued(ci);
-
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- op = (void *)(replyhead + 1);
- rc = le32_to_cpu(replyhead->result);
- bytes = le64_to_cpu(op->extent.length);
-
- if (rc >= 0) {
- /*
- * Assume we wrote the pages we originally sent. The
- * osd might reply with fewer pages if our writeback
- * raced with a truncation and was adjusted at the osd,
- * so don't believe the reply.
- */
- wrote = req->r_num_pages;
- } else {
- wrote = 0;
- mapping_set_error(mapping, rc);
- }
- dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
- inode, rc, bytes, wrote);
-
- /* clean all pages */
- for (i = 0; i < req->r_num_pages; i++) {
- page = req->r_pages[i];
- BUG_ON(!page);
- WARN_ON(!PageUptodate(page));
-
- writeback_stat =
- atomic_long_dec_return(&fsc->writeback_count);
- if (writeback_stat <
- CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(&fsc->backing_dev_info,
- BLK_RW_ASYNC);
-
- ceph_put_snap_context((void *)page->private);
- page->private = 0;
- ClearPagePrivate(page);
- dout("unlocking %d %p\n", i, page);
- end_page_writeback(page);
-
- /*
- * We lost the cache cap, need to truncate the page before
- * it is unlocked, otherwise we'd truncate it later in the
- * page truncation thread, possibly losing some data that
- * raced its way in
- */
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
- generic_error_remove_page(inode->i_mapping, page);
-
- unlock_page(page);
- }
- dout("%p wrote+cleaned %d pages\n", inode, wrote);
- ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
-
- ceph_release_pages(req->r_pages, req->r_num_pages);
- if (req->r_pages_from_pool)
- mempool_free(req->r_pages,
- ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
- else
- kfree(req->r_pages);
- ceph_osdc_put_request(req);
-}
-
-/*
- * allocate a page vec, either directly, or if necessary, via a the
- * mempool. we avoid the mempool if we can because req->r_num_pages
- * may be less than the maximum write size.
- */
-static void alloc_page_vec(struct ceph_fs_client *fsc,
- struct ceph_osd_request *req)
-{
- req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
- GFP_NOFS);
- if (!req->r_pages) {
- req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
- req->r_pages_from_pool = 1;
- WARN_ON(!req->r_pages);
- }
-}
-
-/*
- * initiate async writeback
- */
-static int ceph_writepages_start(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct inode *inode = mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc;
- pgoff_t index, start, end;
- int range_whole = 0;
- int should_loop = 1;
- pgoff_t max_pages = 0, max_pages_ever = 0;
- struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
- struct pagevec pvec;
- int done = 0;
- int rc = 0;
- unsigned wsize = 1 << inode->i_blkbits;
- struct ceph_osd_request *req = NULL;
- int do_sync;
- u64 snap_size = 0;
-
- /*
- * Include a 'sync' in the OSD request if this is a data
- * integrity write (e.g., O_SYNC write or fsync()), or if our
- * cap is being revoked.
- */
- do_sync = wbc->sync_mode == WB_SYNC_ALL;
- if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
- do_sync = 1;
- dout("writepages_start %p dosync=%d (mode=%s)\n",
- inode, do_sync,
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
-
- fsc = ceph_inode_to_client(inode);
- if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
- pr_warning("writepage_start %p on forced umount\n", inode);
- return -EIO; /* we're in a forced umount, don't write! */
- }
- if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
- wsize = fsc->mount_options->wsize;
- if (wsize < PAGE_CACHE_SIZE)
- wsize = PAGE_CACHE_SIZE;
- max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
-
- pagevec_init(&pvec, 0);
-
- /* where to start/end? */
- if (wbc->range_cyclic) {
- start = mapping->writeback_index; /* Start from prev offset */
- end = -1;
- dout(" cyclic, start at %lu\n", start);
- } else {
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
- should_loop = 0;
- dout(" not cyclic, %lu to %lu\n", start, end);
- }
- index = start;
-
-retry:
- /* find oldest snap context with dirty data */
- ceph_put_snap_context(snapc);
- snapc = get_oldest_context(inode, &snap_size);
- if (!snapc) {
- /* hmm, why does writepages get called when there
- is no dirty data? */
- dout(" no snap context with dirty data?\n");
- goto out;
- }
- dout(" oldest snapc is %p seq %lld (%d snaps)\n",
- snapc, snapc->seq, snapc->num_snaps);
- if (last_snapc && snapc != last_snapc) {
- /* if we switched to a newer snapc, restart our scan at the
- * start of the original file range. */
- dout(" snapc differs from last pass, restarting at %lu\n",
- index);
- index = start;
- }
- last_snapc = snapc;
-
- while (!done && index <= end) {
- unsigned i;
- int first;
- pgoff_t next;
- int pvec_pages, locked_pages;
- struct page *page;
- int want;
- u64 offset, len;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_op *op;
- long writeback_stat;
-
- next = 0;
- locked_pages = 0;
- max_pages = max_pages_ever;
-
-get_more_pages:
- first = -1;
- want = min(end - index,
- min((pgoff_t)PAGEVEC_SIZE,
- max_pages - (pgoff_t)locked_pages) - 1)
- + 1;
- pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- want);
- dout("pagevec_lookup_tag got %d\n", pvec_pages);
- if (!pvec_pages && !locked_pages)
- break;
- for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
- page = pvec.pages[i];
- dout("? %p idx %lu\n", page, page->index);
- if (locked_pages == 0)
- lock_page(page); /* first page */
- else if (!trylock_page(page))
- break;
-
- /* only dirty pages, or our accounting breaks */
- if (unlikely(!PageDirty(page)) ||
- unlikely(page->mapping != mapping)) {
- dout("!dirty or !mapping %p\n", page);
- unlock_page(page);
- break;
- }
- if (!wbc->range_cyclic && page->index > end) {
- dout("end of range %p\n", page);
- done = 1;
- unlock_page(page);
- break;
- }
- if (next && (page->index != next)) {
- dout("not consecutive %p\n", page);
- unlock_page(page);
- break;
- }
- if (wbc->sync_mode != WB_SYNC_NONE) {
- dout("waiting on writeback %p\n", page);
- wait_on_page_writeback(page);
- }
- if ((snap_size && page_offset(page) > snap_size) ||
- (!snap_size &&
- page_offset(page) > i_size_read(inode))) {
- dout("%p page eof %llu\n", page, snap_size ?
- snap_size : i_size_read(inode));
- done = 1;
- unlock_page(page);
- break;
- }
- if (PageWriteback(page)) {
- dout("%p under writeback\n", page);
- unlock_page(page);
- break;
- }
-
- /* only if matching snap context */
- pgsnapc = (void *)page->private;
- if (pgsnapc->seq > snapc->seq) {
- dout("page snapc %p %lld > oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
- unlock_page(page);
- if (!locked_pages)
- continue; /* keep looking for snap */
- break;
- }
-
- if (!clear_page_dirty_for_io(page)) {
- dout("%p !clear_page_dirty_for_io\n", page);
- unlock_page(page);
- break;
- }
-
- /* ok */
- if (locked_pages == 0) {
- /* prepare async write request */
- offset = (unsigned long long)page->index
- << PAGE_CACHE_SHIFT;
- len = wsize;
- req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout,
- ceph_vino(inode),
- offset, &len,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
- snapc, do_sync,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- &inode->i_mtime, true, 1, 0);
-
- if (!req) {
- rc = -ENOMEM;
- unlock_page(page);
- break;
- }
-
- max_pages = req->r_num_pages;
-
- alloc_page_vec(fsc, req);
- req->r_callback = writepages_finish;
- req->r_inode = inode;
- }
-
- /* note position of first page in pvec */
- if (first < 0)
- first = i;
- dout("%p will write page %p idx %lu\n",
- inode, page, page->index);
-
- writeback_stat =
- atomic_long_inc_return(&fsc->writeback_count);
- if (writeback_stat > CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb)) {
- set_bdi_congested(&fsc->backing_dev_info,
- BLK_RW_ASYNC);
- }
-
- set_page_writeback(page);
- req->r_pages[locked_pages] = page;
- locked_pages++;
- next = page->index + 1;
- }
-
- /* did we get anything? */
- if (!locked_pages)
- goto release_pvec_pages;
- if (i) {
- int j;
- BUG_ON(!locked_pages || first < 0);
-
- if (pvec_pages && i == pvec_pages &&
- locked_pages < max_pages) {
- dout("reached end pvec, trying for more\n");
- pagevec_reinit(&pvec);
- goto get_more_pages;
- }
-
- /* shift unused pages over in the pvec... we
- * will need to release them below. */
- for (j = i; j < pvec_pages; j++) {
- dout(" pvec leftover page %p\n",
- pvec.pages[j]);
- pvec.pages[j-i+first] = pvec.pages[j];
- }
- pvec.nr -= i-first;
- }
-
- /* submit the write */
- offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
- len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
- (u64)locked_pages << PAGE_CACHE_SHIFT);
- dout("writepages got %d pages at %llu~%llu\n",
- locked_pages, offset, len);
-
- /* revise final length, page count */
- req->r_num_pages = locked_pages;
- reqhead = req->r_request->front.iov_base;
- op = (void *)(reqhead + 1);
- op->extent.length = cpu_to_le64(len);
- op->payload_len = cpu_to_le32(len);
- req->r_request->hdr.data_len = cpu_to_le32(len);
-
- rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
- BUG_ON(rc);
- req = NULL;
-
- /* continue? */
- index = next;
- wbc->nr_to_write -= locked_pages;
- if (wbc->nr_to_write <= 0)
- done = 1;
-
-release_pvec_pages:
- dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
- pvec.nr ? pvec.pages[0] : NULL);
- pagevec_release(&pvec);
-
- if (locked_pages && !done)
- goto retry;
- }
-
- if (should_loop && !done) {
- /* more to do; loop back to beginning of file */
- dout("writepages looping back to beginning of file\n");
- should_loop = 0;
- index = 0;
- goto retry;
- }
-
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
-
-out:
- if (req)
- ceph_osdc_put_request(req);
- ceph_put_snap_context(snapc);
- dout("writepages done, rc = %d\n", rc);
- return rc;
-}
-
-
-
-/*
- * See if a given @snapc is either writeable, or already written.
- */
-static int context_is_writeable_or_written(struct inode *inode,
- struct ceph_snap_context *snapc)
-{
- struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
- int ret = !oldest || snapc->seq <= oldest->seq;
-
- ceph_put_snap_context(oldest);
- return ret;
-}
-
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
- *
- * called with page locked.
- * return success with page locked,
- * or any failure (incl -EAGAIN) with page unlocked.
- */
-static int ceph_update_writeable_page(struct file *file,
- loff_t pos, unsigned len,
- struct page *page)
-{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- loff_t page_off = pos & PAGE_CACHE_MASK;
- int pos_in_page = pos & ~PAGE_CACHE_MASK;
- int end_in_page = pos_in_page + len;
- loff_t i_size;
- int r;
- struct ceph_snap_context *snapc, *oldest;
-
-retry_locked:
- /* writepages currently holds page lock, but if we change that later, */
- wait_on_page_writeback(page);
-
- /* check snap context */
- BUG_ON(!ci->i_snap_realm);
- down_read(&mdsc->snap_rwsem);
- BUG_ON(!ci->i_snap_realm->cached_context);
- snapc = (void *)page->private;
- if (snapc && snapc != ci->i_head_snapc) {
- /*
- * this page is already dirty in another (older) snap
- * context! is it writeable now?
- */
- oldest = get_oldest_context(inode, NULL);
- up_read(&mdsc->snap_rwsem);
-
- if (snapc->seq > oldest->seq) {
- ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n",
- page, snapc);
- /*
- * queue for writeback, and wait for snapc to
- * be writeable or written
- */
- snapc = ceph_get_snap_context(snapc);
- unlock_page(page);
- ceph_queue_writeback(inode);
- r = wait_event_interruptible(ci->i_cap_wq,
- context_is_writeable_or_written(inode, snapc));
- ceph_put_snap_context(snapc);
- if (r == -ERESTARTSYS)
- return r;
- return -EAGAIN;
- }
- ceph_put_snap_context(oldest);
-
- /* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n",
- page, snapc);
- if (!clear_page_dirty_for_io(page))
- goto retry_locked;
- r = writepage_nounlock(page, NULL);
- if (r < 0)
- goto fail_nosnap;
- goto retry_locked;
- }
-
- if (PageUptodate(page)) {
- dout(" page %p already uptodate\n", page);
- return 0;
- }
-
- /* full page? */
- if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
- return 0;
-
- /* past end of file? */
- i_size = inode->i_size; /* caller holds i_mutex */
-
- if (i_size + len > inode->i_sb->s_maxbytes) {
- /* file is too big */
- r = -EINVAL;
- goto fail;
- }
-
- if (page_off >= i_size ||
- (pos_in_page == 0 && (pos+len) >= i_size &&
- end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
- dout(" zeroing %p 0 - %d and %d - %d\n",
- page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
- zero_user_segments(page,
- 0, pos_in_page,
- end_in_page, PAGE_CACHE_SIZE);
- return 0;
- }
-
- /* we need to read it. */
- up_read(&mdsc->snap_rwsem);
- r = readpage_nounlock(file, page);
- if (r < 0)
- goto fail_nosnap;
- goto retry_locked;
-
-fail:
- up_read(&mdsc->snap_rwsem);
-fail_nosnap:
- unlock_page(page);
- return r;
-}
-
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
- */
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- struct inode *inode = file->f_dentry->d_inode;
- struct page *page;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- int r;
-
- do {
- /* get a page */
- page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
- dout("write_begin file %p inode %p page %p %d~%d\n", file,
- inode, page, (int)pos, (int)len);
-
- r = ceph_update_writeable_page(file, pos, len, page);
- } while (r == -EAGAIN);
-
- return r;
-}
-
-/*
- * we don't do anything in here that simple_write_end doesn't do
- * except adjust dirty page accounting and drop read lock on
- * mdsc->snap_rwsem.
- */
-static int ceph_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
- int check_cap = 0;
-
- dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
- inode, page, (int)pos, (int)copied, (int)len);
-
- /* zero the stale part of the page if we did a short copy */
- if (copied < len)
- zero_user_segment(page, from+copied, len);
-
- /* did file size increase? */
- /* (no need for i_size_read(); we caller holds i_mutex */
- if (pos+copied > inode->i_size)
- check_cap = ceph_inode_set_size(inode, pos+copied);
-
- if (!PageUptodate(page))
- SetPageUptodate(page);
-
- set_page_dirty(page);
-
- unlock_page(page);
- up_read(&mdsc->snap_rwsem);
- page_cache_release(page);
-
- if (check_cap)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
-
- return copied;
-}
-
-/*
- * we set .direct_IO to indicate direct io is supported, but since we
- * intercept O_DIRECT reads and writes early, this function should
- * never get called.
- */
-static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
-{
- WARN_ON(1);
- return -EINVAL;
-}
-
/*
 * Address-space operations wiring the page cache to the ceph
 * read/writeback paths and the snap-context-aware
 * write_begin/write_end helpers above.
 */
const struct address_space_operations ceph_aops = {
	.readpage = ceph_readpage,
	.readpages = ceph_readpages,
	.writepage = ceph_writepage,
	.writepages = ceph_writepages_start,
	.write_begin = ceph_write_begin,
	.write_end = ceph_write_end,
	.set_page_dirty = ceph_set_page_dirty,
	.invalidatepage = ceph_invalidatepage,
	.releasepage = ceph_releasepage,
	.direct_IO = ceph_direct_io,	/* never called; see above */
};
-
-
-/*
- * vm ops
- */
-
-/*
- * Reuse write_begin here for simplicity.
- */
-static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
- struct page *page = vmf->page;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- loff_t off = page->index << PAGE_CACHE_SHIFT;
- loff_t size, len;
- int ret;
-
- size = i_size_read(inode);
- if (off + PAGE_CACHE_SIZE <= size)
- len = PAGE_CACHE_SIZE;
- else
- len = size & ~PAGE_CACHE_MASK;
-
- dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
- off, len, page, page->index);
-
- lock_page(page);
-
- ret = VM_FAULT_NOPAGE;
- if ((off > size) ||
- (page->mapping != inode->i_mapping))
- goto out;
-
- ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (ret == 0) {
- /* success. we'll keep the page locked. */
- set_page_dirty(page);
- up_read(&mdsc->snap_rwsem);
- ret = VM_FAULT_LOCKED;
- } else {
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
- }
-out:
- dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
- if (ret != VM_FAULT_LOCKED)
- unlock_page(page);
- return ret;
-}
-
-static struct vm_operations_struct ceph_vmops = {
- .fault = filemap_fault,
- .page_mkwrite = ceph_page_mkwrite,
-};
-
-int ceph_mmap(struct file *file, struct vm_area_struct *vma)
-{
- struct address_space *mapping = file->f_mapping;
-
- if (!mapping->a_ops->readpage)
- return -ENOEXEC;
- file_accessed(file);
- vma->vm_ops = &ceph_vmops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
- return 0;
-}
diff --git a/ANDROID_3.4.5/fs/ceph/caps.c b/ANDROID_3.4.5/fs/ceph/caps.c
deleted file mode 100644
index 620daad2..00000000
--- a/ANDROID_3.4.5/fs/ceph/caps.c
+++ /dev/null
@@ -1,3088 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/wait.h>
-#include <linux/writeback.h>
-
-#include "super.h"
-#include "mds_client.h"
-#include <linux/ceph/decode.h>
-#include <linux/ceph/messenger.h>
-
-/*
- * Capability management
- *
- * The Ceph metadata servers control client access to inode metadata
- * and file data by issuing capabilities, granting clients permission
- * to read and/or write both inode field and file data to OSDs
- * (storage nodes). Each capability consists of a set of bits
- * indicating which operations are allowed.
- *
- * If the client holds a *_SHARED cap, the client has a coherent value
- * that can be safely read from the cached inode.
- *
- * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
- * client is allowed to change inode attributes (e.g., file size,
- * mtime), note its dirty state in the ceph_cap, and asynchronously
- * flush that metadata change to the MDS.
- *
- * In the event of a conflicting operation (perhaps by another
- * client), the MDS will revoke the conflicting client capabilities.
- *
- * In order for a client to cache an inode, it must hold a capability
- * with at least one MDS server. When inodes are released, release
- * notifications are batched and periodically sent en masse to the MDS
- * cluster to release server state.
- */
-
-
/*
 * Generate readable cap strings for debugging output.
 */
#define MAX_CAP_STR 20
static char cap_str[MAX_CAP_STR][40];	/* ring of result buffers */
static DEFINE_SPINLOCK(cap_str_lock);	/* protects last_cap_str */
static int last_cap_str;		/* next ring slot to hand out */
-
-static char *gcap_string(char *s, int c)
-{
- if (c & CEPH_CAP_GSHARED)
- *s++ = 's';
- if (c & CEPH_CAP_GEXCL)
- *s++ = 'x';
- if (c & CEPH_CAP_GCACHE)
- *s++ = 'c';
- if (c & CEPH_CAP_GRD)
- *s++ = 'r';
- if (c & CEPH_CAP_GWR)
- *s++ = 'w';
- if (c & CEPH_CAP_GBUFFER)
- *s++ = 'b';
- if (c & CEPH_CAP_GLAZYIO)
- *s++ = 'l';
- return s;
-}
-
/*
 * Format @caps as a human-readable string, e.g. "pAsLsXsFsx".
 *
 * The result lives in a small ring of static buffers (cap_str): it is
 * only valid until MAX_CAP_STR further calls have been made, so it is
 * suitable for debug output but must not be stored long-term.
 */
const char *ceph_cap_string(int caps)
{
	int i;
	char *s;
	int c;

	/* claim the next slot in the ring of static buffers */
	spin_lock(&cap_str_lock);
	i = last_cap_str++;
	if (last_cap_str == MAX_CAP_STR)
		last_cap_str = 0;
	spin_unlock(&cap_str_lock);

	s = cap_str[i];

	if (caps & CEPH_CAP_PIN)
		*s++ = 'p';

	/* each cap class: capital letter, then per-bit lowercase flags */
	c = (caps >> CEPH_CAP_SAUTH) & 3;
	if (c) {
		*s++ = 'A';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SLINK) & 3;
	if (c) {
		*s++ = 'L';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SXATTR) & 3;
	if (c) {
		*s++ = 'X';
		s = gcap_string(s, c);
	}

	c = caps >> CEPH_CAP_SFILE;
	if (c) {
		*s++ = 'F';
		s = gcap_string(s, c);
	}

	/* no bits set at all: emit "-" rather than an empty string */
	if (s == cap_str[i])
		*s++ = '-';
	*s = 0;
	return cap_str[i];
}
-
/*
 * Initialize the mdsc-wide pool of preallocated struct ceph_cap and
 * the lock protecting its counters.
 */
void ceph_caps_init(struct ceph_mds_client *mdsc)
{
	INIT_LIST_HEAD(&mdsc->caps_list);
	spin_lock_init(&mdsc->caps_list_lock);
}
-
-void ceph_caps_finalize(struct ceph_mds_client *mdsc)
-{
- struct ceph_cap *cap;
-
- spin_lock(&mdsc->caps_list_lock);
- while (!list_empty(&mdsc->caps_list)) {
- cap = list_first_entry(&mdsc->caps_list,
- struct ceph_cap, caps_item);
- list_del(&cap->caps_item);
- kmem_cache_free(ceph_cap_cachep, cap);
- }
- mdsc->caps_total_count = 0;
- mdsc->caps_avail_count = 0;
- mdsc->caps_use_count = 0;
- mdsc->caps_reserve_count = 0;
- mdsc->caps_min_count = 0;
- spin_unlock(&mdsc->caps_list_lock);
-}
-
/*
 * Adjust the floor of preallocated caps we keep around; @delta may be
 * negative, but the floor must never drop below zero (BUG otherwise).
 */
void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
{
	spin_lock(&mdsc->caps_list_lock);
	mdsc->caps_min_count += delta;
	BUG_ON(mdsc->caps_min_count < 0);
	spin_unlock(&mdsc->caps_list_lock);
}
-
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx, int need)
-{
- int i;
- struct ceph_cap *cap;
- int have;
- int alloc = 0;
- LIST_HEAD(newcaps);
- int ret = 0;
-
- dout("reserve caps ctx=%p need=%d\n", ctx, need);
-
- /* first reserve any caps that are already allocated */
- spin_lock(&mdsc->caps_list_lock);
- if (mdsc->caps_avail_count >= need)
- have = need;
- else
- have = mdsc->caps_avail_count;
- mdsc->caps_avail_count -= have;
- mdsc->caps_reserve_count += have;
- BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
- mdsc->caps_reserve_count +
- mdsc->caps_avail_count);
- spin_unlock(&mdsc->caps_list_lock);
-
- for (i = have; i < need; i++) {
- cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap) {
- ret = -ENOMEM;
- goto out_alloc_count;
- }
- list_add(&cap->caps_item, &newcaps);
- alloc++;
- }
- BUG_ON(have + alloc != need);
-
- spin_lock(&mdsc->caps_list_lock);
- mdsc->caps_total_count += alloc;
- mdsc->caps_reserve_count += alloc;
- list_splice(&newcaps, &mdsc->caps_list);
-
- BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
- mdsc->caps_reserve_count +
- mdsc->caps_avail_count);
- spin_unlock(&mdsc->caps_list_lock);
-
- ctx->count = need;
- dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
- ctx, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
- return 0;
-
-out_alloc_count:
- /* we didn't manage to reserve as much as we needed */
- pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have);
- return ret;
-}
-
/*
 * Return an unused reservation to the pool: move ctx->count caps from
 * "reserved" back to "available".  Always returns 0.
 */
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
			struct ceph_cap_reservation *ctx)
{
	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
	if (ctx->count) {
		spin_lock(&mdsc->caps_list_lock);
		BUG_ON(mdsc->caps_reserve_count < ctx->count);
		mdsc->caps_reserve_count -= ctx->count;
		mdsc->caps_avail_count += ctx->count;
		ctx->count = 0;
		dout("unreserve caps %d = %d used + %d resv + %d avail\n",
		     mdsc->caps_total_count, mdsc->caps_use_count,
		     mdsc->caps_reserve_count, mdsc->caps_avail_count);
		BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
		       mdsc->caps_reserve_count +
		       mdsc->caps_avail_count);
		spin_unlock(&mdsc->caps_list_lock);
	}
	return 0;
}
-
/*
 * Take one cap for actual use: straight from the kmem cache when no
 * reservation context is given, or from the preallocated pool against
 * @ctx otherwise.  Returns NULL only on allocation failure in the
 * no-context case.
 */
static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
				struct ceph_cap_reservation *ctx)
{
	struct ceph_cap *cap = NULL;

	/* temporary, until we do something about cap import/export */
	if (!ctx) {
		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
		if (cap) {
			/* NOTE(review): counters bumped without
			 * caps_list_lock on this path -- confirm the
			 * callers serialize it */
			mdsc->caps_use_count++;
			mdsc->caps_total_count++;
		}
		return cap;
	}

	spin_lock(&mdsc->caps_list_lock);
	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
	     ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	BUG_ON(!ctx->count);
	BUG_ON(ctx->count > mdsc->caps_reserve_count);
	BUG_ON(list_empty(&mdsc->caps_list));

	/* consume one unit of the reservation */
	ctx->count--;
	mdsc->caps_reserve_count--;
	mdsc->caps_use_count++;

	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
	list_del(&cap->caps_item);

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);
	return cap;
}
-
/*
 * Return a cap that is no longer in use: cache it in the pool while
 * we are below caps_min_count (plus outstanding reservations),
 * otherwise free it outright.
 */
void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
{
	spin_lock(&mdsc->caps_list_lock);
	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
	     cap, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	mdsc->caps_use_count--;
	/*
	 * Keep some preallocated caps around (ceph_min_count), to
	 * avoid lots of free/alloc churn.
	 */
	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
	    mdsc->caps_min_count) {
		mdsc->caps_total_count--;
		kmem_cache_free(ceph_cap_cachep, cap);
	} else {
		mdsc->caps_avail_count++;
		list_add(&cap->caps_item, &mdsc->caps_list);
	}

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);
}
-
/*
 * Report the current cap-pool counters.  Any of the output pointers
 * may be NULL if the caller does not care about that value.
 */
void ceph_reservation_status(struct ceph_fs_client *fsc,
			     int *total, int *avail, int *used, int *reserved,
			     int *min)
{
	struct ceph_mds_client *mdsc = fsc->mdsc;

	if (total)
		*total = mdsc->caps_total_count;
	if (avail)
		*avail = mdsc->caps_avail_count;
	if (used)
		*used = mdsc->caps_use_count;
	if (reserved)
		*reserved = mdsc->caps_reserve_count;
	if (min)
		*min = mdsc->caps_min_count;
}
-
-/*
- * Find ceph_cap for given mds, if any.
- *
- * Called with i_ceph_lock held.
- */
-static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
-{
- struct ceph_cap *cap;
- struct rb_node *n = ci->i_caps.rb_node;
-
- while (n) {
- cap = rb_entry(n, struct ceph_cap, ci_node);
- if (mds < cap->mds)
- n = n->rb_left;
- else if (mds > cap->mds)
- n = n->rb_right;
- else
- return cap;
- }
- return NULL;
-}
-
-struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
-{
- struct ceph_cap *cap;
-
- spin_lock(&ci->i_ceph_lock);
- cap = __get_cap_for_mds(ci, mds);
- spin_unlock(&ci->i_ceph_lock);
- return cap;
-}
-
-/*
- * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
- */
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
-{
- struct ceph_cap *cap;
- int mds = -1;
- struct rb_node *p;
-
- /* prefer mds with WR|BUFFER|EXCL caps */
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- mds = cap->mds;
- if (cap->issued & (CEPH_CAP_FILE_WR |
- CEPH_CAP_FILE_BUFFER |
- CEPH_CAP_FILE_EXCL))
- break;
- }
- return mds;
-}
-
-int ceph_get_cap_mds(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int mds;
- spin_lock(&ci->i_ceph_lock);
- mds = __ceph_get_cap_mds(ceph_inode(inode));
- spin_unlock(&ci->i_ceph_lock);
- return mds;
-}
-
/*
 * Insert @new into the inode's rbtree of caps, keyed by mds.
 * Inserting a second cap for the same mds is a bug.
 *
 * Called under i_ceph_lock.
 */
static void __insert_cap_node(struct ceph_inode_info *ci,
			      struct ceph_cap *new)
{
	struct rb_node **p = &ci->i_caps.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_cap *cap = NULL;

	/* standard rbtree descent to find the insertion point */
	while (*p) {
		parent = *p;
		cap = rb_entry(parent, struct ceph_cap, ci_node);
		if (new->mds < cap->mds)
			p = &(*p)->rb_left;
		else if (new->mds > cap->mds)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&new->ci_node, parent, p);
	rb_insert_color(&new->ci_node, &ci->i_caps);
}
-
/*
 * (re)set cap hold timeouts, which control the delayed release
 * of unused caps back to the MDS.  Should be called on cap use.
 *
 * The min/max delays come from the mount options
 * (caps_wanted_delay_min/max, in seconds).
 */
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
			       struct ceph_inode_info *ci)
{
	struct ceph_mount_options *ma = mdsc->fsc->mount_options;

	ci->i_hold_caps_min = round_jiffies(jiffies +
					    ma->caps_wanted_delay_min * HZ);
	ci->i_hold_caps_max = round_jiffies(jiffies +
					    ma->caps_wanted_delay_max * HZ);
	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
}
-
/*
 * (Re)queue cap at the end of the delayed cap release list.
 *
 * If I_FLUSH is set, leave the inode at the front of the list.
 *
 * No-op while the mds client is shutting down (mdsc->stopping).
 *
 * Caller holds i_ceph_lock
 *    -> we take mdsc->cap_delay_lock
 */
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				struct ceph_inode_info *ci)
{
	__cap_set_timeouts(mdsc, ci);
	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
	     ci->i_ceph_flags, ci->i_hold_caps_max);
	if (!mdsc->stopping) {
		spin_lock(&mdsc->cap_delay_lock);
		if (!list_empty(&ci->i_cap_delay_list)) {
			/* already queued for flush: keep its position */
			if (ci->i_ceph_flags & CEPH_I_FLUSH)
				goto no_change;
			list_del_init(&ci->i_cap_delay_list);
		}
		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
		spin_unlock(&mdsc->cap_delay_lock);
	}
}
-
/*
 * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
 * indicating we should send a cap message to flush dirty metadata
 * asap, and move to the front of the delayed cap list.
 */
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci)
{
	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
	spin_lock(&mdsc->cap_delay_lock);
	ci->i_ceph_flags |= CEPH_I_FLUSH;
	/* unlink first if already queued, then add at the front */
	if (!list_empty(&ci->i_cap_delay_list))
		list_del_init(&ci->i_cap_delay_list);
	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}
-
/*
 * Cancel delayed work on cap.
 *
 * Caller must hold i_ceph_lock.  The unlocked list_empty() check is
 * an optimization; requeueing also happens under i_ceph_lock (see
 * __cap_delay_requeue), so it should not race -- TODO confirm.
 */
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
			       struct ceph_inode_info *ci)
{
	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
	if (list_empty(&ci->i_cap_delay_list))
		return;
	spin_lock(&mdsc->cap_delay_lock);
	list_del_init(&ci->i_cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}
-
/*
 * Common issue checks for add_cap, handle_cap_grant: bump the
 * relevant generation counters when caps are issued anew.
 * (@cap is currently unused here; kept for symmetry with callers.)
 */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
			      unsigned issued)
{
	unsigned had = __ceph_caps_issued(ci, NULL);

	/*
	 * Each time we receive FILE_CACHE anew, we increment
	 * i_rdcache_gen.
	 */
	if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
		ci->i_rdcache_gen++;

	/*
	 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
	 * don't know what happened to this directory while we didn't
	 * have the cap.
	 */
	if ((issued & CEPH_CAP_FILE_SHARED) &&
	    (had & CEPH_CAP_FILE_SHARED) == 0) {
		ci->i_shared_gen++;
		if (S_ISDIR(ci->vfs_inode.i_mode))
			ceph_dir_clear_complete(&ci->vfs_inode);
	}
}
-
/*
 * Add a capability under the given MDS session.
 *
 * Caller should hold session snap_rwsem (read) and s_mutex.
 *
 * @fmode is the open file mode, if we are opening a file, otherwise
 * it is < 0.  (This is so we can atomically add the cap and add an
 * open file reference to it.)
 *
 * Returns 0, or -ENOMEM if a needed cap could not be allocated.
 */
int ceph_add_cap(struct inode *inode,
		 struct ceph_mds_session *session, u64 cap_id,
		 int fmode, unsigned issued, unsigned wanted,
		 unsigned seq, unsigned mseq, u64 realmino, int flags,
		 struct ceph_cap_reservation *caps_reservation)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *new_cap = NULL;
	struct ceph_cap *cap;
	int mds = session->s_mds;
	int actual_wanted;

	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
	     session->s_mds, cap_id, ceph_cap_string(issued), seq);

	/*
	 * If we are opening the file, include file mode wanted bits
	 * in wanted.
	 */
	if (fmode >= 0)
		wanted |= ceph_caps_for_mode(fmode);

retry:
	spin_lock(&ci->i_ceph_lock);
	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		if (new_cap) {
			cap = new_cap;
			new_cap = NULL;
		} else {
			/* drop the lock to allocate, then start over */
			spin_unlock(&ci->i_ceph_lock);
			new_cap = get_cap(mdsc, caps_reservation);
			if (new_cap == NULL)
				return -ENOMEM;
			goto retry;
		}

		cap->issued = 0;
		cap->implemented = 0;
		cap->mds = mds;
		cap->mds_wanted = 0;

		cap->ci = ci;
		__insert_cap_node(ci, cap);

		/* clear out old exporting info?  (i.e. on cap import) */
		if (ci->i_cap_exporting_mds == mds) {
			ci->i_cap_exporting_issued = 0;
			ci->i_cap_exporting_mseq = 0;
			ci->i_cap_exporting_mds = -1;
		}

		/* add to session cap list */
		cap->session = session;
		spin_lock(&session->s_cap_lock);
		list_add_tail(&cap->session_caps, &session->s_caps);
		session->s_nr_caps++;
		spin_unlock(&session->s_cap_lock);
	} else if (new_cap)
		/* a cap appeared while we allocated; drop ours */
		ceph_put_cap(mdsc, new_cap);

	if (!ci->i_snap_realm) {
		/*
		 * add this inode to the appropriate snap realm
		 */
		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
							       realmino);
		if (realm) {
			ceph_get_snap_realm(mdsc, realm);
			spin_lock(&realm->inodes_with_caps_lock);
			ci->i_snap_realm = realm;
			list_add(&ci->i_snap_realm_item,
				 &realm->inodes_with_caps);
			spin_unlock(&realm->inodes_with_caps_lock);
		} else {
			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
			       realmino);
			WARN_ON(!realm);
		}
	}

	/* bump rdcache/shared gens as needed for the newly issued bits */
	__check_cap_issue(ci, cap, issued);

	/*
	 * If we are issued caps we don't want, or the mds' wanted
	 * value appears to be off, queue a check so we'll release
	 * later and/or update the mds wanted value.
	 */
	actual_wanted = __ceph_caps_wanted(ci);
	if ((wanted & ~actual_wanted) ||
	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
		     ceph_cap_string(issued), ceph_cap_string(wanted),
		     ceph_cap_string(actual_wanted));
		__cap_delay_requeue(mdsc, ci);
	}

	/* track (or untrack) this cap as the authoritative one */
	if (flags & CEPH_CAP_FLAG_AUTH)
		ci->i_auth_cap = cap;
	else if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
	     ceph_cap_string(issued|cap->issued), seq, mds);
	cap->cap_id = cap_id;
	cap->issued = issued;
	cap->implemented |= issued;
	cap->mds_wanted |= wanted;
	cap->seq = seq;
	cap->issue_seq = seq;
	cap->mseq = mseq;
	cap->cap_gen = session->s_cap_gen;

	if (fmode >= 0)
		__ceph_get_fmode(ci, fmode);
	spin_unlock(&ci->i_ceph_lock);
	wake_up_all(&ci->i_cap_wq);
	return 0;
}
-
/*
 * Return true if cap has not timed out and belongs to the current
 * generation of the MDS session (i.e. has not gone 'stale' due to
 * us losing touch with the mds).
 */
static int __cap_is_valid(struct ceph_cap *cap)
{
	unsigned long ttl;
	u32 gen;

	/* snapshot the session's generation and ttl under its lock */
	spin_lock(&cap->session->s_gen_ttl_lock);
	gen = cap->session->s_cap_gen;
	ttl = cap->session->s_cap_ttl;
	spin_unlock(&cap->session->s_gen_ttl_lock);

	/* stale if issued under an older gen, or past the session ttl */
	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
		dout("__cap_is_valid %p cap %p issued %s "
		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
		return 0;
	}

	return 1;
}
-
-/*
- * Return set of valid cap bits issued to us. Note that caps time
- * out, and may be invalidated in bulk if the client session times out
- * and session->s_cap_gen is bumped.
- */
-int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
-{
- int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
- struct ceph_cap *cap;
- struct rb_node *p;
-
- if (implemented)
- *implemented = 0;
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- if (!__cap_is_valid(cap))
- continue;
- dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
- have |= cap->issued;
- if (implemented)
- *implemented |= cap->implemented;
- }
- return have;
-}
-
-/*
- * Get cap bits issued by caps other than @ocap
- */
-int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
-{
- int have = ci->i_snap_caps;
- struct ceph_cap *cap;
- struct rb_node *p;
-
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- if (cap == ocap)
- continue;
- if (!__cap_is_valid(cap))
- continue;
- have |= cap->issued;
- }
- return have;
-}
-
/*
 * Move a cap to the end of the LRU (oldest caps at list head, newest
 * at list tail).
 *
 * Skipped while an iterator is walking s_caps, so the walk is not
 * disturbed.
 */
static void __touch_cap(struct ceph_cap *cap)
{
	struct ceph_mds_session *s = cap->session;

	spin_lock(&s->s_cap_lock);
	if (s->s_cap_iterator == NULL) {
		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
		     s->s_mds);
		list_move_tail(&cap->session_caps, &s->s_caps);
	} else {
		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
		     &cap->ci->vfs_inode, cap, s->s_mds);
	}
	spin_unlock(&s->s_cap_lock);
}
-
/*
 * Check if we hold the given mask.  If so, move the cap(s) to the
 * front of their respective LRUs.  (This is the preferred way for
 * callers to check for caps they want.)
 *
 * Returns 1 if @mask is fully satisfied either by the snap caps, one
 * single cap, or a combination of caps; 0 otherwise.
 */
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int have = ci->i_snap_caps;

	/* snap caps alone may already cover the mask */
	if ((have & mask) == mask) {
		dout("__ceph_caps_issued_mask %p snap issued %s"
		     " (mask %s)\n", &ci->vfs_inode,
		     ceph_cap_string(have),
		     ceph_cap_string(mask));
		return 1;
	}

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		/* a single cap covering the whole mask? */
		if ((cap->issued & mask) == mask) {
			dout("__ceph_caps_issued_mask %p cap %p issued %s"
			     " (mask %s)\n", &ci->vfs_inode, cap,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch)
				__touch_cap(cap);
			return 1;
		}

		/* does a combination of caps satisfy mask? */
		have |= cap->issued;
		if ((have & mask) == mask) {
			dout("__ceph_caps_issued_mask %p combo issued %s"
			     " (mask %s)\n", &ci->vfs_inode,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch) {
				struct rb_node *q;

				/* touch this + preceding caps */
				__touch_cap(cap);
				for (q = rb_first(&ci->i_caps); q != p;
				     q = rb_next(q)) {
					cap = rb_entry(q, struct ceph_cap,
						       ci_node);
					if (!__cap_is_valid(cap))
						continue;
					__touch_cap(cap);
				}
			}
			return 1;
		}
	}

	return 0;
}
-
-/*
- * Return true if mask caps are currently being revoked by an MDS.
- */
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
-{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
- struct rb_node *p;
- int ret = 0;
-
- spin_lock(&ci->i_ceph_lock);
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- if (__cap_is_valid(cap) &&
- (cap->implemented & ~cap->issued & mask)) {
- ret = 1;
- break;
- }
- }
- spin_unlock(&ci->i_ceph_lock);
- dout("ceph_caps_revoking %p %s = %d\n", inode,
- ceph_cap_string(mask), ret);
- return ret;
-}
-
-int __ceph_caps_used(struct ceph_inode_info *ci)
-{
- int used = 0;
- if (ci->i_pin_ref)
- used |= CEPH_CAP_PIN;
- if (ci->i_rd_ref)
- used |= CEPH_CAP_FILE_RD;
- if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
- used |= CEPH_CAP_FILE_CACHE;
- if (ci->i_wr_ref)
- used |= CEPH_CAP_FILE_WR;
- if (ci->i_wb_ref || ci->i_wrbuffer_ref)
- used |= CEPH_CAP_FILE_BUFFER;
- return used;
-}
-
-/*
- * wanted, by virtue of open file modes
- */
-int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
-{
- int want = 0;
- int mode;
- for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
- if (ci->i_nr_by_mode[mode])
- want |= ceph_caps_for_mode(mode);
- return want;
-}
-
-/*
- * Return caps we have registered with the MDS(s) as 'wanted'.
- */
-int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
-{
- struct ceph_cap *cap;
- struct rb_node *p;
- int mds_wanted = 0;
-
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- if (!__cap_is_valid(cap))
- continue;
- mds_wanted |= cap->mds_wanted;
- }
- return mds_wanted;
-}
-
/*
 * called under i_ceph_lock
 *
 * Return true if this inode holds any caps at all, including one in
 * the middle of being exported to another mds.
 */
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
	return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
}
-
/*
 * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
 *
 * caller should hold i_ceph_lock.
 * caller will not hold session s_mutex if called from destroy_inode.
 */
void __ceph_remove_cap(struct ceph_cap *cap)
{
	struct ceph_mds_session *session = cap->session;
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_mds_client *mdsc =
		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
	int removed = 0;

	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

	/* remove from session list */
	spin_lock(&session->s_cap_lock);
	if (session->s_cap_iterator == cap) {
		/* not yet, we are iterating over this very cap */
		dout("__ceph_remove_cap delaying %p removal from session %p\n",
		     cap, cap->session);
	} else {
		list_del_init(&cap->session_caps);
		session->s_nr_caps--;
		cap->session = NULL;
		removed = 1;
	}
	/* protect backpointer with s_cap_lock: see iterate_session_caps */
	cap->ci = NULL;
	spin_unlock(&session->s_cap_lock);

	/* remove from inode list */
	rb_erase(&cap->ci_node, &ci->i_caps);
	if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	/* only free if the session iterator was not mid-walk on it */
	if (removed)
		ceph_put_cap(mdsc, cap);

	/* last cap gone: drop the inode out of its snap realm */
	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
		struct ceph_snap_realm *realm = ci->i_snap_realm;
		spin_lock(&realm->inodes_with_caps_lock);
		list_del_init(&ci->i_snap_realm_item);
		ci->i_snap_realm_counter++;
		ci->i_snap_realm = NULL;
		spin_unlock(&realm->inodes_with_caps_lock);
		ceph_put_snap_realm(mdsc, realm);
	}
	if (!__ceph_is_any_real_caps(ci))
		__cap_delay_cancel(mdsc, ci);
}
-
/*
 * Build and send a cap message to the given MDS.
 *
 * Encodes the caller-supplied cap state (issued/wanted/dirty bits,
 * sequence numbers, size, timestamps, ownership, xattrs) into a
 * CEPH_MSG_CLIENT_CAPS message and queues it on the session's
 * connection.  Returns 0, or -ENOMEM if the message cannot be
 * allocated.
 *
 * Caller should be holding s_mutex.
 */
static int send_cap_msg(struct ceph_mds_session *session,
			u64 ino, u64 cid, int op,
			int caps, int wanted, int dirty,
			u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
			u64 size, u64 max_size,
			struct timespec *mtime, struct timespec *atime,
			u64 time_warp_seq,
			uid_t uid, gid_t gid, umode_t mode,
			u64 xattr_version,
			struct ceph_buffer *xattrs_buf,
			u64 follows)
{
	struct ceph_mds_caps *fc;
	struct ceph_msg *msg;

	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
	     " seq %u/%u mseq %u follows %lld size %llu/%llu"
	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
	     cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
	     ceph_cap_string(dirty),
	     seq, issue_seq, mseq, follows, size, max_size,
	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);

	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
	if (!msg)
		return -ENOMEM;

	msg->hdr.tid = cpu_to_le64(flush_tid);

	fc = msg->front.iov_base;
	memset(fc, 0, sizeof(*fc));

	/* all wire fields are little-endian */
	fc->cap_id = cpu_to_le64(cid);
	fc->op = cpu_to_le32(op);
	fc->seq = cpu_to_le32(seq);
	fc->issue_seq = cpu_to_le32(issue_seq);
	fc->migrate_seq = cpu_to_le32(mseq);
	fc->caps = cpu_to_le32(caps);
	fc->wanted = cpu_to_le32(wanted);
	fc->dirty = cpu_to_le32(dirty);
	fc->ino = cpu_to_le64(ino);
	fc->snap_follows = cpu_to_le64(follows);

	fc->size = cpu_to_le64(size);
	fc->max_size = cpu_to_le64(max_size);
	if (mtime)
		ceph_encode_timespec(&fc->mtime, mtime);
	if (atime)
		ceph_encode_timespec(&fc->atime, atime);
	fc->time_warp_seq = cpu_to_le32(time_warp_seq);

	fc->uid = cpu_to_le32(uid);
	fc->gid = cpu_to_le32(gid);
	fc->mode = cpu_to_le32(mode);

	/* xattr blob (if any) rides in the message middle section */
	fc->xattr_version = cpu_to_le64(xattr_version);
	if (xattrs_buf) {
		msg->middle = ceph_buffer_get(xattrs_buf);
		fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
		msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
	}

	ceph_con_send(&session->s_con, msg);
	return 0;
}
-
/*
 * Append one cap release record to the session's current (partially
 * filled) release message; once the message holds
 * CEPH_CAPS_PER_RELEASE records it is moved to the "done" list for
 * sending.  The caller must have ensured room exists
 * (s_num_cap_releases > 0; BUG otherwise).
 */
static void __queue_cap_release(struct ceph_mds_session *session,
				u64 ino, u64 cap_id, u32 migrate_seq,
				u32 issue_seq)
{
	struct ceph_msg *msg;
	struct ceph_mds_cap_release *head;
	struct ceph_mds_cap_item *item;

	spin_lock(&session->s_cap_lock);
	BUG_ON(!session->s_num_cap_releases);
	msg = list_first_entry(&session->s_cap_releases,
			       struct ceph_msg, list_head);

	dout(" adding %llx release to mds%d msg %p (%d left)\n",
	     ino, session->s_mds, msg, session->s_num_cap_releases);

	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
	/* bump the record count in the message head ... */
	head = msg->front.iov_base;
	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
	/* ... and append the new item at the current end */
	item = msg->front.iov_base + msg->front.iov_len;
	item->ino = cpu_to_le64(ino);
	item->cap_id = cpu_to_le64(cap_id);
	item->migrate_seq = cpu_to_le32(migrate_seq);
	item->seq = cpu_to_le32(issue_seq);

	session->s_num_cap_releases--;

	msg->front.iov_len += sizeof(*item);
	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
		dout(" release msg %p full\n", msg);
		list_move_tail(&msg->list_head, &session->s_cap_releases_done);
	} else {
		dout(" release msg %p at %d/%d (%d)\n", msg,
		     (int)le32_to_cpu(head->num),
		     (int)CEPH_CAPS_PER_RELEASE,
		     (int)msg->front.iov_len);
	}
	spin_unlock(&session->s_cap_lock);
}
-
/*
 * Queue cap releases when an inode is dropped from our cache.  Since
 * inode is about to be destroyed, there is no need for i_ceph_lock.
 */
void ceph_queue_caps_release(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct rb_node *p;

	p = rb_first(&ci->i_caps);
	while (p) {
		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
		struct ceph_mds_session *session = cap->session;

		__queue_cap_release(session, ceph_ino(inode), cap->cap_id,
				    cap->mseq, cap->issue_seq);
		/* advance before __ceph_remove_cap unlinks (and may free) cap */
		p = rb_next(p);
		__ceph_remove_cap(cap);
	}
}
-
-/*
- * Send a cap msg on the given inode. Update our caps state, then
- * drop i_ceph_lock and send the message.
- *
- * Make note of max_size reported/requested from mds, revoked caps
- * that have now been implemented.
- *
- * Make half-hearted attempt ot to invalidate page cache if we are
- * dropping RDCACHE. Note that this will leave behind locked pages
- * that we'll then need to deal with elsewhere.
- *
- * Return non-zero if delayed release, or we experienced an error
- * such that the caller should requeue + retry later.
- *
- * called with i_ceph_lock, then drops it.
- * caller should hold snap_rwsem (read), s_mutex.
- */
-static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
- int op, int used, int want, int retain, int flushing,
- unsigned *pflush_tid)
- __releases(cap->ci->i_ceph_lock)
-{
- struct ceph_inode_info *ci = cap->ci;
- struct inode *inode = &ci->vfs_inode;
- u64 cap_id = cap->cap_id;
- int held, revoking, dropping, keep;
- u64 seq, issue_seq, mseq, time_warp_seq, follows;
- u64 size, max_size;
- struct timespec mtime, atime;
- int wake = 0;
- umode_t mode;
- uid_t uid;
- gid_t gid;
- struct ceph_mds_session *session;
- u64 xattr_version = 0;
- struct ceph_buffer *xattr_blob = NULL;
- int delayed = 0;
- u64 flush_tid = 0;
- int i;
- int ret;
-
- held = cap->issued | cap->implemented;
- revoking = cap->implemented & ~cap->issued;
- retain &= ~revoking;
- dropping = cap->issued & ~retain;
-
- dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
- inode, cap, cap->session,
- ceph_cap_string(held), ceph_cap_string(held & retain),
- ceph_cap_string(revoking));
- BUG_ON((retain & CEPH_CAP_PIN) == 0);
-
- session = cap->session;
-
- /* don't release wanted unless we've waited a bit. */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_min)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- want |= cap->mds_wanted;
- retain |= cap->issued;
- delayed = 1;
- }
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
-
- cap->issued &= retain; /* drop bits we don't want */
- if (cap->implemented & ~cap->issued) {
- /*
- * Wake up any waiters on wanted -> needed transition.
- * This is due to the weird transition from buffered
- * to sync IO... we need to flush dirty pages _before_
- * allowing sync writes to avoid reordering.
- */
- wake = 1;
- }
- cap->implemented &= cap->issued | used;
- cap->mds_wanted = want;
-
- if (flushing) {
- /*
- * assign a tid for flush operations so we can avoid
- * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
- * clean type races. track latest tid for every bit
- * so we can handle flush AxFw, flush Fw, and have the
- * first ack clean Ax.
- */
- flush_tid = ++ci->i_cap_flush_last_tid;
- if (pflush_tid)
- *pflush_tid = flush_tid;
- dout(" cap_flush_tid %d\n", (int)flush_tid);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if (flushing & (1 << i))
- ci->i_cap_flush_tid[i] = flush_tid;
-
- follows = ci->i_head_snapc->seq;
- } else {
- follows = 0;
- }
-
- keep = cap->implemented;
- seq = cap->seq;
- issue_seq = cap->issue_seq;
- mseq = cap->mseq;
- size = inode->i_size;
- ci->i_reported_size = size;
- max_size = ci->i_wanted_max_size;
- ci->i_requested_max_size = max_size;
- mtime = inode->i_mtime;
- atime = inode->i_atime;
- time_warp_seq = ci->i_time_warp_seq;
- uid = inode->i_uid;
- gid = inode->i_gid;
- mode = inode->i_mode;
-
- if (flushing & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
- xattr_blob = ci->i_xattrs.blob;
- xattr_version = ci->i_xattrs.version;
- }
-
- spin_unlock(&ci->i_ceph_lock);
-
- ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
- op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
- size, max_size, &mtime, &atime, time_warp_seq,
- uid, gid, mode, xattr_version, xattr_blob,
- follows);
- if (ret < 0) {
- dout("error sending cap msg, must requeue %p\n", inode);
- delayed = 1;
- }
-
- if (wake)
- wake_up_all(&ci->i_cap_wq);
-
- return delayed;
-}
-
-/*
- * When a snapshot is taken, clients accumulate dirty metadata on
- * inodes with capabilities in ceph_cap_snaps to describe the file
- * state at the time the snapshot was taken. This must be flushed
- * asynchronously back to the MDS once sync writes complete and dirty
- * data is written out.
- *
- * Unless @again is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
- * Called under i_ceph_lock. Takes s_mutex as needed.
- */
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int again)
- __releases(ci->i_ceph_lock)
- __acquires(ci->i_ceph_lock)
-{
- struct inode *inode = &ci->vfs_inode;
- int mds;
- struct ceph_cap_snap *capsnap;
- u32 mseq;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
- session->s_mutex */
- u64 next_follows = 0; /* keep track of how far we've gotten through the
- i_cap_snaps list, and skip these entries next time
- around to avoid an infinite loop */
-
- if (psession)
- session = *psession;
-
- dout("__flush_snaps %p\n", inode);
-retry:
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- /* avoid an infiniute loop after retry */
- if (capsnap->follows < next_follows)
- continue;
- /*
- * we need to wait for sync writes to complete and for dirty
- * pages to be written out.
- */
- if (capsnap->dirty_pages || capsnap->writing)
- break;
-
- /*
- * if cap writeback already occurred, we should have dropped
- * the capsnap in ceph_put_wrbuffer_cap_refs.
- */
- BUG_ON(capsnap->dirty == 0);
-
- /* pick mds, take s_mutex */
- if (ci->i_auth_cap == NULL) {
- dout("no auth cap (migrating?), doing nothing\n");
- goto out;
- }
-
- /* only flush each capsnap once */
- if (!again && !list_empty(&capsnap->flushing_item)) {
- dout("already flushed %p, skipping\n", capsnap);
- continue;
- }
-
- mds = ci->i_auth_cap->session->s_mds;
- mseq = ci->i_auth_cap->mseq;
-
- if (session && session->s_mds != mds) {
- dout("oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- session = NULL;
- }
- if (!session) {
- spin_unlock(&ci->i_ceph_lock);
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (session) {
- dout("inverting session/ino locks on %p\n",
- session);
- mutex_lock(&session->s_mutex);
- }
- /*
- * if session == NULL, we raced against a cap
- * deletion or migration. retry, and we'll
- * get a better @mds value next time.
- */
- spin_lock(&ci->i_ceph_lock);
- goto retry;
- }
-
- capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
- atomic_inc(&capsnap->nref);
- if (!list_empty(&capsnap->flushing_item))
- list_del_init(&capsnap->flushing_item);
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
- spin_unlock(&ci->i_ceph_lock);
-
- dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
- inode, capsnap, capsnap->follows, capsnap->flush_tid);
- send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
- capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows);
-
- next_follows = capsnap->follows + 1;
- ceph_put_cap_snap(capsnap);
-
- spin_lock(&ci->i_ceph_lock);
- goto retry;
- }
-
- /* we flushed them all; remove this inode from the queue */
- spin_lock(&mdsc->snap_flush_lock);
- list_del_init(&ci->i_snap_flush_item);
- spin_unlock(&mdsc->snap_flush_lock);
-
-out:
- if (psession)
- *psession = session;
- else if (session) {
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- }
-}
-
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, NULL, 0);
- spin_unlock(&ci->i_ceph_lock);
-}
-
-/*
- * Mark caps dirty. If inode is newly dirty, return the dirty flags.
- * Caller is then responsible for calling __mark_inode_dirty with the
- * returned flags value.
- */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
-{
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct inode *inode = &ci->vfs_inode;
- int was = ci->i_dirty_caps;
- int dirty = 0;
-
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
- ceph_cap_string(mask), ceph_cap_string(was),
- ceph_cap_string(was | mask));
- ci->i_dirty_caps |= mask;
- if (was == 0) {
- if (!ci->i_head_snapc)
- ci->i_head_snapc = ceph_get_snap_context(
- ci->i_snap_realm->cached_context);
- dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
- ci->i_head_snapc);
- BUG_ON(!list_empty(&ci->i_dirty_item));
- spin_lock(&mdsc->cap_dirty_lock);
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
- spin_unlock(&mdsc->cap_dirty_lock);
- if (ci->i_flushing_caps == 0) {
- ihold(inode);
- dirty |= I_DIRTY_SYNC;
- }
- }
- BUG_ON(list_empty(&ci->i_dirty_item));
- if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
- (mask & CEPH_CAP_FILE_BUFFER))
- dirty |= I_DIRTY_DATASYNC;
- __cap_delay_requeue(mdsc, ci);
- return dirty;
-}
-
-/*
- * Add dirty inode to the flushing list. Assigned a seq number so we
- * can wait for caps to flush without starving.
- *
- * Called under i_ceph_lock.
- */
-static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session)
-{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int flushing;
-
- BUG_ON(ci->i_dirty_caps == 0);
- BUG_ON(list_empty(&ci->i_dirty_item));
-
- flushing = ci->i_dirty_caps;
- dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
- ceph_cap_string(flushing),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps | flushing));
- ci->i_flushing_caps |= flushing;
- ci->i_dirty_caps = 0;
- dout(" inode %p now !dirty\n", inode);
-
- spin_lock(&mdsc->cap_dirty_lock);
- list_del_init(&ci->i_dirty_item);
-
- ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
- if (list_empty(&ci->i_flushing_item)) {
- list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- mdsc->num_cap_flushing++;
- dout(" inode %p now flushing seq %lld\n", inode,
- ci->i_cap_flush_seq);
- } else {
- list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) seq %lld\n", inode,
- ci->i_cap_flush_seq);
- }
- spin_unlock(&mdsc->cap_dirty_lock);
-
- return flushing;
-}
-
-/*
- * try to invalidate mapping pages without blocking.
- */
-static int try_nonblocking_invalidate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- u32 invalidating_gen = ci->i_rdcache_gen;
-
- spin_unlock(&ci->i_ceph_lock);
- invalidate_mapping_pages(&inode->i_data, 0, -1);
- spin_lock(&ci->i_ceph_lock);
-
- if (inode->i_data.nrpages == 0 &&
- invalidating_gen == ci->i_rdcache_gen) {
- /* success. */
- dout("try_nonblocking_invalidate %p success\n", inode);
- /* save any racing async invalidate some trouble */
- ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
- return 0;
- }
- dout("try_nonblocking_invalidate %p failed\n", inode);
- return -1;
-}
-
-/*
- * Swiss army knife function to examine currently used and wanted
- * versus held caps. Release, flush, ack revoked caps to mds as
- * appropriate.
- *
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- * cap release further.
- * CHECK_CAPS_AUTHONLY - we should only check the auth cap
- * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
- * further delay.
- */
-void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session)
-{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
- int file_wanted, used;
- int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
- int issued, implemented, want, retain, revoking, flushing = 0;
- int mds = -1; /* keep track of how far we've gone through i_caps list
- to avoid an infinite loop on retry */
- struct rb_node *p;
- int tried_invalidate = 0;
- int delayed = 0, sent = 0, force_requeue = 0, num;
- int queue_invalidate = 0;
- int is_delayed = flags & CHECK_CAPS_NODELAY;
-
- /* if we are unmounting, flush any unused caps immediately. */
- if (mdsc->stopping)
- is_delayed = 1;
-
- spin_lock(&ci->i_ceph_lock);
-
- if (ci->i_ceph_flags & CEPH_I_FLUSH)
- flags |= CHECK_CAPS_FLUSH;
-
- /* flush snaps first time around only */
- if (!list_empty(&ci->i_cap_snaps))
- __ceph_flush_snaps(ci, &session, 0);
- goto retry_locked;
-retry:
- spin_lock(&ci->i_ceph_lock);
-retry_locked:
- file_wanted = __ceph_caps_file_wanted(ci);
- used = __ceph_caps_used(ci);
- want = file_wanted | used;
- issued = __ceph_caps_issued(ci, &implemented);
- revoking = implemented & ~issued;
-
- retain = want | CEPH_CAP_PIN;
- if (!mdsc->stopping && inode->i_nlink > 0) {
- if (want) {
- retain |= CEPH_CAP_ANY; /* be greedy */
- } else {
- retain |= CEPH_CAP_ANY_SHARED;
- /*
- * keep RD only if we didn't have the file open RW,
- * because then the mds would revoke it anyway to
- * journal max_size=0.
- */
- if (ci->i_max_size == 0)
- retain |= CEPH_CAP_ANY_RD;
- }
- }
-
- dout("check_caps %p file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", inode,
- ceph_cap_string(file_wanted),
- ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(issued), ceph_cap_string(revoking),
- ceph_cap_string(retain),
- (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
- (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
-
- /*
- * If we no longer need to hold onto old our caps, and we may
- * have cached pages, but don't want them, then try to invalidate.
- * If we fail, it's because pages are locked.... try again later.
- */
- if ((!is_delayed || mdsc->stopping) &&
- ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
- inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
- (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
- !tried_invalidate) {
- dout("check_caps trying to invalidate on %p\n", inode);
- if (try_nonblocking_invalidate(inode) < 0) {
- if (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO)) {
- dout("check_caps queuing invalidate\n");
- queue_invalidate = 1;
- ci->i_rdcache_revoking = ci->i_rdcache_gen;
- } else {
- dout("check_caps failed to invalidate pages\n");
- /* we failed to invalidate pages. check these
- caps again later. */
- force_requeue = 1;
- __cap_set_timeouts(mdsc, ci);
- }
- }
- tried_invalidate = 1;
- goto retry_locked;
- }
-
- num = 0;
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- num++;
-
- /* avoid looping forever */
- if (mds >= cap->mds ||
- ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
- continue;
-
- /* NOTE: no side-effects allowed, until we take s_mutex */
-
- revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
- cap->mds, cap, ceph_cap_string(cap->issued),
- ceph_cap_string(cap->implemented),
- ceph_cap_string(revoking));
-
- if (cap == ci->i_auth_cap &&
- (cap->issued & CEPH_CAP_FILE_WR)) {
- /* request larger max_size from MDS? */
- if (ci->i_wanted_max_size > ci->i_max_size &&
- ci->i_wanted_max_size > ci->i_requested_max_size) {
- dout("requesting new max_size\n");
- goto ack;
- }
-
- /* approaching file_max? */
- if ((inode->i_size << 1) >= ci->i_max_size &&
- (ci->i_reported_size << 1) < ci->i_max_size) {
- dout("i_size approaching max_size\n");
- goto ack;
- }
- }
- /* flush anything dirty? */
- if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
- ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
- goto ack;
- }
-
- /* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & used) == 0) {
- dout("completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
- }
-
- /* want more caps from mds? */
- if (want & ~(cap->mds_wanted | cap->issued))
- goto ack;
-
- /* things we might delay */
- if ((cap->issued & ~retain) == 0 &&
- cap->mds_wanted == want)
- continue; /* nope, all good */
-
- if (is_delayed)
- goto ack;
-
- /* delay? */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_max)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- delayed++;
- continue;
- }
-
-ack:
- if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
- dout(" skipping %p I_NOFLUSH set\n", inode);
- continue;
- }
-
- if (session && session != cap->session) {
- dout("oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
- session = NULL;
- }
- if (!session) {
- session = cap->session;
- if (mutex_trylock(&session->s_mutex) == 0) {
- dout("inverting session/ino locks on %p\n",
- session);
- spin_unlock(&ci->i_ceph_lock);
- if (took_snap_rwsem) {
- up_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 0;
- }
- mutex_lock(&session->s_mutex);
- goto retry;
- }
- }
- /* take snap_rwsem after session mutex */
- if (!took_snap_rwsem) {
- if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
- dout("inverting snap/in locks on %p\n",
- inode);
- spin_unlock(&ci->i_ceph_lock);
- down_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 1;
- goto retry;
- }
- took_snap_rwsem = 1;
- }
-
- if (cap == ci->i_auth_cap && ci->i_dirty_caps)
- flushing = __mark_caps_flushing(inode, session);
- else
- flushing = 0;
-
- mds = cap->mds; /* remember mds, so we don't repeat */
- sent++;
-
- /* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
- retain, flushing, NULL);
- goto retry; /* retake i_ceph_lock and restart our cap scan. */
- }
-
- /*
- * Reschedule delayed caps release if we delayed anything,
- * otherwise cancel.
- */
- if (delayed && is_delayed)
- force_requeue = 1; /* __send_cap delayed release; requeue */
- if (!delayed && !is_delayed)
- __cap_delay_cancel(mdsc, ci);
- else if (!is_delayed || force_requeue)
- __cap_delay_requeue(mdsc, ci);
-
- spin_unlock(&ci->i_ceph_lock);
-
- if (queue_invalidate)
- ceph_queue_invalidate(inode);
-
- if (session)
- mutex_unlock(&session->s_mutex);
- if (took_snap_rwsem)
- up_read(&mdsc->snap_rwsem);
-}
-
-/*
- * Try to flush dirty caps back to the auth mds.
- */
-static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
- unsigned *flush_tid)
-{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int unlock_session = session ? 0 : 1;
- int flushing = 0;
-
-retry:
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
- dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
- goto out;
- }
- if (ci->i_dirty_caps && ci->i_auth_cap) {
- struct ceph_cap *cap = ci->i_auth_cap;
- int used = __ceph_caps_used(ci);
- int want = __ceph_caps_wanted(ci);
- int delayed;
-
- if (!session) {
- spin_unlock(&ci->i_ceph_lock);
- session = cap->session;
- mutex_lock(&session->s_mutex);
- goto retry;
- }
- BUG_ON(session != cap->session);
- if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
- goto out;
-
- flushing = __mark_caps_flushing(inode, session);
-
- /* __send_cap drops i_ceph_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
- cap->issued | cap->implemented, flushing,
- flush_tid);
- if (!delayed)
- goto out_unlocked;
-
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- }
-out:
- spin_unlock(&ci->i_ceph_lock);
-out_unlocked:
- if (session && unlock_session)
- mutex_unlock(&session->s_mutex);
- return flushing;
-}
-
-/*
- * Return true if we've flushed caps through the given flush_tid.
- */
-static int caps_are_flushed(struct inode *inode, unsigned tid)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int i, ret = 1;
-
- spin_lock(&ci->i_ceph_lock);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((ci->i_flushing_caps & (1 << i)) &&
- ci->i_cap_flush_tid[i] <= tid) {
- /* still flushing this bit */
- ret = 0;
- break;
- }
- spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-
-/*
- * Wait on any unsafe replies for the given inode. First wait on the
- * newest request, and make that the upper bound. Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-static void sync_write_wait(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_writes;
- struct ceph_osd_request *req;
- u64 last_tid;
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- /* set upper bound as _last_ entry in chain */
- req = list_entry(head->prev, struct ceph_osd_request,
- r_unsafe_item);
- last_tid = req->r_tid;
-
- do {
- ceph_osdc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
- dout("sync_write_wait on tid %llu (until %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- spin_lock(&ci->i_unsafe_lock);
- ceph_osdc_put_request(req);
-
- /*
- * from here on look at first entry in chain, since we
- * only want to wait for anything older than last_tid
- */
- if (list_empty(head))
- break;
- req = list_entry(head->next, struct ceph_osd_request,
- r_unsafe_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
-}
-
-int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- struct inode *inode = file->f_mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned flush_tid;
- int ret;
- int dirty;
-
- dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
- sync_write_wait(inode);
-
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret < 0)
- return ret;
- mutex_lock(&inode->i_mutex);
-
- dirty = try_flush_caps(inode, NULL, &flush_tid);
- dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
-
- /*
- * only wait on non-file metadata writeback (the mds
- * can recover size and mtime, so we don't need to
- * wait for that)
- */
- if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
- dout("fsync waiting for flush_tid %u\n", flush_tid);
- ret = wait_event_interruptible(ci->i_cap_wq,
- caps_are_flushed(inode, flush_tid));
- }
-
- dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
-/*
- * Flush any dirty caps back to the mds. If we aren't asked to wait,
- * queue inode for flush but don't do so immediately, because we can
- * get by with fewer MDS messages if we wait for data writeback to
- * complete first.
- */
-int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned flush_tid;
- int err = 0;
- int dirty;
- int wait = wbc->sync_mode == WB_SYNC_ALL;
-
- dout("write_inode %p wait=%d\n", inode, wait);
- if (wait) {
- dirty = try_flush_caps(inode, NULL, &flush_tid);
- if (dirty)
- err = wait_event_interruptible(ci->i_cap_wq,
- caps_are_flushed(inode, flush_tid));
- } else {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
-
- spin_lock(&ci->i_ceph_lock);
- if (__ceph_caps_dirty(ci))
- __cap_delay_requeue_front(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- return err;
-}
-
-/*
- * After a recovering MDS goes active, we need to resend any caps
- * we were flushing.
- *
- * Caller holds session->s_mutex.
- */
-static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_cap_snap *capsnap;
-
- dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
- list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
- flushing_item) {
- struct ceph_inode_info *ci = capsnap->ci;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
- cap, capsnap);
- __ceph_flush_snaps(ci, &session, 1);
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- }
- spin_unlock(&ci->i_ceph_lock);
- }
-}
-
-void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_inode_info *ci;
-
- kick_flushing_capsnaps(mdsc, session);
-
- dout("kick_flushing_caps mds%d\n", session->s_mds);
- list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
- int delayed = 0;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p %s\n", inode,
- cap, ceph_cap_string(ci->i_flushing_caps));
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- spin_unlock(&ci->i_ceph_lock);
- }
- }
-}
-
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
- int delayed = 0;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
- ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
- __ceph_flush_snaps(ci, &session, 1);
- if (ci->i_flushing_caps) {
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else {
- spin_unlock(&ci->i_ceph_lock);
- }
-}
-
-
-/*
- * Take references to capabilities we hold, so that we don't release
- * them to the MDS prematurely.
- *
- * Protected by i_ceph_lock.
- */
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
-{
- if (got & CEPH_CAP_PIN)
- ci->i_pin_ref++;
- if (got & CEPH_CAP_FILE_RD)
- ci->i_rd_ref++;
- if (got & CEPH_CAP_FILE_CACHE)
- ci->i_rdcache_ref++;
- if (got & CEPH_CAP_FILE_WR)
- ci->i_wr_ref++;
- if (got & CEPH_CAP_FILE_BUFFER) {
- if (ci->i_wb_ref == 0)
- ihold(&ci->vfs_inode);
- ci->i_wb_ref++;
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
- &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
- }
-}
-
-/*
- * Try to grab cap references. Specify those refs we @want, and the
- * minimal set we @need. Also include the larger offset we are writing
- * to (when applicable), and check against max_size here as well.
- * Note that caller is responsible for ensuring max_size increases are
- * requested from the MDS.
- */
-static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff, int *check_max, int *err)
-{
- struct inode *inode = &ci->vfs_inode;
- int ret = 0;
- int have, implemented;
- int file_wanted;
-
- dout("get_cap_refs %p need %s want %s\n", inode,
- ceph_cap_string(need), ceph_cap_string(want));
- spin_lock(&ci->i_ceph_lock);
-
- /* make sure file is actually open */
- file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) == 0) {
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
- ceph_cap_string(need), ceph_cap_string(file_wanted));
- *err = -EBADF;
- ret = 1;
- goto out;
- }
-
- if (need & CEPH_CAP_FILE_WR) {
- if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
- dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
- inode, endoff, ci->i_max_size);
- if (endoff > ci->i_wanted_max_size) {
- *check_max = 1;
- ret = 1;
- }
- goto out;
- }
- /*
- * If a sync write is in progress, we must wait, so that we
- * can get a final snapshot value for size+mtime.
- */
- if (__ceph_have_pending_cap_snap(ci)) {
- dout("get_cap_refs %p cap_snap_pending\n", inode);
- goto out;
- }
- }
- have = __ceph_caps_issued(ci, &implemented);
-
- /*
- * disallow writes while a truncate is pending
- */
- if (ci->i_truncate_pending)
- have &= ~CEPH_CAP_FILE_WR;
-
- if ((have & need) == need) {
- /*
- * Look at (implemented & ~have & not) so that we keep waiting
- * on transition from wanted -> needed caps. This is needed
- * for WRBUFFER|WR -> WR to avoid a new WR sync write from
- * going before a prior buffered writeback happens.
- */
- int not = want & ~(have & need);
- int revoking = implemented & ~have;
- dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
- inode, ceph_cap_string(have), ceph_cap_string(not),
- ceph_cap_string(revoking));
- if ((revoking & not) == 0) {
- *got = need | (have & want);
- __take_cap_refs(ci, *got);
- ret = 1;
- }
- } else {
- dout("get_cap_refs %p have %s needed %s\n", inode,
- ceph_cap_string(have), ceph_cap_string(need));
- }
-out:
- spin_unlock(&ci->i_ceph_lock);
- dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(*got));
- return ret;
-}
-
-/*
- * Check the offset we are writing up to against our current
- * max_size. If necessary, tell the MDS we want to write to
- * a larger offset.
- */
-static void check_max_size(struct inode *inode, loff_t endoff)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int check = 0;
-
- /* do we need to explicitly request a larger max_size? */
- spin_lock(&ci->i_ceph_lock);
- if ((endoff >= ci->i_max_size ||
- endoff > (inode->i_size << 1)) &&
- endoff > ci->i_wanted_max_size) {
- dout("write %p at large endoff %llu, req max_size\n",
- inode, endoff);
- ci->i_wanted_max_size = endoff;
- check = 1;
- }
- spin_unlock(&ci->i_ceph_lock);
- if (check)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
-}
-
-/*
- * Wait for caps, and take cap references. If we can't get a WR cap
- * due to a small max_size, make sure we check_max_size (and possibly
- * ask the mds) so we don't get hung up indefinitely.
- */
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
- loff_t endoff)
-{
- int check_max, ret, err;
-
-retry:
- if (endoff > 0)
- check_max_size(&ci->vfs_inode, endoff);
- check_max = 0;
- err = 0;
- ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want,
- got, endoff,
- &check_max, &err));
- if (err)
- ret = err;
- if (check_max)
- goto retry;
- return ret;
-}
-
-/*
- * Take cap refs. Caller must already know we hold at least one ref
- * on the caps in question or we don't know this is safe.
- */
-void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
-{
- spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps);
- spin_unlock(&ci->i_ceph_lock);
-}
-
-/*
- * Release cap refs.
- *
- * If we released the last ref on any given cap, call ceph_check_caps
- * to release (or schedule a release).
- *
- * If we are releasing a WR cap (from a sync write), finalize any affected
- * cap_snap, and wake up any waiters.
- */
-void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
-{
- struct inode *inode = &ci->vfs_inode;
- int last = 0, put = 0, flushsnaps = 0, wake = 0;
- struct ceph_cap_snap *capsnap;
-
- spin_lock(&ci->i_ceph_lock);
- if (had & CEPH_CAP_PIN)
- --ci->i_pin_ref;
- if (had & CEPH_CAP_FILE_RD)
- if (--ci->i_rd_ref == 0)
- last++;
- if (had & CEPH_CAP_FILE_CACHE)
- if (--ci->i_rdcache_ref == 0)
- last++;
- if (had & CEPH_CAP_FILE_BUFFER) {
- if (--ci->i_wb_ref == 0) {
- last++;
- put++;
- }
- dout("put_cap_refs %p wb %d -> %d (?)\n",
- inode, ci->i_wb_ref+1, ci->i_wb_ref);
- }
- if (had & CEPH_CAP_FILE_WR)
- if (--ci->i_wr_ref == 0) {
- last++;
- if (!list_empty(&ci->i_cap_snaps)) {
- capsnap = list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap,
- ci_item);
- if (capsnap->writing) {
- capsnap->writing = 0;
- flushsnaps =
- __ceph_finish_cap_snap(ci,
- capsnap);
- wake = 1;
- }
- }
- }
- spin_unlock(&ci->i_ceph_lock);
-
- dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
- last ? " last" : "", put ? " put" : "");
-
- if (last && !flushsnaps)
- ceph_check_caps(ci, 0, NULL);
- else if (flushsnaps)
- ceph_flush_snaps(ci);
- if (wake)
- wake_up_all(&ci->i_cap_wq);
- if (put)
- iput(inode);
-}
-
-/*
- * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
- * context. Adjust per-snap dirty page accounting as appropriate.
- * Once all dirty data for a cap_snap is flushed, flush snapped file
- * metadata back to the MDS. If we dropped the last ref, call
- * ceph_check_caps.
- */
-void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
- struct ceph_snap_context *snapc)
-{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
- int complete_capsnap = 0;
- int drop_capsnap = 0;
- int found = 0;
- struct ceph_cap_snap *capsnap = NULL;
-
- spin_lock(&ci->i_ceph_lock);
- ci->i_wrbuffer_ref -= nr;
- last = !ci->i_wrbuffer_ref;
-
- if (ci->i_head_snapc == snapc) {
- ci->i_wrbuffer_ref_head -= nr;
- if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
- BUG_ON(!ci->i_head_snapc);
- ceph_put_snap_context(ci->i_head_snapc);
- ci->i_head_snapc = NULL;
- }
- dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
- inode,
- ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- last ? " LAST" : "");
- } else {
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->context == snapc) {
- found = 1;
- break;
- }
- }
- BUG_ON(!found);
- capsnap->dirty_pages -= nr;
- if (capsnap->dirty_pages == 0) {
- complete_capsnap = 1;
- if (capsnap->dirty == 0)
- /* cap writeback completed before we created
- * the cap_snap; no FLUSHSNAP is needed */
- drop_capsnap = 1;
- }
- dout("put_wrbuffer_cap_refs on %p cap_snap %p "
- " snap %lld %d/%d -> %d/%d %s%s%s\n",
- inode, capsnap, capsnap->context->seq,
- ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
- ci->i_wrbuffer_ref, capsnap->dirty_pages,
- last ? " (wrbuffer last)" : "",
- complete_capsnap ? " (complete capsnap)" : "",
- drop_capsnap ? " (drop capsnap)" : "");
- if (drop_capsnap) {
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- }
- }
-
- spin_unlock(&ci->i_ceph_lock);
-
- if (last) {
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- iput(inode);
- } else if (complete_capsnap) {
- ceph_flush_snaps(ci);
- wake_up_all(&ci->i_cap_wq);
- }
- if (drop_capsnap)
- iput(inode);
-}
-
-/*
- * Handle a cap GRANT message from the MDS. (Note that a GRANT may
- * actually be a revocation if it specifies a smaller cap set.)
- *
- * caller holds s_mutex and i_ceph_lock, we drop both.
- *
- * return value:
- * 0 - ok
- * 1 - check_caps on auth cap only (writeback)
- * 2 - check_caps (ack revoke)
- */
-static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
- struct ceph_mds_session *session,
- struct ceph_cap *cap,
- struct ceph_buffer *xattr_buf)
- __releases(ci->i_ceph_lock)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int mds = session->s_mds;
- int seq = le32_to_cpu(grant->seq);
- int newcaps = le32_to_cpu(grant->caps);
- int issued, implemented, used, wanted, dirty;
- u64 size = le64_to_cpu(grant->size);
- u64 max_size = le64_to_cpu(grant->max_size);
- struct timespec mtime, atime, ctime;
- int check_caps = 0;
- int wake = 0;
- int writeback = 0;
- int revoked_rdcache = 0;
- int queue_invalidate = 0;
-
- dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
- inode, cap, mds, seq, ceph_cap_string(newcaps));
- dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
- inode->i_size);
-
- /*
- * If CACHE is being revoked, and we have no dirty buffers,
- * try to invalidate (once). (If there are dirty buffers, we
- * will invalidate _after_ writeback.)
- */
- if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- !ci->i_wrbuffer_ref) {
- if (try_nonblocking_invalidate(inode) == 0) {
- revoked_rdcache = 1;
- } else {
- /* there were locked pages.. invalidate later
- in a separate thread. */
- if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- queue_invalidate = 1;
- ci->i_rdcache_revoking = ci->i_rdcache_gen;
- }
- }
- }
-
- /* side effects now are allowed */
-
- issued = __ceph_caps_issued(ci, &implemented);
- issued |= implemented | __ceph_caps_dirty(ci);
-
- cap->cap_gen = session->s_cap_gen;
-
- __check_cap_issue(ci, cap, newcaps);
-
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(grant->mode);
- inode->i_uid = le32_to_cpu(grant->uid);
- inode->i_gid = le32_to_cpu(grant->gid);
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
- }
-
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
- set_nlink(inode, le32_to_cpu(grant->nlink));
-
- if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
- int len = le32_to_cpu(grant->xattr_len);
- u64 version = le64_to_cpu(grant->xattr_version);
-
- if (version > ci->i_xattrs.version) {
- dout(" got new xattrs v%llu on %p len %d\n",
- version, inode, len);
- if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
- ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
- ci->i_xattrs.version = version;
- }
- }
-
- /* size/ctime/mtime/atime? */
- ceph_fill_file_size(inode, issued,
- le32_to_cpu(grant->truncate_seq),
- le64_to_cpu(grant->truncate_size), size);
- ceph_decode_timespec(&mtime, &grant->mtime);
- ceph_decode_timespec(&atime, &grant->atime);
- ceph_decode_timespec(&ctime, &grant->ctime);
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
- &atime);
-
- /* max size increase? */
- if (max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
- ci->i_max_size = max_size;
- if (max_size >= ci->i_wanted_max_size) {
- ci->i_wanted_max_size = 0; /* reset */
- ci->i_requested_max_size = 0;
- }
- wake = 1;
- }
-
- /* check cap bits */
- wanted = __ceph_caps_wanted(ci);
- used = __ceph_caps_used(ci);
- dirty = __ceph_caps_dirty(ci);
- dout(" my wanted = %s, used = %s, dirty %s\n",
- ceph_cap_string(wanted),
- ceph_cap_string(used),
- ceph_cap_string(dirty));
- if (wanted != le32_to_cpu(grant->wanted)) {
- dout("mds wanted %s -> %s\n",
- ceph_cap_string(le32_to_cpu(grant->wanted)),
- ceph_cap_string(wanted));
- grant->wanted = cpu_to_le32(wanted);
- }
-
- cap->seq = seq;
-
- /* file layout may have changed */
- ci->i_layout = grant->layout;
-
- /* revocation, grant, or no-op? */
- if (cap->issued & ~newcaps) {
- int revoking = cap->issued & ~newcaps;
-
- dout("revocation: %s -> %s (revoking %s)\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps),
- ceph_cap_string(revoking));
- if (revoking & used & CEPH_CAP_FILE_BUFFER)
- writeback = 1; /* initiate writeback; will delay ack */
- else if (revoking == CEPH_CAP_FILE_CACHE &&
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- queue_invalidate)
- ; /* do nothing yet, invalidation will be queued */
- else if (cap == ci->i_auth_cap)
- check_caps = 1; /* check auth cap only */
- else
- check_caps = 2; /* check all caps */
- cap->issued = newcaps;
- cap->implemented |= newcaps;
- } else if (cap->issued == newcaps) {
- dout("caps unchanged: %s -> %s\n",
- ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
- } else {
- dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps));
- cap->issued = newcaps;
- cap->implemented |= newcaps; /* add bits only, to
- * avoid stepping on a
- * pending revocation */
- wake = 1;
- }
- BUG_ON(cap->issued & ~cap->implemented);
-
- spin_unlock(&ci->i_ceph_lock);
- if (writeback)
- /*
- * queue inode for writeback: we can't actually call
- * filemap_write_and_wait, etc. from message handler
- * context.
- */
- ceph_queue_writeback(inode);
- if (queue_invalidate)
- ceph_queue_invalidate(inode);
- if (wake)
- wake_up_all(&ci->i_cap_wq);
-
- if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
- session);
- else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
- else
- mutex_unlock(&session->s_mutex);
-}
-
-/*
- * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
- * MDS has been safely committed.
- */
-static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
- struct ceph_mds_caps *m,
- struct ceph_mds_session *session,
- struct ceph_cap *cap)
- __releases(ci->i_ceph_lock)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
- unsigned seq = le32_to_cpu(m->seq);
- int dirty = le32_to_cpu(m->dirty);
- int cleaned = 0;
- int drop = 0;
- int i;
-
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((dirty & (1 << i)) &&
- flush_tid == ci->i_cap_flush_tid[i])
- cleaned |= 1 << i;
-
- dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
- " flushing %s -> %s\n",
- inode, session->s_mds, seq, ceph_cap_string(dirty),
- ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps & ~cleaned));
-
- if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
- goto out;
-
- ci->i_flushing_caps &= ~cleaned;
-
- spin_lock(&mdsc->cap_dirty_lock);
- if (ci->i_flushing_caps == 0) {
- list_del_init(&ci->i_flushing_item);
- if (!list_empty(&session->s_cap_flushing))
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
- mdsc->num_cap_flushing--;
- wake_up_all(&mdsc->cap_flushing_wq);
- dout(" inode %p now !flushing\n", inode);
-
- if (ci->i_dirty_caps == 0) {
- dout(" inode %p now clean\n", inode);
- BUG_ON(!list_empty(&ci->i_dirty_item));
- drop = 1;
- if (ci->i_wrbuffer_ref_head == 0) {
- BUG_ON(!ci->i_head_snapc);
- ceph_put_snap_context(ci->i_head_snapc);
- ci->i_head_snapc = NULL;
- }
- } else {
- BUG_ON(list_empty(&ci->i_dirty_item));
- }
- }
- spin_unlock(&mdsc->cap_dirty_lock);
- wake_up_all(&ci->i_cap_wq);
-
-out:
- spin_unlock(&ci->i_ceph_lock);
- if (drop)
- iput(inode);
-}
-
-/*
- * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
- * throw away our cap_snap.
- *
- * Caller hold s_mutex.
- */
-static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
- struct ceph_mds_caps *m,
- struct ceph_mds_session *session)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- u64 follows = le64_to_cpu(m->snap_follows);
- struct ceph_cap_snap *capsnap;
- int drop = 0;
-
- dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
- inode, ci, session->s_mds, follows);
-
- spin_lock(&ci->i_ceph_lock);
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->follows == follows) {
- if (capsnap->flush_tid != flush_tid) {
- dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", capsnap, follows,
- flush_tid, capsnap->flush_tid);
- break;
- }
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
- dout(" removing %p cap_snap %p follows %lld\n",
- inode, capsnap, follows);
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- drop = 1;
- break;
- } else {
- dout(" skipping cap_snap %p follows %lld\n",
- capsnap, capsnap->follows);
- }
- }
- spin_unlock(&ci->i_ceph_lock);
- if (drop)
- iput(inode);
-}
-
-/*
- * Handle TRUNC from MDS, indicating file truncation.
- *
- * caller hold s_mutex.
- */
-static void handle_cap_trunc(struct inode *inode,
- struct ceph_mds_caps *trunc,
- struct ceph_mds_session *session)
- __releases(ci->i_ceph_lock)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int mds = session->s_mds;
- int seq = le32_to_cpu(trunc->seq);
- u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
- u64 truncate_size = le64_to_cpu(trunc->truncate_size);
- u64 size = le64_to_cpu(trunc->size);
- int implemented = 0;
- int dirty = __ceph_caps_dirty(ci);
- int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
- int queue_trunc = 0;
-
- issued |= implemented | dirty;
-
- dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
- inode, mds, seq, truncate_size, truncate_seq);
- queue_trunc = ceph_fill_file_size(inode, issued,
- truncate_seq, truncate_size, size);
- spin_unlock(&ci->i_ceph_lock);
-
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
-}
-
-/*
- * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
- * different one. If we are the most recent migration we've seen (as
- * indicated by mseq), make note of the migrating cap bits for the
- * duration (until we see the corresponding IMPORT).
- *
- * caller holds s_mutex
- */
-static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
- struct ceph_mds_session *session,
- int *open_target_sessions)
-{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int mds = session->s_mds;
- unsigned mseq = le32_to_cpu(ex->migrate_seq);
- struct ceph_cap *cap = NULL, *t;
- struct rb_node *p;
- int remember = 1;
-
- dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
-
- spin_lock(&ci->i_ceph_lock);
-
- /* make sure we haven't seen a higher mseq */
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- t = rb_entry(p, struct ceph_cap, ci_node);
- if (ceph_seq_cmp(t->mseq, mseq) > 0) {
- dout(" higher mseq on cap from mds%d\n",
- t->session->s_mds);
- remember = 0;
- }
- if (t->session->s_mds == mds)
- cap = t;
- }
-
- if (cap) {
- if (remember) {
- /* make note */
- ci->i_cap_exporting_mds = mds;
- ci->i_cap_exporting_mseq = mseq;
- ci->i_cap_exporting_issued = cap->issued;
-
- /*
- * make sure we have open sessions with all possible
- * export targets, so that we get the matching IMPORT
- */
- *open_target_sessions = 1;
-
- /*
- * we can't flush dirty caps that we've seen the
- * EXPORT but no IMPORT for
- */
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p to cap_dirty_migrating\n",
- inode);
- list_move(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
- }
- spin_unlock(&mdsc->cap_dirty_lock);
- }
- __ceph_remove_cap(cap);
- }
- /* else, we already released it */
-
- spin_unlock(&ci->i_ceph_lock);
-}
-
-/*
- * Handle cap IMPORT. If there are temp bits from an older EXPORT,
- * clean them up.
- *
- * caller holds s_mutex.
- */
-static void handle_cap_import(struct ceph_mds_client *mdsc,
- struct inode *inode, struct ceph_mds_caps *im,
- struct ceph_mds_session *session,
- void *snaptrace, int snaptrace_len)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int mds = session->s_mds;
- unsigned issued = le32_to_cpu(im->caps);
- unsigned wanted = le32_to_cpu(im->wanted);
- unsigned seq = le32_to_cpu(im->seq);
- unsigned mseq = le32_to_cpu(im->migrate_seq);
- u64 realmino = le64_to_cpu(im->realm);
- u64 cap_id = le64_to_cpu(im->cap_id);
-
- if (ci->i_cap_exporting_mds >= 0 &&
- ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d"
- " - cleared exporting from mds%d\n",
- inode, ci, mds, mseq,
- ci->i_cap_exporting_mds);
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
-
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p back to cap_dirty\n", inode);
- list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
- }
- spin_unlock(&mdsc->cap_dirty_lock);
- } else {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
- }
-
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
- false);
- downgrade_write(&mdsc->snap_rwsem);
- ceph_add_cap(inode, session, cap_id, -1,
- issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
- NULL /* no caps context */);
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
-
- /* make sure we re-request max_size, if necessary */
- spin_lock(&ci->i_ceph_lock);
- ci->i_requested_max_size = 0;
- spin_unlock(&ci->i_ceph_lock);
-}
-
-/*
- * Handle a caps message from the MDS.
- *
- * Identify the appropriate session, inode, and call the right handler
- * based on the cap op.
- */
-void ceph_handle_caps(struct ceph_mds_session *session,
- struct ceph_msg *msg)
-{
- struct ceph_mds_client *mdsc = session->s_mdsc;
- struct super_block *sb = mdsc->fsc->sb;
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_cap *cap;
- struct ceph_mds_caps *h;
- int mds = session->s_mds;
- int op;
- u32 seq, mseq;
- struct ceph_vino vino;
- u64 cap_id;
- u64 size, max_size;
- u64 tid;
- void *snaptrace;
- size_t snaptrace_len;
- void *flock;
- u32 flock_len;
- int open_target_sessions = 0;
-
- dout("handle_caps from mds%d\n", mds);
-
- /* decode */
- tid = le64_to_cpu(msg->hdr.tid);
- if (msg->front.iov_len < sizeof(*h))
- goto bad;
- h = msg->front.iov_base;
- op = le32_to_cpu(h->op);
- vino.ino = le64_to_cpu(h->ino);
- vino.snap = CEPH_NOSNAP;
- cap_id = le64_to_cpu(h->cap_id);
- seq = le32_to_cpu(h->seq);
- mseq = le32_to_cpu(h->migrate_seq);
- size = le64_to_cpu(h->size);
- max_size = le64_to_cpu(h->max_size);
-
- snaptrace = h + 1;
- snaptrace_len = le32_to_cpu(h->snap_trace_len);
-
- if (le16_to_cpu(msg->hdr.version) >= 2) {
- void *p, *end;
-
- p = snaptrace + snaptrace_len;
- end = msg->front.iov_base + msg->front.iov_len;
- ceph_decode_32_safe(&p, end, flock_len, bad);
- flock = p;
- } else {
- flock = NULL;
- flock_len = 0;
- }
-
- mutex_lock(&session->s_mutex);
- session->s_seq++;
- dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
- (unsigned)seq);
-
- /* lookup ino */
- inode = ceph_find_inode(sb, vino);
- ci = ceph_inode(inode);
- dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
- vino.snap, inode);
- if (!inode) {
- dout(" i don't have ino %llx\n", vino.ino);
-
- if (op == CEPH_CAP_OP_IMPORT)
- __queue_cap_release(session, vino.ino, cap_id,
- mseq, seq);
- goto flush_cap_releases;
- }
-
- /* these will work even if we don't have a cap yet */
- switch (op) {
- case CEPH_CAP_OP_FLUSHSNAP_ACK:
- handle_cap_flushsnap_ack(inode, tid, h, session);
- goto done;
-
- case CEPH_CAP_OP_EXPORT:
- handle_cap_export(inode, h, session, &open_target_sessions);
- goto done;
-
- case CEPH_CAP_OP_IMPORT:
- handle_cap_import(mdsc, inode, h, session,
- snaptrace, snaptrace_len);
- ceph_check_caps(ceph_inode(inode), 0, session);
- goto done_unlocked;
- }
-
- /* the rest require a cap */
- spin_lock(&ci->i_ceph_lock);
- cap = __get_cap_for_mds(ceph_inode(inode), mds);
- if (!cap) {
- dout(" no cap on %p ino %llx.%llx from mds%d\n",
- inode, ceph_ino(inode), ceph_snap(inode), mds);
- spin_unlock(&ci->i_ceph_lock);
- goto flush_cap_releases;
- }
-
- /* note that each of these drops i_ceph_lock for us */
- switch (op) {
- case CEPH_CAP_OP_REVOKE:
- case CEPH_CAP_OP_GRANT:
- handle_cap_grant(inode, h, session, cap, msg->middle);
- goto done_unlocked;
-
- case CEPH_CAP_OP_FLUSH_ACK:
- handle_cap_flush_ack(inode, tid, h, session, cap);
- break;
-
- case CEPH_CAP_OP_TRUNC:
- handle_cap_trunc(inode, h, session);
- break;
-
- default:
- spin_unlock(&ci->i_ceph_lock);
- pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
- ceph_cap_op_name(op));
- }
-
- goto done;
-
-flush_cap_releases:
- /*
- * send any full release message to try to move things
- * along for the mds (who clearly thinks we still have this
- * cap).
- */
- ceph_add_cap_releases(mdsc, session);
- ceph_send_cap_releases(mdsc, session);
-
-done:
- mutex_unlock(&session->s_mutex);
-done_unlocked:
- if (inode)
- iput(inode);
- if (open_target_sessions)
- ceph_mdsc_open_export_target_sessions(mdsc, session);
- return;
-
-bad:
- pr_err("ceph_handle_caps: corrupt message\n");
- ceph_msg_dump(msg);
- return;
-}
-
-/*
- * Delayed work handler to process end of delayed cap release LRU list.
- */
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
-{
- struct ceph_inode_info *ci;
- int flags = CHECK_CAPS_NODELAY;
-
- dout("check_delayed_caps\n");
- while (1) {
- spin_lock(&mdsc->cap_delay_lock);
- if (list_empty(&mdsc->cap_delay_list))
- break;
- ci = list_first_entry(&mdsc->cap_delay_list,
- struct ceph_inode_info,
- i_cap_delay_list);
- if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
- time_before(jiffies, ci->i_hold_caps_max))
- break;
- list_del_init(&ci->i_cap_delay_list);
- spin_unlock(&mdsc->cap_delay_lock);
- dout("check_delayed_caps on %p\n", &ci->vfs_inode);
- ceph_check_caps(ci, flags, NULL);
- }
- spin_unlock(&mdsc->cap_delay_lock);
-}
-
-/*
- * Flush all dirty caps to the mds
- */
-void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
-{
- struct ceph_inode_info *ci;
- struct inode *inode;
-
- dout("flush_dirty_caps\n");
- spin_lock(&mdsc->cap_dirty_lock);
- while (!list_empty(&mdsc->cap_dirty)) {
- ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
- i_dirty_item);
- inode = &ci->vfs_inode;
- ihold(inode);
- dout("flush_dirty_caps %p\n", inode);
- spin_unlock(&mdsc->cap_dirty_lock);
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
- iput(inode);
- spin_lock(&mdsc->cap_dirty_lock);
- }
- spin_unlock(&mdsc->cap_dirty_lock);
- dout("flush_dirty_caps done\n");
-}
-
-/*
- * Drop open file reference. If we were the last open file,
- * we may need to release capabilities to the MDS (or schedule
- * their delayed release).
- */
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
-{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
-
- spin_lock(&ci->i_ceph_lock);
- dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
- ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
- BUG_ON(ci->i_nr_by_mode[fmode] == 0);
- if (--ci->i_nr_by_mode[fmode] == 0)
- last++;
- spin_unlock(&ci->i_ceph_lock);
-
- if (last && ci->i_vino.snap == CEPH_NOSNAP)
- ceph_check_caps(ci, 0, NULL);
-}
-
-/*
- * Helpers for embedding cap and dentry lease releases into mds
- * requests.
- *
- * @force is used by dentry_release (below) to force inclusion of a
- * record for the directory inode, even when there aren't any caps to
- * drop.
- */
-int ceph_encode_inode_release(void **p, struct inode *inode,
- int mds, int drop, int unless, int force)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
- struct ceph_mds_request_release *rel = *p;
- int used, dirty;
- int ret = 0;
-
- spin_lock(&ci->i_ceph_lock);
- used = __ceph_caps_used(ci);
- dirty = __ceph_caps_dirty(ci);
-
- dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
- inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
- ceph_cap_string(unless));
-
- /* only drop unused, clean caps */
- drop &= ~(used | dirty);
-
- cap = __get_cap_for_mds(ci, mds);
- if (cap && __cap_is_valid(cap)) {
- if (force ||
- ((cap->issued & drop) &&
- (cap->issued & unless) == 0)) {
- if ((cap->issued & drop) &&
- (cap->issued & unless) == 0) {
- dout("encode_inode_release %p cap %p %s -> "
- "%s\n", inode, cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop));
- cap->issued &= ~drop;
- cap->implemented &= ~drop;
- if (ci->i_ceph_flags & CEPH_I_NODELAY) {
- int wanted = __ceph_caps_wanted(ci);
- dout(" wanted %s -> %s (act %s)\n",
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(cap->mds_wanted &
- ~wanted),
- ceph_cap_string(wanted));
- cap->mds_wanted &= wanted;
- }
- } else {
- dout("encode_inode_release %p cap %p %s"
- " (force)\n", inode, cap,
- ceph_cap_string(cap->issued));
- }
-
- rel->ino = cpu_to_le64(ceph_ino(inode));
- rel->cap_id = cpu_to_le64(cap->cap_id);
- rel->seq = cpu_to_le32(cap->seq);
- rel->issue_seq = cpu_to_le32(cap->issue_seq),
- rel->mseq = cpu_to_le32(cap->mseq);
- rel->caps = cpu_to_le32(cap->issued);
- rel->wanted = cpu_to_le32(cap->mds_wanted);
- rel->dname_len = 0;
- rel->dname_seq = 0;
- *p += sizeof(*rel);
- ret = 1;
- } else {
- dout("encode_inode_release %p cap %p %s\n",
- inode, cap, ceph_cap_string(cap->issued));
- }
- }
- spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-
-int ceph_encode_dentry_release(void **p, struct dentry *dentry,
- int mds, int drop, int unless)
-{
- struct inode *dir = dentry->d_parent->d_inode;
- struct ceph_mds_request_release *rel = *p;
- struct ceph_dentry_info *di = ceph_dentry(dentry);
- int force = 0;
- int ret;
-
- /*
- * force an record for the directory caps if we have a dentry lease.
- * this is racy (can't take i_ceph_lock and d_lock together), but it
- * doesn't have to be perfect; the mds will revoke anything we don't
- * release.
- */
- spin_lock(&dentry->d_lock);
- if (di->lease_session && di->lease_session->s_mds == mds)
- force = 1;
- spin_unlock(&dentry->d_lock);
-
- ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
-
- spin_lock(&dentry->d_lock);
- if (ret && di->lease_session && di->lease_session->s_mds == mds) {
- dout("encode_dentry_release %p mds%d seq %d\n",
- dentry, mds, (int)di->lease_seq);
- rel->dname_len = cpu_to_le32(dentry->d_name.len);
- memcpy(*p, dentry->d_name.name, dentry->d_name.len);
- *p += dentry->d_name.len;
- rel->dname_seq = cpu_to_le32(di->lease_seq);
- __ceph_mdsc_drop_dentry_lease(dentry);
- }
- spin_unlock(&dentry->d_lock);
- return ret;
-}
diff --git a/ANDROID_3.4.5/fs/ceph/ceph_frag.c b/ANDROID_3.4.5/fs/ceph/ceph_frag.c
deleted file mode 100644
index bdce8b1f..00000000
--- a/ANDROID_3.4.5/fs/ceph/ceph_frag.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Ceph 'frag' type
- */
-#include <linux/module.h>
-#include <linux/ceph/types.h>
-
-int ceph_frag_compare(__u32 a, __u32 b)
-{
- unsigned va = ceph_frag_value(a);
- unsigned vb = ceph_frag_value(b);
- if (va < vb)
- return -1;
- if (va > vb)
- return 1;
- va = ceph_frag_bits(a);
- vb = ceph_frag_bits(b);
- if (va < vb)
- return -1;
- if (va > vb)
- return 1;
- return 0;
-}
diff --git a/ANDROID_3.4.5/fs/ceph/debugfs.c b/ANDROID_3.4.5/fs/ceph/debugfs.c
deleted file mode 100644
index fb962efd..00000000
--- a/ANDROID_3.4.5/fs/ceph/debugfs.c
+++ /dev/null
@@ -1,273 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-#include <linux/ceph/libceph.h>
-#include <linux/ceph/mon_client.h>
-#include <linux/ceph/auth.h>
-#include <linux/ceph/debugfs.h>
-
-#include "super.h"
-
-#ifdef CONFIG_DEBUG_FS
-
-#include "mds_client.h"
-
-static int mdsmap_show(struct seq_file *s, void *p)
-{
- int i;
- struct ceph_fs_client *fsc = s->private;
-
- if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
- return 0;
- seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
- seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
- seq_printf(s, "session_timeout %d\n",
- fsc->mdsc->mdsmap->m_session_timeout);
- seq_printf(s, "session_autoclose %d\n",
- fsc->mdsc->mdsmap->m_session_autoclose);
- for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
- struct ceph_entity_addr *addr =
- &fsc->mdsc->mdsmap->m_info[i].addr;
- int state = fsc->mdsc->mdsmap->m_info[i].state;
-
- seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
- ceph_pr_addr(&addr->in_addr),
- ceph_mds_state_name(state));
- }
- return 0;
-}
-
-/*
- * mdsc debugfs
- */
-static int mdsc_show(struct seq_file *s, void *p)
-{
- struct ceph_fs_client *fsc = s->private;
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req;
- struct rb_node *rp;
- int pathlen;
- u64 pathbase;
- char *path;
-
- mutex_lock(&mdsc->mutex);
- for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
- req = rb_entry(rp, struct ceph_mds_request, r_node);
-
- if (req->r_request && req->r_session)
- seq_printf(s, "%lld\tmds%d\t", req->r_tid,
- req->r_session->s_mds);
- else if (!req->r_request)
- seq_printf(s, "%lld\t(no request)\t", req->r_tid);
- else
- seq_printf(s, "%lld\t(no session)\t", req->r_tid);
-
- seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
-
- if (req->r_got_unsafe)
- seq_printf(s, "\t(unsafe)");
- else
- seq_printf(s, "\t");
-
- if (req->r_inode) {
- seq_printf(s, " #%llx", ceph_ino(req->r_inode));
- } else if (req->r_dentry) {
- path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
- &pathbase, 0);
- if (IS_ERR(path))
- path = NULL;
- spin_lock(&req->r_dentry->d_lock);
- seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_dentry->d_parent->d_inode),
- req->r_dentry->d_name.len,
- req->r_dentry->d_name.name,
- path ? path : "");
- spin_unlock(&req->r_dentry->d_lock);
- kfree(path);
- } else if (req->r_path1) {
- seq_printf(s, " #%llx/%s", req->r_ino1.ino,
- req->r_path1);
- }
-
- if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
- &pathbase, 0);
- if (IS_ERR(path))
- path = NULL;
- spin_lock(&req->r_old_dentry->d_lock);
- seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_old_dentry_dir),
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
- path ? path : "");
- spin_unlock(&req->r_old_dentry->d_lock);
- kfree(path);
- } else if (req->r_path2) {
- if (req->r_ino2.ino)
- seq_printf(s, " #%llx/%s", req->r_ino2.ino,
- req->r_path2);
- else
- seq_printf(s, " %s", req->r_path2);
- }
-
- seq_printf(s, "\n");
- }
- mutex_unlock(&mdsc->mutex);
-
- return 0;
-}
-
-static int caps_show(struct seq_file *s, void *p)
-{
- struct ceph_fs_client *fsc = s->private;
- int total, avail, used, reserved, min;
-
- ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
- seq_printf(s, "total\t\t%d\n"
- "avail\t\t%d\n"
- "used\t\t%d\n"
- "reserved\t%d\n"
- "min\t%d\n",
- total, avail, used, reserved, min);
- return 0;
-}
-
-static int dentry_lru_show(struct seq_file *s, void *ptr)
-{
- struct ceph_fs_client *fsc = s->private;
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_dentry_info *di;
-
- spin_lock(&mdsc->dentry_lru_lock);
- list_for_each_entry(di, &mdsc->dentry_lru, lru) {
- struct dentry *dentry = di->dentry;
- seq_printf(s, "%p %p\t%.*s\n",
- di, dentry, dentry->d_name.len, dentry->d_name.name);
- }
- spin_unlock(&mdsc->dentry_lru_lock);
-
- return 0;
-}
-
-CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
-CEPH_DEFINE_SHOW_FUNC(mdsc_show)
-CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
-
-
-/*
- * debugfs
- */
-static int congestion_kb_set(void *data, u64 val)
-{
- struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
-
- fsc->mount_options->congestion_kb = (int)val;
- return 0;
-}
-
-static int congestion_kb_get(void *data, u64 *val)
-{
- struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
-
- *val = (u64)fsc->mount_options->congestion_kb;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
- congestion_kb_set, "%llu\n");
-
-
-void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
-{
- dout("ceph_fs_debugfs_cleanup\n");
- debugfs_remove(fsc->debugfs_bdi);
- debugfs_remove(fsc->debugfs_congestion_kb);
- debugfs_remove(fsc->debugfs_mdsmap);
- debugfs_remove(fsc->debugfs_caps);
- debugfs_remove(fsc->debugfs_mdsc);
- debugfs_remove(fsc->debugfs_dentry_lru);
-}
-
-int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
-{
- char name[100];
- int err = -ENOMEM;
-
- dout("ceph_fs_debugfs_init\n");
- fsc->debugfs_congestion_kb =
- debugfs_create_file("writeback_congestion_kb",
- 0600,
- fsc->client->debugfs_dir,
- fsc,
- &congestion_kb_fops);
- if (!fsc->debugfs_congestion_kb)
- goto out;
-
- snprintf(name, sizeof(name), "../../bdi/%s",
- dev_name(fsc->backing_dev_info.dev));
- fsc->debugfs_bdi =
- debugfs_create_symlink("bdi",
- fsc->client->debugfs_dir,
- name);
- if (!fsc->debugfs_bdi)
- goto out;
-
- fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
- 0600,
- fsc->client->debugfs_dir,
- fsc,
- &mdsmap_show_fops);
- if (!fsc->debugfs_mdsmap)
- goto out;
-
- fsc->debugfs_mdsc = debugfs_create_file("mdsc",
- 0600,
- fsc->client->debugfs_dir,
- fsc,
- &mdsc_show_fops);
- if (!fsc->debugfs_mdsc)
- goto out;
-
- fsc->debugfs_caps = debugfs_create_file("caps",
- 0400,
- fsc->client->debugfs_dir,
- fsc,
- &caps_show_fops);
- if (!fsc->debugfs_caps)
- goto out;
-
- fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0600,
- fsc->client->debugfs_dir,
- fsc,
- &dentry_lru_show_fops);
- if (!fsc->debugfs_dentry_lru)
- goto out;
-
- return 0;
-
-out:
- ceph_fs_debugfs_cleanup(fsc);
- return err;
-}
-
-
-#else /* CONFIG_DEBUG_FS */
-
-int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
-{
- return 0;
-}
-
-void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
-{
-}
-
-#endif /* CONFIG_DEBUG_FS */
diff --git a/ANDROID_3.4.5/fs/ceph/dir.c b/ANDROID_3.4.5/fs/ceph/dir.c
deleted file mode 100644
index 3e8094be..00000000
--- a/ANDROID_3.4.5/fs/ceph/dir.c
+++ /dev/null
@@ -1,1376 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/spinlock.h>
-#include <linux/fs_struct.h>
-#include <linux/namei.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-/*
- * Directory operations: readdir, lookup, create, link, unlink,
- * rename, etc.
- */
-
-/*
- * Ceph MDS operations are specified in terms of a base ino and
- * relative path. Thus, the client can specify an operation on a
- * specific inode (e.g., a getattr due to fstat(2)), or as a path
- * relative to, say, the root directory.
- *
- * Normally, we limit ourselves to strict inode ops (no path component)
- * or dentry operations (a single path component relative to an ino). The
- * exception to this is open_root_dentry(), which will open the mount
- * point by name.
- */
-
-const struct inode_operations ceph_dir_iops;
-const struct file_operations ceph_dir_fops;
-const struct dentry_operations ceph_dentry_ops;
-
-/*
- * Initialize ceph dentry state.
- */
-int ceph_init_dentry(struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
-
- if (dentry->d_fsdata)
- return 0;
-
- di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
- if (!di)
- return -ENOMEM; /* oh well */
-
- spin_lock(&dentry->d_lock);
- if (dentry->d_fsdata) {
- /* lost a race */
- kmem_cache_free(ceph_dentry_cachep, di);
- goto out_unlock;
- }
-
- if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
- ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
- d_set_d_op(dentry, &ceph_dentry_ops);
- else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
- d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
- else
- d_set_d_op(dentry, &ceph_snap_dentry_ops);
-
- di->dentry = dentry;
- di->lease_session = NULL;
- dentry->d_time = jiffies;
- /* avoid reordering d_fsdata setup so that the check above is safe */
- smp_mb();
- dentry->d_fsdata = di;
- ceph_dentry_lru_add(dentry);
-out_unlock:
- spin_unlock(&dentry->d_lock);
- return 0;
-}
-
-struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
-{
- struct inode *inode = NULL;
-
- if (!dentry)
- return NULL;
-
- spin_lock(&dentry->d_lock);
- if (dentry->d_parent) {
- inode = dentry->d_parent->d_inode;
- ihold(inode);
- }
- spin_unlock(&dentry->d_lock);
- return inode;
-}
-
-
-/*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
- */
-static unsigned fpos_frag(loff_t p)
-{
- return p >> 32;
-}
-static unsigned fpos_off(loff_t p)
-{
- return p & 0xffffffff;
-}
-
-/*
- * When possible, we try to satisfy a readdir by peeking at the
- * dcache. We make this work by carefully ordering dentries on
- * d_u.d_child when we initially get results back from the MDS, and
- * falling back to a "normal" sync readdir if any dentries in the dir
- * are dropped.
- *
- * D_COMPLETE tells indicates we have all dentries in the dir. It is
- * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
- * the MDS if/when the directory is modified).
- */
-static int __dcache_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
-{
- struct ceph_file_info *fi = filp->private_data;
- struct dentry *parent = filp->f_dentry;
- struct inode *dir = parent->d_inode;
- struct list_head *p;
- struct dentry *dentry, *last;
- struct ceph_dentry_info *di;
- int err = 0;
-
- /* claim ref on last dentry we returned */
- last = fi->dentry;
- fi->dentry = NULL;
-
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
- last);
-
- spin_lock(&parent->d_lock);
-
- /* start at beginning? */
- if (filp->f_pos == 2 || last == NULL ||
- filp->f_pos < ceph_dentry(last)->offset) {
- if (list_empty(&parent->d_subdirs))
- goto out_unlock;
- p = parent->d_subdirs.prev;
- dout(" initial p %p/%p\n", p->prev, p->next);
- } else {
- p = last->d_u.d_child.prev;
- }
-
-more:
- dentry = list_entry(p, struct dentry, d_u.d_child);
- di = ceph_dentry(dentry);
- while (1) {
- dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
- d_unhashed(dentry) ? "!hashed" : "hashed",
- parent->d_subdirs.prev, parent->d_subdirs.next);
- if (p == &parent->d_subdirs) {
- fi->flags |= CEPH_F_ATEND;
- goto out_unlock;
- }
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!d_unhashed(dentry) && dentry->d_inode &&
- ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
- ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- filp->f_pos <= di->offset)
- break;
- dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
- dentry->d_name.len, dentry->d_name.name, di->offset,
- filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
- !dentry->d_inode ? " null" : "");
- spin_unlock(&dentry->d_lock);
- p = p->prev;
- dentry = list_entry(p, struct dentry, d_u.d_child);
- di = ceph_dentry(dentry);
- }
-
- dget_dlock(dentry);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
-
- dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
- dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- filp->f_pos = di->offset;
- err = filldir(dirent, dentry->d_name.name,
- dentry->d_name.len, di->offset,
- ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
- dentry->d_inode->i_mode >> 12);
-
- if (last) {
- if (err < 0) {
- /* remember our position */
- fi->dentry = last;
- fi->next_offset = di->offset;
- } else {
- dput(last);
- }
- }
- last = dentry;
-
- if (err < 0)
- goto out;
-
- filp->f_pos++;
-
- /* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_test_complete(dir)) {
- dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
- err = -EAGAIN;
- goto out;
- }
-
- spin_lock(&parent->d_lock);
- p = p->prev; /* advance to next dentry */
- goto more;
-
-out_unlock:
- spin_unlock(&parent->d_lock);
-out:
- if (last)
- dput(last);
- return err;
-}
-
-/*
- * make note of the last dentry we read, so we can
- * continue at the same lexicographical point,
- * regardless of what dir changes take place on the
- * server.
- */
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
- int len)
-{
- kfree(fi->last_name);
- fi->last_name = kmalloc(len+1, GFP_NOFS);
- if (!fi->last_name)
- return -ENOMEM;
- memcpy(fi->last_name, name, len);
- fi->last_name[len] = 0;
- dout("note_last_dentry '%s'\n", fi->last_name);
- return 0;
-}
-
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- struct ceph_file_info *fi = filp->private_data;
- struct inode *inode = filp->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned frag = fpos_frag(filp->f_pos);
- int off = fpos_off(filp->f_pos);
- int err;
- u32 ftype;
- struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = fsc->mount_options->max_readdir;
- const int max_bytes = fsc->mount_options->max_readdir_bytes;
-
- dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
- if (fi->flags & CEPH_F_ATEND)
- return 0;
-
- /* always start with . and .. */
- if (filp->f_pos == 0) {
- /* note dir version at start of readdir so we can tell
- * if any dentries get dropped */
- fi->dir_release_count = ci->i_release_count;
-
- dout("readdir off 0 -> '.'\n");
- if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
- ceph_translate_ino(inode->i_sb, inode->i_ino),
- inode->i_mode >> 12) < 0)
- return 0;
- filp->f_pos = 1;
- off = 1;
- }
- if (filp->f_pos == 1) {
- ino_t ino = parent_ino(filp->f_dentry);
- dout("readdir off 1 -> '..'\n");
- if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
- ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12) < 0)
- return 0;
- filp->f_pos = 2;
- off = 2;
- }
-
- /* can we use the dcache? */
- spin_lock(&ci->i_ceph_lock);
- if ((filp->f_pos == 2 || fi->dentry) &&
- !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
- ceph_snap(inode) != CEPH_SNAPDIR &&
- ceph_dir_test_complete(inode) &&
- __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
- spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(filp, dirent, filldir);
- if (err != -EAGAIN)
- return err;
- } else {
- spin_unlock(&ci->i_ceph_lock);
- }
- if (fi->dentry) {
- err = note_last_dentry(fi, fi->dentry->d_name.name,
- fi->dentry->d_name.len);
- if (err)
- return err;
- dput(fi->dentry);
- fi->dentry = NULL;
- }
-
- /* proceed with a normal readdir */
-
-more:
- /* do we have the correct frag content buffered? */
- if (fi->frag != frag || fi->last_readdir == NULL) {
- struct ceph_mds_request *req;
- int op = ceph_snap(inode) == CEPH_SNAPDIR ?
- CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
-
- /* discard old result, if any */
- if (fi->last_readdir) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
- }
-
- /* requery frag tree, as the frag topology may have changed */
- frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
- dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
- ceph_vinop(inode), frag, fi->last_name);
- req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_inode = inode;
- ihold(inode);
- req->r_dentry = dget(filp->f_dentry);
- /* hints to request -> mds selection code */
- req->r_direct_mode = USE_AUTH_MDS;
- req->r_direct_hash = ceph_frag_value(frag);
- req->r_direct_is_hash = true;
- req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
- req->r_readdir_offset = fi->next_offset;
- req->r_args.readdir.frag = cpu_to_le32(frag);
- req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
- req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
- req->r_num_caps = max_entries + 1;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- if (err < 0) {
- ceph_mdsc_put_request(req);
- return err;
- }
- dout("readdir got and parsed readdir result=%d"
- " on frag %x, end=%d, complete=%d\n", err, frag,
- (int)req->r_reply_info.dir_end,
- (int)req->r_reply_info.dir_complete);
-
- if (!req->r_did_prepopulate) {
- dout("readdir !did_prepopulate");
- fi->dir_release_count--; /* preclude D_COMPLETE */
- }
-
- /* note next offset and last dentry name */
- fi->offset = fi->next_offset;
- fi->last_readdir = req;
-
- if (req->r_reply_info.dir_end) {
- kfree(fi->last_name);
- fi->last_name = NULL;
- if (ceph_frag_is_rightmost(frag))
- fi->next_offset = 2;
- else
- fi->next_offset = 0;
- } else {
- rinfo = &req->r_reply_info;
- err = note_last_dentry(fi,
- rinfo->dir_dname[rinfo->dir_nr-1],
- rinfo->dir_dname_len[rinfo->dir_nr-1]);
- if (err)
- return err;
- fi->next_offset += rinfo->dir_nr;
- }
- }
-
- rinfo = &fi->last_readdir->r_reply_info;
- dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
- rinfo->dir_nr, off, fi->offset);
- while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
- u64 pos = ceph_make_fpos(frag, off);
- struct ceph_mds_reply_inode *in =
- rinfo->dir_in[off - fi->offset].in;
- struct ceph_vino vino;
- ino_t ino;
-
- dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, pos,
- rinfo->dir_dname_len[off - fi->offset],
- rinfo->dir_dname[off - fi->offset], in);
- BUG_ON(!in);
- ftype = le32_to_cpu(in->mode) >> 12;
- vino.ino = le64_to_cpu(in->ino);
- vino.snap = le64_to_cpu(in->snapid);
- ino = ceph_vino_to_ino(vino);
- if (filldir(dirent,
- rinfo->dir_dname[off - fi->offset],
- rinfo->dir_dname_len[off - fi->offset],
- pos,
- ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
- dout("filldir stopping us...\n");
- return 0;
- }
- off++;
- filp->f_pos = pos + 1;
- }
-
- if (fi->last_name) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
- goto more;
- }
-
- /* more frags? */
- if (!ceph_frag_is_rightmost(frag)) {
- frag = ceph_frag_next(frag);
- off = 0;
- filp->f_pos = ceph_make_fpos(frag, off);
- dout("readdir next frag is %x\n", frag);
- goto more;
- }
- fi->flags |= CEPH_F_ATEND;
-
- /*
- * if dir_release_count still matches the dir, no dentries
- * were released during the whole readdir, and we should have
- * the complete dir contents in our cache.
- */
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_release_count == fi->dir_release_count) {
- ceph_dir_set_complete(inode);
- ci->i_max_offset = filp->f_pos;
- }
- spin_unlock(&ci->i_ceph_lock);
-
- dout("readdir %p filp %p done.\n", inode, filp);
- return 0;
-}
-
-static void reset_readdir(struct ceph_file_info *fi)
-{
- if (fi->last_readdir) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
- }
- kfree(fi->last_name);
- fi->last_name = NULL;
- fi->next_offset = 2; /* compensate for . and .. */
- if (fi->dentry) {
- dput(fi->dentry);
- fi->dentry = NULL;
- }
- fi->flags &= ~CEPH_F_ATEND;
-}
-
-static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
-{
- struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_mapping->host;
- loff_t old_offset = offset;
- loff_t retval;
-
- mutex_lock(&inode->i_mutex);
- retval = -EINVAL;
- switch (origin) {
- case SEEK_END:
- offset += inode->i_size + 2; /* FIXME */
- break;
- case SEEK_CUR:
- offset += file->f_pos;
- case SEEK_SET:
- break;
- default:
- goto out;
- }
-
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- fi->flags &= ~CEPH_F_ATEND;
- }
- retval = offset;
-
- /*
- * discard buffered readdir content on seekdir(0), or
- * seek to new frag, or seek prior to current chunk.
- */
- if (offset == 0 ||
- fpos_frag(offset) != fpos_frag(old_offset) ||
- fpos_off(offset) < fi->offset) {
- dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi);
- }
-
- /* bump dir_release_count if we did a forward seek */
- if (offset > old_offset)
- fi->dir_release_count--;
- }
-out:
- mutex_unlock(&inode->i_mutex);
- return retval;
-}
-
-/*
- * Handle lookups for the hidden .snap directory.
- */
-int ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
-
- /* .snap dir? */
- if (err == -ENOENT &&
- ceph_snap(parent) == CEPH_NOSNAP &&
- strcmp(dentry->d_name.name,
- fsc->mount_options->snapdir_name) == 0) {
- struct inode *inode = ceph_get_snapdir(parent);
- dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
- dentry, dentry->d_name.len, dentry->d_name.name, inode);
- BUG_ON(!d_unhashed(dentry));
- d_add(dentry, inode);
- err = 0;
- }
- return err;
-}
-
-/*
- * Figure out final result of a lookup/open request.
- *
- * Mainly, make sure we return the final req->r_dentry (if it already
- * existed) in place of the original VFS-provided dentry when they
- * differ.
- *
- * Gracefully handle the case where the MDS replies with -ENOENT and
- * no trace (which it may do, at its discretion, e.g., if it doesn't
- * care to issue a lease on the negative dentry).
- */
-struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
- struct dentry *dentry, int err)
-{
- if (err == -ENOENT) {
- /* no trace? */
- err = 0;
- if (!req->r_reply_info.head->is_dentry) {
- dout("ENOENT and no trace, dentry %p inode %p\n",
- dentry, dentry->d_inode);
- if (dentry->d_inode) {
- d_drop(dentry);
- err = -ENOENT;
- } else {
- d_add(dentry, NULL);
- }
- }
- }
- if (err)
- dentry = ERR_PTR(err);
- else if (dentry != req->r_dentry)
- dentry = dget(req->r_dentry); /* we got spliced */
- else
- dentry = NULL;
- return dentry;
-}
-
-static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
-{
- return ceph_ino(inode) == CEPH_INO_ROOT &&
- strncmp(dentry->d_name.name, ".ceph", 5) == 0;
-}
-
-/*
- * Look up a single dir entry. If there is a lookup intent, inform
- * the MDS so that it gets our 'caps wanted' value in a single op.
- */
-static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req;
- int op;
- int err;
-
- dout("lookup %p dentry %p '%.*s'\n",
- dir, dentry, dentry->d_name.len, dentry->d_name.name);
-
- if (dentry->d_name.len > NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
-
- err = ceph_init_dentry(dentry);
- if (err < 0)
- return ERR_PTR(err);
-
- /* open (but not create!) intent? */
- if (nd &&
- (nd->flags & LOOKUP_OPEN) &&
- !(nd->intent.open.flags & O_CREAT)) {
- int mode = nd->intent.open.create_mode & ~current->fs->umask;
- return ceph_lookup_open(dir, dentry, nd, mode, 1);
- }
-
- /* can we conclude ENOENT locally? */
- if (dentry->d_inode == NULL) {
- struct ceph_inode_info *ci = ceph_inode(dir);
- struct ceph_dentry_info *di = ceph_dentry(dentry);
-
- spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
- if (strncmp(dentry->d_name.name,
- fsc->mount_options->snapdir_name,
- dentry->d_name.len) &&
- !is_root_ceph_dentry(dir, dentry) &&
- ceph_dir_test_complete(dir) &&
- (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
- spin_unlock(&ci->i_ceph_lock);
- dout(" dir %p complete, -ENOENT\n", dir);
- d_add(dentry, NULL);
- di->lease_shared_gen = ci->i_shared_gen;
- return NULL;
- }
- spin_unlock(&ci->i_ceph_lock);
- }
-
- op = ceph_snap(dir) == CEPH_SNAPDIR ?
- CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
- req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_CAST(req);
- req->r_dentry = dget(dentry);
- req->r_num_caps = 2;
- /* we only need inode linkage */
- req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
- req->r_locked_dir = dir;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- err = ceph_handle_snapdir(req, dentry, err);
- dentry = ceph_finish_lookup(req, dentry, err);
- ceph_mdsc_put_request(req); /* will dput(dentry) */
- dout("lookup result=%p\n", dentry);
- return dentry;
-}
-
-/*
- * If we do a create but get no trace back from the MDS, follow up with
- * a lookup (the VFS expects us to link up the provided dentry).
- */
-int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
-{
- struct dentry *result = ceph_lookup(dir, dentry, NULL);
-
- if (result && !IS_ERR(result)) {
- /*
- * We created the item, then did a lookup, and found
- * it was already linked to another inode we already
- * had in our cache (and thus got spliced). Link our
- * dentry to that inode, but don't hash it, just in
- * case the VFS wants to dereference it.
- */
- BUG_ON(!result->d_inode);
- d_instantiate(dentry, result->d_inode);
- return 0;
- }
- return PTR_ERR(result);
-}
-
-static int ceph_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req;
- int err;
-
- if (ceph_snap(dir) != CEPH_NOSNAP)
- return -EROFS;
-
- dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
- dir, dentry, mode, rdev);
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
- if (IS_ERR(req)) {
- d_drop(dentry);
- return PTR_ERR(req);
- }
- req->r_dentry = dget(dentry);
- req->r_num_caps = 2;
- req->r_locked_dir = dir;
- req->r_args.mknod.mode = cpu_to_le32(mode);
- req->r_args.mknod.rdev = cpu_to_le32(rdev);
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- err = ceph_handle_notrace_create(dir, dentry);
- ceph_mdsc_put_request(req);
- if (err)
- d_drop(dentry);
- return err;
-}
-
-static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
-{
- dout("create in dir %p dentry %p name '%.*s'\n",
- dir, dentry, dentry->d_name.len, dentry->d_name.name);
-
- if (ceph_snap(dir) != CEPH_NOSNAP)
- return -EROFS;
-
- if (nd) {
- BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
- dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
- /* hrm, what should i do here if we get aliased? */
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- return 0;
- }
-
- /* fall back to mknod */
- return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
-}
-
-static int ceph_symlink(struct inode *dir, struct dentry *dentry,
- const char *dest)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req;
- int err;
-
- if (ceph_snap(dir) != CEPH_NOSNAP)
- return -EROFS;
-
- dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
- if (IS_ERR(req)) {
- d_drop(dentry);
- return PTR_ERR(req);
- }
- req->r_dentry = dget(dentry);
- req->r_num_caps = 2;
- req->r_path2 = kstrdup(dest, GFP_NOFS);
- req->r_locked_dir = dir;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- err = ceph_handle_notrace_create(dir, dentry);
- ceph_mdsc_put_request(req);
- if (err)
- d_drop(dentry);
- return err;
-}
-
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req;
- int err = -EROFS;
- int op;
-
- if (ceph_snap(dir) == CEPH_SNAPDIR) {
- /* mkdir .snap/foo is a MKSNAP */
- op = CEPH_MDS_OP_MKSNAP;
- dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
- dentry->d_name.len, dentry->d_name.name, dentry);
- } else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
- op = CEPH_MDS_OP_MKDIR;
- } else {
- goto out;
- }
- req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
- if (IS_ERR(req)) {
- err = PTR_ERR(req);
- goto out;
- }
-
- req->r_dentry = dget(dentry);
- req->r_num_caps = 2;
- req->r_locked_dir = dir;
- req->r_args.mkdir.mode = cpu_to_le32(mode);
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- err = ceph_handle_notrace_create(dir, dentry);
- ceph_mdsc_put_request(req);
-out:
- if (err < 0)
- d_drop(dentry);
- return err;
-}
-
-static int ceph_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req;
- int err;
-
- if (ceph_snap(dir) != CEPH_NOSNAP)
- return -EROFS;
-
- dout("link in dir %p old_dentry %p dentry %p\n", dir,
- old_dentry, dentry);
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
- if (IS_ERR(req)) {
- d_drop(dentry);
- return PTR_ERR(req);
- }
- req->r_dentry = dget(dentry);
- req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
- req->r_locked_dir = dir;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (err) {
- d_drop(dentry);
- } else if (!req->r_reply_info.head->is_dentry) {
- ihold(old_dentry->d_inode);
- d_instantiate(dentry, old_dentry->d_inode);
- }
- ceph_mdsc_put_request(req);
- return err;
-}
-
-/*
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
- * looks like the link count will hit 0, drop any other caps (other
- * than PIN) we don't specifically want (due to the file still being
- * open).
- */
-static int drop_caps_for_unlink(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
-
- spin_lock(&ci->i_ceph_lock);
- if (inode->i_nlink == 1) {
- drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
- ci->i_ceph_flags |= CEPH_I_NODELAY;
- }
- spin_unlock(&ci->i_ceph_lock);
- return drop;
-}
-
-/*
- * rmdir and unlink are differ only by the metadata op code
- */
-static int ceph_unlink(struct inode *dir, struct dentry *dentry)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = dentry->d_inode;
- struct ceph_mds_request *req;
- int err = -EROFS;
- int op;
-
- if (ceph_snap(dir) == CEPH_SNAPDIR) {
- /* rmdir .snap/foo is RMSNAP */
- dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
- dentry->d_name.name, dentry);
- op = CEPH_MDS_OP_RMSNAP;
- } else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("unlink/rmdir dir %p dn %p inode %p\n",
- dir, dentry, inode);
- op = S_ISDIR(dentry->d_inode->i_mode) ?
- CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
- } else
- goto out;
- req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
- if (IS_ERR(req)) {
- err = PTR_ERR(req);
- goto out;
- }
- req->r_dentry = dget(dentry);
- req->r_num_caps = 2;
- req->r_locked_dir = dir;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- req->r_inode_drop = drop_caps_for_unlink(inode);
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- d_delete(dentry);
- ceph_mdsc_put_request(req);
-out:
- return err;
-}
-
-static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req;
- int err;
-
- if (ceph_snap(old_dir) != ceph_snap(new_dir))
- return -EXDEV;
- if (ceph_snap(old_dir) != CEPH_NOSNAP ||
- ceph_snap(new_dir) != CEPH_NOSNAP)
- return -EROFS;
- dout("rename dir %p dentry %p to dir %p dentry %p\n",
- old_dir, old_dentry, new_dir, new_dentry);
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_dentry = dget(new_dentry);
- req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry);
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
- req->r_locked_dir = new_dir;
- req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- /* release LINK_RDCACHE on source inode (mds will lock it) */
- req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
- if (new_dentry->d_inode)
- req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
- err = ceph_mdsc_do_request(mdsc, old_dir, req);
- if (!err && !req->r_reply_info.head->is_dentry) {
- /*
- * Normally d_move() is done by fill_trace (called by
- * do_request, above). If there is no trace, we need
- * to do it here.
- */
-
- /* d_move screws up d_subdirs order */
- ceph_dir_clear_complete(new_dir);
-
- d_move(old_dentry, new_dentry);
-
- /* ensure target dentry is invalidated, despite
- rehashing bug in vfs_rename_dir */
- ceph_invalidate_dentry_lease(new_dentry);
- }
- ceph_mdsc_put_request(req);
- return err;
-}
-
-/*
- * Ensure a dentry lease will no longer revalidate.
- */
-void ceph_invalidate_dentry_lease(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- dentry->d_time = jiffies;
- ceph_dentry(dentry)->lease_shared_gen = 0;
- spin_unlock(&dentry->d_lock);
-}
-
-/*
- * Check if dentry lease is valid. If not, delete the lease. Try to
- * renew if the least is more than half up.
- */
-static int dentry_lease_is_valid(struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *s;
- int valid = 0;
- u32 gen;
- unsigned long ttl;
- struct ceph_mds_session *session = NULL;
- struct inode *dir = NULL;
- u32 seq = 0;
-
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (di->lease_session) {
- s = di->lease_session;
- spin_lock(&s->s_gen_ttl_lock);
- gen = s->s_cap_gen;
- ttl = s->s_cap_ttl;
- spin_unlock(&s->s_gen_ttl_lock);
-
- if (di->lease_gen == gen &&
- time_before(jiffies, dentry->d_time) &&
- time_before(jiffies, ttl)) {
- valid = 1;
- if (di->lease_renew_after &&
- time_after(jiffies, di->lease_renew_after)) {
- /* we should renew */
- dir = dentry->d_parent->d_inode;
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
- }
- }
- }
- spin_unlock(&dentry->d_lock);
-
- if (session) {
- ceph_mdsc_lease_send_msg(session, dir, dentry,
- CEPH_MDS_LEASE_RENEW, seq);
- ceph_put_mds_session(session);
- }
- dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
- return valid;
-}
-
-/*
- * Check if directory-wide content lease/cap is valid.
- */
-static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
-{
- struct ceph_inode_info *ci = ceph_inode(dir);
- struct ceph_dentry_info *di = ceph_dentry(dentry);
- int valid = 0;
-
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_shared_gen == di->lease_shared_gen)
- valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
- spin_unlock(&ci->i_ceph_lock);
- dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
- dir, (unsigned)ci->i_shared_gen, dentry,
- (unsigned)di->lease_shared_gen, valid);
- return valid;
-}
-
-/*
- * Check if cached dentry can be trusted.
- */
-static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
- int valid = 0;
- struct inode *dir;
-
- if (nd && nd->flags & LOOKUP_RCU)
- return -ECHILD;
-
- dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
- dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
- ceph_dentry(dentry)->offset);
-
- dir = ceph_get_dentry_parent_inode(dentry);
-
- /* always trust cached snapped dentries, snapdir dentry */
- if (ceph_snap(dir) != CEPH_NOSNAP) {
- dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
- dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- valid = 1;
- } else if (dentry->d_inode &&
- ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
- valid = 1;
- } else if (dentry_lease_is_valid(dentry) ||
- dir_lease_is_valid(dir, dentry)) {
- valid = 1;
- }
-
- dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
- if (valid)
- ceph_dentry_lru_touch(dentry);
- else
- d_drop(dentry);
- iput(dir);
- return valid;
-}
-
-/*
- * Release our ceph_dentry_info.
- */
-static void ceph_d_release(struct dentry *dentry)
-{
- struct ceph_dentry_info *di = ceph_dentry(dentry);
-
- dout("d_release %p\n", dentry);
- ceph_dentry_lru_del(dentry);
- if (di->lease_session)
- ceph_put_mds_session(di->lease_session);
- kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
-}
-
-static int ceph_snapdir_d_revalidate(struct dentry *dentry,
- struct nameidata *nd)
-{
- /*
- * Eventually, we'll want to revalidate snapped metadata
- * too... probably...
- */
- return 1;
-}
-
-/*
- * Set/clear/test dir complete flag on the dir's dentry.
- */
-void ceph_dir_set_complete(struct inode *inode)
-{
- struct dentry *dentry = d_find_any_alias(inode);
-
- if (dentry && ceph_dentry(dentry) &&
- ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
- dout(" marking %p (%p) complete\n", inode, dentry);
- set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
- }
- dput(dentry);
-}
-
-void ceph_dir_clear_complete(struct inode *inode)
-{
- struct dentry *dentry = d_find_any_alias(inode);
-
- if (dentry && ceph_dentry(dentry)) {
- dout(" marking %p (%p) complete\n", inode, dentry);
- set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
- }
- dput(dentry);
-}
-
-bool ceph_dir_test_complete(struct inode *inode)
-{
- struct dentry *dentry = d_find_any_alias(inode);
-
- if (dentry && ceph_dentry(dentry)) {
- dout(" marking %p (%p) NOT complete\n", inode, dentry);
- clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
- }
- dput(dentry);
- return false;
-}
-
-/*
- * When the VFS prunes a dentry from the cache, we need to clear the
- * complete flag on the parent directory.
- *
- * Called under dentry->d_lock.
- */
-static void ceph_d_prune(struct dentry *dentry)
-{
-	struct ceph_dentry_info *di;
-
-	dout("ceph_d_prune %p\n", dentry);
-
-	/* do we have a valid parent? */
-	if (!dentry->d_parent || IS_ROOT(dentry))
-		return;
-
-	/* if we are not hashed, we don't affect D_COMPLETE */
-	if (d_unhashed(dentry))
-		return;
-
-	/*
-	 * we hold d_lock, so d_parent is stable, and d_fsdata is never
-	 * cleared until d_release
-	 */
-	di = ceph_dentry(dentry->d_parent);
-	/* parent no longer has a complete view of its children */
-	clear_bit(CEPH_D_COMPLETE, &di->flags);
-}
-
-/*
- * read() on a dir. This weird interface hack only works if mounted
- * with '-o dirstat'.
- *
- * Returns a one-shot, human-readable summary of the directory's
- * recursive statistics, formatted once on first read and cached in
- * the ceph_file_info (freed by ceph_release()).
- */
-static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
-			     loff_t *ppos)
-{
-	struct ceph_file_info *cf = file->private_data;
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int left;
-	const int bufsize = 1024;	/* ample for the fixed format below */
-
-	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
-		return -EISDIR;
-
-	/* format once per open file; subsequent reads just page through */
-	if (!cf->dir_info) {
-		cf->dir_info = kmalloc(bufsize, GFP_NOFS);
-		if (!cf->dir_info)
-			return -ENOMEM;
-		cf->dir_info_len =
-			snprintf(cf->dir_info, bufsize,
-				 "entries:   %20lld\n"
-				 " files:    %20lld\n"
-				 " subdirs:  %20lld\n"
-				 "rentries:  %20lld\n"
-				 " rfiles:   %20lld\n"
-				 " rsubdirs: %20lld\n"
-				 "rbytes:    %20lld\n"
-				 "rctime:    %10ld.%09ld\n",
-				 ci->i_files + ci->i_subdirs,
-				 ci->i_files,
-				 ci->i_subdirs,
-				 ci->i_rfiles + ci->i_rsubdirs,
-				 ci->i_rfiles,
-				 ci->i_rsubdirs,
-				 ci->i_rbytes,
-				 (long)ci->i_rctime.tv_sec,
-				 (long)ci->i_rctime.tv_nsec);
-	}
-
-	if (*ppos >= cf->dir_info_len)
-		return 0;
-	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
-	/* copy_to_user() returns the number of bytes NOT copied */
-	left = copy_to_user(buf, cf->dir_info + *ppos, size);
-	if (left == size)
-		return -EFAULT;
-	*ppos += (size - left);
-	return size - left;
-}
-
-/*
- * an fsync() on a dir will wait for any uncommitted directory
- * operations to commit.
- *
- * We snapshot the tid of the newest unsafe op at entry and wait for
- * everything up to (and including) that tid; ops issued after the
- * fsync started are not waited for.
- */
-static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
-			  int datasync)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct list_head *head = &ci->i_unsafe_dirops;
-	struct ceph_mds_request *req;
-	u64 last_tid;
-	int ret = 0;
-
-	dout("dir_fsync %p\n", inode);
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	spin_lock(&ci->i_unsafe_lock);
-	if (list_empty(head))
-		goto out;
-
-	/* newest unsafe op at entry bounds how far we wait */
-	req = list_entry(head->prev,
-			 struct ceph_mds_request, r_unsafe_dir_item);
-	last_tid = req->r_tid;
-
-	do {
-		/* hold a ref across the unlocked wait below */
-		ceph_mdsc_get_request(req);
-		spin_unlock(&ci->i_unsafe_lock);
-
-		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
-		     inode, req->r_tid, last_tid);
-		if (req->r_timeout) {
-			ret = wait_for_completion_timeout(
-				&req->r_safe_completion, req->r_timeout);
-			if (ret > 0)
-				ret = 0;
-			else if (ret == 0)
-				ret = -EIO;  /* timed out */
-		} else {
-			wait_for_completion(&req->r_safe_completion);
-		}
-		ceph_mdsc_put_request(req);
-
-		/* list may have changed while we slept; re-check under lock */
-		spin_lock(&ci->i_unsafe_lock);
-		if (ret || list_empty(head))
-			break;
-		req = list_entry(head->next,
-				 struct ceph_mds_request, r_unsafe_dir_item);
-	} while (req->r_tid < last_tid);
-out:
-	spin_unlock(&ci->i_unsafe_lock);
-	mutex_unlock(&inode->i_mutex);
-
-	return ret;
-}
-
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */
-/* Append a dentry to the client-wide LRU and bump the count. */
-void ceph_dentry_lru_add(struct dentry *dn)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-
-	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
-	     dn->d_name.len, dn->d_name.name);
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_add_tail(&di->lru, &mdsc->dentry_lru);
-	mdsc->num_dentry++;
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-/* Move an already-listed dentry to the most-recently-used end. */
-void ceph_dentry_lru_touch(struct dentry *dn)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-
-	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
-	     dn->d_name.len, dn->d_name.name, di->offset);
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_move_tail(&di->lru, &mdsc->dentry_lru);
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-/* Unlink a dentry from the client-wide LRU and drop the count. */
-void ceph_dentry_lru_del(struct dentry *dn)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-
-	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
-	     dn->d_name.len, dn->d_name.name);
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_del_init(&di->lru);
-	mdsc->num_dentry--;
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-/*
- * Return name hash for a given dentry. This is dependent on
- * the parent directory's hash function.
- */
-unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
-{
-	struct ceph_inode_info *dci = ceph_inode(dir);
-	int hash_type = dci->i_dir_layout.dl_dir_hash;
-
-	/* 0 (backward compat) and LINUX both mean the VFS hash applies */
-	if (hash_type == 0 || hash_type == CEPH_STR_HASH_LINUX)
-		return dn->d_name.hash;
-
-	return ceph_str_hash(hash_type, dn->d_name.name, dn->d_name.len);
-}
-
-/* file ops for directories: read only works with '-o dirstat' */
-const struct file_operations ceph_dir_fops = {
-	.read = ceph_read_dir,
-	.readdir = ceph_readdir,
-	.llseek = ceph_dir_llseek,
-	.open = ceph_open,
-	.release = ceph_release,
-	.unlocked_ioctl = ceph_ioctl,
-	.fsync = ceph_dir_fsync,
-};
-
-/* inode ops for directories; note rmdir shares the unlink path */
-const struct inode_operations ceph_dir_iops = {
-	.lookup = ceph_lookup,
-	.permission = ceph_permission,
-	.getattr = ceph_getattr,
-	.setattr = ceph_setattr,
-	.setxattr = ceph_setxattr,
-	.getxattr = ceph_getxattr,
-	.listxattr = ceph_listxattr,
-	.removexattr = ceph_removexattr,
-	.mknod = ceph_mknod,
-	.symlink = ceph_symlink,
-	.mkdir = ceph_mkdir,
-	.link = ceph_link,
-	.unlink = ceph_unlink,
-	.rmdir = ceph_unlink,
-	.rename = ceph_rename,
-	.create = ceph_create,
-};
-
-/* dentry ops for live (non-snapshot) dentries */
-const struct dentry_operations ceph_dentry_ops = {
-	.d_revalidate = ceph_d_revalidate,
-	.d_release = ceph_d_release,
-	.d_prune = ceph_d_prune,
-};
-
-/* dentry ops for the .snap dir itself: always revalidates as valid */
-const struct dentry_operations ceph_snapdir_dentry_ops = {
-	.d_revalidate = ceph_snapdir_d_revalidate,
-	.d_release = ceph_d_release,
-};
-
-/* dentry ops for dentries inside a snapshot */
-const struct dentry_operations ceph_snap_dentry_ops = {
-	.d_release = ceph_d_release,
-	.d_prune = ceph_d_prune,
-};
diff --git a/ANDROID_3.4.5/fs/ceph/export.c b/ANDROID_3.4.5/fs/ceph/export.c
deleted file mode 100644
index fbb2a643..00000000
--- a/ANDROID_3.4.5/fs/ceph/export.c
+++ /dev/null
@@ -1,253 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/exportfs.h>
-#include <linux/slab.h>
-#include <asm/unaligned.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-/*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architecture doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to make this more reliable, but in the non-connectable fh
- * case, we won't ever work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
- * Basic fh: just the inode number.  Only resolvable if the inode is
- * already in the client's cache (or the MDS can look it up by ino).
- */
-struct ceph_nfs_fh {
-	u64 ino;
-} __attribute__ ((packed));
-
-/*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
- */
-struct ceph_nfs_confh {
-	u64 ino, parent_ino;
-	u32 parent_name_hash;	/* hash per parent dir's layout; see ceph_dentry_hash() */
-} __attribute__ ((packed));
-
-/*
- * Encode an NFS file handle for @dentry into @rawfh.
- *
- * Prefers the connectable form when the caller's buffer is large
- * enough.  Returns the fh type (1 = plain, 2 = connectable) or 255 to
- * signal "buffer too small" after setting *max_len to the required
- * size (lengths are in units of 4-byte words).
- */
-static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
-			  int connectable)
-{
-	int type;
-	struct ceph_nfs_fh *fh = (void *)rawfh;
-	struct ceph_nfs_confh *cfh = (void *)rawfh;
-	struct dentry *parent;
-	struct inode *inode = dentry->d_inode;
-	int connected_handle_length = sizeof(*cfh)/4;
-	int handle_length = sizeof(*fh)/4;
-
-	/* don't re-export snaps */
-	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EINVAL;
-
-	/* d_lock keeps d_parent stable while we read it */
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	if (*max_len >= connected_handle_length) {
-		dout("encode_fh %p connectable\n", dentry);
-		cfh->ino = ceph_ino(dentry->d_inode);
-		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
-							 dentry);
-		*max_len = connected_handle_length;
-		type = 2;
-	} else if (*max_len >= handle_length) {
-		if (connectable) {
-			/* caller wanted connectable but buffer is too small */
-			*max_len = connected_handle_length;
-			type = 255;
-		} else {
-			dout("encode_fh %p\n", dentry);
-			fh->ino = ceph_ino(dentry->d_inode);
-			*max_len = handle_length;
-			type = 1;
-		}
-	} else {
-		*max_len = handle_length;
-		type = 255;
-	}
-	spin_unlock(&dentry->d_lock);
-	return type;
-}
-
-/*
- * convert regular fh to dentry
- *
- * Tries the local inode cache first, then asks the MDS to look the
- * inode up by number.  Returns a dentry or an ERR_PTR.
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
-				     struct ceph_nfs_fh *fh)
-{
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
-	struct inode *inode;
-	struct dentry *dentry;
-	struct ceph_vino vino;
-	int err;
-
-	dout("__fh_to_dentry %llx\n", fh->ino);
-	vino.ino = fh->ino;
-	vino.snap = CEPH_NOSNAP;
-	inode = ceph_find_inode(sb, vino);
-	if (!inode) {
-		struct ceph_mds_request *req;
-
-		/* not cached locally; ask any MDS to find it by ino */
-		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
-					       USE_ANY_MDS);
-		if (IS_ERR(req))
-			return ERR_CAST(req);
-
-		req->r_ino1 = vino;
-		req->r_num_caps = 1;
-		err = ceph_mdsc_do_request(mdsc, NULL, req);
-		inode = req->r_target_inode;
-		if (inode)
-			ihold(inode);	/* take our own ref before the req drops its */
-		ceph_mdsc_put_request(req);
-		if (!inode)
-			return ERR_PTR(-ESTALE);
-	}
-
-	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
-		       fh->ino, inode);
-		iput(inode);
-		return dentry;
-	}
-	err = ceph_init_dentry(dentry);
-	if (err < 0) {
-		iput(inode);
-		return ERR_PTR(err);
-	}
-	dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
-	return dentry;
-}
-
-/*
- * convert connectable fh to dentry
- *
- * Tries the local inode cache first, then falls back to an MDS
- * LOOKUPHASH using the parent ino + name hash from the handle.
- * Returns a dentry or an ERR_PTR.
- */
-static struct dentry *__cfh_to_dentry(struct super_block *sb,
-				      struct ceph_nfs_confh *cfh)
-{
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
-	struct inode *inode;
-	struct dentry *dentry;
-	struct ceph_vino vino;
-	int err;
-
-	dout("__cfh_to_dentry %llx (%llx/%x)\n",
-	     cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
-
-	vino.ino = cfh->ino;
-	vino.snap = CEPH_NOSNAP;
-	inode = ceph_find_inode(sb, vino);
-	if (!inode) {
-		struct ceph_mds_request *req;
-
-		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
-					       USE_ANY_MDS);
-		if (IS_ERR(req))
-			return ERR_CAST(req);
-
-		req->r_ino1 = vino;
-		req->r_ino2.ino = cfh->parent_ino;
-		req->r_ino2.snap = CEPH_NOSNAP;
-		/*
-		 * BUGFIX: the kmalloc() result was used unchecked; on
-		 * allocation failure the snprintf() below dereferenced NULL.
-		 */
-		req->r_path2 = kmalloc(16, GFP_NOFS);
-		if (!req->r_path2) {
-			ceph_mdsc_put_request(req);
-			return ERR_PTR(-ENOMEM);
-		}
-		/* hash travels to the MDS as a decimal string "path" */
-		snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
-		req->r_num_caps = 1;
-		err = ceph_mdsc_do_request(mdsc, NULL, req);
-		inode = req->r_target_inode;
-		if (inode)
-			ihold(inode);	/* our own ref before the req drops its */
-		ceph_mdsc_put_request(req);
-		if (!inode)
-			return ERR_PTR(err ? err : -ESTALE);
-	}
-
-	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
-		       cfh->ino, inode);
-		iput(inode);
-		return dentry;
-	}
-	err = ceph_init_dentry(dentry);
-	if (err < 0) {
-		iput(inode);
-		return ERR_PTR(err);
-	}
-	dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
-	return dentry;
-}
-
-/* Dispatch on fh type: 1 is the plain ino handle, else connectable. */
-static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
-					int fh_len, int fh_type)
-{
-	return (fh_type == 1) ?
-		__fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw) :
-		__cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
-}
-
-/*
- * get parent, if possible.
- *
- * Only works for connectable handles, and only if the inode is
- * already in the local cache; otherwise ESTALE.
- *
- * FIXME: we could do better by querying the mds to discover the
- * parent.
- */
-static struct dentry *ceph_fh_to_parent(struct super_block *sb,
-					 struct fid *fid,
-					int fh_len, int fh_type)
-{
-	struct ceph_nfs_confh *cfh = (void *)fid->raw;
-	struct ceph_vino vino;
-	struct inode *inode;
-	struct dentry *dentry;
-	int err;
-
-	/* plain handles carry no parent information */
-	if (fh_type == 1)
-		return ERR_PTR(-ESTALE);
-
-	pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
-		 cfh->parent_name_hash);
-
-	vino.ino = cfh->ino;
-	vino.snap = CEPH_NOSNAP;
-	inode = ceph_find_inode(sb, vino);
-	if (!inode)
-		return ERR_PTR(-ESTALE);
-
-	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
-		       cfh->ino, inode);
-		iput(inode);
-		return dentry;
-	}
-	err = ceph_init_dentry(dentry);
-	if (err < 0) {
-		iput(inode);
-		return ERR_PTR(err);
-	}
-	dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
-	return dentry;
-}
-
-/* NFS export hooks; no .get_parent, so see ceph_fh_to_parent's limits */
-const struct export_operations ceph_export_ops = {
-	.encode_fh = ceph_encode_fh,
-	.fh_to_dentry = ceph_fh_to_dentry,
-	.fh_to_parent = ceph_fh_to_parent,
-};
diff --git a/ANDROID_3.4.5/fs/ceph/file.c b/ANDROID_3.4.5/fs/ceph/file.c
deleted file mode 100644
index ed72428d..00000000
--- a/ANDROID_3.4.5/fs/ceph/file.c
+++ /dev/null
@@ -1,874 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/namei.h>
-#include <linux/writeback.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-/*
- * Ceph file operations
- *
- * Implement basic open/close functionality, and implement
- * read/write.
- *
- * We implement three modes of file I/O:
- * - buffered uses the generic_file_aio_{read,write} helpers
- *
- * - synchronous is used when there is multi-client read/write
- * sharing, avoids the page cache, and synchronously waits for an
- * ack from the OSD.
- *
- * - direct io takes the variant of the sync path that references
- * user pages directly.
- *
- * fsync() flushes and waits on dirty pages, but just queues metadata
- * for writeback: since the MDS can recover size and mtime there is no
- * need to wait for MDS acknowledgement.
- */
-
-
-/*
- * Build (but do not send) an MDS OPEN/CREATE request for the given
- * open flags.  Allocating the ceph_cap up front avoids an inopportune
- * ENOMEM later.  Writes and creates must go to the auth MDS.
- */
-static struct ceph_mds_request *
-prepare_open_request(struct super_block *sb, int flags, int create_mode)
-{
-	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
-	struct ceph_mds_request *req;
-	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
-	int want_auth = (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) ?
-		USE_AUTH_MDS : USE_ANY_MDS;
-
-	req = ceph_mdsc_create_request(fsc->mdsc, op, want_auth);
-	if (IS_ERR(req))
-		return req;
-
-	req->r_fmode = ceph_flags_to_mode(flags);
-	req->r_args.open.flags = cpu_to_le32(flags);
-	req->r_args.open.mode = cpu_to_le32(create_mode);
-	req->r_args.open.preferred = cpu_to_le32(-1);
-	return req;
-}
-
-/*
- * initialize private struct file data.
- * if we fail, clean up by dropping fmode reference on the ceph_inode
- */
-static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
-{
-	struct ceph_file_info *cf;
-	int ret = 0;
-
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFREG:
-		/* fallthrough: regular files and dirs share the same setup */
-	case S_IFDIR:
-		dout("init_file %p %p 0%o (regular)\n", inode, file,
-		     inode->i_mode);
-		cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
-		if (cf == NULL) {
-			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
-			return -ENOMEM;
-		}
-		cf->fmode = fmode;
-		cf->next_offset = 2;	/* readdir offset past "." and ".." */
-		file->private_data = cf;
-		BUG_ON(inode->i_fop->release != ceph_release);
-		break;
-
-	case S_IFLNK:
-		dout("init_file %p %p 0%o (symlink)\n", inode, file,
-		     inode->i_mode);
-		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
-		break;
-
-	default:
-		dout("init_file %p %p 0%o (special)\n", inode, file,
-		     inode->i_mode);
-		/*
-		 * we need to drop the open ref now, since we don't
-		 * have .release set to ceph_release.
-		 */
-		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
-		BUG_ON(inode->i_fop->release == ceph_release);
-
-		/* call the proper open fop */
-		ret = inode->i_fop->open(inode, file);
-	}
-	return ret;
-}
-
-/*
- * If the filp already has private_data, that means the file was
- * already opened by intent during lookup, and we do nothing.
- *
- * If we already have the requisite capabilities, we can satisfy
- * the open request locally (no need to request new caps from the
- * MDS). We do, however, need to inform the MDS (asynchronously)
- * if our wanted caps set expands.
- */
-int ceph_open(struct inode *inode, struct file *file)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
-	struct ceph_mds_request *req;
-	struct ceph_file_info *cf = file->private_data;
-	struct inode *parent_inode = NULL;
-	int err;
-	int flags, fmode, wanted;
-
-	if (cf) {
-		dout("open file %p is already opened\n", file);
-		return 0;
-	}
-
-	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
-	flags = file->f_flags & ~(O_CREAT|O_EXCL);
-	if (S_ISDIR(inode->i_mode))
-		flags = O_DIRECTORY;  /* mds likes to know */
-
-	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
-	     ceph_vinop(inode), file, flags, file->f_flags);
-	fmode = ceph_flags_to_mode(flags);
-	wanted = ceph_caps_for_mode(fmode);
-
-	/* snapped files are read-only */
-	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
-		return -EROFS;
-
-	/* trivially open snapdir */
-	if (ceph_snap(inode) == CEPH_SNAPDIR) {
-		spin_lock(&ci->i_ceph_lock);
-		__ceph_get_fmode(ci, fmode);
-		spin_unlock(&ci->i_ceph_lock);
-		return ceph_init_file(inode, file, fmode);
-	}
-
-	/*
-	 * No need to block if we have caps on the auth MDS (for
-	 * write) or any MDS (for read).  Update wanted set
-	 * asynchronously.
-	 */
-	spin_lock(&ci->i_ceph_lock);
-	if (__ceph_is_any_real_caps(ci) &&
-	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
-		int mds_wanted = __ceph_caps_mds_wanted(ci);
-		int issued = __ceph_caps_issued(ci, NULL);
-
-		dout("open %p fmode %d want %s issued %s using existing\n",
-		     inode, fmode, ceph_cap_string(wanted),
-		     ceph_cap_string(issued));
-		__ceph_get_fmode(ci, fmode);
-		spin_unlock(&ci->i_ceph_lock);
-
-		/* adjust wanted? */
-		if ((issued & wanted) != wanted &&
-		    (mds_wanted & wanted) != wanted &&
-		    ceph_snap(inode) != CEPH_SNAPDIR)
-			ceph_check_caps(ci, 0, NULL);
-
-		return ceph_init_file(inode, file, fmode);
-	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
-		   (ci->i_snap_caps & wanted) == wanted) {
-		/* snapshotted inode with sufficient snap caps: open locally */
-		__ceph_get_fmode(ci, fmode);
-		spin_unlock(&ci->i_ceph_lock);
-		return ceph_init_file(inode, file, fmode);
-	}
-	spin_unlock(&ci->i_ceph_lock);
-
-	/* slow path: ask the MDS to open (and grant caps) */
-	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
-	req = prepare_open_request(inode->i_sb, flags, 0);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		goto out;
-	}
-	req->r_inode = inode;
-	ihold(inode);
-	req->r_num_caps = 1;
-	if (flags & (O_CREAT|O_TRUNC))
-		parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	iput(parent_inode);	/* iput(NULL) is a no-op */
-	if (!err)
-		err = ceph_init_file(inode, file, req->r_fmode);
-	ceph_mdsc_put_request(req);
-	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
-out:
-	return err;
-}
-
-
-/*
- * Do a lookup + open with a single request.
- *
- * If this succeeds, but some subsequent check in the vfs
- * may_open() fails, the struct *file gets cleaned up (i.e.
- * ceph_release gets called).  So fear not!
- */
-/*
- * flags
- *  path_lookup_open   -> LOOKUP_OPEN
- *  path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
- */
-struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
-				struct nameidata *nd, int mode,
-				int locked_dir)
-{
-	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
-	struct file *file;
-	struct ceph_mds_request *req;
-	struct dentry *ret;
-	int err;
-	int flags = nd->intent.open.flags;
-
-	dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
-	     dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
-
-	/* do the open */
-	req = prepare_open_request(dir->i_sb, flags, mode);
-	if (IS_ERR(req))
-		return ERR_CAST(req);
-	req->r_dentry = dget(dentry);
-	req->r_num_caps = 2;
-	if (flags & O_CREAT) {
-		/* tell the MDS which dir caps we are willing to give back */
-		req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
-		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-	}
-	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
-	err = ceph_mdsc_do_request(mdsc,
-				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
-				   req);
-	err = ceph_handle_snapdir(req, dentry, err);
-	if (err)
-		goto out;
-	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
-		err = ceph_handle_notrace_create(dir, dentry);
-	if (err)
-		goto out;
-	file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open);
-	if (IS_ERR(file))
-		err = PTR_ERR(file);
-out:
-	ret = ceph_finish_lookup(req, dentry, err);
-	ceph_mdsc_put_request(req);
-	dout("ceph_lookup_open result=%p\n", ret);
-	return ret;
-}
-
-/*
- * Release the per-open ceph_file_info and drop our fmode reference.
- */
-int ceph_release(struct inode *inode, struct file *file)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_file_info *cf = file->private_data;
-
-	dout("release inode %p file %p\n", inode, file);
-	ceph_put_fmode(ci, cf->fmode);
-	if (cf->last_readdir)
-		ceph_mdsc_put_request(cf->last_readdir);
-	kfree(cf->last_name);
-	kfree(cf->dir_info);	/* kfree(NULL) is a no-op */
-	dput(cf->dentry);
-	kmem_cache_free(ceph_file_cachep, cf);
-
-	/* wake up anyone waiting for caps on this inode */
-	wake_up_all(&ci->i_cap_wq);
-	return 0;
-}
-
-/*
- * Read a range of bytes striped over one or more objects.  Iterate over
- * objects we stripe over.  (That's not atomic, but good enough for now.)
- *
- * If we get a short result from the OSD, check against i_size; we need to
- * only return a short read to the caller if we hit EOF.
- */
-static int striped_read(struct inode *inode,
-			u64 off, u64 len,
-			struct page **pages, int num_pages,
-			int *checkeof, bool o_direct,
-			unsigned long buf_align)
-{
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	u64 pos, this_len;
-	int io_align, page_align;
-	int left, pages_left;
-	int read;	/* total bytes accounted so far */
-	struct page **page_pos;
-	int ret;
-	bool hit_stripe, was_short;
-
-	/*
-	 * we may need to do multiple reads.  not atomic, unfortunately.
-	 */
-	pos = off;
-	left = len;
-	page_pos = pages;
-	pages_left = num_pages;
-	read = 0;
-	io_align = off & ~PAGE_MASK;
-
-more:
-	/* O_DIRECT offsets are relative to the user buffer's alignment */
-	if (o_direct)
-		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
-	else
-		page_align = pos & ~PAGE_MASK;
-	this_len = left;
-	/* osdc clamps this_len to the end of the current object */
-	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
-				  &ci->i_layout, pos, &this_len,
-				  ci->i_truncate_seq,
-				  ci->i_truncate_size,
-				  page_pos, pages_left, page_align);
-	if (ret == -ENOENT)
-		ret = 0;	/* object doesn't exist: treat as a hole */
-	hit_stripe = this_len < left;
-	was_short = ret >= 0 && ret < this_len;
-	dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
-	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
-
-	if (ret > 0) {
-		int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
-
-		if (read < pos - off) {
-			/* previous object was short: zero the gap */
-			dout(" zero gap %llu to %llu\n", off + read, pos);
-			ceph_zero_page_vector_range(page_align + read,
-						    pos - off - read, pages);
-		}
-		pos += ret;
-		read = pos - off;
-		left -= ret;
-		page_pos += didpages;
-		pages_left -= didpages;
-
-		/* hit stripe? */
-		if (left && hit_stripe)
-			goto more;
-	}
-
-	if (was_short) {
-		/* did we bounce off eof? */
-		if (pos + left > inode->i_size)
-			*checkeof = 1;
-
-		/* zero trailing bytes (inside i_size) */
-		if (left > 0 && pos < inode->i_size) {
-			if (pos + left > inode->i_size)
-				left = inode->i_size - pos;
-
-			dout("zero tail %d\n", left);
-			ceph_zero_page_vector_range(page_align + read, left,
-						    pages);
-			read += left;
-		}
-	}
-
-	if (ret >= 0)
-		ret = read;
-	dout("striped_read returns %d\n", ret);
-	return ret;
-}
-
-/*
- * Completely synchronous read and write methods.  Direct from __user
- * buffer to osd, or directly to user pages (if O_DIRECT).
- *
- * If the read spans object boundary, just do multiple reads.
- */
-static ssize_t ceph_sync_read(struct file *file, char __user *data,
-			      unsigned len, loff_t *poff, int *checkeof)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct page **pages;
-	u64 off = *poff;
-	int num_pages, ret;
-
-	dout("sync_read on file %p %llu~%u %s\n", file, off, len,
-	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
-
-	if (file->f_flags & O_DIRECT) {
-		/* pin the user's own pages and read straight into them */
-		num_pages = calc_pages_for((unsigned long)data, len);
-		pages = ceph_get_direct_page_vector(data, num_pages, true);
-	} else {
-		/* bounce through kernel pages, then copy_to_user below */
-		num_pages = calc_pages_for(off, len);
-		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
-	}
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-
-	/*
-	 * flush any page cache pages in this range.  this
-	 * will make concurrent normal and sync io slow,
-	 * but it will at least behave sensibly when they are
-	 * in sequence.
-	 */
-	ret = filemap_write_and_wait(inode->i_mapping);
-	if (ret < 0)
-		goto done;
-
-	ret = striped_read(inode, off, len, pages, num_pages, checkeof,
-			   file->f_flags & O_DIRECT,
-			   (unsigned long)data & ~PAGE_MASK);
-
-	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-		ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
-	if (ret >= 0)
-		*poff = off + ret;
-
-done:
-	if (file->f_flags & O_DIRECT)
-		ceph_put_page_vector(pages, num_pages, true);
-	else
-		ceph_release_page_vector(pages, num_pages);
-	dout("sync_read result %d\n", ret);
-	return ret;
-}
-
-/*
- * Write commit callback, called if we requested both an ACK and
- * ONDISK commit reply from the OSD.
- *
- * Drops the write off i_unsafe_writes and releases the Fw cap ref
- * taken when the request was queued.
- */
-static void sync_write_commit(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
-{
-	struct ceph_inode_info *ci = ceph_inode(req->r_inode);
-
-	dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
-	spin_lock(&ci->i_unsafe_lock);
-	list_del_init(&req->r_unsafe_item);
-	spin_unlock(&ci->i_unsafe_lock);
-	ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
-}
-
-/*
- * Synchronous write, straight from __user pointer or user pages (if
- * O_DIRECT).
- *
- * If write spans object boundary, just do multiple writes.  (For a
- * correct atomic write, we should e.g. take write locks on all
- * objects, rollback on failure, etc.)
- */
-static ssize_t ceph_sync_write(struct file *file, const char __user *data,
-			       size_t left, loff_t *offset)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_osd_request *req;
-	struct page **pages;
-	int num_pages;
-	long long unsigned pos;
-	u64 len;
-	int written = 0;
-	int flags;
-	int do_sync = 0;
-	int check_caps = 0;
-	int page_align, io_align;
-	unsigned long buf_align;
-	int ret;
-	struct timespec mtime = CURRENT_TIME;
-
-	if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
-		return -EROFS;
-
-	dout("sync_write on file %p %lld~%u %s\n", file, *offset,
-	     (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
-
-	if (file->f_flags & O_APPEND)
-		pos = i_size_read(inode);
-	else
-		pos = *offset;
-
-	/* push out any overlapping dirty page cache first */
-	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
-	if (ret < 0)
-		return ret;
-
-	ret = invalidate_inode_pages2_range(inode->i_mapping,
-					    pos >> PAGE_CACHE_SHIFT,
-					    (pos + left) >> PAGE_CACHE_SHIFT);
-	if (ret < 0)
-		dout("invalidate_inode_pages2_range returned %d\n", ret);
-
-	flags = CEPH_OSD_FLAG_ORDERSNAP |
-		CEPH_OSD_FLAG_ONDISK |
-		CEPH_OSD_FLAG_WRITE;
-	if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
-		flags |= CEPH_OSD_FLAG_ACK;	/* early ack; commit via callback */
-	else
-		do_sync = 1;			/* wait for on-disk commit */
-
-	/*
-	 * we may need to do multiple writes here if we span an object
-	 * boundary.  this isn't atomic, unfortunately.  :(
-	 */
-more:
-	io_align = pos & ~PAGE_MASK;
-	buf_align = (unsigned long)data & ~PAGE_MASK;
-	len = left;
-	if (file->f_flags & O_DIRECT) {
-		/* write from beginning of first page, regardless of
-		   io alignment */
-		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
-		num_pages = calc_pages_for((unsigned long)data, len);
-	} else {
-		page_align = pos & ~PAGE_MASK;
-		num_pages = calc_pages_for(pos, len);
-	}
-	/* osdc clamps len to the end of the current object */
-	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-				    ceph_vino(inode), pos, &len,
-				    CEPH_OSD_OP_WRITE, flags,
-				    ci->i_snap_realm->cached_context,
-				    do_sync,
-				    ci->i_truncate_seq, ci->i_truncate_size,
-				    &mtime, false, 2, page_align);
-	if (!req)
-		return -ENOMEM;
-
-	if (file->f_flags & O_DIRECT) {
-		pages = ceph_get_direct_page_vector(data, num_pages, false);
-		if (IS_ERR(pages)) {
-			ret = PTR_ERR(pages);
-			goto out;
-		}
-
-		/*
-		 * throw out any page cache pages in this range.  this
-		 * may block.
-		 */
-		truncate_inode_pages_range(inode->i_mapping, pos,
-					   (pos+len) | (PAGE_CACHE_SIZE-1));
-	} else {
-		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
-		if (IS_ERR(pages)) {
-			ret = PTR_ERR(pages);
-			goto out;
-		}
-		ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
-		if (ret < 0) {
-			ceph_release_page_vector(pages, num_pages);
-			goto out;
-		}
-
-		if ((file->f_flags & O_SYNC) == 0) {
-			/* get a second commit callback */
-			req->r_safe_callback = sync_write_commit;
-			req->r_own_pages = 1;	/* osdc frees pages for us */
-		}
-	}
-	req->r_pages = pages;
-	req->r_num_pages = num_pages;
-	req->r_inode = inode;
-
-	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-	if (!ret) {
-		if (req->r_safe_callback) {
-			/*
-			 * Add to inode unsafe list only after we
-			 * start_request so that a tid has been assigned.
-			 */
-			spin_lock(&ci->i_unsafe_lock);
-			list_add_tail(&req->r_unsafe_item,
-				      &ci->i_unsafe_writes);
-			spin_unlock(&ci->i_unsafe_lock);
-			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
-		}
-		
-		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
-		if (ret < 0 && req->r_safe_callback) {
-			/* undo the unsafe-list bookkeeping done above */
-			spin_lock(&ci->i_unsafe_lock);
-			list_del_init(&req->r_unsafe_item);
-			spin_unlock(&ci->i_unsafe_lock);
-			ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
-		}
-	}
-
-	if (file->f_flags & O_DIRECT)
-		ceph_put_page_vector(pages, num_pages, false);
-	else if (file->f_flags & O_SYNC)
-		ceph_release_page_vector(pages, num_pages);
-
-out:
-	ceph_osdc_put_request(req);
-	if (ret == 0) {
-		pos += len;
-		written += len;
-		left -= len;
-		data += written;
-		if (left)
-			goto more;	/* next object */
-
-		ret = written;
-		*offset = pos;
-		if (pos > i_size_read(inode))
-			check_caps = ceph_inode_set_size(inode, pos);
-		if (check_caps)
-			ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
-					NULL);
-	}
-	return ret;
-}
-
-/*
- * Wrap generic_file_aio_read with checks for cap bits on the inode.
- * Atomically grab references, so that those bits are not released
- * back to the MDS mid-read.
- *
- * Hmm, the sync read case isn't actually async... should it be?
- */
-static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			     unsigned long nr_segs, loff_t pos)
-{
-	struct file *filp = iocb->ki_filp;
-	struct ceph_file_info *fi = filp->private_data;
-	loff_t *ppos = &iocb->ki_pos;
-	size_t len = iov->iov_len;
-	struct inode *inode = filp->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	void __user *base = iov->iov_base;
-	ssize_t ret;
-	int want, got = 0;
-	int checkeof = 0, read = 0;	/* read accumulates across retries */
-
-	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
-	     inode, ceph_vinop(inode), pos, (unsigned)len, inode);
-again:
-	__ceph_do_pending_vmtruncate(inode);
-	if (fi->fmode & CEPH_FILE_MODE_LAZY)
-		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
-	else
-		want = CEPH_CAP_FILE_CACHE;
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
-	if (ret < 0)
-		goto out;
-	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos, (unsigned)len,
-	     ceph_cap_string(got));
-
-	/* go synchronous if we can't cache, or the caller demands it */
-	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
-	    (iocb->ki_filp->f_flags & O_DIRECT) ||
-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
-	    (fi->flags & CEPH_F_SYNC))
-		/* hmm, this isn't really async... */
-		ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
-	else
-		ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-
-out:
-	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
-	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
-	ceph_put_cap_refs(ci, got);
-
-	if (checkeof && ret >= 0) {
-		int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
-
-		/* hit EOF or hole? */
-		if (statret == 0 && *ppos < inode->i_size) {
-			dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
-			read += ret;
-			base += ret;
-			len -= ret;
-			checkeof = 0;
-			goto again;	/* retry the remainder */
-		}
-	}
-	if (ret >= 0)
-		ret += read;
-
-	return ret;
-}
-
-/*
- * Take cap references to avoid releasing caps to MDS mid-write.
- *
- * If we are synchronous, and write with an old snap context, the OSD
- * may return EOLDSNAPC. In that case, retry the write.. _after_
- * dropping our cap refs and allowing the pending snap to logically
- * complete _before_ this write occurs.
- *
- * If we are near ENOSPC, write synchronously.
- */
-static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *file = iocb->ki_filp;
- struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc =
- &ceph_sb_to_client(inode->i_sb)->client->osdc;
- loff_t endoff = pos + iov->iov_len;
- int want, got = 0;
- int ret, err;
-
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
-retry_snap:
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
- return -ENOSPC;
- __ceph_do_pending_vmtruncate(inode);
- dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- inode->i_size);
- if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
- if (ret < 0)
- goto out_put;
-
- dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
-
- if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
- (fi->flags & CEPH_F_SYNC)) {
- ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
- &iocb->ki_pos);
- } else {
- /*
- * buffered write; drop Fw early to avoid slow
- * revocation if we get stuck on balance_dirty_pages
- */
- int dirty;
-
- spin_lock(&ci->i_ceph_lock);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&ci->i_ceph_lock);
- ceph_put_cap_refs(ci, got);
-
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
- if ((ret >= 0 || ret == -EIOCBQUEUED) &&
- ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
- || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
- err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
- if (err < 0)
- ret = err;
- }
-
- if (dirty)
- __mark_inode_dirty(inode, dirty);
- goto out;
- }
-
- if (ret >= 0) {
- int dirty;
- spin_lock(&ci->i_ceph_lock);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&ci->i_ceph_lock);
- if (dirty)
- __mark_inode_dirty(inode, dirty);
- }
-
-out_put:
- dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
- ceph_put_cap_refs(ci, got);
-
-out:
- if (ret == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
- goto retry_snap;
- }
-
- return ret;
-}
-
-/*
- * llseek. be sure to verify file size on SEEK_END.
- */
-static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
-{
- struct inode *inode = file->f_mapping->host;
- int ret;
-
- mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode);
-
- if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) {
- ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
- if (ret < 0) {
- offset = ret;
- goto out;
- }
- }
-
- switch (origin) {
- case SEEK_END:
- offset += inode->i_size;
- break;
- case SEEK_CUR:
- /*
- * Here we special-case the lseek(fd, 0, SEEK_CUR)
- * position-querying operation. Avoid rewriting the "same"
- * f_pos value back to the file because a concurrent read(),
- * write() or lseek() might have altered it
- */
- if (offset == 0) {
- offset = file->f_pos;
- goto out;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset >= inode->i_size) {
- ret = -ENXIO;
- goto out;
- }
- break;
- case SEEK_HOLE:
- if (offset >= inode->i_size) {
- ret = -ENXIO;
- goto out;
- }
- offset = inode->i_size;
- break;
- }
-
- if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
- offset = -EINVAL;
- goto out;
- }
-
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
-
-out:
- mutex_unlock(&inode->i_mutex);
- return offset;
-}
-
-const struct file_operations ceph_file_fops = {
- .open = ceph_open,
- .release = ceph_release,
- .llseek = ceph_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = ceph_aio_read,
- .aio_write = ceph_aio_write,
- .mmap = ceph_mmap,
- .fsync = ceph_fsync,
- .lock = ceph_lock,
- .flock = ceph_flock,
- .splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
- .unlocked_ioctl = ceph_ioctl,
- .compat_ioctl = ceph_ioctl,
-};
-
diff --git a/ANDROID_3.4.5/fs/ceph/inode.c b/ANDROID_3.4.5/fs/ceph/inode.c
deleted file mode 100644
index 9fff9f3b..00000000
--- a/ANDROID_3.4.5/fs/ceph/inode.c
+++ /dev/null
@@ -1,1811 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/namei.h>
-#include <linux/writeback.h>
-#include <linux/vmalloc.h>
-
-#include "super.h"
-#include "mds_client.h"
-#include <linux/ceph/decode.h>
-
-/*
- * Ceph inode operations
- *
- * Implement basic inode helpers (get, alloc) and inode ops (getattr,
- * setattr, etc.), xattr helpers, and helpers for assimilating
- * metadata returned by the MDS into our cache.
- *
- * Also define helpers for doing asynchronous writeback, invalidation,
- * and truncation for the benefit of those who can't afford to block
- * (typically because they are in the message handler path).
- */
-
-static const struct inode_operations ceph_symlink_iops;
-
-static void ceph_invalidate_work(struct work_struct *work);
-static void ceph_writeback_work(struct work_struct *work);
-static void ceph_vmtruncate_work(struct work_struct *work);
-
-/*
- * find or create an inode, given the ceph ino number
- */
-static int ceph_set_ino_cb(struct inode *inode, void *data)
-{
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
- return 0;
-}
-
-struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
-{
- struct inode *inode;
- ino_t t = ceph_vino_to_ino(vino);
-
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
- if (inode == NULL)
- return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
- inode, ceph_vinop(inode), (u64)inode->i_ino);
- unlock_new_inode(inode);
- }
-
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
- vino.snap, inode);
- return inode;
-}
-
-/*
- * get/constuct snapdir inode for a given directory
- */
-struct inode *ceph_get_snapdir(struct inode *parent)
-{
- struct ceph_vino vino = {
- .ino = ceph_ino(parent),
- .snap = CEPH_SNAPDIR,
- };
- struct inode *inode = ceph_get_inode(parent->i_sb, vino);
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- BUG_ON(!S_ISDIR(parent->i_mode));
- if (IS_ERR(inode))
- return inode;
- inode->i_mode = parent->i_mode;
- inode->i_uid = parent->i_uid;
- inode->i_gid = parent->i_gid;
- inode->i_op = &ceph_dir_iops;
- inode->i_fop = &ceph_dir_fops;
- ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
- ci->i_rbytes = 0;
- return inode;
-}
-
-const struct inode_operations ceph_file_iops = {
- .permission = ceph_permission,
- .setattr = ceph_setattr,
- .getattr = ceph_getattr,
- .setxattr = ceph_setxattr,
- .getxattr = ceph_getxattr,
- .listxattr = ceph_listxattr,
- .removexattr = ceph_removexattr,
-};
-
-
-/*
- * We use a 'frag tree' to keep track of the MDS's directory fragments
- * for a given inode (usually there is just a single fragment). We
- * need to know when a child frag is delegated to a new MDS, or when
- * it is flagged as replicated, so we can direct our requests
- * accordingly.
- */
-
-/*
- * find/create a frag in the tree
- */
-static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
- u32 f)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct ceph_inode_frag *frag;
- int c;
-
- p = &ci->i_fragtree.rb_node;
- while (*p) {
- parent = *p;
- frag = rb_entry(parent, struct ceph_inode_frag, node);
- c = ceph_frag_compare(f, frag->frag);
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else
- return frag;
- }
-
- frag = kmalloc(sizeof(*frag), GFP_NOFS);
- if (!frag) {
- pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
- "frag %x\n", &ci->vfs_inode,
- ceph_vinop(&ci->vfs_inode), f);
- return ERR_PTR(-ENOMEM);
- }
- frag->frag = f;
- frag->split_by = 0;
- frag->mds = -1;
- frag->ndist = 0;
-
- rb_link_node(&frag->node, parent, p);
- rb_insert_color(&frag->node, &ci->i_fragtree);
-
- dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
- return frag;
-}
-
-/*
- * find a specific frag @f
- */
-struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
-{
- struct rb_node *n = ci->i_fragtree.rb_node;
-
- while (n) {
- struct ceph_inode_frag *frag =
- rb_entry(n, struct ceph_inode_frag, node);
- int c = ceph_frag_compare(f, frag->frag);
- if (c < 0)
- n = n->rb_left;
- else if (c > 0)
- n = n->rb_right;
- else
- return frag;
- }
- return NULL;
-}
-
-/*
- * Choose frag containing the given value @v. If @pfrag is
- * specified, copy the frag delegation info to the caller if
- * it is present.
- */
-u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found)
-{
- u32 t = ceph_frag_make(0, 0);
- struct ceph_inode_frag *frag;
- unsigned nway, i;
- u32 n;
-
- if (found)
- *found = 0;
-
- mutex_lock(&ci->i_fragtree_mutex);
- while (1) {
- WARN_ON(!ceph_frag_contains_value(t, v));
- frag = __ceph_find_frag(ci, t);
- if (!frag)
- break; /* t is a leaf */
- if (frag->split_by == 0) {
- if (pfrag)
- memcpy(pfrag, frag, sizeof(*pfrag));
- if (found)
- *found = 1;
- break;
- }
-
- /* choose child */
- nway = 1 << frag->split_by;
- dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
- frag->split_by, nway);
- for (i = 0; i < nway; i++) {
- n = ceph_frag_make_child(t, frag->split_by, i);
- if (ceph_frag_contains_value(n, v)) {
- t = n;
- break;
- }
- }
- BUG_ON(i == nway);
- }
- dout("choose_frag(%x) = %x\n", v, t);
-
- mutex_unlock(&ci->i_fragtree_mutex);
- return t;
-}
-
-/*
- * Process dirfrag (delegation) info from the mds. Include leaf
- * fragment in tree ONLY if ndist > 0. Otherwise, only
- * branches/splits are included in i_fragtree)
- */
-static int ceph_fill_dirfrag(struct inode *inode,
- struct ceph_mds_reply_dirfrag *dirinfo)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_inode_frag *frag;
- u32 id = le32_to_cpu(dirinfo->frag);
- int mds = le32_to_cpu(dirinfo->auth);
- int ndist = le32_to_cpu(dirinfo->ndist);
- int i;
- int err = 0;
-
- mutex_lock(&ci->i_fragtree_mutex);
- if (ndist == 0) {
- /* no delegation info needed. */
- frag = __ceph_find_frag(ci, id);
- if (!frag)
- goto out;
- if (frag->split_by == 0) {
- /* tree leaf, remove */
- dout("fill_dirfrag removed %llx.%llx frag %x"
- " (no ref)\n", ceph_vinop(inode), id);
- rb_erase(&frag->node, &ci->i_fragtree);
- kfree(frag);
- } else {
- /* tree branch, keep and clear */
- dout("fill_dirfrag cleared %llx.%llx frag %x"
- " referral\n", ceph_vinop(inode), id);
- frag->mds = -1;
- frag->ndist = 0;
- }
- goto out;
- }
-
-
- /* find/add this frag to store mds delegation info */
- frag = __get_or_create_frag(ci, id);
- if (IS_ERR(frag)) {
- /* this is not the end of the world; we can continue
- with bad/inaccurate delegation info */
- pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
- ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
- err = -ENOMEM;
- goto out;
- }
-
- frag->mds = mds;
- frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
- for (i = 0; i < frag->ndist; i++)
- frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
- dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
- ceph_vinop(inode), frag->frag, frag->ndist);
-
-out:
- mutex_unlock(&ci->i_fragtree_mutex);
- return err;
-}
-
-
-/*
- * initialize a newly allocated inode.
- */
-struct inode *ceph_alloc_inode(struct super_block *sb)
-{
- struct ceph_inode_info *ci;
- int i;
-
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
- if (!ci)
- return NULL;
-
- dout("alloc_inode %p\n", &ci->vfs_inode);
-
- spin_lock_init(&ci->i_ceph_lock);
-
- ci->i_version = 0;
- ci->i_time_warp_seq = 0;
- ci->i_ceph_flags = 0;
- ci->i_release_count = 0;
- ci->i_symlink = NULL;
-
- memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
-
- ci->i_fragtree = RB_ROOT;
- mutex_init(&ci->i_fragtree_mutex);
-
- ci->i_xattrs.blob = NULL;
- ci->i_xattrs.prealloc_blob = NULL;
- ci->i_xattrs.dirty = false;
- ci->i_xattrs.index = RB_ROOT;
- ci->i_xattrs.count = 0;
- ci->i_xattrs.names_size = 0;
- ci->i_xattrs.vals_size = 0;
- ci->i_xattrs.version = 0;
- ci->i_xattrs.index_version = 0;
-
- ci->i_caps = RB_ROOT;
- ci->i_auth_cap = NULL;
- ci->i_dirty_caps = 0;
- ci->i_flushing_caps = 0;
- INIT_LIST_HEAD(&ci->i_dirty_item);
- INIT_LIST_HEAD(&ci->i_flushing_item);
- ci->i_cap_flush_seq = 0;
- ci->i_cap_flush_last_tid = 0;
- memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
- init_waitqueue_head(&ci->i_cap_wq);
- ci->i_hold_caps_min = 0;
- ci->i_hold_caps_max = 0;
- INIT_LIST_HEAD(&ci->i_cap_delay_list);
- ci->i_cap_exporting_mds = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_issued = 0;
- INIT_LIST_HEAD(&ci->i_cap_snaps);
- ci->i_head_snapc = NULL;
- ci->i_snap_caps = 0;
-
- for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
- ci->i_nr_by_mode[i] = 0;
-
- ci->i_truncate_seq = 0;
- ci->i_truncate_size = 0;
- ci->i_truncate_pending = 0;
-
- ci->i_max_size = 0;
- ci->i_reported_size = 0;
- ci->i_wanted_max_size = 0;
- ci->i_requested_max_size = 0;
-
- ci->i_pin_ref = 0;
- ci->i_rd_ref = 0;
- ci->i_rdcache_ref = 0;
- ci->i_wr_ref = 0;
- ci->i_wb_ref = 0;
- ci->i_wrbuffer_ref = 0;
- ci->i_wrbuffer_ref_head = 0;
- ci->i_shared_gen = 0;
- ci->i_rdcache_gen = 0;
- ci->i_rdcache_revoking = 0;
-
- INIT_LIST_HEAD(&ci->i_unsafe_writes);
- INIT_LIST_HEAD(&ci->i_unsafe_dirops);
- spin_lock_init(&ci->i_unsafe_lock);
-
- ci->i_snap_realm = NULL;
- INIT_LIST_HEAD(&ci->i_snap_realm_item);
- INIT_LIST_HEAD(&ci->i_snap_flush_item);
-
- INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
- INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
-
- INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
-
- return &ci->vfs_inode;
-}
-
-static void ceph_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- kmem_cache_free(ceph_inode_cachep, ci);
-}
-
-void ceph_destroy_inode(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_inode_frag *frag;
- struct rb_node *n;
-
- dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
-
- ceph_queue_caps_release(inode);
-
- /*
- * we may still have a snap_realm reference if there are stray
- * caps in i_cap_exporting_issued or i_snap_caps.
- */
- if (ci->i_snap_realm) {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct ceph_snap_realm *realm = ci->i_snap_realm;
-
- dout(" dropping residual ref to snap realm %p\n", realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
- }
-
- kfree(ci->i_symlink);
- while ((n = rb_first(&ci->i_fragtree)) != NULL) {
- frag = rb_entry(n, struct ceph_inode_frag, node);
- rb_erase(n, &ci->i_fragtree);
- kfree(frag);
- }
-
- __ceph_destroy_xattrs(ci);
- if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
- if (ci->i_xattrs.prealloc_blob)
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
-
- call_rcu(&inode->i_rcu, ceph_i_callback);
-}
-
-
-/*
- * Helpers to fill in size, ctime, mtime, and atime. We have to be
- * careful because either the client or MDS may have more up to date
- * info, depending on which capabilities are held, and whether
- * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
- * and size are monotonically increasing, except when utimes() or
- * truncate() increments the corresponding _seq values.)
- */
-int ceph_fill_file_size(struct inode *inode, int issued,
- u32 truncate_seq, u64 truncate_size, u64 size)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int queue_trunc = 0;
-
- if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
- (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
- dout("size %lld -> %llu\n", inode->i_size, size);
- inode->i_size = size;
- inode->i_blocks = (size + (1<<9) - 1) >> 9;
- ci->i_reported_size = size;
- if (truncate_seq != ci->i_truncate_seq) {
- dout("truncate_seq %u -> %u\n",
- ci->i_truncate_seq, truncate_seq);
- ci->i_truncate_seq = truncate_seq;
- /*
- * If we hold relevant caps, or in the case where we're
- * not the only client referencing this file and we
- * don't hold those caps, then we need to check whether
- * the file is either opened or mmaped
- */
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
- CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
- CEPH_CAP_FILE_EXCL|
- CEPH_CAP_FILE_LAZYIO)) ||
- mapping_mapped(inode->i_mapping) ||
- __ceph_caps_file_wanted(ci)) {
- ci->i_truncate_pending++;
- queue_trunc = 1;
- }
- }
- }
- if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
- ci->i_truncate_size != truncate_size) {
- dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
- truncate_size);
- ci->i_truncate_size = truncate_size;
- }
- return queue_trunc;
-}
-
-void ceph_fill_file_time(struct inode *inode, int issued,
- u64 time_warp_seq, struct timespec *ctime,
- struct timespec *mtime, struct timespec *atime)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int warn = 0;
-
- if (issued & (CEPH_CAP_FILE_EXCL|
- CEPH_CAP_FILE_WR|
- CEPH_CAP_FILE_BUFFER|
- CEPH_CAP_AUTH_EXCL|
- CEPH_CAP_XATTR_EXCL)) {
- if (timespec_compare(ctime, &inode->i_ctime) > 0) {
- dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
- inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
- ctime->tv_sec, ctime->tv_nsec);
- inode->i_ctime = *ctime;
- }
- if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
- /* the MDS did a utimes() */
- dout("mtime %ld.%09ld -> %ld.%09ld "
- "tw %d -> %d\n",
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
- mtime->tv_sec, mtime->tv_nsec,
- ci->i_time_warp_seq, (int)time_warp_seq);
-
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
- ci->i_time_warp_seq = time_warp_seq;
- } else if (time_warp_seq == ci->i_time_warp_seq) {
- /* nobody did utimes(); take the max */
- if (timespec_compare(mtime, &inode->i_mtime) > 0) {
- dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
- inode->i_mtime.tv_sec,
- inode->i_mtime.tv_nsec,
- mtime->tv_sec, mtime->tv_nsec);
- inode->i_mtime = *mtime;
- }
- if (timespec_compare(atime, &inode->i_atime) > 0) {
- dout("atime %ld.%09ld -> %ld.%09ld inc\n",
- inode->i_atime.tv_sec,
- inode->i_atime.tv_nsec,
- atime->tv_sec, atime->tv_nsec);
- inode->i_atime = *atime;
- }
- } else if (issued & CEPH_CAP_FILE_EXCL) {
- /* we did a utimes(); ignore mds values */
- } else {
- warn = 1;
- }
- } else {
- /* we have no write|excl caps; whatever the MDS says is true */
- if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
- inode->i_ctime = *ctime;
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
- ci->i_time_warp_seq = time_warp_seq;
- } else {
- warn = 1;
- }
- }
- if (warn) /* time_warp_seq shouldn't go backwards */
- dout("%p mds time_warp_seq %llu < %u\n",
- inode, time_warp_seq, ci->i_time_warp_seq);
-}
-
-/*
- * Populate an inode based on info from mds. May be called on new or
- * existing inodes.
- */
-static int fill_inode(struct inode *inode,
- struct ceph_mds_reply_info_in *iinfo,
- struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session,
- unsigned long ttl_from, int cap_fmode,
- struct ceph_cap_reservation *caps_reservation)
-{
- struct ceph_mds_reply_inode *info = iinfo->in;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int i;
- int issued = 0, implemented;
- int updating_inode = 0;
- struct timespec mtime, atime, ctime;
- u32 nsplits;
- struct ceph_buffer *xattr_blob = NULL;
- int err = 0;
- int queue_trunc = 0;
-
- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
- inode, ceph_vinop(inode), le64_to_cpu(info->version),
- ci->i_version);
-
- /*
- * prealloc xattr data, if it looks like we'll need it. only
- * if len > 4 (meaning there are actually xattrs; the first 4
- * bytes are the xattr count).
- */
- if (iinfo->xattr_len > 4) {
- xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
- if (!xattr_blob)
- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
- iinfo->xattr_len);
- }
-
- spin_lock(&ci->i_ceph_lock);
-
- /*
- * provided version will be odd if inode value is projected,
- * even if stable. skip the update if we have newer stable
- * info (ours>=theirs, e.g. due to racing mds replies), unless
- * we are getting projected (unstable) info (in which case the
- * version is odd, and we want ours>theirs).
- * us them
- * 2 2 skip
- * 3 2 skip
- * 3 3 update
- */
- if (le64_to_cpu(info->version) > 0 &&
- (ci->i_version & ~1) >= le64_to_cpu(info->version))
- goto no_change;
-
- updating_inode = 1;
- issued = __ceph_caps_issued(ci, &implemented);
- issued |= implemented | __ceph_caps_dirty(ci);
-
- /* update inode */
- ci->i_version = le64_to_cpu(info->version);
- inode->i_version++;
- inode->i_rdev = le32_to_cpu(info->rdev);
-
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(info->mode);
- inode->i_uid = le32_to_cpu(info->uid);
- inode->i_gid = le32_to_cpu(info->gid);
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
- }
-
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
- set_nlink(inode, le32_to_cpu(info->nlink));
-
- /* be careful with mtime, atime, size */
- ceph_decode_timespec(&atime, &info->atime);
- ceph_decode_timespec(&mtime, &info->mtime);
- ceph_decode_timespec(&ctime, &info->ctime);
- queue_trunc = ceph_fill_file_size(inode, issued,
- le32_to_cpu(info->truncate_seq),
- le64_to_cpu(info->truncate_size),
- le64_to_cpu(info->size));
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(info->time_warp_seq),
- &ctime, &mtime, &atime);
-
- /* only update max_size on auth cap */
- if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
- ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
- ci->i_max_size = le64_to_cpu(info->max_size);
- }
-
- ci->i_layout = info->layout;
- inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
-
- /* xattrs */
- /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
- if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
- le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
- if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
- ci->i_xattrs.blob = xattr_blob;
- if (xattr_blob)
- memcpy(ci->i_xattrs.blob->vec.iov_base,
- iinfo->xattr_data, iinfo->xattr_len);
- ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
- xattr_blob = NULL;
- }
-
- inode->i_mapping->a_ops = &ceph_aops;
- inode->i_mapping->backing_dev_info =
- &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFIFO:
- case S_IFBLK:
- case S_IFCHR:
- case S_IFSOCK:
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
- inode->i_op = &ceph_file_iops;
- break;
- case S_IFREG:
- inode->i_op = &ceph_file_iops;
- inode->i_fop = &ceph_file_fops;
- break;
- case S_IFLNK:
- inode->i_op = &ceph_symlink_iops;
- if (!ci->i_symlink) {
- u32 symlen = iinfo->symlink_len;
- char *sym;
-
- spin_unlock(&ci->i_ceph_lock);
-
- err = -EINVAL;
- if (WARN_ON(symlen != inode->i_size))
- goto out;
-
- err = -ENOMEM;
- sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
- if (!sym)
- goto out;
-
- spin_lock(&ci->i_ceph_lock);
- if (!ci->i_symlink)
- ci->i_symlink = sym;
- else
- kfree(sym); /* lost a race */
- }
- break;
- case S_IFDIR:
- inode->i_op = &ceph_dir_iops;
- inode->i_fop = &ceph_dir_fops;
-
- ci->i_dir_layout = iinfo->dir_layout;
-
- ci->i_files = le64_to_cpu(info->files);
- ci->i_subdirs = le64_to_cpu(info->subdirs);
- ci->i_rbytes = le64_to_cpu(info->rbytes);
- ci->i_rfiles = le64_to_cpu(info->rfiles);
- ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
- ceph_decode_timespec(&ci->i_rctime, &info->rctime);
- break;
- default:
- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
- ceph_vinop(inode), inode->i_mode);
- }
-
-no_change:
- spin_unlock(&ci->i_ceph_lock);
-
- /* queue truncate if we saw i_size decrease */
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
-
- /* populate frag tree */
- /* FIXME: move me up, if/when version reflects fragtree changes */
- nsplits = le32_to_cpu(info->fragtree.nsplits);
- mutex_lock(&ci->i_fragtree_mutex);
- for (i = 0; i < nsplits; i++) {
- u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
- struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
-
- if (IS_ERR(frag))
- continue;
- frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
- }
- mutex_unlock(&ci->i_fragtree_mutex);
-
- /* were we issued a capability? */
- if (info->cap.caps) {
- if (ceph_snap(inode) == CEPH_NOSNAP) {
- ceph_add_cap(inode, session,
- le64_to_cpu(info->cap.cap_id),
- cap_fmode,
- le32_to_cpu(info->cap.caps),
- le32_to_cpu(info->cap.wanted),
- le32_to_cpu(info->cap.seq),
- le32_to_cpu(info->cap.mseq),
- le64_to_cpu(info->cap.realm),
- info->cap.flags,
- caps_reservation);
- } else {
- spin_lock(&ci->i_ceph_lock);
- dout(" %p got snap_caps %s\n", inode,
- ceph_cap_string(le32_to_cpu(info->cap.caps)));
- ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
- if (cap_fmode >= 0)
- __ceph_get_fmode(ci, cap_fmode);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else if (cap_fmode >= 0) {
- pr_warning("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
- __ceph_get_fmode(ci, cap_fmode);
- }
-
- /* set dir completion flag? */
- if (S_ISDIR(inode->i_mode) &&
- updating_inode && /* didn't jump to no_change */
- ci->i_files == 0 && ci->i_subdirs == 0 &&
- ceph_snap(inode) == CEPH_NOSNAP &&
- (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
- (issued & CEPH_CAP_FILE_EXCL) == 0 &&
- !ceph_dir_test_complete(inode)) {
- dout(" marking %p complete (empty)\n", inode);
- ceph_dir_set_complete(inode);
- ci->i_max_offset = 2;
- }
-
- /* update delegation info? */
- if (dirinfo)
- ceph_fill_dirfrag(inode, dirinfo);
-
- err = 0;
-
-out:
- if (xattr_blob)
- ceph_buffer_put(xattr_blob);
- return err;
-}
-
-/*
- * caller should hold session s_mutex.
- */
-static void update_dentry_lease(struct dentry *dentry,
- struct ceph_mds_reply_lease *lease,
- struct ceph_mds_session *session,
- unsigned long from_time)
-{
- struct ceph_dentry_info *di = ceph_dentry(dentry);
- long unsigned duration = le32_to_cpu(lease->duration_ms);
- long unsigned ttl = from_time + (duration * HZ) / 1000;
- long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
- struct inode *dir;
-
- /* only track leases on regular dentries */
- if (dentry->d_op != &ceph_dentry_ops)
- return;
-
- spin_lock(&dentry->d_lock);
- dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
- dentry, duration, ttl);
-
- /* make lease_rdcache_gen match directory */
- dir = dentry->d_parent->d_inode;
- di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
-
- if (duration == 0)
- goto out_unlock;
-
- if (di->lease_gen == session->s_cap_gen &&
- time_before(ttl, dentry->d_time))
- goto out_unlock; /* we already have a newer lease. */
-
- if (di->lease_session && di->lease_session != session)
- goto out_unlock;
-
- ceph_dentry_lru_touch(dentry);
-
- if (!di->lease_session)
- di->lease_session = ceph_get_mds_session(session);
- di->lease_gen = session->s_cap_gen;
- di->lease_seq = le32_to_cpu(lease->seq);
- di->lease_renew_after = half_ttl;
- di->lease_renew_from = 0;
- dentry->d_time = ttl;
-out_unlock:
- spin_unlock(&dentry->d_lock);
- return;
-}
-
-/*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- *
- * Always called under directory's i_mutex.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
- struct dentry *dir = dn->d_parent;
- struct inode *inode = dir->d_inode;
- struct ceph_inode_info *ci;
- struct ceph_dentry_info *di;
-
- BUG_ON(!inode);
-
- ci = ceph_inode(inode);
- di = ceph_dentry(dn);
-
- spin_lock(&ci->i_ceph_lock);
- if (!ceph_dir_test_complete(inode)) {
- spin_unlock(&ci->i_ceph_lock);
- return;
- }
- di->offset = ceph_inode(inode)->i_max_offset++;
- spin_unlock(&ci->i_ceph_lock);
-
- spin_lock(&dir->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &dir->d_subdirs);
- dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
- dn->d_u.d_child.prev, dn->d_u.d_child.next);
- spin_unlock(&dn->d_lock);
- spin_unlock(&dir->d_lock);
-}
-
-/*
- * splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
- *
- * we will only rehash the resulting dentry if @prehash is
- * true; @prehash will be set to false (for the benefit of
- * the caller) if we fail.
- */
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
- bool *prehash, bool set_offset)
-{
- struct dentry *realdn;
-
- BUG_ON(dn->d_inode);
-
- /* dn must be unhashed */
- if (!d_unhashed(dn))
- d_drop(dn);
- realdn = d_materialise_unique(dn, in);
- if (IS_ERR(realdn)) {
- pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
- PTR_ERR(realdn), dn, in, ceph_vinop(in));
- if (prehash)
- *prehash = false; /* don't rehash on error */
- dn = realdn; /* note realdn contains the error */
- goto out;
- } else if (realdn) {
- dout("dn %p (%d) spliced with %p (%d) "
- "inode %p ino %llx.%llx\n",
- dn, dn->d_count,
- realdn, realdn->d_count,
- realdn->d_inode, ceph_vinop(realdn->d_inode));
- dput(dn);
- dn = realdn;
- } else {
- BUG_ON(!ceph_dentry(dn));
- dout("dn %p attached to %p ino %llx.%llx\n",
- dn, dn->d_inode, ceph_vinop(dn->d_inode));
- }
- if ((!prehash || *prehash) && d_unhashed(dn))
- d_rehash(dn);
- if (set_offset)
- ceph_set_dentry_offset(dn);
-out:
- return dn;
-}
-
-/*
- * Incorporate results into the local cache. This is either just
- * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
- * after a lookup).
- *
- * A reply may contain
- * a directory inode along with a dentry.
- * and/or a target inode
- *
- * Called with snap_rwsem (read).
- */
-int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
- struct ceph_mds_session *session)
-{
- struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
- struct inode *in = NULL;
- struct ceph_mds_reply_inode *ininfo;
- struct ceph_vino vino;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- int i = 0;
- int err = 0;
-
- dout("fill_trace %p is_dentry %d is_target %d\n", req,
- rinfo->head->is_dentry, rinfo->head->is_target);
-
-#if 0
- /*
- * Debugging hook:
- *
- * If we resend completed ops to a recovering mds, we get no
- * trace. Since that is very rare, pretend this is the case
- * to ensure the 'no trace' handlers in the callers behave.
- *
- * Fill in inodes unconditionally to avoid breaking cap
- * invariants.
- */
- if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
- pr_info("fill_trace faking empty trace on %lld %s\n",
- req->r_tid, ceph_mds_op_name(rinfo->head->op));
- if (rinfo->head->is_dentry) {
- rinfo->head->is_dentry = 0;
- err = fill_inode(req->r_locked_dir,
- &rinfo->diri, rinfo->dirfrag,
- session, req->r_request_started, -1);
- }
- if (rinfo->head->is_target) {
- rinfo->head->is_target = 0;
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = ceph_get_inode(sb, vino);
- err = fill_inode(in, &rinfo->targeti, NULL,
- session, req->r_request_started,
- req->r_fmode);
- iput(in);
- }
- }
-#endif
-
- if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
- dout("fill_trace reply is empty!\n");
- if (rinfo->head->result == 0 && req->r_locked_dir)
- ceph_invalidate_dir_request(req);
- return 0;
- }
-
- if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_locked_dir;
-
- err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
- session, req->r_request_started, -1,
- &req->r_caps_reservation);
- if (err < 0)
- return err;
- }
-
- /*
- * ignore null lease/binding on snapdir ENOENT, or else we
- * will have trouble splicing in the virtual snapdir later
- */
- if (rinfo->head->is_dentry && !req->r_aborted &&
- (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
- fsc->mount_options->snapdir_name,
- req->r_dentry->d_name.len))) {
- /*
- * lookup link rename : null -> possibly existing inode
- * mknod symlink mkdir : null -> new inode
- * unlink : linked -> null
- */
- struct inode *dir = req->r_locked_dir;
- struct dentry *dn = req->r_dentry;
- bool have_dir_cap, have_lease;
-
- BUG_ON(!dn);
- BUG_ON(!dir);
- BUG_ON(dn->d_parent->d_inode != dir);
- BUG_ON(ceph_ino(dir) !=
- le64_to_cpu(rinfo->diri.in->ino));
- BUG_ON(ceph_snap(dir) !=
- le64_to_cpu(rinfo->diri.in->snapid));
-
- /* do we have a lease on the whole dir? */
- have_dir_cap =
- (le32_to_cpu(rinfo->diri.in->cap.caps) &
- CEPH_CAP_FILE_SHARED);
-
- /* do we have a dn lease? */
- have_lease = have_dir_cap ||
- le32_to_cpu(rinfo->dlease->duration_ms);
- if (!have_lease)
- dout("fill_trace no dentry lease or dir cap\n");
-
- /* rename? */
- if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
- dout(" src %p '%.*s' dst %p '%.*s'\n",
- req->r_old_dentry,
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
- dn, dn->d_name.len, dn->d_name.name);
- dout("fill_trace doing d_move %p -> %p\n",
- req->r_old_dentry, dn);
-
- d_move(req->r_old_dentry, dn);
- dout(" src %p '%.*s' dst %p '%.*s'\n",
- req->r_old_dentry,
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
- dn, dn->d_name.len, dn->d_name.name);
-
- /* ensure target dentry is invalidated, despite
- rehashing bug in vfs_rename_dir */
- ceph_invalidate_dentry_lease(dn);
-
- /*
- * d_move() puts the renamed dentry at the end of
- * d_subdirs. We need to assign it an appropriate
- * directory offset so we can behave when holding
- * D_COMPLETE.
- */
- ceph_set_dentry_offset(req->r_old_dentry);
- dout("dn %p gets new offset %lld\n", req->r_old_dentry,
- ceph_dentry(req->r_old_dentry)->offset);
-
- dn = req->r_old_dentry; /* use old_dentry */
- in = dn->d_inode;
- }
-
- /* null dentry? */
- if (!rinfo->head->is_target) {
- dout("fill_trace null dentry\n");
- if (dn->d_inode) {
- dout("d_delete %p\n", dn);
- d_delete(dn);
- } else {
- dout("d_instantiate %p NULL\n", dn);
- d_instantiate(dn, NULL);
- if (have_lease && d_unhashed(dn))
- d_rehash(dn);
- update_dentry_lease(dn, rinfo->dlease,
- session,
- req->r_request_started);
- }
- goto done;
- }
-
- /* attach proper inode */
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = dn->d_inode;
- if (!in) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_trace bad get_inode "
- "%llx.%llx\n", vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_delete(dn);
- goto done;
- }
- dn = splice_dentry(dn, in, &have_lease, true);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
- goto done;
- }
- req->r_dentry = dn; /* may have spliced */
- ihold(in);
- } else if (ceph_ino(in) == vino.ino &&
- ceph_snap(in) == vino.snap) {
- ihold(in);
- } else {
- dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, in, ceph_ino(in), ceph_snap(in),
- vino.ino, vino.snap);
- have_lease = false;
- in = NULL;
- }
-
- if (have_lease)
- update_dentry_lease(dn, rinfo->dlease, session,
- req->r_request_started);
- dout(" final dn %p\n", dn);
- i++;
- } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
- req->r_op == CEPH_MDS_OP_MKSNAP) {
- struct dentry *dn = req->r_dentry;
-
- /* fill out a snapdir LOOKUPSNAP dentry */
- BUG_ON(!dn);
- BUG_ON(!req->r_locked_dir);
- BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_inode get_inode badness %llx.%llx\n",
- vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_delete(dn);
- goto done;
- }
- dout(" linking snapped dir %p to dn %p\n", in, dn);
- dn = splice_dentry(dn, in, NULL, true);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
- goto done;
- }
- req->r_dentry = dn; /* may have spliced */
- ihold(in);
- rinfo->head->is_dentry = 1; /* fool notrace handlers */
- }
-
- if (rinfo->head->is_target) {
- vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
- if (in == NULL || ceph_ino(in) != vino.ino ||
- ceph_snap(in) != vino.snap) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- err = PTR_ERR(in);
- goto done;
- }
- }
- req->r_target_inode = in;
-
- err = fill_inode(in,
- &rinfo->targeti, NULL,
- session, req->r_request_started,
- (le32_to_cpu(rinfo->head->result) == 0) ?
- req->r_fmode : -1,
- &req->r_caps_reservation);
- if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
- in, ceph_vinop(in));
- goto done;
- }
- }
-
-done:
- dout("fill_trace done err=%d\n", err);
- return err;
-}
-
-/*
- * Prepopulate our cache with readdir results, leases, etc.
- */
-int ceph_readdir_prepopulate(struct ceph_mds_request *req,
- struct ceph_mds_session *session)
-{
- struct dentry *parent = req->r_dentry;
- struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
- struct qstr dname;
- struct dentry *dn;
- struct inode *in;
- int err = 0, i;
- struct inode *snapdir = NULL;
- struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- u64 frag = le32_to_cpu(rhead->args.readdir.frag);
- struct ceph_dentry_info *di;
-
- if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
- snapdir = ceph_get_snapdir(parent->d_inode);
- parent = d_find_alias(snapdir);
- dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
- rinfo->dir_nr, parent);
- } else {
- dout("readdir_prepopulate %d items under dn %p\n",
- rinfo->dir_nr, parent);
- if (rinfo->dir_dir)
- ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
- }
-
- for (i = 0; i < rinfo->dir_nr; i++) {
- struct ceph_vino vino;
-
- dname.name = rinfo->dir_dname[i];
- dname.len = rinfo->dir_dname_len[i];
- dname.hash = full_name_hash(dname.name, dname.len);
-
- vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
- vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
-
-retry_lookup:
- dn = d_lookup(parent, &dname);
- dout("d_lookup on parent=%p name=%.*s got %p\n",
- parent, dname.len, dname.name, dn);
-
- if (!dn) {
- dn = d_alloc(parent, &dname);
- dout("d_alloc %p '%.*s' = %p\n", parent,
- dname.len, dname.name, dn);
- if (dn == NULL) {
- dout("d_alloc badness\n");
- err = -ENOMEM;
- goto out;
- }
- err = ceph_init_dentry(dn);
- if (err < 0) {
- dput(dn);
- goto out;
- }
- } else if (dn->d_inode &&
- (ceph_ino(dn->d_inode) != vino.ino ||
- ceph_snap(dn->d_inode) != vino.snap)) {
- dout(" dn %p points to wrong inode %p\n",
- dn, dn->d_inode);
- d_delete(dn);
- dput(dn);
- goto retry_lookup;
- } else {
- /* reorder parent's d_subdirs */
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &parent->d_subdirs);
- spin_unlock(&dn->d_lock);
- spin_unlock(&parent->d_lock);
- }
-
- di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
-
- /* inode */
- if (dn->d_inode) {
- in = dn->d_inode;
- } else {
- in = ceph_get_inode(parent->d_sb, vino);
- if (IS_ERR(in)) {
- dout("new_inode badness\n");
- d_delete(dn);
- dput(dn);
- err = PTR_ERR(in);
- goto out;
- }
- dn = splice_dentry(dn, in, NULL, false);
- if (IS_ERR(dn))
- dn = NULL;
- }
-
- if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation) < 0) {
- pr_err("fill_inode badness on %p\n", in);
- goto next_item;
- }
- if (dn)
- update_dentry_lease(dn, rinfo->dir_dlease[i],
- req->r_session,
- req->r_request_started);
-next_item:
- if (dn)
- dput(dn);
- }
- req->r_did_prepopulate = true;
-
-out:
- if (snapdir) {
- iput(snapdir);
- dput(parent);
- }
- dout("readdir_prepopulate done\n");
- return err;
-}
-
-int ceph_inode_set_size(struct inode *inode, loff_t size)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int ret = 0;
-
- spin_lock(&ci->i_ceph_lock);
- dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
- inode->i_size = size;
- inode->i_blocks = (size + (1 << 9) - 1) >> 9;
-
- /* tell the MDS if we are approaching max_size */
- if ((size << 1) >= ci->i_max_size &&
- (ci->i_reported_size << 1) < ci->i_max_size)
- ret = 1;
-
- spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-
-/*
- * Write back inode data in a worker thread. (This can't be done
- * in the message handler context.)
- */
-void ceph_queue_writeback(struct inode *inode)
-{
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->wb_wq,
- &ceph_inode(inode)->i_wb_work)) {
- dout("ceph_queue_writeback %p\n", inode);
- } else {
- dout("ceph_queue_writeback %p failed\n", inode);
- iput(inode);
- }
-}
-
-static void ceph_writeback_work(struct work_struct *work)
-{
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
- i_wb_work);
- struct inode *inode = &ci->vfs_inode;
-
- dout("writeback %p\n", inode);
- filemap_fdatawrite(&inode->i_data);
- iput(inode);
-}
-
-/*
- * queue an async invalidation
- */
-void ceph_queue_invalidate(struct inode *inode)
-{
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
- &ceph_inode(inode)->i_pg_inv_work)) {
- dout("ceph_queue_invalidate %p\n", inode);
- } else {
- dout("ceph_queue_invalidate %p failed\n", inode);
- iput(inode);
- }
-}
-
-/*
- * Invalidate inode pages in a worker thread. (This can't be done
- * in the message handler context.)
- */
-static void ceph_invalidate_work(struct work_struct *work)
-{
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
- i_pg_inv_work);
- struct inode *inode = &ci->vfs_inode;
- u32 orig_gen;
- int check = 0;
-
- spin_lock(&ci->i_ceph_lock);
- dout("invalidate_pages %p gen %d revoking %d\n", inode,
- ci->i_rdcache_gen, ci->i_rdcache_revoking);
- if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- /* nevermind! */
- spin_unlock(&ci->i_ceph_lock);
- goto out;
- }
- orig_gen = ci->i_rdcache_gen;
- spin_unlock(&ci->i_ceph_lock);
-
- truncate_inode_pages(&inode->i_data, 0);
-
- spin_lock(&ci->i_ceph_lock);
- if (orig_gen == ci->i_rdcache_gen &&
- orig_gen == ci->i_rdcache_revoking) {
- dout("invalidate_pages %p gen %d successful\n", inode,
- ci->i_rdcache_gen);
- ci->i_rdcache_revoking--;
- check = 1;
- } else {
- dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
- inode, orig_gen, ci->i_rdcache_gen,
- ci->i_rdcache_revoking);
- }
- spin_unlock(&ci->i_ceph_lock);
-
- if (check)
- ceph_check_caps(ci, 0, NULL);
-out:
- iput(inode);
-}
-
-
-/*
- * called by trunc_wq; take i_mutex ourselves
- *
- * We also truncate in a separate thread as well.
- */
-static void ceph_vmtruncate_work(struct work_struct *work)
-{
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
- i_vmtruncate_work);
- struct inode *inode = &ci->vfs_inode;
-
- dout("vmtruncate_work %p\n", inode);
- mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode);
- mutex_unlock(&inode->i_mutex);
- iput(inode);
-}
-
-/*
- * Queue an async vmtruncate. If we fail to queue work, we will handle
- * the truncation the next time we call __ceph_do_pending_vmtruncate.
- */
-void ceph_queue_vmtruncate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- ihold(inode);
- if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
- &ci->i_vmtruncate_work)) {
- dout("ceph_queue_vmtruncate %p\n", inode);
- } else {
- dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
- inode, ci->i_truncate_pending);
- iput(inode);
- }
-}
-
-/*
- * called with i_mutex held.
- *
- * Make sure any pending truncation is applied before doing anything
- * that may depend on it.
- */
-void __ceph_do_pending_vmtruncate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- u64 to;
- int wrbuffer_refs, wake = 0;
-
-retry:
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_truncate_pending == 0) {
- dout("__do_pending_vmtruncate %p none pending\n", inode);
- spin_unlock(&ci->i_ceph_lock);
- return;
- }
-
- /*
- * make sure any dirty snapped pages are flushed before we
- * possibly truncate them.. so write AND block!
- */
- if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
- dout("__do_pending_vmtruncate %p flushing snaps first\n",
- inode);
- spin_unlock(&ci->i_ceph_lock);
- filemap_write_and_wait_range(&inode->i_data, 0,
- inode->i_sb->s_maxbytes);
- goto retry;
- }
-
- to = ci->i_truncate_size;
- wrbuffer_refs = ci->i_wrbuffer_ref;
- dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
- ci->i_truncate_pending, to);
- spin_unlock(&ci->i_ceph_lock);
-
- truncate_inode_pages(inode->i_mapping, to);
-
- spin_lock(&ci->i_ceph_lock);
- ci->i_truncate_pending--;
- if (ci->i_truncate_pending == 0)
- wake = 1;
- spin_unlock(&ci->i_ceph_lock);
-
- if (wrbuffer_refs == 0)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- if (wake)
- wake_up_all(&ci->i_cap_wq);
-}
-
-
-/*
- * symlinks
- */
-static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
- nd_set_link(nd, ci->i_symlink);
- return NULL;
-}
-
-static const struct inode_operations ceph_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = ceph_sym_follow_link,
-};
-
-/*
- * setattr
- */
-int ceph_setattr(struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
- const unsigned int ia_valid = attr->ia_valid;
- struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
- int issued;
- int release = 0, dirtied = 0;
- int mask = 0;
- int err = 0;
- int inode_dirty_flags = 0;
-
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
- __ceph_do_pending_vmtruncate(inode);
-
- err = inode_change_ok(inode, attr);
- if (err != 0)
- return err;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
- USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- spin_lock(&ci->i_ceph_lock);
- issued = __ceph_caps_issued(ci, NULL);
- dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
-
- if (ia_valid & ATTR_UID) {
- dout("setattr %p uid %d -> %d\n", inode,
- inode->i_uid, attr->ia_uid);
- if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_uid = attr->ia_uid;
- dirtied |= CEPH_CAP_AUTH_EXCL;
- } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_uid != inode->i_uid) {
- req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
- mask |= CEPH_SETATTR_UID;
- release |= CEPH_CAP_AUTH_SHARED;
- }
- }
- if (ia_valid & ATTR_GID) {
- dout("setattr %p gid %d -> %d\n", inode,
- inode->i_gid, attr->ia_gid);
- if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_gid = attr->ia_gid;
- dirtied |= CEPH_CAP_AUTH_EXCL;
- } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_gid != inode->i_gid) {
- req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
- mask |= CEPH_SETATTR_GID;
- release |= CEPH_CAP_AUTH_SHARED;
- }
- }
- if (ia_valid & ATTR_MODE) {
- dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
- attr->ia_mode);
- if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_mode = attr->ia_mode;
- dirtied |= CEPH_CAP_AUTH_EXCL;
- } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- attr->ia_mode != inode->i_mode) {
- req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
- mask |= CEPH_SETATTR_MODE;
- release |= CEPH_CAP_AUTH_SHARED;
- }
- }
-
- if (ia_valid & ATTR_ATIME) {
- dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
- inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
- attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
- if (issued & CEPH_CAP_FILE_EXCL) {
- ci->i_time_warp_seq++;
- inode->i_atime = attr->ia_atime;
- dirtied |= CEPH_CAP_FILE_EXCL;
- } else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec_compare(&inode->i_atime,
- &attr->ia_atime) < 0) {
- inode->i_atime = attr->ia_atime;
- dirtied |= CEPH_CAP_FILE_WR;
- } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
- ceph_encode_timespec(&req->r_args.setattr.atime,
- &attr->ia_atime);
- mask |= CEPH_SETATTR_ATIME;
- release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR;
- }
- }
- if (ia_valid & ATTR_MTIME) {
- dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
- attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
- if (issued & CEPH_CAP_FILE_EXCL) {
- ci->i_time_warp_seq++;
- inode->i_mtime = attr->ia_mtime;
- dirtied |= CEPH_CAP_FILE_EXCL;
- } else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec_compare(&inode->i_mtime,
- &attr->ia_mtime) < 0) {
- inode->i_mtime = attr->ia_mtime;
- dirtied |= CEPH_CAP_FILE_WR;
- } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
- ceph_encode_timespec(&req->r_args.setattr.mtime,
- &attr->ia_mtime);
- mask |= CEPH_SETATTR_MTIME;
- release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR;
- }
- }
- if (ia_valid & ATTR_SIZE) {
- dout("setattr %p size %lld -> %lld\n", inode,
- inode->i_size, attr->ia_size);
- if (attr->ia_size > inode->i_sb->s_maxbytes) {
- err = -EINVAL;
- goto out;
- }
- if ((issued & CEPH_CAP_FILE_EXCL) &&
- attr->ia_size > inode->i_size) {
- inode->i_size = attr->ia_size;
- inode->i_blocks =
- (attr->ia_size + (1 << 9) - 1) >> 9;
- inode->i_ctime = attr->ia_ctime;
- ci->i_reported_size = attr->ia_size;
- dirtied |= CEPH_CAP_FILE_EXCL;
- } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- attr->ia_size != inode->i_size) {
- req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
- req->r_args.setattr.old_size =
- cpu_to_le64(inode->i_size);
- mask |= CEPH_SETATTR_SIZE;
- release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR;
- }
- }
-
- /* these do nothing */
- if (ia_valid & ATTR_CTIME) {
- bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
- ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
- dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
- inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
- attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
- only ? "ctime only" : "ignored");
- inode->i_ctime = attr->ia_ctime;
- if (only) {
- /*
- * if kernel wants to dirty ctime but nothing else,
- * we need to choose a cap to dirty under, or do
- * a almost-no-op setattr
- */
- if (issued & CEPH_CAP_AUTH_EXCL)
- dirtied |= CEPH_CAP_AUTH_EXCL;
- else if (issued & CEPH_CAP_FILE_EXCL)
- dirtied |= CEPH_CAP_FILE_EXCL;
- else if (issued & CEPH_CAP_XATTR_EXCL)
- dirtied |= CEPH_CAP_XATTR_EXCL;
- else
- mask |= CEPH_SETATTR_CTIME;
- }
- }
- if (ia_valid & ATTR_FILE)
- dout("setattr %p ATTR_FILE ... hrm!\n", inode);
-
- if (dirtied) {
- inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
- inode->i_ctime = CURRENT_TIME;
- }
-
- release &= issued;
- spin_unlock(&ci->i_ceph_lock);
-
- if (inode_dirty_flags)
- __mark_inode_dirty(inode, inode_dirty_flags);
-
- if (mask) {
- req->r_inode = inode;
- ihold(inode);
- req->r_inode_drop = release;
- req->r_args.setattr.mask = cpu_to_le32(mask);
- req->r_num_caps = 1;
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
- }
- dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
- ceph_cap_string(dirtied), mask);
-
- ceph_mdsc_put_request(req);
- __ceph_do_pending_vmtruncate(inode);
- return err;
-out:
- spin_unlock(&ci->i_ceph_lock);
- ceph_mdsc_put_request(req);
- return err;
-}
-
-/*
- * Verify that we have a lease on the given mask. If not,
- * do a getattr against an mds.
- */
-int ceph_do_getattr(struct inode *inode, int mask)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req;
- int err;
-
- if (ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("do_getattr inode %p SNAPDIR\n", inode);
- return 0;
- }
-
- dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
- if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
- return 0;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_inode = inode;
- ihold(inode);
- req->r_num_caps = 1;
- req->r_args.getattr.mask = cpu_to_le32(mask);
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- ceph_mdsc_put_request(req);
- dout("do_getattr result=%d\n", err);
- return err;
-}
-
-
-/*
- * Check inode permissions. We verify we have a valid value for
- * the AUTH cap, then call the generic handler.
- */
-int ceph_permission(struct inode *inode, int mask)
-{
- int err;
-
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
-
- err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
-
- if (!err)
- err = generic_permission(inode, mask);
- return err;
-}
-
-/*
- * Get all attributes. Hopefully somedata we'll have a statlite()
- * and can limit the fields we require to be accurate.
- */
-int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
-{
- struct inode *inode = dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int err;
-
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
- if (!err) {
- generic_fillattr(inode, stat);
- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
- if (ceph_snap(inode) != CEPH_NOSNAP)
- stat->dev = ceph_snap(inode);
- else
- stat->dev = 0;
- if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
- RBYTES))
- stat->size = ci->i_rbytes;
- else
- stat->size = ci->i_files + ci->i_subdirs;
- stat->blocks = 0;
- stat->blksize = 65536;
- }
- }
- return err;
-}
diff --git a/ANDROID_3.4.5/fs/ceph/ioctl.c b/ANDROID_3.4.5/fs/ceph/ioctl.c
deleted file mode 100644
index 790914a5..00000000
--- a/ANDROID_3.4.5/fs/ceph/ioctl.c
+++ /dev/null
@@ -1,290 +0,0 @@
-#include <linux/in.h>
-
-#include "super.h"
-#include "mds_client.h"
-#include <linux/ceph/ceph_debug.h>
-
-#include "ioctl.h"
-
-
-/*
- * ioctls
- */
-
-/*
- * get and set the file layout
- */
-static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
-{
- struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
- struct ceph_ioctl_layout l;
- int err;
-
- err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
- if (!err) {
- l.stripe_unit = ceph_file_layout_su(ci->i_layout);
- l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- l.object_size = ceph_file_layout_object_size(ci->i_layout);
- l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
- l.preferred_osd =
- (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
- if (copy_to_user(arg, &l, sizeof(l)))
- return -EFAULT;
- }
-
- return err;
-}
-
-static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
-{
- struct inode *inode = file->f_dentry->d_inode;
- struct inode *parent_inode;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
- struct ceph_mds_request *req;
- struct ceph_ioctl_layout l;
- struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
- struct ceph_ioctl_layout nl;
- int err, i;
-
- if (copy_from_user(&l, arg, sizeof(l)))
- return -EFAULT;
-
- /* validate changed params against current layout */
- err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
- if (!err) {
- nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
- nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- nl.object_size = ceph_file_layout_object_size(ci->i_layout);
- nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
- nl.preferred_osd =
- (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
- } else
- return err;
-
- if (l.stripe_count)
- nl.stripe_count = l.stripe_count;
- if (l.stripe_unit)
- nl.stripe_unit = l.stripe_unit;
- if (l.object_size)
- nl.object_size = l.object_size;
- if (l.data_pool)
- nl.data_pool = l.data_pool;
- if (l.preferred_osd)
- nl.preferred_osd = l.preferred_osd;
-
- if ((nl.object_size & ~PAGE_MASK) ||
- (nl.stripe_unit & ~PAGE_MASK) ||
- ((unsigned)nl.object_size % (unsigned)nl.stripe_unit))
- return -EINVAL;
-
- /* make sure it's a valid data pool */
- if (l.data_pool > 0) {
- mutex_lock(&mdsc->mutex);
- err = -EINVAL;
- for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
- if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
- err = 0;
- break;
- }
- mutex_unlock(&mdsc->mutex);
- if (err)
- return err;
- }
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
- USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_inode = inode;
- ihold(inode);
- req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
-
- req->r_args.setlayout.layout.fl_stripe_unit =
- cpu_to_le32(l.stripe_unit);
- req->r_args.setlayout.layout.fl_stripe_count =
- cpu_to_le32(l.stripe_count);
- req->r_args.setlayout.layout.fl_object_size =
- cpu_to_le32(l.object_size);
- req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
- req->r_args.setlayout.layout.fl_pg_preferred =
- cpu_to_le32(l.preferred_osd);
-
- parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
- ceph_mdsc_put_request(req);
- return err;
-}
-
-/*
- * Set a layout policy on a directory inode. All items in the tree
- * rooted at this inode will inherit this layout on creation,
- * (It doesn't apply retroactively )
- * unless a subdirectory has its own layout policy.
- */
-static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
-{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_mds_request *req;
- struct ceph_ioctl_layout l;
- int err, i;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
-
- /* copy and validate */
- if (copy_from_user(&l, arg, sizeof(l)))
- return -EFAULT;
-
- if ((l.object_size & ~PAGE_MASK) ||
- (l.stripe_unit & ~PAGE_MASK) ||
- !l.stripe_unit ||
- (l.object_size &&
- (unsigned)l.object_size % (unsigned)l.stripe_unit))
- return -EINVAL;
-
- /* make sure it's a valid data pool */
- if (l.data_pool > 0) {
- mutex_lock(&mdsc->mutex);
- err = -EINVAL;
- for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
- if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
- err = 0;
- break;
- }
- mutex_unlock(&mdsc->mutex);
- if (err)
- return err;
- }
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
- USE_AUTH_MDS);
-
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_inode = inode;
- ihold(inode);
-
- req->r_args.setlayout.layout.fl_stripe_unit =
- cpu_to_le32(l.stripe_unit);
- req->r_args.setlayout.layout.fl_stripe_count =
- cpu_to_le32(l.stripe_count);
- req->r_args.setlayout.layout.fl_object_size =
- cpu_to_le32(l.object_size);
- req->r_args.setlayout.layout.fl_pg_pool =
- cpu_to_le32(l.data_pool);
- req->r_args.setlayout.layout.fl_pg_preferred =
- cpu_to_le32(l.preferred_osd);
-
- err = ceph_mdsc_do_request(mdsc, inode, req);
- ceph_mdsc_put_request(req);
- return err;
-}
-
-/*
- * Return object name, size/offset information, and location (OSD
- * number, network address) for a given file offset.
- */
-static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
-{
- struct ceph_ioctl_dataloc dl;
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc =
- &ceph_sb_to_client(inode->i_sb)->client->osdc;
- u64 len = 1, olen;
- u64 tmp;
- struct ceph_object_layout ol;
- struct ceph_pg pgid;
-
- /* copy and validate */
- if (copy_from_user(&dl, arg, sizeof(dl)))
- return -EFAULT;
-
- down_read(&osdc->map_sem);
- ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
- &dl.object_no, &dl.object_offset, &olen);
- dl.file_offset -= dl.object_offset;
- dl.object_size = ceph_file_layout_object_size(ci->i_layout);
- dl.block_size = ceph_file_layout_su(ci->i_layout);
-
- /* block_offset = object_offset % block_size */
- tmp = dl.object_offset;
- dl.block_offset = do_div(tmp, dl.block_size);
-
- snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
- ceph_ino(inode), dl.object_no);
- ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
- osdc->osdmap);
-
- pgid = ol.ol_pgid;
- dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
- if (dl.osd >= 0) {
- struct ceph_entity_addr *a =
- ceph_osd_addr(osdc->osdmap, dl.osd);
- if (a)
- memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
- } else {
- memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
- }
- up_read(&osdc->map_sem);
-
- /* send result back to user */
- if (copy_to_user(arg, &dl, sizeof(dl)))
- return -EFAULT;
-
- return 0;
-}
-
-static long ceph_ioctl_lazyio(struct file *file)
-{
- struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
- spin_lock(&ci->i_ceph_lock);
- ci->i_nr_by_mode[fi->fmode]--;
- fi->fmode |= CEPH_FILE_MODE_LAZY;
- ci->i_nr_by_mode[fi->fmode]++;
- spin_unlock(&ci->i_ceph_lock);
- dout("ioctl_layzio: file %p marked lazy\n", file);
-
- ceph_check_caps(ci, 0, NULL);
- } else {
- dout("ioctl_layzio: file %p already lazy\n", file);
- }
- return 0;
-}
-
-static long ceph_ioctl_syncio(struct file *file)
-{
- struct ceph_file_info *fi = file->private_data;
-
- fi->flags |= CEPH_F_SYNC;
- return 0;
-}
-
-long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
- switch (cmd) {
- case CEPH_IOC_GET_LAYOUT:
- return ceph_ioctl_get_layout(file, (void __user *)arg);
-
- case CEPH_IOC_SET_LAYOUT:
- return ceph_ioctl_set_layout(file, (void __user *)arg);
-
- case CEPH_IOC_SET_LAYOUT_POLICY:
- return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
-
- case CEPH_IOC_GET_DATALOC:
- return ceph_ioctl_get_dataloc(file, (void __user *)arg);
-
- case CEPH_IOC_LAZYIO:
- return ceph_ioctl_lazyio(file);
-
- case CEPH_IOC_SYNCIO:
- return ceph_ioctl_syncio(file);
- }
-
- return -ENOTTY;
-}
diff --git a/ANDROID_3.4.5/fs/ceph/ioctl.h b/ANDROID_3.4.5/fs/ceph/ioctl.h
deleted file mode 100644
index be4a6048..00000000
--- a/ANDROID_3.4.5/fs/ceph/ioctl.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef FS_CEPH_IOCTL_H
-#define FS_CEPH_IOCTL_H
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-
-#define CEPH_IOCTL_MAGIC 0x97
-
-/*
- * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
- * CEPH_IOC_SET_LAYOUT - set file layout
- * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
- *
- * The file layout specifies how file data is striped over objects in
- * the distributed object store, which object pool they belong to (if
- * it differs from the default), and an optional 'preferred osd' to
- * store them on.
- *
- * Files get a new layout based on the policy set on the containing
- * directory or one of its ancestors. The GET_LAYOUT ioctl will let
- * you examine the layout for a file or the policy on a directory.
- *
- * SET_LAYOUT will let you set a layout on a newly created file. This
- * only works immediately after the file is created and before any
- * data is written to it.
- *
- * SET_LAYOUT_POLICY will let you set a layout policy (default layout)
- * on a directory that will apply to any new files created in that
- * directory (or any child directory that doesn't specify a layout of
- * its own).
- */
-
-/* use u64 to align sanely on all archs */
-struct ceph_ioctl_layout {
- __u64 stripe_unit, stripe_count, object_size;
- __u64 data_pool;
- __s64 preferred_osd;
-};
-
-#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
- struct ceph_ioctl_layout)
-#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
- struct ceph_ioctl_layout)
-#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
- struct ceph_ioctl_layout)
-
-/*
- * CEPH_IOC_GET_DATALOC - get location of file data in the cluster
- *
- * Extract identity, address of the OSD and object storing a given
- * file offset.
- */
-struct ceph_ioctl_dataloc {
- __u64 file_offset; /* in+out: file offset */
- __u64 object_offset; /* out: offset in object */
- __u64 object_no; /* out: object # */
- __u64 object_size; /* out: object size */
- char object_name[64]; /* out: object name */
- __u64 block_offset; /* out: offset in block */
- __u64 block_size; /* out: block length */
- __s64 osd; /* out: osd # */
- struct sockaddr_storage osd_addr; /* out: osd address */
-};
-
-#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
- struct ceph_ioctl_dataloc)
-
-/*
- * CEPH_IOC_LAZYIO - relax consistency
- *
- * Normally Ceph switches to synchronous IO when multiple clients have
- * the file open (and or more for write). Reads and writes bypass the
- * page cache and go directly to the OSD. Setting this flag on a file
- * descriptor will allow buffered IO for this file in cases where the
- * application knows it won't interfere with other nodes (or doesn't
- * care).
- */
-#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
-
-/*
- * CEPH_IOC_SYNCIO - force synchronous IO
- *
- * This ioctl sets a file flag that forces the synchronous IO that
- * bypasses the page cache, even if it is not necessary. This is
- * essentially the opposite behavior of IOC_LAZYIO. This forces the
- * same read/write path as a file opened by multiple clients when one
- * or more of those clients is opened for write.
- *
- * Note that this type of sync IO takes a different path than a file
- * opened with O_SYNC/D_SYNC (writes hit the page cache and are
- * immediately flushed on page boundaries). It is very similar to
- * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes
- * are not copied (user page must remain stable) and O_DIRECT writes
- * have alignment restrictions (on the buffer and file offset).
- */
-#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
-
-#endif
diff --git a/ANDROID_3.4.5/fs/ceph/locks.c b/ANDROID_3.4.5/fs/ceph/locks.c
deleted file mode 100644
index 80576d05..00000000
--- a/ANDROID_3.4.5/fs/ceph/locks.c
+++ /dev/null
@@ -1,286 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/file.h>
-#include <linux/namei.h>
-
-#include "super.h"
-#include "mds_client.h"
-#include <linux/ceph/pagelist.h>
-
-/**
- * Implement fcntl and flock locking functions.
- */
-static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
- int cmd, u8 wait, struct file_lock *fl)
-{
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
- struct ceph_mds_request *req;
- int err;
- u64 length = 0;
-
- req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_inode = inode;
- ihold(inode);
-
- /* mds requires start and length rather than start and end */
- if (LLONG_MAX == fl->fl_end)
- length = 0;
- else
- length = fl->fl_end - fl->fl_start + 1;
-
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type);
-
- req->r_args.filelock_change.rule = lock_type;
- req->r_args.filelock_change.type = cmd;
- req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
- /* This should be adjusted, but I'm not sure if
- namespaces actually get id numbers*/
- req->r_args.filelock_change.pid_namespace =
- cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
- req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
- req->r_args.filelock_change.length = cpu_to_le64(length);
- req->r_args.filelock_change.wait = wait;
-
- err = ceph_mdsc_do_request(mdsc, inode, req);
-
- if ( operation == CEPH_MDS_OP_GETFILELOCK){
- fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
- if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
- fl->fl_type = F_RDLCK;
- else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
- fl->fl_type = F_WRLCK;
- else
- fl->fl_type = F_UNLCK;
-
- fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
- length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
- le64_to_cpu(req->r_reply_info.filelock_reply->length);
- if (length >= 1)
- fl->fl_end = length -1;
- else
- fl->fl_end = 0;
-
- }
- ceph_mdsc_put_request(req);
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type, err);
- return err;
-}
-
-/**
- * Attempt to set an fcntl lock.
- * For now, this just goes away to the server. Later it may be more awesome.
- */
-int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
-{
- u8 lock_cmd;
- int err;
- u8 wait = 0;
- u16 op = CEPH_MDS_OP_SETFILELOCK;
-
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_lock, fl_pid:%d", fl->fl_pid);
-
- /* set wait bit as appropriate, then make command as Ceph expects it*/
- if (F_SETLKW == cmd)
- wait = 1;
- if (F_GETLK == cmd)
- op = CEPH_MDS_OP_GETFILELOCK;
-
- if (F_RDLCK == fl->fl_type)
- lock_cmd = CEPH_LOCK_SHARED;
- else if (F_WRLCK == fl->fl_type)
- lock_cmd = CEPH_LOCK_EXCL;
- else
- lock_cmd = CEPH_LOCK_UNLOCK;
-
- err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
- if (!err) {
- if ( op != CEPH_MDS_OP_GETFILELOCK ){
- dout("mds locked, locking locally");
- err = posix_lock_file(file, fl, NULL);
- if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
- /* undo! This should only happen if
- * the kernel detects local
- * deadlock. */
- ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on posix_lock_file, undid lock",
- err);
- }
- }
-
- } else if (err == -ERESTARTSYS) {
- dout("undoing lock\n");
- ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- CEPH_LOCK_UNLOCK, 0, fl);
- }
- return err;
-}
-
-int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
-{
- u8 lock_cmd;
- int err;
- u8 wait = 1;
-
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_flock, fl_pid:%d", fl->fl_pid);
-
- /* set wait bit, then clear it out of cmd*/
- if (cmd & LOCK_NB)
- wait = 0;
- cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
- /* set command sequence that Ceph wants to see:
- shared lock, exclusive lock, or unlock */
- if (LOCK_SH == cmd)
- lock_cmd = CEPH_LOCK_SHARED;
- else if (LOCK_EX == cmd)
- lock_cmd = CEPH_LOCK_EXCL;
- else
- lock_cmd = CEPH_LOCK_UNLOCK;
-
- err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
- file, lock_cmd, wait, fl);
- if (!err) {
- err = flock_lock_file_wait(file, fl);
- if (err) {
- ceph_lock_message(CEPH_LOCK_FLOCK,
- CEPH_MDS_OP_SETFILELOCK,
- file, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on flock_lock_file_wait, undid lock", err);
- }
- } else if (err == -ERESTARTSYS) {
- dout("undoing lock\n");
- ceph_lock_message(CEPH_LOCK_FLOCK,
- CEPH_MDS_OP_SETFILELOCK,
- file, CEPH_LOCK_UNLOCK, 0, fl);
- }
- return err;
-}
-
-/**
- * Must be called with BKL already held. Fills in the passed
- * counter variables, so you can prepare pagelist metadata before calling
- * ceph_encode_locks.
- */
-void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
-{
- struct file_lock *lock;
-
- *fcntl_count = 0;
- *flock_count = 0;
-
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_POSIX)
- ++(*fcntl_count);
- else if (lock->fl_flags & FL_FLOCK)
- ++(*flock_count);
- }
- dout("counted %d flock locks and %d fcntl locks",
- *flock_count, *fcntl_count);
-}
-
-/**
- * Encode the flock and fcntl locks for the given inode into the pagelist.
- * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
- * sequential flock locks.
- * Must be called with lock_flocks() already held.
- * If we encounter more of a specific lock type than expected,
- * we return the value 1.
- */
-int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
- int num_fcntl_locks, int num_flock_locks)
-{
- struct file_lock *lock;
- struct ceph_filelock cephlock;
- int err = 0;
- int seen_fcntl = 0;
- int seen_flock = 0;
-
- dout("encoding %d flock and %d fcntl locks", num_flock_locks,
- num_fcntl_locks);
- err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
- if (err)
- goto fail;
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_POSIX) {
- ++seen_fcntl;
- if (seen_fcntl > num_fcntl_locks) {
- err = -ENOSPC;
- goto fail;
- }
- err = lock_to_ceph_filelock(lock, &cephlock);
- if (err)
- goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
- }
- if (err)
- goto fail;
- }
-
- err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
- if (err)
- goto fail;
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_FLOCK) {
- ++seen_flock;
- if (seen_flock > num_flock_locks) {
- err = -ENOSPC;
- goto fail;
- }
- err = lock_to_ceph_filelock(lock, &cephlock);
- if (err)
- goto fail;
- err = ceph_pagelist_append(pagelist, &cephlock,
- sizeof(struct ceph_filelock));
- }
- if (err)
- goto fail;
- }
-fail:
- return err;
-}
-
-/*
- * Given a pointer to a lock, convert it to a ceph filelock
- */
-int lock_to_ceph_filelock(struct file_lock *lock,
- struct ceph_filelock *cephlock)
-{
- int err = 0;
-
- cephlock->start = cpu_to_le64(lock->fl_start);
- cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
- cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64(lock->fl_pid);
- cephlock->pid_namespace =
- cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
-
- switch (lock->fl_type) {
- case F_RDLCK:
- cephlock->type = CEPH_LOCK_SHARED;
- break;
- case F_WRLCK:
- cephlock->type = CEPH_LOCK_EXCL;
- break;
- case F_UNLCK:
- cephlock->type = CEPH_LOCK_UNLOCK;
- break;
- default:
- dout("Have unknown lock type %d", lock->fl_type);
- err = -EINVAL;
- }
-
- return err;
-}
diff --git a/ANDROID_3.4.5/fs/ceph/mds_client.c b/ANDROID_3.4.5/fs/ceph/mds_client.c
deleted file mode 100644
index 89971e13..00000000
--- a/ANDROID_3.4.5/fs/ceph/mds_client.c
+++ /dev/null
@@ -1,3465 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/fs.h>
-#include <linux/wait.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-#include <linux/ceph/messenger.h>
-#include <linux/ceph/decode.h>
-#include <linux/ceph/pagelist.h>
-#include <linux/ceph/auth.h>
-#include <linux/ceph/debugfs.h>
-
-/*
- * A cluster of MDS (metadata server) daemons is responsible for
- * managing the file system namespace (the directory hierarchy and
- * inodes) and for coordinating shared access to storage. Metadata is
- * partitioning hierarchically across a number of servers, and that
- * partition varies over time as the cluster adjusts the distribution
- * in order to balance load.
- *
- * The MDS client is primarily responsible to managing synchronous
- * metadata requests for operations like open, unlink, and so forth.
- * If there is a MDS failure, we find out about it when we (possibly
- * request and) receive a new MDS map, and can resubmit affected
- * requests.
- *
- * For the most part, though, we take advantage of a lossless
- * communications channel to the MDS, and do not need to worry about
- * timing out or resubmitting requests.
- *
- * We maintain a stateful "session" with each MDS we interact with.
- * Within each session, we sent periodic heartbeat messages to ensure
- * any capabilities or leases we have been issues remain valid. If
- * the session times out and goes stale, our leases and capabilities
- * are no longer valid.
- */
-
-struct ceph_reconnect_state {
- struct ceph_pagelist *pagelist;
- bool flock;
-};
-
-static void __wake_requests(struct ceph_mds_client *mdsc,
- struct list_head *head);
-
-static const struct ceph_connection_operations mds_con_ops;
-
-
-/*
- * mds reply parsing
- */
-
-/*
- * parse individual inode info
- */
-static int parse_reply_info_in(void **p, void *end,
- struct ceph_mds_reply_info_in *info,
- int features)
-{
- int err = -EIO;
-
- info->in = *p;
- *p += sizeof(struct ceph_mds_reply_inode) +
- sizeof(*info->in->fragtree.splits) *
- le32_to_cpu(info->in->fragtree.nsplits);
-
- ceph_decode_32_safe(p, end, info->symlink_len, bad);
- ceph_decode_need(p, end, info->symlink_len, bad);
- info->symlink = *p;
- *p += info->symlink_len;
-
- if (features & CEPH_FEATURE_DIRLAYOUTHASH)
- ceph_decode_copy_safe(p, end, &info->dir_layout,
- sizeof(info->dir_layout), bad);
- else
- memset(&info->dir_layout, 0, sizeof(info->dir_layout));
-
- ceph_decode_32_safe(p, end, info->xattr_len, bad);
- ceph_decode_need(p, end, info->xattr_len, bad);
- info->xattr_data = *p;
- *p += info->xattr_len;
- return 0;
-bad:
- return err;
-}
-
-/*
- * parse a normal reply, which may contain a (dir+)dentry and/or a
- * target inode.
- */
-static int parse_reply_info_trace(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info,
- int features)
-{
- int err;
-
- if (info->head->is_dentry) {
- err = parse_reply_info_in(p, end, &info->diri, features);
- if (err < 0)
- goto out_bad;
-
- if (unlikely(*p + sizeof(*info->dirfrag) > end))
- goto bad;
- info->dirfrag = *p;
- *p += sizeof(*info->dirfrag) +
- sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
- if (unlikely(*p > end))
- goto bad;
-
- ceph_decode_32_safe(p, end, info->dname_len, bad);
- ceph_decode_need(p, end, info->dname_len, bad);
- info->dname = *p;
- *p += info->dname_len;
- info->dlease = *p;
- *p += sizeof(*info->dlease);
- }
-
- if (info->head->is_target) {
- err = parse_reply_info_in(p, end, &info->targeti, features);
- if (err < 0)
- goto out_bad;
- }
-
- if (unlikely(*p != end))
- goto bad;
- return 0;
-
-bad:
- err = -EIO;
-out_bad:
- pr_err("problem parsing mds trace %d\n", err);
- return err;
-}
-
-/*
- * parse readdir results
- */
-static int parse_reply_info_dir(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info,
- int features)
-{
- u32 num, i = 0;
- int err;
-
- info->dir_dir = *p;
- if (*p + sizeof(*info->dir_dir) > end)
- goto bad;
- *p += sizeof(*info->dir_dir) +
- sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
- if (*p > end)
- goto bad;
-
- ceph_decode_need(p, end, sizeof(num) + 2, bad);
- num = ceph_decode_32(p);
- info->dir_end = ceph_decode_8(p);
- info->dir_complete = ceph_decode_8(p);
- if (num == 0)
- goto done;
-
- /* alloc large array */
- info->dir_nr = num;
- info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
- sizeof(*info->dir_dname) +
- sizeof(*info->dir_dname_len) +
- sizeof(*info->dir_dlease),
- GFP_NOFS);
- if (info->dir_in == NULL) {
- err = -ENOMEM;
- goto out_bad;
- }
- info->dir_dname = (void *)(info->dir_in + num);
- info->dir_dname_len = (void *)(info->dir_dname + num);
- info->dir_dlease = (void *)(info->dir_dname_len + num);
-
- while (num) {
- /* dentry */
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
- info->dir_dname_len[i] = ceph_decode_32(p);
- ceph_decode_need(p, end, info->dir_dname_len[i], bad);
- info->dir_dname[i] = *p;
- *p += info->dir_dname_len[i];
- dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
- info->dir_dname[i]);
- info->dir_dlease[i] = *p;
- *p += sizeof(struct ceph_mds_reply_lease);
-
- /* inode */
- err = parse_reply_info_in(p, end, &info->dir_in[i], features);
- if (err < 0)
- goto out_bad;
- i++;
- num--;
- }
-
-done:
- if (*p != end)
- goto bad;
- return 0;
-
-bad:
- err = -EIO;
-out_bad:
- pr_err("problem parsing dir contents %d\n", err);
- return err;
-}
-
-/*
- * parse fcntl F_GETLK results
- */
-static int parse_reply_info_filelock(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info,
- int features)
-{
- if (*p + sizeof(*info->filelock_reply) > end)
- goto bad;
-
- info->filelock_reply = *p;
- *p += sizeof(*info->filelock_reply);
-
- if (unlikely(*p != end))
- goto bad;
- return 0;
-
-bad:
- return -EIO;
-}
-
-/*
- * parse extra results
- */
-static int parse_reply_info_extra(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info,
- int features)
-{
- if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
- return parse_reply_info_filelock(p, end, info, features);
- else
- return parse_reply_info_dir(p, end, info, features);
-}
-
-/*
- * parse entire mds reply
- */
-static int parse_reply_info(struct ceph_msg *msg,
- struct ceph_mds_reply_info_parsed *info,
- int features)
-{
- void *p, *end;
- u32 len;
- int err;
-
- info->head = msg->front.iov_base;
- p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
- end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
-
- /* trace */
- ceph_decode_32_safe(&p, end, len, bad);
- if (len > 0) {
- ceph_decode_need(&p, end, len, bad);
- err = parse_reply_info_trace(&p, p+len, info, features);
- if (err < 0)
- goto out_bad;
- }
-
- /* extra */
- ceph_decode_32_safe(&p, end, len, bad);
- if (len > 0) {
- ceph_decode_need(&p, end, len, bad);
- err = parse_reply_info_extra(&p, p+len, info, features);
- if (err < 0)
- goto out_bad;
- }
-
- /* snap blob */
- ceph_decode_32_safe(&p, end, len, bad);
- info->snapblob_len = len;
- info->snapblob = p;
- p += len;
-
- if (p != end)
- goto bad;
- return 0;
-
-bad:
- err = -EIO;
-out_bad:
- pr_err("mds parse_reply err %d\n", err);
- return err;
-}
-
-static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
-{
- kfree(info->dir_in);
-}
-
-
-/*
- * sessions
- */
-static const char *session_state_name(int s)
-{
- switch (s) {
- case CEPH_MDS_SESSION_NEW: return "new";
- case CEPH_MDS_SESSION_OPENING: return "opening";
- case CEPH_MDS_SESSION_OPEN: return "open";
- case CEPH_MDS_SESSION_HUNG: return "hung";
- case CEPH_MDS_SESSION_CLOSING: return "closing";
- case CEPH_MDS_SESSION_RESTARTING: return "restarting";
- case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
- default: return "???";
- }
-}
-
-static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
-{
- if (atomic_inc_not_zero(&s->s_ref)) {
- dout("mdsc get_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
- return s;
- } else {
- dout("mdsc get_session %p 0 -- FAIL", s);
- return NULL;
- }
-}
-
-void ceph_put_mds_session(struct ceph_mds_session *s)
-{
- dout("mdsc put_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
- if (atomic_dec_and_test(&s->s_ref)) {
- if (s->s_authorizer)
- s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
- s->s_mdsc->fsc->client->monc.auth,
- s->s_authorizer);
- kfree(s);
- }
-}
-
-/*
- * called under mdsc->mutex
- */
-struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
- int mds)
-{
- struct ceph_mds_session *session;
-
- if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
- return NULL;
- session = mdsc->sessions[mds];
- dout("lookup_mds_session %p %d\n", session,
- atomic_read(&session->s_ref));
- get_session(session);
- return session;
-}
-
-static bool __have_session(struct ceph_mds_client *mdsc, int mds)
-{
- if (mds >= mdsc->max_sessions)
- return false;
- return mdsc->sessions[mds];
-}
-
-static int __verify_registered_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *s)
-{
- if (s->s_mds >= mdsc->max_sessions ||
- mdsc->sessions[s->s_mds] != s)
- return -ENOENT;
- return 0;
-}
-
-/*
- * create+register a new session for given mds.
- * called under mdsc->mutex.
- */
-static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
- int mds)
-{
- struct ceph_mds_session *s;
-
- s = kzalloc(sizeof(*s), GFP_NOFS);
- if (!s)
- return ERR_PTR(-ENOMEM);
- s->s_mdsc = mdsc;
- s->s_mds = mds;
- s->s_state = CEPH_MDS_SESSION_NEW;
- s->s_ttl = 0;
- s->s_seq = 0;
- mutex_init(&s->s_mutex);
-
- ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
- s->s_con.private = s;
- s->s_con.ops = &mds_con_ops;
- s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
- s->s_con.peer_name.num = cpu_to_le64(mds);
-
- spin_lock_init(&s->s_gen_ttl_lock);
- s->s_cap_gen = 0;
- s->s_cap_ttl = jiffies - 1;
-
- spin_lock_init(&s->s_cap_lock);
- s->s_renew_requested = 0;
- s->s_renew_seq = 0;
- INIT_LIST_HEAD(&s->s_caps);
- s->s_nr_caps = 0;
- s->s_trim_caps = 0;
- atomic_set(&s->s_ref, 1);
- INIT_LIST_HEAD(&s->s_waiting);
- INIT_LIST_HEAD(&s->s_unsafe);
- s->s_num_cap_releases = 0;
- s->s_cap_iterator = NULL;
- INIT_LIST_HEAD(&s->s_cap_releases);
- INIT_LIST_HEAD(&s->s_cap_releases_done);
- INIT_LIST_HEAD(&s->s_cap_flushing);
- INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
-
- dout("register_session mds%d\n", mds);
- if (mds >= mdsc->max_sessions) {
- int newmax = 1 << get_count_order(mds+1);
- struct ceph_mds_session **sa;
-
- dout("register_session realloc to %d\n", newmax);
- sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
- if (sa == NULL)
- goto fail_realloc;
- if (mdsc->sessions) {
- memcpy(sa, mdsc->sessions,
- mdsc->max_sessions * sizeof(void *));
- kfree(mdsc->sessions);
- }
- mdsc->sessions = sa;
- mdsc->max_sessions = newmax;
- }
- mdsc->sessions[mds] = s;
- atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
-
- ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
- return s;
-
-fail_realloc:
- kfree(s);
- return ERR_PTR(-ENOMEM);
-}
-
-/*
- * called under mdsc->mutex
- */
-static void __unregister_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *s)
-{
- dout("__unregister_session mds%d %p\n", s->s_mds, s);
- BUG_ON(mdsc->sessions[s->s_mds] != s);
- mdsc->sessions[s->s_mds] = NULL;
- ceph_con_close(&s->s_con);
- ceph_put_mds_session(s);
-}
-
-/*
- * drop session refs in request.
- *
- * should be last request ref, or hold mdsc->mutex
- */
-static void put_request_session(struct ceph_mds_request *req)
-{
- if (req->r_session) {
- ceph_put_mds_session(req->r_session);
- req->r_session = NULL;
- }
-}
-
-void ceph_mdsc_release_request(struct kref *kref)
-{
- struct ceph_mds_request *req = container_of(kref,
- struct ceph_mds_request,
- r_kref);
- if (req->r_request)
- ceph_msg_put(req->r_request);
- if (req->r_reply) {
- ceph_msg_put(req->r_reply);
- destroy_reply_info(&req->r_reply_info);
- }
- if (req->r_inode) {
- ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- iput(req->r_inode);
- }
- if (req->r_locked_dir)
- ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_target_inode)
- iput(req->r_target_inode);
- if (req->r_dentry)
- dput(req->r_dentry);
- if (req->r_old_dentry) {
- /*
- * track (and drop pins for) r_old_dentry_dir
- * separately, since r_old_dentry's d_parent may have
- * changed between the dir mutex being dropped and
- * this request being freed.
- */
- ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
- CEPH_CAP_PIN);
- dput(req->r_old_dentry);
- iput(req->r_old_dentry_dir);
- }
- kfree(req->r_path1);
- kfree(req->r_path2);
- put_request_session(req);
- ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
- kfree(req);
-}
-
-/*
- * lookup session, bump ref if found.
- *
- * called under mdsc->mutex.
- */
-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
- u64 tid)
-{
- struct ceph_mds_request *req;
- struct rb_node *n = mdsc->request_tree.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_mds_request, r_node);
- if (tid < req->r_tid)
- n = n->rb_left;
- else if (tid > req->r_tid)
- n = n->rb_right;
- else {
- ceph_mdsc_get_request(req);
- return req;
- }
- }
- return NULL;
-}
-
-static void __insert_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *new)
-{
- struct rb_node **p = &mdsc->request_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_mds_request *req = NULL;
-
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_mds_request, r_node);
- if (new->r_tid < req->r_tid)
- p = &(*p)->rb_left;
- else if (new->r_tid > req->r_tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->r_node, parent, p);
- rb_insert_color(&new->r_node, &mdsc->request_tree);
-}
-
-/*
- * Register an in-flight request, and assign a tid. Link to directory
- * are modifying (if any).
- *
- * Called under mdsc->mutex.
- */
-static void __register_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req,
- struct inode *dir)
-{
- req->r_tid = ++mdsc->last_tid;
- if (req->r_num_caps)
- ceph_reserve_caps(mdsc, &req->r_caps_reservation,
- req->r_num_caps);
- dout("__register_request %p tid %lld\n", req, req->r_tid);
- ceph_mdsc_get_request(req);
- __insert_request(mdsc, req);
-
- req->r_uid = current_fsuid();
- req->r_gid = current_fsgid();
-
- if (dir) {
- struct ceph_inode_info *ci = ceph_inode(dir);
-
- ihold(dir);
- spin_lock(&ci->i_unsafe_lock);
- req->r_unsafe_dir = dir;
- list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
- }
-}
-
-static void __unregister_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
-{
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
- rb_erase(&req->r_node, &mdsc->request_tree);
- RB_CLEAR_NODE(&req->r_node);
-
- if (req->r_unsafe_dir) {
- struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
- spin_lock(&ci->i_unsafe_lock);
- list_del_init(&req->r_unsafe_dir_item);
- spin_unlock(&ci->i_unsafe_lock);
-
- iput(req->r_unsafe_dir);
- req->r_unsafe_dir = NULL;
- }
-
- ceph_mdsc_put_request(req);
-}
-
-/*
- * Choose mds to send request to next. If there is a hint set in the
- * request (e.g., due to a prior forward hint from the mds), use that.
- * Otherwise, consult frag tree and/or caps to identify the
- * appropriate mds. If all else fails, choose randomly.
- *
- * Called under mdsc->mutex.
- */
-static struct dentry *get_nonsnap_parent(struct dentry *dentry)
-{
- /*
- * we don't need to worry about protecting the d_parent access
- * here because we never renaming inside the snapped namespace
- * except to resplice to another snapdir, and either the old or new
- * result is a valid result.
- */
- while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
- dentry = dentry->d_parent;
- return dentry;
-}
-
-static int __choose_mds(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
-{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_cap *cap;
- int mode = req->r_direct_mode;
- int mds = -1;
- u32 hash = req->r_direct_hash;
- bool is_hash = req->r_direct_is_hash;
-
- /*
- * is there a specific mds we should try? ignore hint if we have
- * no session and the mds is not up (active or recovering).
- */
- if (req->r_resend_mds >= 0 &&
- (__have_session(mdsc, req->r_resend_mds) ||
- ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
- dout("choose_mds using resend_mds mds%d\n",
- req->r_resend_mds);
- return req->r_resend_mds;
- }
-
- if (mode == USE_RANDOM_MDS)
- goto random;
-
- inode = NULL;
- if (req->r_inode) {
- inode = req->r_inode;
- } else if (req->r_dentry) {
- /* ignore race with rename; old or new d_parent is okay */
- struct dentry *parent = req->r_dentry->d_parent;
- struct inode *dir = parent->d_inode;
-
- if (dir->i_sb != mdsc->fsc->sb) {
- /* not this fs! */
- inode = req->r_dentry->d_inode;
- } else if (ceph_snap(dir) != CEPH_NOSNAP) {
- /* direct snapped/virtual snapdir requests
- * based on parent dir inode */
- struct dentry *dn = get_nonsnap_parent(parent);
- inode = dn->d_inode;
- dout("__choose_mds using nonsnap parent %p\n", inode);
- } else if (req->r_dentry->d_inode) {
- /* dentry target */
- inode = req->r_dentry->d_inode;
- } else {
- /* dir + name */
- inode = dir;
- hash = ceph_dentry_hash(dir, req->r_dentry);
- is_hash = true;
- }
- }
-
- dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
- (int)hash, mode);
- if (!inode)
- goto random;
- ci = ceph_inode(inode);
-
- if (is_hash && S_ISDIR(inode->i_mode)) {
- struct ceph_inode_frag frag;
- int found;
-
- ceph_choose_frag(ci, hash, &frag, &found);
- if (found) {
- if (mode == USE_ANY_MDS && frag.ndist > 0) {
- u8 r;
-
- /* choose a random replica */
- get_random_bytes(&r, 1);
- r %= frag.ndist;
- mds = frag.dist[r];
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (%d/%d)\n",
- inode, ceph_vinop(inode),
- frag.frag, mds,
- (int)r, frag.ndist);
- if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
- CEPH_MDS_STATE_ACTIVE)
- return mds;
- }
-
- /* since this file/dir wasn't known to be
- * replicated, then we want to look for the
- * authoritative mds. */
- mode = USE_AUTH_MDS;
- if (frag.mds >= 0) {
- /* choose auth mds */
- mds = frag.mds;
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (auth)\n",
- inode, ceph_vinop(inode), frag.frag, mds);
- if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
- CEPH_MDS_STATE_ACTIVE)
- return mds;
- }
- }
- }
-
- spin_lock(&ci->i_ceph_lock);
- cap = NULL;
- if (mode == USE_AUTH_MDS)
- cap = ci->i_auth_cap;
- if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
- cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
- if (!cap) {
- spin_unlock(&ci->i_ceph_lock);
- goto random;
- }
- mds = cap->session->s_mds;
- dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
- inode, ceph_vinop(inode), mds,
- cap == ci->i_auth_cap ? "auth " : "", cap);
- spin_unlock(&ci->i_ceph_lock);
- return mds;
-
-random:
- mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
- dout("choose_mds chose random mds%d\n", mds);
- return mds;
-}
-
-
-/*
- * session messages
- */
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
-{
- struct ceph_msg *msg;
- struct ceph_mds_session_head *h;
-
- msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
- false);
- if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
- return NULL;
- }
- h = msg->front.iov_base;
- h->op = cpu_to_le32(op);
- h->seq = cpu_to_le64(seq);
- return msg;
-}
-
-/*
- * send session open request.
- *
- * called under mdsc->mutex
- */
-static int __open_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_msg *msg;
- int mstate;
- int mds = session->s_mds;
-
- /* wait for mds to go active? */
- mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
- dout("open_session to mds%d (%s)\n", mds,
- ceph_mds_state_name(mstate));
- session->s_state = CEPH_MDS_SESSION_OPENING;
- session->s_renew_requested = jiffies;
-
- /* send connect message */
- msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
- if (!msg)
- return -ENOMEM;
- ceph_con_send(&session->s_con, msg);
- return 0;
-}
-
-/*
- * open sessions for any export targets for the given mds
- *
- * called under mdsc->mutex
- */
-static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_mds_info *mi;
- struct ceph_mds_session *ts;
- int i, mds = session->s_mds;
- int target;
-
- if (mds >= mdsc->mdsmap->m_max_mds)
- return;
- mi = &mdsc->mdsmap->m_info[mds];
- dout("open_export_target_sessions for mds%d (%d targets)\n",
- session->s_mds, mi->num_export_targets);
-
- for (i = 0; i < mi->num_export_targets; i++) {
- target = mi->export_targets[i];
- ts = __ceph_lookup_mds_session(mdsc, target);
- if (!ts) {
- ts = register_session(mdsc, target);
- if (IS_ERR(ts))
- return;
- }
- if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
- else
- dout(" mds%d target mds%d %p is %s\n", session->s_mds,
- i, ts, session_state_name(ts->s_state));
- ceph_put_mds_session(ts);
- }
-}
-
-void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- mutex_lock(&mdsc->mutex);
- __open_export_target_sessions(mdsc, session);
- mutex_unlock(&mdsc->mutex);
-}
-
-/*
- * session caps
- */
-
-/*
- * Free preallocated cap messages assigned to this session
- */
-static void cleanup_cap_releases(struct ceph_mds_session *session)
-{
- struct ceph_msg *msg;
-
- spin_lock(&session->s_cap_lock);
- while (!list_empty(&session->s_cap_releases)) {
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
- }
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
- }
- spin_unlock(&session->s_cap_lock);
-}
-
-/*
- * Helper to safely iterate over all caps associated with a session, with
- * special care taken to handle a racing __ceph_remove_cap().
- *
- * Caller must hold session s_mutex.
- */
-static int iterate_session_caps(struct ceph_mds_session *session,
- int (*cb)(struct inode *, struct ceph_cap *,
- void *), void *arg)
-{
- struct list_head *p;
- struct ceph_cap *cap;
- struct inode *inode, *last_inode = NULL;
- struct ceph_cap *old_cap = NULL;
- int ret;
-
- dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
- spin_lock(&session->s_cap_lock);
- p = session->s_caps.next;
- while (p != &session->s_caps) {
- cap = list_entry(p, struct ceph_cap, session_caps);
- inode = igrab(&cap->ci->vfs_inode);
- if (!inode) {
- p = p->next;
- continue;
- }
- session->s_cap_iterator = cap;
- spin_unlock(&session->s_cap_lock);
-
- if (last_inode) {
- iput(last_inode);
- last_inode = NULL;
- }
- if (old_cap) {
- ceph_put_cap(session->s_mdsc, old_cap);
- old_cap = NULL;
- }
-
- ret = cb(inode, cap, arg);
- last_inode = inode;
-
- spin_lock(&session->s_cap_lock);
- p = p->next;
- if (cap->ci == NULL) {
- dout("iterate_session_caps finishing cap %p removal\n",
- cap);
- BUG_ON(cap->session != session);
- list_del_init(&cap->session_caps);
- session->s_nr_caps--;
- cap->session = NULL;
- old_cap = cap; /* put_cap it w/o locks held */
- }
- if (ret < 0)
- goto out;
- }
- ret = 0;
-out:
- session->s_cap_iterator = NULL;
- spin_unlock(&session->s_cap_lock);
-
- if (last_inode)
- iput(last_inode);
- if (old_cap)
- ceph_put_cap(session->s_mdsc, old_cap);
-
- return ret;
-}
-
-static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int drop = 0;
-
- dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
- spin_lock(&ci->i_ceph_lock);
- __ceph_remove_cap(cap);
- if (!__ceph_is_any_real_caps(ci)) {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
-
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- pr_info(" dropping dirty %s state for %p %lld\n",
- ceph_cap_string(ci->i_dirty_caps),
- inode, ceph_ino(inode));
- ci->i_dirty_caps = 0;
- list_del_init(&ci->i_dirty_item);
- drop = 1;
- }
- if (!list_empty(&ci->i_flushing_item)) {
- pr_info(" dropping dirty+flushing %s state for %p %lld\n",
- ceph_cap_string(ci->i_flushing_caps),
- inode, ceph_ino(inode));
- ci->i_flushing_caps = 0;
- list_del_init(&ci->i_flushing_item);
- mdsc->num_cap_flushing--;
- drop = 1;
- }
- if (drop && ci->i_wrbuffer_ref) {
- pr_info(" dropping dirty data for %p %lld\n",
- inode, ceph_ino(inode));
- ci->i_wrbuffer_ref = 0;
- ci->i_wrbuffer_ref_head = 0;
- drop++;
- }
- spin_unlock(&mdsc->cap_dirty_lock);
- }
- spin_unlock(&ci->i_ceph_lock);
- while (drop--)
- iput(inode);
- return 0;
-}
-
-/*
- * caller must hold session s_mutex
- */
-static void remove_session_caps(struct ceph_mds_session *session)
-{
- dout("remove_session_caps on %p\n", session);
- iterate_session_caps(session, remove_session_caps_cb, NULL);
- BUG_ON(session->s_nr_caps > 0);
- BUG_ON(!list_empty(&session->s_cap_flushing));
- cleanup_cap_releases(session);
-}
-
-/*
- * wake up any threads waiting on this session's caps. if the cap is
- * old (didn't get renewed on the client reconnect), remove it now.
- *
- * caller must hold s_mutex.
- */
-static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- wake_up_all(&ci->i_cap_wq);
- if (arg) {
- spin_lock(&ci->i_ceph_lock);
- ci->i_wanted_max_size = 0;
- ci->i_requested_max_size = 0;
- spin_unlock(&ci->i_ceph_lock);
- }
- return 0;
-}
-
-static void wake_up_session_caps(struct ceph_mds_session *session,
- int reconnect)
-{
- dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
- iterate_session_caps(session, wake_up_session_cb,
- (void *)(unsigned long)reconnect);
-}
-
-/*
- * Send periodic message to MDS renewing all currently held caps. The
- * ack will reset the expiration for all caps from this session.
- *
- * caller holds s_mutex
- */
-static int send_renew_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_msg *msg;
- int state;
-
- if (time_after_eq(jiffies, session->s_cap_ttl) &&
- time_after_eq(session->s_cap_ttl, session->s_renew_requested))
- pr_info("mds%d caps stale\n", session->s_mds);
- session->s_renew_requested = jiffies;
-
- /* do not try to renew caps until a recovering mds has reconnected
- * with its clients. */
- state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
- if (state < CEPH_MDS_STATE_RECONNECT) {
- dout("send_renew_caps ignoring mds%d (%s)\n",
- session->s_mds, ceph_mds_state_name(state));
- return 0;
- }
-
- dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
- ceph_mds_state_name(state));
- msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
- ++session->s_renew_seq);
- if (!msg)
- return -ENOMEM;
- ceph_con_send(&session->s_con, msg);
- return 0;
-}
-
-/*
- * Note new cap ttl, and any transition from stale -> not stale (fresh?).
- *
- * Called under session->s_mutex
- */
-static void renewed_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session, int is_renew)
-{
- int was_stale;
- int wake = 0;
-
- spin_lock(&session->s_cap_lock);
- was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
-
- session->s_cap_ttl = session->s_renew_requested +
- mdsc->mdsmap->m_session_timeout*HZ;
-
- if (was_stale) {
- if (time_before(jiffies, session->s_cap_ttl)) {
- pr_info("mds%d caps renewed\n", session->s_mds);
- wake = 1;
- } else {
- pr_info("mds%d caps still stale\n", session->s_mds);
- }
- }
- dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
- session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
- time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
- spin_unlock(&session->s_cap_lock);
-
- if (wake)
- wake_up_session_caps(session, 0);
-}
-
-/*
- * send a session close request
- */
-static int request_close_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_msg *msg;
-
- dout("request_close_session mds%d state %s seq %lld\n",
- session->s_mds, session_state_name(session->s_state),
- session->s_seq);
- msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
- if (!msg)
- return -ENOMEM;
- ceph_con_send(&session->s_con, msg);
- return 0;
-}
-
-/*
- * Called with s_mutex held.
- */
-static int __close_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
- return 0;
- session->s_state = CEPH_MDS_SESSION_CLOSING;
- return request_close_session(mdsc, session);
-}
-
-/*
- * Trim old(er) caps.
- *
- * Because we can't cache an inode without one or more caps, we do
- * this indirectly: if a cap is unused, we prune its aliases, at which
- * point the inode will hopefully get dropped to.
- *
- * Yes, this is a bit sloppy. Our only real goal here is to respond to
- * memory pressure from the MDS, though, so it needn't be perfect.
- */
-static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
-{
- struct ceph_mds_session *session = arg;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int used, oissued, mine;
-
- if (session->s_trim_caps <= 0)
- return -1;
-
- spin_lock(&ci->i_ceph_lock);
- mine = cap->issued | cap->implemented;
- used = __ceph_caps_used(ci);
- oissued = __ceph_caps_issued_other(ci, cap);
-
- dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
- inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
- ceph_cap_string(used));
- if (ci->i_dirty_caps)
- goto out; /* dirty caps */
- if ((used & ~oissued) & mine)
- goto out; /* we need these caps */
-
- session->s_trim_caps--;
- if (oissued) {
- /* we aren't the only cap.. just remove us */
- __ceph_remove_cap(cap);
- } else {
- /* try to drop referring dentries */
- spin_unlock(&ci->i_ceph_lock);
- d_prune_aliases(inode);
- dout("trim_caps_cb %p cap %p pruned, count now %d\n",
- inode, cap, atomic_read(&inode->i_count));
- return 0;
- }
-
-out:
- spin_unlock(&ci->i_ceph_lock);
- return 0;
-}
-
-/*
- * Trim session cap count down to some max number.
- */
-static int trim_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- int max_caps)
-{
- int trim_caps = session->s_nr_caps - max_caps;
-
- dout("trim_caps mds%d start: %d / %d, trim %d\n",
- session->s_mds, session->s_nr_caps, max_caps, trim_caps);
- if (trim_caps > 0) {
- session->s_trim_caps = trim_caps;
- iterate_session_caps(session, trim_caps_cb, session);
- dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
- session->s_mds, session->s_nr_caps, max_caps,
- trim_caps - session->s_trim_caps);
- session->s_trim_caps = 0;
- }
- return 0;
-}
-
-/*
- * Allocate cap_release messages. If there is a partially full message
- * in the queue, try to allocate enough to cover it's remainder, so that
- * we can send it immediately.
- *
- * Called under s_mutex.
- */
-int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_msg *msg, *partial = NULL;
- struct ceph_mds_cap_release *head;
- int err = -ENOMEM;
- int extra = mdsc->fsc->mount_options->cap_release_safety;
- int num;
-
- dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
- extra);
-
- spin_lock(&session->s_cap_lock);
-
- if (!list_empty(&session->s_cap_releases)) {
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg,
- list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- if (num) {
- dout(" partial %p with (%d/%d)\n", msg, num,
- (int)CEPH_CAPS_PER_RELEASE);
- extra += CEPH_CAPS_PER_RELEASE - num;
- partial = msg;
- }
- }
- while (session->s_num_cap_releases < session->s_nr_caps + extra) {
- spin_unlock(&session->s_cap_lock);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
- GFP_NOFS, false);
- if (!msg)
- goto out_unlocked;
- dout("add_cap_releases %p msg %p now %d\n", session, msg,
- (int)msg->front.iov_len);
- head = msg->front.iov_base;
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- spin_lock(&session->s_cap_lock);
- list_add(&msg->list_head, &session->s_cap_releases);
- session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
- }
-
- if (partial) {
- head = partial->front.iov_base;
- num = le32_to_cpu(head->num);
- dout(" queueing partial %p with %d/%d\n", partial, num,
- (int)CEPH_CAPS_PER_RELEASE);
- list_move_tail(&partial->list_head,
- &session->s_cap_releases_done);
- session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
- }
- err = 0;
- spin_unlock(&session->s_cap_lock);
-out_unlocked:
- return err;
-}
-
-/*
- * flush all dirty inode data to disk.
- *
- * returns true if we've flushed through want_flush_seq
- */
-static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
-{
- int mds, ret = 1;
-
- dout("check_cap_flush want %lld\n", want_flush_seq);
- mutex_lock(&mdsc->mutex);
- for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
- struct ceph_mds_session *session = mdsc->sessions[mds];
-
- if (!session)
- continue;
- get_session(session);
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_flushing)) {
- struct ceph_inode_info *ci =
- list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item);
- struct inode *inode = &ci->vfs_inode;
-
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_cap_flush_seq <= want_flush_seq) {
- dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n", inode,
- ci->i_cap_flush_seq, want_flush_seq,
- session->s_mds);
- ret = 0;
- }
- spin_unlock(&ci->i_ceph_lock);
- }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
-
- if (!ret)
- return ret;
- mutex_lock(&mdsc->mutex);
- }
-
- mutex_unlock(&mdsc->mutex);
- dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
- return ret;
-}
-
-/*
- * called under s_mutex
- */
-void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_msg *msg;
-
- dout("send_cap_releases mds%d\n", session->s_mds);
- spin_lock(&session->s_cap_lock);
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- spin_unlock(&session->s_cap_lock);
- msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
- ceph_con_send(&session->s_con, msg);
- spin_lock(&session->s_cap_lock);
- }
- spin_unlock(&session->s_cap_lock);
-}
-
-static void discard_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- unsigned num;
-
- dout("discard_cap_releases mds%d\n", session->s_mds);
- spin_lock(&session->s_cap_lock);
-
- /* zero out the in-progress message */
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
- head->num = cpu_to_le32(0);
- session->s_num_cap_releases += num;
-
- /* requeue completed messages */
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
-
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
- num);
- session->s_num_cap_releases += num;
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- list_add(&msg->list_head, &session->s_cap_releases);
- }
-
- spin_unlock(&session->s_cap_lock);
-}
-
-/*
- * requests
- */
-
-/*
- * Create an mds request.
- */
-struct ceph_mds_request *
-ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
-{
- struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
-
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- mutex_init(&req->r_fill_mutex);
- req->r_mdsc = mdsc;
- req->r_started = jiffies;
- req->r_resend_mds = -1;
- INIT_LIST_HEAD(&req->r_unsafe_dir_item);
- req->r_fmode = -1;
- kref_init(&req->r_kref);
- INIT_LIST_HEAD(&req->r_wait);
- init_completion(&req->r_completion);
- init_completion(&req->r_safe_completion);
- INIT_LIST_HEAD(&req->r_unsafe_item);
-
- req->r_op = op;
- req->r_direct_mode = mode;
- return req;
-}
-
-/*
- * return oldest (lowest) request, tid in request tree, 0 if none.
- *
- * called under mdsc->mutex.
- */
-static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
-{
- if (RB_EMPTY_ROOT(&mdsc->request_tree))
- return NULL;
- return rb_entry(rb_first(&mdsc->request_tree),
- struct ceph_mds_request, r_node);
-}
-
-static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
-{
- struct ceph_mds_request *req = __get_oldest_req(mdsc);
-
- if (req)
- return req->r_tid;
- return 0;
-}
-
-/*
- * Build a dentry's path. Allocate on heap; caller must kfree. Based
- * on build_path_from_dentry in fs/cifs/dir.c.
- *
- * If @stop_on_nosnap, generate path relative to the first non-snapped
- * inode.
- *
- * Encode hidden .snap dirs as a double /, i.e.
- * foo/.snap/bar -> foo//bar
- */
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
- int stop_on_nosnap)
-{
- struct dentry *temp;
- char *path;
- int len, pos;
- unsigned seq;
-
- if (dentry == NULL)
- return ERR_PTR(-EINVAL);
-
-retry:
- len = 0;
- seq = read_seqbegin(&rename_lock);
- rcu_read_lock();
- for (temp = dentry; !IS_ROOT(temp);) {
- struct inode *inode = temp->d_inode;
- if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
- len++; /* slash only */
- else if (stop_on_nosnap && inode &&
- ceph_snap(inode) == CEPH_NOSNAP)
- break;
- else
- len += 1 + temp->d_name.len;
- temp = temp->d_parent;
- if (temp == NULL) {
- rcu_read_unlock();
- pr_err("build_path corrupt dentry %p\n", dentry);
- return ERR_PTR(-EINVAL);
- }
- }
- rcu_read_unlock();
- if (len)
- len--; /* no leading '/' */
-
- path = kmalloc(len+1, GFP_NOFS);
- if (path == NULL)
- return ERR_PTR(-ENOMEM);
- pos = len;
- path[pos] = 0; /* trailing null */
- rcu_read_lock();
- for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
- struct inode *inode;
-
- spin_lock(&temp->d_lock);
- inode = temp->d_inode;
- if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("build_path path+%d: %p SNAPDIR\n",
- pos, temp);
- } else if (stop_on_nosnap && inode &&
- ceph_snap(inode) == CEPH_NOSNAP) {
- spin_unlock(&temp->d_lock);
- break;
- } else {
- pos -= temp->d_name.len;
- if (pos < 0) {
- spin_unlock(&temp->d_lock);
- break;
- }
- strncpy(path + pos, temp->d_name.name,
- temp->d_name.len);
- }
- spin_unlock(&temp->d_lock);
- if (pos)
- path[--pos] = '/';
- temp = temp->d_parent;
- if (temp == NULL) {
- rcu_read_unlock();
- pr_err("build_path corrupt dentry\n");
- kfree(path);
- return ERR_PTR(-EINVAL);
- }
- }
- rcu_read_unlock();
- if (pos != 0 || read_seqretry(&rename_lock, seq)) {
- pr_err("build_path did not end path lookup where "
- "expected, namelen is %d, pos is %d\n", len, pos);
- /* presumably this is only possible if racing with a
- rename of one of the parent directories (we can not
- lock the dentries above us to prevent this, but
- retrying should be harmless) */
- kfree(path);
- goto retry;
- }
-
- *base = ceph_ino(temp->d_inode);
- *plen = len;
- dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, dentry->d_count, *base, len, path);
- return path;
-}
-
-static int build_dentry_path(struct dentry *dentry,
- const char **ppath, int *ppathlen, u64 *pino,
- int *pfreepath)
-{
- char *path;
-
- if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(dentry->d_parent->d_inode);
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
- return 0;
- }
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
- if (IS_ERR(path))
- return PTR_ERR(path);
- *ppath = path;
- *pfreepath = 1;
- return 0;
-}
-
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- int *pfreepath)
-{
- struct dentry *dentry;
- char *path;
-
- if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
- return 0;
- }
- dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
- dput(dentry);
- if (IS_ERR(path))
- return PTR_ERR(path);
- *ppath = path;
- *pfreepath = 1;
- return 0;
-}
-
-/*
- * request arguments may be specified via an inode *, a dentry *, or
- * an explicit ino+path.
- */
-static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
- const char *rpath, u64 rino,
- const char **ppath, int *pathlen,
- u64 *ino, int *freepath)
-{
- int r = 0;
-
- if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
- dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
- ceph_snap(rinode));
- } else if (rdentry) {
- r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
- dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
- *ppath);
- } else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = strlen(rpath);
- dout(" path %.*s\n", *pathlen, rpath);
- }
-
- return r;
-}
-
-/*
- * called under mdsc->mutex
- */
-static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req,
- int mds)
-{
- struct ceph_msg *msg;
- struct ceph_mds_request_head *head;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- int freepath1 = 0, freepath2 = 0;
- int len;
- u16 releases;
- void *p, *end;
- int ret;
-
- ret = set_request_path_attr(req->r_inode, req->r_dentry,
- req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1);
- if (ret < 0) {
- msg = ERR_PTR(ret);
- goto out;
- }
-
- ret = set_request_path_attr(NULL, req->r_old_dentry,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2);
- if (ret < 0) {
- msg = ERR_PTR(ret);
- goto out_free1;
- }
-
- len = sizeof(*head) +
- pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
-
- /* calculate (max) length for cap releases */
- len += sizeof(struct ceph_mds_request_release) *
- (!!req->r_inode_drop + !!req->r_dentry_drop +
- !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
- if (req->r_dentry_drop)
- len += req->r_dentry->d_name.len;
- if (req->r_old_dentry_drop)
- len += req->r_old_dentry->d_name.len;
-
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
- if (!msg) {
- msg = ERR_PTR(-ENOMEM);
- goto out_free2;
- }
-
- msg->hdr.tid = cpu_to_le64(req->r_tid);
-
- head = msg->front.iov_base;
- p = msg->front.iov_base + sizeof(*head);
- end = msg->front.iov_base + msg->front.iov_len;
-
- head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
- head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(req->r_uid);
- head->caller_gid = cpu_to_le32(req->r_gid);
- head->args = req->r_args;
-
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
-
- /* make note of release offset, in case we need to replay */
- req->r_request_release_offset = p - msg->front.iov_base;
-
- /* cap releases */
- releases = 0;
- if (req->r_inode_drop)
- releases += ceph_encode_inode_release(&p,
- req->r_inode ? req->r_inode : req->r_dentry->d_inode,
- mds, req->r_inode_drop, req->r_inode_unless, 0);
- if (req->r_dentry_drop)
- releases += ceph_encode_dentry_release(&p, req->r_dentry,
- mds, req->r_dentry_drop, req->r_dentry_unless);
- if (req->r_old_dentry_drop)
- releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
- mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
- if (req->r_old_inode_drop)
- releases += ceph_encode_inode_release(&p,
- req->r_old_dentry->d_inode,
- mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
- head->num_releases = cpu_to_le16(releases);
-
- BUG_ON(p > end);
- msg->front.iov_len = p - msg->front.iov_base;
- msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-
- msg->pages = req->r_pages;
- msg->nr_pages = req->r_num_pages;
- msg->hdr.data_len = cpu_to_le32(req->r_data_len);
- msg->hdr.data_off = cpu_to_le16(0);
-
-out_free2:
- if (freepath2)
- kfree((char *)path2);
-out_free1:
- if (freepath1)
- kfree((char *)path1);
-out:
- return msg;
-}
-
-/*
- * called under mdsc->mutex if error, under no mutex if
- * success.
- */
-static void complete_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
-{
- if (req->r_callback)
- req->r_callback(mdsc, req);
- else
- complete_all(&req->r_completion);
-}
-
-/*
- * called under mdsc->mutex
- */
-static int __prepare_send_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req,
- int mds)
-{
- struct ceph_mds_request_head *rhead;
- struct ceph_msg *msg;
- int flags = 0;
-
- req->r_attempts++;
- if (req->r_inode) {
- struct ceph_cap *cap =
- ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
-
- if (cap)
- req->r_sent_on_mseq = cap->mseq;
- else
- req->r_sent_on_mseq = -1;
- }
- dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
- req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
-
- if (req->r_got_unsafe) {
- /*
- * Replay. Do not regenerate message (and rebuild
- * paths, etc.); just use the original message.
- * Rebuilding paths will break for renames because
- * d_move mangles the src name.
- */
- msg = req->r_request;
- rhead = msg->front.iov_base;
-
- flags = le32_to_cpu(rhead->flags);
- flags |= CEPH_MDS_FLAG_REPLAY;
- rhead->flags = cpu_to_le32(flags);
-
- if (req->r_target_inode)
- rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
-
- rhead->num_retry = req->r_attempts - 1;
-
- /* remove cap/dentry releases from message */
- rhead->num_releases = 0;
- msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
- msg->front.iov_len = req->r_request_release_offset;
- return 0;
- }
-
- if (req->r_request) {
- ceph_msg_put(req->r_request);
- req->r_request = NULL;
- }
- msg = create_request_message(mdsc, req, mds);
- if (IS_ERR(msg)) {
- req->r_err = PTR_ERR(msg);
- complete_request(mdsc, req);
- return PTR_ERR(msg);
- }
- req->r_request = msg;
-
- rhead = msg->front.iov_base;
- rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
- if (req->r_got_unsafe)
- flags |= CEPH_MDS_FLAG_REPLAY;
- if (req->r_locked_dir)
- flags |= CEPH_MDS_FLAG_WANT_DENTRY;
- rhead->flags = cpu_to_le32(flags);
- rhead->num_fwd = req->r_num_fwd;
- rhead->num_retry = req->r_attempts - 1;
- rhead->ino = 0;
-
- dout(" r_locked_dir = %p\n", req->r_locked_dir);
- return 0;
-}
-
-/*
- * send request, or put it on the appropriate wait list.
- */
-static int __do_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
-{
- struct ceph_mds_session *session = NULL;
- int mds = -1;
- int err = -EAGAIN;
-
- if (req->r_err || req->r_got_result)
- goto out;
-
- if (req->r_timeout &&
- time_after_eq(jiffies, req->r_started + req->r_timeout)) {
- dout("do_request timed out\n");
- err = -EIO;
- goto finish;
- }
-
- put_request_session(req);
-
- mds = __choose_mds(mdsc, req);
- if (mds < 0 ||
- ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
- dout("do_request no mds or not active, waiting for map\n");
- list_add(&req->r_wait, &mdsc->waiting_for_map);
- goto out;
- }
-
- /* get, open session */
- session = __ceph_lookup_mds_session(mdsc, mds);
- if (!session) {
- session = register_session(mdsc, mds);
- if (IS_ERR(session)) {
- err = PTR_ERR(session);
- goto finish;
- }
- }
- req->r_session = get_session(session);
-
- dout("do_request mds%d session %p state %s\n", mds, session,
- session_state_name(session->s_state));
- if (session->s_state != CEPH_MDS_SESSION_OPEN &&
- session->s_state != CEPH_MDS_SESSION_HUNG) {
- if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
- list_add(&req->r_wait, &session->s_waiting);
- goto out_session;
- }
-
- /* send request */
- req->r_resend_mds = -1; /* forget any previous mds hint */
-
- if (req->r_request_started == 0) /* note request start time */
- req->r_request_started = jiffies;
-
- err = __prepare_send_request(mdsc, req, mds);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
-
-out_session:
- ceph_put_mds_session(session);
-out:
- return err;
-
-finish:
- req->r_err = err;
- complete_request(mdsc, req);
- goto out;
-}
-
-/*
- * called under mdsc->mutex
- */
-static void __wake_requests(struct ceph_mds_client *mdsc,
- struct list_head *head)
-{
- struct ceph_mds_request *req, *nreq;
-
- list_for_each_entry_safe(req, nreq, head, r_wait) {
- list_del_init(&req->r_wait);
- __do_request(mdsc, req);
- }
-}
-
-/*
- * Wake up threads with requests pending for @mds, so that they can
- * resubmit their requests to a possibly different mds.
- */
-static void kick_requests(struct ceph_mds_client *mdsc, int mds)
-{
- struct ceph_mds_request *req;
- struct rb_node *p;
-
- dout("kick_requests mds%d\n", mds);
- for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
- req = rb_entry(p, struct ceph_mds_request, r_node);
- if (req->r_got_unsafe)
- continue;
- if (req->r_session &&
- req->r_session->s_mds == mds) {
- dout(" kicking tid %llu\n", req->r_tid);
- __do_request(mdsc, req);
- }
- }
-}
-
-void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
-{
- dout("submit_request on %p\n", req);
- mutex_lock(&mdsc->mutex);
- __register_request(mdsc, req, NULL);
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
-}
-
-/*
- * Synchrously perform an mds request. Take care of all of the
- * session setup, forwarding, retry details.
- */
-int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
- struct inode *dir,
- struct ceph_mds_request *req)
-{
- int err;
-
- dout("do_request on %p\n", req);
-
- /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
- if (req->r_inode)
- ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- if (req->r_locked_dir)
- ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_old_dentry)
- ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
- CEPH_CAP_PIN);
-
- /* issue */
- mutex_lock(&mdsc->mutex);
- __register_request(mdsc, req, dir);
- __do_request(mdsc, req);
-
- if (req->r_err) {
- err = req->r_err;
- __unregister_request(mdsc, req);
- dout("do_request early error %d\n", err);
- goto out;
- }
-
- /* wait */
- mutex_unlock(&mdsc->mutex);
- dout("do_request waiting\n");
- if (req->r_timeout) {
- err = (long)wait_for_completion_killable_timeout(
- &req->r_completion, req->r_timeout);
- if (err == 0)
- err = -EIO;
- } else {
- err = wait_for_completion_killable(&req->r_completion);
- }
- dout("do_request waited, got %d\n", err);
- mutex_lock(&mdsc->mutex);
-
- /* only abort if we didn't race with a real reply */
- if (req->r_got_result) {
- err = le32_to_cpu(req->r_reply_info.head->result);
- } else if (err < 0) {
- dout("aborted request %lld with %d\n", req->r_tid, err);
-
- /*
- * ensure we aren't running concurrently with
- * ceph_fill_trace or ceph_readdir_prepopulate, which
- * rely on locks (dir mutex) held by our caller.
- */
- mutex_lock(&req->r_fill_mutex);
- req->r_err = err;
- req->r_aborted = true;
- mutex_unlock(&req->r_fill_mutex);
-
- if (req->r_locked_dir &&
- (req->r_op & CEPH_MDS_OP_WRITE))
- ceph_invalidate_dir_request(req);
- } else {
- err = req->r_err;
- }
-
-out:
- mutex_unlock(&mdsc->mutex);
- dout("do_request %p done, result %d\n", req, err);
- return err;
-}
-
-/*
- * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
- * namespace request.
- */
-void ceph_invalidate_dir_request(struct ceph_mds_request *req)
-{
- struct inode *inode = req->r_locked_dir;
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
- spin_lock(&ci->i_ceph_lock);
- ceph_dir_clear_complete(inode);
- ci->i_release_count++;
- spin_unlock(&ci->i_ceph_lock);
-
- if (req->r_dentry)
- ceph_invalidate_dentry_lease(req->r_dentry);
- if (req->r_old_dentry)
- ceph_invalidate_dentry_lease(req->r_old_dentry);
-}
-
/*
 * Handle mds reply.
 *
 * We take the session mutex and parse and process the reply immediately.
 * This preserves the logical ordering of replies, capabilities, etc., sent
 * by the MDS as they are applied to our local cache.
 *
 * Lock ordering here is mdsc->mutex (lookup/validate, then parse), then
 * session->s_mutex (apply to cache), with snap_rwsem held for reading
 * while the trace is filled in.
 */
static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
{
	struct ceph_mds_client *mdsc = session->s_mdsc;
	struct ceph_mds_request *req;
	struct ceph_mds_reply_head *head = msg->front.iov_base;
	struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
	u64 tid;
	int err, result;
	int mds = session->s_mds;

	/* must at least contain the fixed-size reply head */
	if (msg->front.iov_len < sizeof(*head)) {
		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
		ceph_msg_dump(msg);
		return;
	}

	/* get request, session */
	tid = le64_to_cpu(msg->hdr.tid);
	mutex_lock(&mdsc->mutex);
	req = __lookup_request(mdsc, tid);
	if (!req) {
		dout("handle_reply on unknown tid %llu\n", tid);
		mutex_unlock(&mdsc->mutex);
		return;
	}
	dout("handle_reply %p\n", req);

	/* correct session? */
	if (req->r_session != session) {
		pr_err("mdsc_handle_reply got %llu on session mds%d"
		       " not mds%d\n", tid, session->s_mds,
		       req->r_session ? req->r_session->s_mds : -1);
		mutex_unlock(&mdsc->mutex);
		goto out;
	}

	/* dup? (each request gets at most one unsafe and one safe reply) */
	if ((req->r_got_unsafe && !head->safe) ||
	    (req->r_got_safe && head->safe)) {
		pr_warning("got a dup %s reply on %llu from mds%d\n",
			   head->safe ? "safe" : "unsafe", tid, mds);
		mutex_unlock(&mdsc->mutex);
		goto out;
	}
	/* an unsafe reply must never follow the safe one */
	if (req->r_got_safe && !head->safe) {
		pr_warning("got unsafe after safe on %llu from mds%d\n",
			   tid, mds);
		mutex_unlock(&mdsc->mutex);
		goto out;
	}

	result = le32_to_cpu(head->result);

	/*
	 * Handle an ESTALE
	 * if we're not talking to the authority, send to them
	 * if the authority has changed while we weren't looking,
	 * send to new authority
	 * Otherwise we just have to return an ESTALE
	 */
	if (result == -ESTALE) {
		dout("got ESTALE on request %llu", req->r_tid);
		if (!req->r_inode) {
			/* do nothing; not an authority problem */
		} else if (req->r_direct_mode != USE_AUTH_MDS) {
			/* retry against the auth MDS for this inode */
			dout("not using auth, setting for that now");
			req->r_direct_mode = USE_AUTH_MDS;
			__do_request(mdsc, req);
			mutex_unlock(&mdsc->mutex);
			goto out;
		} else {
			struct ceph_inode_info *ci = ceph_inode(req->r_inode);
			struct ceph_cap *cap = NULL;

			if (req->r_session)
				cap = ceph_get_cap_for_mds(ci,
						   req->r_session->s_mds);

			dout("already using auth");
			/* if the auth cap moved since we sent, resend there */
			if ((!cap || cap != ci->i_auth_cap) ||
			    (cap->mseq != req->r_sent_on_mseq)) {
				dout("but cap changed, so resending");
				__do_request(mdsc, req);
				mutex_unlock(&mdsc->mutex);
				goto out;
			}
		}
		dout("have to return ESTALE on request %llu", req->r_tid);
	}


	if (head->safe) {
		/* safe reply: the op is durable on the MDS */
		req->r_got_safe = true;
		__unregister_request(mdsc, req);
		complete_all(&req->r_safe_completion);

		if (req->r_got_unsafe) {
			/*
			 * We already handled the unsafe response, now do the
			 * cleanup.  No need to examine the response; the MDS
			 * doesn't include any result info in the safe
			 * response.  And even if it did, there is nothing
			 * useful we could do with a revised return value.
			 */
			dout("got safe reply %llu, mds%d\n", tid, mds);
			list_del_init(&req->r_unsafe_item);

			/* last unsafe request during umount? */
			if (mdsc->stopping && !__get_oldest_req(mdsc))
				complete_all(&mdsc->safe_umount_waiters);
			mutex_unlock(&mdsc->mutex);
			goto out;
		}
	} else {
		/* first (unsafe) reply: track it until the safe one lands */
		req->r_got_unsafe = true;
		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
	}

	dout("handle_reply tid %lld result %d\n", tid, result);
	rinfo = &req->r_reply_info;
	err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
	mutex_unlock(&mdsc->mutex);

	/* apply the reply to our cache under the session lock */
	mutex_lock(&session->s_mutex);
	if (err < 0) {
		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
		ceph_msg_dump(msg);
		goto out_err;
	}

	/* snap trace */
	if (rinfo->snapblob_len) {
		/* updating the snap hierarchy needs the write lock ... */
		down_write(&mdsc->snap_rwsem);
		ceph_update_snap_trace(mdsc, rinfo->snapblob,
			       rinfo->snapblob + rinfo->snapblob_len,
			       le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
		/* ... but filling the trace below only the read lock */
		downgrade_write(&mdsc->snap_rwsem);
	} else {
		down_read(&mdsc->snap_rwsem);
	}

	/* insert trace into our cache */
	mutex_lock(&req->r_fill_mutex);
	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
	if (err == 0) {
		if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
		    rinfo->dir_nr)
			ceph_readdir_prepopulate(req, req->r_session);
		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
	}
	mutex_unlock(&req->r_fill_mutex);

	up_read(&mdsc->snap_rwsem);
out_err:
	/* record result (or error) unless the waiter already aborted */
	mutex_lock(&mdsc->mutex);
	if (!req->r_aborted) {
		if (err) {
			req->r_err = err;
		} else {
			req->r_reply = msg;
			ceph_msg_get(msg);
			req->r_got_result = true;
		}
	} else {
		dout("reply arrived after request %lld was aborted\n", tid);
	}
	mutex_unlock(&mdsc->mutex);

	ceph_add_cap_releases(mdsc, req->r_session);
	mutex_unlock(&session->s_mutex);

	/* kick calling process */
	complete_request(mdsc, req);
out:
	/* drop the ref taken by __lookup_request() */
	ceph_mdsc_put_request(req);
	return;
}
-
-
-
/*
 * handle mds notification that our request has been forwarded.
 *
 * The message carries the destination mds rank and a forward sequence
 * number; we resend the request ourselves unless we have already seen
 * a forward with an equal or greater seq (or the request was aborted).
 */
static void handle_forward(struct ceph_mds_client *mdsc,
			   struct ceph_mds_session *session,
			   struct ceph_msg *msg)
{
	struct ceph_mds_request *req;
	u64 tid = le64_to_cpu(msg->hdr.tid);
	u32 next_mds;
	u32 fwd_seq;
	int err = -EINVAL;
	void *p = msg->front.iov_base;
	void *end = p + msg->front.iov_len;

	/* decode: two u32s (target mds, forward seq) */
	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
	next_mds = ceph_decode_32(&p);
	fwd_seq = ceph_decode_32(&p);

	mutex_lock(&mdsc->mutex);
	req = __lookup_request(mdsc, tid);
	if (!req) {
		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
		goto out;  /* dup reply? */
	}

	if (req->r_aborted) {
		/* waiter gave up; just drop the request from the tree */
		dout("forward tid %llu aborted, unregistering\n", tid);
		__unregister_request(mdsc, req);
	} else if (fwd_seq <= req->r_num_fwd) {
		/* stale forward notification; ignore */
		dout("forward tid %llu to mds%d - old seq %d <= %d\n",
		     tid, next_mds, req->r_num_fwd, fwd_seq);
	} else {
		/* resend. forward race not possible; mds would drop */
		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
		BUG_ON(req->r_err);
		BUG_ON(req->r_got_result);
		req->r_num_fwd = fwd_seq;
		req->r_resend_mds = next_mds;
		put_request_session(req);
		__do_request(mdsc, req);
	}
	/* drop the ref taken by __lookup_request() */
	ceph_mdsc_put_request(req);
out:
	mutex_unlock(&mdsc->mutex);
	return;

bad:
	pr_err("mdsc_handle_forward decode error err=%d\n", err);
}
-
/*
 * handle a mds session control message
 *
 * Drives the per-session state machine (open/renewcaps/close/stale/
 * recall).  Registration changes happen under mdsc->mutex; the state
 * transition itself is applied under session->s_mutex.
 */
static void handle_session(struct ceph_mds_session *session,
			   struct ceph_msg *msg)
{
	struct ceph_mds_client *mdsc = session->s_mdsc;
	u32 op;
	u64 seq;
	int mds = session->s_mds;
	struct ceph_mds_session_head *h = msg->front.iov_base;
	int wake = 0;

	/* decode */
	if (msg->front.iov_len != sizeof(*h))
		goto bad;
	op = le32_to_cpu(h->op);
	seq = le64_to_cpu(h->seq);

	mutex_lock(&mdsc->mutex);
	if (op == CEPH_SESSION_CLOSE)
		__unregister_session(mdsc, session);
	/* FIXME: this ttl calculation is generous */
	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
	mutex_unlock(&mdsc->mutex);

	mutex_lock(&session->s_mutex);

	dout("handle_session mds%d %s %p state %s seq %llu\n",
	     mds, ceph_session_op_name(op), session,
	     session_state_name(session->s_state), seq);

	/* any traffic from a hung mds means it is alive again */
	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
		session->s_state = CEPH_MDS_SESSION_OPEN;
		pr_info("mds%d came back\n", session->s_mds);
	}

	switch (op) {
	case CEPH_SESSION_OPEN:
		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
			pr_info("mds%d reconnect success\n", session->s_mds);
		session->s_state = CEPH_MDS_SESSION_OPEN;
		renewed_caps(mdsc, session, 0);
		wake = 1;
		/* if we are mid-umount, close it right back down */
		if (mdsc->stopping)
			__close_session(mdsc, session);
		break;

	case CEPH_SESSION_RENEWCAPS:
		/* only honor the ack matching our outstanding renew */
		if (session->s_renew_seq == seq)
			renewed_caps(mdsc, session, 1);
		break;

	case CEPH_SESSION_CLOSE:
		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
			pr_info("mds%d reconnect denied\n", session->s_mds);
		remove_session_caps(session);
		wake = 1; /* for good measure */
		wake_up_all(&mdsc->session_close_wq);
		kick_requests(mdsc, mds);
		break;

	case CEPH_SESSION_STALE:
		pr_info("mds%d caps went stale, renewing\n",
			session->s_mds);
		/* bump the cap generation so stale caps are ignored */
		spin_lock(&session->s_gen_ttl_lock);
		session->s_cap_gen++;
		session->s_cap_ttl = jiffies - 1;
		spin_unlock(&session->s_gen_ttl_lock);
		send_renew_caps(mdsc, session);
		break;

	case CEPH_SESSION_RECALL_STATE:
		/* mds asks us to shrink our cap cache */
		trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
		break;

	default:
		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
		WARN_ON(1);
	}

	mutex_unlock(&session->s_mutex);
	if (wake) {
		/* requeue requests that were waiting on this session */
		mutex_lock(&mdsc->mutex);
		__wake_requests(mdsc, &session->s_waiting);
		mutex_unlock(&mdsc->mutex);
	}
	return;

bad:
	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
	       (int)msg->front.iov_len);
	ceph_msg_dump(msg);
	return;
}
-
-
-/*
- * called under session->mutex.
- */
-static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_mds_request *req, *nreq;
- int err;
-
- dout("replay_unsafe_requests mds%d\n", session->s_mds);
-
- mutex_lock(&mdsc->mutex);
- list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
- err = __prepare_send_request(mdsc, req, session->s_mds);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
- }
- mutex_unlock(&mdsc->mutex);
-}
-
/*
 * Encode information about a cap for a reconnect with the MDS.
 *
 * Appends one record to recon_state->pagelist: the inode number, the
 * inode's path (for the MDS to reestablish its view), and a cap record
 * in either the v2 (with file-lock data) or v1 (legacy) wire format,
 * chosen by recon_state->flock.
 *
 * Returns 0 or a negative pagelist/path error.
 */
static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
			  void *arg)
{
	union {
		struct ceph_mds_cap_reconnect v2;
		struct ceph_mds_cap_reconnect_v1 v1;
	} rec;
	size_t reclen;
	struct ceph_inode_info *ci;
	struct ceph_reconnect_state *recon_state = arg;
	struct ceph_pagelist *pagelist = recon_state->pagelist;
	char *path;
	int pathlen, err;
	u64 pathbase;
	struct dentry *dentry;

	ci = cap->ci;

	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
	     inode, ceph_vinop(inode), cap, cap->cap_id,
	     ceph_cap_string(cap->issued));
	err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
	if (err)
		return err;

	/* encode a path for the inode if we can find an alias for it */
	dentry = d_find_alias(inode);
	if (dentry) {
		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out_dput;
		}
	} else {
		path = NULL;
		pathlen = 0;
	}
	err = ceph_pagelist_encode_string(pagelist, path, pathlen);
	if (err)
		goto out_free;

	spin_lock(&ci->i_ceph_lock);
	cap->seq = 0;        /* reset cap seq */
	cap->issue_seq = 0;  /* and issue_seq */

	/* snapshot the cap state into the wire record under i_ceph_lock */
	if (recon_state->flock) {
		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
		rec.v2.issued = cpu_to_le32(cap->issued);
		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
		rec.v2.pathbase = cpu_to_le64(pathbase);
		rec.v2.flock_len = 0;
		reclen = sizeof(rec.v2);
	} else {
		rec.v1.cap_id = cpu_to_le64(cap->cap_id);
		rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
		rec.v1.issued = cpu_to_le32(cap->issued);
		rec.v1.size = cpu_to_le64(inode->i_size);
		ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
		ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
		rec.v1.pathbase = cpu_to_le64(pathbase);
		reclen = sizeof(rec.v1);
	}
	spin_unlock(&ci->i_ceph_lock);

	if (recon_state->flock) {
		int num_fcntl_locks, num_flock_locks;
		struct ceph_pagelist_cursor trunc_point;

		/*
		 * The lock count can change between counting and encoding
		 * (we must drop lock_flocks() around the pagelist calls,
		 * which may sleep), so retry from the saved cursor until
		 * the reserved space is large enough.
		 */
		ceph_pagelist_set_cursor(pagelist, &trunc_point);
		do {
			lock_flocks();
			ceph_count_locks(inode, &num_fcntl_locks,
					 &num_flock_locks);
			rec.v2.flock_len = (2*sizeof(u32) +
					    (num_fcntl_locks+num_flock_locks) *
					    sizeof(struct ceph_filelock));
			unlock_flocks();

			/* pre-alloc pagelist */
			ceph_pagelist_truncate(pagelist, &trunc_point);
			err = ceph_pagelist_append(pagelist, &rec, reclen);
			if (!err)
				err = ceph_pagelist_reserve(pagelist,
							    rec.v2.flock_len);

			/* encode locks */
			if (!err) {
				lock_flocks();
				err = ceph_encode_locks(inode,
							pagelist,
							num_fcntl_locks,
							num_flock_locks);
				unlock_flocks();
			}
		} while (err == -ENOSPC);
	} else {
		err = ceph_pagelist_append(pagelist, &rec, reclen);
	}

out_free:
	kfree(path);
out_dput:
	dput(dentry);
	return err;
}
-
-
/*
 * If an MDS fails and recovers, clients need to reconnect in order to
 * reestablish shared state.  This includes all caps issued through
 * this session _and_ the snap_realm hierarchy.  Because it's not
 * clear which snap realms the mds cares about, we send everything we
 * know about.. that ensures we'll then get any new info the
 * recovering MDS might have.
 *
 * This is a relatively heavyweight operation, but it's rare.
 *
 * called with mdsc->mutex held.
 */
static void send_mds_reconnect(struct ceph_mds_client *mdsc,
			       struct ceph_mds_session *session)
{
	struct ceph_msg *reply;
	struct rb_node *p;
	int mds = session->s_mds;
	int err = -ENOMEM;
	struct ceph_pagelist *pagelist;
	struct ceph_reconnect_state recon_state;

	pr_info("mds%d reconnect start\n", mds);

	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
	if (!pagelist)
		goto fail_nopagelist;
	ceph_pagelist_init(pagelist);

	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
	if (!reply)
		goto fail_nomsg;

	mutex_lock(&session->s_mutex);
	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
	session->s_seq = 0;

	ceph_con_open(&session->s_con,
		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));

	/* replay unsafe requests */
	replay_unsafe_requests(mdsc, session);

	down_read(&mdsc->snap_rwsem);

	dout("session %p state %s\n", session,
	     session_state_name(session->s_state));

	/* drop old cap expires; we're about to reestablish that state */
	discard_cap_releases(mdsc, session);

	/* traverse this session's caps */
	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
	if (err)
		goto fail;

	recon_state.pagelist = pagelist;
	/* use the v2 (flock-capable) record format if the peer supports it */
	recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
	if (err < 0)
		goto fail;

	/*
	 * snaprealms.  we provide mds with the ino, seq (version), and
	 * parent for all of our realms.  If the mds has any newer info,
	 * it will tell us.
	 */
	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
		struct ceph_snap_realm *realm =
			rb_entry(p, struct ceph_snap_realm, node);
		struct ceph_mds_snaprealm_reconnect sr_rec;

		dout(" adding snap realm %llx seq %lld parent %llx\n",
		     realm->ino, realm->seq, realm->parent_ino);
		sr_rec.ino = cpu_to_le64(realm->ino);
		sr_rec.seq = cpu_to_le64(realm->seq);
		sr_rec.parent = cpu_to_le64(realm->parent_ino);
		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
		if (err)
			goto fail;
	}

	/* attach the pagelist as message payload and send */
	reply->pagelist = pagelist;
	if (recon_state.flock)
		reply->hdr.version = cpu_to_le16(2);
	reply->hdr.data_len = cpu_to_le32(pagelist->length);
	reply->nr_pages = calc_pages_for(0, pagelist->length);
	ceph_con_send(&session->s_con, reply);

	mutex_unlock(&session->s_mutex);

	/* requeue requests that were waiting on this session */
	mutex_lock(&mdsc->mutex);
	__wake_requests(mdsc, &session->s_waiting);
	mutex_unlock(&mdsc->mutex);

	up_read(&mdsc->snap_rwsem);
	return;

fail:
	ceph_msg_put(reply);
	up_read(&mdsc->snap_rwsem);
	mutex_unlock(&session->s_mutex);
fail_nomsg:
	ceph_pagelist_release(pagelist);
	kfree(pagelist);
fail_nopagelist:
	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
	return;
}
-
-
/*
 * compare old and new mdsmaps, kicking requests
 * and closing out old connections as necessary
 *
 * called under mdsc->mutex.  mdsc->mutex may be dropped and retaken
 * around per-session work; sessions[] indices are revalidated by the
 * loop bounds on each pass.
 */
static void check_new_map(struct ceph_mds_client *mdsc,
			  struct ceph_mdsmap *newmap,
			  struct ceph_mdsmap *oldmap)
{
	int i;
	int oldstate, newstate;
	struct ceph_mds_session *s;

	dout("check_new_map new %u old %u\n",
	     newmap->m_epoch, oldmap->m_epoch);

	for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
		if (mdsc->sessions[i] == NULL)
			continue;
		s = mdsc->sessions[i];
		oldstate = ceph_mdsmap_get_state(oldmap, i);
		newstate = ceph_mdsmap_get_state(newmap, i);

		dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
		     i, ceph_mds_state_name(oldstate),
		     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
		     ceph_mds_state_name(newstate),
		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
		     session_state_name(s->s_state));

		/* did the mds's address change? then the old peer is gone */
		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
			   ceph_mdsmap_get_addr(newmap, i),
			   sizeof(struct ceph_entity_addr))) {
			if (s->s_state == CEPH_MDS_SESSION_OPENING) {
				/* the session never opened, just close it
				 * out now */
				__wake_requests(mdsc, &s->s_waiting);
				__unregister_session(mdsc, s);
			} else {
				/* just close it */
				mutex_unlock(&mdsc->mutex);
				mutex_lock(&s->s_mutex);
				mutex_lock(&mdsc->mutex);
				ceph_con_close(&s->s_con);
				mutex_unlock(&s->s_mutex);
				s->s_state = CEPH_MDS_SESSION_RESTARTING;
			}

			/* kick any requests waiting on the recovering mds */
			kick_requests(mdsc, i);
		} else if (oldstate == newstate) {
			continue;  /* nothing new with this mds */
		}

		/*
		 * send reconnect?
		 */
		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
		    newstate >= CEPH_MDS_STATE_RECONNECT) {
			mutex_unlock(&mdsc->mutex);
			send_mds_reconnect(mdsc, s);
			mutex_lock(&mdsc->mutex);
		}

		/*
		 * kick request on any mds that has gone active.
		 */
		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
		    newstate >= CEPH_MDS_STATE_ACTIVE) {
			if (oldstate != CEPH_MDS_STATE_CREATING &&
			    oldstate != CEPH_MDS_STATE_STARTING)
				pr_info("mds%d recovery completed\n", s->s_mds);
			kick_requests(mdsc, i);
			ceph_kick_flushing_caps(mdsc, s);
			wake_up_session_caps(s, 1);
		}
	}

	/* second pass: reach out to export targets of any laggy mds */
	for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
		s = mdsc->sessions[i];
		if (!s)
			continue;
		if (!ceph_mdsmap_is_laggy(newmap, i))
			continue;
		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
		    s->s_state == CEPH_MDS_SESSION_HUNG ||
		    s->s_state == CEPH_MDS_SESSION_CLOSING) {
			dout(" connecting to export targets of laggy mds%d\n",
			     i);
			__open_export_target_sessions(mdsc, s);
		}
	}
}
-
-
-
-/*
- * leases
- */
-
-/*
- * caller must hold session s_mutex, dentry->d_lock
- */
-void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
-{
- struct ceph_dentry_info *di = ceph_dentry(dentry);
-
- ceph_put_mds_session(di->lease_session);
- di->lease_session = NULL;
-}
-
/*
 * Handle a dentry lease message from the MDS: either a REVOKE (ack it
 * and drop our lease) or a RENEW grant (extend the lease timeout).
 * Falls through to the release path to reuse the incoming message as
 * the REVOKE_ACK reply.
 */
static void handle_lease(struct ceph_mds_client *mdsc,
			 struct ceph_mds_session *session,
			 struct ceph_msg *msg)
{
	struct super_block *sb = mdsc->fsc->sb;
	struct inode *inode;
	struct dentry *parent, *dentry;
	struct ceph_dentry_info *di;
	int mds = session->s_mds;
	struct ceph_mds_lease *h = msg->front.iov_base;
	u32 seq;
	struct ceph_vino vino;
	struct qstr dname;
	int release = 0;

	dout("handle_lease from mds%d\n", mds);

	/* decode: fixed header, u32 name length, then the name itself */
	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
		goto bad;
	vino.ino = le64_to_cpu(h->ino);
	vino.snap = CEPH_NOSNAP;
	seq = le32_to_cpu(h->seq);
	dname.name = (void *)h + sizeof(*h) + sizeof(u32);
	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
	if (dname.len != get_unaligned_le32(h+1))
		goto bad;

	mutex_lock(&session->s_mutex);
	session->s_seq++;

	/* lookup inode */
	inode = ceph_find_inode(sb, vino);
	dout("handle_lease %s, ino %llx %p %.*s\n",
	     ceph_lease_op_name(h->action), vino.ino, inode,
	     dname.len, dname.name);
	if (inode == NULL) {
		dout("handle_lease no inode %llx\n", vino.ino);
		goto release;
	}

	/* dentry: find the named child under some alias of the inode */
	parent = d_find_alias(inode);
	if (!parent) {
		dout("no parent dentry on inode %p\n", inode);
		WARN_ON(1);
		goto release;  /* hrm... */
	}
	dname.hash = full_name_hash(dname.name, dname.len);
	dentry = d_lookup(parent, &dname);
	dput(parent);
	if (!dentry)
		goto release;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	switch (h->action) {
	case CEPH_MDS_LEASE_REVOKE:
		if (di->lease_session == session) {
			/* tell the mds the newest seq we actually had */
			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
				h->seq = cpu_to_le32(di->lease_seq);
			__ceph_mdsc_drop_dentry_lease(dentry);
		}
		release = 1;
		break;

	case CEPH_MDS_LEASE_RENEW:
		/* only apply if we still hold the matching, current lease */
		if (di->lease_session == session &&
		    di->lease_gen == session->s_cap_gen &&
		    di->lease_renew_from &&
		    di->lease_renew_after == 0) {
			unsigned long duration =
				le32_to_cpu(h->duration_ms) * HZ / 1000;

			di->lease_seq = seq;
			dentry->d_time = di->lease_renew_from + duration;
			/* ask for another renewal at the half-life */
			di->lease_renew_after = di->lease_renew_from +
				(duration >> 1);
			di->lease_renew_from = 0;
		}
		break;
	}
	spin_unlock(&dentry->d_lock);
	dput(dentry);

	if (!release)
		goto out;

release:
	/* let's just reuse the same message */
	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
	ceph_msg_get(msg);
	ceph_con_send(&session->s_con, msg);

out:
	iput(inode);
	mutex_unlock(&session->s_mutex);
	return;

bad:
	pr_err("corrupt lease message\n");
	ceph_msg_dump(msg);
}
-
-void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
- struct inode *inode,
- struct dentry *dentry, char action,
- u32 seq)
-{
- struct ceph_msg *msg;
- struct ceph_mds_lease *lease;
- int len = sizeof(*lease) + sizeof(u32);
- int dnamelen = 0;
-
- dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
- inode, dentry, ceph_lease_op_name(action), session->s_mds);
- dnamelen = dentry->d_name.len;
- len += dnamelen;
-
- msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
- if (!msg)
- return;
- lease = msg->front.iov_base;
- lease->action = action;
- lease->ino = cpu_to_le64(ceph_vino(inode).ino);
- lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
- lease->seq = cpu_to_le32(seq);
- put_unaligned_le32(dnamelen, lease + 1);
- memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
-
- /*
- * if this is a preemptive lease RELEASE, no need to
- * flush request stream, since the actual request will
- * soon follow.
- */
- msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
-
- ceph_con_send(&session->s_con, msg);
-}
-
/*
 * Preemptively release a lease we expect to invalidate anyway.
 * Pass @inode always, @dentry is optional.
 *
 * (Note: despite the comment above, the BUG_ON below requires a
 * non-NULL @dentry — all current callers pass one.)
 */
void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
			     struct dentry *dentry)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *session;
	u32 seq;

	BUG_ON(inode == NULL);
	BUG_ON(dentry == NULL);

	/* is dentry lease valid? */
	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	if (!di || !di->lease_session ||
	    di->lease_session->s_mds < 0 ||
	    di->lease_gen != di->lease_session->s_cap_gen ||
	    !time_before(jiffies, dentry->d_time)) {
		/* no live lease: nothing to tell the mds */
		dout("lease_release inode %p dentry %p -- "
		     "no lease\n",
		     inode, dentry);
		spin_unlock(&dentry->d_lock);
		return;
	}

	/* we do have a lease on this dentry; note mds and seq */
	session = ceph_get_mds_session(di->lease_session);
	seq = di->lease_seq;
	__ceph_mdsc_drop_dentry_lease(dentry);
	spin_unlock(&dentry->d_lock);

	/* send the RELEASE outside d_lock; we hold a session ref */
	dout("lease_release inode %p dentry %p to mds%d\n",
	     inode, dentry, session->s_mds);
	ceph_mdsc_lease_send_msg(session, inode, dentry,
				 CEPH_MDS_LEASE_RELEASE, seq);
	ceph_put_mds_session(session);
}
-
-/*
- * drop all leases (and dentry refs) in preparation for umount
- */
-static void drop_leases(struct ceph_mds_client *mdsc)
-{
- int i;
-
- dout("drop_leases\n");
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
-}
-
-
-
-/*
- * delayed work -- periodically trim expired leases, renew caps with mds
- */
-static void schedule_delayed(struct ceph_mds_client *mdsc)
-{
- int delay = 5;
- unsigned hz = round_jiffies_relative(HZ * delay);
- schedule_delayed_work(&mdsc->delayed_work, hz);
-}
-
/*
 * Periodic work: flush delayed caps, renew caps with each mds on a
 * quarter of the session timeout, detect hung sessions, and push out
 * pending cap releases.  Rearms itself via schedule_delayed().
 */
static void delayed_work(struct work_struct *work)
{
	int i;
	struct ceph_mds_client *mdsc =
		container_of(work, struct ceph_mds_client, delayed_work.work);
	int renew_interval;
	int renew_caps;

	dout("mdsc delayed_work\n");
	ceph_check_delayed_caps(mdsc);

	mutex_lock(&mdsc->mutex);
	/* renew well before the mds-side session timeout expires */
	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
				   mdsc->last_renew_caps);
	if (renew_caps)
		mdsc->last_renew_caps = jiffies;

	for (i = 0; i < mdsc->max_sessions; i++) {
		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
		if (s == NULL)
			continue;
		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
			dout("resending session close request for mds%d\n",
			     s->s_mds);
			request_close_session(mdsc, s);
			ceph_put_mds_session(s);
			continue;
		}
		/* no traffic before the ttl expired -> consider it hung */
		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
			if (s->s_state == CEPH_MDS_SESSION_OPEN) {
				s->s_state = CEPH_MDS_SESSION_HUNG;
				pr_info("mds%d hung\n", s->s_mds);
			}
		}
		if (s->s_state < CEPH_MDS_SESSION_OPEN) {
			/* this mds is failed or recovering, just wait */
			ceph_put_mds_session(s);
			continue;
		}
		/* drop mdsc->mutex before taking the session lock */
		mutex_unlock(&mdsc->mutex);

		mutex_lock(&s->s_mutex);
		if (renew_caps)
			send_renew_caps(mdsc, s);
		else
			ceph_con_keepalive(&s->s_con);
		ceph_add_cap_releases(mdsc, s);
		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
		    s->s_state == CEPH_MDS_SESSION_HUNG)
			ceph_send_cap_releases(mdsc, s);
		mutex_unlock(&s->s_mutex);
		ceph_put_mds_session(s);

		mutex_lock(&mdsc->mutex);
	}
	mutex_unlock(&mdsc->mutex);

	/* rearm for the next tick */
	schedule_delayed(mdsc);
}
-
-int ceph_mdsc_init(struct ceph_fs_client *fsc)
-
-{
- struct ceph_mds_client *mdsc;
-
- mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
- if (!mdsc)
- return -ENOMEM;
- mdsc->fsc = fsc;
- fsc->mdsc = mdsc;
- mutex_init(&mdsc->mutex);
- mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- if (mdsc->mdsmap == NULL)
- return -ENOMEM;
-
- init_completion(&mdsc->safe_umount_waiters);
- init_waitqueue_head(&mdsc->session_close_wq);
- INIT_LIST_HEAD(&mdsc->waiting_for_map);
- mdsc->sessions = NULL;
- mdsc->max_sessions = 0;
- mdsc->stopping = 0;
- init_rwsem(&mdsc->snap_rwsem);
- mdsc->snap_realms = RB_ROOT;
- INIT_LIST_HEAD(&mdsc->snap_empty);
- spin_lock_init(&mdsc->snap_empty_lock);
- mdsc->last_tid = 0;
- mdsc->request_tree = RB_ROOT;
- INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
- mdsc->last_renew_caps = jiffies;
- INIT_LIST_HEAD(&mdsc->cap_delay_list);
- spin_lock_init(&mdsc->cap_delay_lock);
- INIT_LIST_HEAD(&mdsc->snap_flush_list);
- spin_lock_init(&mdsc->snap_flush_lock);
- mdsc->cap_flush_seq = 0;
- INIT_LIST_HEAD(&mdsc->cap_dirty);
- INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
- mdsc->num_cap_flushing = 0;
- spin_lock_init(&mdsc->cap_dirty_lock);
- init_waitqueue_head(&mdsc->cap_flushing_wq);
- spin_lock_init(&mdsc->dentry_lru_lock);
- INIT_LIST_HEAD(&mdsc->dentry_lru);
-
- ceph_caps_init(mdsc);
- ceph_adjust_min_caps(mdsc, fsc->min_caps);
-
- return 0;
-}
-
/*
 * Wait for safe replies on open mds requests.  If we time out, drop
 * all requests from the tree to avoid dangling dentry refs.
 */
static void wait_requests(struct ceph_mds_client *mdsc)
{
	struct ceph_mds_request *req;
	struct ceph_fs_client *fsc = mdsc->fsc;

	mutex_lock(&mdsc->mutex);
	if (__get_oldest_req(mdsc)) {
		/* drop the lock while we sleep; handlers need it */
		mutex_unlock(&mdsc->mutex);

		dout("wait_requests waiting for requests\n");
		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
				    fsc->client->options->mount_timeout * HZ);

		/* tear down remaining requests */
		mutex_lock(&mdsc->mutex);
		while ((req = __get_oldest_req(mdsc))) {
			dout("wait_requests timed out on tid %llu\n",
			     req->r_tid);
			__unregister_request(mdsc, req);
		}
	}
	mutex_unlock(&mdsc->mutex);
	dout("wait_requests done\n");
}
-
/*
 * called before mount is ro, and before dentries are torn down.
 * (hmm, does this still race with new lookups?)
 *
 * Marks the client as stopping, then drops leases, flushes dirty
 * caps, and waits out (or tears down) in-flight requests.
 */
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
	dout("pre_umount\n");
	mdsc->stopping = 1;

	drop_leases(mdsc);
	ceph_flush_dirty_caps(mdsc);
	wait_requests(mdsc);

	/*
	 * wait for reply handlers to drop their request refs and
	 * their inode/dcache refs
	 */
	ceph_msgr_flush();
}
-
/*
 * wait for all write mds requests to flush.
 *
 * Walks the request tree in tid order up to @want_tid, sleeping on
 * each write op's safe completion.  Both the current and the next
 * request are ref-pinned across the sleep so the tree walk can
 * continue (or restart) safely after mdsc->mutex is retaken.
 */
static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
{
	struct ceph_mds_request *req = NULL, *nextreq;
	struct rb_node *n;

	mutex_lock(&mdsc->mutex);
	dout("wait_unsafe_requests want %lld\n", want_tid);
restart:
	req = __get_oldest_req(mdsc);
	while (req && req->r_tid <= want_tid) {
		/* find next request */
		n = rb_next(&req->r_node);
		if (n)
			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
		else
			nextreq = NULL;
		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
			/* write op */
			ceph_mdsc_get_request(req);
			if (nextreq)
				ceph_mdsc_get_request(nextreq);
			mutex_unlock(&mdsc->mutex);
			dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
			     req->r_tid, want_tid);
			wait_for_completion(&req->r_safe_completion);
			mutex_lock(&mdsc->mutex);
			ceph_mdsc_put_request(req);
			if (!nextreq)
				break;  /* next dne before, so we're done! */
			if (RB_EMPTY_NODE(&nextreq->r_node)) {
				/* next request was removed from tree */
				ceph_mdsc_put_request(nextreq);
				goto restart;
			}
			ceph_mdsc_put_request(nextreq);  /* won't go away */
		}
		req = nextreq;
	}
	mutex_unlock(&mdsc->mutex);
	dout("wait_unsafe_requests done\n");
}
-
-void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
-{
- u64 want_tid, want_flush;
-
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
- return;
-
- dout("sync\n");
- mutex_lock(&mdsc->mutex);
- want_tid = mdsc->last_tid;
- want_flush = mdsc->cap_flush_seq;
- mutex_unlock(&mdsc->mutex);
- dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
-
- ceph_flush_dirty_caps(mdsc);
-
- wait_unsafe_requests(mdsc, want_tid);
- wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
-}
-
-/*
- * true if all sessions are closed, or we force unmount
- */
-static bool done_closing_sessions(struct ceph_mds_client *mdsc)
-{
- int i, n = 0;
-
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
- return true;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++)
- if (mdsc->sessions[i])
- n++;
- mutex_unlock(&mdsc->mutex);
- return n == 0;
-}
-
/*
 * called after sb is ro.
 *
 * Politely asks each mds to close its session, waits (bounded by the
 * mount timeout) for them all to close, then forcibly tears down
 * whatever is left.
 */
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
	struct ceph_mds_session *session;
	int i;
	struct ceph_fs_client *fsc = mdsc->fsc;
	unsigned long timeout = fsc->client->options->mount_timeout * HZ;

	dout("close_sessions\n");

	/* close sessions */
	mutex_lock(&mdsc->mutex);
	for (i = 0; i < mdsc->max_sessions; i++) {
		session = __ceph_lookup_mds_session(mdsc, i);
		if (!session)
			continue;
		/* drop mdsc->mutex before taking the session lock */
		mutex_unlock(&mdsc->mutex);
		mutex_lock(&session->s_mutex);
		__close_session(mdsc, session);
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);
		mutex_lock(&mdsc->mutex);
	}
	mutex_unlock(&mdsc->mutex);

	dout("waiting for sessions to close\n");
	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
			   timeout);

	/* tear down remaining sessions */
	mutex_lock(&mdsc->mutex);
	for (i = 0; i < mdsc->max_sessions; i++) {
		if (mdsc->sessions[i]) {
			session = get_session(mdsc->sessions[i]);
			__unregister_session(mdsc, session);
			mutex_unlock(&mdsc->mutex);
			mutex_lock(&session->s_mutex);
			remove_session_caps(session);
			mutex_unlock(&session->s_mutex);
			ceph_put_mds_session(session);
			mutex_lock(&mdsc->mutex);
		}
	}
	WARN_ON(!list_empty(&mdsc->cap_delay_list));
	mutex_unlock(&mdsc->mutex);

	ceph_cleanup_empty_realms(mdsc);

	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */

	dout("stopped\n");
}
-
-static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
-{
- dout("stop\n");
- cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
- if (mdsc->mdsmap)
- ceph_mdsmap_destroy(mdsc->mdsmap);
- kfree(mdsc->sessions);
- ceph_caps_finalize(mdsc);
-}
-
-void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
-{
- struct ceph_mds_client *mdsc = fsc->mdsc;
-
- dout("mdsc_destroy %p\n", mdsc);
- ceph_mdsc_stop(mdsc);
-
- /* flush out any connection work with references to us */
- ceph_msgr_flush();
-
- fsc->mdsc = NULL;
- kfree(mdsc);
- dout("mdsc_destroy %p done\n", mdsc);
-}
-
-
-/*
- * handle mds map update.
- */
-void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
-{
- u32 epoch;
- u32 maplen;
- void *p = msg->front.iov_base;
- void *end = p + msg->front.iov_len;
- struct ceph_mdsmap *newmap, *oldmap;
- struct ceph_fsid fsid;
- int err = -EINVAL;
-
- ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
- ceph_decode_copy(&p, &fsid, sizeof(fsid));
- if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
- return;
- epoch = ceph_decode_32(&p);
- maplen = ceph_decode_32(&p);
- dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
-
- /* do we need it? */
- ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
- mutex_lock(&mdsc->mutex);
- if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
- dout("handle_map epoch %u <= our %u\n",
- epoch, mdsc->mdsmap->m_epoch);
- mutex_unlock(&mdsc->mutex);
- return;
- }
-
- newmap = ceph_mdsmap_decode(&p, end);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
- goto bad_unlock;
- }
-
- /* swap into place */
- if (mdsc->mdsmap) {
- oldmap = mdsc->mdsmap;
- mdsc->mdsmap = newmap;
- check_new_map(mdsc, newmap, oldmap);
- ceph_mdsmap_destroy(oldmap);
- } else {
- mdsc->mdsmap = newmap; /* first mds map */
- }
- mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
-
- __wake_requests(mdsc, &mdsc->waiting_for_map);
-
- mutex_unlock(&mdsc->mutex);
- schedule_delayed(mdsc);
- return;
-
-bad_unlock:
- mutex_unlock(&mdsc->mutex);
-bad:
- pr_err("error decoding mdsmap %d\n", err);
- return;
-}
-
-static struct ceph_connection *con_get(struct ceph_connection *con)
-{
- struct ceph_mds_session *s = con->private;
-
- if (get_session(s)) {
- dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
- return con;
- }
- dout("mdsc con_get %p FAIL\n", s);
- return NULL;
-}
-
-static void con_put(struct ceph_connection *con)
-{
- struct ceph_mds_session *s = con->private;
-
- dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
- ceph_put_mds_session(s);
-}
-
-/*
- * if the client is unresponsive for long enough, the mds will kill
- * the session entirely.
- */
-static void peer_reset(struct ceph_connection *con)
-{
- struct ceph_mds_session *s = con->private;
- struct ceph_mds_client *mdsc = s->s_mdsc;
-
- pr_warning("mds%d closed our session\n", s->s_mds);
- send_mds_reconnect(mdsc, s);
-}
-
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
- struct ceph_mds_session *s = con->private;
- struct ceph_mds_client *mdsc = s->s_mdsc;
- int type = le16_to_cpu(msg->hdr.type);
-
- mutex_lock(&mdsc->mutex);
- if (__verify_registered_session(mdsc, s) < 0) {
- mutex_unlock(&mdsc->mutex);
- goto out;
- }
- mutex_unlock(&mdsc->mutex);
-
- switch (type) {
- case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(mdsc, msg);
- break;
- case CEPH_MSG_CLIENT_SESSION:
- handle_session(s, msg);
- break;
- case CEPH_MSG_CLIENT_REPLY:
- handle_reply(s, msg);
- break;
- case CEPH_MSG_CLIENT_REQUEST_FORWARD:
- handle_forward(mdsc, s, msg);
- break;
- case CEPH_MSG_CLIENT_CAPS:
- ceph_handle_caps(s, msg);
- break;
- case CEPH_MSG_CLIENT_SNAP:
- ceph_handle_snap(mdsc, s, msg);
- break;
- case CEPH_MSG_CLIENT_LEASE:
- handle_lease(mdsc, s, msg);
- break;
-
- default:
- pr_err("received unknown message type %d %s\n", type,
- ceph_msg_type_name(type));
- }
-out:
- ceph_msg_put(msg);
-}
-
-/*
- * authentication
- */
-static int get_authorizer(struct ceph_connection *con,
- void **buf, int *len, int *proto,
- void **reply_buf, int *reply_len, int force_new)
-{
- struct ceph_mds_session *s = con->private;
- struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- int ret = 0;
-
- if (force_new && s->s_authorizer) {
- ac->ops->destroy_authorizer(ac, s->s_authorizer);
- s->s_authorizer = NULL;
- }
- if (s->s_authorizer == NULL) {
- if (ac->ops->create_authorizer) {
- ret = ac->ops->create_authorizer(
- ac, CEPH_ENTITY_TYPE_MDS,
- &s->s_authorizer,
- &s->s_authorizer_buf,
- &s->s_authorizer_buf_len,
- &s->s_authorizer_reply_buf,
- &s->s_authorizer_reply_buf_len);
- if (ret)
- return ret;
- }
- }
-
- *proto = ac->protocol;
- *buf = s->s_authorizer_buf;
- *len = s->s_authorizer_buf_len;
- *reply_buf = s->s_authorizer_reply_buf;
- *reply_len = s->s_authorizer_reply_buf_len;
- return 0;
-}
-
-
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
-{
- struct ceph_mds_session *s = con->private;
- struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
-
- return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
-}
-
-static int invalidate_authorizer(struct ceph_connection *con)
-{
- struct ceph_mds_session *s = con->private;
- struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
-
- if (ac->ops->invalidate_authorizer)
- ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
-
- return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
-}
-
-static const struct ceph_connection_operations mds_con_ops = {
- .get = con_get,
- .put = con_put,
- .dispatch = dispatch,
- .get_authorizer = get_authorizer,
- .verify_authorizer_reply = verify_authorizer_reply,
- .invalidate_authorizer = invalidate_authorizer,
- .peer_reset = peer_reset,
-};
-
-/* eof */
diff --git a/ANDROID_3.4.5/fs/ceph/mds_client.h b/ANDROID_3.4.5/fs/ceph/mds_client.h
deleted file mode 100644
index 8c7c04eb..00000000
--- a/ANDROID_3.4.5/fs/ceph/mds_client.h
+++ /dev/null
@@ -1,383 +0,0 @@
-#ifndef _FS_CEPH_MDS_CLIENT_H
-#define _FS_CEPH_MDS_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-
-#include <linux/ceph/types.h>
-#include <linux/ceph/messenger.h>
-#include <linux/ceph/mdsmap.h>
-
-/*
- * Some lock dependencies:
- *
- * session->s_mutex
- * mdsc->mutex
- *
- * mdsc->snap_rwsem
- *
- * ci->i_ceph_lock
- * mdsc->snap_flush_lock
- * mdsc->cap_delay_lock
- *
- */
-
-struct ceph_fs_client;
-struct ceph_cap;
-
-/*
- * parsed info about a single inode. pointers are into the encoded
- * on-wire structures within the mds reply message payload.
- */
-struct ceph_mds_reply_info_in {
- struct ceph_mds_reply_inode *in;
- struct ceph_dir_layout dir_layout;
- u32 symlink_len;
- char *symlink;
- u32 xattr_len;
- char *xattr_data;
-};
-
-/*
- * parsed info about an mds reply, including information about
- * either: 1) the target inode and/or its parent directory and dentry,
- * and directory contents (for readdir results), or
- * 2) the file range lock info (for fcntl F_GETLK results).
- */
-struct ceph_mds_reply_info_parsed {
- struct ceph_mds_reply_head *head;
-
- /* trace */
- struct ceph_mds_reply_info_in diri, targeti;
- struct ceph_mds_reply_dirfrag *dirfrag;
- char *dname;
- u32 dname_len;
- struct ceph_mds_reply_lease *dlease;
-
- /* extra */
- union {
- /* for fcntl F_GETLK results */
- struct ceph_filelock *filelock_reply;
-
- /* for readdir results */
- struct {
- struct ceph_mds_reply_dirfrag *dir_dir;
- int dir_nr;
- char **dir_dname;
- u32 *dir_dname_len;
- struct ceph_mds_reply_lease **dir_dlease;
- struct ceph_mds_reply_info_in *dir_in;
- u8 dir_complete, dir_end;
- };
- };
-
- /* encoded blob describing snapshot contexts for certain
- operations (e.g., open) */
- void *snapblob;
- int snapblob_len;
-};
-
-
-/*
- * cap releases are batched and sent to the MDS en masse.
- */
-#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
- sizeof(struct ceph_mds_cap_release)) / \
- sizeof(struct ceph_mds_cap_item))
-
-
-/*
- * state associated with each MDS<->client session
- */
-enum {
- CEPH_MDS_SESSION_NEW = 1,
- CEPH_MDS_SESSION_OPENING = 2,
- CEPH_MDS_SESSION_OPEN = 3,
- CEPH_MDS_SESSION_HUNG = 4,
- CEPH_MDS_SESSION_CLOSING = 5,
- CEPH_MDS_SESSION_RESTARTING = 6,
- CEPH_MDS_SESSION_RECONNECTING = 7,
-};
-
-struct ceph_mds_session {
- struct ceph_mds_client *s_mdsc;
- int s_mds;
- int s_state;
- unsigned long s_ttl; /* time until mds kills us */
- u64 s_seq; /* incoming msg seq # */
- struct mutex s_mutex; /* serialize session messages */
-
- struct ceph_connection s_con;
-
- struct ceph_authorizer *s_authorizer;
- void *s_authorizer_buf, *s_authorizer_reply_buf;
- size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
-
- /* protected by s_gen_ttl_lock */
- spinlock_t s_gen_ttl_lock;
- u32 s_cap_gen; /* inc each time we get mds stale msg */
- unsigned long s_cap_ttl; /* when session caps expire */
-
- /* protected by s_cap_lock */
- spinlock_t s_cap_lock;
- struct list_head s_caps; /* all caps issued by this session */
- int s_nr_caps, s_trim_caps;
- int s_num_cap_releases;
- struct list_head s_cap_releases; /* waiting cap_release messages */
- struct list_head s_cap_releases_done; /* ready to send */
- struct ceph_cap *s_cap_iterator;
-
- /* protected by mutex */
- struct list_head s_cap_flushing; /* inodes w/ flushing caps */
- struct list_head s_cap_snaps_flushing;
- unsigned long s_renew_requested; /* last time we sent a renew req */
- u64 s_renew_seq;
-
- atomic_t s_ref;
- struct list_head s_waiting; /* waiting requests */
- struct list_head s_unsafe; /* unsafe requests */
-};
-
-/*
- * modes of choosing which MDS to send a request to
- */
-enum {
- USE_ANY_MDS,
- USE_RANDOM_MDS,
- USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
-};
-
-struct ceph_mds_request;
-struct ceph_mds_client;
-
-/*
- * request completion callback
- */
-typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req);
-
-/*
- * an in-flight mds request
- */
-struct ceph_mds_request {
- u64 r_tid; /* transaction id */
- struct rb_node r_node;
- struct ceph_mds_client *r_mdsc;
-
- int r_op; /* mds op code */
-
- /* operation on what? */
- struct inode *r_inode; /* arg1 */
- struct dentry *r_dentry; /* arg1 */
- struct dentry *r_old_dentry; /* arg2: rename from or link from */
- struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
- char *r_path1, *r_path2;
- struct ceph_vino r_ino1, r_ino2;
-
- struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
- struct inode *r_target_inode; /* resulting inode */
-
- struct mutex r_fill_mutex;
-
- union ceph_mds_request_args r_args;
- int r_fmode; /* file mode, if expecting cap */
- uid_t r_uid;
- gid_t r_gid;
-
- /* for choosing which mds to send this request to */
- int r_direct_mode;
- u32 r_direct_hash; /* choose dir frag based on this dentry hash */
- bool r_direct_is_hash; /* true if r_direct_hash is valid */
-
- /* data payload is used for xattr ops */
- struct page **r_pages;
- int r_num_pages;
- int r_data_len;
-
- /* what caps shall we drop? */
- int r_inode_drop, r_inode_unless;
- int r_dentry_drop, r_dentry_unless;
- int r_old_dentry_drop, r_old_dentry_unless;
- struct inode *r_old_inode;
- int r_old_inode_drop, r_old_inode_unless;
-
- struct ceph_msg *r_request; /* original request */
- int r_request_release_offset;
- struct ceph_msg *r_reply;
- struct ceph_mds_reply_info_parsed r_reply_info;
- int r_err;
- bool r_aborted;
-
- unsigned long r_timeout; /* optional. jiffies */
- unsigned long r_started; /* start time to measure timeout against */
- unsigned long r_request_started; /* start time for mds request only,
- used to measure lease durations */
-
- /* link unsafe requests to parent directory, for fsync */
- struct inode *r_unsafe_dir;
- struct list_head r_unsafe_dir_item;
-
- struct ceph_mds_session *r_session;
-
- int r_attempts; /* resend attempts */
- int r_num_fwd; /* number of forward attempts */
- int r_resend_mds; /* mds to resend to next, if any*/
- u32 r_sent_on_mseq; /* cap mseq request was sent at*/
-
- struct kref r_kref;
- struct list_head r_wait;
- struct completion r_completion;
- struct completion r_safe_completion;
- ceph_mds_request_callback_t r_callback;
- struct list_head r_unsafe_item; /* per-session unsafe list item */
- bool r_got_unsafe, r_got_safe, r_got_result;
-
- bool r_did_prepopulate;
- u32 r_readdir_offset;
-
- struct ceph_cap_reservation r_caps_reservation;
- int r_num_caps;
-};
-
-/*
- * mds client state
- */
-struct ceph_mds_client {
- struct ceph_fs_client *fsc;
- struct mutex mutex; /* all nested structures */
-
- struct ceph_mdsmap *mdsmap;
- struct completion safe_umount_waiters;
- wait_queue_head_t session_close_wq;
- struct list_head waiting_for_map;
-
- struct ceph_mds_session **sessions; /* NULL for mds if no session */
- int max_sessions; /* len of s_mds_sessions */
- int stopping; /* true if shutting down */
-
- /*
- * snap_rwsem will cover cap linkage into snaprealms, and
- * realm snap contexts. (later, we can do per-realm snap
- * contexts locks..) the empty list contains realms with no
- * references (implying they contain no inodes with caps) that
- * should be destroyed.
- */
- struct rw_semaphore snap_rwsem;
- struct rb_root snap_realms;
- struct list_head snap_empty;
- spinlock_t snap_empty_lock; /* protect snap_empty */
-
- u64 last_tid; /* most recent mds request */
- struct rb_root request_tree; /* pending mds requests */
- struct delayed_work delayed_work; /* delayed work */
- unsigned long last_renew_caps; /* last time we renewed our caps */
- struct list_head cap_delay_list; /* caps with delayed release */
- spinlock_t cap_delay_lock; /* protects cap_delay_list */
- struct list_head snap_flush_list; /* cap_snaps ready to flush */
- spinlock_t snap_flush_lock;
-
- u64 cap_flush_seq;
- struct list_head cap_dirty; /* inodes with dirty caps */
- struct list_head cap_dirty_migrating; /* ...that are migration... */
- int num_cap_flushing; /* # caps we are flushing */
- spinlock_t cap_dirty_lock; /* protects above items */
- wait_queue_head_t cap_flushing_wq;
-
- /*
- * Cap reservations
- *
- * Maintain a global pool of preallocated struct ceph_caps, referenced
- * by struct ceph_caps_reservations. This ensures that we preallocate
- * memory needed to successfully process an MDS response. (If an MDS
- * sends us cap information and we fail to process it, we will have
- * problems due to the client and MDS being out of sync.)
- *
- * Reservations are 'owned' by a ceph_cap_reservation context.
- */
- spinlock_t caps_list_lock;
- struct list_head caps_list; /* unused (reserved or
- unreserved) */
- int caps_total_count; /* total caps allocated */
- int caps_use_count; /* in use */
- int caps_reserve_count; /* unused, reserved */
- int caps_avail_count; /* unused, unreserved */
- int caps_min_count; /* keep at least this many
- (unreserved) */
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
-};
-
-extern const char *ceph_mds_op_name(int op);
-
-extern struct ceph_mds_session *
-__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
-
-static inline struct ceph_mds_session *
-ceph_get_mds_session(struct ceph_mds_session *s)
-{
- atomic_inc(&s->s_ref);
- return s;
-}
-
-extern void ceph_put_mds_session(struct ceph_mds_session *s);
-
-extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg, int mds);
-
-extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
-extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
-
-extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
-
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
- struct inode *inode,
- struct dentry *dn);
-
-extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
-
-extern struct ceph_mds_request *
-ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
-extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req);
-extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
- struct inode *dir,
- struct ceph_mds_request *req);
-static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
-{
- kref_get(&req->r_kref);
-}
-extern void ceph_mdsc_release_request(struct kref *kref);
-static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
-{
- kref_put(&req->r_kref, ceph_mdsc_release_request);
-}
-
-extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-
-extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-
-extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
- int stop_on_nosnap);
-
-extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
-extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
- struct inode *inode,
- struct dentry *dentry, char action,
- u32 seq);
-
-extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg);
-
-extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-
-#endif
diff --git a/ANDROID_3.4.5/fs/ceph/mdsmap.c b/ANDROID_3.4.5/fs/ceph/mdsmap.c
deleted file mode 100644
index 73b7d44e..00000000
--- a/ANDROID_3.4.5/fs/ceph/mdsmap.c
+++ /dev/null
@@ -1,179 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/bug.h>
-#include <linux/err.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-
-#include <linux/ceph/mdsmap.h>
-#include <linux/ceph/messenger.h>
-#include <linux/ceph/decode.h>
-
-#include "super.h"
-
-
-/*
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
- */
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
-{
- int n = 0;
- int i;
- char r;
-
- /* count */
- for (i = 0; i < m->m_max_mds; i++)
- if (m->m_info[i].state > 0)
- n++;
- if (n == 0)
- return -1;
-
- /* pick */
- get_random_bytes(&r, 1);
- n = r % n;
- i = 0;
- for (i = 0; n > 0; i++, n--)
- while (m->m_info[i].state <= 0)
- i++;
-
- return i;
-}
-
-/*
- * Decode an MDS map
- *
- * Ignore any fields we don't care about (there are quite a few of
- * them).
- */
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
-{
- struct ceph_mdsmap *m;
- const void *start = *p;
- int i, j, n;
- int err = -EINVAL;
- u16 version;
-
- m = kzalloc(sizeof(*m), GFP_NOFS);
- if (m == NULL)
- return ERR_PTR(-ENOMEM);
-
- ceph_decode_16_safe(p, end, version, bad);
-
- ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
- m->m_epoch = ceph_decode_32(p);
- m->m_client_epoch = ceph_decode_32(p);
- m->m_last_failure = ceph_decode_32(p);
- m->m_root = ceph_decode_32(p);
- m->m_session_timeout = ceph_decode_32(p);
- m->m_session_autoclose = ceph_decode_32(p);
- m->m_max_file_size = ceph_decode_64(p);
- m->m_max_mds = ceph_decode_32(p);
-
- m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
- if (m->m_info == NULL)
- goto badmem;
-
- /* pick out active nodes from mds_info (state > 0) */
- n = ceph_decode_32(p);
- for (i = 0; i < n; i++) {
- u64 global_id;
- u32 namelen;
- s32 mds, inc, state;
- u64 state_seq;
- u8 infoversion;
- struct ceph_entity_addr addr;
- u32 num_export_targets;
- void *pexport_targets = NULL;
- struct ceph_timespec laggy_since;
-
- ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
- global_id = ceph_decode_64(p);
- infoversion = ceph_decode_8(p);
- *p += sizeof(u64);
- namelen = ceph_decode_32(p); /* skip mds name */
- *p += namelen;
-
- ceph_decode_need(p, end,
- 4*sizeof(u32) + sizeof(u64) +
- sizeof(addr) + sizeof(struct ceph_timespec),
- bad);
- mds = ceph_decode_32(p);
- inc = ceph_decode_32(p);
- state = ceph_decode_32(p);
- state_seq = ceph_decode_64(p);
- ceph_decode_copy(p, &addr, sizeof(addr));
- ceph_decode_addr(&addr);
- ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
- *p += sizeof(u32);
- ceph_decode_32_safe(p, end, namelen, bad);
- *p += namelen;
- if (infoversion >= 2) {
- ceph_decode_32_safe(p, end, num_export_targets, bad);
- pexport_targets = *p;
- *p += num_export_targets * sizeof(u32);
- } else {
- num_export_targets = 0;
- }
-
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
- i+1, n, global_id, mds, inc,
- ceph_pr_addr(&addr.in_addr),
- ceph_mds_state_name(state));
- if (mds >= 0 && mds < m->m_max_mds && state > 0) {
- m->m_info[mds].global_id = global_id;
- m->m_info[mds].state = state;
- m->m_info[mds].addr = addr;
- m->m_info[mds].laggy =
- (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
- m->m_info[mds].num_export_targets = num_export_targets;
- if (num_export_targets) {
- m->m_info[mds].export_targets =
- kcalloc(num_export_targets, sizeof(u32),
- GFP_NOFS);
- for (j = 0; j < num_export_targets; j++)
- m->m_info[mds].export_targets[j] =
- ceph_decode_32(&pexport_targets);
- } else {
- m->m_info[mds].export_targets = NULL;
- }
- }
- }
-
- /* pg_pools */
- ceph_decode_32_safe(p, end, n, bad);
- m->m_num_data_pg_pools = n;
- m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
- if (!m->m_data_pg_pools)
- goto badmem;
- ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
- for (i = 0; i < n; i++)
- m->m_data_pg_pools[i] = ceph_decode_32(p);
- m->m_cas_pg_pool = ceph_decode_32(p);
-
- /* ok, we don't care about the rest. */
- dout("mdsmap_decode success epoch %u\n", m->m_epoch);
- return m;
-
-badmem:
- err = -ENOMEM;
-bad:
- pr_err("corrupt mdsmap\n");
- print_hex_dump(KERN_DEBUG, "mdsmap: ",
- DUMP_PREFIX_OFFSET, 16, 1,
- start, end - start, true);
- ceph_mdsmap_destroy(m);
- return ERR_PTR(-EINVAL);
-}
-
-void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
-{
- int i;
-
- for (i = 0; i < m->m_max_mds; i++)
- kfree(m->m_info[i].export_targets);
- kfree(m->m_info);
- kfree(m->m_data_pg_pools);
- kfree(m);
-}
diff --git a/ANDROID_3.4.5/fs/ceph/snap.c b/ANDROID_3.4.5/fs/ceph/snap.c
deleted file mode 100644
index f04c0961..00000000
--- a/ANDROID_3.4.5/fs/ceph/snap.c
+++ /dev/null
@@ -1,931 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/sort.h>
-#include <linux/slab.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-#include <linux/ceph/decode.h>
-
-/*
- * Snapshots in ceph are driven in large part by cooperation from the
- * client. In contrast to local file systems or file servers that
- * implement snapshots at a single point in the system, ceph's
- * distributed access to storage requires clients to help decide
- * whether a write logically occurs before or after a recently created
- * snapshot.
- *
- * This provides a perfect instantanous client-wide snapshot. Between
- * clients, however, snapshots may appear to be applied at slightly
- * different points in time, depending on delays in delivering the
- * snapshot notification.
- *
- * Snapshots are _not_ file system-wide. Instead, each snapshot
- * applies to the subdirectory nested beneath some directory. This
- * effectively divides the hierarchy into multiple "realms," where all
- * of the files contained by each realm share the same set of
- * snapshots. An individual realm's snap set contains snapshots
- * explicitly created on that realm, as well as any snaps in its
- * parent's snap set _after_ the point at which the parent became it's
- * parent (due to, say, a rename). Similarly, snaps from prior parents
- * during the time intervals during which they were the parent are included.
- *
- * The client is spared most of this detail, fortunately... it must only
- * maintains a hierarchy of realms reflecting the current parent/child
- * realm relationship, and for each realm has an explicit list of snaps
- * inherited from prior parents.
- *
- * A snap_realm struct is maintained for realms containing every inode
- * with an open cap in the system. (The needed snap realm information is
- * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
- * version number is used to ensure that as realm parameters change (new
- * snapshot, new parent, etc.) the client's realm hierarchy is updated.
- *
- * The realm hierarchy drives the generation of a 'snap context' for each
- * realm, which simply lists the resulting set of snaps for the realm. This
- * is attached to any writes sent to OSDs.
- */
-/*
- * Unfortunately error handling is a bit mixed here. If we get a snap
- * update, but don't have enough memory to update our realm hierarchy,
- * it's not clear what we can do about it (besides complaining to the
- * console).
- */
-
-
-/*
- * increase ref count for the realm
- *
- * caller must hold snap_rwsem for write.
- */
-void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm)
-{
- dout("get_realm %p %d -> %d\n", realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
- /*
- * since we _only_ increment realm refs or empty the empty
- * list with snap_rwsem held, adjusting the empty list here is
- * safe. we do need to protect against concurrent empty list
- * additions, however.
- */
- if (atomic_read(&realm->nref) == 0) {
- spin_lock(&mdsc->snap_empty_lock);
- list_del_init(&realm->empty_item);
- spin_unlock(&mdsc->snap_empty_lock);
- }
-
- atomic_inc(&realm->nref);
-}
-
-static void __insert_snap_realm(struct rb_root *root,
- struct ceph_snap_realm *new)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ceph_snap_realm *r = NULL;
-
- while (*p) {
- parent = *p;
- r = rb_entry(parent, struct ceph_snap_realm, node);
- if (new->ino < r->ino)
- p = &(*p)->rb_left;
- else if (new->ino > r->ino)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, root);
-}
-
-/*
- * create and get the realm rooted at @ino and bump its ref count.
- *
- * caller must hold snap_rwsem for write.
- */
-static struct ceph_snap_realm *ceph_create_snap_realm(
- struct ceph_mds_client *mdsc,
- u64 ino)
-{
- struct ceph_snap_realm *realm;
-
- realm = kzalloc(sizeof(*realm), GFP_NOFS);
- if (!realm)
- return ERR_PTR(-ENOMEM);
-
- atomic_set(&realm->nref, 0); /* tree does not take a ref */
- realm->ino = ino;
- INIT_LIST_HEAD(&realm->children);
- INIT_LIST_HEAD(&realm->child_item);
- INIT_LIST_HEAD(&realm->empty_item);
- INIT_LIST_HEAD(&realm->dirty_item);
- INIT_LIST_HEAD(&realm->inodes_with_caps);
- spin_lock_init(&realm->inodes_with_caps_lock);
- __insert_snap_realm(&mdsc->snap_realms, realm);
- dout("create_snap_realm %llx %p\n", realm->ino, realm);
- return realm;
-}
-
-/*
- * lookup the realm rooted at @ino.
- *
- * caller must hold snap_rwsem for write.
- */
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
- u64 ino)
-{
- struct rb_node *n = mdsc->snap_realms.rb_node;
- struct ceph_snap_realm *r;
-
- while (n) {
- r = rb_entry(n, struct ceph_snap_realm, node);
- if (ino < r->ino)
- n = n->rb_left;
- else if (ino > r->ino)
- n = n->rb_right;
- else {
- dout("lookup_snap_realm %llx %p\n", r->ino, r);
- return r;
- }
- }
- return NULL;
-}
-
-static void __put_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm);
-
-/*
- * called with snap_rwsem (write)
- */
-static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm)
-{
- dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
-
- rb_erase(&realm->node, &mdsc->snap_realms);
-
- if (realm->parent) {
- list_del_init(&realm->child_item);
- __put_snap_realm(mdsc, realm->parent);
- }
-
- kfree(realm->prior_parent_snaps);
- kfree(realm->snaps);
- ceph_put_snap_context(realm->cached_context);
- kfree(realm);
-}
-
-/*
- * caller holds snap_rwsem (write)
- */
-static void __put_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm)
-{
- dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
- if (atomic_dec_and_test(&realm->nref))
- __destroy_snap_realm(mdsc, realm);
-}
-
-/*
- * caller needn't hold any locks
- */
-void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm)
-{
- dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
- if (!atomic_dec_and_test(&realm->nref))
- return;
-
- if (down_write_trylock(&mdsc->snap_rwsem)) {
- __destroy_snap_realm(mdsc, realm);
- up_write(&mdsc->snap_rwsem);
- } else {
- spin_lock(&mdsc->snap_empty_lock);
- list_add(&realm->empty_item, &mdsc->snap_empty);
- spin_unlock(&mdsc->snap_empty_lock);
- }
-}
-
-/*
- * Clean up any realms whose ref counts have dropped to zero. Note
- * that this does not include realms who were created but not yet
- * used.
- *
- * Called under snap_rwsem (write)
- */
-static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
-{
- struct ceph_snap_realm *realm;
-
- spin_lock(&mdsc->snap_empty_lock);
- while (!list_empty(&mdsc->snap_empty)) {
- realm = list_first_entry(&mdsc->snap_empty,
- struct ceph_snap_realm, empty_item);
- list_del(&realm->empty_item);
- spin_unlock(&mdsc->snap_empty_lock);
- __destroy_snap_realm(mdsc, realm);
- spin_lock(&mdsc->snap_empty_lock);
- }
- spin_unlock(&mdsc->snap_empty_lock);
-}
-
-void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
-{
- down_write(&mdsc->snap_rwsem);
- __cleanup_empty_realms(mdsc);
- up_write(&mdsc->snap_rwsem);
-}
-
-/*
- * adjust the parent realm of a given @realm. adjust child list, and parent
- * pointers, and ref counts appropriately.
- *
- * return true if parent was changed, 0 if unchanged, <0 on error.
- *
- * caller must hold snap_rwsem for write.
- */
-static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm,
- u64 parentino)
-{
- struct ceph_snap_realm *parent;
-
- if (realm->parent_ino == parentino)
- return 0;
-
- parent = ceph_lookup_snap_realm(mdsc, parentino);
- if (!parent) {
- parent = ceph_create_snap_realm(mdsc, parentino);
- if (IS_ERR(parent))
- return PTR_ERR(parent);
- }
- dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
- realm->ino, realm, realm->parent_ino, realm->parent,
- parentino, parent);
- if (realm->parent) {
- list_del_init(&realm->child_item);
- ceph_put_snap_realm(mdsc, realm->parent);
- }
- realm->parent_ino = parentino;
- realm->parent = parent;
- ceph_get_snap_realm(mdsc, parent);
- list_add(&realm->child_item, &parent->children);
- return 1;
-}
-
-
-static int cmpu64_rev(const void *a, const void *b)
-{
- if (*(u64 *)a < *(u64 *)b)
- return 1;
- if (*(u64 *)a > *(u64 *)b)
- return -1;
- return 0;
-}
-
-/*
- * build the snap context for a given realm.
- */
-static int build_snap_context(struct ceph_snap_realm *realm)
-{
- struct ceph_snap_realm *parent = realm->parent;
- struct ceph_snap_context *snapc;
- int err = 0;
- int i;
- int num = realm->num_prior_parent_snaps + realm->num_snaps;
-
- /*
- * build parent context, if it hasn't been built.
- * conservatively estimate that all parent snaps might be
- * included by us.
- */
- if (parent) {
- if (!parent->cached_context) {
- err = build_snap_context(parent);
- if (err)
- goto fail;
- }
- num += parent->cached_context->num_snaps;
- }
-
- /* do i actually need to update? not if my context seq
- matches realm seq, and my parents' does to. (this works
- because we rebuild_snap_realms() works _downward_ in
- hierarchy after each update.) */
- if (realm->cached_context &&
- realm->cached_context->seq == realm->seq &&
- (!parent ||
- realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
- " (unchanged)\n",
- realm->ino, realm, realm->cached_context,
- realm->cached_context->seq,
- realm->cached_context->num_snaps);
- return 0;
- }
-
- /* alloc new snap context */
- err = -ENOMEM;
- if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))
- goto fail;
- snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
- if (!snapc)
- goto fail;
- atomic_set(&snapc->nref, 1);
-
- /* build (reverse sorted) snap vector */
- num = 0;
- snapc->seq = realm->seq;
- if (parent) {
- /* include any of parent's snaps occurring _after_ my
- parent became my parent */
- for (i = 0; i < parent->cached_context->num_snaps; i++)
- if (parent->cached_context->snaps[i] >=
- realm->parent_since)
- snapc->snaps[num++] =
- parent->cached_context->snaps[i];
- if (parent->cached_context->seq > snapc->seq)
- snapc->seq = parent->cached_context->seq;
- }
- memcpy(snapc->snaps + num, realm->snaps,
- sizeof(u64)*realm->num_snaps);
- num += realm->num_snaps;
- memcpy(snapc->snaps + num, realm->prior_parent_snaps,
- sizeof(u64)*realm->num_prior_parent_snaps);
- num += realm->num_prior_parent_snaps;
-
- sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
- snapc->num_snaps = num;
- dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
- realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
-
- if (realm->cached_context)
- ceph_put_snap_context(realm->cached_context);
- realm->cached_context = snapc;
- return 0;
-
-fail:
- /*
- * if we fail, clear old (incorrect) cached_context... hopefully
- * we'll have better luck building it later
- */
- if (realm->cached_context) {
- ceph_put_snap_context(realm->cached_context);
- realm->cached_context = NULL;
- }
- pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
- realm, err);
- return err;
-}
-
-/*
- * rebuild snap context for the given realm and all of its children.
- */
-static void rebuild_snap_realms(struct ceph_snap_realm *realm)
-{
- struct ceph_snap_realm *child;
-
- dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
- build_snap_context(realm);
-
- list_for_each_entry(child, &realm->children, child_item)
- rebuild_snap_realms(child);
-}
-
-
-/*
- * helper to allocate and decode an array of snapids. free prior
- * instance, if any.
- */
-static int dup_array(u64 **dst, __le64 *src, int num)
-{
- int i;
-
- kfree(*dst);
- if (num) {
- *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
- if (!*dst)
- return -ENOMEM;
- for (i = 0; i < num; i++)
- (*dst)[i] = get_unaligned_le64(src + i);
- } else {
- *dst = NULL;
- }
- return 0;
-}
-
-
-/*
- * When a snapshot is applied, the size/mtime inode metadata is queued
- * in a ceph_cap_snap (one for each snapshot) until writeback
- * completes and the metadata can be flushed back to the MDS.
- *
- * However, if a (sync) write is currently in-progress when we apply
- * the snapshot, we have to wait until the write succeeds or fails
- * (and a final size/mtime is known). In this case the
- * cap_snap->writing = 1, and is said to be "pending." When the write
- * finishes, we __ceph_finish_cap_snap().
- *
- * Caller must hold snap_rwsem for read (i.e., the realm topology won't
- * change).
- */
-void ceph_queue_cap_snap(struct ceph_inode_info *ci)
-{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap;
- int used, dirty;
-
- capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
- if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
- return;
- }
-
- spin_lock(&ci->i_ceph_lock);
- used = __ceph_caps_used(ci);
- dirty = __ceph_caps_dirty(ci);
-
- /*
- * If there is a write in progress, treat that as a dirty Fw,
- * even though it hasn't completed yet; by the time we finish
- * up this capsnap it will be.
- */
- if (used & CEPH_CAP_FILE_WR)
- dirty |= CEPH_CAP_FILE_WR;
-
- if (__ceph_have_pending_cap_snap(ci)) {
- /* there is no point in queuing multiple "pending" cap_snaps,
- as no new writes are allowed to start when pending, so any
- writes in progress now were started before the previous
- cap_snap. lucky us. */
- dout("queue_cap_snap %p already pending\n", inode);
- kfree(capsnap);
- } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
- CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
- struct ceph_snap_context *snapc = ci->i_head_snapc;
-
- /*
- * if we are a sync write, we may need to go to the snaprealm
- * to get the current snapc.
- */
- if (!snapc)
- snapc = ci->i_snap_realm->cached_context;
-
- dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
- inode, capsnap, snapc, ceph_cap_string(dirty));
- ihold(inode);
-
- atomic_set(&capsnap->nref, 1);
- capsnap->ci = ci;
- INIT_LIST_HEAD(&capsnap->ci_item);
- INIT_LIST_HEAD(&capsnap->flushing_item);
-
- capsnap->follows = snapc->seq;
- capsnap->issued = __ceph_caps_issued(ci, NULL);
- capsnap->dirty = dirty;
-
- capsnap->mode = inode->i_mode;
- capsnap->uid = inode->i_uid;
- capsnap->gid = inode->i_gid;
-
- if (dirty & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
- capsnap->xattr_blob =
- ceph_buffer_get(ci->i_xattrs.blob);
- capsnap->xattr_version = ci->i_xattrs.version;
- } else {
- capsnap->xattr_blob = NULL;
- capsnap->xattr_version = 0;
- }
-
- /* dirty page count moved from _head to this cap_snap;
- all subsequent writes page dirties occur _after_ this
- snapshot. */
- capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
- ci->i_wrbuffer_ref_head = 0;
- capsnap->context = snapc;
- ci->i_head_snapc =
- ceph_get_snap_context(ci->i_snap_realm->cached_context);
- dout(" new snapc is %p\n", ci->i_head_snapc);
- list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
-
- if (used & CEPH_CAP_FILE_WR) {
- dout("queue_cap_snap %p cap_snap %p snapc %p"
- " seq %llu used WR, now pending\n", inode,
- capsnap, snapc, snapc->seq);
- capsnap->writing = 1;
- } else {
- /* note mtime, size NOW. */
- __ceph_finish_cap_snap(ci, capsnap);
- }
- } else {
- dout("queue_cap_snap %p nothing dirty|writing\n", inode);
- kfree(capsnap);
- }
-
- spin_unlock(&ci->i_ceph_lock);
-}
-
-/*
- * Finalize the size, mtime for a cap_snap.. that is, settle on final values
- * to be used for the snapshot, to be flushed back to the mds.
- *
- * If capsnap can now be flushed, add to snap_flush list, and return 1.
- *
- * Caller must hold i_ceph_lock.
- */
-int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
- struct ceph_cap_snap *capsnap)
-{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
-
- BUG_ON(capsnap->writing);
- capsnap->size = inode->i_size;
- capsnap->mtime = inode->i_mtime;
- capsnap->atime = inode->i_atime;
- capsnap->ctime = inode->i_ctime;
- capsnap->time_warp_seq = ci->i_time_warp_seq;
- if (capsnap->dirty_pages) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size,
- capsnap->dirty_pages);
- return 0;
- }
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
- inode, capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size);
-
- spin_lock(&mdsc->snap_flush_lock);
- list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
- spin_unlock(&mdsc->snap_flush_lock);
- return 1; /* caller may want to ceph_flush_snaps */
-}
-
-/*
- * Queue cap_snaps for snap writeback for this realm and its children.
- * Called under snap_rwsem, so realm topology won't change.
- */
-static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
-{
- struct ceph_inode_info *ci;
- struct inode *lastinode = NULL;
- struct ceph_snap_realm *child;
-
- dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
-
- spin_lock(&realm->inodes_with_caps_lock);
- list_for_each_entry(ci, &realm->inodes_with_caps,
- i_snap_realm_item) {
- struct inode *inode = igrab(&ci->vfs_inode);
- if (!inode)
- continue;
- spin_unlock(&realm->inodes_with_caps_lock);
- if (lastinode)
- iput(lastinode);
- lastinode = inode;
- ceph_queue_cap_snap(ci);
- spin_lock(&realm->inodes_with_caps_lock);
- }
- spin_unlock(&realm->inodes_with_caps_lock);
- if (lastinode)
- iput(lastinode);
-
- list_for_each_entry(child, &realm->children, child_item) {
- dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
- realm, realm->ino, child, child->ino);
- list_del_init(&child->dirty_item);
- list_add(&child->dirty_item, &realm->dirty_item);
- }
-
- list_del_init(&realm->dirty_item);
- dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
-}
-
-/*
- * Parse and apply a snapblob "snap trace" from the MDS. This specifies
- * the snap realm parameters from a given realm and all of its ancestors,
- * up to the root.
- *
- * Caller must hold snap_rwsem for write.
- */
-int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
- void *p, void *e, bool deletion)
-{
- struct ceph_mds_snap_realm *ri; /* encoded */
- __le64 *snaps; /* encoded */
- __le64 *prior_parent_snaps; /* encoded */
- struct ceph_snap_realm *realm;
- int invalidate = 0;
- int err = -ENOMEM;
- LIST_HEAD(dirty_realms);
-
- dout("update_snap_trace deletion=%d\n", deletion);
-more:
- ceph_decode_need(&p, e, sizeof(*ri), bad);
- ri = p;
- p += sizeof(*ri);
- ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
- le32_to_cpu(ri->num_prior_parent_snaps)), bad);
- snaps = p;
- p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
- prior_parent_snaps = p;
- p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
-
- realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
- if (!realm) {
- realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
- if (IS_ERR(realm)) {
- err = PTR_ERR(realm);
- goto fail;
- }
- }
-
- /* ensure the parent is correct */
- err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
- if (err < 0)
- goto fail;
- invalidate += err;
-
- if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("update_snap_trace updating %llx %p %lld -> %lld\n",
- realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
- /* update realm parameters, snap lists */
- realm->seq = le64_to_cpu(ri->seq);
- realm->created = le64_to_cpu(ri->created);
- realm->parent_since = le64_to_cpu(ri->parent_since);
-
- realm->num_snaps = le32_to_cpu(ri->num_snaps);
- err = dup_array(&realm->snaps, snaps, realm->num_snaps);
- if (err < 0)
- goto fail;
-
- realm->num_prior_parent_snaps =
- le32_to_cpu(ri->num_prior_parent_snaps);
- err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
- realm->num_prior_parent_snaps);
- if (err < 0)
- goto fail;
-
- /* queue realm for cap_snap creation */
- list_add(&realm->dirty_item, &dirty_realms);
-
- invalidate = 1;
- } else if (!realm->cached_context) {
- dout("update_snap_trace %llx %p seq %lld new\n",
- realm->ino, realm, realm->seq);
- invalidate = 1;
- } else {
- dout("update_snap_trace %llx %p seq %lld unchanged\n",
- realm->ino, realm, realm->seq);
- }
-
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
-
- if (p < e)
- goto more;
-
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate)
- rebuild_snap_realms(realm);
-
- /*
- * queue cap snaps _after_ we've built the new snap contexts,
- * so that i_head_snapc can be set appropriately.
- */
- while (!list_empty(&dirty_realms)) {
- realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
- dirty_item);
- queue_realm_cap_snaps(realm);
- }
-
- __cleanup_empty_realms(mdsc);
- return 0;
-
-bad:
- err = -EINVAL;
-fail:
- pr_err("update_snap_trace error %d\n", err);
- return err;
-}
-
-
-/*
- * Send any cap_snaps that are queued for flush. Try to carry
- * s_mutex across multiple snap flushes to avoid locking overhead.
- *
- * Caller holds no locks.
- */
-static void flush_snaps(struct ceph_mds_client *mdsc)
-{
- struct ceph_inode_info *ci;
- struct inode *inode;
- struct ceph_mds_session *session = NULL;
-
- dout("flush_snaps\n");
- spin_lock(&mdsc->snap_flush_lock);
- while (!list_empty(&mdsc->snap_flush_list)) {
- ci = list_first_entry(&mdsc->snap_flush_list,
- struct ceph_inode_info, i_snap_flush_item);
- inode = &ci->vfs_inode;
- ihold(inode);
- spin_unlock(&mdsc->snap_flush_lock);
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, &session, 0);
- spin_unlock(&ci->i_ceph_lock);
- iput(inode);
- spin_lock(&mdsc->snap_flush_lock);
- }
- spin_unlock(&mdsc->snap_flush_lock);
-
- if (session) {
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- }
- dout("flush_snaps done\n");
-}
-
-
-/*
- * Handle a snap notification from the MDS.
- *
- * This can take two basic forms: the simplest is just a snap creation
- * or deletion notification on an existing realm. This should update the
- * realm and its children.
- *
- * The more difficult case is realm creation, due to snap creation at a
- * new point in the file hierarchy, or due to a rename that moves a file or
- * directory into another realm.
- */
-void ceph_handle_snap(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct ceph_msg *msg)
-{
- struct super_block *sb = mdsc->fsc->sb;
- int mds = session->s_mds;
- u64 split;
- int op;
- int trace_len;
- struct ceph_snap_realm *realm = NULL;
- void *p = msg->front.iov_base;
- void *e = p + msg->front.iov_len;
- struct ceph_mds_snap_head *h;
- int num_split_inos, num_split_realms;
- __le64 *split_inos = NULL, *split_realms = NULL;
- int i;
- int locked_rwsem = 0;
-
- /* decode */
- if (msg->front.iov_len < sizeof(*h))
- goto bad;
- h = p;
- op = le32_to_cpu(h->op);
- split = le64_to_cpu(h->split); /* non-zero if we are splitting an
- * existing realm */
- num_split_inos = le32_to_cpu(h->num_split_inos);
- num_split_realms = le32_to_cpu(h->num_split_realms);
- trace_len = le32_to_cpu(h->trace_len);
- p += sizeof(*h);
-
- dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
- ceph_snap_op_name(op), split, trace_len);
-
- mutex_lock(&session->s_mutex);
- session->s_seq++;
- mutex_unlock(&session->s_mutex);
-
- down_write(&mdsc->snap_rwsem);
- locked_rwsem = 1;
-
- if (op == CEPH_SNAP_OP_SPLIT) {
- struct ceph_mds_snap_realm *ri;
-
- /*
- * A "split" breaks part of an existing realm off into
- * a new realm. The MDS provides a list of inodes
- * (with caps) and child realms that belong to the new
- * child.
- */
- split_inos = p;
- p += sizeof(u64) * num_split_inos;
- split_realms = p;
- p += sizeof(u64) * num_split_realms;
- ceph_decode_need(&p, e, sizeof(*ri), bad);
- /* we will peek at realm info here, but will _not_
- * advance p, as the realm update will occur below in
- * ceph_update_snap_trace. */
- ri = p;
-
- realm = ceph_lookup_snap_realm(mdsc, split);
- if (!realm) {
- realm = ceph_create_snap_realm(mdsc, split);
- if (IS_ERR(realm))
- goto out;
- }
- ceph_get_snap_realm(mdsc, realm);
-
- dout("splitting snap_realm %llx %p\n", realm->ino, realm);
- for (i = 0; i < num_split_inos; i++) {
- struct ceph_vino vino = {
- .ino = le64_to_cpu(split_inos[i]),
- .snap = CEPH_NOSNAP,
- };
- struct inode *inode = ceph_find_inode(sb, vino);
- struct ceph_inode_info *ci;
- struct ceph_snap_realm *oldrealm;
-
- if (!inode)
- continue;
- ci = ceph_inode(inode);
-
- spin_lock(&ci->i_ceph_lock);
- if (!ci->i_snap_realm)
- goto skip_inode;
- /*
- * If this inode belongs to a realm that was
- * created after our new realm, we experienced
- * a race (due to another split notifications
- * arriving from a different MDS). So skip
- * this inode.
- */
- if (ci->i_snap_realm->created >
- le64_to_cpu(ri->created)) {
- dout(" leaving %p in newer realm %llx %p\n",
- inode, ci->i_snap_realm->ino,
- ci->i_snap_realm);
- goto skip_inode;
- }
- dout(" will move %p to split realm %llx %p\n",
- inode, realm->ino, realm);
- /*
- * Move the inode to the new realm
- */
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- list_add(&ci->i_snap_realm_item,
- &realm->inodes_with_caps);
- oldrealm = ci->i_snap_realm;
- ci->i_snap_realm = realm;
- spin_unlock(&realm->inodes_with_caps_lock);
- spin_unlock(&ci->i_ceph_lock);
-
- ceph_get_snap_realm(mdsc, realm);
- ceph_put_snap_realm(mdsc, oldrealm);
-
- iput(inode);
- continue;
-
-skip_inode:
- spin_unlock(&ci->i_ceph_lock);
- iput(inode);
- }
-
- /* we may have taken some of the old realm's children. */
- for (i = 0; i < num_split_realms; i++) {
- struct ceph_snap_realm *child =
- ceph_lookup_snap_realm(mdsc,
- le64_to_cpu(split_realms[i]));
- if (!child)
- continue;
- adjust_snap_realm_parent(mdsc, child, realm->ino);
- }
- }
-
- /*
- * update using the provided snap trace. if we are deleting a
- * snap, we can avoid queueing cap_snaps.
- */
- ceph_update_snap_trace(mdsc, p, e,
- op == CEPH_SNAP_OP_DESTROY);
-
- if (op == CEPH_SNAP_OP_SPLIT)
- /* we took a reference when we created the realm, above */
- ceph_put_snap_realm(mdsc, realm);
-
- __cleanup_empty_realms(mdsc);
-
- up_write(&mdsc->snap_rwsem);
-
- flush_snaps(mdsc);
- return;
-
-bad:
- pr_err("corrupt snap message from mds%d\n", mds);
- ceph_msg_dump(msg);
-out:
- if (locked_rwsem)
- up_write(&mdsc->snap_rwsem);
- return;
-}
-
-
-
diff --git a/ANDROID_3.4.5/fs/ceph/strings.c b/ANDROID_3.4.5/fs/ceph/strings.c
deleted file mode 100644
index cd5097d7..00000000
--- a/ANDROID_3.4.5/fs/ceph/strings.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Ceph fs string constants
- */
-#include <linux/module.h>
-#include <linux/ceph/types.h>
-
-
-const char *ceph_mds_state_name(int s)
-{
- switch (s) {
- /* down and out */
- case CEPH_MDS_STATE_DNE: return "down:dne";
- case CEPH_MDS_STATE_STOPPED: return "down:stopped";
- /* up and out */
- case CEPH_MDS_STATE_BOOT: return "up:boot";
- case CEPH_MDS_STATE_STANDBY: return "up:standby";
- case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
- case CEPH_MDS_STATE_CREATING: return "up:creating";
- case CEPH_MDS_STATE_STARTING: return "up:starting";
- /* up and in */
- case CEPH_MDS_STATE_REPLAY: return "up:replay";
- case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
- case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
- case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
- case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
- case CEPH_MDS_STATE_ACTIVE: return "up:active";
- case CEPH_MDS_STATE_STOPPING: return "up:stopping";
- }
- return "???";
-}
-
-const char *ceph_session_op_name(int op)
-{
- switch (op) {
- case CEPH_SESSION_REQUEST_OPEN: return "request_open";
- case CEPH_SESSION_OPEN: return "open";
- case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
- case CEPH_SESSION_CLOSE: return "close";
- case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
- case CEPH_SESSION_RENEWCAPS: return "renewcaps";
- case CEPH_SESSION_STALE: return "stale";
- case CEPH_SESSION_RECALL_STATE: return "recall_state";
- }
- return "???";
-}
-
-const char *ceph_mds_op_name(int op)
-{
- switch (op) {
- case CEPH_MDS_OP_LOOKUP: return "lookup";
- case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
- case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
- case CEPH_MDS_OP_GETATTR: return "getattr";
- case CEPH_MDS_OP_SETXATTR: return "setxattr";
- case CEPH_MDS_OP_SETATTR: return "setattr";
- case CEPH_MDS_OP_RMXATTR: return "rmxattr";
- case CEPH_MDS_OP_READDIR: return "readdir";
- case CEPH_MDS_OP_MKNOD: return "mknod";
- case CEPH_MDS_OP_LINK: return "link";
- case CEPH_MDS_OP_UNLINK: return "unlink";
- case CEPH_MDS_OP_RENAME: return "rename";
- case CEPH_MDS_OP_MKDIR: return "mkdir";
- case CEPH_MDS_OP_RMDIR: return "rmdir";
- case CEPH_MDS_OP_SYMLINK: return "symlink";
- case CEPH_MDS_OP_CREATE: return "create";
- case CEPH_MDS_OP_OPEN: return "open";
- case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
- case CEPH_MDS_OP_LSSNAP: return "lssnap";
- case CEPH_MDS_OP_MKSNAP: return "mksnap";
- case CEPH_MDS_OP_RMSNAP: return "rmsnap";
- case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
- case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
- }
- return "???";
-}
-
-const char *ceph_cap_op_name(int op)
-{
- switch (op) {
- case CEPH_CAP_OP_GRANT: return "grant";
- case CEPH_CAP_OP_REVOKE: return "revoke";
- case CEPH_CAP_OP_TRUNC: return "trunc";
- case CEPH_CAP_OP_EXPORT: return "export";
- case CEPH_CAP_OP_IMPORT: return "import";
- case CEPH_CAP_OP_UPDATE: return "update";
- case CEPH_CAP_OP_DROP: return "drop";
- case CEPH_CAP_OP_FLUSH: return "flush";
- case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
- case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
- case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
- case CEPH_CAP_OP_RELEASE: return "release";
- case CEPH_CAP_OP_RENEW: return "renew";
- }
- return "???";
-}
-
-const char *ceph_lease_op_name(int o)
-{
- switch (o) {
- case CEPH_MDS_LEASE_REVOKE: return "revoke";
- case CEPH_MDS_LEASE_RELEASE: return "release";
- case CEPH_MDS_LEASE_RENEW: return "renew";
- case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
- }
- return "???";
-}
-
-const char *ceph_snap_op_name(int o)
-{
- switch (o) {
- case CEPH_SNAP_OP_UPDATE: return "update";
- case CEPH_SNAP_OP_CREATE: return "create";
- case CEPH_SNAP_OP_DESTROY: return "destroy";
- case CEPH_SNAP_OP_SPLIT: return "split";
- }
- return "???";
-}
diff --git a/ANDROID_3.4.5/fs/ceph/super.c b/ANDROID_3.4.5/fs/ceph/super.c
deleted file mode 100644
index 1e67dd73..00000000
--- a/ANDROID_3.4.5/fs/ceph/super.c
+++ /dev/null
@@ -1,972 +0,0 @@
-
-#include <linux/ceph/ceph_debug.h>
-
-#include <linux/backing-dev.h>
-#include <linux/ctype.h>
-#include <linux/fs.h>
-#include <linux/inet.h>
-#include <linux/in6.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/parser.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/statfs.h>
-#include <linux/string.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-#include <linux/ceph/decode.h>
-#include <linux/ceph/mon_client.h>
-#include <linux/ceph/auth.h>
-#include <linux/ceph/debugfs.h>
-
-/*
- * Ceph superblock operations
- *
- * Handle the basics of mounting, unmounting.
- */
-
-/*
- * super ops
- */
-static void ceph_put_super(struct super_block *s)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
-
- dout("put_super\n");
- ceph_mdsc_close_sessions(fsc->mdsc);
-
- /*
- * ensure we release the bdi before put_anon_super releases
- * the device name.
- */
- if (s->s_bdi == &fsc->backing_dev_info) {
- bdi_unregister(&fsc->backing_dev_info);
- s->s_bdi = NULL;
- }
-
- return;
-}
-
-static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
- struct ceph_monmap *monmap = fsc->client->monc.monmap;
- struct ceph_statfs st;
- u64 fsid;
- int err;
-
- dout("statfs\n");
- err = ceph_monc_do_statfs(&fsc->client->monc, &st);
- if (err < 0)
- return err;
-
- /* fill in kstatfs */
- buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
-
- /*
- * express utilization in terms of large blocks to avoid
- * overflow on 32-bit machines.
- */
- buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
- buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
- buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
- buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
-
- buf->f_files = le64_to_cpu(st.num_objects);
- buf->f_ffree = -1;
- buf->f_namelen = NAME_MAX;
- buf->f_frsize = PAGE_CACHE_SIZE;
-
- /* leave fsid little-endian, regardless of host endianness */
- fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
- buf->f_fsid.val[0] = fsid & 0xffffffff;
- buf->f_fsid.val[1] = fsid >> 32;
-
- return 0;
-}
-
-
-static int ceph_sync_fs(struct super_block *sb, int wait)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
-
- if (!wait) {
- dout("sync_fs (non-blocking)\n");
- ceph_flush_dirty_caps(fsc->mdsc);
- dout("sync_fs (non-blocking) done\n");
- return 0;
- }
-
- dout("sync_fs (blocking)\n");
- ceph_osdc_sync(&fsc->client->osdc);
- ceph_mdsc_sync(fsc->mdsc);
- dout("sync_fs (blocking) done\n");
- return 0;
-}
-
-/*
- * mount options
- */
-enum {
- Opt_wsize,
- Opt_rsize,
- Opt_rasize,
- Opt_caps_wanted_delay_min,
- Opt_caps_wanted_delay_max,
- Opt_cap_release_safety,
- Opt_readdir_max_entries,
- Opt_readdir_max_bytes,
- Opt_congestion_kb,
- Opt_last_int,
- /* int args above */
- Opt_snapdirname,
- Opt_last_string,
- /* string args above */
- Opt_dirstat,
- Opt_nodirstat,
- Opt_rbytes,
- Opt_norbytes,
- Opt_asyncreaddir,
- Opt_noasyncreaddir,
- Opt_dcache,
- Opt_nodcache,
- Opt_ino32,
- Opt_noino32,
-};
-
-static match_table_t fsopt_tokens = {
- {Opt_wsize, "wsize=%d"},
- {Opt_rsize, "rsize=%d"},
- {Opt_rasize, "rasize=%d"},
- {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
- {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
- {Opt_cap_release_safety, "cap_release_safety=%d"},
- {Opt_readdir_max_entries, "readdir_max_entries=%d"},
- {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
- {Opt_congestion_kb, "write_congestion_kb=%d"},
- /* int args above */
- {Opt_snapdirname, "snapdirname=%s"},
- /* string args above */
- {Opt_dirstat, "dirstat"},
- {Opt_nodirstat, "nodirstat"},
- {Opt_rbytes, "rbytes"},
- {Opt_norbytes, "norbytes"},
- {Opt_asyncreaddir, "asyncreaddir"},
- {Opt_noasyncreaddir, "noasyncreaddir"},
- {Opt_dcache, "dcache"},
- {Opt_nodcache, "nodcache"},
- {Opt_ino32, "ino32"},
- {Opt_noino32, "noino32"},
- {-1, NULL}
-};
-
-static int parse_fsopt_token(char *c, void *private)
-{
- struct ceph_mount_options *fsopt = private;
- substring_t argstr[MAX_OPT_ARGS];
- int token, intval, ret;
-
- token = match_token((char *)c, fsopt_tokens, argstr);
- if (token < 0)
- return -EINVAL;
-
- if (token < Opt_last_int) {
- ret = match_int(&argstr[0], &intval);
- if (ret < 0) {
- pr_err("bad mount option arg (not int) "
- "at '%s'\n", c);
- return ret;
- }
- dout("got int token %d val %d\n", token, intval);
- } else if (token > Opt_last_int && token < Opt_last_string) {
- dout("got string token %d val %s\n", token,
- argstr[0].from);
- } else {
- dout("got token %d\n", token);
- }
-
- switch (token) {
- case Opt_snapdirname:
- kfree(fsopt->snapdir_name);
- fsopt->snapdir_name = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- if (!fsopt->snapdir_name)
- return -ENOMEM;
- break;
-
- /* misc */
- case Opt_wsize:
- fsopt->wsize = intval;
- break;
- case Opt_rsize:
- fsopt->rsize = intval;
- break;
- case Opt_rasize:
- fsopt->rasize = intval;
- break;
- case Opt_caps_wanted_delay_min:
- fsopt->caps_wanted_delay_min = intval;
- break;
- case Opt_caps_wanted_delay_max:
- fsopt->caps_wanted_delay_max = intval;
- break;
- case Opt_readdir_max_entries:
- fsopt->max_readdir = intval;
- break;
- case Opt_readdir_max_bytes:
- fsopt->max_readdir_bytes = intval;
- break;
- case Opt_congestion_kb:
- fsopt->congestion_kb = intval;
- break;
- case Opt_dirstat:
- fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
- break;
- case Opt_nodirstat:
- fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
- break;
- case Opt_rbytes:
- fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
- break;
- case Opt_norbytes:
- fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
- break;
- case Opt_asyncreaddir:
- fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
- break;
- case Opt_noasyncreaddir:
- fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
- break;
- case Opt_dcache:
- fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
- break;
- case Opt_nodcache:
- fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
- break;
- case Opt_ino32:
- fsopt->flags |= CEPH_MOUNT_OPT_INO32;
- break;
- case Opt_noino32:
- fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
- break;
- default:
- BUG_ON(token);
- }
- return 0;
-}
-
-static void destroy_mount_options(struct ceph_mount_options *args)
-{
- dout("destroy_mount_options %p\n", args);
- kfree(args->snapdir_name);
- kfree(args);
-}
-
-static int strcmp_null(const char *s1, const char *s2)
-{
- if (!s1 && !s2)
- return 0;
- if (s1 && !s2)
- return -1;
- if (!s1 && s2)
- return 1;
- return strcmp(s1, s2);
-}
-
-static int compare_mount_options(struct ceph_mount_options *new_fsopt,
- struct ceph_options *new_opt,
- struct ceph_fs_client *fsc)
-{
- struct ceph_mount_options *fsopt1 = new_fsopt;
- struct ceph_mount_options *fsopt2 = fsc->mount_options;
- int ofs = offsetof(struct ceph_mount_options, snapdir_name);
- int ret;
-
- ret = memcmp(fsopt1, fsopt2, ofs);
- if (ret)
- return ret;
-
- ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
- if (ret)
- return ret;
-
- return ceph_compare_options(new_opt, fsc->client);
-}
-
-static int parse_mount_options(struct ceph_mount_options **pfsopt,
- struct ceph_options **popt,
- int flags, char *options,
- const char *dev_name,
- const char **path)
-{
- struct ceph_mount_options *fsopt;
- const char *dev_name_end;
- int err = -ENOMEM;
-
- fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
- if (!fsopt)
- return -ENOMEM;
-
- dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
-
- fsopt->sb_flags = flags;
- fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
-
- fsopt->rsize = CEPH_RSIZE_DEFAULT;
- fsopt->rasize = CEPH_RASIZE_DEFAULT;
- fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
- fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
- fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
- fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
- fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
- fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
- fsopt->congestion_kb = default_congestion_kb();
-
- /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
- err = -EINVAL;
- if (!dev_name)
- goto out;
- *path = strstr(dev_name, ":/");
- if (*path == NULL) {
- pr_err("device name is missing path (no :/ in %s)\n",
- dev_name);
- goto out;
- }
- dev_name_end = *path;
- dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
-
- /* path on server */
- *path += 2;
- dout("server path '%s'\n", *path);
-
- *popt = ceph_parse_options(options, dev_name, dev_name_end,
- parse_fsopt_token, (void *)fsopt);
- if (IS_ERR(*popt)) {
- err = PTR_ERR(*popt);
- goto out;
- }
-
- /* success */
- *pfsopt = fsopt;
- return 0;
-
-out:
- destroy_mount_options(fsopt);
- return err;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @root: root of that (sub)tree
- */
-static int ceph_show_options(struct seq_file *m, struct dentry *root)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
- struct ceph_mount_options *fsopt = fsc->mount_options;
- struct ceph_options *opt = fsc->client->options;
-
- if (opt->flags & CEPH_OPT_FSID)
- seq_printf(m, ",fsid=%pU", &opt->fsid);
- if (opt->flags & CEPH_OPT_NOSHARE)
- seq_puts(m, ",noshare");
- if (opt->flags & CEPH_OPT_NOCRC)
- seq_puts(m, ",nocrc");
-
- if (opt->name)
- seq_printf(m, ",name=%s", opt->name);
- if (opt->key)
- seq_puts(m, ",secret=<hidden>");
-
- if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
- seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
- if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
- seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
- if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
- seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
- if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
- seq_printf(m, ",osdkeepalivetimeout=%d",
- opt->osd_keepalive_timeout);
-
- if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
- seq_puts(m, ",dirstat");
- if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
- seq_puts(m, ",norbytes");
- if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
- seq_puts(m, ",noasyncreaddir");
- if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
- seq_puts(m, ",dcache");
- else
- seq_puts(m, ",nodcache");
-
- if (fsopt->wsize)
- seq_printf(m, ",wsize=%d", fsopt->wsize);
- if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
- seq_printf(m, ",rsize=%d", fsopt->rsize);
- if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
- seq_printf(m, ",rasize=%d", fsopt->rasize);
- if (fsopt->congestion_kb != default_congestion_kb())
- seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
- if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_min=%d",
- fsopt->caps_wanted_delay_min);
- if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_max=%d",
- fsopt->caps_wanted_delay_max);
- if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
- seq_printf(m, ",cap_release_safety=%d",
- fsopt->cap_release_safety);
- if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
- seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
- if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
- seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
- if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
- seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
- return 0;
-}
-
-/*
- * handle any mon messages the standard library doesn't understand.
- * return error if we don't either.
- */
-static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
-{
- struct ceph_fs_client *fsc = client->private;
- int type = le16_to_cpu(msg->hdr.type);
-
- switch (type) {
- case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(fsc->mdsc, msg);
- return 0;
-
- default:
- return -1;
- }
-}
-
-/*
- * create a new fs client
- */
-static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
- struct ceph_options *opt)
-{
- struct ceph_fs_client *fsc;
- const unsigned supported_features =
- CEPH_FEATURE_FLOCK |
- CEPH_FEATURE_DIRLAYOUTHASH;
- const unsigned required_features = 0;
- int err = -ENOMEM;
-
- fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
- if (!fsc)
- return ERR_PTR(-ENOMEM);
-
- fsc->client = ceph_create_client(opt, fsc, supported_features,
- required_features);
- if (IS_ERR(fsc->client)) {
- err = PTR_ERR(fsc->client);
- goto fail;
- }
- fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->monc.want_mdsmap = 1;
-
- fsc->mount_options = fsopt;
-
- fsc->sb = NULL;
- fsc->mount_state = CEPH_MOUNT_MOUNTING;
-
- atomic_long_set(&fsc->writeback_count, 0);
-
- err = bdi_init(&fsc->backing_dev_info);
- if (err < 0)
- goto fail_client;
-
- err = -ENOMEM;
- /*
- * The number of concurrent works can be high but they don't need
- * to be processed in parallel, limit concurrency.
- */
- fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
- if (fsc->wb_wq == NULL)
- goto fail_bdi;
- fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
- if (fsc->pg_inv_wq == NULL)
- goto fail_wb_wq;
- fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
- if (fsc->trunc_wq == NULL)
- goto fail_pg_inv_wq;
-
- /* set up mempools */
- err = -ENOMEM;
- fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
- fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
- if (!fsc->wb_pagevec_pool)
- goto fail_trunc_wq;
-
- /* caps */
- fsc->min_caps = fsopt->max_readdir;
-
- return fsc;
-
-fail_trunc_wq:
- destroy_workqueue(fsc->trunc_wq);
-fail_pg_inv_wq:
- destroy_workqueue(fsc->pg_inv_wq);
-fail_wb_wq:
- destroy_workqueue(fsc->wb_wq);
-fail_bdi:
- bdi_destroy(&fsc->backing_dev_info);
-fail_client:
- ceph_destroy_client(fsc->client);
-fail:
- kfree(fsc);
- return ERR_PTR(err);
-}
-
-static void destroy_fs_client(struct ceph_fs_client *fsc)
-{
- dout("destroy_fs_client %p\n", fsc);
-
- destroy_workqueue(fsc->wb_wq);
- destroy_workqueue(fsc->pg_inv_wq);
- destroy_workqueue(fsc->trunc_wq);
-
- bdi_destroy(&fsc->backing_dev_info);
-
- mempool_destroy(fsc->wb_pagevec_pool);
-
- destroy_mount_options(fsc->mount_options);
-
- ceph_fs_debugfs_cleanup(fsc);
-
- ceph_destroy_client(fsc->client);
-
- kfree(fsc);
- dout("destroy_fs_client %p done\n", fsc);
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
- struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
- ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
- sizeof(struct ceph_inode_info),
- __alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
- ceph_inode_init_once);
- if (ceph_inode_cachep == NULL)
- return -ENOMEM;
-
- ceph_cap_cachep = KMEM_CACHE(ceph_cap,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_cap_cachep == NULL)
- goto bad_cap;
-
- ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_dentry_cachep == NULL)
- goto bad_dentry;
-
- ceph_file_cachep = KMEM_CACHE(ceph_file_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_file_cachep == NULL)
- goto bad_file;
-
- return 0;
-
-bad_file:
- kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
- kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
- kmem_cache_destroy(ceph_inode_cachep);
- return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
- kmem_cache_destroy(ceph_inode_cachep);
- kmem_cache_destroy(ceph_cap_cachep);
- kmem_cache_destroy(ceph_dentry_cachep);
- kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount. Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
-
- dout("ceph_umount_begin - starting forced umount\n");
- if (!fsc)
- return;
- fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
- return;
-}
-
-static const struct super_operations ceph_super_ops = {
- .alloc_inode = ceph_alloc_inode,
- .destroy_inode = ceph_destroy_inode,
- .write_inode = ceph_write_inode,
- .sync_fs = ceph_sync_fs,
- .put_super = ceph_put_super,
- .show_options = ceph_show_options,
- .statfs = ceph_statfs,
- .umount_begin = ceph_umount_begin,
-};
-
-/*
- * Bootstrap mount by opening the root directory. Note the mount
- * @started time from caller, and time out if this takes too long.
- */
-static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
- const char *path,
- unsigned long started)
-{
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_mds_request *req = NULL;
- int err;
- struct dentry *root;
-
- /* open dir */
- dout("open_root_inode opening '%s'\n", path);
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_CAST(req);
- req->r_path1 = kstrdup(path, GFP_NOFS);
- req->r_ino1.ino = CEPH_INO_ROOT;
- req->r_ino1.snap = CEPH_NOSNAP;
- req->r_started = started;
- req->r_timeout = fsc->client->options->mount_timeout * HZ;
- req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
- req->r_num_caps = 2;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- if (err == 0) {
- struct inode *inode = req->r_target_inode;
- req->r_target_inode = NULL;
- dout("open_root_inode success\n");
- if (ceph_ino(inode) == CEPH_INO_ROOT &&
- fsc->sb->s_root == NULL) {
- root = d_make_root(inode);
- if (!root) {
- root = ERR_PTR(-ENOMEM);
- goto out;
- }
- } else {
- root = d_obtain_alias(inode);
- }
- ceph_init_dentry(root);
- dout("open_root_inode success, root dentry is %p\n", root);
- } else {
- root = ERR_PTR(err);
- }
-out:
- ceph_mdsc_put_request(req);
- return root;
-}
-
-
-
-
-/*
- * mount: join the ceph cluster, and open root directory.
- */
-static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
- const char *path)
-{
- int err;
- unsigned long started = jiffies; /* note the start time */
- struct dentry *root;
- int first = 0; /* first vfsmount for this super_block */
-
- dout("mount start\n");
- mutex_lock(&fsc->client->mount_mutex);
-
- err = __ceph_open_session(fsc->client, started);
- if (err < 0)
- goto out;
-
- dout("mount opening root\n");
- root = open_root_dentry(fsc, "", started);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
- if (fsc->sb->s_root) {
- dput(root);
- } else {
- fsc->sb->s_root = root;
- first = 1;
-
- err = ceph_fs_debugfs_init(fsc);
- if (err < 0)
- goto fail;
- }
-
- if (path[0] == 0) {
- dget(root);
- } else {
- dout("mount opening base mountpoint\n");
- root = open_root_dentry(fsc, path, started);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto fail;
- }
- }
-
- fsc->mount_state = CEPH_MOUNT_MOUNTED;
- dout("mount success\n");
- mutex_unlock(&fsc->client->mount_mutex);
- return root;
-
-out:
- mutex_unlock(&fsc->client->mount_mutex);
- return ERR_PTR(err);
-
-fail:
- if (first) {
- dput(fsc->sb->s_root);
- fsc->sb->s_root = NULL;
- }
- goto out;
-}
-
-static int ceph_set_super(struct super_block *s, void *data)
-{
- struct ceph_fs_client *fsc = data;
- int ret;
-
- dout("set_super %p data %p\n", s, data);
-
- s->s_flags = fsc->mount_options->sb_flags;
- s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
-
- s->s_fs_info = fsc;
- fsc->sb = s;
-
- s->s_op = &ceph_super_ops;
- s->s_export_op = &ceph_export_ops;
-
- s->s_time_gran = 1000; /* 1000 ns == 1 us */
-
- ret = set_anon_super(s, NULL); /* what is that second arg for? */
- if (ret != 0)
- goto fail;
-
- return ret;
-
-fail:
- s->s_fs_info = NULL;
- fsc->sb = NULL;
- return ret;
-}
-
-/*
- * share superblock if same fs AND options
- */
-static int ceph_compare_super(struct super_block *sb, void *data)
-{
- struct ceph_fs_client *new = data;
- struct ceph_mount_options *fsopt = new->mount_options;
- struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *other = ceph_sb_to_client(sb);
-
- dout("ceph_compare_super %p\n", sb);
-
- if (compare_mount_options(fsopt, opt, other)) {
- dout("monitor(s)/mount options don't match\n");
- return 0;
- }
- if ((opt->flags & CEPH_OPT_FSID) &&
- ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
- dout("fsid doesn't match\n");
- return 0;
- }
- if (fsopt->sb_flags != other->mount_options->sb_flags) {
- dout("flags differ\n");
- return 0;
- }
- return 1;
-}
-
-/*
- * construct our own bdi so we can control readahead, etc.
- */
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
-static int ceph_register_bdi(struct super_block *sb,
- struct ceph_fs_client *fsc)
-{
- int err;
-
- /* set ra_pages based on rasize mount option? */
- if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
- fsc->backing_dev_info.ra_pages =
- (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
- >> PAGE_SHIFT;
- else
- fsc->backing_dev_info.ra_pages =
- default_backing_dev_info.ra_pages;
-
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
- atomic_long_inc_return(&bdi_seq));
- if (!err)
- sb->s_bdi = &fsc->backing_dev_info;
- return err;
-}
-
-static struct dentry *ceph_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- struct super_block *sb;
- struct ceph_fs_client *fsc;
- struct dentry *res;
- int err;
- int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
- const char *path = NULL;
- struct ceph_mount_options *fsopt = NULL;
- struct ceph_options *opt = NULL;
-
- dout("ceph_mount\n");
- err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
- if (err < 0) {
- res = ERR_PTR(err);
- goto out_final;
- }
-
- /* create client (which we may/may not use) */
- fsc = create_fs_client(fsopt, opt);
- if (IS_ERR(fsc)) {
- res = ERR_CAST(fsc);
- destroy_mount_options(fsopt);
- ceph_destroy_options(opt);
- goto out_final;
- }
-
- err = ceph_mdsc_init(fsc);
- if (err < 0) {
- res = ERR_PTR(err);
- goto out;
- }
-
- if (ceph_test_opt(fsc->client, NOSHARE))
- compare_super = NULL;
- sb = sget(fs_type, compare_super, ceph_set_super, fsc);
- if (IS_ERR(sb)) {
- res = ERR_CAST(sb);
- goto out;
- }
-
- if (ceph_sb_to_client(sb) != fsc) {
- ceph_mdsc_destroy(fsc);
- destroy_fs_client(fsc);
- fsc = ceph_sb_to_client(sb);
- dout("get_sb got existing client %p\n", fsc);
- } else {
- dout("get_sb using new client %p\n", fsc);
- err = ceph_register_bdi(sb, fsc);
- if (err < 0) {
- res = ERR_PTR(err);
- goto out_splat;
- }
- }
-
- res = ceph_real_mount(fsc, path);
- if (IS_ERR(res))
- goto out_splat;
- dout("root %p inode %p ino %llx.%llx\n", res,
- res->d_inode, ceph_vinop(res->d_inode));
- return res;
-
-out_splat:
- ceph_mdsc_close_sessions(fsc->mdsc);
- deactivate_locked_super(sb);
- goto out_final;
-
-out:
- ceph_mdsc_destroy(fsc);
- destroy_fs_client(fsc);
-out_final:
- dout("ceph_mount fail %ld\n", PTR_ERR(res));
- return res;
-}
-
-static void ceph_kill_sb(struct super_block *s)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
- dout("kill_sb %p\n", s);
- ceph_mdsc_pre_umount(fsc->mdsc);
- kill_anon_super(s); /* will call put_super after sb is r/o */
- ceph_mdsc_destroy(fsc);
- destroy_fs_client(fsc);
-}
-
-static struct file_system_type ceph_fs_type = {
- .owner = THIS_MODULE,
- .name = "ceph",
- .mount = ceph_mount,
- .kill_sb = ceph_kill_sb,
- .fs_flags = FS_RENAME_DOES_D_MOVE,
-};
-
-#define _STRINGIFY(x) #x
-#define STRINGIFY(x) _STRINGIFY(x)
-
-static int __init init_ceph(void)
-{
- int ret = init_caches();
- if (ret)
- goto out;
-
- ceph_xattr_init();
- ret = register_filesystem(&ceph_fs_type);
- if (ret)
- goto out_icache;
-
- pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
-
- return 0;
-
-out_icache:
- ceph_xattr_exit();
- destroy_caches();
-out:
- return ret;
-}
-
-static void __exit exit_ceph(void)
-{
- dout("exit_ceph\n");
- unregister_filesystem(&ceph_fs_type);
- ceph_xattr_exit();
- destroy_caches();
-}
-
-module_init(init_ceph);
-module_exit(exit_ceph);
-
-MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
-MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
-MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
-MODULE_DESCRIPTION("Ceph filesystem for Linux");
-MODULE_LICENSE("GPL");
diff --git a/ANDROID_3.4.5/fs/ceph/super.h b/ANDROID_3.4.5/fs/ceph/super.h
deleted file mode 100644
index fc35036d..00000000
--- a/ANDROID_3.4.5/fs/ceph/super.h
+++ /dev/null
@@ -1,858 +0,0 @@
-#ifndef _FS_CEPH_SUPER_H
-#define _FS_CEPH_SUPER_H
-
-#include <linux/ceph/ceph_debug.h>
-
-#include <asm/unaligned.h>
-#include <linux/backing-dev.h>
-#include <linux/completion.h>
-#include <linux/exportfs.h>
-#include <linux/fs.h>
-#include <linux/mempool.h>
-#include <linux/pagemap.h>
-#include <linux/wait.h>
-#include <linux/writeback.h>
-#include <linux/slab.h>
-
-#include <linux/ceph/libceph.h>
-
-/* f_type in struct statfs */
-#define CEPH_SUPER_MAGIC 0x00c36400
-
-/* large granularity for statfs utilization stats to facilitate
- * large volume sizes on 32-bit machines. */
-#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
-#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
-
-#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
-#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
-#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
-#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
-
-#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
-
-#define ceph_set_mount_opt(fsc, opt) \
- (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
-#define ceph_test_mount_opt(fsc, opt) \
- (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-
-#define CEPH_RSIZE_DEFAULT 0 /* max read size */
-#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT 1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-
-struct ceph_mount_options {
- int flags;
- int sb_flags;
-
- int wsize; /* max write size */
- int rsize; /* max read size */
- int rasize; /* max readahead */
- int congestion_kb; /* max writeback in flight */
- int caps_wanted_delay_min, caps_wanted_delay_max;
- int cap_release_safety;
- int max_readdir; /* max readdir result (entires) */
- int max_readdir_bytes; /* max readdir result (bytes) */
-
- /*
- * everything above this point can be memcmp'd; everything below
- * is handled in compare_mount_options()
- */
-
- char *snapdir_name; /* default ".snap" */
-};
-
-struct ceph_fs_client {
- struct super_block *sb;
-
- struct ceph_mount_options *mount_options;
- struct ceph_client *client;
-
- unsigned long mount_state;
- int min_caps; /* min caps i added */
-
- struct ceph_mds_client *mdsc;
-
- /* writeback */
- mempool_t *wb_pagevec_pool;
- struct workqueue_struct *wb_wq;
- struct workqueue_struct *pg_inv_wq;
- struct workqueue_struct *trunc_wq;
- atomic_long_t writeback_count;
-
- struct backing_dev_info backing_dev_info;
-
-#ifdef CONFIG_DEBUG_FS
- struct dentry *debugfs_dentry_lru, *debugfs_caps;
- struct dentry *debugfs_congestion_kb;
- struct dentry *debugfs_bdi;
- struct dentry *debugfs_mdsc, *debugfs_mdsmap;
-#endif
-};
-
-
-/*
- * File i/o capability. This tracks shared state with the metadata
- * server that allows us to cache or writeback attributes or to read
- * and write data. For any given inode, we should have one or more
- * capabilities, one issued by each metadata server, and our
- * cumulative access is the OR of all issued capabilities.
- *
- * Each cap is referenced by the inode's i_caps rbtree and by per-mds
- * session capability lists.
- */
-struct ceph_cap {
- struct ceph_inode_info *ci;
- struct rb_node ci_node; /* per-ci cap tree */
- struct ceph_mds_session *session;
- struct list_head session_caps; /* per-session caplist */
- int mds;
- u64 cap_id; /* unique cap id (mds provided) */
- int issued; /* latest, from the mds */
- int implemented; /* implemented superset of issued (for revocation) */
- int mds_wanted;
- u32 seq, issue_seq, mseq;
- u32 cap_gen; /* active/stale cycle */
- unsigned long last_used;
- struct list_head caps_item;
-};
-
-#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
-#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
-
-/*
- * Snapped cap state that is pending flush to mds. When a snapshot occurs,
- * we first complete any in-process sync writes and writeback any dirty
- * data before flushing the snapped state (tracked here) back to the MDS.
- */
-struct ceph_cap_snap {
- atomic_t nref;
- struct ceph_inode_info *ci;
- struct list_head ci_item, flushing_item;
-
- u64 follows, flush_tid;
- int issued, dirty;
- struct ceph_snap_context *context;
-
- umode_t mode;
- uid_t uid;
- gid_t gid;
-
- struct ceph_buffer *xattr_blob;
- u64 xattr_version;
-
- u64 size;
- struct timespec mtime, atime, ctime;
- u64 time_warp_seq;
- int writing; /* a sync write is still in progress */
- int dirty_pages; /* dirty pages awaiting writeback */
-};
-
-static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
-{
- if (atomic_dec_and_test(&capsnap->nref)) {
- if (capsnap->xattr_blob)
- ceph_buffer_put(capsnap->xattr_blob);
- kfree(capsnap);
- }
-}
-
-/*
- * The frag tree describes how a directory is fragmented, potentially across
- * multiple metadata servers. It is also used to indicate points where
- * metadata authority is delegated, and whether/where metadata is replicated.
- *
- * A _leaf_ frag will be present in the i_fragtree IFF there is
- * delegation info. That is, if mds >= 0 || ndist > 0.
- */
-#define CEPH_MAX_DIRFRAG_REP 4
-
-struct ceph_inode_frag {
- struct rb_node node;
-
- /* fragtree state */
- u32 frag;
- int split_by; /* i.e. 2^(split_by) children */
-
- /* delegation and replication info */
- int mds; /* -1 if same authority as parent */
- int ndist; /* >0 if replicated */
- int dist[CEPH_MAX_DIRFRAG_REP];
-};
-
-/*
- * We cache inode xattrs as an encoded blob until they are first used,
- * at which point we parse them into an rbtree.
- */
-struct ceph_inode_xattr {
- struct rb_node node;
-
- const char *name;
- int name_len;
- const char *val;
- int val_len;
- int dirty;
-
- int should_free_name;
- int should_free_val;
-};
-
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
- unsigned long flags;
- struct ceph_mds_session *lease_session;
- u32 lease_gen, lease_shared_gen;
- u32 lease_seq;
- unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
- u64 time;
- u64 offset;
-};
-
-/*
- * dentry flags
- *
- * The locking for D_COMPLETE is a bit odd:
- * - we can clear it at almost any time (see ceph_d_prune)
- * - it is only meaningful if:
- * - we hold dir inode i_ceph_lock
- * - we hold dir FILE_SHARED caps
- * - the dentry D_COMPLETE is set
- */
-#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
-
-struct ceph_inode_xattrs_info {
- /*
- * (still encoded) xattr blob. we avoid the overhead of parsing
- * this until someone actually calls getxattr, etc.
- *
- * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
- * NULL means we don't know.
- */
- struct ceph_buffer *blob, *prealloc_blob;
-
- struct rb_root index;
- bool dirty;
- int count;
- int names_size;
- int vals_size;
- u64 version, index_version;
-};
-
-/*
- * Ceph inode.
- */
-struct ceph_inode_info {
- struct ceph_vino i_vino; /* ceph ino + snap */
-
- spinlock_t i_ceph_lock;
-
- u64 i_version;
- u32 i_time_warp_seq;
-
- unsigned i_ceph_flags;
- unsigned long i_release_count;
-
- struct ceph_dir_layout i_dir_layout;
- struct ceph_file_layout i_layout;
- char *i_symlink;
-
- /* for dirs */
- struct timespec i_rctime;
- u64 i_rbytes, i_rfiles, i_rsubdirs;
- u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */
-
- struct rb_root i_fragtree;
- struct mutex i_fragtree_mutex;
-
- struct ceph_inode_xattrs_info i_xattrs;
-
- /* capabilities. protected _both_ by i_ceph_lock and cap->session's
- * s_mutex. */
- struct rb_root i_caps; /* cap list */
- struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
- unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
- struct list_head i_dirty_item, i_flushing_item;
- u64 i_cap_flush_seq;
- /* we need to track cap writeback on a per-cap-bit basis, to allow
- * overlapping, pipelined cap flushes to the mds. we can probably
- * reduce the tid to 8 bits if we're concerned about inode size. */
- u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
- wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
- unsigned long i_hold_caps_min; /* jiffies */
- unsigned long i_hold_caps_max; /* jiffies */
- struct list_head i_cap_delay_list; /* for delayed cap release to mds */
- int i_cap_exporting_mds; /* to handle cap migration between */
- unsigned i_cap_exporting_mseq; /* mds's. */
- unsigned i_cap_exporting_issued;
- struct ceph_cap_reservation i_cap_migration_resv;
- struct list_head i_cap_snaps; /* snapped state pending flush to mds */
- struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
- dirty|flushing caps */
- unsigned i_snap_caps; /* cap bits for snapped files */
-
- int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
-
- u32 i_truncate_seq; /* last truncate to smaller size */
- u64 i_truncate_size; /* and the size we last truncated down to */
- int i_truncate_pending; /* still need to call vmtruncate */
-
- u64 i_max_size; /* max file size authorized by mds */
- u64 i_reported_size; /* (max_)size reported to or requested of mds */
- u64 i_wanted_max_size; /* offset we'd like to write too */
- u64 i_requested_max_size; /* max_size we've requested */
-
- /* held references to caps */
- int i_pin_ref;
- int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
- int i_wrbuffer_ref, i_wrbuffer_ref_head;
- u32 i_shared_gen; /* increment each time we get FILE_SHARED */
- u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
- u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
-
- struct list_head i_unsafe_writes; /* uncommitted sync writes */
- struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
- spinlock_t i_unsafe_lock;
-
- struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
- int i_snap_realm_counter; /* snap realm (if caps) */
- struct list_head i_snap_realm_item;
- struct list_head i_snap_flush_item;
-
- struct work_struct i_wb_work; /* writeback work */
- struct work_struct i_pg_inv_work; /* page invalidation work */
-
- struct work_struct i_vmtruncate_work;
-
- struct inode vfs_inode; /* at end */
-};
-
-static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
-{
- return container_of(inode, struct ceph_inode_info, vfs_inode);
-}
-
-static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
-{
- return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
-}
-
-static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
-{
- return (struct ceph_fs_client *)sb->s_fs_info;
-}
-
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino;
-}
-
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * i_ino (kernel inode) st_ino (userspace)
- * i386 32 32
- * x86_64+ino32 64 32
- * x86_64 64 64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
-{
- u32 ino = vino & 0xffffffff;
- ino ^= vino >> 32;
- if (!ino)
- ino = 2;
- return ino;
-}
-
-/*
- * kernel i_ino value
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
-#if BITS_PER_LONG == 32
- return ceph_ino_to_ino32(vino.ino);
-#else
- return (ino_t)vino.ino;
-#endif
-}
-
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
- ino = ceph_ino_to_ino32(ino);
- return ino;
-}
-#endif
-
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
- struct ceph_vino *pvino = (struct ceph_vino *)data;
- struct ceph_inode_info *ci = ceph_inode(inode);
- return ci->i_vino.ino == pvino->ino &&
- ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
- struct ceph_vino vino)
-{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
-/*
- * Ceph inode.
- */
-#define CEPH_I_NODELAY 4 /* do not delay cap release */
-#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
-
-static inline void ceph_i_clear(struct inode *inode, unsigned mask)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- spin_lock(&ci->i_ceph_lock);
- ci->i_ceph_flags &= ~mask;
- spin_unlock(&ci->i_ceph_lock);
-}
-
-static inline void ceph_i_set(struct inode *inode, unsigned mask)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- spin_lock(&ci->i_ceph_lock);
- ci->i_ceph_flags |= mask;
- spin_unlock(&ci->i_ceph_lock);
-}
-
-static inline bool ceph_i_test(struct inode *inode, unsigned mask)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- bool r;
-
- spin_lock(&ci->i_ceph_lock);
- r = (ci->i_ceph_flags & mask) == mask;
- spin_unlock(&ci->i_ceph_lock);
- return r;
-}
-
-
-/* find a specific frag @f */
-extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
- u32 f);
-
-/*
- * choose fragment for value @v. copy frag content to pfrag, if leaf
- * exists
- */
-extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found);
-
-static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
-{
- return (struct ceph_dentry_info *)dentry->d_fsdata;
-}
-
-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
-{
- return ((loff_t)frag << 32) | (loff_t)off;
-}
-
-/*
- * set/clear directory D_COMPLETE flag
- */
-void ceph_dir_set_complete(struct inode *inode);
-void ceph_dir_clear_complete(struct inode *inode);
-bool ceph_dir_test_complete(struct inode *inode);
-
-/*
- * caps helpers
- */
-static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
-{
- return !RB_EMPTY_ROOT(&ci->i_caps);
-}
-
-extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
-extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
-extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
- struct ceph_cap *cap);
-
-static inline int ceph_caps_issued(struct ceph_inode_info *ci)
-{
- int issued;
- spin_lock(&ci->i_ceph_lock);
- issued = __ceph_caps_issued(ci, NULL);
- spin_unlock(&ci->i_ceph_lock);
- return issued;
-}
-
-static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
- int touch)
-{
- int r;
- spin_lock(&ci->i_ceph_lock);
- r = __ceph_caps_issued_mask(ci, mask, touch);
- spin_unlock(&ci->i_ceph_lock);
- return r;
-}
-
-static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
-{
- return ci->i_dirty_caps | ci->i_flushing_caps;
-}
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
-
-extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
-extern int __ceph_caps_used(struct ceph_inode_info *ci);
-
-extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
-
-/*
- * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
- */
-static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
-{
- int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (w & CEPH_CAP_FILE_BUFFER)
- w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
- return w;
-}
-
-/* what the mds thinks we want */
-extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
-
-extern void ceph_caps_init(struct ceph_mds_client *mdsc);
-extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx, int need);
-extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_fs_client *client,
- int *total, int *avail, int *used,
- int *reserved, int *min);
-
-
-
-/*
- * we keep buffered readdir results attached to file->private_data
- */
-#define CEPH_F_SYNC 1
-#define CEPH_F_ATEND 2
-
-struct ceph_file_info {
- short fmode; /* initialized on open */
- short flags; /* CEPH_F_* */
-
- /* readdir: position within the dir */
- u32 frag;
- struct ceph_mds_request *last_readdir;
-
- /* readdir: position within a frag */
- unsigned offset; /* offset of last chunk, adjusted for . and .. */
- u64 next_offset; /* offset of next chunk (last_name's + 1) */
- char *last_name; /* last entry in previous chunk */
- struct dentry *dentry; /* next dentry (for dcache readdir) */
- unsigned long dir_release_count;
-
- /* used for -o dirstat read() on directory thing */
- char *dir_info;
- int dir_info_len;
-};
-
-
-
-/*
- * A "snap realm" describes a subset of the file hierarchy sharing
- * the same set of snapshots that apply to it. The realms themselves
- * are organized into a hierarchy, such that children inherit (some of)
- * the snapshots of their parents.
- *
- * All inodes within the realm that have capabilities are linked into a
- * per-realm list.
- */
-struct ceph_snap_realm {
- u64 ino;
- atomic_t nref;
- struct rb_node node;
-
- u64 created, seq;
- u64 parent_ino;
- u64 parent_since; /* snapid when our current parent became so */
-
- u64 *prior_parent_snaps; /* snaps inherited from any parents we */
- int num_prior_parent_snaps; /* had prior to parent_since */
- u64 *snaps; /* snaps specific to this realm */
- int num_snaps;
-
- struct ceph_snap_realm *parent;
- struct list_head children; /* list of child realms */
- struct list_head child_item;
-
- struct list_head empty_item; /* if i have ref==0 */
-
- struct list_head dirty_item; /* if realm needs new context */
-
- /* the current set of snaps for this realm */
- struct ceph_snap_context *cached_context;
-
- struct list_head inodes_with_caps;
- spinlock_t inodes_with_caps_lock;
-};
-
-static inline int default_congestion_kb(void)
-{
- int congestion_kb;
-
- /*
- * Copied from NFS
- *
- * congestion size, scale with available memory.
- *
- * 64MB: 8192k
- * 128MB: 11585k
- * 256MB: 16384k
- * 512MB: 23170k
- * 1GB: 32768k
- * 2GB: 46340k
- * 4GB: 65536k
- * 8GB: 92681k
- * 16GB: 131072k
- *
- * This allows larger machines to have larger/more transfers.
- * Limit the default to 256M
- */
- congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
- if (congestion_kb > 256*1024)
- congestion_kb = 256*1024;
-
- return congestion_kb;
-}
-
-
-
-/* snap.c */
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
- u64 ino);
-extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm);
-extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm);
-extern int ceph_update_snap_trace(struct ceph_mds_client *m,
- void *p, void *e, bool deletion);
-extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct ceph_msg *msg);
-extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
-extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
- struct ceph_cap_snap *capsnap);
-extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
-
-/*
- * a cap_snap is "pending" if it is still awaiting an in-progress
- * sync write (that may/may not still update size, mtime, etc.).
- */
-static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
-{
- return !list_empty(&ci->i_cap_snaps) &&
- list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
- ci_item)->writing;
-}
-
-/* inode.c */
-extern const struct inode_operations ceph_file_iops;
-
-extern struct inode *ceph_alloc_inode(struct super_block *sb);
-extern void ceph_destroy_inode(struct inode *inode);
-
-extern struct inode *ceph_get_inode(struct super_block *sb,
- struct ceph_vino vino);
-extern struct inode *ceph_get_snapdir(struct inode *parent);
-extern int ceph_fill_file_size(struct inode *inode, int issued,
- u32 truncate_seq, u64 truncate_size, u64 size);
-extern void ceph_fill_file_time(struct inode *inode, int issued,
- u64 time_warp_seq, struct timespec *ctime,
- struct timespec *mtime, struct timespec *atime);
-extern int ceph_fill_trace(struct super_block *sb,
- struct ceph_mds_request *req,
- struct ceph_mds_session *session);
-extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
- struct ceph_mds_session *session);
-
-extern int ceph_inode_holds_cap(struct inode *inode, int mask);
-
-extern int ceph_inode_set_size(struct inode *inode, loff_t size);
-extern void __ceph_do_pending_vmtruncate(struct inode *inode);
-extern void ceph_queue_vmtruncate(struct inode *inode);
-
-extern void ceph_queue_invalidate(struct inode *inode);
-extern void ceph_queue_writeback(struct inode *inode);
-
-extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask);
-extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
-extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat);
-
-/* xattr.c */
-extern int ceph_setxattr(struct dentry *, const char *, const void *,
- size_t, int);
-extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
-extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
-extern int ceph_removexattr(struct dentry *, const char *);
-extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
-extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-extern void __init ceph_xattr_init(void);
-extern void ceph_xattr_exit(void);
-
-/* caps.c */
-extern const char *ceph_cap_string(int c);
-extern void ceph_handle_caps(struct ceph_mds_session *session,
- struct ceph_msg *msg);
-extern int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned cap, unsigned seq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap);
-static inline void ceph_remove_cap(struct ceph_cap *cap)
-{
- spin_lock(&cap->ci->i_ceph_lock);
- __ceph_remove_cap(cap);
- spin_unlock(&cap->ci->i_ceph_lock);
-}
-extern void ceph_put_cap(struct ceph_mds_client *mdsc,
- struct ceph_cap *cap);
-
-extern void ceph_queue_caps_release(struct inode *inode);
-extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
-extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
- int datasync);
-extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
- int mds);
-extern int ceph_get_cap_mds(struct inode *inode);
-extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
-extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
-extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
- struct ceph_snap_context *snapc);
-extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int again);
-extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
-extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
-
-extern int ceph_encode_inode_release(void **p, struct inode *inode,
- int mds, int drop, int unless, int force);
-extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
- int mds, int drop, int unless);
-
-extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff);
-
-/* for counting open files by mode */
-static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
-{
- ci->i_nr_by_mode[mode]++;
-}
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
-
-/* addr.c */
-extern const struct address_space_operations ceph_aops;
-extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
-
-/* file.c */
-extern const struct file_operations ceph_file_fops;
-extern const struct address_space_operations ceph_aops;
-extern int ceph_copy_to_page_vector(struct page **pages,
- const char *data,
- loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
- char *data,
- loff_t off, size_t len);
-extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
-extern int ceph_open(struct inode *inode, struct file *file);
-extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd, int mode,
- int locked_dir);
-extern int ceph_release(struct inode *inode, struct file *filp);
-
-/* dir.c */
-extern const struct file_operations ceph_dir_fops;
-extern const struct inode_operations ceph_dir_iops;
-extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
- ceph_snapdir_dentry_ops;
-
-extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
-extern int ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err);
-extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
- struct dentry *dentry, int err);
-
-extern void ceph_dentry_lru_add(struct dentry *dn);
-extern void ceph_dentry_lru_touch(struct dentry *dn);
-extern void ceph_dentry_lru_del(struct dentry *dn);
-extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
-extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
-extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
-
-/*
- * our d_ops vary depending on whether the inode is live,
- * snapshotted (read-only), or a virtual ".snap" directory.
- */
-int ceph_init_dentry(struct dentry *dentry);
-
-
-/* ioctl.c */
-extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-
-/* export.c */
-extern const struct export_operations ceph_export_ops;
-
-/* locks.c */
-extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
-extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
-extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
-extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
- int p_locks, int f_locks);
-extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
-
-/* debugfs.c */
-extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
-extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
-
-#endif /* _FS_CEPH_SUPER_H */
diff --git a/ANDROID_3.4.5/fs/ceph/xattr.c b/ANDROID_3.4.5/fs/ceph/xattr.c
deleted file mode 100644
index 35b86331..00000000
--- a/ANDROID_3.4.5/fs/ceph/xattr.c
+++ /dev/null
@@ -1,946 +0,0 @@
-#include <linux/ceph/ceph_debug.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-#include <linux/ceph/decode.h>
-
-#include <linux/xattr.h>
-#include <linux/slab.h>
-
-#define XATTR_CEPH_PREFIX "ceph."
-#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
-
-static bool ceph_is_valid_xattr(const char *name)
-{
- return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
- !strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN) ||
- !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
-}
-
-/*
- * These define virtual xattrs exposing the recursive directory
- * statistics and layout metadata.
- */
-struct ceph_vxattr {
- char *name;
- size_t name_size; /* strlen(name) + 1 (for '\0') */
- size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
- size_t size);
- bool readonly;
-};
-
-/* directories */
-
-static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
-}
-
-static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_files);
-}
-
-static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_subdirs);
-}
-
-static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
-}
-
-static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_rfiles);
-}
-
-static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_rsubdirs);
-}
-
-static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_rbytes);
-}
-
-static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,
- (long)ci->i_rctime.tv_nsec);
-}
-
-#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
-
-#define XATTR_NAME_CEPH(_type, _name) \
- { \
- .name = CEPH_XATTR_NAME(_type, _name), \
- .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
- .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
- .readonly = true, \
- }
-
-static struct ceph_vxattr ceph_dir_vxattrs[] = {
- XATTR_NAME_CEPH(dir, entries),
- XATTR_NAME_CEPH(dir, files),
- XATTR_NAME_CEPH(dir, subdirs),
- XATTR_NAME_CEPH(dir, rentries),
- XATTR_NAME_CEPH(dir, rfiles),
- XATTR_NAME_CEPH(dir, rsubdirs),
- XATTR_NAME_CEPH(dir, rbytes),
- XATTR_NAME_CEPH(dir, rctime),
- { 0 } /* Required table terminator */
-};
-static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
-
-/* files */
-
-static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- int ret;
-
- ret = snprintf(val, size,
- "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-
- if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) {
- val += ret;
- size -= ret;
- ret += snprintf(val, size, "preferred_osd=%lld\n",
- (unsigned long long)ceph_file_layout_pg_preferred(
- ci->i_layout));
- }
-
- return ret;
-}
-
-static struct ceph_vxattr ceph_file_vxattrs[] = {
- XATTR_NAME_CEPH(file, layout),
- /* The following extended attribute name is deprecated */
- {
- .name = XATTR_CEPH_PREFIX "layout",
- .name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
- .getxattr_cb = ceph_vxattrcb_file_layout,
- .readonly = true,
- },
- { 0 } /* Required table terminator */
-};
-static size_t ceph_file_vxattrs_name_size; /* total size of all names */
-
-static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
-{
- if (S_ISDIR(inode->i_mode))
- return ceph_dir_vxattrs;
- else if (S_ISREG(inode->i_mode))
- return ceph_file_vxattrs;
- return NULL;
-}
-
-static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
-{
- if (vxattrs == ceph_dir_vxattrs)
- return ceph_dir_vxattrs_name_size;
- if (vxattrs == ceph_file_vxattrs)
- return ceph_file_vxattrs_name_size;
- BUG();
-
- return 0;
-}
-
-/*
- * Compute the aggregate size (including terminating '\0') of all
- * virtual extended attribute names in the given vxattr table.
- */
-static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
-{
- struct ceph_vxattr *vxattr;
- size_t size = 0;
-
- for (vxattr = vxattrs; vxattr->name; vxattr++)
- size += vxattr->name_size;
-
- return size;
-}
-
-/* Routines called at initialization and exit time */
-
-void __init ceph_xattr_init(void)
-{
- ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
- ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
-}
-
-void ceph_xattr_exit(void)
-{
- ceph_dir_vxattrs_name_size = 0;
- ceph_file_vxattrs_name_size = 0;
-}
-
-static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
- const char *name)
-{
- struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
-
- if (vxattr) {
- while (vxattr->name) {
- if (!strcmp(vxattr->name, name))
- return vxattr;
- vxattr++;
- }
- }
-
- return NULL;
-}
-
-static int __set_xattr(struct ceph_inode_info *ci,
- const char *name, int name_len,
- const char *val, int val_len,
- int dirty,
- int should_free_name, int should_free_val,
- struct ceph_inode_xattr **newxattr)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct ceph_inode_xattr *xattr = NULL;
- int c;
- int new = 0;
-
- p = &ci->i_xattrs.index.rb_node;
- while (*p) {
- parent = *p;
- xattr = rb_entry(parent, struct ceph_inode_xattr, node);
- c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else {
- if (name_len == xattr->name_len)
- break;
- else if (name_len < xattr->name_len)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
- xattr = NULL;
- }
-
- if (!xattr) {
- new = 1;
- xattr = *newxattr;
- xattr->name = name;
- xattr->name_len = name_len;
- xattr->should_free_name = should_free_name;
-
- ci->i_xattrs.count++;
- dout("__set_xattr count=%d\n", ci->i_xattrs.count);
- } else {
- kfree(*newxattr);
- *newxattr = NULL;
- if (xattr->should_free_val)
- kfree((void *)xattr->val);
-
- if (should_free_name) {
- kfree((void *)name);
- name = xattr->name;
- }
- ci->i_xattrs.names_size -= xattr->name_len;
- ci->i_xattrs.vals_size -= xattr->val_len;
- }
- ci->i_xattrs.names_size += name_len;
- ci->i_xattrs.vals_size += val_len;
- if (val)
- xattr->val = val;
- else
- xattr->val = "";
-
- xattr->val_len = val_len;
- xattr->dirty = dirty;
- xattr->should_free_val = (val && should_free_val);
-
- if (new) {
- rb_link_node(&xattr->node, parent, p);
- rb_insert_color(&xattr->node, &ci->i_xattrs.index);
- dout("__set_xattr_val p=%p\n", p);
- }
-
- dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
-
- return 0;
-}
-
-static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
- const char *name)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct ceph_inode_xattr *xattr = NULL;
- int name_len = strlen(name);
- int c;
-
- p = &ci->i_xattrs.index.rb_node;
- while (*p) {
- parent = *p;
- xattr = rb_entry(parent, struct ceph_inode_xattr, node);
- c = strncmp(name, xattr->name, xattr->name_len);
- if (c == 0 && name_len > xattr->name_len)
- c = 1;
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else {
- dout("__get_xattr %s: found %.*s\n", name,
- xattr->val_len, xattr->val);
- return xattr;
- }
- }
-
- dout("__get_xattr %s: not found\n", name);
-
- return NULL;
-}
-
-static void __free_xattr(struct ceph_inode_xattr *xattr)
-{
- BUG_ON(!xattr);
-
- if (xattr->should_free_name)
- kfree((void *)xattr->name);
- if (xattr->should_free_val)
- kfree((void *)xattr->val);
-
- kfree(xattr);
-}
-
-static int __remove_xattr(struct ceph_inode_info *ci,
- struct ceph_inode_xattr *xattr)
-{
- if (!xattr)
- return -EOPNOTSUPP;
-
- rb_erase(&xattr->node, &ci->i_xattrs.index);
-
- if (xattr->should_free_name)
- kfree((void *)xattr->name);
- if (xattr->should_free_val)
- kfree((void *)xattr->val);
-
- ci->i_xattrs.names_size -= xattr->name_len;
- ci->i_xattrs.vals_size -= xattr->val_len;
- ci->i_xattrs.count--;
- kfree(xattr);
-
- return 0;
-}
-
-static int __remove_xattr_by_name(struct ceph_inode_info *ci,
- const char *name)
-{
- struct rb_node **p;
- struct ceph_inode_xattr *xattr;
- int err;
-
- p = &ci->i_xattrs.index.rb_node;
- xattr = __get_xattr(ci, name);
- err = __remove_xattr(ci, xattr);
- return err;
-}
-
-static char *__copy_xattr_names(struct ceph_inode_info *ci,
- char *dest)
-{
- struct rb_node *p;
- struct ceph_inode_xattr *xattr = NULL;
-
- p = rb_first(&ci->i_xattrs.index);
- dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
-
- while (p) {
- xattr = rb_entry(p, struct ceph_inode_xattr, node);
- memcpy(dest, xattr->name, xattr->name_len);
- dest[xattr->name_len] = '\0';
-
- dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
- xattr->name_len, ci->i_xattrs.names_size);
-
- dest += xattr->name_len + 1;
- p = rb_next(p);
- }
-
- return dest;
-}
-
-void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
-{
- struct rb_node *p, *tmp;
- struct ceph_inode_xattr *xattr = NULL;
-
- p = rb_first(&ci->i_xattrs.index);
-
- dout("__ceph_destroy_xattrs p=%p\n", p);
-
- while (p) {
- xattr = rb_entry(p, struct ceph_inode_xattr, node);
- tmp = p;
- p = rb_next(tmp);
- dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
- xattr->name_len, xattr->name);
- rb_erase(tmp, &ci->i_xattrs.index);
-
- __free_xattr(xattr);
- }
-
- ci->i_xattrs.names_size = 0;
- ci->i_xattrs.vals_size = 0;
- ci->i_xattrs.index_version = 0;
- ci->i_xattrs.count = 0;
- ci->i_xattrs.index = RB_ROOT;
-}
-
-static int __build_xattrs(struct inode *inode)
- __releases(ci->i_ceph_lock)
- __acquires(ci->i_ceph_lock)
-{
- u32 namelen;
- u32 numattr = 0;
- void *p, *end;
- u32 len;
- const char *name, *val;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int xattr_version;
- struct ceph_inode_xattr **xattrs = NULL;
- int err = 0;
- int i;
-
- dout("__build_xattrs() len=%d\n",
- ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
-
- if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
- return 0; /* already built */
-
- __ceph_destroy_xattrs(ci);
-
-start:
- /* updated internal xattr rb tree */
- if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
- p = ci->i_xattrs.blob->vec.iov_base;
- end = p + ci->i_xattrs.blob->vec.iov_len;
- ceph_decode_32_safe(&p, end, numattr, bad);
- xattr_version = ci->i_xattrs.version;
- spin_unlock(&ci->i_ceph_lock);
-
- xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
- GFP_NOFS);
- err = -ENOMEM;
- if (!xattrs)
- goto bad_lock;
- memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
- for (i = 0; i < numattr; i++) {
- xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
- GFP_NOFS);
- if (!xattrs[i])
- goto bad_lock;
- }
-
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_xattrs.version != xattr_version) {
- /* lost a race, retry */
- for (i = 0; i < numattr; i++)
- kfree(xattrs[i]);
- kfree(xattrs);
- goto start;
- }
- err = -EIO;
- while (numattr--) {
- ceph_decode_32_safe(&p, end, len, bad);
- namelen = len;
- name = p;
- p += len;
- ceph_decode_32_safe(&p, end, len, bad);
- val = p;
- p += len;
-
- err = __set_xattr(ci, name, namelen, val, len,
- 0, 0, 0, &xattrs[numattr]);
-
- if (err < 0)
- goto bad;
- }
- kfree(xattrs);
- }
- ci->i_xattrs.index_version = ci->i_xattrs.version;
- ci->i_xattrs.dirty = false;
-
- return err;
-bad_lock:
- spin_lock(&ci->i_ceph_lock);
-bad:
- if (xattrs) {
- for (i = 0; i < numattr; i++)
- kfree(xattrs[i]);
- kfree(xattrs);
- }
- ci->i_xattrs.names_size = 0;
- return err;
-}
-
-static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
- int val_size)
-{
- /*
- * 4 bytes for the length, and additional 4 bytes per each xattr name,
- * 4 bytes per each value
- */
- int size = 4 + ci->i_xattrs.count*(4 + 4) +
- ci->i_xattrs.names_size +
- ci->i_xattrs.vals_size;
- dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
- ci->i_xattrs.count, ci->i_xattrs.names_size,
- ci->i_xattrs.vals_size);
-
- if (name_size)
- size += 4 + 4 + name_size + val_size;
-
- return size;
-}
-
-/*
- * If there are dirty xattrs, reencode xattrs into the prealloc_blob
- * and swap into place.
- */
-void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
-{
- struct rb_node *p;
- struct ceph_inode_xattr *xattr = NULL;
- void *dest;
-
- dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
- if (ci->i_xattrs.dirty) {
- int need = __get_required_blob_size(ci, 0, 0);
-
- BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
-
- p = rb_first(&ci->i_xattrs.index);
- dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
-
- ceph_encode_32(&dest, ci->i_xattrs.count);
- while (p) {
- xattr = rb_entry(p, struct ceph_inode_xattr, node);
-
- ceph_encode_32(&dest, xattr->name_len);
- memcpy(dest, xattr->name, xattr->name_len);
- dest += xattr->name_len;
- ceph_encode_32(&dest, xattr->val_len);
- memcpy(dest, xattr->val, xattr->val_len);
- dest += xattr->val_len;
-
- p = rb_next(p);
- }
-
- /* adjust buffer len; it may be larger than we need */
- ci->i_xattrs.prealloc_blob->vec.iov_len =
- dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
-
- if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
- ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
- ci->i_xattrs.prealloc_blob = NULL;
- ci->i_xattrs.dirty = false;
- ci->i_xattrs.version++;
- }
-}
-
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
- size_t size)
-{
- struct inode *inode = dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int err;
- struct ceph_inode_xattr *xattr;
- struct ceph_vxattr *vxattr = NULL;
-
- if (!ceph_is_valid_xattr(name))
- return -ENODATA;
-
- /* let's see if a virtual xattr was requested */
- vxattr = ceph_match_vxattr(inode, name);
-
- spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
-
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
- (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
- goto get_xattr;
- } else {
- spin_unlock(&ci->i_ceph_lock);
- /* get xattrs from mds (if we don't already have them) */
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
- if (err)
- return err;
- }
-
- spin_lock(&ci->i_ceph_lock);
-
- if (vxattr && vxattr->readonly) {
- err = vxattr->getxattr_cb(ci, value, size);
- goto out;
- }
-
- err = __build_xattrs(inode);
- if (err < 0)
- goto out;
-
-get_xattr:
- err = -ENODATA; /* == ENOATTR */
- xattr = __get_xattr(ci, name);
- if (!xattr) {
- if (vxattr)
- err = vxattr->getxattr_cb(ci, value, size);
- goto out;
- }
-
- err = -ERANGE;
- if (size && size < xattr->val_len)
- goto out;
-
- err = xattr->val_len;
- if (size == 0)
- goto out;
-
- memcpy(value, xattr->val, xattr->val_len);
-
-out:
- spin_unlock(&ci->i_ceph_lock);
- return err;
-}
-
-ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
-{
- struct inode *inode = dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
- u32 vir_namelen = 0;
- u32 namelen;
- int err;
- u32 len;
- int i;
-
- spin_lock(&ci->i_ceph_lock);
- dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
-
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
- (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
- goto list_xattr;
- } else {
- spin_unlock(&ci->i_ceph_lock);
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
- if (err)
- return err;
- }
-
- spin_lock(&ci->i_ceph_lock);
-
- err = __build_xattrs(inode);
- if (err < 0)
- goto out;
-
-list_xattr:
- /*
- * Start with virtual dir xattr names (if any) (including
- * terminating '\0' characters for each).
- */
- vir_namelen = ceph_vxattrs_name_size(vxattrs);
-
- /* adding 1 byte per each variable due to the null termination */
- namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
- err = -ERANGE;
- if (size && namelen > size)
- goto out;
-
- err = namelen;
- if (size == 0)
- goto out;
-
- names = __copy_xattr_names(ci, names);
-
- /* virtual xattr names, too */
- if (vxattrs)
- for (i = 0; vxattrs[i].name; i++) {
- len = sprintf(names, "%s", vxattrs[i].name);
- names += len + 1;
- }
-
-out:
- spin_unlock(&ci->i_ceph_lock);
- return err;
-}
-
-static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
- const char *value, size_t size, int flags)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *inode = dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
- struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = fsc->mdsc;
- int err;
- int i, nr_pages;
- struct page **pages = NULL;
- void *kaddr;
-
- /* copy value into some pages */
- nr_pages = calc_pages_for(0, size);
- if (nr_pages) {
- pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
- if (!pages)
- return -ENOMEM;
- err = -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
- pages[i] = __page_cache_alloc(GFP_NOFS);
- if (!pages[i]) {
- nr_pages = i;
- goto out;
- }
- kaddr = kmap(pages[i]);
- memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
- min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
- }
- }
-
- dout("setxattr value=%.*s\n", (int)size, value);
-
- /* do request */
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
- USE_AUTH_MDS);
- if (IS_ERR(req)) {
- err = PTR_ERR(req);
- goto out;
- }
- req->r_inode = inode;
- ihold(inode);
- req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- req->r_num_caps = 1;
- req->r_args.setxattr.flags = cpu_to_le32(flags);
- req->r_path2 = kstrdup(name, GFP_NOFS);
-
- req->r_pages = pages;
- req->r_num_pages = nr_pages;
- req->r_data_len = size;
-
- dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
- ceph_mdsc_put_request(req);
- dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
-
-out:
- if (pages) {
- for (i = 0; i < nr_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- }
- return err;
-}
-
-int ceph_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- struct inode *inode = dentry->d_inode;
- struct ceph_vxattr *vxattr;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int issued;
- int err;
- int dirty;
- int name_len = strlen(name);
- int val_len = size;
- char *newname = NULL;
- char *newval = NULL;
- struct ceph_inode_xattr *xattr = NULL;
- int required_blob_size;
-
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
- if (!ceph_is_valid_xattr(name))
- return -EOPNOTSUPP;
-
- vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
-
- /* preallocate memory for xattr name, value, index node */
- err = -ENOMEM;
- newname = kmemdup(name, name_len + 1, GFP_NOFS);
- if (!newname)
- goto out;
-
- if (val_len) {
- newval = kmemdup(value, val_len, GFP_NOFS);
- if (!newval)
- goto out;
- }
-
- xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
- if (!xattr)
- goto out;
-
- spin_lock(&ci->i_ceph_lock);
-retry:
- issued = __ceph_caps_issued(ci, NULL);
- dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
- if (!(issued & CEPH_CAP_XATTR_EXCL))
- goto do_sync;
- __build_xattrs(inode);
-
- required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
- if (!ci->i_xattrs.prealloc_blob ||
- required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
- struct ceph_buffer *blob;
-
- spin_unlock(&ci->i_ceph_lock);
- dout(" preaallocating new blob size=%d\n", required_blob_size);
- blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
- if (!blob)
- goto out;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_xattrs.prealloc_blob)
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
- ci->i_xattrs.prealloc_blob = blob;
- goto retry;
- }
-
- err = __set_xattr(ci, newname, name_len, newval,
- val_len, 1, 1, 1, &xattr);
-
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
-
- spin_unlock(&ci->i_ceph_lock);
- if (dirty)
- __mark_inode_dirty(inode, dirty);
- return err;
-
-do_sync:
- spin_unlock(&ci->i_ceph_lock);
- err = ceph_sync_setxattr(dentry, name, value, size, flags);
-out:
- kfree(newname);
- kfree(newval);
- kfree(xattr);
- return err;
-}
-
-static int ceph_send_removexattr(struct dentry *dentry, const char *name)
-{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = dentry->d_inode;
- struct inode *parent_inode;
- struct ceph_mds_request *req;
- int err;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
- USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_inode = inode;
- ihold(inode);
- req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- req->r_num_caps = 1;
- req->r_path2 = kstrdup(name, GFP_NOFS);
-
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
- ceph_mdsc_put_request(req);
- return err;
-}
-
-int ceph_removexattr(struct dentry *dentry, const char *name)
-{
- struct inode *inode = dentry->d_inode;
- struct ceph_vxattr *vxattr;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int issued;
- int err;
- int required_blob_size;
- int dirty;
-
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
- if (!ceph_is_valid_xattr(name))
- return -EOPNOTSUPP;
-
- vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
-
- err = -ENOMEM;
- spin_lock(&ci->i_ceph_lock);
-retry:
- issued = __ceph_caps_issued(ci, NULL);
- dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
-
- if (!(issued & CEPH_CAP_XATTR_EXCL))
- goto do_sync;
- __build_xattrs(inode);
-
- required_blob_size = __get_required_blob_size(ci, 0, 0);
-
- if (!ci->i_xattrs.prealloc_blob ||
- required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
- struct ceph_buffer *blob;
-
- spin_unlock(&ci->i_ceph_lock);
- dout(" preaallocating new blob size=%d\n", required_blob_size);
- blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
- if (!blob)
- goto out;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_xattrs.prealloc_blob)
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
- ci->i_xattrs.prealloc_blob = blob;
- goto retry;
- }
-
- err = __remove_xattr_by_name(ceph_inode(inode), name);
-
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
- spin_unlock(&ci->i_ceph_lock);
- if (dirty)
- __mark_inode_dirty(inode, dirty);
- return err;
-do_sync:
- spin_unlock(&ci->i_ceph_lock);
- err = ceph_send_removexattr(dentry, name);
-out:
- return err;
-}
-