Git Repo - linux.git/commitdiff
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
author Linus Torvalds <[email protected]>
Sun, 5 Jul 2015 02:36:06 +0000 (19:36 -0700)
committer Linus Torvalds <[email protected]>
Sun, 5 Jul 2015 02:36:06 +0000 (19:36 -0700)
Pull more vfs updates from Al Viro:
 "Assorted VFS fixes and related cleanups (IMO the most interesting in
  that part are f_path-related things and Eric's descriptor-related
  stuff).  UFS regression fixes (it got broken last cycle).  9P fixes.
  fs-cache series, DAX patches, Jan's file_remove_suid() work"

[ I'd say this is much more than "fixes and related cleanups".  The
  file_table locking rule change by Eric Dumazet is a rather big and
  fundamental update even if the patch isn't huge.   - Linus ]

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (49 commits)
  9p: cope with bogus responses from server in p9_client_{read,write}
  p9_client_write(): avoid double p9_free_req()
  9p: forgetting to cancel request on interrupted zero-copy RPC
  dax: bdev_direct_access() may sleep
  block: Add support for DAX reads/writes to block devices
  dax: Use copy_from_iter_nocache
  dax: Add block size note to documentation
  fs/file.c: __fget() and dup2() atomicity rules
  fs/file.c: don't acquire files->file_lock in fd_install()
  fs:super:get_anon_bdev: fix race condition could cause dev exceed its upper limitation
  vfs: avoid creation of inode number 0 in get_next_ino
  namei: make set_root_rcu() return void
  make simple_positive() public
  ufs: use dir_pages instead of ufs_dir_pages()
  pagemap.h: move dir_pages() over there
  remove the pointless include of lglock.h
  fs: cleanup slight list_entry abuse
  xfs: Correctly lock inode when removing suid and file capabilities
  fs: Call security_ops->inode_killpriv on truncate
  fs: Provide function telling whether file_remove_privs() will do anything
  ...

32 files changed:
Documentation/filesystems/porting
arch/arc/kernel/troubleshoot.c
arch/s390/hypfs/inode.c
arch/tile/kernel/stack.c
drivers/block/loop.c
drivers/md/bitmap.c
drivers/md/md.c
fs/binfmt_elf.c
fs/block_dev.c
fs/btrfs/file.c
fs/ceph/file.c
fs/coredump.c
fs/dax.c
fs/dcache.c
fs/debugfs/inode.c
fs/ext4/super.c
fs/fuse/file.c
fs/inode.c
fs/libfs.c
fs/nfs/dir.c
fs/ntfs/file.c
fs/overlayfs/super.c
fs/seq_file.c
fs/tracefs/inode.c
fs/ufs/super.c
fs/xfs/xfs_file.c
include/linux/fs.h
include/linux/pagemap.h
kernel/events/core.c
mm/filemap.c
mm/memory.c
security/inode.c

diff --combined Documentation/filesystems/porting
index 68f1c9106573f40df371e01d14946c52df98405d,ec5456113072ab93240033a4737953bd753afd76..f24d1b8339576e96c46045f5da8f275ee9250056
@@@ -379,10 -379,10 +379,10 @@@ may now be called in rcu-walk mode (nd-
  returned if the filesystem cannot handle rcu-walk. See
  Documentation/filesystems/vfs.txt for more details.
  
 -      permission and check_acl are inode permission checks that are called
 -on many or all directory inodes on the way down a path walk (to check for
 -exec permission). These must now be rcu-walk aware (flags & IPERM_FLAG_RCU).
 -See Documentation/filesystems/vfs.txt for more details.
 +      permission is an inode permission check that is called on many or all
 +directory inodes on the way down a path walk (to check for exec permission). It
 +must now be rcu-walk aware (mask & MAY_NOT_BLOCK).  See
 +Documentation/filesystems/vfs.txt for more details.
   
  --
  [mandatory]
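
A minimal sketch of the rcu-walk permission rule described in the hunk above (examplefs_permission is a hypothetical name, not code from this series): when MAY_NOT_BLOCK is set in the mask, the method must not sleep, and returning -ECHILD tells the VFS to retry the lookup in ref-walk mode, where sleeping is allowed.

	#include <linux/fs.h>

	/* Sketch: an rcu-walk aware ->permission method.  MAY_NOT_BLOCK in
	 * the mask means the VFS is in rcu-walk mode, so sleeping locks and
	 * I/O are forbidden. */
	static int examplefs_permission(struct inode *inode, int mask)
	{
		if (mask & MAY_NOT_BLOCK)
			return -ECHILD;	/* would need to sleep: punt to ref-walk */
		/* ref-walk mode: sleeping locks and I/O are fine here */
		return generic_permission(inode, mask);
	}

Returning -ECHILD is cheap; the VFS simply restarts the affected part of the path walk with references held.
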
@@@ -500,3 -500,7 +500,7 @@@ in your dentry operations instead
        dentry,  it does not get nameidata at all and it gets called only when cookie
        is non-NULL.  Note that link body isn't available anymore, so if you need it,
        store it as cookie.
+ --
+ [mandatory]
+       __fd_install() & fd_install() can now sleep. Callers should not
+       hold a spinlock or other resources that do not allow a schedule.
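
A minimal sketch of the new fd_install() rule just added to the porting notes (example_install_file is a made-up helper, not from this series): reserve the descriptor first, then install the file from a context that may schedule, with no spinlocks held.

	#include <linux/fcntl.h>
	#include <linux/file.h>
	#include <linux/fs.h>

	/* Sketch: fd_install() may now sleep, so keep it outside any
	 * spinlocked region. */
	static int example_install_file(struct file *filp)
	{
		int fd = get_unused_fd_flags(O_CLOEXEC);

		if (fd < 0)
			return fd;
		/* no locks held here: fd_install() is free to schedule */
		fd_install(fd, filp);
		return fd;
	}
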
diff --combined arch/arc/kernel/troubleshoot.c
index e0cf998932123fae4df27f8c98712f57551d8849,9f80c5adcb689aa03295bf716585a2ba6eb5a99d..807f7d61d7a7cf867bca011251729d1164bd3f33
@@@ -14,7 -14,6 +14,7 @@@
  #include <linux/proc_fs.h>
  #include <linux/file.h>
  #include <asm/arcregs.h>
 +#include <asm/irqflags.h>
  
  /*
   * Common routine to print scratch regs (r0-r12) or callee regs (r13-r25)
@@@ -35,10 -34,7 +35,10 @@@ static noinline void print_reg_file(lon
                        n += scnprintf(buf + n, len - n, "\n");
  
                /* because pt_regs has regs reversed: r12..r0, r25..r13 */
 -              reg_rev--;
 +              if (is_isa_arcv2() && start_num == 0)
 +                      reg_rev++;
 +              else
 +                      reg_rev--;
        }
  
        if (start_num != 0)
@@@ -71,15 -67,12 +71,12 @@@ static void print_task_path_n_nm(struc
        mmput(mm);
  
        if (exe_file) {
-               path = exe_file->f_path;
-               path_get(&exe_file->f_path);
+               path_nm = file_path(exe_file, buf, 255);
                fput(exe_file);
-               path_nm = d_path(&path, buf, 255);
-               path_put(&path);
        }
  
  done:
-       pr_info("Path: %s\n", path_nm);
+       pr_info("Path: %s\n", !IS_ERR(path_nm) ? path_nm : "?");
  }
  
  static void show_faulting_vma(unsigned long address, char *buf)
        if (vma && (vma->vm_start <= address)) {
                struct file *file = vma->vm_file;
                if (file) {
-                       struct path *path = &file->f_path;
-                       nm = d_path(path, buf, PAGE_SIZE - 1);
+                       nm = file_path(file, buf, PAGE_SIZE - 1);
                        inode = file_inode(vma->vm_file);
                        dev = inode->i_sb->s_dev;
                        ino = inode->i_ino;
@@@ -156,15 -148,6 +152,15 @@@ static void show_ecr_verbose(struct pt_
                                ((cause_code == 0x02) ? "Write" : "EX"));
        } else if (vec == ECR_V_INSN_ERR) {
                pr_cont("Illegal Insn\n");
 +#ifdef CONFIG_ISA_ARCV2
 +      } else if (vec == ECR_V_MEM_ERR) {
 +              if (cause_code == 0x00)
 +                      pr_cont("Bus Error from Insn Mem\n");
 +              else if (cause_code == 0x10)
 +                      pr_cont("Bus Error from Data Mem\n");
 +              else
 +                      pr_cont("Bus Error, check PRM\n");
 +#endif
        } else {
                pr_cont("Check Programmer's Manual\n");
        }
@@@ -198,20 -181,12 +194,20 @@@ void show_regs(struct pt_regs *regs
  
        pr_info("[STAT32]: 0x%08lx", regs->status32);
  
 -#define STS_BIT(r, bit)       r->status32 & STATUS_##bit##_MASK ? #bit : ""
 -      if (!user_mode(regs))
 -              pr_cont(" : %2s %2s %2s %2s %2s\n",
 -                      STS_BIT(regs, AE), STS_BIT(regs, A2), STS_BIT(regs, A1),
 -                      STS_BIT(regs, E2), STS_BIT(regs, E1));
 +#define STS_BIT(r, bit)       r->status32 & STATUS_##bit##_MASK ? #bit" " : ""
  
 +#ifdef CONFIG_ISA_ARCOMPACT
 +      pr_cont(" : %2s%2s%2s%2s%2s%2s%2s\n",
 +                      (regs->status32 & STATUS_U_MASK) ? "U " : "K ",
 +                      STS_BIT(regs, DE), STS_BIT(regs, AE),
 +                      STS_BIT(regs, A2), STS_BIT(regs, A1),
 +                      STS_BIT(regs, E2), STS_BIT(regs, E1));
 +#else
 +      pr_cont(" : %2s%2s%2s%2s\n",
 +                      STS_BIT(regs, IE),
 +                      (regs->status32 & STATUS_U_MASK) ? "U " : "K ",
 +                      STS_BIT(regs, DE), STS_BIT(regs, AE));
 +#endif
        pr_info("BTA: 0x%08lx\t SP: 0x%08lx\t FP: 0x%08lx\n",
                regs->bta, regs->sp, regs->fp);
        pr_info("LPS: 0x%08lx\tLPE: 0x%08lx\tLPC: 0x%08lx\n",
diff --combined arch/s390/hypfs/inode.c
index 2eeb0a0f506d5d54a90b0f98da581468962c0399,8ffad54372321c156e5066b6e5fd92625d166bd5..b2e5902bd8f4d8e5f4f53cc3e6fad1cb183db7be
@@@ -62,18 -62,13 +62,13 @@@ static void hypfs_add_dentry(struct den
        hypfs_last_dentry = dentry;
  }
  
- static inline int hypfs_positive(struct dentry *dentry)
- {
-       return d_really_is_positive(dentry) && !d_unhashed(dentry);
- }
  static void hypfs_remove(struct dentry *dentry)
  {
        struct dentry *parent;
  
        parent = dentry->d_parent;
        mutex_lock(&d_inode(parent)->i_mutex);
-       if (hypfs_positive(dentry)) {
+       if (simple_positive(dentry)) {
                if (d_is_dir(dentry))
                        simple_rmdir(d_inode(parent), dentry);
                else
@@@ -456,6 -451,8 +451,6 @@@ static const struct super_operations hy
        .show_options   = hypfs_show_options,
  };
  
 -static struct kobject *s390_kobj;
 -
  static int __init hypfs_init(void)
  {
        int rc;
                rc = -ENODATA;
                goto fail_hypfs_sprp_exit;
        }
 -      s390_kobj = kobject_create_and_add("s390", hypervisor_kobj);
 -      if (!s390_kobj) {
 -              rc = -ENOMEM;
 +      rc = sysfs_create_mount_point(hypervisor_kobj, "s390");
 +      if (rc)
                goto fail_hypfs_diag0c_exit;
 -      }
        rc = register_filesystem(&hypfs_type);
        if (rc)
                goto fail_filesystem;
        return 0;
  
  fail_filesystem:
 -      kobject_put(s390_kobj);
 +      sysfs_remove_mount_point(hypervisor_kobj, "s390");
  fail_hypfs_diag0c_exit:
        hypfs_diag0c_exit();
  fail_hypfs_sprp_exit:
@@@ -506,7 -505,7 +501,7 @@@ fail_dbfs_exit
  static void __exit hypfs_exit(void)
  {
        unregister_filesystem(&hypfs_type);
 -      kobject_put(s390_kobj);
 +      sysfs_remove_mount_point(hypervisor_kobj, "s390");
        hypfs_diag0c_exit();
        hypfs_sprp_exit();
        hypfs_vm_exit();
diff --combined arch/tile/kernel/stack.c
index 35d34635e4f1305473f5cf1990d185216657c59d,8d62cf12c2c027b95e5e67cee9bd72228dfd69f5..402b9c85a894dc4b10982462178dd08529bca49d
@@@ -23,7 -23,6 +23,7 @@@
  #include <linux/mmzone.h>
  #include <linux/dcache.h>
  #include <linux/fs.h>
 +#include <linux/hardirq.h>
  #include <linux/string.h>
  #include <asm/backtrace.h>
  #include <asm/page.h>
@@@ -110,7 -109,7 +110,7 @@@ static struct pt_regs *valid_fault_hand
                if (kbt->verbose)
                        pr_err("  <%s while in user mode>\n", fault);
        } else {
 -              if (kbt->verbose)
 +              if (kbt->verbose && (p->pc != 0 || p->sp != 0 || p->ex1 != 0))
                        pr_err("  (odd fault: pc %#lx, sp %#lx, ex1 %#lx?)\n",
                               p->pc, p->sp, p->ex1);
                return NULL;
        return p;
  }
  
 -/* Is the pc pointing to a sigreturn trampoline? */
 -static int is_sigreturn(unsigned long pc)
 +/* Is the iterator pointing to a sigreturn trampoline? */
 +static int is_sigreturn(struct KBacktraceIterator *kbt)
  {
 -      return current->mm && (pc == VDSO_SYM(&__vdso_rt_sigreturn));
 +      return kbt->task->mm &&
 +              (kbt->it.pc == ((ulong)kbt->task->mm->context.vdso_base +
 +                              (ulong)&__vdso_rt_sigreturn));
  }
  
  /* Return a pt_regs pointer for a valid signal handler frame */
@@@ -134,7 -131,7 +134,7 @@@ static struct pt_regs *valid_sigframe(s
  {
        BacktraceIterator *b = &kbt->it;
  
 -      if (is_sigreturn(b->pc) && b->sp < PAGE_OFFSET &&
 +      if (is_sigreturn(kbt) && b->sp < PAGE_OFFSET &&
            b->sp % sizeof(long) == 0) {
                int retval;
                pagefault_disable();
        return NULL;
  }
  
 -static int KBacktraceIterator_is_sigreturn(struct KBacktraceIterator *kbt)
 -{
 -      return is_sigreturn(kbt->it.pc);
 -}
 -
  static int KBacktraceIterator_restart(struct KBacktraceIterator *kbt)
  {
        struct pt_regs *p;
@@@ -176,7 -178,7 +176,7 @@@ static int KBacktraceIterator_next_item
  {
        for (;;) {
                do {
 -                      if (!KBacktraceIterator_is_sigreturn(kbt))
 +                      if (!is_sigreturn(kbt))
                                return KBT_ONGOING;
                } while (backtrace_next(&kbt->it));
  
@@@ -332,7 -334,7 +332,7 @@@ static void describe_addr(struct KBackt
        }
  
        if (vma->vm_file) {
-               p = d_path(&vma->vm_file->f_path, buf, bufsize);
+               p = file_path(vma->vm_file, buf, bufsize);
                if (IS_ERR(p))
                        p = "?";
                name = kbasename(p);
   */
  static bool start_backtrace(void)
  {
 -      if (current->thread.in_backtrace) {
 +      if (current_thread_info()->in_backtrace) {
                pr_err("Backtrace requested while in backtrace!\n");
                return false;
        }
 -      current->thread.in_backtrace = true;
 +      current_thread_info()->in_backtrace = true;
        return true;
  }
  
  static void end_backtrace(void)
  {
 -      current->thread.in_backtrace = false;
 +      current_thread_info()->in_backtrace = false;
  }
  
  /*
   * This method wraps the backtracer's more generic support.
   * It is only invoked from the architecture-specific code; show_stack()
 - * and dump_stack() (in entry.S) are architecture-independent entry points.
 + * and dump_stack() are architecture-independent entry points.
   */
 -void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 +void tile_show_stack(struct KBacktraceIterator *kbt)
  {
        int i;
        int have_mmap_sem = 0;
  
        if (!start_backtrace())
                return;
 -      if (headers) {
 -              /*
 -               * Add a blank line since if we are called from panic(),
 -               * then bust_spinlocks() spit out a space in front of us
 -               * and it will mess up our KERN_ERR.
 -               */
 -              pr_err("Starting stack dump of tid %d, pid %d (%s) on cpu %d at cycle %lld\n",
 -                     kbt->task->pid, kbt->task->tgid, kbt->task->comm,
 -                     raw_smp_processor_id(), get_cycles());
 -      }
        kbt->verbose = 1;
        i = 0;
        for (; !KBacktraceIterator_end(kbt); KBacktraceIterator_next(kbt)) {
                char namebuf[KSYM_NAME_LEN+100];
                unsigned long address = kbt->it.pc;
  
 -              /* Try to acquire the mmap_sem as we pass into userspace. */
 -              if (address < PAGE_OFFSET && !have_mmap_sem && kbt->task->mm)
 +              /*
 +               * Try to acquire the mmap_sem as we pass into userspace.
 +               * If we're in an interrupt context, don't even try, since
 +               * it's not safe to call e.g. d_path() from an interrupt,
 +               * since it uses spin locks without disabling interrupts.
 +               * Note we test "kbt->task == current", not "kbt->is_current",
 +               * since we're checking that "current" will work in d_path().
 +               */
 +              if (kbt->task == current && address < PAGE_OFFSET &&
 +                  !have_mmap_sem && kbt->task->mm && !in_interrupt()) {
                        have_mmap_sem =
                                down_read_trylock(&kbt->task->mm->mmap_sem);
 +              }
  
                describe_addr(kbt, address, have_mmap_sem,
                              namebuf, sizeof(namebuf));
        }
        if (kbt->end == KBT_LOOP)
                pr_err("Stack dump stopped; next frame identical to this one\n");
 -      if (headers)
 -              pr_err("Stack dump complete\n");
        if (have_mmap_sem)
                up_read(&kbt->task->mm->mmap_sem);
        end_backtrace();
  }
  EXPORT_SYMBOL(tile_show_stack);
  
 -
 -/* This is called from show_regs() and _dump_stack() */
 -void dump_stack_regs(struct pt_regs *regs)
 -{
 -      struct KBacktraceIterator kbt;
 -      KBacktraceIterator_init(&kbt, NULL, regs);
 -      tile_show_stack(&kbt, 1);
 -}
 -EXPORT_SYMBOL(dump_stack_regs);
 -
  static struct pt_regs *regs_to_pt_regs(struct pt_regs *regs,
                                       ulong pc, ulong lr, ulong sp, ulong r52)
  {
        return regs;
  }
  
 -/* This is called from dump_stack() and just converts to pt_regs */
 +/* Deprecated function currently only used by kernel_double_fault(). */
  void _dump_stack(int dummy, ulong pc, ulong lr, ulong sp, ulong r52)
  {
 +      struct KBacktraceIterator kbt;
        struct pt_regs regs;
 -      dump_stack_regs(regs_to_pt_regs(&regs, pc, lr, sp, r52));
 +
 +      regs_to_pt_regs(&regs, pc, lr, sp, r52);
 +      KBacktraceIterator_init(&kbt, NULL, &regs);
 +      tile_show_stack(&kbt);
  }
  
  /* This is called from KBacktraceIterator_init_current() */
@@@ -450,30 -461,22 +450,30 @@@ void _KBacktraceIterator_init_current(s
                                regs_to_pt_regs(&regs, pc, lr, sp, r52));
  }
  
 -/* This is called only from kernel/sched/core.c, with esp == NULL */
 +/*
 + * Called from sched_show_task() with task != NULL, or dump_stack()
 + * with task == NULL.  The esp argument is always NULL.
 + */
  void show_stack(struct task_struct *task, unsigned long *esp)
  {
        struct KBacktraceIterator kbt;
 -      if (task == NULL || task == current)
 +      if (task == NULL || task == current) {
                KBacktraceIterator_init_current(&kbt);
 -      else
 +              KBacktraceIterator_next(&kbt);  /* don't show first frame */
 +      } else {
                KBacktraceIterator_init(&kbt, task, NULL);
 -      tile_show_stack(&kbt, 0);
 +      }
 +      tile_show_stack(&kbt);
  }
  
  #ifdef CONFIG_STACKTRACE
  
  /* Support generic Linux stack API too */
  
 -void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
 +static void save_stack_trace_common(struct task_struct *task,
 +                                  struct pt_regs *regs,
 +                                  bool user,
 +                                  struct stack_trace *trace)
  {
        struct KBacktraceIterator kbt;
        int skip = trace->skip;
  
        if (!start_backtrace())
                goto done;
 -      if (task == NULL || task == current)
 +      if (regs != NULL) {
 +              KBacktraceIterator_init(&kbt, NULL, regs);
 +      } else if (task == NULL || task == current) {
                KBacktraceIterator_init_current(&kbt);
 -      else
 +              skip++;  /* don't show KBacktraceIterator_init_current */
 +      } else {
                KBacktraceIterator_init(&kbt, task, NULL);
 +      }
        for (; !KBacktraceIterator_end(&kbt); KBacktraceIterator_next(&kbt)) {
                if (skip) {
                        --skip;
                        continue;
                }
 -              if (i >= trace->max_entries || kbt.it.pc < PAGE_OFFSET)
 +              if (i >= trace->max_entries ||
 +                  (!user && kbt.it.pc < PAGE_OFFSET))
                        break;
                trace->entries[i++] = kbt.it.pc;
        }
        end_backtrace();
  done:
 +      if (i < trace->max_entries)
 +              trace->entries[i++] = ULONG_MAX;
        trace->nr_entries = i;
  }
 +
 +void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
 +{
 +      save_stack_trace_common(task, NULL, false, trace);
 +}
  EXPORT_SYMBOL(save_stack_trace_tsk);
  
  void save_stack_trace(struct stack_trace *trace)
  {
 -      save_stack_trace_tsk(NULL, trace);
 +      save_stack_trace_common(NULL, NULL, false, trace);
  }
  EXPORT_SYMBOL_GPL(save_stack_trace);
  
 +void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 +{
 +      save_stack_trace_common(NULL, regs, false, trace);
 +}
 +
 +void save_stack_trace_user(struct stack_trace *trace)
 +{
 +      /* Trace user stack if we are not a kernel thread. */
 +      if (current->mm)
 +              save_stack_trace_common(NULL, task_pt_regs(current),
 +                                      true, trace);
 +      else if (trace->nr_entries < trace->max_entries)
 +              trace->entries[trace->nr_entries++] = ULONG_MAX;
 +}
  #endif
  
  /* In entry.S */
diff --combined drivers/block/loop.c
index 40580dc7f41cacef42eedafeebe725e96943c91e,0d8ad59413cd88119dd4ee49517f979203a839da..f7a4c9d7f721816666a76e2d667adf9153e162e9
@@@ -86,6 -86,8 +86,6 @@@ static DEFINE_MUTEX(loop_index_mutex)
  static int max_part;
  static int part_shift;
  
 -static struct workqueue_struct *loop_wq;
 -
  static int transfer_xor(struct loop_device *lo, int cmd,
                        struct page *raw_page, unsigned raw_off,
                        struct page *loop_page, unsigned loop_off,
@@@ -474,28 -476,6 +474,28 @@@ static int loop_flush(struct loop_devic
        return loop_switch(lo, NULL);
  }
  
 +static void loop_reread_partitions(struct loop_device *lo,
 +                                 struct block_device *bdev)
 +{
 +      int rc;
 +
 +      /*
 +       * bd_mutex has been held already in release path, so don't
 +       * acquire it if this function is called in such case.
 +       *
 +       * If the reread partition isn't from release path, lo_refcnt
 +       * must be at least one and it can only become zero when the
 +       * current holder is released.
 +       */
 +      if (!atomic_read(&lo->lo_refcnt))
 +              rc = __blkdev_reread_part(bdev);
 +      else
 +              rc = blkdev_reread_part(bdev);
 +      if (rc)
 +              pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
 +                      __func__, lo->lo_number, lo->lo_file_name, rc);
 +}
 +
  /*
   * loop_change_fd switched the backing store of a loopback device to
   * a new file. This is useful for operating system installers to free up
@@@ -544,7 -524,7 +544,7 @@@ static int loop_change_fd(struct loop_d
  
        fput(old_file);
        if (lo->lo_flags & LO_FLAGS_PARTSCAN)
 -              ioctl_by_bdev(bdev, BLKRRPART, 0);
 +              loop_reread_partitions(lo, bdev);
        return 0;
  
   out_putf:
@@@ -588,7 -568,7 +588,7 @@@ static ssize_t loop_attr_backing_file_s
  
        spin_lock_irq(&lo->lo_lock);
        if (lo->lo_backing_file)
-               p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+               p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
        spin_unlock_irq(&lo->lo_lock);
  
        if (IS_ERR_OR_NULL(p))
@@@ -745,12 -725,6 +745,12 @@@ static int loop_set_fd(struct loop_devi
        size = get_loop_size(lo, file);
        if ((loff_t)(sector_t)size != size)
                goto out_putf;
 +      error = -ENOMEM;
 +      lo->wq = alloc_workqueue("kloopd%d",
 +                      WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
 +                      lo->lo_number);
 +      if (!lo->wq)
 +              goto out_putf;
  
        error = 0;
  
        if (part_shift)
                lo->lo_flags |= LO_FLAGS_PARTSCAN;
        if (lo->lo_flags & LO_FLAGS_PARTSCAN)
 -              ioctl_by_bdev(bdev, BLKRRPART, 0);
 +              loop_reread_partitions(lo, bdev);
  
        /* Grab the block_device to prevent its destruction after we
         * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
@@@ -853,7 -827,7 +853,7 @@@ static int loop_clr_fd(struct loop_devi
         * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
         * command to fail with EBUSY.
         */
 -      if (lo->lo_refcnt > 1) {
 +      if (atomic_read(&lo->lo_refcnt) > 1) {
                lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
                mutex_unlock(&lo->lo_ctl_mutex);
                return 0;
        if (filp == NULL)
                return -EINVAL;
  
 +      /* freeze request queue during the transition */
 +      blk_mq_freeze_queue(lo->lo_queue);
 +
        spin_lock_irq(&lo->lo_lock);
        lo->lo_state = Lo_rundown;
        lo->lo_backing_file = NULL;
        lo->lo_state = Lo_unbound;
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
 +      blk_mq_unfreeze_queue(lo->lo_queue);
 +
        if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
 -              ioctl_by_bdev(bdev, BLKRRPART, 0);
 +              loop_reread_partitions(lo, bdev);
        lo->lo_flags = 0;
        if (!part_shift)
                lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
 +      destroy_workqueue(lo->wq);
 +      lo->wq = NULL;
        mutex_unlock(&lo->lo_ctl_mutex);
        /*
         * Need not hold lo_ctl_mutex to fput backing file.
@@@ -976,7 -943,7 +976,7 @@@ loop_set_status(struct loop_device *lo
             !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
                lo->lo_flags |= LO_FLAGS_PARTSCAN;
                lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
 -              ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
 +              loop_reread_partitions(lo, lo->lo_device);
        }
  
        lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
@@@ -1357,7 -1324,9 +1357,7 @@@ static int lo_open(struct block_device 
                goto out;
        }
  
 -      mutex_lock(&lo->lo_ctl_mutex);
 -      lo->lo_refcnt++;
 -      mutex_unlock(&lo->lo_ctl_mutex);
 +      atomic_inc(&lo->lo_refcnt);
  out:
        mutex_unlock(&loop_index_mutex);
        return err;
@@@ -1368,10 -1337,11 +1368,10 @@@ static void lo_release(struct gendisk *
        struct loop_device *lo = disk->private_data;
        int err;
  
 -      mutex_lock(&lo->lo_ctl_mutex);
 -
 -      if (--lo->lo_refcnt)
 -              goto out;
 +      if (atomic_dec_return(&lo->lo_refcnt))
 +              return;
  
 +      mutex_lock(&lo->lo_ctl_mutex);
        if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
                /*
                 * In autoclear mode, stop the loop thread
                loop_flush(lo);
        }
  
 -out:
        mutex_unlock(&lo->lo_ctl_mutex);
  }
  
@@@ -1454,13 -1425,9 +1454,13 @@@ static int loop_queue_rq(struct blk_mq_
                const struct blk_mq_queue_data *bd)
  {
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 +      struct loop_device *lo = cmd->rq->q->queuedata;
  
        blk_mq_start_request(bd->rq);
  
 +      if (lo->lo_state != Lo_bound)
 +              return -EIO;
 +
        if (cmd->rq->cmd_flags & REQ_WRITE) {
                struct loop_device *lo = cmd->rq->q->queuedata;
                bool need_sched = true;
                spin_unlock_irq(&lo->lo_lock);
  
                if (need_sched)
 -                      queue_work(loop_wq, &lo->write_work);
 +                      queue_work(lo->wq, &lo->write_work);
        } else {
 -              queue_work(loop_wq, &cmd->read_work);
 +              queue_work(lo->wq, &cmd->read_work);
        }
  
        return BLK_MQ_RQ_QUEUE_OK;
@@@ -1488,6 -1455,9 +1488,6 @@@ static void loop_handle_cmd(struct loop
        struct loop_device *lo = cmd->rq->q->queuedata;
        int ret = -EIO;
  
 -      if (lo->lo_state != Lo_bound)
 -              goto failed;
 -
        if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
                goto failed;
  
@@@ -1624,7 -1594,6 +1624,7 @@@ static int loop_add(struct loop_device 
                disk->flags |= GENHD_FL_NO_PART_SCAN;
        disk->flags |= GENHD_FL_EXT_DEVT;
        mutex_init(&lo->lo_ctl_mutex);
 +      atomic_set(&lo->lo_refcnt, 0);
        lo->lo_number           = i;
        spin_lock_init(&lo->lo_lock);
        disk->major             = LOOP_MAJOR;
@@@ -1742,7 -1711,7 +1742,7 @@@ static long loop_control_ioctl(struct f
                        mutex_unlock(&lo->lo_ctl_mutex);
                        break;
                }
 -              if (lo->lo_refcnt > 0) {
 +              if (atomic_read(&lo->lo_refcnt) > 0) {
                        ret = -EBUSY;
                        mutex_unlock(&lo->lo_ctl_mutex);
                        break;
@@@ -1837,6 -1806,13 +1837,6 @@@ static int __init loop_init(void
                goto misc_out;
        }
  
 -      loop_wq = alloc_workqueue("kloopd",
 -                      WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
 -      if (!loop_wq) {
 -              err = -ENOMEM;
 -              goto misc_out;
 -      }
 -
        blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
                                  THIS_MODULE, loop_probe, NULL, NULL);
  
@@@ -1874,6 -1850,8 +1874,6 @@@ static void __exit loop_exit(void
        blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
        unregister_blkdev(LOOP_MAJOR, "loop");
  
 -      destroy_workqueue(loop_wq);
 -
        misc_deregister(&loop_misc);
  }
  
diff --combined drivers/md/bitmap.c
index 135a0907e9de413d140e9fb9b793a91b638a1606,3813fdfee4beed18466b96732f6e9a5f2489c7dc..ed2346ddf4c9fb54dafeb92ae9c795a0584444e8
@@@ -177,16 -177,11 +177,16 @@@ static struct md_rdev *next_active_rdev
         * nr_pending is 0 and In_sync is clear, the entries we return will
         * still be in the same position on the list when we re-enter
         * list_for_each_entry_continue_rcu.
 +       *
 +       * Note that if entered with 'rdev == NULL' to start at the
 +       * beginning, we temporarily assign 'rdev' to an address which
 +       * isn't really an rdev, but which can be used by
 +       * list_for_each_entry_continue_rcu() to find the first entry.
         */
        rcu_read_lock();
        if (rdev == NULL)
                /* start at the beginning */
 -              rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
 +              rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
        else {
                /* release the previous rdev and start from there. */
                rdev_dec_pending(rdev, mddev);
@@@ -839,7 -834,7 +839,7 @@@ static void bitmap_file_kick(struct bit
                if (bitmap->storage.file) {
                        path = kmalloc(PAGE_SIZE, GFP_KERNEL);
                        if (path)
-                               ptr = d_path(&bitmap->storage.file->f_path,
+                               ptr = file_path(bitmap->storage.file,
                                             path, PAGE_SIZE);
  
                        printk(KERN_ALERT
@@@ -1927,7 -1922,7 +1927,7 @@@ void bitmap_status(struct seq_file *seq
                   chunk_kb ? "KB" : "B");
        if (bitmap->storage.file) {
                seq_printf(seq, ", file: ");
-               seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
+               seq_file_path(seq, bitmap->storage.file, " \t\n");
        }
  
        seq_printf(seq, "\n");
diff --combined drivers/md/md.c
index df92d30ca054c68a2af9cc3ee299525d1635a0eb,e67f3ac137bf2d5c88ef315293539b06b4d14169..d429c30cd51471c26cb1c07cb3e6a413106133d4
@@@ -2024,6 -2024,7 +2024,6 @@@ static int bind_rdev_to_array(struct md
  {
        char b[BDEVNAME_SIZE];
        struct kobject *ko;
 -      char *s;
        int err;
  
        /* prevent duplicates */
                return -EBUSY;
        }
        bdevname(rdev->bdev,b);
 -      while ( (s=strchr(b, '/')) != NULL)
 -              *s = '!';
 +      strreplace(b, '/', '!');
  
        rdev->mddev = mddev;
        printk(KERN_INFO "md: bind<%s>\n", b);
@@@ -2628,14 -2630,13 +2628,14 @@@ errors_show(struct md_rdev *rdev, char 
  static ssize_t
  errors_store(struct md_rdev *rdev, const char *buf, size_t len)
  {
 -      char *e;
 -      unsigned long n = simple_strtoul(buf, &e, 10);
 -      if (*buf && (*e == 0 || *e == '\n')) {
 -              atomic_set(&rdev->corrected_errors, n);
 -              return len;
 -      }
 -      return -EINVAL;
 +      unsigned int n;
 +      int rv;
 +
 +      rv = kstrtouint(buf, 10, &n);
 +      if (rv < 0)
 +              return rv;
 +      atomic_set(&rdev->corrected_errors, n);
 +      return len;
  }
  static struct rdev_sysfs_entry rdev_errors =
  __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
@@@ -2652,16 -2653,13 +2652,16 @@@ slot_show(struct md_rdev *rdev, char *p
  static ssize_t
  slot_store(struct md_rdev *rdev, const char *buf, size_t len)
  {
 -      char *e;
 +      int slot;
        int err;
 -      int slot = simple_strtoul(buf, &e, 10);
 +
        if (strncmp(buf, "none", 4)==0)
                slot = -1;
 -      else if (e==buf || (*e && *e!= '\n'))
 -              return -EINVAL;
 +      else {
 +              err = kstrtouint(buf, 10, (unsigned int *)&slot);
 +              if (err < 0)
 +                      return err;
 +      }
        if (rdev->mddev->pers && slot == -1) {
                /* Setting 'slot' on an active array requires also
                 * updating the 'rd%d' link, and communicating
@@@ -3546,12 -3544,12 +3546,12 @@@ layout_show(struct mddev *mddev, char *
  static ssize_t
  layout_store(struct mddev *mddev, const char *buf, size_t len)
  {
 -      char *e;
 -      unsigned long n = simple_strtoul(buf, &e, 10);
 +      unsigned int n;
        int err;
  
 -      if (!*buf || (*e && *e != '\n'))
 -              return -EINVAL;
 +      err = kstrtouint(buf, 10, &n);
 +      if (err < 0)
 +              return err;
        err = mddev_lock(mddev);
        if (err)
                return err;
@@@ -3595,12 -3593,12 +3595,12 @@@ static int update_raid_disks(struct mdd
  static ssize_t
  raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
  {
 -      char *e;
 +      unsigned int n;
        int err;
 -      unsigned long n = simple_strtoul(buf, &e, 10);
  
 -      if (!*buf || (*e && *e != '\n'))
 -              return -EINVAL;
 +      err = kstrtouint(buf, 10, &n);
 +      if (err < 0)
 +              return err;
  
        err = mddev_lock(mddev);
        if (err)
@@@ -3647,12 -3645,12 +3647,12 @@@ chunk_size_show(struct mddev *mddev, ch
  static ssize_t
  chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
  {
 +      unsigned long n;
        int err;
 -      char *e;
 -      unsigned long n = simple_strtoul(buf, &e, 10);
  
 -      if (!*buf || (*e && *e != '\n'))
 -              return -EINVAL;
 +      err = kstrtoul(buf, 10, &n);
 +      if (err < 0)
 +              return err;
  
        err = mddev_lock(mddev);
        if (err)
@@@ -3690,24 -3688,19 +3690,24 @@@ resync_start_show(struct mddev *mddev, 
  static ssize_t
  resync_start_store(struct mddev *mddev, const char *buf, size_t len)
  {
 +      unsigned long long n;
        int err;
 -      char *e;
 -      unsigned long long n = simple_strtoull(buf, &e, 10);
 +
 +      if (cmd_match(buf, "none"))
 +              n = MaxSector;
 +      else {
 +              err = kstrtoull(buf, 10, &n);
 +              if (err < 0)
 +                      return err;
 +              if (n != (sector_t)n)
 +                      return -EINVAL;
 +      }
  
        err = mddev_lock(mddev);
        if (err)
                return err;
        if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
                err = -EBUSY;
 -      else if (cmd_match(buf, "none"))
 -              n = MaxSector;
 -      else if (!*buf || (*e && *e != '\n'))
 -              err = -EINVAL;
  
        if (!err) {
                mddev->recovery_cp = n;
@@@ -3841,7 -3834,7 +3841,7 @@@ array_state_store(struct mddev *mddev, 
                                err = -EBUSY;
                }
                spin_unlock(&mddev->lock);
 -              return err;
 +              return err ?: len;
        }
        err = mddev_lock(mddev);
        if (err)
@@@ -3943,14 -3936,14 +3943,14 @@@ max_corrected_read_errors_show(struct m
  static ssize_t
  max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
  {
 -      char *e;
 -      unsigned long n = simple_strtoul(buf, &e, 10);
 +      unsigned int n;
 +      int rv;
  
 -      if (*buf && (*e == 0 || *e == '\n')) {
 -              atomic_set(&mddev->max_corr_read_errors, n);
 -              return len;
 -      }
 -      return -EINVAL;
 +      rv = kstrtouint(buf, 10, &n);
 +      if (rv < 0)
 +              return rv;
 +      atomic_set(&mddev->max_corr_read_errors, n);
 +      return len;
  }
  
  static struct md_sysfs_entry max_corr_read_errors =
@@@ -4012,10 -4005,8 +4012,10 @@@ new_dev_store(struct mddev *mddev, cons
        else
                rdev = md_import_device(dev, -1, -1);
  
 -      if (IS_ERR(rdev))
 +      if (IS_ERR(rdev)) {
 +              mddev_unlock(mddev);
                return PTR_ERR(rdev);
 +      }
        err = bind_rdev_to_array(rdev, mddev);
   out:
        if (err)
@@@ -4220,36 -4211,34 +4220,36 @@@ action_store(struct mddev *mddev, cons
        if (!mddev->pers || !mddev->pers->sync_request)
                return -EINVAL;
  
 -      if (cmd_match(page, "frozen"))
 -              set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -      else
 -              clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
  
        if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
 -              flush_workqueue(md_misc_wq);
 -              if (mddev->sync_thread) {
 -                      set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 -                      if (mddev_lock(mddev) == 0) {
 +              if (cmd_match(page, "frozen"))
 +                      set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 +              else
 +                      clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 +              if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 +                  mddev_lock(mddev) == 0) {
 +                      flush_workqueue(md_misc_wq);
 +                      if (mddev->sync_thread) {
 +                              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                                md_reap_sync_thread(mddev);
 -                              mddev_unlock(mddev);
                        }
 +                      mddev_unlock(mddev);
                }
        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
                return -EBUSY;
        else if (cmd_match(page, "resync"))
 -              set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 +              clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
        else if (cmd_match(page, "recover")) {
 +              clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 -              set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        } else if (cmd_match(page, "reshape")) {
                int err;
                if (mddev->pers->start_reshape == NULL)
                        return -EINVAL;
                err = mddev_lock(mddev);
                if (!err) {
 +                      clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                        err = mddev->pers->start_reshape(mddev);
                        mddev_unlock(mddev);
                }
                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                else if (!cmd_match(page, "repair"))
                        return -EINVAL;
 +              clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        }
@@@ -4309,18 -4297,15 +4309,18 @@@ sync_min_show(struct mddev *mddev, cha
  static ssize_t
  sync_min_store(struct mddev *mddev, const char *buf, size_t len)
  {
 -      int min;
 -      char *e;
 +      unsigned int min;
 +      int rv;
 +
        if (strncmp(buf, "system", 6)==0) {
 -              mddev->sync_speed_min = 0;
 -              return len;
 +              min = 0;
 +      } else {
 +              rv = kstrtouint(buf, 10, &min);
 +              if (rv < 0)
 +                      return rv;
 +              if (min == 0)
 +                      return -EINVAL;
        }
 -      min = simple_strtoul(buf, &e, 10);
 -      if (buf == e || (*e && *e != '\n') || min <= 0)
 -              return -EINVAL;
        mddev->sync_speed_min = min;
        return len;
  }
@@@ -4338,18 -4323,15 +4338,18 @@@ sync_max_show(struct mddev *mddev, cha
  static ssize_t
  sync_max_store(struct mddev *mddev, const char *buf, size_t len)
  {
 -      int max;
 -      char *e;
 +      unsigned int max;
 +      int rv;
 +
        if (strncmp(buf, "system", 6)==0) {
 -              mddev->sync_speed_max = 0;
 -              return len;
 +              max = 0;
 +      } else {
 +              rv = kstrtouint(buf, 10, &max);
 +              if (rv < 0)
 +                      return rv;
 +              if (max == 0)
 +                      return -EINVAL;
        }
 -      max = simple_strtoul(buf, &e, 10);
 -      if (buf == e || (*e && *e != '\n') || max <= 0)
 -              return -EINVAL;
        mddev->sync_speed_max = max;
        return len;
  }
@@@ -4532,13 -4514,12 +4532,13 @@@ suspend_lo_show(struct mddev *mddev, ch
  static ssize_t
  suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
  {
 -      char *e;
 -      unsigned long long new = simple_strtoull(buf, &e, 10);
 -      unsigned long long old;
 +      unsigned long long old, new;
        int err;
  
 -      if (buf == e || (*e && *e != '\n'))
 +      err = kstrtoull(buf, 10, &new);
 +      if (err < 0)
 +              return err;
 +      if (new != (sector_t)new)
                return -EINVAL;
  
        err = mddev_lock(mddev);
@@@ -4575,13 -4556,12 +4575,13 @@@ suspend_hi_show(struct mddev *mddev, ch
  static ssize_t
  suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
  {
 -      char *e;
 -      unsigned long long new = simple_strtoull(buf, &e, 10);
 -      unsigned long long old;
 +      unsigned long long old, new;
        int err;
  
 -      if (buf == e || (*e && *e != '\n'))
 +      err = kstrtoull(buf, 10, &new);
 +      if (err < 0)
 +              return err;
 +      if (new != (sector_t)new)
                return -EINVAL;
  
        err = mddev_lock(mddev);
@@@ -4623,13 -4603,11 +4623,13 @@@ static ssize_
  reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
  {
        struct md_rdev *rdev;
 -      char *e;
 +      unsigned long long new;
        int err;
 -      unsigned long long new = simple_strtoull(buf, &e, 10);
  
 -      if (buf == e || (*e && *e != '\n'))
 +      err = kstrtoull(buf, 10, &new);
 +      if (err < 0)
 +              return err;
 +      if (new != (sector_t)new)
                return -EINVAL;
        err = mddev_lock(mddev);
        if (err)
@@@ -5178,7 -5156,6 +5178,7 @@@ int md_run(struct mddev *mddev
                mddev_detach(mddev);
                if (mddev->private)
                        pers->free(mddev, mddev->private);
 +              mddev->private = NULL;
                module_put(pers->owner);
                bitmap_destroy(mddev);
                return err;
@@@ -5314,7 -5291,6 +5314,7 @@@ static void md_clean(struct mddev *mdde
        mddev->changed = 0;
        mddev->degraded = 0;
        mddev->safemode = 0;
 +      mddev->private = NULL;
        mddev->merge_check_needed = 0;
        mddev->bitmap_info.offset = 0;
        mddev->bitmap_info.default_offset = 0;
@@@ -5387,7 -5363,6 +5387,7 @@@ static void __md_stop(struct mddev *mdd
        mddev->pers = NULL;
        spin_unlock(&mddev->lock);
        pers->free(mddev, mddev->private);
 +      mddev->private = NULL;
        if (pers->sync_request && mddev->to_remove == NULL)
                mddev->to_remove = &md_redundancy_group;
        module_put(pers->owner);
@@@ -5766,7 -5741,7 +5766,7 @@@ static int get_bitmap_file(struct mdde
        /* bitmap disabled, zero the first byte and copy out */
        if (!mddev->bitmap_info.file)
                file->pathname[0] = '\0';
-       else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+       else if ((ptr = file_path(mddev->bitmap_info.file,
                               file->pathname, sizeof(file->pathname))),
                 IS_ERR(ptr))
                err = PTR_ERR(ptr);
@@@ -6397,7 -6372,7 +6397,7 @@@ static int update_array_info(struct mdd
            mddev->ctime         != info->ctime         ||
            mddev->level         != info->level         ||
  /*        mddev->layout        != info->layout        || */
 -          !mddev->persistent   != info->not_persistent||
 +          mddev->persistent    != !info->not_persistent ||
            mddev->chunk_sectors != info->chunk_size >> 9 ||
            /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
            ((state^info->state) & 0xfffffe00)
@@@ -8128,15 -8103,6 +8128,15 @@@ void md_check_recovery(struct mddev *md
                int spares = 0;
  
                if (mddev->ro) {
 +                      struct md_rdev *rdev;
 +                      if (!mddev->external && mddev->in_sync)
 +                              /* 'Blocked' flag not needed as failed devices
 +                               * will be recorded if array switched to read/write.
 +                               * Leaving it set will prevent the device
 +                               * from being removed.
 +                               */
 +                              rdev_for_each(rdev, mddev)
 +                                      clear_bit(Blocked, &rdev->flags);
                        /* On a read-only array we can:
                         * - remove failed devices
                         * - add already-in_sync devices if the array itself
@@@ -8293,7 -8259,6 +8293,7 @@@ void md_reap_sync_thread(struct mddev *
        if (mddev_is_clustered(mddev))
                md_cluster_ops->metadata_update_finish(mddev);
        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
@@@ -9044,7 -9009,13 +9044,7 @@@ static int get_ro(char *buffer, struct 
  }
  static int set_ro(const char *val, struct kernel_param *kp)
  {
 -      char *e;
 -      int num = simple_strtoul(val, &e, 10);
 -      if (*val && (*e == '\0' || *e == '\n')) {
 -              start_readonly = num;
 -              return 0;
 -      }
 -      return -EINVAL;
 +      return kstrtouint(val, 10, (unsigned int *)&start_readonly);
  }
  
  module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
diff --combined fs/binfmt_elf.c
index cd46e415883090747d8238c2a2fbaa9b101dbc5e,5046b62471037dc0a929306de0a7f89d7c8267a9..6b659967898ebc534c1ce8d91ff1179f050e8f34
@@@ -918,7 -918,7 +918,7 @@@ static int load_elf_binary(struct linux
                        total_size = total_mapping_size(elf_phdata,
                                                        loc->elf_ex.e_phnum);
                        if (!total_size) {
 -                              error = -EINVAL;
 +                              retval = -EINVAL;
                                goto out_free_dentry;
                        }
                }
@@@ -1530,7 -1530,7 +1530,7 @@@ static int fill_files_note(struct memel
                file = vma->vm_file;
                if (!file)
                        continue;
-               filename = d_path(&file->f_path, name_curpos, remaining);
+               filename = file_path(file, name_curpos, remaining);
                if (IS_ERR(filename)) {
                        if (PTR_ERR(filename) == -ENAMETOOLONG) {
                                vfree(data);
                        continue;
                }
  
-               /* d_path() fills at the end, move name down */
+               /* file_path() fills at the end, move name down */
                /* n = strlen(filename) + 1: */
                n = (name_curpos + remaining) - filename;
                remaining = filename - name_curpos;
diff --combined fs/block_dev.c
index 4fe10f93db8a3e52ebbb5330e94b80ee92455e1d,12b22ddb22ef04ec150dabf4d6d631400bb79070..198243717da567bd5f47ad7c94ab823a82506c62
@@@ -14,7 -14,6 +14,7 @@@
  #include <linux/device_cgroup.h>
  #include <linux/highmem.h>
  #include <linux/blkdev.h>
 +#include <linux/backing-dev.h>
  #include <linux/module.h>
  #include <linux/blkpg.h>
  #include <linux/magic.h>
@@@ -43,7 -42,7 +43,7 @@@ static inline struct bdev_inode *BDEV_I
        return container_of(inode, struct bdev_inode, vfs_inode);
  }
  
 -inline struct block_device *I_BDEV(struct inode *inode)
 +struct block_device *I_BDEV(struct inode *inode)
  {
        return &BDEV_I(inode)->bdev;
  }
@@@ -152,6 -151,9 +152,9 @@@ blkdev_direct_IO(struct kiocb *iocb, st
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
  
+       if (IS_DAX(inode))
+               return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
+                               NULL, DIO_SKIP_DIO_COUNT);
        return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
                                    blkdev_get_block, NULL, NULL,
                                    DIO_SKIP_DIO_COUNT);
@@@ -377,7 -379,7 +380,7 @@@ int bdev_read_page(struct block_device 
                        struct page *page)
  {
        const struct block_device_operations *ops = bdev->bd_disk->fops;
 -      if (!ops->rw_page)
 +      if (!ops->rw_page || bdev_get_integrity(bdev))
                return -EOPNOTSUPP;
        return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
  }
@@@ -408,7 -410,7 +411,7 @@@ int bdev_write_page(struct block_devic
        int result;
        int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
        const struct block_device_operations *ops = bdev->bd_disk->fops;
 -      if (!ops->rw_page)
 +      if (!ops->rw_page || bdev_get_integrity(bdev))
                return -EOPNOTSUPP;
        set_page_writeback(page);
        result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
@@@ -443,6 -445,12 +446,12 @@@ long bdev_direct_access(struct block_de
        long avail;
        const struct block_device_operations *ops = bdev->bd_disk->fops;
  
+       /*
+        * The device driver is allowed to sleep, in order to make the
+        * memory directly accessible.
+        */
+       might_sleep();
        if (size < 0)
                return size;
        if (!ops->direct_access)
@@@ -547,8 -555,7 +556,8 @@@ static struct file_system_type bd_type 
        .kill_sb        = kill_anon_super,
  };
  
 -static struct super_block *blockdev_superblock __read_mostly;
 +struct super_block *blockdev_superblock __read_mostly;
 +EXPORT_SYMBOL_GPL(blockdev_superblock);
  
  void __init bdev_cache_init(void)
  {
@@@ -689,6 -696,11 +698,6 @@@ static struct block_device *bd_acquire(
        return bdev;
  }
  
 -int sb_is_blkdev_sb(struct super_block *sb)
 -{
 -      return sb == blockdev_superblock;
 -}
 -
  /* Call when you free inode */
  
  void bd_forget(struct inode *inode)
@@@ -1170,6 -1182,7 +1179,7 @@@ static int __blkdev_get(struct block_de
                bdev->bd_disk = disk;
                bdev->bd_queue = disk->queue;
                bdev->bd_contains = bdev;
+               bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
                if (!partno) {
                        ret = -ENXIO;
                        bdev->bd_part = disk_get_part(disk, partno);
diff --combined fs/btrfs/file.c
index 795d754327a7277de47d13e0f1426aaa5c7fd85c,86f97282779a20e036286142da21bad7b0465233..b823fac91c9289bc67d3bb5191f4ce96e38294ac
@@@ -1748,7 -1748,7 +1748,7 @@@ static ssize_t btrfs_file_write_iter(st
        }
  
        current->backing_dev_info = inode_to_bdi(inode);
-       err = file_remove_suid(file);
+       err = file_remove_privs(file);
        if (err) {
                mutex_unlock(&inode->i_mutex);
                goto out;
@@@ -1868,7 -1868,6 +1868,7 @@@ int btrfs_sync_file(struct file *file, 
        struct btrfs_log_ctx ctx;
        int ret = 0;
        bool full_sync = 0;
 +      const u64 len = end - start + 1;
  
        trace_btrfs_sync_file(file, datasync);
  
                 * all extents are persisted and the respective file extent
                 * items are in the fs/subvol btree.
                 */
 -              ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
 +              ret = btrfs_wait_ordered_range(inode, start, len);
        } else {
                /*
                 * Start any new ordered operations before starting to log the
         */
        smp_mb();
        if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
 -          (full_sync && BTRFS_I(inode)->last_trans <=
 -           root->fs_info->last_trans_committed)) {
 +          (BTRFS_I(inode)->last_trans <=
 +           root->fs_info->last_trans_committed &&
 +           (full_sync ||
 +            !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
                /*
                 * We'v had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
diff --combined fs/ceph/file.c
index faf92095e105650617d8e465e898f8d22a803d60,e55fe32c6224363c5bfbfedce8fb9c46e09fe48a..8b79d87eaf4675ff91cf05c10a3fc53e70d5b313
@@@ -89,14 -89,13 +89,14 @@@ static int ceph_init_file(struct inode 
        case S_IFDIR:
                dout("init_file %p %p 0%o (regular)\n", inode, file,
                     inode->i_mode);
 -              cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
 +              cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
                if (cf == NULL) {
                        ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
                        return -ENOMEM;
                }
                cf->fmode = fmode;
                cf->next_offset = 2;
 +              cf->readdir_cache_idx = -1;
                file->private_data = cf;
                BUG_ON(inode->i_fop->release != ceph_release);
                break;
@@@ -325,6 -324,7 +325,6 @@@ int ceph_release(struct inode *inode, s
                ceph_mdsc_put_request(cf->last_readdir);
        kfree(cf->last_name);
        kfree(cf->dir_info);
 -      dput(cf->dentry);
        kmem_cache_free(ceph_file_cachep, cf);
  
        /* wake up anyone waiting for caps on this inode */
@@@ -483,7 -483,7 +483,7 @@@ static ssize_t ceph_sync_read(struct ki
                }
        } else {
                num_pages = calc_pages_for(off, len);
 -              pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
 +              pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
                if (IS_ERR(pages))
                        return PTR_ERR(pages);
                ret = striped_read(inode, off, len, pages,
@@@ -557,13 -557,13 +557,13 @@@ static void ceph_sync_write_unsafe(stru
   * objects, rollback on failure, etc.)
   */
  static ssize_t
 -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 +                     struct ceph_snap_context *snapc)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 -      struct ceph_snap_context *snapc;
        struct ceph_vino vino;
        struct ceph_osd_request *req;
        struct page **pages;
                size_t start;
                ssize_t n;
  
 -              snapc = ci->i_snap_realm->cached_context;
                vino = ceph_vino(inode);
                req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                            vino, pos, &len, 0,
                        break;
                }
  
 -              osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
 +              osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
  
                n = iov_iter_get_pages_alloc(from, &pages, len, &start);
                if (unlikely(n < 0)) {
   * objects, rollback on failure, etc.)
   */
  static ssize_t
 -ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 +              struct ceph_snap_context *snapc)
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 -      struct ceph_snap_context *snapc;
        struct ceph_vino vino;
        struct ceph_osd_request *req;
        struct page **pages;
                size_t left;
                int n;
  
 -              snapc = ci->i_snap_realm->cached_context;
                vino = ceph_vino(inode);
                req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                            vino, pos, &len, 0, 1,
                 */
                num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
  
 -              pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
 +              pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
@@@ -858,7 -860,7 +858,7 @@@ again
                struct page *page = NULL;
                loff_t i_size;
                if (retry_op == READ_INLINE) {
 -                      page = __page_cache_alloc(GFP_NOFS);
 +                      page = __page_cache_alloc(GFP_KERNEL);
                        if (!page)
                                return -ENOMEM;
                }
@@@ -939,7 -941,6 +939,7 @@@ static ssize_t ceph_write_iter(struct k
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_osd_client *osdc =
                &ceph_sb_to_client(inode->i_sb)->client->osdc;
 +      struct ceph_cap_flush *prealloc_cf;
        ssize_t count, written = 0;
        int err, want, got;
        loff_t pos;
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;
  
 +      prealloc_cf = ceph_alloc_cap_flush();
 +      if (!prealloc_cf)
 +              return -ENOMEM;
 +
        mutex_lock(&inode->i_mutex);
  
        /* We can write back this queue in page reclaim */
  
        pos = iocb->ki_pos;
        count = iov_iter_count(from);
-       err = file_remove_suid(file);
+       err = file_remove_privs(file);
        if (err)
                goto out;
  
@@@ -999,30 -996,14 +999,30 @@@ retry_snap
  
        if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
            (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
 +              struct ceph_snap_context *snapc;
                struct iov_iter data;
                mutex_unlock(&inode->i_mutex);
 +
 +              spin_lock(&ci->i_ceph_lock);
 +              if (__ceph_have_pending_cap_snap(ci)) {
 +                      struct ceph_cap_snap *capsnap =
 +                                      list_last_entry(&ci->i_cap_snaps,
 +                                                      struct ceph_cap_snap,
 +                                                      ci_item);
 +                      snapc = ceph_get_snap_context(capsnap->context);
 +              } else {
 +                      BUG_ON(!ci->i_head_snapc);
 +                      snapc = ceph_get_snap_context(ci->i_head_snapc);
 +              }
 +              spin_unlock(&ci->i_ceph_lock);
 +
                /* we might need to revert back to that point */
                data = *from;
                if (iocb->ki_flags & IOCB_DIRECT)
 -                      written = ceph_sync_direct_write(iocb, &data, pos);
 +                      written = ceph_sync_direct_write(iocb, &data, pos,
 +                                                       snapc);
                else
 -                      written = ceph_sync_write(iocb, &data, pos);
 +                      written = ceph_sync_write(iocb, &data, pos, snapc);
                if (written == -EOLDSNAPC) {
                        dout("aio_write %p %llx.%llx %llu~%u"
                                "got EOLDSNAPC, retrying\n",
                }
                if (written > 0)
                        iov_iter_advance(from, written);
 +              ceph_put_snap_context(snapc);
        } else {
                loff_t old_size = inode->i_size;
                /*
                int dirty;
                spin_lock(&ci->i_ceph_lock);
                ci->i_inline_version = CEPH_INLINE_NONE;
 -              dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
 +              dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
 +                                             &prealloc_cf);
                spin_unlock(&ci->i_ceph_lock);
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
  out:
        mutex_unlock(&inode->i_mutex);
  out_unlocked:
 +      ceph_free_cap_flush(prealloc_cf);
        current->backing_dev_info = NULL;
        return written ? written : err;
  }
@@@ -1277,7 -1255,6 +1277,7 @@@ static long ceph_fallocate(struct file 
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_osd_client *osdc =
                &ceph_inode_to_client(inode)->client->osdc;
 +      struct ceph_cap_flush *prealloc_cf;
        int want, got = 0;
        int dirty;
        int ret = 0;
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
  
 +      prealloc_cf = ceph_alloc_cap_flush();
 +      if (!prealloc_cf)
 +              return -ENOMEM;
 +
        mutex_lock(&inode->i_mutex);
  
        if (ceph_snap(inode) != CEPH_NOSNAP) {
        if (!ret) {
                spin_lock(&ci->i_ceph_lock);
                ci->i_inline_version = CEPH_INLINE_NONE;
 -              dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
 +              dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
 +                                             &prealloc_cf);
                spin_unlock(&ci->i_ceph_lock);
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
        ceph_put_cap_refs(ci, got);
  unlock:
        mutex_unlock(&inode->i_mutex);
 +      ceph_free_cap_flush(prealloc_cf);
        return ret;
  }
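
[ Note: the prealloc_cf hunks above follow a common kernel pattern:
  allocate the ceph_cap_flush while sleeping is still legal, pass a
  pointer-to-pointer into the spinlocked section, and free whatever was
  not consumed.  A hedged sketch of the pattern (assuming, since it is
  not shown in this diff, that __ceph_mark_dirty_caps() takes ownership
  by NULLing the pointer and that ceph_free_cap_flush(NULL) is a no-op): ]

	/* fragment; ci, inode and dirty come from the surrounding function */
	struct ceph_cap_flush *prealloc_cf = ceph_alloc_cap_flush();

	if (!prealloc_cf)
		return -ENOMEM;

	spin_lock(&ci->i_ceph_lock);
	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
	spin_unlock(&ci->i_ceph_lock);

	if (dirty)
		__mark_inode_dirty(inode, dirty);
	ceph_free_cap_flush(prealloc_cf);	/* NULL here if it was used */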
  
diff --combined fs/coredump.c
index e52e0064feac8d01c1447969e7e48189af5f2714,5b771b36cc6e79de2883b6e23b138f547bb0b1f6..c5ecde6f3eed975af7756c17cec4f3b1748dbc83
@@@ -70,8 -70,7 +70,8 @@@ static int expand_corename(struct core_
        return 0;
  }
  
 -static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
 +static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
 +                                   va_list arg)
  {
        int free, need;
        va_list arg_copy;
@@@ -94,7 -93,7 +94,7 @@@ again
        return -ENOMEM;
  }
  
 -static int cn_printf(struct core_name *cn, const char *fmt, ...)
 +static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
  {
        va_list arg;
        int ret;
        return ret;
  }
  
 -static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
 +static __printf(2, 3)
 +int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
  {
        int cur = cn->used;
        va_list arg;
@@@ -140,7 -138,7 +140,7 @@@ static int cn_print_exe_file(struct cor
                goto put_exe_file;
        }
  
-       path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+       path = file_path(exe_file, pathbuf, PATH_MAX);
        if (IS_ERR(path)) {
                ret = PTR_ERR(path);
                goto free_buf;
@@@ -211,15 -209,11 +211,15 @@@ static int format_corename(struct core_
                                break;
                        /* uid */
                        case 'u':
 -                              err = cn_printf(cn, "%d", cred->uid);
 +                              err = cn_printf(cn, "%u",
 +                                              from_kuid(&init_user_ns,
 +                                                        cred->uid));
                                break;
                        /* gid */
                        case 'g':
 -                              err = cn_printf(cn, "%d", cred->gid);
 +                              err = cn_printf(cn, "%u",
 +                                              from_kgid(&init_user_ns,
 +                                                        cred->gid));
                                break;
                        case 'd':
                                err = cn_printf(cn, "%d",
                                break;
                        /* signal that caused the coredump */
                        case 's':
 -                              err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
 +                              err = cn_printf(cn, "%d",
 +                                              cprm->siginfo->si_signo);
                                break;
                        /* UNIX time of coredump */
                        case 't': {
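
[ Note: the %d -> %u changes above matter because kuid_t/kgid_t are
  opaque types once user namespaces are enabled; they must be mapped to
  userspace-visible ids before printing.  A sketch of the conversion;
  cn_print_ids() is a hypothetical helper, not part of this diff: ]

	static int cn_print_ids(struct core_name *cn, const struct cred *cred)
	{
		/* map kernel-internal ids into the initial user namespace */
		uid_t uid = from_kuid(&init_user_ns, cred->uid);
		gid_t gid = from_kgid(&init_user_ns, cred->gid);

		return cn_printf(cn, "%u.%u", uid, gid);
	}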
diff --combined fs/dax.c
index 99b5fbc38992db1f88be1a1e48dad4fda584a0c1,37a0c4826c1ae3ef6f017885148502e7ae56f407..c3e21ccfc358b2da1170c15f09a5946afad0ff16
+++ b/fs/dax.c
@@@ -155,7 -155,7 +155,7 @@@ static ssize_t dax_io(struct inode *ino
                }
  
                if (iov_iter_rw(iter) == WRITE)
-                       len = copy_from_iter(addr, max - pos, iter);
+                       len = copy_from_iter_nocache(addr, max - pos, iter);
                else if (!hole)
                        len = copy_to_iter(addr, max - pos, iter);
                else
@@@ -209,7 -209,8 +209,8 @@@ ssize_t dax_do_io(struct kiocb *iocb, s
        }
  
        /* Protects against truncate */
-       inode_dio_begin(inode);
+       if (!(flags & DIO_SKIP_DIO_COUNT))
+               inode_dio_begin(inode);
  
        retval = dax_io(inode, iter, pos, end, get_block, &bh);
  
        if ((retval > 0) && end_io)
                end_io(iocb, pos, retval, bh.b_private);
  
-       inode_dio_end(inode);
+       if (!(flags & DIO_SKIP_DIO_COUNT))
+               inode_dio_end(inode);
   out:
        return retval;
  }
@@@ -309,21 -311,14 +311,21 @@@ static int dax_insert_mapping(struct in
   out:
        i_mmap_unlock_read(mapping);
  
 -      if (bh->b_end_io)
 -              bh->b_end_io(bh, 1);
 -
        return error;
  }
  
 -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 -                      get_block_t get_block)
 +/**
 + * __dax_fault - handle a page fault on a DAX file
 + * @vma: The virtual memory area where the fault occurred
 + * @vmf: The description of the fault
 + * @get_block: The filesystem method used to translate file offsets to blocks
 + *
 + * When a page fault occurs, filesystems may call this helper in their
 + * fault handler for DAX files. __dax_fault() assumes the caller has done all
 + * the necessary locking for the page fault to proceed successfully.
 + */
 +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 +                      get_block_t get_block, dax_iodone_t complete_unwritten)
  {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
                page_cache_release(page);
        }
  
 +      /*
 +       * If we successfully insert the new mapping over an unwritten extent,
 +       * we need to ensure we convert the unwritten extent. If there is an
 +       * error inserting the mapping, the filesystem needs to leave it as
 +       * unwritten to prevent exposure of the stale underlying data to
 +       * userspace, but we still need to call the completion function so
 +       * the private resources on the mapping buffer can be released. We
 +       * indicate what the callback should do via the uptodate variable, same
 +       * as for normal BH based IO completions.
 +       */
        error = dax_insert_mapping(inode, &bh, vma, vmf);
 +      if (buffer_unwritten(&bh))
 +              complete_unwritten(&bh, !error);
  
   out:
        if (error == -ENOMEM)
        }
        goto out;
  }
 +EXPORT_SYMBOL(__dax_fault);
  
  /**
   * dax_fault - handle a page fault on a DAX file
   * fault handler for DAX files.
   */
  int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 -                      get_block_t get_block)
 +            get_block_t get_block, dax_iodone_t complete_unwritten)
  {
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
 -      result = do_dax_fault(vma, vmf, get_block);
 +      result = __dax_fault(vma, vmf, get_block, complete_unwritten);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);
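
[ Note: a hedged sketch of how a filesystem's .fault handler might
  adopt the new dax_fault() signature; myfs_get_block (a get_block_t)
  and myfs_end_io (a dax_iodone_t that converts unwritten extents) are
  hypothetical placeholders, not part of this diff: ]

	static int myfs_filemap_fault(struct vm_area_struct *vma,
				      struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vma->vm_file);

		if (IS_DAX(inode))
			return dax_fault(vma, vmf, myfs_get_block,
					 myfs_end_io);
		return filemap_fault(vma, vmf);
	}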
  
diff --combined fs/dcache.c
index 910968b4b6bf74c53003b1ab81feeaa7ba859d80,c4ce35110704d8cf84289c92152c7dd3ed64b6b2..7a3f3e5f9ceabfc4cad41a5d5478258f7e817dc1
@@@ -322,17 -322,17 +322,17 @@@ static void dentry_free(struct dentry *
  }
  
  /**
 - * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
 + * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
   * @dentry: the target dentry
   * After this call, in-progress rcu-walk path lookup will fail. This
   * should be called after unhashing, and after changing d_inode (if
   * the dentry has not already been unhashed).
   */
 -static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
 +static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
  {
 -      assert_spin_locked(&dentry->d_lock);
 -      /* Go through a barrier */
 -      write_seqcount_barrier(&dentry->d_seq);
 +      lockdep_assert_held(&dentry->d_lock);
 +      /* Go through an invalidation barrier */
 +      write_seqcount_invalidate(&dentry->d_seq);
  }
  
  /*
@@@ -372,7 -372,7 +372,7 @@@ static void dentry_unlink_inode(struct 
        struct inode *inode = dentry->d_inode;
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
 -      dentry_rcuwalk_barrier(dentry);
 +      dentry_rcuwalk_invalidate(dentry);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        if (!inode->i_nlink)
@@@ -494,7 -494,7 +494,7 @@@ void __d_drop(struct dentry *dentry
                __hlist_bl_del(&dentry->d_hash);
                dentry->d_hash.pprev = NULL;
                hlist_bl_unlock(b);
 -              dentry_rcuwalk_barrier(dentry);
 +              dentry_rcuwalk_invalidate(dentry);
        }
  }
  EXPORT_SYMBOL(__d_drop);
@@@ -1673,7 -1673,8 +1673,8 @@@ void d_set_d_op(struct dentry *dentry, 
                                DCACHE_OP_COMPARE       |
                                DCACHE_OP_REVALIDATE    |
                                DCACHE_OP_WEAK_REVALIDATE       |
-                               DCACHE_OP_DELETE ));
+                               DCACHE_OP_DELETE        |
+                               DCACHE_OP_SELECT_INODE));
        dentry->d_op = op;
        if (!op)
                return;
                dentry->d_flags |= DCACHE_OP_DELETE;
        if (op->d_prune)
                dentry->d_flags |= DCACHE_OP_PRUNE;
+       if (op->d_select_inode)
+               dentry->d_flags |= DCACHE_OP_SELECT_INODE;
  
  }
  EXPORT_SYMBOL(d_set_d_op);
@@@ -1752,7 -1755,7 +1755,7 @@@ static void __d_instantiate(struct dent
        if (inode)
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        __d_set_inode_and_type(dentry, inode, add_flags);
 -      dentry_rcuwalk_barrier(dentry);
 +      dentry_rcuwalk_invalidate(dentry);
        spin_unlock(&dentry->d_lock);
        fsnotify_d_instantiate(dentry, inode);
  }
@@@ -2927,6 -2930,17 +2930,6 @@@ restart
                                vfsmnt = &mnt->mnt;
                                continue;
                        }
 -                      /*
 -                       * Filesystems needing to implement special "root names"
 -                       * should do so with ->d_dname()
 -                       */
 -                      if (IS_ROOT(dentry) &&
 -                         (dentry->d_name.len != 1 ||
 -                          dentry->d_name.name[0] != '/')) {
 -                              WARN(1, "Root dentry has weird name <%.*s>\n",
 -                                   (int) dentry->d_name.len,
 -                                   dentry->d_name.name);
 -                      }
                        if (!error)
                                error = is_mounted(vfsmnt) ? 1 : 2;
                        break;
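
[ Note: the barrier -> invalidate rename is about intent: the writer
  bumps d_seq so concurrent rcu-walk lookups notice and retry.  Roughly
  the read side being defeated; peek_inode() is a hypothetical
  illustration, the real path walk uses variants of these seqcount
  helpers: ]

	static struct inode *peek_inode(struct dentry *dentry)
	{
		struct inode *inode;
		unsigned seq;

		do {
			seq = read_seqcount_begin(&dentry->d_seq);
			inode = dentry->d_inode;	/* speculative read */
		} while (read_seqcount_retry(&dentry->d_seq, seq));
		return inode;
	}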
diff --combined fs/debugfs/inode.c
index d6d1cf004123385e49750bd3242b417d09215ae4,ef86ad6bdc3ee952d8171905860af9c9767fa60f..c711be8d6a3cc71a598a92a82026a59f05eccd90
@@@ -44,11 -44,6 +44,6 @@@ static struct inode *debugfs_get_inode(
        return inode;
  }
  
- static inline int debugfs_positive(struct dentry *dentry)
- {
-       return d_really_is_positive(dentry) && !d_unhashed(dentry);
- }
  struct debugfs_mount_opts {
        kuid_t uid;
        kgid_t gid;
@@@ -522,7 -517,7 +517,7 @@@ static int __debugfs_remove(struct dent
  {
        int ret = 0;
  
-       if (debugfs_positive(dentry)) {
+       if (simple_positive(dentry)) {
                dget(dentry);
                if (d_is_dir(dentry))
                        ret = simple_rmdir(d_inode(parent), dentry);
@@@ -602,7 -597,7 +597,7 @@@ void debugfs_remove_recursive(struct de
         */
        spin_lock(&parent->d_lock);
        list_for_each_entry(child, &parent->d_subdirs, d_child) {
-               if (!debugfs_positive(child))
+               if (!simple_positive(child))
                        continue;
  
                /* perhaps simple_empty(child) makes more sense */
                 * from d_subdirs. When releasing the parent->d_lock we can
                 * no longer trust that the next pointer is valid.
                 * Restart the loop. We'll skip this one with the
-                * debugfs_positive() check.
+                * simple_positive() check.
                 */
                goto loop;
        }
@@@ -716,17 -711,20 +711,17 @@@ bool debugfs_initialized(void
  }
  EXPORT_SYMBOL_GPL(debugfs_initialized);
  
 -
 -static struct kobject *debug_kobj;
 -
  static int __init debugfs_init(void)
  {
        int retval;
  
 -      debug_kobj = kobject_create_and_add("debug", kernel_kobj);
 -      if (!debug_kobj)
 -              return -EINVAL;
 +      retval = sysfs_create_mount_point(kernel_kobj, "debug");
 +      if (retval)
 +              return retval;
  
        retval = register_filesystem(&debug_fs_type);
        if (retval)
 -              kobject_put(debug_kobj);
 +              sysfs_remove_mount_point(kernel_kobj, "debug");
        else
                debugfs_registered = true;
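
[ Note: debugfs_positive() above, tracefs_positive() and libfs's
  private simple_positive() elsewhere in this diff were three copies of
  the same test; the series replaces them with one public helper.  Its
  shape, per the copies being removed (return type hedged, the statics
  used int): ]

	static inline bool simple_positive(struct dentry *dentry)
	{
		/* has an inode and has not been unhashed (deleted) */
		return d_really_is_positive(dentry) && !d_unhashed(dentry);
	}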
  
diff --combined fs/ext4/super.c
index 5c787647afe2a3817dfbf49949e195218199c8f6,0ae853d2e1f141cb2bde05dde7f089b6eff98c09..58987b5c514b2baf433801dee0525d05c4525906
@@@ -24,7 -24,6 +24,7 @@@
  #include <linux/slab.h>
  #include <linux/init.h>
  #include <linux/blkdev.h>
 +#include <linux/backing-dev.h>
  #include <linux/parser.h>
  #include <linux/buffer_head.h>
  #include <linux/exportfs.h>
@@@ -295,8 -294,6 +295,8 @@@ static void __save_error_info(struct su
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
  
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 +      if (bdev_read_only(sb->s_bdev))
 +              return;
        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
        es->s_last_error_time = cpu_to_le32(get_seconds());
        strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
@@@ -452,7 -449,7 +452,7 @@@ void __ext4_error_file(struct file *fil
        es = EXT4_SB(inode->i_sb)->s_es;
        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
        if (ext4_error_ratelimit(inode->i_sb)) {
-               path = d_path(&(file->f_path), pathname, sizeof(pathname));
+               path = file_path(file, pathname, sizeof(pathname));
                if (IS_ERR(path))
                        path = "(unknown)";
                va_start(args, fmt);
@@@ -592,17 -589,14 +592,17 @@@ void __ext4_msg(struct super_block *sb
        va_end(args);
  }
  
 +#define ext4_warning_ratelimit(sb)                                    \
 +              ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
 +                           "EXT4-fs warning")
 +
  void __ext4_warning(struct super_block *sb, const char *function,
                    unsigned int line, const char *fmt, ...)
  {
        struct va_format vaf;
        va_list args;
  
 -      if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
 -                        "EXT4-fs warning"))
 +      if (!ext4_warning_ratelimit(sb))
                return;
  
        va_start(args, fmt);
        va_end(args);
  }
  
 +void __ext4_warning_inode(const struct inode *inode, const char *function,
 +                        unsigned int line, const char *fmt, ...)
 +{
 +      struct va_format vaf;
 +      va_list args;
 +
 +      if (!ext4_warning_ratelimit(inode->i_sb))
 +              return;
 +
 +      va_start(args, fmt);
 +      vaf.fmt = fmt;
 +      vaf.va = &args;
 +      printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
 +             "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
 +             function, line, inode->i_ino, current->comm, &vaf);
 +      va_end(args);
 +}
 +
  void __ext4_grp_locked_error(const char *function, unsigned int line,
                             struct super_block *sb, ext4_group_t grp,
                             unsigned long ino, ext4_fsblk_t block,
@@@ -829,7 -805,6 +829,7 @@@ static void ext4_put_super(struct super
                dump_orphan_list(sb, sbi);
        J_ASSERT(list_empty(&sbi->s_orphan));
  
 +      sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);
        if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
                /*
@@@ -902,8 -877,9 +902,8 @@@ static struct inode *ext4_alloc_inode(s
        atomic_set(&ei->i_unwritten, 0);
        INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
 -      ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
 +      ei->i_crypt_info = NULL;
  #endif
 -
        return &ei->vfs_inode;
  }
  
@@@ -980,10 -956,6 +980,10 @@@ void ext4_clear_inode(struct inode *ino
                jbd2_free_inode(EXT4_I(inode)->jinode);
                EXT4_I(inode)->jinode = NULL;
        }
 +#ifdef CONFIG_EXT4_FS_ENCRYPTION
 +      if (EXT4_I(inode)->i_crypt_info)
 +              ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info);
 +#endif
  }
  
  static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@@ -3446,6 -3418,7 +3446,6 @@@ static int ext4_fill_super(struct super
        unsigned long journal_devnum = 0;
        unsigned long def_mount_opts;
        struct inode *root;
 -      char *cp;
        const char *descr;
        int ret = -ENOMEM;
        int blocksize, clustersize;
        if (sb->s_bdev->bd_part)
                sbi->s_sectors_written_start =
                        part_stat_read(sb->s_bdev->bd_part, sectors[1]);
 -#ifdef CONFIG_EXT4_FS_ENCRYPTION
 -      /* Modes of operations for file and directory encryption. */
 -      sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
 -      sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
 -#endif
  
        /* Cleanup superblock name */
 -      for (cp = sb->s_id; (cp = strchr(cp, '/'));)
 -              *cp = '!';
 +      strreplace(sb->s_id, '/', '!');
  
        /* -EINVAL is default */
        ret = -EINVAL;
@@@ -4086,15 -4065,7 +4086,15 @@@ no_journal
                }
        }
  
 -      if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
 +      if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
 +           EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
 +          (blocksize != PAGE_CACHE_SIZE)) {
 +              ext4_msg(sb, KERN_ERR,
 +                       "Unsupported blocksize for fs encryption");
 +              goto failed_mount_wq;
 +      }
 +
 +      if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
            !(sb->s_flags & MS_RDONLY) &&
            !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
@@@ -4970,9 -4941,6 +4970,9 @@@ static int ext4_remount(struct super_bl
                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
        }
  
 +      if (*flags & MS_LAZYTIME)
 +              sb->s_flags |= MS_LAZYTIME;
 +
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
                if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
                        err = -EROFS;
@@@ -5440,7 -5408,6 +5440,7 @@@ static ssize_t ext4_quota_write(struct 
        struct inode *inode = sb_dqopt(sb)->files[type];
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int err, offset = off & (sb->s_blocksize - 1);
 +      int retries = 0;
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();
  
                return -EIO;
        }
  
 -      bh = ext4_bread(handle, inode, blk, 1);
 +      do {
 +              bh = ext4_bread(handle, inode, blk,
 +                              EXT4_GET_BLOCKS_CREATE |
 +                              EXT4_GET_BLOCKS_METADATA_NOFAIL);
 +      } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
 +               ext4_should_retry_alloc(inode->i_sb, &retries));
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        if (!bh)
@@@ -5683,7 -5645,6 +5683,7 @@@ out7
  
  static void __exit ext4_exit_fs(void)
  {
 +      ext4_exit_crypto();
        ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
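
[ Note: the superblock-name cleanup above swaps an open-coded strchr()
  loop for the then-new strreplace() string helper.  A sketch of the
  equivalence (return value hedged; it is a pointer into the string,
  not a replacement count): ]

	/* before: */
	char *cp;

	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
		*cp = '!';

	/* after: rewrite every '/' as '!' in place */
	strreplace(sb->s_id, '/', '!');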
diff --combined fs/fuse/file.c
index 014fa8ba2b5189e923557c446be8f150921f9b28,1344647965dc09f83bb67920e801cafbf1e7b564..f523f2f04c196db5b1201a38a6e3222ae40d1724
@@@ -96,17 -96,17 +96,17 @@@ static void fuse_file_put(struct fuse_f
                         * Drop the release request when client does not
                         * implement 'open'
                         */
 -                      req->background = 0;
 +                      __clear_bit(FR_BACKGROUND, &req->flags);
                        iput(req->misc.release.inode);
                        fuse_put_request(ff->fc, req);
                } else if (sync) {
 -                      req->background = 0;
 +                      __clear_bit(FR_BACKGROUND, &req->flags);
                        fuse_request_send(ff->fc, req);
                        iput(req->misc.release.inode);
                        fuse_put_request(ff->fc, req);
                } else {
                        req->end = fuse_release_end;
 -                      req->background = 1;
 +                      __set_bit(FR_BACKGROUND, &req->flags);
                        fuse_request_send_background(ff->fc, req);
                }
                kfree(ff);
@@@ -299,8 -299,8 +299,8 @@@ void fuse_sync_release(struct fuse_fil
  {
        WARN_ON(atomic_read(&ff->count) > 1);
        fuse_prepare_release(ff, flags, FUSE_RELEASE);
 -      ff->reserved_req->force = 1;
 -      ff->reserved_req->background = 0;
 +      __set_bit(FR_FORCE, &ff->reserved_req->flags);
 +      __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
        fuse_request_send(ff->fc, ff->reserved_req);
        fuse_put_request(ff->fc, ff->reserved_req);
        kfree(ff);
@@@ -426,7 -426,7 +426,7 @@@ static int fuse_flush(struct file *file
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
 -      req->force = 1;
 +      __set_bit(FR_FORCE, &req->flags);
        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
@@@ -1169,7 -1169,7 +1169,7 @@@ static ssize_t fuse_file_write_iter(str
        if (err <= 0)
                goto out;
  
-       err = file_remove_suid(file);
+       err = file_remove_privs(file);
        if (err)
                goto out;
  
@@@ -1445,9 -1445,9 +1445,9 @@@ static void fuse_writepage_finish(struc
  
        list_del(&req->writepages_entry);
        for (i = 0; i < req->num_pages; i++) {
 -              dec_bdi_stat(bdi, BDI_WRITEBACK);
 +              dec_wb_stat(&bdi->wb, WB_WRITEBACK);
                dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
 -              bdi_writeout_inc(bdi);
 +              wb_writeout_inc(&bdi->wb);
        }
        wake_up(&fi->page_waitq);
  }
@@@ -1611,8 -1611,7 +1611,8 @@@ static int fuse_writepage_locked(struc
        if (!req)
                goto err;
  
 -      req->background = 1; /* writeback always goes to bg_queue */
 +      /* writeback always goes to bg_queue */
 +      __set_bit(FR_BACKGROUND, &req->flags);
        tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
        if (!tmp_page)
                goto err_free;
        req->end = fuse_writepage_end;
        req->inode = inode;
  
 -      inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
 +      inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
  
        spin_lock(&fc->lock);
@@@ -1743,15 -1742,16 +1743,15 @@@ static bool fuse_writepage_in_flight(st
                }
        }
  
 -      if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
 -                                      old_req->state == FUSE_REQ_PENDING)) {
 +      if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
                struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
  
                copy_highpage(old_req->pages[0], page);
                spin_unlock(&fc->lock);
  
 -              dec_bdi_stat(bdi, BDI_WRITEBACK);
 +              dec_wb_stat(&bdi->wb, WB_WRITEBACK);
                dec_zone_page_state(page, NR_WRITEBACK_TEMP);
 -              bdi_writeout_inc(bdi);
 +              wb_writeout_inc(&bdi->wb);
                fuse_writepage_free(fc, new_req);
                fuse_request_free(new_req);
                goto out;
@@@ -1830,7 -1830,7 +1830,7 @@@ static int fuse_writepages_fill(struct 
                req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
                req->misc.write.next = NULL;
                req->in.argpages = 1;
 -              req->background = 1;
 +              __set_bit(FR_BACKGROUND, &req->flags);
                req->num_pages = 0;
                req->end = fuse_writepage_end;
                req->inode = inode;
        req->page_descs[req->num_pages].offset = 0;
        req->page_descs[req->num_pages].length = PAGE_SIZE;
  
 -      inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
 +      inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
  
        err = 0;
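
[ Note: the fuse hunks convert struct fuse_req's int bitfields
  (background, force) and its state machine into FR_* bits in a flags
  word, manipulated with the standard bitops; handle_pending() below is
  a hypothetical stand-in: ]

	__set_bit(FR_BACKGROUND, &req->flags);	/* was: req->background = 1 */
	__set_bit(FR_FORCE, &req->flags);	/* was: req->force = 1 */

	/* was: old_req->state == FUSE_REQ_PENDING */
	if (test_bit(FR_PENDING, &req->flags))
		handle_pending(req);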
diff --combined fs/inode.c
index 069721f0cc0e0b733bb659fb0d7836cd71499690,648e71ce6ec216358e6733c2d6e0141c3d90c6f6..d30640f7a193879d07f4ff2c12efe3b817a386f3
@@@ -224,7 -224,6 +224,7 @@@ EXPORT_SYMBOL(free_inode_nonrcu)
  void __destroy_inode(struct inode *inode)
  {
        BUG_ON(inode_has_buffers(inode));
 +      inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode->i_flctx);
@@@ -841,7 -840,11 +841,11 @@@ unsigned int get_next_ino(void
        }
  #endif
  
-       *p = ++res;
+       res++;
+       /* get_next_ino should not provide a 0 inode number */
+       if (unlikely(!res))
+               res++;
+       *p = res;
        put_cpu_var(last_ino);
        return res;
  }
@@@ -1674,7 -1677,31 +1678,31 @@@ int should_remove_suid(struct dentry *d
  }
  EXPORT_SYMBOL(should_remove_suid);
  
- static int __remove_suid(struct dentry *dentry, int kill)
+ /*
+  * Return mask of changes for notify_change() that need to be done as a
+  * response to write or truncate. Return 0 if nothing has to be changed.
+  * Negative value on error (change should be denied).
+  */
+ int dentry_needs_remove_privs(struct dentry *dentry)
+ {
+       struct inode *inode = d_inode(dentry);
+       int mask = 0;
+       int ret;
+       if (IS_NOSEC(inode))
+               return 0;
+       mask = should_remove_suid(dentry);
+       ret = security_inode_need_killpriv(dentry);
+       if (ret < 0)
+               return ret;
+       if (ret)
+               mask |= ATTR_KILL_PRIV;
+       return mask;
+ }
+ EXPORT_SYMBOL(dentry_needs_remove_privs);
+ static int __remove_privs(struct dentry *dentry, int kill)
  {
        struct iattr newattrs;
  
        return notify_change(dentry, &newattrs, NULL);
  }
  
- int file_remove_suid(struct file *file)
+ /*
+  * Remove special file privileges (suid, capabilities) when file is written
+  * to or truncated.
+  */
+ int file_remove_privs(struct file *file)
  {
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = d_inode(dentry);
-       int killsuid;
-       int killpriv;
+       int kill;
        int error = 0;
  
        /* Fast path for nothing security related */
        if (IS_NOSEC(inode))
                return 0;
  
-       killsuid = should_remove_suid(dentry);
-       killpriv = security_inode_need_killpriv(dentry);
-       if (killpriv < 0)
-               return killpriv;
-       if (killpriv)
-               error = security_inode_killpriv(dentry);
-       if (!error && killsuid)
-               error = __remove_suid(dentry, killsuid);
-       if (!error && (inode->i_sb->s_flags & MS_NOSEC))
-               inode->i_flags |= S_NOSEC;
+       kill = file_needs_remove_privs(file);
+       if (kill < 0)
+               return kill;
+       if (kill)
+               error = __remove_privs(dentry, kill);
+       if (!error)
+               inode_has_no_xattr(inode);
  
        return error;
  }
- EXPORT_SYMBOL(file_remove_suid);
+ EXPORT_SYMBOL(file_remove_privs);
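
[ Note: file_needs_remove_privs() is called above but not shown in this
  hunk; presumably it is a thin wrapper over the newly exported
  dentry_needs_remove_privs(), something like: ]

	static int file_needs_remove_privs(struct file *file)
	{
		/* returns the ATTR_* kill mask, 0, or a negative error */
		return dentry_needs_remove_privs(file->f_path.dentry);
	}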
  
  /**
   *    file_update_time        -       update mtime and ctime time
@@@ -1967,9 -1993,8 +1994,8 @@@ EXPORT_SYMBOL(inode_dio_wait)
   * inode is being instantiated).  The reason for the cmpxchg() loop
   * --- which wouldn't be necessary if all code paths which modify
   * i_flags actually followed this rule, is that there is at least one
-  * code path which doesn't today --- for example,
-  * __generic_file_aio_write() calls file_remove_suid() without holding
-  * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+  * code path which doesn't today so we use cmpxchg() out of an abundance
+  * of caution.
   *
   * In the long run, i_mutex is overkill, and we should probably look
   * at using the i_lock spinlock to protect i_flags, and then make sure
diff --combined fs/libfs.c
index 88a4cb418756c29c5273432ac0f364c340c7f6f7,4d9e6c118fe15d5926ba344b0220e8c49673d9aa..102edfd39000c15f14594a47b6236ba8edb5ccc8
  
  #include "internal.h"
  
- static inline int simple_positive(struct dentry *dentry)
- {
-       return d_really_is_positive(dentry) && !d_unhashed(dentry);
- }
  int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
                   struct kstat *stat)
  {
@@@ -1108,98 -1103,3 +1103,98 @@@ const struct inode_operations simple_sy
        .readlink = generic_readlink
  };
  EXPORT_SYMBOL(simple_symlink_inode_operations);
 +
 +/*
 + * Operations for a permanently empty directory.
 + */
 +static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 +{
 +      return ERR_PTR(-ENOENT);
 +}
 +
 +static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
 +                               struct kstat *stat)
 +{
 +      struct inode *inode = d_inode(dentry);
 +      generic_fillattr(inode, stat);
 +      return 0;
 +}
 +
 +static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
 +{
 +      return -EPERM;
 +}
 +
 +static int empty_dir_setxattr(struct dentry *dentry, const char *name,
 +                            const void *value, size_t size, int flags)
 +{
 +      return -EOPNOTSUPP;
 +}
 +
 +static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name,
 +                                void *value, size_t size)
 +{
 +      return -EOPNOTSUPP;
 +}
 +
 +static int empty_dir_removexattr(struct dentry *dentry, const char *name)
 +{
 +      return -EOPNOTSUPP;
 +}
 +
 +static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
 +{
 +      return -EOPNOTSUPP;
 +}
 +
 +static const struct inode_operations empty_dir_inode_operations = {
 +      .lookup         = empty_dir_lookup,
 +      .permission     = generic_permission,
 +      .setattr        = empty_dir_setattr,
 +      .getattr        = empty_dir_getattr,
 +      .setxattr       = empty_dir_setxattr,
 +      .getxattr       = empty_dir_getxattr,
 +      .removexattr    = empty_dir_removexattr,
 +      .listxattr      = empty_dir_listxattr,
 +};
 +
 +static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
 +{
 +      /* An empty directory has two entries . and .. at offsets 0 and 1 */
 +      return generic_file_llseek_size(file, offset, whence, 2, 2);
 +}
 +
 +static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
 +{
 +      dir_emit_dots(file, ctx);
 +      return 0;
 +}
 +
 +static const struct file_operations empty_dir_operations = {
 +      .llseek         = empty_dir_llseek,
 +      .read           = generic_read_dir,
 +      .iterate        = empty_dir_readdir,
 +      .fsync          = noop_fsync,
 +};
 +
 +
 +void make_empty_dir_inode(struct inode *inode)
 +{
 +      set_nlink(inode, 2);
 +      inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
 +      inode->i_uid = GLOBAL_ROOT_UID;
 +      inode->i_gid = GLOBAL_ROOT_GID;
 +      inode->i_rdev = 0;
 +      inode->i_size = 2;
 +      inode->i_blkbits = PAGE_SHIFT;
 +      inode->i_blocks = 0;
 +
 +      inode->i_op = &empty_dir_inode_operations;
 +      inode->i_fop = &empty_dir_operations;
 +}
 +
 +bool is_empty_dir_inode(struct inode *inode)
 +{
 +      return (inode->i_fop == &empty_dir_operations) &&
 +              (inode->i_op == &empty_dir_inode_operations);
 +}
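
[ Note: a hedged usage sketch for the new permanently-empty-directory
  helpers, e.g. for a pseudo-filesystem directory that must stay safe
  to use as a mount point; myfs_make_mountpoint() and the dcache wiring
  around it are hypothetical: ]

	static int myfs_make_mountpoint(struct super_block *sb)
	{
		struct inode *inode = new_inode(sb);

		if (!inode)
			return -ENOMEM;
		inode->i_ino = get_next_ino();
		/* lookups fail with -ENOENT, readdir only emits . and .. */
		make_empty_dir_inode(inode);
		WARN_ON(!is_empty_dir_inode(inode));
		return 0;	/* d_instantiate() etc. omitted here */
	}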
diff --combined fs/nfs/dir.c
index 21457bb0edd62b42af307d5850b336711f41e82f,b9108f4254a70477d14e9f11a3442aaed56cecbe..547308a5ec6f4a738006370e523c751c90927e1b
@@@ -1470,6 -1470,9 +1470,6 @@@ static int nfs_finish_open(struct nfs_o
  {
        int err;
  
 -      if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 -              *opened |= FILE_CREATED;
 -
        err = finish_open(file, dentry, do_open, opened);
        if (err)
                goto out;
@@@ -1768,7 -1771,7 +1768,7 @@@ EXPORT_SYMBOL_GPL(nfs_mkdir)
  
  static void nfs_dentry_handle_enoent(struct dentry *dentry)
  {
-       if (d_really_is_positive(dentry) && !d_unhashed(dentry))
+       if (simple_positive(dentry))
                d_delete(dentry);
  }
  
diff --combined fs/ntfs/file.c
index 2cd65367076458e84532eebcb67869944e3aa327,182bb93aa79cceb5bfe6312d1b5c104d3ed82657..262561fea923aa2315cffe91af91d12b399ded8c
@@@ -382,7 -382,7 +382,7 @@@ static ssize_t ntfs_prepare_file_for_wr
        base_ni = ni;
        if (NInoAttr(ni))
                base_ni = ni->ext.base_ntfs_ino;
-       err = file_remove_suid(file);
+       err = file_remove_privs(file);
        if (unlikely(err))
                goto out;
        /*
@@@ -525,8 -525,7 +525,8 @@@ static inline int __ntfs_grab_cache_pag
                                }
                        }
                        err = add_to_page_cache_lru(*cached_page, mapping,
 -                                      index, GFP_KERNEL);
 +                                      index,
 +                                      GFP_KERNEL & mapping_gfp_mask(mapping));
                        if (unlikely(err)) {
                                if (err == -EEXIST)
                                        continue;
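
[ Note: ANDing GFP_KERNEL with mapping_gfp_mask() lets the
  address_space veto allocation modes it cannot tolerate (for example
  mappings that clear __GFP_FS to avoid recursion).  The pattern,
  restated outside the ntfs loop: ]

	gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
	int err = add_to_page_cache_lru(page, mapping, index, gfp);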
diff --combined fs/overlayfs/super.c
index 8a08c582bc22e400a16f395609200ed105d3933f,84c5e27fbfd9c6bd7dcd4467f47c018af33a5614..7466ff339c667ea63ead6bf04f18d5662ef3d142
@@@ -273,57 -273,11 +273,58 @@@ static void ovl_dentry_release(struct d
        }
  }
  
 +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 +{
 +      struct ovl_entry *oe = dentry->d_fsdata;
 +      unsigned int i;
 +      int ret = 1;
 +
 +      for (i = 0; i < oe->numlower; i++) {
 +              struct dentry *d = oe->lowerstack[i].dentry;
 +
 +              if (d->d_flags & DCACHE_OP_REVALIDATE) {
 +                      ret = d->d_op->d_revalidate(d, flags);
 +                      if (ret < 0)
 +                              return ret;
 +                      if (!ret) {
 +                              if (!(flags & LOOKUP_RCU))
 +                                      d_invalidate(d);
 +                              return -ESTALE;
 +                      }
 +              }
 +      }
 +      return 1;
 +}
 +
 +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
 +{
 +      struct ovl_entry *oe = dentry->d_fsdata;
 +      unsigned int i;
 +      int ret = 1;
 +
 +      for (i = 0; i < oe->numlower; i++) {
 +              struct dentry *d = oe->lowerstack[i].dentry;
 +
 +              if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
 +                      ret = d->d_op->d_weak_revalidate(d, flags);
 +                      if (ret <= 0)
 +                              break;
 +              }
 +      }
 +      return ret;
 +}
 +
  static const struct dentry_operations ovl_dentry_operations = {
        .d_release = ovl_dentry_release,
+       .d_select_inode = ovl_d_select_inode,
  };
  
 +static const struct dentry_operations ovl_reval_dentry_operations = {
 +      .d_release = ovl_dentry_release,
 +      .d_revalidate = ovl_dentry_revalidate,
 +      .d_weak_revalidate = ovl_dentry_weak_revalidate,
 +};
 +
  static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
  {
        size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
        return oe;
  }
  
 +static bool ovl_dentry_remote(struct dentry *dentry)
 +{
 +      return dentry->d_flags &
 +              (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
 +}
 +
 +static bool ovl_dentry_weird(struct dentry *dentry)
 +{
 +      return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
 +                                DCACHE_MANAGE_TRANSIT |
 +                                DCACHE_OP_HASH |
 +                                DCACHE_OP_COMPARE);
 +}
 +
  static inline struct dentry *ovl_lookup_real(struct dentry *dir,
                                             struct qstr *name)
  {
        } else if (!dentry->d_inode) {
                dput(dentry);
                dentry = NULL;
 +      } else if (ovl_dentry_weird(dentry)) {
 +              dput(dentry);
 +              /* Don't support traversing automounts and other weirdness */
 +              dentry = ERR_PTR(-EREMOTE);
        }
        return dentry;
  }
@@@ -415,11 -351,6 +416,11 @@@ struct dentry *ovl_lookup(struct inode 
                        goto out;
  
                if (this) {
 +                      if (unlikely(ovl_dentry_remote(this))) {
 +                              dput(this);
 +                              err = -EREMOTE;
 +                              goto out;
 +                      }
                        if (ovl_is_whiteout(this)) {
                                dput(this);
                                this = NULL;
@@@ -599,7 -530,7 +600,7 @@@ static int ovl_remount(struct super_blo
  {
        struct ovl_fs *ufs = sb->s_fs_info;
  
 -      if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
 +      if (!(*flags & MS_RDONLY) && (!ufs->upper_mnt || !ufs->workdir))
                return -EROFS;
  
        return 0;
@@@ -764,6 -695,25 +765,6 @@@ static void ovl_unescape(char *s
        }
  }
  
 -static bool ovl_is_allowed_fs_type(struct dentry *root)
 -{
 -      const struct dentry_operations *dop = root->d_op;
 -
 -      /*
 -       * We don't support:
 -       *  - automount filesystems
 -       *  - filesystems with revalidate (FIXME for lower layer)
 -       *  - filesystems with case insensitive names
 -       */
 -      if (dop &&
 -          (dop->d_manage || dop->d_automount ||
 -           dop->d_revalidate || dop->d_weak_revalidate ||
 -           dop->d_compare || dop->d_hash)) {
 -              return false;
 -      }
 -      return true;
 -}
 -
  static int ovl_mount_dir_noesc(const char *name, struct path *path)
  {
        int err = -EINVAL;
                goto out;
        }
        err = -EINVAL;
 -      if (!ovl_is_allowed_fs_type(path->dentry)) {
 +      if (ovl_dentry_weird(path->dentry)) {
                pr_err("overlayfs: filesystem on '%s' not supported\n", name);
                goto out_put;
        }
@@@ -802,21 -752,13 +803,21 @@@ static int ovl_mount_dir(const char *na
        if (tmp) {
                ovl_unescape(tmp);
                err = ovl_mount_dir_noesc(tmp, path);
 +
 +              if (!err)
 +                      if (ovl_dentry_remote(path->dentry)) {
 +                              pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n",
 +                                     tmp);
 +                              path_put(path);
 +                              err = -EINVAL;
 +                      }
                kfree(tmp);
        }
        return err;
  }
  
  static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
 -                       int *stack_depth)
 +                       int *stack_depth, bool *remote)
  {
        int err;
        struct kstatfs statfs;
        *namelen = max(*namelen, statfs.f_namelen);
        *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
  
 +      if (ovl_dentry_remote(path->dentry))
 +              *remote = true;
 +
        return 0;
  
  out_put:
@@@ -889,7 -828,6 +890,7 @@@ static int ovl_fill_super(struct super_
        unsigned int numlower;
        unsigned int stacklen = 0;
        unsigned int i;
 +      bool remote = false;
        int err;
  
        err = -ENOMEM;
        lower = lowertmp;
        for (numlower = 0; numlower < stacklen; numlower++) {
                err = ovl_lower_dir(lower, &stack[numlower],
 -                                  &ufs->lower_namelen, &sb->s_stack_depth);
 +                                  &ufs->lower_namelen, &sb->s_stack_depth,
 +                                  &remote);
                if (err)
                        goto out_put_lowerpath;
  
                ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
                err = PTR_ERR(ufs->workdir);
                if (IS_ERR(ufs->workdir)) {
 -                      pr_err("overlayfs: failed to create directory %s/%s\n",
 -                             ufs->config.workdir, OVL_WORKDIR_NAME);
 -                      goto out_put_upper_mnt;
 +                      pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n",
 +                              ufs->config.workdir, OVL_WORKDIR_NAME, -err);
 +                      sb->s_flags |= MS_RDONLY;
 +                      ufs->workdir = NULL;
                }
        }
  
        if (!ufs->upper_mnt)
                sb->s_flags |= MS_RDONLY;
  
 -      sb->s_d_op = &ovl_dentry_operations;
 +      if (remote)
 +              sb->s_d_op = &ovl_reval_dentry_operations;
 +      else
 +              sb->s_d_op = &ovl_dentry_operations;
  
        err = -ENOMEM;
        oe = ovl_alloc_entry(numlower);
@@@ -1065,6 -998,7 +1066,6 @@@ out_put_lower_mnt
        kfree(ufs->lower_mnt);
  out_put_workdir:
        dput(ufs->workdir);
 -out_put_upper_mnt:
        mntput(ufs->upper_mnt);
  out_put_lowerpath:
        for (i = 0; i < numlower; i++)
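
[ Note: the DCACHE_OP_* bits in d_flags mirror which d_op methods a
  dentry carries, so "is this lower layer remote or weird?" is a cheap
  flags test; the mount path then picks one dentry_operations table for
  the whole superblock, since s_d_op applies to every dentry created
  under it.  Condensed from the hunks above: ]

	if (ovl_dentry_remote(stack[i].dentry))
		remote = true;	/* some lower layer has revalidate hooks */

	/* after scanning all layers, choose the sb-wide dentry ops once */
	sb->s_d_op = remote ? &ovl_reval_dentry_operations
			    : &ovl_dentry_operations;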
diff --combined fs/seq_file.c
index 1d9c1cbd4d0b4a1a17dd924e876c854b8d722169,d8a0545ad7ea43c08eefa531ab58b59073e6c422..ce9e39fd5dafc768c27b2ceaa4e69a02c3ed1e6e
@@@ -48,21 -48,18 +48,21 @@@ static void *seq_buf_alloc(unsigned lon
   *    ERR_PTR(error).  In the end of sequence they return %NULL. ->show()
   *    returns 0 in case of success and negative number in case of error.
   *    Returning SEQ_SKIP means "discard this element and move on".
 + *    Note: seq_open() will allocate a struct seq_file and store its
 + *    pointer in @file->private_data. This pointer should not be modified.
   */
  int seq_open(struct file *file, const struct seq_operations *op)
  {
 -      struct seq_file *p = file->private_data;
 +      struct seq_file *p;
 +
 +      WARN_ON(file->private_data);
 +
 +      p = kzalloc(sizeof(*p), GFP_KERNEL);
 +      if (!p)
 +              return -ENOMEM;
 +
 +      file->private_data = p;
  
 -      if (!p) {
 -              p = kmalloc(sizeof(*p), GFP_KERNEL);
 -              if (!p)
 -                      return -ENOMEM;
 -              file->private_data = p;
 -      }
 -      memset(p, 0, sizeof(*p));
        mutex_init(&p->lock);
        p->op = op;
  #ifdef CONFIG_USER_NS
@@@ -490,6 -487,20 +490,20 @@@ int seq_path(struct seq_file *m, const 
  }
  EXPORT_SYMBOL(seq_path);
  
+ /**
+  * seq_file_path - seq_file interface to print a pathname of a file
+  * @m: the seq_file handle
+  * @file: the struct file to print
+  * @esc: set of characters to escape in the output
+  *
+  * return the absolute path to the file.
+  */
+ int seq_file_path(struct seq_file *m, struct file *file, const char *esc)
+ {
+       return seq_path(m, &file->f_path, esc);
+ }
+ EXPORT_SYMBOL(seq_file_path);
  /*
   * Same as seq_path, but relative to supplied root.
   */
@@@ -541,7 -552,6 +555,7 @@@ int seq_dentry(struct seq_file *m, stru
  
        return res;
  }
 +EXPORT_SYMBOL(seq_dentry);
  
  static void *single_start(struct seq_file *p, loff_t *pos)
  {
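
[ Note: a hedged usage sketch for the new seq_file_path() helper; the
  m->private stash and the escape set are illustrative, not from this
  diff: ]

	static int myfs_show(struct seq_file *m, void *v)
	{
		struct file *file = m->private;	/* hypothetical stash */

		seq_file_path(m, file, " \t\n\\");	/* escape whitespace */
		seq_putc(m, '\n');
		return 0;
	}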
diff --combined fs/tracefs/inode.c
index a43df11a163f17fb743a23aaca2c84a296ec50b0,6e8a1400d6629c1973b4fb9f66256949527ae0d2..cbc8d5d2755a691a560c46f7105e85ca6220d835
@@@ -496,16 -496,11 +496,11 @@@ struct dentry *tracefs_create_instance_
        return dentry;
  }
  
- static inline int tracefs_positive(struct dentry *dentry)
- {
-       return dentry->d_inode && !d_unhashed(dentry);
- }
  static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
  {
        int ret = 0;
  
-       if (tracefs_positive(dentry)) {
+       if (simple_positive(dentry)) {
                if (dentry->d_inode) {
                        dget(dentry);
                        switch (dentry->d_inode->i_mode & S_IFMT) {
@@@ -582,7 -577,7 +577,7 @@@ void tracefs_remove_recursive(struct de
         */
        spin_lock(&parent->d_lock);
        list_for_each_entry(child, &parent->d_subdirs, d_child) {
-               if (!tracefs_positive(child))
+               if (!simple_positive(child))
                        continue;
  
                /* perhaps simple_empty(child) makes more sense */
                 * from d_subdirs. When releasing the parent->d_lock we can
                 * no longer trust that the next pointer is valid.
                 * Restart the loop. We'll skip this one with the
-                * tracefs_positive() check.
+                * simple_positive() check.
                 */
                goto loop;
        }
@@@ -631,12 -626,14 +626,12 @@@ bool tracefs_initialized(void
        return tracefs_registered;
  }
  
 -static struct kobject *trace_kobj;
 -
  static int __init tracefs_init(void)
  {
        int retval;
  
 -      trace_kobj = kobject_create_and_add("tracing", kernel_kobj);
 -      if (!trace_kobj)
 +      retval = sysfs_create_mount_point(kernel_kobj, "tracing");
 +      if (retval)
                return -EINVAL;
  
        retval = register_filesystem(&trace_fs_type);
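
[ Note: unlike the equivalent debugfs_init() hunk earlier in this diff,
  tracefs_init() still returns -EINVAL rather than the
  sysfs_create_mount_point() result; propagating it would look like: ]

	retval = sysfs_create_mount_point(kernel_kobj, "tracing");
	if (retval)
		return retval;	/* pass the real errno up, not -EINVAL */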
diff --combined fs/ufs/super.c
index 098508a93c7b302fe8e6ab65ec7cd753d2515634,dc33f94163404f04155cddb7a76eae84c96df42b..250579a80d90bd379caee1b7aeaf252dda97c34d
@@@ -80,7 -80,6 +80,7 @@@
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/blkdev.h>
 +#include <linux/backing-dev.h>
  #include <linux/init.h>
  #include <linux/parser.h>
  #include <linux/buffer_head.h>
@@@ -695,6 -694,7 +695,7 @@@ static int ufs_sync_fs(struct super_blo
        unsigned flags;
  
        lock_ufs(sb);
+       mutex_lock(&UFS_SB(sb)->s_lock);
  
        UFSD("ENTER\n");
  
        ufs_put_cstotal(sb);
  
        UFSD("EXIT\n");
+       mutex_unlock(&UFS_SB(sb)->s_lock);
        unlock_ufs(sb);
  
        return 0;
@@@ -800,6 -801,7 +802,7 @@@ static int ufs_fill_super(struct super_
        UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
        
        mutex_init(&sbi->mutex);
+       mutex_init(&sbi->s_lock);
        spin_lock_init(&sbi->work_lock);
        INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
        /*
@@@ -1278,6 -1280,7 +1281,7 @@@ static int ufs_remount (struct super_bl
  
        sync_filesystem(sb);
        lock_ufs(sb);
+       mutex_lock(&UFS_SB(sb)->s_lock);
        uspi = UFS_SB(sb)->s_uspi;
        flags = UFS_SB(sb)->s_flags;
        usb1 = ubh_get_usb_first(uspi);
        new_mount_opt = 0;
        ufs_set_opt (new_mount_opt, ONERROR_LOCK);
        if (!ufs_parse_options (data, &new_mount_opt)) {
+               mutex_unlock(&UFS_SB(sb)->s_lock);
                unlock_ufs(sb);
                return -EINVAL;
        }
                new_mount_opt |= ufstype;
        } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
                pr_err("ufstype can't be changed during remount\n");
+               mutex_unlock(&UFS_SB(sb)->s_lock);
                unlock_ufs(sb);
                return -EINVAL;
        }
  
        if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
                UFS_SB(sb)->s_mount_opt = new_mount_opt;
+               mutex_unlock(&UFS_SB(sb)->s_lock);
                unlock_ufs(sb);
                return 0;
        }
         */
  #ifndef CONFIG_UFS_FS_WRITE
                pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
+               mutex_unlock(&UFS_SB(sb)->s_lock);
                unlock_ufs(sb);
                return -EINVAL;
  #else
                    ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
                    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
                        pr_err("this ufstype is read-only supported\n");
+                       mutex_unlock(&UFS_SB(sb)->s_lock);
                        unlock_ufs(sb);
                        return -EINVAL;
                }
                if (!ufs_read_cylinder_structures(sb)) {
                        pr_err("failed during remounting\n");
+                       mutex_unlock(&UFS_SB(sb)->s_lock);
                        unlock_ufs(sb);
                        return -EPERM;
                }
  #endif
        }
        UFS_SB(sb)->s_mount_opt = new_mount_opt;
+       mutex_unlock(&UFS_SB(sb)->s_lock);
        unlock_ufs(sb);
        return 0;
  }
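
[ Note: every early return in ufs_remount() now needs the unlock pair,
  released in LIFO order (s_lock first, then lock_ufs).  A hedged
  sketch of the same structure with a single unlock site;
  myfs_remount_sketch() is illustrative only: ]

	static int myfs_remount_sketch(struct super_block *sb, char *data)
	{
		unsigned new_mount_opt = 0;
		int err = 0;

		lock_ufs(sb);
		mutex_lock(&UFS_SB(sb)->s_lock);
		if (!ufs_parse_options(data, &new_mount_opt)) {
			err = -EINVAL;
			goto out;
		}
		/* ... remaining remount work jumps to out on failure ... */
	out:
		mutex_unlock(&UFS_SB(sb)->s_lock);
		unlock_ufs(sb);
		return err;
	}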
diff --combined fs/xfs/xfs_file.c
index 874507de3485b818e94bfcd0348f79e9747fdce6,71c2c712e609afca2e9f5c93049ca2e641221108..f0e8249722d40a0dcaf9f31bc25effaba142b0a9
@@@ -41,7 -41,6 +41,7 @@@
  #include <linux/dcache.h>
  #include <linux/falloc.h>
  #include <linux/pagevec.h>
 +#include <linux/backing-dev.h>
  
  static const struct vm_operations_struct xfs_file_vm_ops;
  
@@@ -80,15 -79,14 +80,15 @@@ xfs_rw_ilock_demote
  }
  
  /*
 - *    xfs_iozero
 + * xfs_iozero clears the specified range supplied via the page cache (except in
 + * the DAX case). Writes through the page cache will allocate blocks over holes,
 + * though the callers usually map the holes first and avoid them. If a block is
 + * not completely zeroed, then it will be read from disk before being partially
 + * zeroed.
   *
 - *    xfs_iozero clears the specified range of buffer supplied,
 - *    and marks all the affected blocks as valid and modified.  If
 - *    an affected block is not allocated, it will be allocated.  If
 - *    an affected block is not completely overwritten, and is not
 - *    valid before the operation, it will be read from disk before
 - *    being partially zeroed.
 + * In the DAX case, we can just directly write to the underlying pages. This
 + * will not allocate blocks, but will avoid holes and unwritten extents and so
 + * not do unnecessary work.
   */
  int
  xfs_iozero(
@@@ -98,8 -96,7 +98,8 @@@
  {
        struct page             *page;
        struct address_space    *mapping;
 -      int                     status;
 +      int                     status = 0;
 +
  
        mapping = VFS_I(ip)->i_mapping;
        do {
                if (bytes > count)
                        bytes = count;
  
 -              status = pagecache_write_begin(NULL, mapping, pos, bytes,
 -                                      AOP_FLAG_UNINTERRUPTIBLE,
 -                                      &page, &fsdata);
 -              if (status)
 -                      break;
 +              if (IS_DAX(VFS_I(ip))) {
 +                      status = dax_zero_page_range(VFS_I(ip), pos, bytes,
 +                                                   xfs_get_blocks_direct);
 +                      if (status)
 +                              break;
 +              } else {
 +                      status = pagecache_write_begin(NULL, mapping, pos, bytes,
 +                                              AOP_FLAG_UNINTERRUPTIBLE,
 +                                              &page, &fsdata);
 +                      if (status)
 +                              break;
  
 -              zero_user(page, offset, bytes);
 +                      zero_user(page, offset, bytes);
  
 -              status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
 -                                      page, fsdata);
 -              WARN_ON(status <= 0); /* can't return less than zero! */
 +                      status = pagecache_write_end(NULL, mapping, pos, bytes,
 +                                              bytes, page, fsdata);
 +                      WARN_ON(status <= 0); /* can't return less than zero! */
 +                      status = 0;
 +              }
                pos += bytes;
                count -= bytes;
 -              status = 0;
        } while (count);
  
 -      return (-status);
 +      return status;
  }
  
  int
@@@ -148,7 -138,7 +148,7 @@@ xfs_update_prealloc_flags
        tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
        error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
        if (error) {
 -              xfs_trans_cancel(tp, 0);
 +              xfs_trans_cancel(tp);
                return error;
        }
  
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        if (flags & XFS_PREALLOC_SYNC)
                xfs_trans_set_sync(tp);
 -      return xfs_trans_commit(tp, 0);
 +      return xfs_trans_commit(tp);
  }
  
  /*
@@@ -294,7 -284,7 +294,7 @@@ xfs_file_read_iter
        if (file->f_mode & FMODE_NOCMTIME)
                ioflags |= XFS_IO_INVIS;
  
 -      if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
 +      if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp;
@@@ -388,11 -378,7 +388,11 @@@ xfs_file_splice_read
  
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
  
 -      ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 +      /* for dax, we need to avoid the page cache */
 +      if (IS_DAX(VFS_I(ip)))
 +              ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
 +      else
 +              ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
  
@@@ -577,6 -563,13 +577,13 @@@ restart
        if (error)
                return error;
  
+       /* For changing security info in file_remove_privs() we need i_mutex */
+       if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+               xfs_rw_iunlock(ip, *iolock);
+               *iolock = XFS_IOLOCK_EXCL;
+               xfs_rw_ilock(ip, *iolock);
+               goto restart;
+       }
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
         * setgid bits if the process is not being run by root.  This keeps
         * people from modifying setuid and setgid binaries.
         */
-       return file_remove_suid(file);
+       if (!IS_NOSEC(inode))
+               return file_remove_privs(file);
+       return 0;
  }
  
  /*
@@@ -686,7 -681,7 +695,7 @@@ xfs_file_dio_aio_write
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
  
        /* DIO must be aligned to device logical sector size */
 -      if ((pos | count) & target->bt_logical_sectormask)
 +      if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
                return -EINVAL;
  
        /* "unaligned" here means not aligned to a filesystem block */
  out:
        xfs_rw_iunlock(ip, iolock);
  
 -      /* No fallback to buffered IO on errors for XFS. */
 -      ASSERT(ret < 0 || ret == count);
 +      /*
 +       * No fallback to buffered IO on errors for XFS. DAX can result in
 +       * partial writes, but direct IO will either complete fully or fail.
 +       */
 +      ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
        return ret;
  }
  
@@@ -859,7 -851,7 +868,7 @@@ xfs_file_write_iter
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
  
 -      if (unlikely(iocb->ki_flags & IOCB_DIRECT))
 +      if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@@ -1080,6 -1072,17 +1089,6 @@@ xfs_file_readdir
        return xfs_readdir(ip, ctx, bufsize);
  }
  
 -STATIC int
 -xfs_file_mmap(
 -      struct file     *filp,
 -      struct vm_area_struct *vma)
 -{
 -      vma->vm_ops = &xfs_file_vm_ops;
 -
 -      file_accessed(filp);
 -      return 0;
 -}
 -
  /*
   * This type is designed to indicate the type of offset we would like
   * to search from page cache for xfs_seek_hole_data().
@@@ -1460,83 -1463,48 +1469,83 @@@ xfs_file_llseek
   * ordering of:
   *
   * mmap_sem (MM)
 - *   i_mmap_lock (XFS - truncate serialisation)
 - *     page_lock (MM)
 - *       i_lock (XFS - extent map serialisation)
 + *   sb_start_pagefault(vfs, freeze)
 + *     i_mmap_lock (XFS - truncate serialisation)
 + *       page_lock (MM)
 + *         i_lock (XFS - extent map serialisation)
 + */
 +
 +/*
 + * mmap()d file has taken write protection fault and is being made writable. We
 + * can set the page state up correctly for a writable page, which means we can
 + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
 + * mapping.
   */
  STATIC int
 -xfs_filemap_fault(
 +xfs_filemap_page_mkwrite(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
  {
 -      struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
 -      int                     error;
 +      struct inode            *inode = file_inode(vma->vm_file);
 +      int                     ret;
  
 -      trace_xfs_filemap_fault(ip);
 +      trace_xfs_filemap_page_mkwrite(XFS_I(inode));
  
 -      xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
 -      error = filemap_fault(vma, vmf);
 -      xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 +      sb_start_pagefault(inode->i_sb);
 +      file_update_time(vma->vm_file);
 +      xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
 -      return error;
 +      if (IS_DAX(inode)) {
 +              ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
 +                                  xfs_end_io_dax_write);
 +      } else {
 +              ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
 +              ret = block_page_mkwrite_return(ret);
 +      }
 +
 +      xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 +      sb_end_pagefault(inode->i_sb);
 +
 +      return ret;
  }
  
 -/*
 - * mmap()d file has taken write protection fault and is being made writable. We
 - * can set the page state up correctly for a writable page, which means we can
 - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
 - * mapping.
 - */
  STATIC int
 -xfs_filemap_page_mkwrite(
 +xfs_filemap_fault(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
  {
 -      struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
 -      int                     error;
 +      struct xfs_inode        *ip = XFS_I(file_inode(vma->vm_file));
 +      int                     ret;
 +
 +      trace_xfs_filemap_fault(ip);
  
 -      trace_xfs_filemap_page_mkwrite(ip);
 +      /* DAX can shortcut the normal fault path on write faults! */
 +      if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
 +              return xfs_filemap_page_mkwrite(vma, vmf);
  
        xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
 -      error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
 +      ret = filemap_fault(vma, vmf);
        xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
  
 -      return error;
 +      return ret;
 +}
 +
 +static const struct vm_operations_struct xfs_file_vm_ops = {
 +      .fault          = xfs_filemap_fault,
 +      .map_pages      = filemap_map_pages,
 +      .page_mkwrite   = xfs_filemap_page_mkwrite,
 +};
 +
 +STATIC int
 +xfs_file_mmap(
 +      struct file     *filp,
 +      struct vm_area_struct *vma)
 +{
 +      file_accessed(filp);
 +      vma->vm_ops = &xfs_file_vm_ops;
 +      if (IS_DAX(file_inode(filp)))
 +              vma->vm_flags |= VM_MIXEDMAP;
 +      return 0;
  }
  
  const struct file_operations xfs_file_operations = {
@@@ -1567,3 -1535,9 +1576,3 @@@ const struct file_operations xfs_dir_fi
  #endif
        .fsync          = xfs_dir_fsync,
  };
 -
 -static const struct vm_operations_struct xfs_file_vm_ops = {
 -      .fault          = xfs_filemap_fault,
 -      .map_pages      = filemap_map_pages,
 -      .page_mkwrite   = xfs_filemap_page_mkwrite,
 -};
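
The net effect of the xfs_file.c changes: DAX write faults are routed straight to the
->page_mkwrite path, every mkwrite is bracketed by freeze protection per the lock-ordering
comment above, and DAX mappings are flagged VM_MIXEDMAP at mmap() time. A condensed
sketch of that bracketing for a generic buffered filesystem -- demo_get_block is a
hypothetical get_block_t, and the real XFS code additionally takes XFS_MMAPLOCK_SHARED:

static int demo_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create);

static int demo_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct inode    *inode = file_inode(vma->vm_file);
        int             ret;

        sb_start_pagefault(inode->i_sb);        /* freeze protection first */
        file_update_time(vma->vm_file);
        /* fs-private truncate serialisation would be taken here */
        ret = __block_page_mkwrite(vma, vmf, demo_get_block);
        ret = block_page_mkwrite_return(ret);   /* errno -> VM_FAULT_* */
        sb_end_pagefault(inode->i_sb);
        return ret;
}
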
diff --combined include/linux/fs.h
index 8a81fcbb0074d9d89dc7aff09367d88c051f69d4,1e658b11c2656bcdcabc939fa0b120bf3e6e0e29..a0653e560c2679a2eea870035a55cd3282e47894
@@@ -35,7 -35,6 +35,7 @@@
  #include <uapi/linux/fs.h>
  
  struct backing_dev_info;
 +struct bdi_writeback;
  struct export_operations;
  struct hd_geometry;
  struct iovec;
@@@ -70,7 -69,6 +70,7 @@@ typedef int (get_block_t)(struct inode 
                        struct buffer_head *bh_result, int create);
  typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);
 +typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
  
  #define MAY_EXEC              0x00000001
  #define MAY_WRITE             0x00000002
@@@ -636,14 -634,6 +636,14 @@@ struct inode 
  
        struct hlist_node       i_hash;
        struct list_head        i_wb_list;      /* backing dev IO list */
 +#ifdef CONFIG_CGROUP_WRITEBACK
 +      struct bdi_writeback    *i_wb;          /* the associated cgroup wb */
 +
 +      /* foreign inode detection, see wbc_detach_inode() */
 +      int                     i_wb_frn_winner;
 +      u16                     i_wb_frn_avg_time;
 +      u16                     i_wb_frn_history;
 +#endif
        struct list_head        i_lru;          /* inode LRU list */
        struct list_head        i_sb_list;
        union {
@@@ -1242,8 -1232,6 +1242,8 @@@ struct mm_struct
  #define UMOUNT_NOFOLLOW       0x00000008      /* Don't follow symlink on umount */
  #define UMOUNT_UNUSED 0x80000000      /* Flag guaranteed to be unused */
  
 +/* sb->s_iflags */
 +#define SB_I_CGROUPWB 0x00000001      /* cgroup-aware writeback enabled */
  
  /* Possible states of 'frozen' field */
  enum {
@@@ -1282,7 -1270,6 +1282,7 @@@ struct super_block 
        const struct quotactl_ops       *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long           s_flags;
 +      unsigned long           s_iflags;       /* internal SB_I_* flags */
        unsigned long           s_magic;
        struct dentry           *s_root;
        struct rw_semaphore     s_umount;
@@@ -1654,7 -1641,6 +1654,6 @@@ struct inode_operations 
        int (*set_acl)(struct inode *, struct posix_acl *, int);
  
        /* WARNING: probably going away soon, do not use! */
-       int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
  } ____cacheline_aligned;
  
  ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
@@@ -1819,11 -1805,6 +1818,11 @@@ struct super_operations 
   *
   * I_DIO_WAKEUP               Never set.  Only used as a key for wait_on_bit().
   *
 + * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
 + *                    synchronize competing switching instances and to tell
 + *                    wb stat updates to grab mapping->tree_lock.  See
 + *                    inode_switch_wb_work_fn() for details.
 + *
   * Q: What is the difference between I_WILL_FREE and I_FREEING?
   */
  #define I_DIRTY_SYNC          (1 << 0)
  #define I_DIRTY_TIME          (1 << 11)
  #define __I_DIRTY_TIME_EXPIRED        12
  #define I_DIRTY_TIME_EXPIRED  (1 << __I_DIRTY_TIME_EXPIRED)
 +#define I_WB_SWITCH           (1 << 13)
  
  #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
  #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
@@@ -1917,7 -1897,6 +1916,7 @@@ struct file_system_type 
  #define FS_HAS_SUBTYPE                4
  #define FS_USERNS_MOUNT               8       /* Can be mounted by userns root */
  #define FS_USERNS_DEV_MOUNT   16 /* A userns mount does not imply MNT_NODEV */
 +#define FS_USERNS_VISIBLE     32      /* FS must already be visible */
  #define FS_RENAME_DOES_D_MOVE 32768   /* FS will handle d_move() during rename() internally. */
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
@@@ -2005,6 -1984,7 +2004,6 @@@ extern int vfs_ustat(dev_t, struct ksta
  extern int freeze_super(struct super_block *super);
  extern int thaw_super(struct super_block *super);
  extern bool our_mnt(struct vfsmount *mnt);
 -extern bool fs_fully_visible(struct file_system_type *);
  
  extern int current_umask(void);
  
@@@ -2213,7 -2193,6 +2212,6 @@@ extern struct file *file_open_name(stru
  extern struct file *filp_open(const char *, int, umode_t);
  extern struct file *file_open_root(struct dentry *, struct vfsmount *,
                                   const char *, int);
- extern int vfs_open(const struct path *, struct file *, const struct cred *);
  extern struct file * dentry_open(const struct path *, int, const struct cred *);
  extern int filp_close(struct file *, fl_owner_t id);
  
@@@ -2260,13 -2239,7 +2258,13 @@@ extern struct super_block *freeze_bdev(
  extern void emergency_thaw_all(void);
  extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
  extern int fsync_bdev(struct block_device *);
 -extern int sb_is_blkdev_sb(struct super_block *sb);
 +
 +extern struct super_block *blockdev_superblock;
 +
 +static inline bool sb_is_blkdev_sb(struct super_block *sb)
 +{
 +      return sb == blockdev_superblock;
 +}
  #else
  static inline void bd_forget(struct inode *inode) {}
  static inline int sync_blockdev(struct block_device *bdev) { return 0; }
@@@ -2305,9 -2278,6 +2303,9 @@@ extern struct block_device *blkdev_get_
  extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
                                              void *holder);
  extern void blkdev_put(struct block_device *bdev, fmode_t mode);
 +extern int __blkdev_reread_part(struct block_device *bdev);
 +extern int blkdev_reread_part(struct block_device *bdev);
 +
  #ifdef CONFIG_SYSFS
  extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
  extern void bd_unlink_disk_holder(struct block_device *bdev,
@@@ -2530,6 -2500,8 +2528,8 @@@ extern struct file * open_exec(const ch
  extern int is_subdir(struct dentry *, struct dentry *);
  extern int path_is_under(struct path *, struct path *);
  
+ extern char *file_path(struct file *, char *, int);
  #include <linux/err.h>
  
  /* needed for stackable file system support */
@@@ -2581,7 -2553,12 +2581,12 @@@ extern struct inode *new_inode_pseudo(s
  extern struct inode *new_inode(struct super_block *sb);
  extern void free_inode_nonrcu(struct inode *inode);
  extern int should_remove_suid(struct dentry *);
- extern int file_remove_suid(struct file *);
+ extern int file_remove_privs(struct file *);
+ extern int dentry_needs_remove_privs(struct dentry *dentry);
+ static inline int file_needs_remove_privs(struct file *file)
+ {
+       return dentry_needs_remove_privs(file->f_path.dentry);
+ }
  
  extern void __insert_inode_hash(struct inode *, unsigned long hashval);
  static inline void insert_inode_hash(struct inode *inode)
@@@ -2656,13 -2633,9 +2661,13 @@@ ssize_t dax_do_io(struct kiocb *, struc
  int dax_clear_blocks(struct inode *, sector_t block, long size);
  int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
  int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 +              dax_iodone_t);
 +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 +              dax_iodone_t);
  int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
 -#define dax_mkwrite(vma, vmf, gb)     dax_fault(vma, vmf, gb)
 +#define dax_mkwrite(vma, vmf, gb, iod)                dax_fault(vma, vmf, gb, iod)
 +#define __dax_mkwrite(vma, vmf, gb, iod)      __dax_fault(vma, vmf, gb, iod)
  
  #ifdef CONFIG_BLOCK
  typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
@@@ -2816,8 -2789,6 +2821,8 @@@ extern struct dentry *simple_lookup(str
  extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
  extern const struct file_operations simple_dir_operations;
  extern const struct inode_operations simple_dir_inode_operations;
 +extern void make_empty_dir_inode(struct inode *inode);
 +extern bool is_empty_dir_inode(struct inode *inode);
  struct tree_descr { char *name; const struct file_operations *ops; int mode; };
  struct dentry *d_alloc_name(struct dentry *, const char *);
  extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *);
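
Among the fs.h changes, file_remove_suid() becomes file_remove_privs() and gains a query
companion, dentry_needs_remove_privs(), so callers can test cheaply before taking an
exclusive lock -- exactly what the xfs_write_ioend restart logic above relies on. A
hedged sketch of a write-path check; demo_write_checks is hypothetical:

static int demo_write_checks(struct file *file)
{
        /* fast path: nothing to strip, no exclusive locking needed */
        if (!file_needs_remove_privs(file))
                return 0;

        /* strips setuid/setgid bits and kills security privileges */
        return file_remove_privs(file);
}
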
diff --combined include/linux/pagemap.h
index fb0814ca65c7328b0bb2bcf9be958dbff2a05c04,808942d3106260231b5d4d870c4be9eab52e5429..a6c78e00ea9684a784938ed39229c2018ffd8e75
@@@ -651,8 -651,7 +651,8 @@@ int add_to_page_cache_locked(struct pag
  int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);
  extern void delete_from_page_cache(struct page *page);
 -extern void __delete_from_page_cache(struct page *page, void *shadow);
 +extern void __delete_from_page_cache(struct page *page, void *shadow,
 +                                   struct mem_cgroup *memcg);
  int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
  
  /*
@@@ -671,4 -670,10 +671,10 @@@ static inline int add_to_page_cache(str
        return error;
  }
  
+ static inline unsigned long dir_pages(struct inode *inode)
+ {
+       return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >>
+                              PAGE_CACHE_SHIFT;
+ }
  #endif /* _LINUX_PAGEMAP_H */
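
dir_pages() moves into pagemap.h so the filesystems that each carried a private copy can
share one definition. An illustrative caller, assuming a directory inode dir whose
entries live in the page cache:

        pgoff_t n, npages = dir_pages(dir);

        for (n = 0; n < npages; n++) {
                struct page *page = read_mapping_page(dir->i_mapping, n, NULL);

                if (IS_ERR(page))
                        break;          /* or propagate PTR_ERR(page) */
                /* ...walk the directory entries stored in this page... */
                page_cache_release(page);
        }
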
diff --combined kernel/events/core.c
index d1f37ddd19608d26a32dd130d491cf7c98c41d9c,5c964e84548355b1958814f641b1139655a2d549..e965cfae420725645349c4facd145828e4e59d13
@@@ -36,7 -36,7 +36,7 @@@
  #include <linux/kernel_stat.h>
  #include <linux/cgroup.h>
  #include <linux/perf_event.h>
 -#include <linux/ftrace_event.h>
 +#include <linux/trace_events.h>
  #include <linux/hw_breakpoint.h>
  #include <linux/mm_types.h>
  #include <linux/module.h>
  
  static struct workqueue_struct *perf_wq;
  
 +typedef int (*remote_function_f)(void *);
 +
  struct remote_function_call {
        struct task_struct      *p;
 -      int                     (*func)(void *info);
 +      remote_function_f       func;
        void                    *info;
        int                     ret;
  };
@@@ -88,7 -86,7 +88,7 @@@ static void remote_function(void *data
   *        -EAGAIN - when the process moved away
   */
  static int
 -task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
 +task_function_call(struct task_struct *p, remote_function_f func, void *info)
  {
        struct remote_function_call data = {
                .p      = p,
   *
   * returns: @func return value or -ENXIO when the cpu is offline
   */
 -static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 +static int cpu_function_call(int cpu, remote_function_f func, void *info)
  {
        struct remote_function_call data = {
                .p      = NULL,
@@@ -749,31 -747,62 +749,31 @@@ perf_cgroup_mark_enabled(struct perf_ev
  /*
   * function must be called with interrupts disabled
   */
 -static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
 +static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
  {
        struct perf_cpu_context *cpuctx;
 -      enum hrtimer_restart ret = HRTIMER_NORESTART;
        int rotations = 0;
  
        WARN_ON(!irqs_disabled());
  
        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
 -
        rotations = perf_rotate_context(cpuctx);
  
 -      /*
 -       * arm timer if needed
 -       */
 -      if (rotations) {
 +      raw_spin_lock(&cpuctx->hrtimer_lock);
 +      if (rotations)
                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
 -              ret = HRTIMER_RESTART;
 -      }
 -
 -      return ret;
 -}
 -
 -/* CPU is going down */
 -void perf_cpu_hrtimer_cancel(int cpu)
 -{
 -      struct perf_cpu_context *cpuctx;
 -      struct pmu *pmu;
 -      unsigned long flags;
 -
 -      if (WARN_ON(cpu != smp_processor_id()))
 -              return;
 -
 -      local_irq_save(flags);
 -
 -      rcu_read_lock();
 -
 -      list_for_each_entry_rcu(pmu, &pmus, entry) {
 -              cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 -
 -              if (pmu->task_ctx_nr == perf_sw_context)
 -                      continue;
 -
 -              hrtimer_cancel(&cpuctx->hrtimer);
 -      }
 -
 -      rcu_read_unlock();
 +      else
 +              cpuctx->hrtimer_active = 0;
 +      raw_spin_unlock(&cpuctx->hrtimer_lock);
  
 -      local_irq_restore(flags);
 +      return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
  }
  
 -static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 +static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
  {
 -      struct hrtimer *hr = &cpuctx->hrtimer;
 +      struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
 -      int timer;
 +      u64 interval;
  
        /* no multiplexing needed for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
         * check default is sane, if not set then force to
         * default interval (1/tick)
         */
 -      timer = pmu->hrtimer_interval_ms;
 -      if (timer < 1)
 -              timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 +      interval = pmu->hrtimer_interval_ms;
 +      if (interval < 1)
 +              interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
  
 -      cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 +      cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
  
 -      hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 -      hr->function = perf_cpu_hrtimer_handler;
 +      raw_spin_lock_init(&cpuctx->hrtimer_lock);
 +      hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 +      timer->function = perf_mux_hrtimer_handler;
  }
  
 -static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
 +static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
  {
 -      struct hrtimer *hr = &cpuctx->hrtimer;
 +      struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
 +      unsigned long flags;
  
        /* not for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
 -              return;
 +              return 0;
  
 -      if (hrtimer_active(hr))
 -              return;
 +      raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
 +      if (!cpuctx->hrtimer_active) {
 +              cpuctx->hrtimer_active = 1;
 +              hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
 +              hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 +      }
 +      raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
  
 -      if (!hrtimer_callback_running(hr))
 -              __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
 -                                       0, HRTIMER_MODE_REL_PINNED, 0);
 +      return 0;
  }
  
  void perf_pmu_disable(struct pmu *pmu)
@@@ -889,30 -913,10 +889,30 @@@ static void put_ctx(struct perf_event_c
   * Those places that change perf_event::ctx will hold both
   * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
   *
 - * Lock ordering is by mutex address. There is one other site where
 - * perf_event_context::mutex nests and that is put_event(). But remember that
 - * that is a parent<->child context relation, and migration does not affect
 - * children, therefore these two orderings should not interact.
 + * Lock ordering is by mutex address. There are two other sites where
 + * perf_event_context::mutex nests and those are:
 + *
 + *  - perf_event_exit_task_context()  [ child , 0 ]
 + *      __perf_event_exit_task()
 + *        sync_child_event()
 + *          put_event()                       [ parent, 1 ]
 + *
 + *  - perf_event_init_context()               [ parent, 0 ]
 + *      inherit_task_group()
 + *        inherit_group()
 + *          inherit_event()
 + *            perf_event_alloc()
 + *              perf_init_event()
 + *                perf_try_init_event()       [ child , 1 ]
 + *
 + * While it appears there is an obvious deadlock here -- the parent and child
 + * nesting levels are inverted between the two -- this is in fact safe because
 + * life-time rules separate them: an exiting task cannot fork, and a
 + * spawning task cannot (yet) exit.
 + *
 + * But remember that these are parent<->child context relations, and
 + * migration does not affect children, therefore these two orderings should not
 + * interact.
   *
   * The change in perf_event::ctx does not affect children (as claimed above)
   * because the sys_perf_event_open() case will install a new event and break
@@@ -1502,17 -1506,11 +1502,17 @@@ static int __init perf_workqueue_init(v
  
  core_initcall(perf_workqueue_init);
  
 +static inline int pmu_filter_match(struct perf_event *event)
 +{
 +      struct pmu *pmu = event->pmu;
 +      return pmu->filter_match ? pmu->filter_match(event) : 1;
 +}
 +
  static inline int
  event_filter_match(struct perf_event *event)
  {
        return (event->cpu == -1 || event->cpu == smp_processor_id())
 -          && perf_cgroup_match(event);
 +          && perf_cgroup_match(event) && pmu_filter_match(event);
  }
  
  static void
@@@ -1917,7 -1915,7 +1917,7 @@@ group_sched_in(struct perf_event *group
  
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
 -              perf_cpu_hrtimer_restart(cpuctx);
 +              perf_mux_hrtimer_restart(cpuctx);
                return -EAGAIN;
        }
  
@@@ -1964,7 -1962,7 +1964,7 @@@ group_error
  
        pmu->cancel_txn(pmu);
  
 -      perf_cpu_hrtimer_restart(cpuctx);
 +      perf_mux_hrtimer_restart(cpuctx);
  
        return -EAGAIN;
  }
@@@ -2237,7 -2235,7 +2237,7 @@@ static int __perf_event_enable(void *in
                 */
                if (leader != event) {
                        group_sched_out(leader, cpuctx, ctx);
 -                      perf_cpu_hrtimer_restart(cpuctx);
 +                      perf_mux_hrtimer_restart(cpuctx);
                }
                if (leader->attr.pinned) {
                        update_group_times(leader);
@@@ -3424,6 -3422,7 +3424,6 @@@ static void free_event_rcu(struct rcu_h
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
 -      perf_event_free_bpf_prog(event);
        kfree(event);
  }
  
@@@ -3554,8 -3553,6 +3554,8 @@@ static void __free_event(struct perf_ev
                        put_callchain_buffers();
        }
  
 +      perf_event_free_bpf_prog(event);
 +
        if (event->destroy)
                event->destroy(event);
  
@@@ -3660,6 -3657,9 +3660,6 @@@ static void perf_remove_from_owner(stru
        }
  }
  
 -/*
 - * Called when the last reference to the file is gone.
 - */
  static void put_event(struct perf_event *event)
  {
        struct perf_event_context *ctx;
@@@ -3697,9 -3697,6 +3697,9 @@@ int perf_event_release_kernel(struct pe
  }
  EXPORT_SYMBOL_GPL(perf_event_release_kernel);
  
 +/*
 + * Called when the last reference to the file is gone.
 + */
  static int perf_release(struct inode *inode, struct file *file)
  {
        put_event(file->private_data);
@@@ -4313,20 -4310,20 +4313,20 @@@ static void ring_buffer_attach(struct p
                WARN_ON_ONCE(event->rcu_pending);
  
                old_rb = event->rb;
 -              event->rcu_batches = get_state_synchronize_rcu();
 -              event->rcu_pending = 1;
 -
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);
 -      }
  
 -      if (event->rcu_pending && rb) {
 -              cond_synchronize_rcu(event->rcu_batches);
 -              event->rcu_pending = 0;
 +              event->rcu_batches = get_state_synchronize_rcu();
 +              event->rcu_pending = 1;
        }
  
        if (rb) {
 +              if (event->rcu_pending) {
 +                      cond_synchronize_rcu(event->rcu_batches);
 +                      event->rcu_pending = 0;
 +              }
 +
                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
@@@ -5363,9 -5360,9 +5363,9 @@@ void perf_prepare_sample(struct perf_ev
        }
  }
  
 -static void perf_event_output(struct perf_event *event,
 -                              struct perf_sample_data *data,
 -                              struct pt_regs *regs)
 +void perf_event_output(struct perf_event *event,
 +                      struct perf_sample_data *data,
 +                      struct pt_regs *regs)
  {
        struct perf_output_handle handle;
        struct perf_event_header header;
@@@ -5794,7 -5791,7 +5794,7 @@@ static void perf_event_mmap_event(struc
                 * need to add enough zero bytes after the string to handle
                 * the 64bit alignment we do later.
                 */
-               name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+               name = file_path(file, buf, PATH_MAX - sizeof(u64));
                if (IS_ERR(name)) {
                        name = "//toolong";
                        goto cpy_name;
@@@ -5956,39 -5953,6 +5956,39 @@@ void perf_event_aux_event(struct perf_e
        perf_output_end(&handle);
  }
  
 +/*
 + * Lost/dropped samples logging
 + */
 +void perf_log_lost_samples(struct perf_event *event, u64 lost)
 +{
 +      struct perf_output_handle handle;
 +      struct perf_sample_data sample;
 +      int ret;
 +
 +      struct {
 +              struct perf_event_header        header;
 +              u64                             lost;
 +      } lost_samples_event = {
 +              .header = {
 +                      .type = PERF_RECORD_LOST_SAMPLES,
 +                      .misc = 0,
 +                      .size = sizeof(lost_samples_event),
 +              },
 +              .lost           = lost,
 +      };
 +
 +      perf_event_header__init_id(&lost_samples_event.header, &sample, event);
 +
 +      ret = perf_output_begin(&handle, event,
 +                              lost_samples_event.header.size);
 +      if (ret)
 +              return;
 +
 +      perf_output_put(&handle, lost_samples_event);
 +      perf_event__output_id_sample(event, &handle, &sample);
 +      perf_output_end(&handle);
 +}
 +
  /*
   * IRQ throttle logging
   */
@@@ -6879,8 -6843,9 +6879,8 @@@ static void perf_swevent_start_hrtimer(
        } else {
                period = max_t(u64, 10000, hwc->sample_period);
        }
 -      __hrtimer_start_range_ns(&hwc->hrtimer,
 -                              ns_to_ktime(period), 0,
 -                              HRTIMER_MODE_REL_PINNED, 0);
 +      hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
 +                    HRTIMER_MODE_REL_PINNED);
  }
  
  static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@@ -7181,8 -7146,6 +7181,8 @@@ perf_event_mux_interval_ms_show(struct 
        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
  }
  
 +static DEFINE_MUTEX(mux_interval_mutex);
 +
  static ssize_t
  perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
        if (timer == pmu->hrtimer_interval_ms)
                return count;
  
 +      mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = timer;
  
        /* update all cpuctx for this PMU */
 -      for_each_possible_cpu(cpu) {
 +      get_online_cpus();
 +      for_each_online_cpu(cpu) {
                struct perf_cpu_context *cpuctx;
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
  
 -              if (hrtimer_active(&cpuctx->hrtimer))
 -                      hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
 +              cpu_function_call(cpu,
 +                      (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
        }
 +      put_online_cpus();
 +      mutex_unlock(&mux_interval_mutex);
  
        return count;
  }
@@@ -7321,7 -7280,7 +7321,7 @@@ skip_type
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.pmu = pmu;
  
 -              __perf_cpu_hrtimer_init(cpuctx, cpu);
 +              __perf_mux_hrtimer_init(cpuctx, cpu);
  
                cpuctx->unique_pmu = pmu;
        }
@@@ -7405,12 -7364,7 +7405,12 @@@ static int perf_try_init_event(struct p
                return -ENODEV;
  
        if (event->group_leader != event) {
 -              ctx = perf_event_ctx_lock(event->group_leader);
 +              /*
 +               * This ctx->mutex can nest when we're called through
 +               * inheritance. See the perf_event_ctx_lock_nested() comment.
 +               */
 +              ctx = perf_event_ctx_lock_nested(event->group_leader,
 +                                               SINGLE_DEPTH_NESTING);
                BUG_ON(!ctx);
        }
  
diff --combined mm/filemap.c
index 11f10efd637c2d67e071c482951e2bb38a105d6b,f851e36802d573e19f10307ab984c1d52bfc9588..1283fc82545861d155c4eef7013bf8e285040fed
   *    ->tree_lock             (page_remove_rmap->set_page_dirty)
   *    bdi.wb->list_lock               (page_remove_rmap->set_page_dirty)
   *    ->inode->i_lock         (page_remove_rmap->set_page_dirty)
 + *    ->memcg->move_lock      (page_remove_rmap->mem_cgroup_begin_page_stat)
   *    bdi.wb->list_lock               (zap_pte_range->set_page_dirty)
   *    ->inode->i_lock         (zap_pte_range->set_page_dirty)
   *    ->private_lock          (zap_pte_range->__set_page_dirty_buffers)
@@@ -175,11 -174,9 +175,11 @@@ static void page_cache_tree_delete(stru
  /*
   * Delete a page from the page cache and free it. Caller has to make
   * sure the page is locked and that nobody else uses it - or that usage
 - * is safe.  The caller must hold the mapping's tree_lock.
 + * is safe.  The caller must hold the mapping's tree_lock and
 + * mem_cgroup_begin_page_stat().
   */
 -void __delete_from_page_cache(struct page *page, void *shadow)
 +void __delete_from_page_cache(struct page *page, void *shadow,
 +                            struct mem_cgroup *memcg)
  {
        struct address_space *mapping = page->mapping;
  
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
  
 -      __dec_zone_page_state(page, NR_FILE_PAGES);
 +      /* hugetlb pages do not participate in page cache accounting. */
 +      if (!PageHuge(page))
 +              __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
         * anyway will be cleared before returning page into buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
 -              account_page_cleaned(page, mapping);
 +              account_page_cleaned(page, mapping, memcg,
 +                                   inode_to_wb(mapping->host));
  }
  
  /**
  void delete_from_page_cache(struct page *page)
  {
        struct address_space *mapping = page->mapping;
 +      struct mem_cgroup *memcg;
 +      unsigned long flags;
 +
        void (*freepage)(struct page *);
  
        BUG_ON(!PageLocked(page));
  
        freepage = mapping->a_ops->freepage;
 -      spin_lock_irq(&mapping->tree_lock);
 -      __delete_from_page_cache(page, NULL);
 -      spin_unlock_irq(&mapping->tree_lock);
 +
 +      memcg = mem_cgroup_begin_page_stat(page);
 +      spin_lock_irqsave(&mapping->tree_lock, flags);
 +      __delete_from_page_cache(page, NULL, memcg);
 +      spin_unlock_irqrestore(&mapping->tree_lock, flags);
 +      mem_cgroup_end_page_stat(memcg);
  
        if (freepage)
                freepage(page);
@@@ -293,9 -281,7 +293,9 @@@ int __filemap_fdatawrite_range(struct a
        if (!mapping_cap_writeback_dirty(mapping))
                return 0;
  
 +      wbc_attach_fdatawrite_inode(&wbc, mapping->host);
        ret = do_writepages(mapping, &wbc);
 +      wbc_detach_inode(&wbc);
        return ret;
  }
  
@@@ -484,8 -470,6 +484,8 @@@ int replace_page_cache_page(struct pag
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
 +              struct mem_cgroup *memcg;
 +              unsigned long flags;
  
                pgoff_t offset = old->index;
                freepage = mapping->a_ops->freepage;
                new->mapping = mapping;
                new->index = offset;
  
 -              spin_lock_irq(&mapping->tree_lock);
 -              __delete_from_page_cache(old, NULL);
 +              memcg = mem_cgroup_begin_page_stat(old);
 +              spin_lock_irqsave(&mapping->tree_lock, flags);
 +              __delete_from_page_cache(old, NULL, memcg);
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
 -              __inc_zone_page_state(new, NR_FILE_PAGES);
 +
 +              /*
 +               * hugetlb pages do not participate in page cache accounting.
 +               */
 +              if (!PageHuge(new))
 +                      __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
 -              spin_unlock_irq(&mapping->tree_lock);
 +              spin_unlock_irqrestore(&mapping->tree_lock, flags);
 +              mem_cgroup_end_page_stat(memcg);
                mem_cgroup_migrate(old, new, true);
                radix_tree_preload_end();
                if (freepage)
@@@ -598,10 -575,7 +598,10 @@@ static int __add_to_page_cache_locked(s
        radix_tree_preload_end();
        if (unlikely(error))
                goto err_insert;
 -      __inc_zone_page_state(page, NR_FILE_PAGES);
 +
 +      /* hugetlb pages do not participate in page cache accounting. */
 +      if (!huge)
 +              __inc_zone_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false);
@@@ -1680,8 -1654,8 +1680,8 @@@ no_cached_page
                        error = -ENOMEM;
                        goto out;
                }
 -              error = add_to_page_cache_lru(page, mapping,
 -                                              index, GFP_KERNEL);
 +              error = add_to_page_cache_lru(page, mapping, index,
 +                                      GFP_KERNEL & mapping_gfp_mask(mapping));
                if (error) {
                        page_cache_release(page);
                        if (error == -EEXIST) {
@@@ -1782,8 -1756,7 +1782,8 @@@ static int page_cache_read(struct file 
                if (!page)
                        return -ENOMEM;
  
 -              ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
 +              ret = add_to_page_cache_lru(page, mapping, offset,
 +                              GFP_KERNEL & mapping_gfp_mask(mapping));
                if (ret == 0)
                        ret = mapping->a_ops->readpage(file, page);
                else if (ret == -EEXIST)
@@@ -2563,7 -2536,7 +2563,7 @@@ ssize_t __generic_file_write_iter(struc
  
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
-       err = file_remove_suid(file);
+       err = file_remove_privs(file);
        if (err)
                goto out;
  
diff --combined mm/memory.c
index 11b9ca1767408dddb147c4b225de0aa31b8f17e7,28c10da1efbca0d7ac0af6189a7c0c2e0789e832..a84fbb772034f2e73eac300e254bd54f2a36ce03
@@@ -2081,12 -2081,11 +2081,12 @@@ static int wp_page_copy(struct mm_struc
                        goto oom;
                cow_user_page(new_page, old_page, address, vma);
        }
 -      __SetPageUptodate(new_page);
  
        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
                goto oom_free_new;
  
 +      __SetPageUptodate(new_page);
 +
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
  
        /*
@@@ -2690,10 -2689,6 +2690,10 @@@ static int do_anonymous_page(struct mm_
        page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!page)
                goto oom;
 +
 +      if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
 +              goto oom_free_page;
 +
        /*
         * The memory barrier inside __SetPageUptodate makes sure that
         * preceding stores to the page contents become visible before
         */
        __SetPageUptodate(page);
  
 -      if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
 -              goto oom_free_page;
 -
        entry = mk_pte(page, vma->vm_page_prot);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));
@@@ -3726,7 -3724,7 +3726,7 @@@ void print_vma_addr(char *prefix, unsig
                if (buf) {
                        char *p;
  
-                       p = d_path(&f->f_path, buf, PAGE_SIZE);
+                       p = file_path(f, buf, PAGE_SIZE);
                        if (IS_ERR(p))
                                p = "?";
                        printk("%s%s[%lx+%lx]", prefix, kbasename(p),
  }
  
  #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
 -void might_fault(void)
 +void __might_fault(const char *file, int line)
  {
        /*
         * Some code (nfs/sunrpc) uses socket ops on kernel memory while
         */
        if (segment_eq(get_fs(), KERNEL_DS))
                return;
 -
 -      /*
 -       * it would be nicer only to annotate paths which are not under
 -       * pagefault_disable, however that requires a larger audit and
 -       * providing helpers like get_user_atomic.
 -       */
 -      if (in_atomic())
 +      if (pagefault_disabled())
                return;
 -
 -      __might_sleep(__FILE__, __LINE__, 0);
 -
 +      __might_sleep(file, line, 0);
 +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
        if (current->mm)
                might_lock_read(&current->mm->mmap_sem);
 +#endif
  }
 -EXPORT_SYMBOL(might_fault);
 +EXPORT_SYMBOL(__might_fault);
  #endif
  
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
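
Both mm/memory.c ordering hunks make the same point: charge the page to the memcg while
it is still private, and only then publish it with __SetPageUptodate(), whose barrier
orders the zeroed contents against the later PTE store. Schematically:

        page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!page)
                goto oom;
        if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
                goto oom_free_page;     /* fail while the page is still private */
        __SetPageUptodate(page);        /* contents visible before the PTE is */
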
diff --combined security/inode.c
index 0e37e4fba8faca36d118ab5a1dc75c139bf3ea2f,6df0d8dae1e0d9b8b1c482d0f2eca622a3e0e414..16622aef9bdea83bee67e5e0080c7b9a17d240ae
  static struct vfsmount *mount;
  static int mount_count;
  
- static inline int positive(struct dentry *dentry)
- {
-       return d_really_is_positive(dentry) && !d_unhashed(dentry);
- }
  static int fill_super(struct super_block *sb, void *data, int silent)
  {
        static struct tree_descr files[] = {{""}};
@@@ -201,31 -196,31 +196,29 @@@ void securityfs_remove(struct dentry *d
                return;
  
        mutex_lock(&d_inode(parent)->i_mutex);
-       if (positive(dentry)) {
-               if (d_really_is_positive(dentry)) {
-                       if (d_is_dir(dentry))
-                               simple_rmdir(d_inode(parent), dentry);
-                       else
-                               simple_unlink(d_inode(parent), dentry);
-                       dput(dentry);
-               }
+       if (simple_positive(dentry)) {
+               if (d_is_dir(dentry))
+                       simple_rmdir(d_inode(parent), dentry);
+               else
+                       simple_unlink(d_inode(parent), dentry);
+               dput(dentry);
        }
        mutex_unlock(&d_inode(parent)->i_mutex);
        simple_release_fs(&mount, &mount_count);
  }
  EXPORT_SYMBOL_GPL(securityfs_remove);
  
 -static struct kobject *security_kobj;
 -
  static int __init securityfs_init(void)
  {
        int retval;
  
 -      security_kobj = kobject_create_and_add("security", kernel_kobj);
 -      if (!security_kobj)
 -              return -EINVAL;
 +      retval = sysfs_create_mount_point(kernel_kobj, "security");
 +      if (retval)
 +              return retval;
  
        retval = register_filesystem(&fs_type);
        if (retval)
 -              kobject_put(security_kobj);
 +              sysfs_remove_mount_point(kernel_kobj, "security");
        return retval;
  }
  