returned if the filesystem cannot handle rcu-walk. See
Documentation/filesystems/vfs.txt for more details.
- permission and check_acl are inode permission checks that are called
-on many or all directory inodes on the way down a path walk (to check for
-exec permission). These must now be rcu-walk aware (flags & IPERM_FLAG_RCU).
-See Documentation/filesystems/vfs.txt for more details.
+ permission is an inode permission check that is called on many or all
+directory inodes on the way down a path walk (to check for exec permission). It
+must now be rcu-walk aware (mask & MAY_NOT_BLOCK). See
+Documentation/filesystems/vfs.txt for more details.
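A minimal sketch of an rcu-walk aware ->permission() method (the filesystem
name and the blocking check below are hypothetical, not taken from any
in-tree code): when MAY_NOT_BLOCK is set, the method must not sleep and
should return -ECHILD so the lookup is retried in ref-walk mode.

	static int foo_permission(struct inode *inode, int mask)
	{
		if (mask & MAY_NOT_BLOCK) {
			/* rcu-walk: sleeping is not allowed here */
			if (foo_would_block(inode))
				return -ECHILD;	/* retry in ref-walk mode */
		}
		return generic_permission(inode, mask);
	}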
--
[mandatory]
dentry, it does not get nameidata at all and it gets called only when cookie
is non-NULL. Note that link body isn't available anymore, so if you need it,
store it as cookie.
+ --
+ [mandatory]
+ __fd_install() & fd_install() can now sleep. Callers should not
+ hold a spinlock or other resources that do not allow a schedule.
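A minimal sketch of the safe ordering (the lock and the state update are
hypothetical): reserve the descriptor, finish any work done under spinlocks,
and publish the file last, with no locks held across fd_install().

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;
	spin_lock(&foo_lock);
	/* ... record state that must be visible before the fd ... */
	spin_unlock(&foo_lock);
	fd_install(fd, file);	/* may sleep, so no locks are held here */
	return fd;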
#include <linux/proc_fs.h>
#include <linux/file.h>
#include <asm/arcregs.h>
+#include <asm/irqflags.h>
/*
* Common routine to print scratch regs (r0-r12) or callee regs (r13-r25)
n += scnprintf(buf + n, len - n, "\n");
/* because pt_regs has regs reversed: r12..r0, r25..r13 */
- reg_rev--;
+ if (is_isa_arcv2() && start_num == 0)
+ reg_rev++;
+ else
+ reg_rev--;
}
if (start_num != 0)
mmput(mm);
if (exe_file) {
- path = exe_file->f_path;
- path_get(&exe_file->f_path);
+ path_nm = file_path(exe_file, buf, 255);
fput(exe_file);
- path_nm = d_path(&path, buf, 255);
- path_put(&path);
}
done:
- pr_info("Path: %s\n", path_nm);
+ pr_info("Path: %s\n", !IS_ERR(path_nm) ? path_nm : "?");
}
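/*
 * For reference, file_path() behaves like d_path(): it fills the buffer
 * from the end and returns a pointer into it, or an ERR_PTR() on failure,
 * so callers print the returned pointer rather than the buffer start.
 * A minimal, hypothetical sketch of the pattern (buffer size illustrative):
 */
static void foo_print_backing_file(struct file *filp)
{
	char buf[256];
	char *nm = file_path(filp, buf, sizeof(buf));

	if (IS_ERR(nm))
		nm = "?";
	pr_info("backing file: %s\n", nm);
}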
static void show_faulting_vma(unsigned long address, char *buf)
if (vma && (vma->vm_start <= address)) {
struct file *file = vma->vm_file;
if (file) {
- struct path *path = &file->f_path;
- nm = d_path(path, buf, PAGE_SIZE - 1);
+ nm = file_path(file, buf, PAGE_SIZE - 1);
inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
((cause_code == 0x02) ? "Write" : "EX"));
} else if (vec == ECR_V_INSN_ERR) {
pr_cont("Illegal Insn\n");
+#ifdef CONFIG_ISA_ARCV2
+ } else if (vec == ECR_V_MEM_ERR) {
+ if (cause_code == 0x00)
+ pr_cont("Bus Error from Insn Mem\n");
+ else if (cause_code == 0x10)
+ pr_cont("Bus Error from Data Mem\n");
+ else
+ pr_cont("Bus Error, check PRM\n");
+#endif
} else {
pr_cont("Check Programmer's Manual\n");
}
pr_info("[STAT32]: 0x%08lx", regs->status32);
-#define STS_BIT(r, bit) r->status32 & STATUS_##bit##_MASK ? #bit : ""
- if (!user_mode(regs))
- pr_cont(" : %2s %2s %2s %2s %2s\n",
- STS_BIT(regs, AE), STS_BIT(regs, A2), STS_BIT(regs, A1),
- STS_BIT(regs, E2), STS_BIT(regs, E1));
+#define STS_BIT(r, bit) r->status32 & STATUS_##bit##_MASK ? #bit" " : ""
+#ifdef CONFIG_ISA_ARCOMPACT
+ pr_cont(" : %2s%2s%2s%2s%2s%2s%2s\n",
+ (regs->status32 & STATUS_U_MASK) ? "U " : "K ",
+ STS_BIT(regs, DE), STS_BIT(regs, AE),
+ STS_BIT(regs, A2), STS_BIT(regs, A1),
+ STS_BIT(regs, E2), STS_BIT(regs, E1));
+#else
+ pr_cont(" : %2s%2s%2s%2s\n",
+ STS_BIT(regs, IE),
+ (regs->status32 & STATUS_U_MASK) ? "U " : "K ",
+ STS_BIT(regs, DE), STS_BIT(regs, AE));
+#endif
pr_info("BTA: 0x%08lx\t SP: 0x%08lx\t FP: 0x%08lx\n",
regs->bta, regs->sp, regs->fp);
pr_info("LPS: 0x%08lx\tLPE: 0x%08lx\tLPC: 0x%08lx\n",
hypfs_last_dentry = dentry;
}
- static inline int hypfs_positive(struct dentry *dentry)
- {
- return d_really_is_positive(dentry) && !d_unhashed(dentry);
- }
-
static void hypfs_remove(struct dentry *dentry)
{
struct dentry *parent;
parent = dentry->d_parent;
mutex_lock(&d_inode(parent)->i_mutex);
- if (hypfs_positive(dentry)) {
+ if (simple_positive(dentry)) {
if (d_is_dir(dentry))
simple_rmdir(d_inode(parent), dentry);
else
.show_options = hypfs_show_options,
};
-static struct kobject *s390_kobj;
-
static int __init hypfs_init(void)
{
int rc;
rc = -ENODATA;
goto fail_hypfs_sprp_exit;
}
- s390_kobj = kobject_create_and_add("s390", hypervisor_kobj);
- if (!s390_kobj) {
- rc = -ENOMEM;
+ rc = sysfs_create_mount_point(hypervisor_kobj, "s390");
+ if (rc)
goto fail_hypfs_diag0c_exit;
- }
rc = register_filesystem(&hypfs_type);
if (rc)
goto fail_filesystem;
return 0;
fail_filesystem:
- kobject_put(s390_kobj);
+ sysfs_remove_mount_point(hypervisor_kobj, "s390");
fail_hypfs_diag0c_exit:
hypfs_diag0c_exit();
fail_hypfs_sprp_exit:
static void __exit hypfs_exit(void)
{
unregister_filesystem(&hypfs_type);
- kobject_put(s390_kobj);
+ sysfs_remove_mount_point(hypervisor_kobj, "s390");
hypfs_diag0c_exit();
hypfs_sprp_exit();
hypfs_vm_exit();
#include <linux/mmzone.h>
#include <linux/dcache.h>
#include <linux/fs.h>
+#include <linux/hardirq.h>
#include <linux/string.h>
#include <asm/backtrace.h>
#include <asm/page.h>
if (kbt->verbose)
pr_err(" <%s while in user mode>\n", fault);
} else {
- if (kbt->verbose)
+ if (kbt->verbose && (p->pc != 0 || p->sp != 0 || p->ex1 != 0))
pr_err(" (odd fault: pc %#lx, sp %#lx, ex1 %#lx?)\n",
p->pc, p->sp, p->ex1);
return NULL;
return p;
}
-/* Is the pc pointing to a sigreturn trampoline? */
-static int is_sigreturn(unsigned long pc)
+/* Is the iterator pointing to a sigreturn trampoline? */
+static int is_sigreturn(struct KBacktraceIterator *kbt)
{
- return current->mm && (pc == VDSO_SYM(&__vdso_rt_sigreturn));
+ return kbt->task->mm &&
+ (kbt->it.pc == ((ulong)kbt->task->mm->context.vdso_base +
+ (ulong)&__vdso_rt_sigreturn));
}
/* Return a pt_regs pointer for a valid signal handler frame */
{
BacktraceIterator *b = &kbt->it;
- if (is_sigreturn(b->pc) && b->sp < PAGE_OFFSET &&
+ if (is_sigreturn(kbt) && b->sp < PAGE_OFFSET &&
b->sp % sizeof(long) == 0) {
int retval;
pagefault_disable();
return NULL;
}
-static int KBacktraceIterator_is_sigreturn(struct KBacktraceIterator *kbt)
-{
- return is_sigreturn(kbt->it.pc);
-}
-
static int KBacktraceIterator_restart(struct KBacktraceIterator *kbt)
{
struct pt_regs *p;
{
for (;;) {
do {
- if (!KBacktraceIterator_is_sigreturn(kbt))
+ if (!is_sigreturn(kbt))
return KBT_ONGOING;
} while (backtrace_next(&kbt->it));
}
if (vma->vm_file) {
- p = d_path(&vma->vm_file->f_path, buf, bufsize);
+ p = file_path(vma->vm_file, buf, bufsize);
if (IS_ERR(p))
p = "?";
name = kbasename(p);
*/
static bool start_backtrace(void)
{
- if (current->thread.in_backtrace) {
+ if (current_thread_info()->in_backtrace) {
pr_err("Backtrace requested while in backtrace!\n");
return false;
}
- current->thread.in_backtrace = true;
+ current_thread_info()->in_backtrace = true;
return true;
}
static void end_backtrace(void)
{
- current->thread.in_backtrace = false;
+ current_thread_info()->in_backtrace = false;
}
/*
* This method wraps the backtracer's more generic support.
* It is only invoked from the architecture-specific code; show_stack()
- * and dump_stack() (in entry.S) are architecture-independent entry points.
+ * and dump_stack() are architecture-independent entry points.
*/
-void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
+void tile_show_stack(struct KBacktraceIterator *kbt)
{
int i;
int have_mmap_sem = 0;
if (!start_backtrace())
return;
- if (headers) {
- /*
- * Add a blank line since if we are called from panic(),
- * then bust_spinlocks() spit out a space in front of us
- * and it will mess up our KERN_ERR.
- */
- pr_err("Starting stack dump of tid %d, pid %d (%s) on cpu %d at cycle %lld\n",
- kbt->task->pid, kbt->task->tgid, kbt->task->comm,
- raw_smp_processor_id(), get_cycles());
- }
kbt->verbose = 1;
i = 0;
for (; !KBacktraceIterator_end(kbt); KBacktraceIterator_next(kbt)) {
char namebuf[KSYM_NAME_LEN+100];
unsigned long address = kbt->it.pc;
- /* Try to acquire the mmap_sem as we pass into userspace. */
- if (address < PAGE_OFFSET && !have_mmap_sem && kbt->task->mm)
+ /*
+ * Try to acquire the mmap_sem as we pass into userspace.
+ * If we're in an interrupt context, don't even try, since
+ * it's not safe to call e.g. d_path() from an interrupt,
+ * since it uses spin locks without disabling interrupts.
+ * Note we test "kbt->task == current", not "kbt->is_current",
+ * since we're checking that "current" will work in d_path().
+ */
+ if (kbt->task == current && address < PAGE_OFFSET &&
+ !have_mmap_sem && kbt->task->mm && !in_interrupt()) {
have_mmap_sem =
down_read_trylock(&kbt->task->mm->mmap_sem);
+ }
describe_addr(kbt, address, have_mmap_sem,
namebuf, sizeof(namebuf));
}
if (kbt->end == KBT_LOOP)
pr_err("Stack dump stopped; next frame identical to this one\n");
- if (headers)
- pr_err("Stack dump complete\n");
if (have_mmap_sem)
up_read(&kbt->task->mm->mmap_sem);
end_backtrace();
}
EXPORT_SYMBOL(tile_show_stack);
-
-/* This is called from show_regs() and _dump_stack() */
-void dump_stack_regs(struct pt_regs *regs)
-{
- struct KBacktraceIterator kbt;
- KBacktraceIterator_init(&kbt, NULL, regs);
- tile_show_stack(&kbt, 1);
-}
-EXPORT_SYMBOL(dump_stack_regs);
-
static struct pt_regs *regs_to_pt_regs(struct pt_regs *regs,
ulong pc, ulong lr, ulong sp, ulong r52)
{
return regs;
}
-/* This is called from dump_stack() and just converts to pt_regs */
+/* Deprecated function currently only used by kernel_double_fault(). */
void _dump_stack(int dummy, ulong pc, ulong lr, ulong sp, ulong r52)
{
+ struct KBacktraceIterator kbt;
struct pt_regs regs;
- dump_stack_regs(regs_to_pt_regs(&regs, pc, lr, sp, r52));
+
+ regs_to_pt_regs(&regs, pc, lr, sp, r52);
+ KBacktraceIterator_init(&kbt, NULL, &regs);
+ tile_show_stack(&kbt);
}
/* This is called from KBacktraceIterator_init_current() */
regs_to_pt_regs(&regs, pc, lr, sp, r52));
}
-/* This is called only from kernel/sched/core.c, with esp == NULL */
+/*
+ * Called from sched_show_task() with task != NULL, or dump_stack()
+ * with task == NULL. The esp argument is always NULL.
+ */
void show_stack(struct task_struct *task, unsigned long *esp)
{
struct KBacktraceIterator kbt;
- if (task == NULL || task == current)
+ if (task == NULL || task == current) {
KBacktraceIterator_init_current(&kbt);
- else
+ KBacktraceIterator_next(&kbt); /* don't show first frame */
+ } else {
KBacktraceIterator_init(&kbt, task, NULL);
- tile_show_stack(&kbt, 0);
+ }
+ tile_show_stack(&kbt);
}
#ifdef CONFIG_STACKTRACE
/* Support generic Linux stack API too */
-void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
+static void save_stack_trace_common(struct task_struct *task,
+ struct pt_regs *regs,
+ bool user,
+ struct stack_trace *trace)
{
struct KBacktraceIterator kbt;
int skip = trace->skip;
if (!start_backtrace())
goto done;
- if (task == NULL || task == current)
+ if (regs != NULL) {
+ KBacktraceIterator_init(&kbt, NULL, regs);
+ } else if (task == NULL || task == current) {
KBacktraceIterator_init_current(&kbt);
- else
+ skip++; /* don't show KBacktraceIterator_init_current */
+ } else {
KBacktraceIterator_init(&kbt, task, NULL);
+ }
for (; !KBacktraceIterator_end(&kbt); KBacktraceIterator_next(&kbt)) {
if (skip) {
--skip;
continue;
}
- if (i >= trace->max_entries || kbt.it.pc < PAGE_OFFSET)
+ if (i >= trace->max_entries ||
+ (!user && kbt.it.pc < PAGE_OFFSET))
break;
trace->entries[i++] = kbt.it.pc;
}
end_backtrace();
done:
+ if (i < trace->max_entries)
+ trace->entries[i++] = ULONG_MAX;
trace->nr_entries = i;
}
+
+void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
+{
+ save_stack_trace_common(task, NULL, false, trace);
+}
EXPORT_SYMBOL(save_stack_trace_tsk);
void save_stack_trace(struct stack_trace *trace)
{
- save_stack_trace_tsk(NULL, trace);
+ save_stack_trace_common(NULL, NULL, false, trace);
}
EXPORT_SYMBOL_GPL(save_stack_trace);
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+{
+ save_stack_trace_common(NULL, regs, false, trace);
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+ /* Trace user stack if we are not a kernel thread. */
+ if (current->mm)
+ save_stack_trace_common(NULL, task_pt_regs(current),
+ true, trace);
+ else if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
#endif
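/*
 * A minimal, hypothetical sketch of a consumer of the generic stacktrace
 * API that the helpers above implement (array size is illustrative):
 */
static void foo_dump_current_stack(void)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.max_entries	= ARRAY_SIZE(entries),
		.entries	= entries,
	};

	save_stack_trace(&trace);
	print_stack_trace(&trace, 0);	/* dumps trace.nr_entries frames */
}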
/* In entry.S */
static int max_part;
static int part_shift;
-static struct workqueue_struct *loop_wq;
-
static int transfer_xor(struct loop_device *lo, int cmd,
struct page *raw_page, unsigned raw_off,
struct page *loop_page, unsigned loop_off,
return loop_switch(lo, NULL);
}
+static void loop_reread_partitions(struct loop_device *lo,
+ struct block_device *bdev)
+{
+ int rc;
+
+ /*
+ * bd_mutex has been held already in release path, so don't
+ * acquire it if this function is called in such case.
+ *
+ * If the reread partition isn't from release path, lo_refcnt
+ * must be at least one and it can only become zero when the
+ * current holder is released.
+ */
+ if (!atomic_read(&lo->lo_refcnt))
+ rc = __blkdev_reread_part(bdev);
+ else
+ rc = blkdev_reread_part(bdev);
+ if (rc)
+ pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
+ __func__, lo->lo_number, lo->lo_file_name, rc);
+}
+
/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
fput(old_file);
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
- ioctl_by_bdev(bdev, BLKRRPART, 0);
+ loop_reread_partitions(lo, bdev);
return 0;
out_putf:
spin_lock_irq(&lo->lo_lock);
if (lo->lo_backing_file)
- p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+ p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
spin_unlock_irq(&lo->lo_lock);
if (IS_ERR_OR_NULL(p))
size = get_loop_size(lo, file);
if ((loff_t)(sector_t)size != size)
goto out_putf;
+ error = -ENOMEM;
+ lo->wq = alloc_workqueue("kloopd%d",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
+ lo->lo_number);
+ if (!lo->wq)
+ goto out_putf;
error = 0;
if (part_shift)
lo->lo_flags |= LO_FLAGS_PARTSCAN;
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
- ioctl_by_bdev(bdev, BLKRRPART, 0);
+ loop_reread_partitions(lo, bdev);
/* Grab the block_device to prevent its destruction after we
* put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
* <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
* command to fail with EBUSY.
*/
- if (lo->lo_refcnt > 1) {
+ if (atomic_read(&lo->lo_refcnt) > 1) {
lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
mutex_unlock(&lo->lo_ctl_mutex);
return 0;
if (filp == NULL)
return -EINVAL;
+ /* freeze request queue during the transition */
+ blk_mq_freeze_queue(lo->lo_queue);
+
spin_lock_irq(&lo->lo_lock);
lo->lo_state = Lo_rundown;
lo->lo_backing_file = NULL;
lo->lo_state = Lo_unbound;
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
+ blk_mq_unfreeze_queue(lo->lo_queue);
+
if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
- ioctl_by_bdev(bdev, BLKRRPART, 0);
+ loop_reread_partitions(lo, bdev);
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+ destroy_workqueue(lo->wq);
+ lo->wq = NULL;
mutex_unlock(&lo->lo_ctl_mutex);
/*
* Need not hold lo_ctl_mutex to fput backing file.
!(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
lo->lo_flags |= LO_FLAGS_PARTSCAN;
lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
- ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+ loop_reread_partitions(lo, lo->lo_device);
}
lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
goto out;
}
- mutex_lock(&lo->lo_ctl_mutex);
- lo->lo_refcnt++;
- mutex_unlock(&lo->lo_ctl_mutex);
+ atomic_inc(&lo->lo_refcnt);
out:
mutex_unlock(&loop_index_mutex);
return err;
struct loop_device *lo = disk->private_data;
int err;
- mutex_lock(&lo->lo_ctl_mutex);
-
- if (--lo->lo_refcnt)
- goto out;
+ if (atomic_dec_return(&lo->lo_refcnt))
+ return;
+ mutex_lock(&lo->lo_ctl_mutex);
if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
/*
* In autoclear mode, stop the loop thread
loop_flush(lo);
}
-out:
mutex_unlock(&lo->lo_ctl_mutex);
}
const struct blk_mq_queue_data *bd)
{
struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+ struct loop_device *lo = cmd->rq->q->queuedata;
blk_mq_start_request(bd->rq);
+ if (lo->lo_state != Lo_bound)
+ return -EIO;
+
if (cmd->rq->cmd_flags & REQ_WRITE) {
struct loop_device *lo = cmd->rq->q->queuedata;
bool need_sched = true;
spin_unlock_irq(&lo->lo_lock);
if (need_sched)
- queue_work(loop_wq, &lo->write_work);
+ queue_work(lo->wq, &lo->write_work);
} else {
- queue_work(loop_wq, &cmd->read_work);
+ queue_work(lo->wq, &cmd->read_work);
}
return BLK_MQ_RQ_QUEUE_OK;
struct loop_device *lo = cmd->rq->q->queuedata;
int ret = -EIO;
- if (lo->lo_state != Lo_bound)
- goto failed;
-
if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
goto failed;
disk->flags |= GENHD_FL_NO_PART_SCAN;
disk->flags |= GENHD_FL_EXT_DEVT;
mutex_init(&lo->lo_ctl_mutex);
+ atomic_set(&lo->lo_refcnt, 0);
lo->lo_number = i;
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
mutex_unlock(&lo->lo_ctl_mutex);
break;
}
- if (lo->lo_refcnt > 0) {
+ if (atomic_read(&lo->lo_refcnt) > 0) {
ret = -EBUSY;
mutex_unlock(&lo->lo_ctl_mutex);
break;
goto misc_out;
}
- loop_wq = alloc_workqueue("kloopd",
- WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
- if (!loop_wq) {
- err = -ENOMEM;
- goto misc_out;
- }
-
blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
THIS_MODULE, loop_probe, NULL, NULL);
blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
unregister_blkdev(LOOP_MAJOR, "loop");
- destroy_workqueue(loop_wq);
-
misc_deregister(&loop_misc);
}
* nr_pending is 0 and In_sync is clear, the entries we return will
* still be in the same position on the list when we re-enter
* list_for_each_entry_continue_rcu.
+ *
+ * Note that if entered with 'rdev == NULL' to start at the
+ * beginning, we temporarily assign 'rdev' to an address which
+ * isn't really an rdev, but which can be used by
+ * list_for_each_entry_continue_rcu() to find the first entry.
*/
rcu_read_lock();
if (rdev == NULL)
/* start at the beginning */
- rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
+ rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
else {
/* release the previous rdev and start from there. */
rdev_dec_pending(rdev, mddev);
if (bitmap->storage.file) {
path = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (path)
- ptr = d_path(&bitmap->storage.file->f_path,
+ ptr = file_path(bitmap->storage.file,
path, PAGE_SIZE);
printk(KERN_ALERT
chunk_kb ? "KB" : "B");
if (bitmap->storage.file) {
seq_printf(seq, ", file: ");
- seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
+ seq_file_path(seq, bitmap->storage.file, " \t\n");
}
seq_printf(seq, "\n");
{
char b[BDEVNAME_SIZE];
struct kobject *ko;
- char *s;
int err;
/* prevent duplicates */
return -EBUSY;
}
bdevname(rdev->bdev,b);
- while ( (s=strchr(b, '/')) != NULL)
- *s = '!';
+ strreplace(b, '/', '!');
rdev->mddev = mddev;
printk(KERN_INFO "md: bind<%s>\n", b);
static ssize_t
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&rdev->corrected_errors, n);
- return len;
- }
- return -EINVAL;
+ unsigned int n;
+ int rv;
+
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&rdev->corrected_errors, n);
+ return len;
}
static struct rdev_sysfs_entry rdev_errors =
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
static ssize_t
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
+ int slot;
int err;
- int slot = simple_strtoul(buf, &e, 10);
+
if (strncmp(buf, "none", 4)==0)
slot = -1;
- else if (e==buf || (*e && *e!= '\n'))
- return -EINVAL;
+ else {
+ err = kstrtouint(buf, 10, (unsigned int *)&slot);
+ if (err < 0)
+ return err;
+ }
if (rdev->mddev->pers && slot == -1) {
/* Setting 'slot' on an active array requires also
* updating the 'rd%d' link, and communicating
static ssize_t
layout_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
int err;
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
return err;
static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
+ unsigned int n;
int err;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long n;
int err;
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtoul(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long long n;
int err;
- char *e;
- unsigned long long n = simple_strtoull(buf, &e, 10);
+
+ if (cmd_match(buf, "none"))
+ n = MaxSector;
+ else {
+ err = kstrtoull(buf, 10, &n);
+ if (err < 0)
+ return err;
+ if (n != (sector_t)n)
+ return -EINVAL;
+ }
err = mddev_lock(mddev);
if (err)
return err;
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
err = -EBUSY;
- else if (cmd_match(buf, "none"))
- n = MaxSector;
- else if (!*buf || (*e && *e != '\n'))
- err = -EINVAL;
if (!err) {
mddev->recovery_cp = n;
err = -EBUSY;
}
spin_unlock(&mddev->lock);
- return err;
+ return err ?: len;
}
err = mddev_lock(mddev);
if (err)
static ssize_t
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
+ int rv;
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&mddev->max_corr_read_errors, n);
- return len;
- }
- return -EINVAL;
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&mddev->max_corr_read_errors, n);
+ return len;
}
static struct md_sysfs_entry max_corr_read_errors =
else
rdev = md_import_device(dev, -1, -1);
- if (IS_ERR(rdev))
+ if (IS_ERR(rdev)) {
+ mddev_unlock(mddev);
return PTR_ERR(rdev);
+ }
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
- if (cmd_match(page, "frozen"))
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
- flush_workqueue(md_misc_wq);
- if (mddev->sync_thread) {
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- if (mddev_lock(mddev) == 0) {
+ if (cmd_match(page, "frozen"))
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ else
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ mddev_lock(mddev) == 0) {
+ flush_workqueue(md_misc_wq);
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
- mddev_unlock(mddev);
}
+ mddev_unlock(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
else if (cmd_match(page, "recover")) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} else if (cmd_match(page, "reshape")) {
int err;
if (mddev->pers->start_reshape == NULL)
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
err = mddev->pers->start_reshape(mddev);
mddev_unlock(mddev);
}
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
else if (!cmd_match(page, "repair"))
return -EINVAL;
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
static ssize_t
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
{
- int min;
- char *e;
+ unsigned int min;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_min = 0;
- return len;
+ min = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &min);
+ if (rv < 0)
+ return rv;
+ if (min == 0)
+ return -EINVAL;
}
- min = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || min <= 0)
- return -EINVAL;
mddev->sync_speed_min = min;
return len;
}
static ssize_t
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
{
- int max;
- char *e;
+ unsigned int max;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_max = 0;
- return len;
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
}
- max = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || max <= 0)
- return -EINVAL;
mddev->sync_speed_max = max;
return len;
}
static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
struct md_rdev *rdev;
- char *e;
+ unsigned long long new;
int err;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
if (err)
mddev_detach(mddev);
if (mddev->private)
pers->free(mddev, mddev->private);
+ mddev->private = NULL;
module_put(pers->owner);
bitmap_destroy(mddev);
return err;
mddev->changed = 0;
mddev->degraded = 0;
mddev->safemode = 0;
+ mddev->private = NULL;
mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev->pers = NULL;
spin_unlock(&mddev->lock);
pers->free(mddev, mddev->private);
+ mddev->private = NULL;
if (pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
module_put(pers->owner);
/* bitmap disabled, zero the first byte and copy out */
if (!mddev->bitmap_info.file)
file->pathname[0] = '\0';
- else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+ else if ((ptr = file_path(mddev->bitmap_info.file,
file->pathname, sizeof(file->pathname))),
IS_ERR(ptr))
err = PTR_ERR(ptr);
mddev->ctime != info->ctime ||
mddev->level != info->level ||
/* mddev->layout != info->layout || */
- !mddev->persistent != info->not_persistent||
+ mddev->persistent != !info->not_persistent ||
mddev->chunk_sectors != info->chunk_size >> 9 ||
/* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
((state^info->state) & 0xfffffe00)
int spares = 0;
if (mddev->ro) {
+ struct md_rdev *rdev;
+ if (!mddev->external && mddev->in_sync)
+ /* 'Blocked' flag not needed as failed devices
+ * will be recorded if array switched to read/write.
+ * Leaving it set will prevent the device
+ * from being removed.
+ */
+ rdev_for_each(rdev, mddev)
+ clear_bit(Blocked, &rdev->flags);
/* On a read-only array we can:
* - remove failed devices
* - add already-in_sync devices if the array itself
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
}
static int set_ro(const char *val, struct kernel_param *kp)
{
- char *e;
- int num = simple_strtoul(val, &e, 10);
- if (*val && (*e == '\0' || *e == '\n')) {
- start_readonly = num;
- return 0;
- }
- return -EINVAL;
+ return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
total_size = total_mapping_size(elf_phdata,
loc->elf_ex.e_phnum);
if (!total_size) {
- error = -EINVAL;
+ retval = -EINVAL;
goto out_free_dentry;
}
}
file = vma->vm_file;
if (!file)
continue;
- filename = d_path(&file->f_path, name_curpos, remaining);
+ filename = file_path(file, name_curpos, remaining);
if (IS_ERR(filename)) {
if (PTR_ERR(filename) == -ENAMETOOLONG) {
vfree(data);
continue;
}
- /* d_path() fills at the end, move name down */
+ /* file_path() fills at the end, move name down */
/* n = strlen(filename) + 1: */
n = (name_curpos + remaining) - filename;
remaining = filename - name_curpos;
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
return container_of(inode, struct bdev_inode, vfs_inode);
}
-inline struct block_device *I_BDEV(struct inode *inode)
+struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ if (IS_DAX(inode))
+ return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
+ NULL, DIO_SKIP_DIO_COUNT);
return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
blkdev_get_block, NULL, NULL,
DIO_SKIP_DIO_COUNT);
struct page *page)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page)
+ if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
}
int result;
int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page)
+ if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
long avail;
const struct block_device_operations *ops = bdev->bd_disk->fops;
+ /*
+ * The device driver is allowed to sleep, in order to make the
+ * memory directly accessible.
+ */
+ might_sleep();
+
if (size < 0)
return size;
if (!ops->direct_access)
.kill_sb = kill_anon_super,
};
-static struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
{
return bdev;
}
-int sb_is_blkdev_sb(struct super_block *sb)
-{
- return sb == blockdev_superblock;
-}
-
/* Call when you free inode */
void bd_forget(struct inode *inode)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
+ bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
}
current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
struct btrfs_log_ctx ctx;
int ret = 0;
bool full_sync = 0;
+ const u64 len = end - start + 1;
trace_btrfs_sync_file(file, datasync);
* all extents are persisted and the respective file extent
* items are in the fs/subvol btree.
*/
- ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+ ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
* Start any new ordered operations before starting to log the
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- (full_sync && BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed)) {
+ (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed &&
+ (full_sync ||
+ !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
- cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+ cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM;
}
cf->fmode = fmode;
cf->next_offset = 2;
+ cf->readdir_cache_idx = -1;
file->private_data = cf;
BUG_ON(inode->i_fop->release != ceph_release);
break;
ceph_mdsc_put_request(cf->last_readdir);
kfree(cf->last_name);
kfree(cf->dir_info);
- dput(cf->dentry);
kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */
}
} else {
num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages))
return PTR_ERR(pages);
ret = striped_read(inode, off, len, pages,
* objects, rollback on failure, etc.)
*/
static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+ struct ceph_snap_context *snapc)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
size_t start;
ssize_t n;
- snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0,
break;
}
- osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+ osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
n = iov_iter_get_pages_alloc(from, &pages, len, &start);
if (unlikely(n < 0)) {
* objects, rollback on failure, etc.)
*/
static ssize_t
-ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+ struct ceph_snap_context *snapc)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
size_t left;
int n;
- snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0, 1,
*/
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
struct page *page = NULL;
loff_t i_size;
if (retry_op == READ_INLINE) {
- page = __page_cache_alloc(GFP_NOFS);
+ page = __page_cache_alloc(GFP_KERNEL);
if (!page)
return -ENOMEM;
}
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
+ struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
int err, want, got;
loff_t pos;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
pos = iocb->ki_pos;
count = iov_iter_count(from);
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err)
goto out;
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ struct ceph_snap_context *snapc;
struct iov_iter data;
mutex_unlock(&inode->i_mutex);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
/* we might need to revert back to that point */
data = *from;
if (iocb->ki_flags & IOCB_DIRECT)
- written = ceph_sync_direct_write(iocb, &data, pos);
+ written = ceph_sync_direct_write(iocb, &data, pos,
+ snapc);
else
- written = ceph_sync_write(iocb, &data, pos);
+ written = ceph_sync_write(iocb, &data, pos, snapc);
if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n",
}
if (written > 0)
iov_iter_advance(from, written);
+ ceph_put_snap_context(snapc);
} else {
loff_t old_size = inode->i_size;
/*
int dirty;
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
out:
mutex_unlock(&inode->i_mutex);
out_unlocked:
+ ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL;
return written ? written : err;
}
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
+ struct ceph_cap_flush *prealloc_cf;
int want, got = 0;
int dirty;
int ret = 0;
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) {
if (!ret) {
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
ceph_put_cap_refs(ci, got);
unlock:
mutex_unlock(&inode->i_mutex);
+ ceph_free_cap_flush(prealloc_cf);
return ret;
}
return 0;
}
-static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
+ va_list arg)
{
int free, need;
va_list arg_copy;
return -ENOMEM;
}
-static int cn_printf(struct core_name *cn, const char *fmt, ...)
+static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
{
va_list arg;
int ret;
return ret;
}
-static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+static __printf(2, 3)
+int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
{
int cur = cn->used;
va_list arg;
goto put_exe_file;
}
- path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+ path = file_path(exe_file, pathbuf, PATH_MAX);
if (IS_ERR(path)) {
ret = PTR_ERR(path);
goto free_buf;
break;
/* uid */
case 'u':
- err = cn_printf(cn, "%d", cred->uid);
+ err = cn_printf(cn, "%u",
+ from_kuid(&init_user_ns,
+ cred->uid));
break;
/* gid */
case 'g':
- err = cn_printf(cn, "%d", cred->gid);
+ err = cn_printf(cn, "%u",
+ from_kgid(&init_user_ns,
+ cred->gid));
break;
case 'd':
err = cn_printf(cn, "%d",
break;
/* signal that caused the coredump */
case 's':
- err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
+ err = cn_printf(cn, "%d",
+ cprm->siginfo->si_signo);
break;
/* UNIX time of coredump */
case 't': {
}
if (iov_iter_rw(iter) == WRITE)
- len = copy_from_iter(addr, max - pos, iter);
+ len = copy_from_iter_nocache(addr, max - pos, iter);
else if (!hole)
len = copy_to_iter(addr, max - pos, iter);
else
}
/* Protects against truncate */
- inode_dio_begin(inode);
+ if (!(flags & DIO_SKIP_DIO_COUNT))
+ inode_dio_begin(inode);
retval = dax_io(inode, iter, pos, end, get_block, &bh);
if ((retval > 0) && end_io)
end_io(iocb, pos, retval, bh.b_private);
- inode_dio_end(inode);
+ if (!(flags & DIO_SKIP_DIO_COUNT))
+ inode_dio_end(inode);
out:
return retval;
}
out:
i_mmap_unlock_read(mapping);
- if (bh->b_end_io)
- bh->b_end_io(bh, 1);
-
return error;
}
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block, dax_iodone_t complete_unwritten)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
page_cache_release(page);
}
+ /*
+ * If we successfully insert the new mapping over an unwritten extent,
+ * we need to ensure we convert the unwritten extent. If there is an
+ * error inserting the mapping, the filesystem needs to leave it as
+ * unwritten to prevent exposure of the stale underlying data to
+ * userspace, but we still need to call the completion function so
+ * the private resources on the mapping buffer can be released. We
+ * indicate what the callback should do via the uptodate variable, same
+ * as for normal BH based IO completions.
+ */
error = dax_insert_mapping(inode, &bh, vma, vmf);
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !error);
out:
if (error == -ENOMEM)
}
goto out;
}
+EXPORT_SYMBOL(__dax_fault);
/**
* dax_fault - handle a page fault on a DAX file
* fault handler for DAX files.
*/
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+ get_block_t get_block, dax_iodone_t complete_unwritten)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = do_dax_fault(vma, vmf, get_block);
+ result = __dax_fault(vma, vmf, get_block, complete_unwritten);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
}
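/*
 * A minimal, hypothetical sketch of a filesystem wiring up the new
 * signature in its vm_operations (foo_get_block and foo_end_io_unwritten
 * stand in for the filesystem's own get_block/completion helpers):
 */
static int foo_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return dax_fault(vma, vmf, foo_get_block, foo_end_io_unwritten);
}

static const struct vm_operations_struct foo_dax_vm_ops = {
	.fault		= foo_dax_fault,
	.page_mkwrite	= foo_dax_fault,
};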
/**
- * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
* @dentry: the target dentry
* After this call, in-progress rcu-walk path lookup will fail. This
* should be called after unhashing, and after changing d_inode (if
* the dentry has not already been unhashed).
*/
-static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
{
- assert_spin_locked(&dentry->d_lock);
- /* Go through a barrier */
- write_seqcount_barrier(&dentry->d_seq);
+ lockdep_assert_held(&dentry->d_lock);
+ /* Go through an invalidation barrier */
+ write_seqcount_invalidate(&dentry->d_seq);
}
/*
struct inode *inode = dentry->d_inode;
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- dentry_rcuwalk_barrier(dentry);
+ dentry_rcuwalk_invalidate(dentry);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
__hlist_bl_del(&dentry->d_hash);
dentry->d_hash.pprev = NULL;
hlist_bl_unlock(b);
- dentry_rcuwalk_barrier(dentry);
+ dentry_rcuwalk_invalidate(dentry);
}
}
EXPORT_SYMBOL(__d_drop);
DCACHE_OP_COMPARE |
DCACHE_OP_REVALIDATE |
DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_DELETE ));
+ DCACHE_OP_DELETE |
+ DCACHE_OP_SELECT_INODE));
dentry->d_op = op;
if (!op)
return;
dentry->d_flags |= DCACHE_OP_DELETE;
if (op->d_prune)
dentry->d_flags |= DCACHE_OP_PRUNE;
+ if (op->d_select_inode)
+ dentry->d_flags |= DCACHE_OP_SELECT_INODE;
}
EXPORT_SYMBOL(d_set_d_op);
if (inode)
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
__d_set_inode_and_type(dentry, inode, add_flags);
- dentry_rcuwalk_barrier(dentry);
+ dentry_rcuwalk_invalidate(dentry);
spin_unlock(&dentry->d_lock);
fsnotify_d_instantiate(dentry, inode);
}
vfsmnt = &mnt->mnt;
continue;
}
- /*
- * Filesystems needing to implement special "root names"
- * should do so with ->d_dname()
- */
- if (IS_ROOT(dentry) &&
- (dentry->d_name.len != 1 ||
- dentry->d_name.name[0] != '/')) {
- WARN(1, "Root dentry has weird name <%.*s>\n",
- (int) dentry->d_name.len,
- dentry->d_name.name);
- }
if (!error)
error = is_mounted(vfsmnt) ? 1 : 2;
break;
return inode;
}
- static inline int debugfs_positive(struct dentry *dentry)
- {
- return d_really_is_positive(dentry) && !d_unhashed(dentry);
- }
-
struct debugfs_mount_opts {
kuid_t uid;
kgid_t gid;
{
int ret = 0;
- if (debugfs_positive(dentry)) {
+ if (simple_positive(dentry)) {
dget(dentry);
if (d_is_dir(dentry))
ret = simple_rmdir(d_inode(parent), dentry);
*/
spin_lock(&parent->d_lock);
list_for_each_entry(child, &parent->d_subdirs, d_child) {
- if (!debugfs_positive(child))
+ if (!simple_positive(child))
continue;
/* perhaps simple_empty(child) makes more sense */
* from d_subdirs. When releasing the parent->d_lock we can
* no longer trust that the next pointer is valid.
* Restart the loop. We'll skip this one with the
- * debugfs_positive() check.
+ * simple_positive() check.
*/
goto loop;
}
}
EXPORT_SYMBOL_GPL(debugfs_initialized);
-
-static struct kobject *debug_kobj;
-
static int __init debugfs_init(void)
{
int retval;
- debug_kobj = kobject_create_and_add("debug", kernel_kobj);
- if (!debug_kobj)
- return -EINVAL;
+ retval = sysfs_create_mount_point(kernel_kobj, "debug");
+ if (retval)
+ return retval;
retval = register_filesystem(&debug_fs_type);
if (retval)
- kobject_put(debug_kobj);
+ sysfs_remove_mount_point(kernel_kobj, "debug");
else
debugfs_registered = true;
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ if (bdev_read_only(sb->s_bdev))
+ return;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
es->s_last_error_time = cpu_to_le32(get_seconds());
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
if (ext4_error_ratelimit(inode->i_sb)) {
- path = d_path(&(file->f_path), pathname, sizeof(pathname));
+ path = file_path(file, pathname, sizeof(pathname));
if (IS_ERR(path))
path = "(unknown)";
va_start(args, fmt);
va_end(args);
}
+#define ext4_warning_ratelimit(sb) \
+ ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
+ "EXT4-fs warning")
+
void __ext4_warning(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
- if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
- "EXT4-fs warning"))
+ if (!ext4_warning_ratelimit(sb))
return;
va_start(args, fmt);
va_end(args);
}
+void __ext4_warning_inode(const struct inode *inode, const char *function,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (!ext4_warning_ratelimit(inode->i_sb))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
+ function, line, inode->i_ino, current->comm, &vaf);
+ va_end(args);
+}
+
void __ext4_grp_locked_error(const char *function, unsigned int line,
struct super_block *sb, ext4_group_t grp,
unsigned long ino, ext4_fsblk_t block,
dump_orphan_list(sb, sbi);
J_ASSERT(list_empty(&sbi->s_orphan));
+ sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
/*
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
+ ei->i_crypt_info = NULL;
#endif
-
return &ei->vfs_inode;
}
jbd2_free_inode(EXT4_I(inode)->jinode);
EXT4_I(inode)->jinode = NULL;
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (EXT4_I(inode)->i_crypt_info)
+ ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info);
+#endif
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
struct inode *root;
- char *cp;
const char *descr;
int ret = -ENOMEM;
int blocksize, clustersize;
if (sb->s_bdev->bd_part)
sbi->s_sectors_written_start =
part_stat_read(sb->s_bdev->bd_part, sectors[1]);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- /* Modes of operations for file and directory encryption. */
- sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
- sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
-#endif
/* Cleanup superblock name */
- for (cp = sb->s_id; (cp = strchr(cp, '/'));)
- *cp = '!';
+ strreplace(sb->s_id, '/', '!');
/* -EINVAL is default */
ret = -EINVAL;
}
}
- if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
+ if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
+ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
+ (blocksize != PAGE_CACHE_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported blocksize for fs encryption");
+ goto failed_mount_wq;
+ }
+
+ if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
!(sb->s_flags & MS_RDONLY) &&
!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
}
+ if (*flags & MS_LAZYTIME)
+ sb->s_flags |= MS_LAZYTIME;
+
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
struct inode *inode = sb_dqopt(sb)->files[type];
ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
int err, offset = off & (sb->s_blocksize - 1);
+ int retries = 0;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
return -EIO;
}
- bh = ext4_bread(handle, inode, blk, 1);
+ do {
+ bh = ext4_bread(handle, inode, blk,
+ EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL);
+ } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
+ ext4_should_retry_alloc(inode->i_sb, &retries));
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
static void __exit ext4_exit_fs(void)
{
+ ext4_exit_crypto();
ext4_destroy_lazyinit_thread();
unregister_as_ext2();
unregister_as_ext3();
* Drop the release request when client does not
* implement 'open'
*/
- req->background = 0;
+ __clear_bit(FR_BACKGROUND, &req->flags);
iput(req->misc.release.inode);
fuse_put_request(ff->fc, req);
} else if (sync) {
- req->background = 0;
+ __clear_bit(FR_BACKGROUND, &req->flags);
fuse_request_send(ff->fc, req);
iput(req->misc.release.inode);
fuse_put_request(ff->fc, req);
} else {
req->end = fuse_release_end;
- req->background = 1;
+ __set_bit(FR_BACKGROUND, &req->flags);
fuse_request_send_background(ff->fc, req);
}
kfree(ff);
{
WARN_ON(atomic_read(&ff->count) > 1);
fuse_prepare_release(ff, flags, FUSE_RELEASE);
- ff->reserved_req->force = 1;
- ff->reserved_req->background = 0;
+ __set_bit(FR_FORCE, &ff->reserved_req->flags);
+ __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
fuse_request_send(ff->fc, ff->reserved_req);
fuse_put_request(ff->fc, ff->reserved_req);
kfree(ff);
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
- req->force = 1;
+ __set_bit(FR_FORCE, &req->flags);
fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err <= 0)
goto out;
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err)
goto out;
list_del(&req->writepages_entry);
for (i = 0; i < req->num_pages; i++) {
- dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ wb_writeout_inc(&bdi->wb);
}
wake_up(&fi->page_waitq);
}
if (!req)
goto err;
- req->background = 1; /* writeback always goes to bg_queue */
+ /* writeback always goes to bg_queue */
+ __set_bit(FR_BACKGROUND, &req->flags);
tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!tmp_page)
goto err_free;
req->end = fuse_writepage_end;
req->inode = inode;
- inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
}
}
- if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
- old_req->state == FUSE_REQ_PENDING)) {
+ if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
copy_highpage(old_req->pages[0], page);
spin_unlock(&fc->lock);
- dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
goto out;
req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
req->misc.write.next = NULL;
req->in.argpages = 1;
- req->background = 1;
+ __set_bit(FR_BACKGROUND, &req->flags);
req->num_pages = 0;
req->end = fuse_writepage_end;
req->inode = inode;
req->page_descs[req->num_pages].offset = 0;
req->page_descs[req->num_pages].length = PAGE_SIZE;
- inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
+ inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
locks_free_lock_context(inode->i_flctx);
}
#endif
- *p = ++res;
+ res++;
+ /* get_next_ino should not provide a 0 inode number */
+ if (unlikely(!res))
+ res++;
+ *p = res;
put_cpu_var(last_ino);
return res;
}
}
EXPORT_SYMBOL(should_remove_suid);
- static int __remove_suid(struct dentry *dentry, int kill)
+ /*
+ * Return mask of changes for notify_change() that need to be done as a
+ * response to write or truncate. Return 0 if nothing has to be changed.
+ * Negative value on error (change should be denied).
+ */
+ int dentry_needs_remove_privs(struct dentry *dentry)
+ {
+ struct inode *inode = d_inode(dentry);
+ int mask = 0;
+ int ret;
+
+ if (IS_NOSEC(inode))
+ return 0;
+
+ mask = should_remove_suid(dentry);
+ ret = security_inode_need_killpriv(dentry);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ mask |= ATTR_KILL_PRIV;
+ return mask;
+ }
+ EXPORT_SYMBOL(dentry_needs_remove_privs);
+
+ static int __remove_privs(struct dentry *dentry, int kill)
{
struct iattr newattrs;
return notify_change(dentry, &newattrs, NULL);
}
- int file_remove_suid(struct file *file)
+ /*
+ * Remove special file privileges (suid, capabilities) when file is written
+ * to or truncated.
+ */
+ int file_remove_privs(struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
- int killsuid;
- int killpriv;
+ int kill;
int error = 0;
/* Fast path for nothing security related */
if (IS_NOSEC(inode))
return 0;
- killsuid = should_remove_suid(dentry);
- killpriv = security_inode_need_killpriv(dentry);
-
- if (killpriv < 0)
- return killpriv;
- if (killpriv)
- error = security_inode_killpriv(dentry);
- if (!error && killsuid)
- error = __remove_suid(dentry, killsuid);
- if (!error && (inode->i_sb->s_flags & MS_NOSEC))
- inode->i_flags |= S_NOSEC;
+ kill = file_needs_remove_privs(file);
+ if (kill < 0)
+ return kill;
+ if (kill)
+ error = __remove_privs(dentry, kill);
+ if (!error)
+ inode_has_no_xattr(inode);
return error;
}
- EXPORT_SYMBOL(file_remove_suid);
+ EXPORT_SYMBOL(file_remove_privs);
/**
* file_update_time - update mtime and ctime time
* inode is being instantiated). The reason for the cmpxchg() loop
* --- which wouldn't be necessary if all code paths which modify
* i_flags actually followed this rule, is that there is at least one
- * code path which doesn't today --- for example,
- * __generic_file_aio_write() calls file_remove_suid() without holding
- * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+ * code path which doesn't today so we use cmpxchg() out of an abundance
+ * of caution.
*
* In the long run, i_mutex is overkill, and we should probably look
* at using the i_lock spinlock to protect i_flags, and then make sure
#include "internal.h"
- static inline int simple_positive(struct dentry *dentry)
- {
- return d_really_is_positive(dentry) && !d_unhashed(dentry);
- }
-
int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
.readlink = generic_readlink
};
EXPORT_SYMBOL(simple_symlink_inode_operations);
+
+/*
+ * Operations for a permanently empty directory.
+ */
+static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ return ERR_PTR(-ENOENT);
+}
+
+static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = d_inode(dentry);
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
+static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ return -EPERM;
+}
+
+static int empty_dir_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static int empty_dir_removexattr(struct dentry *dentry, const char *name)
+{
+ return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct inode_operations empty_dir_inode_operations = {
+ .lookup = empty_dir_lookup,
+ .permission = generic_permission,
+ .setattr = empty_dir_setattr,
+ .getattr = empty_dir_getattr,
+ .setxattr = empty_dir_setxattr,
+ .getxattr = empty_dir_getxattr,
+ .removexattr = empty_dir_removexattr,
+ .listxattr = empty_dir_listxattr,
+};
+
+static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ /* An empty directory has two entries . and .. at offsets 0 and 1 */
+ return generic_file_llseek_size(file, offset, whence, 2, 2);
+}
+
+static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+ dir_emit_dots(file, ctx);
+ return 0;
+}
+
+static const struct file_operations empty_dir_operations = {
+ .llseek = empty_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = empty_dir_readdir,
+ .fsync = noop_fsync,
+};
+
+
+void make_empty_dir_inode(struct inode *inode)
+{
+ set_nlink(inode, 2);
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_rdev = 0;
+ inode->i_size = 2;
+ inode->i_blkbits = PAGE_SHIFT;
+ inode->i_blocks = 0;
+
+ inode->i_op = &empty_dir_inode_operations;
+ inode->i_fop = &empty_dir_operations;
+}
+
+bool is_empty_dir_inode(struct inode *inode)
+{
+ return (inode->i_fop == &empty_dir_operations) &&
+ (inode->i_op == &empty_dir_inode_operations);
+}
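A hedged sketch of how the new helpers are meant to be used (editorial, not from the diff): a pseudo filesystem that wants a permanently empty directory, e.g. as a mount point, fills a fresh inode with make_empty_dir_inode(), and later code can recognise such inodes with is_empty_dir_inode(). my_fs_make_mount_point() is a hypothetical name.

static struct inode *my_fs_make_mount_point(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;
	inode->i_ino = get_next_ino();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	make_empty_dir_inode(inode);	/* only "." and ".." are ever visible */
	return inode;
}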
{
int err;
- if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
- *opened |= FILE_CREATED;
-
err = finish_open(file, dentry, do_open, opened);
if (err)
goto out;
static void nfs_dentry_handle_enoent(struct dentry *dentry)
{
- if (d_really_is_positive(dentry) && !d_unhashed(dentry))
+ if (simple_positive(dentry))
d_delete(dentry);
}
base_ni = ni;
if (NInoAttr(ni))
base_ni = ni->ext.base_ntfs_ino;
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (unlikely(err))
goto out;
/*
}
}
err = add_to_page_cache_lru(*cached_page, mapping,
- index, GFP_KERNEL);
+ index,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (unlikely(err)) {
if (err == -EEXIST)
continue;
}
}
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ unsigned int i;
+ int ret = 1;
+
+ for (i = 0; i < oe->numlower; i++) {
+ struct dentry *d = oe->lowerstack[i].dentry;
+
+ if (d->d_flags & DCACHE_OP_REVALIDATE) {
+ ret = d->d_op->d_revalidate(d, flags);
+ if (ret < 0)
+ return ret;
+ if (!ret) {
+ if (!(flags & LOOKUP_RCU))
+ d_invalidate(d);
+ return -ESTALE;
+ }
+ }
+ }
+ return 1;
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ unsigned int i;
+ int ret = 1;
+
+ for (i = 0; i < oe->numlower; i++) {
+ struct dentry *d = oe->lowerstack[i].dentry;
+
+ if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
+ ret = d->d_op->d_weak_revalidate(d, flags);
+ if (ret <= 0)
+ break;
+ }
+ }
+ return ret;
+}
+
static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
+ .d_select_inode = ovl_d_select_inode,
};
+static const struct dentry_operations ovl_reval_dentry_operations = {
+ .d_release = ovl_dentry_release,
+ .d_revalidate = ovl_dentry_revalidate,
+ .d_weak_revalidate = ovl_dentry_weak_revalidate,
+};
+
static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
{
size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
return oe;
}
+static bool ovl_dentry_remote(struct dentry *dentry)
+{
+ return dentry->d_flags &
+ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+}
+
+static bool ovl_dentry_weird(struct dentry *dentry)
+{
+ return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
+ DCACHE_MANAGE_TRANSIT |
+ DCACHE_OP_HASH |
+ DCACHE_OP_COMPARE);
+}
+
static inline struct dentry *ovl_lookup_real(struct dentry *dir,
struct qstr *name)
{
} else if (!dentry->d_inode) {
dput(dentry);
dentry = NULL;
+ } else if (ovl_dentry_weird(dentry)) {
+ dput(dentry);
+ /* Don't support traversing automounts and other weirdness */
+ dentry = ERR_PTR(-EREMOTE);
}
return dentry;
}
goto out;
if (this) {
+ if (unlikely(ovl_dentry_remote(this))) {
+ dput(this);
+ err = -EREMOTE;
+ goto out;
+ }
if (ovl_is_whiteout(this)) {
dput(this);
this = NULL;
{
struct ovl_fs *ufs = sb->s_fs_info;
- if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
+ if (!(*flags & MS_RDONLY) && (!ufs->upper_mnt || !ufs->workdir))
return -EROFS;
return 0;
}
}
-static bool ovl_is_allowed_fs_type(struct dentry *root)
-{
- const struct dentry_operations *dop = root->d_op;
-
- /*
- * We don't support:
- * - automount filesystems
- * - filesystems with revalidate (FIXME for lower layer)
- * - filesystems with case insensitive names
- */
- if (dop &&
- (dop->d_manage || dop->d_automount ||
- dop->d_revalidate || dop->d_weak_revalidate ||
- dop->d_compare || dop->d_hash)) {
- return false;
- }
- return true;
-}
-
static int ovl_mount_dir_noesc(const char *name, struct path *path)
{
int err = -EINVAL;
goto out;
}
err = -EINVAL;
- if (!ovl_is_allowed_fs_type(path->dentry)) {
+ if (ovl_dentry_weird(path->dentry)) {
pr_err("overlayfs: filesystem on '%s' not supported\n", name);
goto out_put;
}
if (tmp) {
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
+
+ if (!err)
+ if (ovl_dentry_remote(path->dentry)) {
+ pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n",
+ tmp);
+ path_put(path);
+ err = -EINVAL;
+ }
kfree(tmp);
}
return err;
}
static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
- int *stack_depth)
+ int *stack_depth, bool *remote)
{
int err;
struct kstatfs statfs;
*namelen = max(*namelen, statfs.f_namelen);
*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
+ if (ovl_dentry_remote(path->dentry))
+ *remote = true;
+
return 0;
out_put:
unsigned int numlower;
unsigned int stacklen = 0;
unsigned int i;
+ bool remote = false;
int err;
err = -ENOMEM;
lower = lowertmp;
for (numlower = 0; numlower < stacklen; numlower++) {
err = ovl_lower_dir(lower, &stack[numlower],
- &ufs->lower_namelen, &sb->s_stack_depth);
+ &ufs->lower_namelen, &sb->s_stack_depth,
+ &remote);
if (err)
goto out_put_lowerpath;
ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
err = PTR_ERR(ufs->workdir);
if (IS_ERR(ufs->workdir)) {
- pr_err("overlayfs: failed to create directory %s/%s\n",
- ufs->config.workdir, OVL_WORKDIR_NAME);
- goto out_put_upper_mnt;
+ pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n",
+ ufs->config.workdir, OVL_WORKDIR_NAME, -err);
+ sb->s_flags |= MS_RDONLY;
+ ufs->workdir = NULL;
}
}
if (!ufs->upper_mnt)
sb->s_flags |= MS_RDONLY;
- sb->s_d_op = &ovl_dentry_operations;
+ if (remote)
+ sb->s_d_op = &ovl_reval_dentry_operations;
+ else
+ sb->s_d_op = &ovl_dentry_operations;
err = -ENOMEM;
oe = ovl_alloc_entry(numlower);
kfree(ufs->lower_mnt);
out_put_workdir:
dput(ufs->workdir);
-out_put_upper_mnt:
mntput(ufs->upper_mnt);
out_put_lowerpath:
for (i = 0; i < numlower; i++)
* ERR_PTR(error). In the end of sequence they return %NULL. ->show()
* returns 0 in case of success and negative number in case of error.
* Returning SEQ_SKIP means "discard this element and move on".
+ * Note: seq_open() will allocate a struct seq_file and store its
+ * pointer in @file->private_data. This pointer should not be modified.
*/
int seq_open(struct file *file, const struct seq_operations *op)
{
- struct seq_file *p = file->private_data;
+ struct seq_file *p;
+
+ WARN_ON(file->private_data);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ file->private_data = p;
- if (!p) {
- p = kmalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
- return -ENOMEM;
- file->private_data = p;
- }
- memset(p, 0, sizeof(*p));
mutex_init(&p->lock);
p->op = op;
#ifdef CONFIG_USER_NS
}
EXPORT_SYMBOL(seq_path);
+ /**
+ * seq_file_path - seq_file interface to print a pathname of a file
+ * @m: the seq_file handle
+ * @file: the struct file to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path to the file.
+ */
+ int seq_file_path(struct seq_file *m, struct file *file, const char *esc)
+ {
+ return seq_path(m, &file->f_path, esc);
+ }
+ EXPORT_SYMBOL(seq_file_path);
+
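A hedged usage sketch (editorial): a ->show() callback can now print the path of an open file directly with seq_file_path() instead of open-coding seq_path() on file->f_path. my_show() and the source of the struct file pointer are assumptions.

static int my_show(struct seq_file *m, void *v)
{
	struct file *file = v;		/* assume the iterator yields a struct file */

	seq_file_path(m, file, "\n");	/* escape newlines in the path */
	seq_putc(m, '\n');
	return 0;
}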
/*
* Same as seq_path, but relative to supplied root.
*/
return res;
}
+EXPORT_SYMBOL(seq_dentry);
static void *single_start(struct seq_file *p, loff_t *pos)
{
return dentry;
}
- static inline int tracefs_positive(struct dentry *dentry)
- {
- return dentry->d_inode && !d_unhashed(dentry);
- }
-
static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
{
int ret = 0;
- if (tracefs_positive(dentry)) {
+ if (simple_positive(dentry)) {
if (dentry->d_inode) {
dget(dentry);
switch (dentry->d_inode->i_mode & S_IFMT) {
*/
spin_lock(&parent->d_lock);
list_for_each_entry(child, &parent->d_subdirs, d_child) {
- if (!tracefs_positive(child))
+ if (!simple_positive(child))
continue;
/* perhaps simple_empty(child) makes more sense */
* from d_subdirs. When releasing the parent->d_lock we can
* no longer trust that the next pointer is valid.
* Restart the loop. We'll skip this one with the
- * tracefs_positive() check.
+ * simple_positive() check.
*/
goto loop;
}
return tracefs_registered;
}
-static struct kobject *trace_kobj;
-
static int __init tracefs_init(void)
{
int retval;
- trace_kobj = kobject_create_and_add("tracing", kernel_kobj);
- if (!trace_kobj)
+ retval = sysfs_create_mount_point(kernel_kobj, "tracing");
+ if (retval)
return -EINVAL;
retval = register_filesystem(&trace_fs_type);
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
unsigned flags;
lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
UFSD("ENTER\n");
ufs_put_cstotal(sb);
UFSD("EXIT\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
mutex_init(&sbi->mutex);
+ mutex_init(&sbi->s_lock);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
/*
sync_filesystem(sb);
lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
uspi = UFS_SB(sb)->s_uspi;
flags = UFS_SB(sb)->s_flags;
usb1 = ubh_get_usb_first(uspi);
new_mount_opt = 0;
ufs_set_opt (new_mount_opt, ONERROR_LOCK);
if (!ufs_parse_options (data, &new_mount_opt)) {
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
new_mount_opt |= ufstype;
} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
pr_err("ufstype can't be changed during remount\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
UFS_SB(sb)->s_mount_opt = new_mount_opt;
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
*/
#ifndef CONFIG_UFS_FS_WRITE
pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
#else
ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
pr_err("this ufstype is read-only supported\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if (!ufs_read_cylinder_structures(sb)) {
pr_err("failed during remounting\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EPERM;
}
#endif
}
UFS_SB(sb)->s_mount_opt = new_mount_opt;
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
static const struct vm_operations_struct xfs_file_vm_ops;
}
/*
- * xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
*
- * xfs_iozero clears the specified range of buffer supplied,
- * and marks all the affected blocks as valid and modified. If
- * an affected block is not allocated, it will be allocated. If
- * an affected block is not completely overwritten, and is not
- * valid before the operation, it will be read from disk before
- * being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
*/
int
xfs_iozero(
{
struct page *page;
struct address_space *mapping;
- int status;
+ int status = 0;
+
mapping = VFS_I(ip)->i_mapping;
do {
if (bytes > count)
bytes = count;
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
+ if (IS_DAX(VFS_I(ip))) {
+ status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+ xfs_get_blocks_direct);
+ if (status)
+ break;
+ } else {
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
- zero_user(page, offset, bytes);
+ zero_user(page, offset, bytes);
- status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
- page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
+ status = pagecache_write_end(NULL, mapping, pos, bytes,
+ bytes, page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ status = 0;
+ }
pos += bytes;
count -= bytes;
- status = 0;
} while (count);
- return (-status);
+ return status;
}
int
tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (flags & XFS_PREALLOC_SYNC)
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
/*
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
- if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+ if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+ /* for dax, we need to avoid the page cache */
+ if (IS_DAX(VFS_I(ip)))
+ ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+ else
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
if (error)
return error;
+ /* For changing security info in file_remove_privs() we need i_mutex */
+ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ goto restart;
+ }
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
* setgid bits if the process is not being run by root. This keeps
* people from modifying setuid and setgid binaries.
*/
- return file_remove_suid(file);
+ if (!IS_NOSEC(inode))
+ return file_remove_privs(file);
+ return 0;
}
/*
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
- if ((pos | count) & target->bt_logical_sectormask)
+ if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
out:
xfs_rw_iunlock(ip, iolock);
- /* No fallback to buffered IO on errors for XFS. */
- ASSERT(ret < 0 || ret == count);
+ /*
+ * No fallback to buffered IO on errors for XFS. DAX can result in
+ * partial writes, but direct IO will either complete fully or fail.
+ */
+ ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
return ret;
}
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+ if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
return xfs_readdir(ip, ctx, bufsize);
}
-STATIC int
-xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
-{
- vma->vm_ops = &xfs_file_vm_ops;
-
- file_accessed(filp);
- return 0;
-}
-
/*
* This type is designed to indicate the type of offset we would like
* to search from page cache for xfs_seek_hole_data().
* ordering of:
*
* mmap_sem (MM)
- * i_mmap_lock (XFS - truncate serialisation)
- * page_lock (MM)
- * i_lock (XFS - extent map serialisation)
+ * sb_start_pagefault(vfs, freeze)
+ * i_mmap_lock (XFS - truncate serialisation)
+ * page_lock (MM)
+ * i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
*/
STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
- int error;
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret;
- trace_xfs_filemap_fault(ip);
+ trace_xfs_filemap_page_mkwrite(XFS_I(inode));
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- error = filemap_fault(vma, vmf);
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- return error;
+ if (IS_DAX(inode)) {
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ } else {
+ ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite_return(ret);
+ }
+
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
}
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
struct vm_area_struct *vma,
struct vm_fault *vmf)
{
- struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
- int error;
+ struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file));
+ int ret;
+
+ trace_xfs_filemap_fault(ip);
- trace_xfs_filemap_page_mkwrite(ip);
+ /* DAX can shortcut the normal fault path on write faults! */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+ return xfs_filemap_page_mkwrite(vma, vmf);
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = filemap_fault(vma, vmf);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- return error;
+ return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = xfs_filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+ struct file *filp,
+ struct vm_area_struct *vma)
+{
+ file_accessed(filp);
+ vma->vm_ops = &xfs_file_vm_ops;
+ if (IS_DAX(file_inode(filp)))
+ vma->vm_flags |= VM_MIXEDMAP;
+ return 0;
}
const struct file_operations xfs_file_operations = {
#endif
.fsync = xfs_dir_fsync,
};
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
- .fault = xfs_filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = xfs_filemap_page_mkwrite,
-};
#include <uapi/linux/fs.h>
struct backing_dev_info;
+struct bdi_writeback;
struct export_operations;
struct hd_geometry;
struct iovec;
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
ssize_t bytes, void *private);
+typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002
struct hlist_node i_hash;
struct list_head i_wb_list; /* backing dev IO list */
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct bdi_writeback *i_wb; /* the associated cgroup wb */
+
+ /* foreign inode detection, see wbc_detach_inode() */
+ int i_wb_frn_winner;
+ u16 i_wb_frn_avg_time;
+ u16 i_wb_frn_history;
+#endif
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
union {
#define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */
#define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */
+/* sb->s_iflags */
+#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
/* Possible states of 'frozen' field */
enum {
const struct quotactl_ops *s_qcop;
const struct export_operations *s_export_op;
unsigned long s_flags;
+ unsigned long s_iflags; /* internal SB_I_* flags */
unsigned long s_magic;
struct dentry *s_root;
struct rw_semaphore s_umount;
int (*set_acl)(struct inode *, struct posix_acl *, int);
/* WARNING: probably going away soon, do not use! */
- int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
} ____cacheline_aligned;
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
*
* I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
*
+ * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
+ * synchronize competing switching instances and to tell
+ * wb stat updates to grab mapping->tree_lock. See
+ * inode_switch_wb_work_fn() for details.
+ *
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
#define I_DIRTY_SYNC (1 << 0)
#define I_DIRTY_TIME (1 << 11)
#define __I_DIRTY_TIME_EXPIRED 12
#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
+#define I_WB_SWITCH (1 << 13)
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */
+#define FS_USERNS_VISIBLE 32 /* FS must already be visible */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
extern int freeze_super(struct super_block *super);
extern int thaw_super(struct super_block *super);
extern bool our_mnt(struct vfsmount *mnt);
-extern bool fs_fully_visible(struct file_system_type *);
extern int current_umask(void);
extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(struct dentry *, struct vfsmount *,
const char *, int);
- extern int vfs_open(const struct path *, struct file *, const struct cred *);
extern struct file * dentry_open(const struct path *, int, const struct cred *);
extern int filp_close(struct file *, fl_owner_t id);
extern void emergency_thaw_all(void);
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
extern int fsync_bdev(struct block_device *);
-extern int sb_is_blkdev_sb(struct super_block *sb);
+
+extern struct super_block *blockdev_superblock;
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+ return sb == blockdev_superblock;
+}
#else
static inline void bd_forget(struct inode *inode) {}
static inline int sync_blockdev(struct block_device *bdev) { return 0; }
extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
void *holder);
extern void blkdev_put(struct block_device *bdev, fmode_t mode);
+extern int __blkdev_reread_part(struct block_device *bdev);
+extern int blkdev_reread_part(struct block_device *bdev);
+
#ifdef CONFIG_SYSFS
extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
extern void bd_unlink_disk_holder(struct block_device *bdev,
extern int is_subdir(struct dentry *, struct dentry *);
extern int path_is_under(struct path *, struct path *);
+ extern char *file_path(struct file *, char *, int);
+
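A hedged sketch of the new helper's typical use (editorial): file_path() wraps d_path() on file->f_path, so debug code can print the path of a struct file without touching f_path directly, as the callers converted earlier in this series do. my_print_file() is hypothetical.

static void my_print_file(struct file *file)
{
	char *buf = (char *)__get_free_page(GFP_KERNEL);
	char *p;

	if (!buf)
		return;
	p = file_path(file, buf, PAGE_SIZE);	/* returns the path or an ERR_PTR */
	pr_info("file: %s\n", IS_ERR(p) ? "?" : p);
	free_page((unsigned long)buf);
}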
#include <linux/err.h>
/* needed for stackable file system support */
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int should_remove_suid(struct dentry *);
- extern int file_remove_suid(struct file *);
+ extern int file_remove_privs(struct file *);
+ extern int dentry_needs_remove_privs(struct dentry *dentry);
+ static inline int file_needs_remove_privs(struct file *file)
+ {
+ return dentry_needs_remove_privs(file->f_path.dentry);
+ }
extern void __insert_inode_hash(struct inode *, unsigned long hashval);
static inline void insert_inode_hash(struct inode *inode)
int dax_clear_blocks(struct inode *, sector_t block, long size);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
+#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
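A hedged adaptation sketch (editorial, not from the diff): filesystems using the generic DAX fault path now pass the extra dax_iodone_t completion; those with no work to do at I/O completion can pass NULL. my_fs_dax_fault() and my_fs_get_block() are hypothetical names.

static int my_fs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* NULL completion: no unwritten-extent conversion needed */
	return dax_fault(vma, vmf, my_fs_get_block, NULL);
}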
#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
extern const struct file_operations simple_dir_operations;
extern const struct inode_operations simple_dir_inode_operations;
+extern void make_empty_dir_inode(struct inode *inode);
+extern bool is_empty_dir_inode(struct inode *inode);
struct tree_descr { char *name; const struct file_operations *ops; int mode; };
struct dentry *d_alloc_name(struct dentry *, const char *);
extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+extern void __delete_from_page_cache(struct page *page, void *shadow,
+ struct mem_cgroup *memcg);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
return error;
}
+ static inline unsigned long dir_pages(struct inode *inode)
+ {
+ return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ }
+
#endif /* _LINUX_PAGEMAP_H */
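A hedged sketch (editorial) of the kind of caller the new dir_pages() inline serves: directory readers that walk the page cache bound their loop by the page count derived from i_size. my_fs_get_dir_page() and the entry parsing are hypothetical.

static int my_fs_scan_dir(struct inode *dir)
{
	unsigned long npages = dir_pages(dir);
	unsigned long n;

	for (n = 0; n < npages; n++) {
		struct page *page = my_fs_get_dir_page(dir, n);	/* hypothetical */

		if (IS_ERR(page))
			return PTR_ERR(page);
		/* ... parse the directory entries in this page ... */
		page_cache_release(page);
	}
	return 0;
}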
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
static struct workqueue_struct *perf_wq;
+typedef int (*remote_function_f)(void *);
+
struct remote_function_call {
struct task_struct *p;
- int (*func)(void *info);
+ remote_function_f func;
void *info;
int ret;
};
* -EAGAIN - when the process moved away
*/
static int
-task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
struct remote_function_call data = {
.p = p,
*
* returns: @func return value or -ENXIO when the cpu is offline
*/
-static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
struct remote_function_call data = {
.p = NULL,
/*
 * function must be called with interrupts disabled
*/
-static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
struct perf_cpu_context *cpuctx;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
int rotations = 0;
WARN_ON(!irqs_disabled());
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
rotations = perf_rotate_context(cpuctx);
- /*
- * arm timer if needed
- */
- if (rotations) {
+ raw_spin_lock(&cpuctx->hrtimer_lock);
+ if (rotations)
hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
- ret = HRTIMER_RESTART;
- }
-
- return ret;
-}
-
-/* CPU is going down */
-void perf_cpu_hrtimer_cancel(int cpu)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
- unsigned long flags;
-
- if (WARN_ON(cpu != smp_processor_id()))
- return;
-
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- if (pmu->task_ctx_nr == perf_sw_context)
- continue;
-
- hrtimer_cancel(&cpuctx->hrtimer);
- }
-
- rcu_read_unlock();
+ else
+ cpuctx->hrtimer_active = 0;
+ raw_spin_unlock(&cpuctx->hrtimer_lock);
- local_irq_restore(flags);
+ return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
- struct hrtimer *hr = &cpuctx->hrtimer;
+ struct hrtimer *timer = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
- int timer;
+ u64 interval;
/* no multiplexing needed for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
* check default is sane, if not set then force to
* default interval (1/tick)
*/
- timer = pmu->hrtimer_interval_ms;
- if (timer < 1)
- timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+ interval = pmu->hrtimer_interval_ms;
+ if (interval < 1)
+ interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- hr->function = perf_cpu_hrtimer_handler;
+ raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ timer->function = perf_mux_hrtimer_handler;
}
-static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
- struct hrtimer *hr = &cpuctx->hrtimer;
+ struct hrtimer *timer = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
+ unsigned long flags;
/* not for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
- return;
+ return 0;
- if (hrtimer_active(hr))
- return;
+ raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+ if (!cpuctx->hrtimer_active) {
+ cpuctx->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ }
+ raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
- if (!hrtimer_callback_running(hr))
- __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
- 0, HRTIMER_MODE_REL_PINNED, 0);
+ return 0;
}
void perf_pmu_disable(struct pmu *pmu)
* Those places that change perf_event::ctx will hold both
* perf_event_ctx::mutex of the 'old' and 'new' ctx value.
*
- * Lock ordering is by mutex address. There is one other site where
- * perf_event_context::mutex nests and that is put_event(). But remember that
- * that is a parent<->child context relation, and migration does not affect
- * children, therefore these two orderings should not interact.
+ * Lock ordering is by mutex address. There are two other sites where
+ * perf_event_context::mutex nests and those are:
+ *
+ * - perf_event_exit_task_context() [ child , 0 ]
+ * __perf_event_exit_task()
+ * sync_child_event()
+ * put_event() [ parent, 1 ]
+ *
+ * - perf_event_init_context() [ parent, 0 ]
+ * inherit_task_group()
+ * inherit_group()
+ * inherit_event()
+ * perf_event_alloc()
+ * perf_init_event()
+ * perf_try_init_event() [ child , 1 ]
+ *
+ * While it appears there is an obvious deadlock here -- the parent and child
+ * nesting levels are inverted between the two -- this is in fact safe because
+ * life-time rules separate them: an exiting task cannot fork, and a
+ * spawning task cannot (yet) exit.
+ *
+ * But remember that these are parent<->child context relations, and
+ * migration does not affect children, therefore these two orderings should not
+ * interact.
*
* The change in perf_event::ctx does not affect children (as claimed above)
* because the sys_perf_event_open() case will install a new event and break
core_initcall(perf_workqueue_init);
+static inline int pmu_filter_match(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu;
+ return pmu->filter_match ? pmu->filter_match(event) : 1;
+}
+
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event);
+ && perf_cgroup_match(event) && pmu_filter_match(event);
}
static void
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
return -EAGAIN;
}
pmu->cancel_txn(pmu);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
return -EAGAIN;
}
*/
if (leader != event) {
group_sched_out(leader, cpuctx, ctx);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
}
if (leader->attr.pinned) {
update_group_times(leader);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
- perf_event_free_bpf_prog(event);
kfree(event);
}
put_callchain_buffers();
}
+ perf_event_free_bpf_prog(event);
+
if (event->destroy)
event->destroy(event);
}
}
-/*
- * Called when the last reference to the file is gone.
- */
static void put_event(struct perf_event *event)
{
struct perf_event_context *ctx;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+/*
+ * Called when the last reference to the file is gone.
+ */
static int perf_release(struct inode *inode, struct file *file)
{
put_event(file->private_data);
WARN_ON_ONCE(event->rcu_pending);
old_rb = event->rb;
- event->rcu_batches = get_state_synchronize_rcu();
- event->rcu_pending = 1;
-
spin_lock_irqsave(&old_rb->event_lock, flags);
list_del_rcu(&event->rb_entry);
spin_unlock_irqrestore(&old_rb->event_lock, flags);
- }
- if (event->rcu_pending && rb) {
- cond_synchronize_rcu(event->rcu_batches);
- event->rcu_pending = 0;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
}
if (rb) {
+ if (event->rcu_pending) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
+
spin_lock_irqsave(&rb->event_lock, flags);
list_add_rcu(&event->rb_entry, &rb->event_list);
spin_unlock_irqrestore(&rb->event_lock, flags);
}
}
-static void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_output_handle handle;
struct perf_event_header header;
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+ name = file_path(file, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
perf_output_end(&handle);
}
+/*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 lost;
+ } lost_samples_event = {
+ .header = {
+ .type = PERF_RECORD_LOST_SAMPLES,
+ .misc = 0,
+ .size = sizeof(lost_samples_event),
+ },
+ .lost = lost,
+ };
+
+ perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ lost_samples_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, lost_samples_event);
+ perf_event__output_id_sample(event, &handle, &sample);
+ perf_output_end(&handle);
+}
+
/*
* IRQ throttle logging
*/
} else {
period = max_t(u64, 10000, hwc->sample_period);
}
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+ HRTIMER_MODE_REL_PINNED);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}
+static DEFINE_MUTEX(mux_interval_mutex);
+
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
struct device_attribute *attr,
if (timer == pmu->hrtimer_interval_ms)
return count;
+ mutex_lock(&mux_interval_mutex);
pmu->hrtimer_interval_ms = timer;
/* update all cpuctx for this PMU */
- for_each_possible_cpu(cpu) {
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- if (hrtimer_active(&cpuctx->hrtimer))
- hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ cpu_function_call(cpu,
+ (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
}
+ put_online_cpus();
+ mutex_unlock(&mux_interval_mutex);
return count;
}
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
- __perf_cpu_hrtimer_init(cpuctx, cpu);
+ __perf_mux_hrtimer_init(cpuctx, cpu);
cpuctx->unique_pmu = pmu;
}
return -ENODEV;
if (event->group_leader != event) {
- ctx = perf_event_ctx_lock(event->group_leader);
+ /*
+ * This ctx->mutex can nest when we're called through
+ * inheritance. See the perf_event_ctx_lock_nested() comment.
+ */
+ ctx = perf_event_ctx_lock_nested(event->group_leader,
+ SINGLE_DEPTH_NESTING);
BUG_ON(!ctx);
}
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock.
+ * is safe. The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
*/
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+ struct mem_cgroup *memcg)
{
struct address_space *mapping = page->mapping;
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
- __dec_zone_page_state(page, NR_FILE_PAGES);
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!PageHuge(page))
+ __dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
BUG_ON(page_mapped(page));
* anyway will be cleared before returning page into buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping);
+ account_page_cleaned(page, mapping, memcg,
+ inode_to_wb(mapping->host));
}
/**
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+
void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
freepage = mapping->a_ops->freepage;
- spin_lock_irq(&mapping->tree_lock);
- __delete_from_page_cache(page, NULL);
- spin_unlock_irq(&mapping->tree_lock);
+
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ __delete_from_page_cache(page, NULL, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
if (!mapping_cap_writeback_dirty(mapping))
return 0;
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
ret = do_writepages(mapping, &wbc);
+ wbc_detach_inode(&wbc);
return ret;
}
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
+ struct mem_cgroup *memcg;
+ unsigned long flags;
pgoff_t offset = old->index;
freepage = mapping->a_ops->freepage;
new->mapping = mapping;
new->index = offset;
- spin_lock_irq(&mapping->tree_lock);
- __delete_from_page_cache(old, NULL);
+ memcg = mem_cgroup_begin_page_stat(old);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ __delete_from_page_cache(old, NULL, memcg);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
- __inc_zone_page_state(new, NR_FILE_PAGES);
+
+ /*
+ * hugetlb pages do not participate in page cache accounting.
+ */
+ if (!PageHuge(new))
+ __inc_zone_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
mem_cgroup_migrate(old, new, true);
radix_tree_preload_end();
if (freepage)
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
- __inc_zone_page_state(page, NR_FILE_PAGES);
+
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!huge)
+ __inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false);
error = -ENOMEM;
goto out;
}
- error = add_to_page_cache_lru(page, mapping,
- index, GFP_KERNEL);
+ error = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (error) {
page_cache_release(page);
if (error == -EEXIST) {
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ ret = add_to_page_cache_lru(page, mapping, offset,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err)
goto out;
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
- __SetPageUptodate(new_page);
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
+ __SetPageUptodate(new_page);
+
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
+
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ goto oom_free_page;
+
/*
* The memory barrier inside __SetPageUptodate makes sure that
 * preceding stores to the page contents become visible before
*/
__SetPageUptodate(page);
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
- goto oom_free_page;
-
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
if (buf) {
char *p;
- p = d_path(&f->f_path, buf, PAGE_SIZE);
+ p = file_path(f, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
printk("%s%s[%lx+%lx]", prefix, kbasename(p),
}
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
-void might_fault(void)
+void __might_fault(const char *file, int line)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
*/
if (segment_eq(get_fs(), KERNEL_DS))
return;
-
- /*
- * it would be nicer only to annotate paths which are not under
- * pagefault_disable, however that requires a larger audit and
- * providing helpers like get_user_atomic.
- */
- if (in_atomic())
+ if (pagefault_disabled())
return;
-
- __might_sleep(__FILE__, __LINE__, 0);
-
+ __might_sleep(file, line, 0);
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
 might_lock_read(&current->mm->mmap_sem);
+#endif
}
-EXPORT_SYMBOL(might_fault);
+EXPORT_SYMBOL(__might_fault);
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
static struct vfsmount *mount;
static int mount_count;
- static inline int positive(struct dentry *dentry)
- {
- return d_really_is_positive(dentry) && !d_unhashed(dentry);
- }
-
static int fill_super(struct super_block *sb, void *data, int silent)
{
static struct tree_descr files[] = {{""}};
return;
mutex_lock(&d_inode(parent)->i_mutex);
- if (positive(dentry)) {
- if (d_really_is_positive(dentry)) {
- if (d_is_dir(dentry))
- simple_rmdir(d_inode(parent), dentry);
- else
- simple_unlink(d_inode(parent), dentry);
- dput(dentry);
- }
+ if (simple_positive(dentry)) {
+ if (d_is_dir(dentry))
+ simple_rmdir(d_inode(parent), dentry);
+ else
+ simple_unlink(d_inode(parent), dentry);
+ dput(dentry);
}
mutex_unlock(&d_inode(parent)->i_mutex);
simple_release_fs(&mount, &mount_count);
}
EXPORT_SYMBOL_GPL(securityfs_remove);
-static struct kobject *security_kobj;
-
static int __init securityfs_init(void)
{
int retval;
- security_kobj = kobject_create_and_add("security", kernel_kobj);
- if (!security_kobj)
- return -EINVAL;
+ retval = sysfs_create_mount_point(kernel_kobj, "security");
+ if (retval)
+ return retval;
retval = register_filesystem(&fs_type);
if (retval)
- kobject_put(security_kobj);
+ sysfs_remove_mount_point(kernel_kobj, "security");
return retval;
}