.count = count
};
- if (!arg.file)
+ if (!fd_file(arg))
return -EBADF;
- error = iterate_dir(arg.file, &buf.ctx);
+ error = iterate_dir(fd_file(arg), &buf.ctx);
if (error >= 0)
error = buf.error;
if (count != buf.count)
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags, vm_flags_t vm_flags)
{
unsigned long limit;
{
struct sgx_epc_page *page;
int nid_of_current = numa_node_id();
- int nid = nid_of_current;
+ int nid_start, nid;
- if (node_isset(nid_of_current, sgx_numa_mask)) {
- page = __sgx_alloc_epc_page_from_node(nid_of_current);
- if (page)
- return page;
- }
-
- /* Fall back to the non-local NUMA nodes: */
- while (true) {
- nid = next_node_in(nid, sgx_numa_mask);
- if (nid == nid_of_current)
- break;
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+ nid = nid_start;
+ do {
page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
- }
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
return ERR_PTR(-ENOMEM);
}
return 0;
}
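The hunk above replaces the open-coded "try the local node, then loop over the others" logic with a single wrap-around walk: start at the local node (or its successor in sgx_numa_mask if the local node has no EPC section) and visit every node in the mask exactly once. A stand-alone sketch of that walk, using a plain bit mask and made-up helpers instead of the kernel's nodemask API (assumes the mask is non-empty):

	#include <stdio.h>

	/* Illustrative stand-in for next_node_in(): next set bit after @prev, wrapping. */
	static int next_set_bit_wrap(unsigned long mask, int nbits, int prev)
	{
		for (int i = 1; i <= nbits; i++) {
			int bit = (prev + i) % nbits;

			if (mask & (1UL << bit))
				return bit;
		}
		return prev;	/* only @prev set (or mask empty) */
	}

	int main(void)
	{
		unsigned long mask = 0xb;	/* nodes 0, 1 and 3 have EPC */
		int current_node = 2;		/* local node has no EPC section */
		int start, nid;

		start = (mask & (1UL << current_node)) ?
			current_node : next_set_bit_wrap(mask, 4, current_node);
		nid = start;
		do {
			printf("try node %d\n", nid);	/* allocator call would go here */
			nid = next_set_bit_wrap(mask, 4, nid);
		} while (nid != start);
		return 0;
	}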
-/**
+/*
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
* metric.
return false;
}
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
return true;
}
{
struct fd f = fdget(attribute_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- if (f.file->f_op != &sgx_provision_fops) {
+ if (fd_file(f)->f_op != &sgx_provision_fops) {
fdput(f);
return -EINVAL;
}
int ret;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- ret = sev_issue_cmd_external_user(f.file, id, data, error);
+ ret = sev_issue_cmd_external_user(fd_file(f), id, data, error);
fdput(f);
return ret;
bool charged = false;
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!file_is_kvm(f.file)) {
+ if (!file_is_kvm(fd_file(f))) {
ret = -EBADF;
goto out_fput;
}
- source_kvm = f.file->private_data;
+ source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
struct sev_data_snp_launch_update fw_args = {0};
- bool assigned;
+ bool assigned = false;
int level;
- if (!kvm_mem_is_private(kvm, gfn)) {
- pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n",
- __func__, gfn);
- ret = -EINVAL;
- goto err;
- }
-
ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
if (ret || assigned) {
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
__func__, gfn, ret, assigned);
- ret = -EINVAL;
+ ret = ret ? -EINVAL : -EEXIST;
goto err;
}
if (src) {
void *vaddr = kmap_local_pfn(pfn + i);
- ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE);
- if (ret)
+ if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
+ ret = -EFAULT;
goto err;
+ }
kunmap_local(vaddr);
}
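The change above also fixes a classic return-value mix-up: copy_from_user() returns the number of bytes it could not copy, never an errno, so returning it directly would hand a positive byte count back to the caller. The general shape of the corrected idiom (a sketch, not tied to the SEV code):

	#include <linux/uaccess.h>

	static int copy_in_buf(void *dst, const void __user *src, size_t len)
	{
		/* copy_from_user() returns bytes NOT copied; fold that into -EFAULT. */
		if (copy_from_user(dst, src, len))
			return -EFAULT;
		return 0;
	}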
data->gctx_paddr = __psp_pa(sev->snp_context);
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
kfree(id_auth);
e_free_id_block:
struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!file_is_kvm(f.file)) {
+ if (!file_is_kvm(fd_file(f))) {
ret = -EBADF;
goto e_source_fput;
}
- source_kvm = f.file->private_data;
+ source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto e_source_fput;
*/
-#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/pid.h>
uint32_t id;
int r;
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- r = amdgpu_file_to_fpriv(f.file, &fpriv);
+ r = amdgpu_file_to_fpriv(fd_file(f), &fpriv);
if (r) {
fdput(f);
return r;
struct amdgpu_ctx *ctx;
int r;
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- r = amdgpu_file_to_fpriv(f.file, &fpriv);
+ r = amdgpu_file_to_fpriv(fd_file(f), &fpriv);
if (r) {
fdput(f);
return r;
struct fd f = fdget(fd);
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- if (f.file->f_op != &drm_syncobj_file_fops) {
+ if (fd_file(f)->f_op != &drm_syncobj_file_fops) {
fdput(f);
return -EINVAL;
}
/* take a reference to put in the idr */
- syncobj = f.file->private_data;
+ syncobj = fd_file(f)->private_data;
drm_syncobj_get(syncobj);
idr_preload(GFP_KERNEL);
struct drm_syncobj *syncobj;
struct eventfd_ctx *ev_fd_ctx;
struct syncobj_eventfd_entry *entry;
+ int ret;
if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE))
return -EOPNOTSUPP;
return -ENOENT;
ev_fd_ctx = eventfd_ctx_fdget(args->fd);
- if (IS_ERR(ev_fd_ctx))
- return PTR_ERR(ev_fd_ctx);
+ if (IS_ERR(ev_fd_ctx)) {
+ ret = PTR_ERR(ev_fd_ctx);
+ goto err_fdget;
+ }
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry) {
- eventfd_ctx_put(ev_fd_ctx);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err_kzalloc;
}
entry->syncobj = syncobj;
entry->ev_fd_ctx = ev_fd_ctx;
drm_syncobj_put(syncobj);
return 0;
+
+err_kzalloc:
+ eventfd_ctx_put(ev_fd_ctx);
+err_fdget:
+ drm_syncobj_put(syncobj);
+ return ret;
}
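The rewrite above exists because the old early returns left the reference taken by the earlier syncobj lookup unreleased; the new err_fdget/err_kzalloc labels unwind each acquired resource in reverse order on every failure path. A minimal sketch of that unwind shape, with purely illustrative helper names:

	static int setup_a_then_b(struct res_a **pa, struct res_b **pb)
	{
		struct res_a *a;
		struct res_b *b;
		int ret;

		a = acquire_a();		/* illustrative helpers, not real API */
		if (!a)
			return -ENOENT;

		b = acquire_b();
		if (IS_ERR(b)) {
			ret = PTR_ERR(b);
			goto err_release_a;
		}

		*pa = a;
		*pb = b;
		return 0;

	err_release_a:
		release_a(a);		/* undo in reverse order of acquisition */
		return ret;
	}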
int
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
- if (ret < 0)
- return ret;
if (copy_to_user(arg, &range, sizeof(range)))
return -EFAULT;
- return 0;
+ return ret;
}
int __pure btrfs_is_empty_uuid(const u8 *uuid)
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
- if (!src.file) {
+ if (!fd_file(src)) {
ret = -EINVAL;
goto out_drop_write;
}
- src_inode = file_inode(src.file);
+ src_inode = file_inode(fd_file(src));
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
/*
- * The transaction thread may want to do more work,
- * namely it pokes the cleaner kthread that will start
- * processing uncleaned subvols.
+ * There may be work for the cleaner kthread to do (subvolume
+ * deletion, delayed iputs, defrag inodes, etc), so wake it up.
*/
- wake_up_process(fs_info->transaction_kthread);
+ wake_up_process(fs_info->cleaner_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
static bool ep_busy_loop_on(struct eventpoll *ep)
{
- return !!ep->busy_poll_usecs || net_busy_loop_on();
+ return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on();
}
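The READ_ONCE() added above is the usual annotation for a field that one path updates while another reads it locklessly: it keeps the compiler from tearing or refetching the load. The same pattern in isolation ("cfg" is a made-up structure, not the eventpoll code):

	#include <linux/compiler.h>
	#include <linux/types.h>

	struct cfg {
		u32 busy_poll_usecs;
	};

	static void cfg_set_usecs(struct cfg *c, u32 usecs)
	{
		WRITE_ONCE(c->busy_poll_usecs, usecs);		/* writer side */
	}

	static bool cfg_busy_loop_on(struct cfg *c)
	{
		return READ_ONCE(c->busy_poll_usecs) != 0;	/* lockless reader */
	}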
static bool ep_busy_loop_end(void *p, unsigned long start_time)
error = PTR_ERR(file);
goto out_free_fd;
}
-#ifdef CONFIG_NET_RX_BUSY_POLL
- ep->busy_poll_usecs = 0;
- ep->busy_poll_budget = 0;
- ep->prefer_busy_poll = false;
-#endif
ep->file = file;
fd_install(fd, file);
return fd;
error = -EBADF;
f = fdget(epfd);
- if (!f.file)
+ if (!fd_file(f))
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd);
- if (!tf.file)
+ if (!fd_file(tf))
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
- if (!file_can_poll(tf.file))
+ if (!file_can_poll(fd_file(tf)))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
- if (f.file == tf.file || !is_file_epoll(f.file))
+ if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
goto error_tgt_fput;
/*
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
- if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/*
* When we insert an epoll file descriptor inside another epoll file
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
- if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
- is_file_epoll(tf.file)) {
+ if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
+ is_file_epoll(fd_file(tf))) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
full_check = 1;
- if (is_file_epoll(tf.file)) {
- tep = tf.file->private_data;
+ if (is_file_epoll(fd_file(tf))) {
+ tep = fd_file(tf)->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
- epi = ep_find(ep, tf.file, fd);
+ epi = ep_find(ep, fd_file(tf), fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(ep, epds, tf.file, fd, full_check);
+ error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
} else
error = -EEXIST;
break;
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
/*
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
- if (!is_file_epoll(f.file))
+ if (!is_file_epoll(fd_file(f)))
goto error_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, to);
#include <asm/siginfo.h>
#include <linux/uaccess.h>
+#include "internal.h"
+
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
static int setfl(int fd, struct file * filp, unsigned int arg)
return error;
}
-static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
- int force)
+/*
+ * Allocate a file->f_owner struct if it doesn't exist, handling racing
+ * allocations correctly.
+ */
+int file_f_owner_allocate(struct file *file)
{
- write_lock_irq(&filp->f_owner.lock);
- if (force || !filp->f_owner.pid) {
- put_pid(filp->f_owner.pid);
- filp->f_owner.pid = get_pid(pid);
- filp->f_owner.pid_type = type;
+ struct fown_struct *f_owner;
- if (pid) {
- const struct cred *cred = current_cred();
- filp->f_owner.uid = cred->uid;
- filp->f_owner.euid = cred->euid;
- }
+ f_owner = file_f_owner(file);
+ if (f_owner)
+ return 0;
+
+ f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL);
+ if (!f_owner)
+ return -ENOMEM;
+
+ rwlock_init(&f_owner->lock);
+ f_owner->file = file;
+ /* If someone else raced us, drop our allocation. */
+ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner)))
+ kfree(f_owner);
+ return 0;
+}
+EXPORT_SYMBOL(file_f_owner_allocate);
+
+void file_f_owner_release(struct file *file)
+{
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(file);
+ if (f_owner) {
+ put_pid(f_owner->pid);
+ kfree(f_owner);
}
- write_unlock_irq(&filp->f_owner.lock);
}
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
- security_file_set_fowner(filp);
- f_modown(filp, pid, type, force);
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(filp);
+ if (WARN_ON_ONCE(!f_owner))
+ return;
+
+ write_lock_irq(&f_owner->lock);
+ if (force || !f_owner->pid) {
+ put_pid(f_owner->pid);
+ f_owner->pid = get_pid(pid);
+ f_owner->pid_type = type;
+
+ if (pid) {
+ const struct cred *cred = current_cred();
+ security_file_set_fowner(filp);
+ f_owner->uid = cred->uid;
+ f_owner->euid = cred->euid;
+ }
+ }
+ write_unlock_irq(&f_owner->lock);
}
EXPORT_SYMBOL(__f_setown);
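file_f_owner_allocate() above is an instance of the lock-free lazy-allocation pattern: allocate outside any lock, publish the pointer with cmpxchg(), and throw the allocation away if another task published first. The same shape in isolation (names are illustrative, not the fcntl code):

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct obj { int dummy; };
	struct ctx { struct obj *obj; };

	static int ctx_lazy_alloc_obj(struct ctx *c)
	{
		struct obj *o;

		if (READ_ONCE(c->obj))		/* fast path: already published */
			return 0;

		o = kzalloc(sizeof(*o), GFP_KERNEL);
		if (!o)
			return -ENOMEM;

		/* Publish atomically; if someone beat us to it, drop our copy. */
		if (cmpxchg(&c->obj, NULL, o))
			kfree(o);
		return 0;
	}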
struct pid *pid = NULL;
int ret = 0;
+ might_sleep();
+
type = PIDTYPE_TGID;
if (who < 0) {
/* avoid overflow below */
who = -who;
}
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+
rcu_read_lock();
if (who) {
pid = find_vpid(who);
void f_delown(struct file *filp)
{
- f_modown(filp, NULL, PIDTYPE_TGID, 1);
+ __f_setown(filp, NULL, PIDTYPE_TGID, 1);
}
pid_t f_getown(struct file *filp)
{
pid_t pid = 0;
+ struct fown_struct *f_owner;
- read_lock_irq(&filp->f_owner.lock);
+ f_owner = file_f_owner(filp);
+ if (!f_owner)
+ return pid;
+
+ read_lock_irq(&f_owner->lock);
rcu_read_lock();
- if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
- pid = pid_vnr(filp->f_owner.pid);
- if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ if (pid_task(f_owner->pid, f_owner->pid_type)) {
+ pid = pid_vnr(f_owner->pid);
+ if (f_owner->pid_type == PIDTYPE_PGID)
pid = -pid;
}
rcu_read_unlock();
- read_unlock_irq(&filp->f_owner.lock);
+ read_unlock_irq(&f_owner->lock);
return pid;
}
return -EINVAL;
}
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+
rcu_read_lock();
pid = find_vpid(owner.pid);
if (owner.pid && !pid)
struct f_owner_ex __user *owner_p = (void __user *)arg;
struct f_owner_ex owner = {};
int ret = 0;
+ struct fown_struct *f_owner;
+ enum pid_type pid_type = PIDTYPE_PID;
- read_lock_irq(&filp->f_owner.lock);
- rcu_read_lock();
- if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
- owner.pid = pid_vnr(filp->f_owner.pid);
- rcu_read_unlock();
- switch (filp->f_owner.pid_type) {
+ f_owner = file_f_owner(filp);
+ if (f_owner) {
+ read_lock_irq(&f_owner->lock);
+ rcu_read_lock();
+ if (pid_task(f_owner->pid, f_owner->pid_type))
+ owner.pid = pid_vnr(f_owner->pid);
+ rcu_read_unlock();
+ pid_type = f_owner->pid_type;
+ }
+
+ switch (pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
break;
ret = -EINVAL;
break;
}
- read_unlock_irq(&filp->f_owner.lock);
+ if (f_owner)
+ read_unlock_irq(&f_owner->lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
struct user_namespace *user_ns = current_user_ns();
+ struct fown_struct *f_owner;
uid_t __user *dst = (void __user *)arg;
- uid_t src[2];
+ uid_t src[2] = {0, 0};
int err;
- read_lock_irq(&filp->f_owner.lock);
- src[0] = from_kuid(user_ns, filp->f_owner.uid);
- src[1] = from_kuid(user_ns, filp->f_owner.euid);
- read_unlock_irq(&filp->f_owner.lock);
+ f_owner = file_f_owner(filp);
+ if (f_owner) {
+ read_lock_irq(&f_owner->lock);
+ src[0] = from_kuid(user_ns, f_owner->uid);
+ src[1] = from_kuid(user_ns, f_owner->euid);
+ read_unlock_irq(&f_owner->lock);
+ }
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
* overkill, but given our lockless file pointer lookup, the
* alternatives are complicated.
*/
- return f.file == filp;
+ return fd_file(f) == filp;
}
+/* Let the caller figure out whether a given file was just created. */
+static long f_created_query(const struct file *filp)
+{
+ return !!(filp->f_mode & FMODE_CREATED);
+}
+
+static int f_owner_sig(struct file *filp, int signum, bool setsig)
+{
+ int ret = 0;
+ struct fown_struct *f_owner;
+
+ might_sleep();
+
+ if (setsig) {
+ if (!valid_signal(signum))
+ return -EINVAL;
+
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+ }
+
+ f_owner = file_f_owner(filp);
+ if (setsig)
+ f_owner->signum = signum;
+ else if (f_owner)
+ ret = f_owner->signum;
+ return ret;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
long err = -EINVAL;
switch (cmd) {
+ case F_CREATED_QUERY:
+ err = f_created_query(filp);
+ break;
case F_DUPFD:
err = f_dupfd(argi, filp, 0);
break;
err = f_getowner_uids(filp, arg);
break;
case F_GETSIG:
- err = filp->f_owner.signum;
+ err = f_owner_sig(filp, 0, false);
break;
case F_SETSIG:
- /* arg == 0 restores default behaviour. */
- if (!valid_signal(argi)) {
- break;
- }
- err = 0;
- filp->f_owner.signum = argi;
+ err = f_owner_sig(filp, argi, true);
break;
case F_GETLEASE:
err = fcntl_getlease(filp);
static int check_fcntl_cmd(unsigned cmd)
{
switch (cmd) {
+ case F_CREATED_QUERY:
case F_DUPFD:
case F_DUPFD_CLOEXEC:
case F_DUPFD_QUERY:
struct fd f = fdget_raw(fd);
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out1;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (!err)
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
out1:
fdput(f);
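From userspace, the F_CREATED_QUERY command added above lets a caller ask whether its own open() actually created the file: f_created_query() returns 1 when FMODE_CREATED is set and 0 otherwise, so no O_EXCL-based guesswork is needed. A hedged usage sketch, guarded because a given system's uapi headers may not define the constant yet:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/example-file", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return 1;
	#ifdef F_CREATED_QUERY
		/* 1 if this open() created the file, 0 if it already existed. */
		printf("created by this open: %d\n", fcntl(fd, F_CREATED_QUERY));
	#endif
		close(fd);
		return 0;
	}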
struct flock64 flock;
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out1;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
goto out1;
err = -EFAULT;
if (copy_from_user(&flock, argp, sizeof(flock)))
break;
- err = fcntl_getlk64(f.file, cmd, &flock);
+ err = fcntl_getlk64(fd_file(f), cmd, &flock);
if (!err && copy_to_user(argp, &flock, sizeof(flock)))
err = -EFAULT;
break;
err = -EFAULT;
if (copy_from_user(&flock, argp, sizeof(flock)))
break;
- err = fcntl_setlk64(fd, f.file, cmd, &flock);
+ err = fcntl_setlk64(fd, fd_file(f), cmd, &flock);
break;
default:
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
out1:
struct flock flock;
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
return err;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out_put;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
goto out_put;
err = get_compat_flock(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock);
if (err)
break;
err = fixup_compat_flock(&flock);
err = get_compat_flock64(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock);
if (!err)
err = put_compat_flock64(&flock, compat_ptr(arg));
break;
err = get_compat_flock(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock);
break;
case F_SETLK64:
case F_SETLKW64:
err = get_compat_flock64(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock);
break;
default:
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
out_put:
do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}
-int send_sigurg(struct fown_struct *fown)
+int send_sigurg(struct file *file)
{
+ struct fown_struct *fown;
struct task_struct *p;
enum pid_type type;
struct pid *pid;
unsigned long flags;
int ret = 0;
+ fown = file_f_owner(file);
+ if (!fown)
+ return 0;
+
read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
}
read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
- fown = &fa->fa_file->f_owner;
+ fown = file_f_owner(fa->fa_file);
+ if (!fown)
+ goto next;
/* Don't send SIGURG to processes which have not set a
queued signum: SIGURG has its own default signalling
mechanism. */
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
+next:
read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}
static long do_sys_name_to_handle(const struct path *path,
struct file_handle __user *ufh,
- int __user *mnt_id, int fh_flags)
+ void __user *mnt_id, bool unique_mntid,
+ int fh_flags)
{
long retval;
struct file_handle f_handle;
} else
retval = 0;
/* copy the mount id */
- if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
- copy_to_user(ufh, handle,
- struct_size(handle, f_handle, handle_bytes)))
+ if (unique_mntid) {
+ if (put_user(real_mount(path->mnt)->mnt_id_unique,
+ (u64 __user *) mnt_id))
+ retval = -EFAULT;
+ } else {
+ if (put_user(real_mount(path->mnt)->mnt_id,
+ (int __user *) mnt_id))
+ retval = -EFAULT;
+ }
+ /* copy the handle */
+ if (retval != -EFAULT &&
+ copy_to_user(ufh, handle,
+ struct_size(handle, f_handle, handle_bytes)))
retval = -EFAULT;
kfree(handle);
return retval;
* @name: name that should be converted to handle.
* @handle: resulting file handle
* @mnt_id: mount id of the file system containing the file
+ * (u64 if AT_HANDLE_MNT_ID_UNIQUE, otherwise int)
* @flag: flag value to indicate whether to follow symlink or not
* and whether a decodable file handle is required.
*
* value required.
*/
SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
- struct file_handle __user *, handle, int __user *, mnt_id,
+ struct file_handle __user *, handle, void __user *, mnt_id,
int, flag)
{
struct path path;
int fh_flags;
int err;
- if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID))
+ if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
+ AT_HANDLE_MNT_ID_UNIQUE))
return -EINVAL;
lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
lookup_flags |= LOOKUP_EMPTY;
err = user_path_at(dfd, name, lookup_flags, &path);
if (!err) {
- err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags);
+ err = do_sys_name_to_handle(&path, handle, mnt_id,
+ flag & AT_HANDLE_MNT_ID_UNIQUE,
+ fh_flags);
path_put(&path);
}
return err;
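With the hunk above, name_to_handle_at() gains AT_HANDLE_MNT_ID_UNIQUE: when the flag is set, the mount-id argument is written as the 64-bit unique mount id rather than the reusable int id. A hedged userspace sketch (the flag name is taken from this patch; older libc/uapi headers may not define it, and the libc int * prototype has to be cast):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		struct file_handle *fh;
		uint64_t mnt_id_unique = 0;
		int ret = 1;

		fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
		if (!fh)
			return 1;
		fh->handle_bytes = MAX_HANDLE_SZ;

	#ifdef AT_HANDLE_MNT_ID_UNIQUE
		if (name_to_handle_at(AT_FDCWD, "/tmp", fh,
				      (int *)&mnt_id_unique,
				      AT_HANDLE_MNT_ID_UNIQUE) == 0) {
			printf("unique mount id: %llu\n",
			       (unsigned long long)mnt_id_unique);
			ret = 0;
		}
	#endif
		free(fh);
		return ret;
	}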
spin_unlock(&fs->lock);
} else {
struct fd f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- *root = f.file->f_path;
+ *root = fd_file(f)->f_path;
path_get(root);
fdput(f);
}
#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
+#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
* Copy 'count' fd bits from the old table to the new table and clear the extra
* space if any. This does not copy the file pointers. Called with the files
* spinlock held for write.
*/
-static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
- unsigned int count)
+static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
+ unsigned int copy_words)
{
- unsigned int cpy, set;
-
- cpy = count / BITS_PER_BYTE;
- set = (nfdt->max_fds - count) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)nfdt->open_fds + cpy, 0, set);
- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
- memset((char *)nfdt->close_on_exec + cpy, 0, set);
-
- cpy = BITBIT_SIZE(count);
- set = BITBIT_SIZE(nfdt->max_fds) - cpy;
- memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
- memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+ unsigned int nwords = fdt_words(nfdt);
+
+ bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
+ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
+ bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
+ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
+ bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
+ copy_words, nwords);
}
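copy_fd_bitmaps() now takes a word count and leans on bitmap_copy_and_extend() instead of hand-rolled memcpy()/memset() byte arithmetic. For the word-aligned sizes used here, the helper behaves roughly like the sketch below (this is an illustration, not the lib/bitmap implementation):

	#include <linux/bitmap.h>

	/* Copy @count bits from @src, clear bits @count..@size-1 of @dst. */
	static void copy_and_extend_sketch(unsigned long *dst,
					   const unsigned long *src,
					   unsigned int count, unsigned int size)
	{
		bitmap_copy(dst, src, count);
		if (size > count)
			bitmap_clear(dst, count, size - count);
	}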
/*
memcpy(nfdt->fd, ofdt->fd, cpy);
memset((char *)nfdt->fd + cpy, 0, set);
- copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
+ copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}
/*
open_files = sane_fdtable_size(old_fdt, max_fds);
}
- copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+ copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
return filp_close(file, files);
}
-EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd);
/**
* last_fd - return last valid index into fd table
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
*/
- static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+ static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
struct files_struct *files = current->files;
struct file *file;
if (likely(atomic_read_acquire(&files->count) == 1)) {
file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
- return 0;
- return (unsigned long)file;
+ return EMPTY_FD;
+ return BORROWED_FD(file);
} else {
file = __fget_files(files, fd, mask);
if (!file)
- return 0;
- return FDPUT_FPUT | (unsigned long)file;
+ return EMPTY_FD;
+ return CLONED_FD(file);
}
}
- unsigned long __fdget(unsigned int fd)
+ struct fd fdget(unsigned int fd)
{
return __fget_light(fd, FMODE_PATH);
}
- EXPORT_SYMBOL(__fdget);
+ EXPORT_SYMBOL(fdget);
- unsigned long __fdget_raw(unsigned int fd)
+ struct fd fdget_raw(unsigned int fd)
{
return __fget_light(fd, 0);
}
(file_count(file) > 1 || file->f_op->iterate_shared);
}
- unsigned long __fdget_pos(unsigned int fd)
+ struct fd fdget_pos(unsigned int fd)
{
- unsigned long v = __fdget(fd);
- struct file *file = (struct file *)(v & ~3);
+ struct fd f = fdget(fd);
+ struct file *file = fd_file(f);
if (file && file_needs_f_pos_lock(file)) {
- v |= FDPUT_POS_UNLOCK;
+ f.word |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
- return v;
+ return f;
}
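The hunk above changes fdget()/fdget_raw()/fdget_pos() to hand back a struct fd directly instead of an encoded unsigned long, which is why every caller in this series moves from f.file to the fd_file() accessor. A typical caller now has this shape (example_op and the vfs_fsync() call are illustrative, not part of the patch):

	#include <linux/file.h>
	#include <linux/fs.h>

	static int example_op(unsigned int ufd)
	{
		struct fd f = fdget(ufd);
		int ret;

		if (!fd_file(f))	/* empty struct fd: bad descriptor */
			return -EBADF;

		ret = vfs_fsync(fd_file(f), 0);
		fdput(f);
		return ret;
	}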
void __f_unlock_pos(struct file *f)
* tables and this condition does not arise without those.
*/
fdt = files_fdtable(files);
+ fd = array_index_nospec(fd, fdt->max_fds);
tofree = fdt->fd[fd];
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
static struct kmem_cache *fuse_req_cachep;
+static void end_requests(struct list_head *head);
+
static struct fuse_dev *fuse_get_dev(struct file *file)
{
/*
(folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
- 1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
1 << PG_workingset |
newfolio = page_folio(buf->page);
- if (!folio_test_uptodate(newfolio))
- folio_mark_uptodate(newfolio);
-
+ folio_clear_uptodate(newfolio);
folio_clear_mappedtodisk(newfolio);
if (fuse_check_folio(newfolio) != 0)
this_num = min_t(unsigned, num, PAGE_SIZE - offset);
err = fuse_copy_page(cs, &page, offset, this_num, 0);
- if (!err && offset == 0 &&
- (this_num == PAGE_SIZE || file_size == end))
+ if (!PageUptodate(page) && !err && offset == 0 &&
+ (this_num == PAGE_SIZE || file_size == end)) {
+ zero_user_segment(page, this_num, PAGE_SIZE);
SetPageUptodate(page);
+ }
unlock_page(page);
put_page(page);
}
spin_lock(&fiq->lock);
+ if (!fiq->connected) {
+ spin_unlock(&fiq->lock);
+ list_for_each_entry(req, &to_queue, list)
+ clear_bit(FR_PENDING, &req->flags);
+ end_requests(&to_queue);
+ return;
+ }
/* iq and pq requests are both oldest to newest */
list_splice(&to_queue, &fiq->pending);
fiq->ops->wake_pending_and_unlock(fiq);
return -EFAULT;
f = fdget(oldfd);
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
/*
* Check against file->f_op because CUSE
* uses the same ioctl handler.
*/
- if (f.file->f_op == file->f_op)
- fud = fuse_get_dev(f.file);
+ if (fd_file(f)->f_op == file->f_op)
+ fud = fuse_get_dev(fd_file(f));
res = -EINVAL;
if (fud) {
struct file *filp = fl->c.flc_file;
f_delown(filp);
- filp->f_owner.signum = 0;
+ file_f_owner(filp)->signum = 0;
fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
if (fl->fl_fasync != NULL) {
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
lease = *flp;
trace_generic_add_lease(inode, lease);
+ error = file_f_owner_allocate(filp);
+ if (error)
+ return error;
+
/* Note that arg is never F_UNLCK here */
ctx = locks_get_lock_context(inode, arg);
if (!ctx)
error = -EBADF;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return error;
- if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE)))
+ if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE)))
goto out_putf;
- flock_make_lock(f.file, &fl, type);
+ flock_make_lock(fd_file(f), &fl, type);
- error = security_file_lock(f.file, fl.c.flc_type);
+ error = security_file_lock(fd_file(f), fl.c.flc_type);
if (error)
goto out_putf;
if (can_sleep)
fl.c.flc_flags |= FL_SLEEP;
- if (f.file->f_op->flock)
- error = f.file->f_op->flock(f.file,
+ if (fd_file(f)->f_op->flock)
+ error = fd_file(f)->f_op->flock(fd_file(f),
(can_sleep) ? F_SETLKW : F_SETLK,
&fl);
else
- error = locks_lock_file_wait(f.file, &fl);
+ error = locks_lock_file_wait(fd_file(f), &fl);
locks_release_private(&fl);
out_putf:
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
- filelease_cache = kmem_cache_create("file_lock_cache",
+ filelease_cache = kmem_cache_create("file_lease_cache",
sizeof(struct file_lease), 0, SLAB_PANIC, NULL);
for_each_possible_cpu(i) {
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
+/**
+ * lookup_fast - do fast lockless (but racy) lookup of a dentry
+ * @nd: current nameidata
+ *
+ * Do a fast, but racy lookup in the dcache for the given dentry, and
+ * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
+ * found. On error, an ERR_PTR will be returned.
+ *
+ * If this function returns a valid dentry and the walk is no longer
+ * lazy, the dentry will carry a reference that must later be put. If
+ * RCU mode is still in force, then this is not the case and the dentry
+ * must be legitimized before use. If this returns NULL, then the walk
+ * will no longer be in RCU mode.
+ */
static struct dentry *lookup_fast(struct nameidata *nd)
{
struct dentry *dentry, *parent = nd->path.dentry;
struct fd f = fdget_raw(nd->dfd);
struct dentry *dentry;
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
if (flags & LOOKUP_LINKAT_EMPTY) {
- if (f.file->f_cred != current_cred() &&
- !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
+ if (fd_file(f)->f_cred != current_cred() &&
+ !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
fdput(f);
return ERR_PTR(-ENOENT);
}
}
- dentry = f.file->f_path.dentry;
+ dentry = fd_file(f)->f_path.dentry;
if (*s && unlikely(!d_can_lookup(dentry))) {
fdput(f);
return ERR_PTR(-ENOTDIR);
}
- nd->path = f.file->f_path;
+ nd->path = fd_file(f)->f_path;
if (flags & LOOKUP_RCU) {
nd->inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
return dentry;
}
+ if (open_flag & O_CREAT)
+ audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
+
/*
 * Checking write permission is tricky, because we don't know if we are
* going to actually need it: O_CREAT opens should work as long as the
return ERR_PTR(error);
}
+static inline bool trailing_slashes(struct nameidata *nd)
+{
+ return (bool)nd->last.name[nd->last.len];
+}
+
+static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
+{
+ struct dentry *dentry;
+
+ if (open_flag & O_CREAT) {
+ if (trailing_slashes(nd))
+ return ERR_PTR(-EISDIR);
+
+ /* Don't bother on an O_EXCL create */
+ if (open_flag & O_EXCL)
+ return NULL;
+ }
+
+ if (trailing_slashes(nd))
+ nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+ dentry = lookup_fast(nd);
+ if (IS_ERR_OR_NULL(dentry))
+ return dentry;
+
+ if (open_flag & O_CREAT) {
+ /* Discard negative dentries. Need inode_lock to do the create */
+ if (!dentry->d_inode) {
+ if (!(nd->flags & LOOKUP_RCU))
+ dput(dentry);
+ dentry = NULL;
+ }
+ }
+ return dentry;
+}
+
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
return handle_dots(nd, nd->last_type);
}
- if (!(open_flag & O_CREAT)) {
- if (nd->last.name[nd->last.len])
- nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
- /* we _can_ be in RCU mode here */
- dentry = lookup_fast(nd);
- if (IS_ERR(dentry))
- return ERR_CAST(dentry);
- if (likely(dentry))
- goto finish_lookup;
+ /* We _can_ be in RCU mode here */
+ dentry = lookup_fast_for_open(nd, open_flag);
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
+ if (likely(dentry))
+ goto finish_lookup;
+
+ if (!(open_flag & O_CREAT)) {
if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
return ERR_PTR(-ECHILD);
} else {
- /* create side of things */
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
}
- audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
- /* trailing slashes? */
- if (unlikely(nd->last.name[nd->last.len]))
- return ERR_PTR(-EISDIR);
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
int err;
unsigned int flags;
retry:
if (nofs)
flags = memalloc_nofs_save();
- err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata);
+ err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata);
if (nofs)
memalloc_nofs_restore(flags);
if (err)
goto fail;
- memcpy(page_address(page), symname, len-1);
+ memcpy(folio_address(folio), symname, len - 1);
- err = aops->write_end(NULL, mapping, 0, len-1, len-1,
- page, fsdata);
+ err = aops->write_end(NULL, mapping, 0, len - 1, len - 1,
+ folio, fsdata);
if (err < 0)
goto fail;
if (err < len-1)
list_del_init(&p->mnt_child);
}
- /* Add propogated mounts to the tmp_list */
+ /* Add propagated mounts to the tmp_list */
if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
dentry->d_fsdata == &mntns_operations;
}
-static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
- return container_of(ns, struct mnt_namespace, ns);
+ return &mnt->ns;
}
-struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
+struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
- return &mnt->ns;
+ guard(read_lock)(&mnt_ns_tree_lock);
+ for (;;) {
+ struct rb_node *node;
+
+ if (previous)
+ node = rb_prev(&mntns->mnt_ns_tree_node);
+ else
+ node = rb_next(&mntns->mnt_ns_tree_node);
+ if (!node)
+ return ERR_PTR(-ENOENT);
+
+ mntns = node_to_mnt_ns(node);
+ node = &mntns->mnt_ns_tree_node;
+
+ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
+ continue;
+
+ /*
+ * Holding mnt_ns_tree_lock prevents the mount namespace from
+ * being freed but it may well be on its deathbed. We want an
+ * active reference, not just a passive one here as we're
+ * persisting the mount namespace.
+ */
+ if (!refcount_inc_not_zero(&mntns->ns.count))
+ continue;
+
+ return mntns;
+ }
}
static bool mnt_ns_loop(struct dentry *dentry)
if (!__mnt_is_readonly(mnt) &&
(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
- char *buf = (char *)__get_free_page(GFP_KERNEL);
- char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
+ char *buf, *mntpath;
+
+ buf = (char *)__get_free_page(GFP_KERNEL);
+ if (buf)
+ mntpath = d_path(mountpoint, buf, PAGE_SIZE);
+ else
+ mntpath = ERR_PTR(-ENOMEM);
+ if (IS_ERR(mntpath))
+ mntpath = "(unknown)";
pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
sb->s_type->name,
mntpath, &sb->s_time_max,
(unsigned long long)sb->s_time_max);
- free_page((unsigned long)buf);
sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
+ if (buf)
+ free_page((unsigned long)buf);
}
}
}
f = fdget(fs_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
ret = -EINVAL;
- if (f.file->f_op != &fscontext_fops)
+ if (fd_file(f)->f_op != &fscontext_fops)
goto err_fsfd;
- fc = f.file->private_data;
+ fc = fd_file(f)->private_data;
ret = mutex_lock_interruptible(&fc->uapi_mutex);
if (ret < 0)
return -EINVAL;
f = fdget(attr->userns_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!proc_ns_file(f.file)) {
+ if (!proc_ns_file(fd_file(f))) {
err = -EINVAL;
goto out_fput;
}
- ns = get_proc_ns(file_inode(f.file));
+ ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops->type != CLONE_NEWUSER) {
err = -EINVAL;
goto out_fput;
* that, or if not simply grab a passive reference on our mount namespace and
* return that.
*/
-static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
+static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
{
- if (mnt_ns_id)
- return lookup_mnt_ns(mnt_ns_id);
- refcount_inc(¤t->nsproxy->mnt_ns->passive);
- return current->nsproxy->mnt_ns;
+ struct mnt_namespace *mnt_ns;
+
+ if (kreq->mnt_ns_id && kreq->spare)
+ return ERR_PTR(-EINVAL);
+
+ if (kreq->mnt_ns_id)
+ return lookup_mnt_ns(kreq->mnt_ns_id);
+
+ if (kreq->spare) {
+ struct ns_common *ns;
+
+ CLASS(fd, f)(kreq->spare);
- if (!f.file)
++ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+
- if (!proc_ns_file(f.file))
++ if (!proc_ns_file(fd_file(f)))
+ return ERR_PTR(-EINVAL);
+
- ns = get_proc_ns(file_inode(f.file));
++ ns = get_proc_ns(file_inode(fd_file(f)));
+ if (ns->ops->type != CLONE_NEWNS)
+ return ERR_PTR(-EINVAL);
+
+ mnt_ns = to_mnt_ns(ns);
+ } else {
+ mnt_ns = current->nsproxy->mnt_ns;
+ }
+
+ refcount_inc(&mnt_ns->passive);
+ return mnt_ns;
}
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
- ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
if (!kmnt_ids)
return -ENOMEM;
- ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
/* Only worry about locked mounts */
if (!(child->mnt.mnt_flags & MNT_LOCKED))
continue;
- /* Is the directory permanetly empty? */
+ /* Is the directory permanently empty? */
if (!is_empty_dir_inode(inode))
goto next;
}
if (length < 0)
return -EINVAL;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = do_ftruncate(f.file, length, small);
+ error = do_ftruncate(fd_file(f), length, small);
fdput(f);
return error;
if (offset < 0 || len <= 0)
return -EINVAL;
- /* Return error if mode is not supported */
- if (mode & ~FALLOC_FL_SUPPORTED_MASK)
+ if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE))
return -EOPNOTSUPP;
- /* Punch hole and zero range are mutually exclusive */
- if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
- (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
- return -EOPNOTSUPP;
-
- /* Punch hole must have keep size set */
- if ((mode & FALLOC_FL_PUNCH_HOLE) &&
- !(mode & FALLOC_FL_KEEP_SIZE))
+ /*
+ * Modes are exclusive, even if that is not obvious from the encoding
+ * as bit masks and the mix with the flag in the same namespace.
+ *
+ * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is
+ * encoded as no bit set.
+ */
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_ALLOCATE_RANGE:
+ case FALLOC_FL_UNSHARE_RANGE:
+ case FALLOC_FL_ZERO_RANGE:
+ break;
+ case FALLOC_FL_PUNCH_HOLE:
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ return -EOPNOTSUPP;
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ case FALLOC_FL_INSERT_RANGE:
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+ break;
+ default:
return -EOPNOTSUPP;
-
- /* Collapse range should only be used exclusively. */
- if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
- (mode & ~FALLOC_FL_COLLAPSE_RANGE))
- return -EINVAL;
-
- /* Insert range should only be used exclusively. */
- if ((mode & FALLOC_FL_INSERT_RANGE) &&
- (mode & ~FALLOC_FL_INSERT_RANGE))
- return -EINVAL;
-
- /* Unshare range should only be used with allocate mode. */
- if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
- (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
- return -EINVAL;
+ }
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
/*
- * We can only allow pure fallocate on append only files
+ * On append-only files only space preallocation is supported.
*/
if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
return -EPERM;
struct fd f = fdget(fd);
int error = -EBADF;
- if (f.file) {
- error = vfs_fallocate(f.file, mode, offset, len);
+ if (fd_file(f)) {
+ error = vfs_fallocate(fd_file(f), mode, offset, len);
fdput(f);
}
return error;
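The switch added above encodes the rule set the old flag checks expressed piecemeal: exactly one mode per call, FALLOC_FL_PUNCH_HOLE must be paired with FALLOC_FL_KEEP_SIZE, collapse/insert must not be, and plain preallocation is mode 0. From userspace a valid hole punch looks like this sketch:

	#define _GNU_SOURCE
	#include <sys/types.h>
	#include <fcntl.h>
	#include <linux/falloc.h>

	/* Punch a hole without changing the file size. */
	static int punch_hole(int fd, off_t offset, off_t len)
	{
		return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				 offset, len);
	}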
int error;
error = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
error = -ENOTDIR;
- if (!d_can_lookup(f.file->f_path.dentry))
+ if (!d_can_lookup(fd_file(f)->f_path.dentry))
goto out_putf;
- error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
+ error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
if (!error)
- set_fs_pwd(current->fs, &f.file->f_path);
+ set_fs_pwd(current->fs, &fd_file(f)->f_path);
out_putf:
fdput(f);
out:
struct fd f = fdget(fd);
int err = -EBADF;
- if (f.file) {
- err = vfs_fchmod(f.file, mode);
+ if (fd_file(f)) {
+ err = vfs_fchmod(fd_file(f), mode);
fdput(f);
}
return err;
struct fd f = fdget(fd);
int error = -EBADF;
- if (f.file) {
- error = vfs_fchown(f.file, user, group);
+ if (fd_file(f)) {
+ error = vfs_fchown(fd_file(f), user, group);
fdput(f);
}
return error;
static inline bool unsigned_offsets(struct file *file)
{
- return file->f_mode & FMODE_UNSIGNED_OFFSET;
+ return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
}
/**
- * vfs_setpos - update the file offset for lseek
+ * vfs_setpos_cookie - update the file offset for lseek and reset cookie
* @file: file structure in question
* @offset: file offset to seek to
* @maxsize: maximum file size
+ * @cookie: cookie to reset
*
- * This is a low-level filesystem helper for updating the file offset to
- * the value specified by @offset if the given offset is valid and it is
- * not equal to the current file offset.
+ * Update the file offset to the value specified by @offset if the given
+ * offset is valid and it is not equal to the current file offset and
+ * reset the specified cookie to indicate that a seek happened.
*
* Return the specified offset on success and -EINVAL on invalid offset.
*/
-loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
+static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
+ loff_t maxsize, u64 *cookie)
{
if (offset < 0 && !unsigned_offsets(file))
return -EINVAL;
if (offset != file->f_pos) {
file->f_pos = offset;
- file->f_version = 0;
+ if (cookie)
+ *cookie = 0;
}
return offset;
}
-EXPORT_SYMBOL(vfs_setpos);
/**
- * generic_file_llseek_size - generic llseek implementation for regular files
- * @file: file structure to seek on
+ * vfs_setpos - update the file offset for lseek
+ * @file: file structure in question
* @offset: file offset to seek to
- * @whence: type of seek
- * @maxsize: max size of this file in file system
- * @eof: offset used for SEEK_END position
+ * @maxsize: maximum file size
*
- * This is a variant of generic_file_llseek that allows passing in a custom
- * maximum file size and a custom EOF position, for e.g. hashed directories
+ * This is a low-level filesystem helper for updating the file offset to
+ * the value specified by @offset if the given offset is valid and it is
+ * not equal to the current file offset.
*
- * Synchronization:
- * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
- * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
- * read/writes behave like SEEK_SET against seeks.
+ * Return the specified offset on success and -EINVAL on invalid offset.
*/
-loff_t
-generic_file_llseek_size(struct file *file, loff_t offset, int whence,
- loff_t maxsize, loff_t eof)
+loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
+{
+ return vfs_setpos_cookie(file, offset, maxsize, NULL);
+}
+EXPORT_SYMBOL(vfs_setpos);
+
+/**
+ * must_set_pos - check whether f_pos has to be updated
+ * @file: file to seek on
+ * @offset: offset to use
+ * @whence: type of seek operation
+ * @eof: end of file
+ *
+ * Check whether f_pos needs to be updated and update @offset according
+ * to @whence.
+ *
+ * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
+ * updated, and negative error code on failure.
+ */
+static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
{
switch (whence) {
case SEEK_END:
- offset += eof;
+ *offset += eof;
break;
case SEEK_CUR:
/*
* f_pos value back to the file because a concurrent read(),
* write() or lseek() might have altered it
*/
- if (offset == 0)
- return file->f_pos;
- /*
- * f_lock protects against read/modify/write race with other
- * SEEK_CURs. Note that parallel writes and reads behave
- * like SEEK_SET.
- */
- spin_lock(&file->f_lock);
- offset = vfs_setpos(file, file->f_pos + offset, maxsize);
- spin_unlock(&file->f_lock);
- return offset;
+ if (*offset == 0) {
+ *offset = file->f_pos;
+ return 0;
+ }
+ break;
case SEEK_DATA:
/*
* In the generic case the entire file is data, so as long as
* offset isn't at the end of the file then the offset is data.
*/
- if ((unsigned long long)offset >= eof)
+ if ((unsigned long long)*offset >= eof)
return -ENXIO;
break;
case SEEK_HOLE:
* There is a virtual hole at the end of the file, so as long as
* offset isn't i_size or larger, return i_size.
*/
- if ((unsigned long long)offset >= eof)
+ if ((unsigned long long)*offset >= eof)
return -ENXIO;
- offset = eof;
+ *offset = eof;
break;
}
+ return 1;
+}
+
+/**
+ * generic_file_llseek_size - generic llseek implementation for regular files
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @maxsize: max size of this file in file system
+ * @eof: offset used for SEEK_END position
+ *
+ * This is a variant of generic_file_llseek that allows passing in a custom
+ * maximum file size and a custom EOF position, for e.g. hashed directories
+ *
+ * Synchronization:
+ * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
+ * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
+ * read/writes behave like SEEK_SET against seeks.
+ */
+loff_t
+generic_file_llseek_size(struct file *file, loff_t offset, int whence,
+ loff_t maxsize, loff_t eof)
+{
+ int ret;
+
+ ret = must_set_pos(file, &offset, whence, eof);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return offset;
+
+ if (whence == SEEK_CUR) {
+ /*
+ * f_lock protects against read/modify/write race with
+ * other SEEK_CURs. Note that parallel writes and reads
+ * behave like SEEK_SET.
+ */
+ guard(spinlock)(&file->f_lock);
+ return vfs_setpos(file, file->f_pos + offset, maxsize);
+ }
+
return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
+/**
+ * generic_llseek_cookie - versioned llseek implementation
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @cookie: cookie to update
+ *
+ * See generic_file_llseek for a general description and locking assumptions.
+ *
+ * In contrast to generic_file_llseek, this function also resets a
+ * specified cookie to indicate a seek took place.
+ */
+loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
+ u64 *cookie)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t maxsize = inode->i_sb->s_maxbytes;
+ loff_t eof = i_size_read(inode);
+ int ret;
+
+ if (WARN_ON_ONCE(!cookie))
+ return -EINVAL;
+
+ /*
+ * Require that this is only used for directories that guarantee
+ * synchronization between readdir and seek so that an update to
+ * @cookie is correctly synchronized with concurrent readdir.
+ */
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
+ return -EINVAL;
+
+ ret = must_set_pos(file, &offset, whence, eof);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return offset;
+
+ /* No need to hold f_lock because we know that f_pos_lock is held. */
+ if (whence == SEEK_CUR)
+ return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie);
+
+ return vfs_setpos_cookie(file, offset, maxsize, cookie);
+}
+EXPORT_SYMBOL(generic_llseek_cookie);
+
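generic_llseek_cookie() lets a filesystem keep its "did a seek happen?" state in a per-file cookie instead of the file->f_version assignments removed above; the cookie is zeroed whenever the position actually changes. A hedged sketch of a directory llseek wired to it (the examplefs names and private struct are hypothetical):

	#include <linux/fs.h>
	#include <linux/types.h>

	struct examplefs_dir_ctx {
		u64 seek_cookie;	/* checked by readdir, reset on seek */
	};

	static loff_t examplefs_dir_llseek(struct file *file, loff_t offset,
					   int whence)
	{
		struct examplefs_dir_ctx *ctx = file->private_data;

		return generic_llseek_cookie(file, offset, whence,
					     &ctx->seek_cookie);
	}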
/**
* generic_file_llseek - generic llseek implementation for regular files
* @file: file structure to seek on
}
retval = -EINVAL;
if (offset >= 0 || unsigned_offsets(file)) {
- if (offset != file->f_pos) {
+ if (offset != file->f_pos)
file->f_pos = offset;
- file->f_version = 0;
- }
retval = offset;
}
out:
{
off_t retval;
struct fd f = fdget_pos(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
retval = -EINVAL;
if (whence <= SEEK_MAX) {
- loff_t res = vfs_llseek(f.file, offset, whence);
+ loff_t res = vfs_llseek(fd_file(f), offset, whence);
retval = res;
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
struct fd f = fdget_pos(fd);
loff_t offset;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
retval = -EINVAL;
if (whence > SEEK_MAX)
goto out_putf;
- offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
+ offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
whence);
retval = (int)offset;
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_read(f.file, buf, count, ppos);
+ ret = vfs_read(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
return ret;
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_write(f.file, buf, count, ppos);
+ ret = vfs_write(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_read(f.file, buf, count, &pos);
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ ret = vfs_read(fd_file(f), buf, count, &pos);
fdput(f);
}
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_write(f.file, buf, count, &pos);
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ ret = vfs_write(fd_file(f), buf, count, &pos);
fdput(f);
}
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_readv(f.file, vec, vlen, ppos, flags);
+ ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_writev(f.file, vec, vlen, ppos, flags);
+ ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_readv(f.file, vec, vlen, &pos, flags);
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
fdput(f);
}
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_writev(f.file, vec, vlen, &pos, flags);
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
fdput(f);
}
*/
retval = -EBADF;
in = fdget(in_fd);
- if (!in.file)
+ if (!fd_file(in))
goto out;
- if (!(in.file->f_mode & FMODE_READ))
+ if (!(fd_file(in)->f_mode & FMODE_READ))
goto fput_in;
retval = -ESPIPE;
if (!ppos) {
- pos = in.file->f_pos;
+ pos = fd_file(in)->f_pos;
} else {
pos = *ppos;
- if (!(in.file->f_mode & FMODE_PREAD))
+ if (!(fd_file(in)->f_mode & FMODE_PREAD))
goto fput_in;
}
- retval = rw_verify_area(READ, in.file, &pos, count);
+ retval = rw_verify_area(READ, fd_file(in), &pos, count);
if (retval < 0)
goto fput_in;
if (count > MAX_RW_COUNT)
*/
retval = -EBADF;
out = fdget(out_fd);
- if (!out.file)
+ if (!fd_file(out))
goto fput_in;
- if (!(out.file->f_mode & FMODE_WRITE))
+ if (!(fd_file(out)->f_mode & FMODE_WRITE))
goto fput_out;
- in_inode = file_inode(in.file);
- out_inode = file_inode(out.file);
- out_pos = out.file->f_pos;
+ in_inode = file_inode(fd_file(in));
+ out_inode = file_inode(fd_file(out));
+ out_pos = fd_file(out)->f_pos;
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
* and the application is arguably buggy if it doesn't expect
* EAGAIN on a non-blocking file descriptor.
*/
- if (in.file->f_flags & O_NONBLOCK)
+ if (fd_file(in)->f_flags & O_NONBLOCK)
fl = SPLICE_F_NONBLOCK;
#endif
- opipe = get_pipe_info(out.file, true);
+ opipe = get_pipe_info(fd_file(out), true);
if (!opipe) {
- retval = rw_verify_area(WRITE, out.file, &out_pos, count);
+ retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
if (retval < 0)
goto fput_out;
- retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
+ retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
count, fl);
} else {
- if (out.file->f_flags & O_NONBLOCK)
+ if (fd_file(out)->f_flags & O_NONBLOCK)
fl |= SPLICE_F_NONBLOCK;
- retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
+ retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl);
}
if (retval > 0) {
add_rchar(current, retval);
add_wchar(current, retval);
- fsnotify_access(in.file);
- fsnotify_modify(out.file);
- out.file->f_pos = out_pos;
+ fsnotify_access(fd_file(in));
+ fsnotify_modify(fd_file(out));
+ fd_file(out)->f_pos = out_pos;
if (ppos)
*ppos = pos;
else
- in.file->f_pos = pos;
+ fd_file(in)->f_pos = pos;
}
inc_syscr(current);
ssize_t ret = -EBADF;
f_in = fdget(fd_in);
- if (!f_in.file)
+ if (!fd_file(f_in))
goto out2;
f_out = fdget(fd_out);
- if (!f_out.file)
+ if (!fd_file(f_out))
goto out1;
ret = -EFAULT;
if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
goto out;
} else {
- pos_in = f_in.file->f_pos;
+ pos_in = fd_file(f_in)->f_pos;
}
if (off_out) {
if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
goto out;
} else {
- pos_out = f_out.file->f_pos;
+ pos_out = fd_file(f_out)->f_pos;
}
ret = -EINVAL;
if (flags != 0)
goto out;
- ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
flags);
if (ret > 0) {
pos_in += ret;
if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
ret = -EFAULT;
} else {
- f_in.file->f_pos = pos_in;
+ fd_file(f_in)->f_pos = pos_in;
}
if (off_out) {
if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
ret = -EFAULT;
} else {
- f_out.file->f_pos = pos_out;
+ fd_file(f_out)->f_pos = pos_out;
}
}
{
u64 ret;
struct timespec64 now;
+ u64 slack = current->timer_slack_ns;
- /*
- * Realtime tasks get a slack of 0 for obvious reasons.
- */
-
- if (rt_task(current))
+ if (slack == 0)
return 0;
ktime_get_ts64(&now);
now = timespec64_sub(*tv, now);
ret = __estimate_accuracy(&now);
- if (ret < current->timer_slack_ns)
- return current->timer_slack_ns;
+ if (ret < slack)
+ return slack;
return ret;
}
continue;
mask = EPOLLNVAL;
f = fdget(i);
- if (f.file) {
+ if (fd_file(f)) {
wait_key_set(wait, in, out, bit,
busy_flag);
- mask = vfs_poll(f.file, wait);
+ mask = vfs_poll(fd_file(f), wait);
fdput(f);
}
{
// the path is hot enough for overhead of copy_from_user() to matter
if (from) {
- if (!user_read_access_begin(from, sizeof(*from)))
+ if (can_do_masked_user_access())
+ from = masked_user_access_begin(from);
+ else if (!user_read_access_begin(from, sizeof(*from)))
return -EFAULT;
unsafe_get_user(to->p, &from->p, Efault);
unsafe_get_user(to->size, &from->size, Efault);
struct poll_list {
struct poll_list *next;
unsigned int len;
- struct pollfd entries[];
+ struct pollfd entries[] __counted_by(len);
};
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
goto out;
mask = EPOLLNVAL;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
goto out;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
pwait->_key = filter | busy_flag;
- mask = vfs_poll(f.file, pwait);
+ mask = vfs_poll(fd_file(f), pwait);
if (mask & busy_flag)
*can_busy_poll = true;
mask &= filter; /* Mask out unneeded events. */
DECLARE_WAITQUEUE(wait, current);
spin_lock_irq(¤t->sighand->siglock);
- ret = dequeue_signal(current, &ctx->sigmask, info, &type);
+ ret = dequeue_signal(&ctx->sigmask, info, &type);
switch (ret) {
case 0:
if (!nonblock)
add_wait_queue(¤t->sighand->signalfd_wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- ret = dequeue_signal(current, &ctx->sigmask, info, &type);
+ ret = dequeue_signal(&ctx->sigmask, info, &type);
if (ret != 0)
break;
if (signal_pending(current)) {
fd_install(ufd, file);
} else {
struct fd f = fdget(ufd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- ctx = f.file->private_data;
- if (f.file->f_op != &signalfd_fops) {
+ ctx = fd_file(f)->private_data;
+ if (fd_file(f)->f_op != &signalfd_fops) {
fdput(f);
return -EINVAL;
}
}
src_file = fdget(srcfd);
- if (!src_file.file) {
+ if (!fd_file(src_file)) {
rc = -EBADF;
goto out_drop_write;
}
- if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+ if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) {
rc = -EBADF;
cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
goto out_fput;
}
- src_inode = file_inode(src_file.file);
+ src_inode = file_inode(fd_file(src_file));
rc = -EINVAL;
if (S_ISDIR(src_inode->i_mode))
goto out_fput;
- rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
+ rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0,
src_inode->i_size, 0);
if (rc > 0)
rc = 0;
static int cifs_shutdown(struct super_block *sb, unsigned long arg)
{
struct cifs_sb_info *sbi = CIFS_SB(sb);
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
__u32 flags;
+ int rc;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (get_user(flags, (__u32 __user *)arg))
return -EFAULT;
- if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH)
- return -EINVAL;
+ tlink = cifs_sb_tlink(sbi);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+
+ trace_smb3_shutdown_enter(flags, tcon->tid);
+ if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH) {
+ rc = -EINVAL;
+ goto shutdown_out_err;
+ }
if (cifs_forced_shutdown(sbi))
- return 0;
+ goto shutdown_good;
cifs_dbg(VFS, "shut down requested (%d)", flags);
-/* trace_cifs_shutdown(sb, flags);*/
/*
* see:
*/
case CIFS_GOING_FLAGS_DEFAULT:
cifs_dbg(FYI, "shutdown with default flag not supported\n");
- return -EINVAL;
+ rc = -EINVAL;
+ goto shutdown_out_err;
/*
* FLAGS_LOGFLUSH is easy since it asks to write out metadata (not
* data) but metadata writes are not cached on the client, so can treat
case CIFS_GOING_FLAGS_LOGFLUSH:
case CIFS_GOING_FLAGS_NOLOGFLUSH:
sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN;
- return 0;
+ goto shutdown_good;
default:
- return -EINVAL;
+ rc = -EINVAL;
+ goto shutdown_out_err;
}
+
+shutdown_good:
+ trace_smb3_shutdown_done(flags, tcon->tid);
+ cifs_put_tlink(tlink);
return 0;
+shutdown_out_err:
+ trace_smb3_shutdown_err(rc, flags, tcon->tid);
+ cifs_put_tlink(tlink);
+ return rc;
}
static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in)
return error;
}
+/*
+ * Check that file2's metadata agree with the snapshot that we took for the
+ * range commit request.
+ *
+ * This should be called after the filesystem has locked /all/ inode metadata
+ * against modification.
+ */
+STATIC int
+xfs_exchrange_check_freshness(
+ const struct xfs_exchrange *fxr,
+ struct xfs_inode *ip2)
+{
+ struct inode *inode2 = VFS_I(ip2);
+ struct timespec64 ctime = inode_get_ctime(inode2);
+ struct timespec64 mtime = inode_get_mtime(inode2);
+
+ trace_xfs_exchrange_freshness(fxr, ip2);
+
+ /* Check that file2 hasn't otherwise been modified. */
+ if (fxr->file2_ino != ip2->i_ino ||
+ fxr->file2_gen != inode2->i_generation ||
+ !timespec64_equal(&fxr->file2_ctime, &ctime) ||
+ !timespec64_equal(&fxr->file2_mtime, &mtime))
+ return -EBUSY;
+
+ return 0;
+}
+
#define QRETRY_IP1 (0x1)
#define QRETRY_IP2 (0x2)
if (error || fxr->length == 0)
return error;
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
+ error = xfs_exchrange_check_freshness(fxr, ip2);
+ if (error)
+ return error;
+ }
+
/* Attach dquots to both inodes before changing block maps. */
error = xfs_qm_dqattach(ip2);
if (error)
if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
return -EXDEV;
- if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+ __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
return -EINVAL;
/* Userspace requests only honored for regular files. */
fxr.flags = args.flags;
file1 = fdget(args.file1_fd);
- if (!file1.file)
+ if (!fd_file(file1))
return -EBADF;
- fxr.file1 = file1.file;
+ fxr.file1 = fd_file(file1);
error = xfs_exchange_range(&fxr);
fdput(file1);
return error;
}
+
+/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
+struct xfs_commit_range_fresh {
+ xfs_fsid_t fsid; /* m_fixedfsid */
+ __u64 file2_ino; /* inode number */
+ __s64 file2_mtime; /* modification time */
+ __s64 file2_ctime; /* change time */
+ __s32 file2_mtime_nsec; /* mod time, nsec */
+ __s32 file2_ctime_nsec; /* change time, nsec */
+ __u32 file2_gen; /* inode generation */
+ __u32 magic; /* zero */
+};
+#define XCR_FRESH_MAGIC 0x444F524B /* DORK */
+
+/* Set up a commitrange operation by sampling file2's write-related attrs */
+long
+xfs_ioc_start_commit(
+ struct file *file,
+ struct xfs_commit_range __user *argp)
+{
+ struct xfs_commit_range args = { };
+ struct timespec64 ts;
+ struct xfs_commit_range_fresh *kern_f;
+ struct xfs_commit_range_fresh __user *user_f;
+ struct inode *inode2 = file_inode(file);
+ struct xfs_inode *ip2 = XFS_I(inode2);
+ const unsigned int lockflags = XFS_IOLOCK_SHARED |
+ XFS_MMAPLOCK_SHARED |
+ XFS_ILOCK_SHARED;
+
+ BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
+ sizeof(args.file2_freshness));
+
+ kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+ memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+ xfs_ilock(ip2, lockflags);
+ ts = inode_get_ctime(inode2);
+ kern_f->file2_ctime = ts.tv_sec;
+ kern_f->file2_ctime_nsec = ts.tv_nsec;
+ ts = inode_get_mtime(inode2);
+ kern_f->file2_mtime = ts.tv_sec;
+ kern_f->file2_mtime_nsec = ts.tv_nsec;
+ kern_f->file2_ino = ip2->i_ino;
+ kern_f->file2_gen = inode2->i_generation;
+ kern_f->magic = XCR_FRESH_MAGIC;
+ xfs_iunlock(ip2, lockflags);
+
+ user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
+ if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Exchange file1 and file2 contents if file2 has not been written since the
+ * start commit operation.
+ */
+long
+xfs_ioc_commit_range(
+ struct file *file,
+ struct xfs_commit_range __user *argp)
+{
+ struct xfs_exchrange fxr = {
+ .file2 = file,
+ };
+ struct xfs_commit_range args;
+ struct xfs_commit_range_fresh *kern_f;
+ struct xfs_inode *ip2 = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip2->i_mount;
+ struct fd file1;
+ int error;
+
+ kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+ if (kern_f->magic != XCR_FRESH_MAGIC)
+ return -EBUSY;
+ if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
+ return -EBUSY;
+
+ fxr.file1_offset = args.file1_offset;
+ fxr.file2_offset = args.file2_offset;
+ fxr.length = args.length;
+ fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
+ fxr.file2_ino = kern_f->file2_ino;
+ fxr.file2_gen = kern_f->file2_gen;
+ fxr.file2_mtime.tv_sec = kern_f->file2_mtime;
+ fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
+ fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
+ fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
+
+ file1 = fdget(args.file1_fd);
+ if (fd_empty(file1))
+ return -EBADF;
+ fxr.file1 = fd_file(file1);
+
+ error = xfs_exchange_range(&fxr);
+ fdput(file1);
+ return error;
+}
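A hedged sketch of how userspace is expected to drive the two new ioctls above. Only the fields visible in this hunk are used; the descriptor names, queue of includes and the helper name are illustrative, and struct xfs_commit_range plus the XFS_IOC_* numbers are assumed to come from the XFS uapi headers:

	#include <sys/ioctl.h>

	static int commit_staged_range(int file2_fd, int staging_fd,
				       unsigned long long length)
	{
		struct xfs_commit_range cr = { 0 };	/* cr.flags left at zero */

		cr.file1_fd     = staging_fd;	/* donor whose contents replace file2's range */
		cr.file1_offset = 0;
		cr.file2_offset = 0;
		cr.length       = length;

		/* 1. Snapshot file2's identity and timestamps into cr.file2_freshness. */
		if (ioctl(file2_fd, XFS_IOC_START_COMMIT, &cr) < 0)
			return -1;

		/* ... write the staged data into staging_fd, leaving cr.file2_freshness untouched ... */

		/* 2. Exchange the ranges only if file2 is unchanged since step 1; EBUSY otherwise. */
		return ioctl(file2_fd, XFS_IOC_COMMIT_RANGE, &cr);
	}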
/* Can't change realtime flag if any extents are allocated. */
if (ip->i_df.if_nextents || ip->i_delayed_blks)
return -EINVAL;
+
+ /*
+ * If S_DAX is enabled on this file, we can only switch the
+ * device if both support fsdax. We can't update S_DAX because
+ * there might be other threads walking down the access paths.
+ */
+ if (IS_DAX(VFS_I(ip)) &&
+ (mp->m_ddev_targp->bt_daxdev == NULL ||
+ (mp->m_rtdev_targp &&
+ mp->m_rtdev_targp->bt_daxdev == NULL)))
+ return -EINVAL;
}
if (rtflag) {
return error;
}
-STATIC int
-xfs_ioc_getfsmap(
- struct xfs_inode *ip,
- struct fsmap_head __user *arg)
-{
- struct xfs_fsmap_head xhead = {0};
- struct fsmap_head head;
- struct fsmap *recs;
- unsigned int count;
- __u32 last_flags = 0;
- bool done = false;
- int error;
-
- if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
- return -EFAULT;
- if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
- memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
- sizeof(head.fmh_keys[0].fmr_reserved)) ||
- memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
- sizeof(head.fmh_keys[1].fmr_reserved)))
- return -EINVAL;
-
- /*
- * Use an internal memory buffer so that we don't have to copy fsmap
- * data to userspace while holding locks. Start by trying to allocate
- * up to 128k for the buffer, but fall back to a single page if needed.
- */
- count = min_t(unsigned int, head.fmh_count,
- 131072 / sizeof(struct fsmap));
- recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
- if (!recs) {
- count = min_t(unsigned int, head.fmh_count,
- PAGE_SIZE / sizeof(struct fsmap));
- recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
- if (!recs)
- return -ENOMEM;
- }
-
- xhead.fmh_iflags = head.fmh_iflags;
- xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
- xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
-
- trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
- trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
-
- head.fmh_entries = 0;
- do {
- struct fsmap __user *user_recs;
- struct fsmap *last_rec;
-
- user_recs = &arg->fmh_recs[head.fmh_entries];
- xhead.fmh_entries = 0;
- xhead.fmh_count = min_t(unsigned int, count,
- head.fmh_count - head.fmh_entries);
-
- /* Run query, record how many entries we got. */
- error = xfs_getfsmap(ip->i_mount, &xhead, recs);
- switch (error) {
- case 0:
- /*
- * There are no more records in the result set. Copy
- * whatever we got to userspace and break out.
- */
- done = true;
- break;
- case -ECANCELED:
- /*
- * The internal memory buffer is full. Copy whatever
- * records we got to userspace and go again if we have
- * not yet filled the userspace buffer.
- */
- error = 0;
- break;
- default:
- goto out_free;
- }
- head.fmh_entries += xhead.fmh_entries;
- head.fmh_oflags = xhead.fmh_oflags;
-
- /*
- * If the caller wanted a record count or there aren't any
- * new records to return, we're done.
- */
- if (head.fmh_count == 0 || xhead.fmh_entries == 0)
- break;
-
- /* Copy all the records we got out to userspace. */
- if (copy_to_user(user_recs, recs,
- xhead.fmh_entries * sizeof(struct fsmap))) {
- error = -EFAULT;
- goto out_free;
- }
-
- /* Remember the last record flags we copied to userspace. */
- last_rec = &recs[xhead.fmh_entries - 1];
- last_flags = last_rec->fmr_flags;
-
- /* Set up the low key for the next iteration. */
- xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
- trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
- } while (!done && head.fmh_entries < head.fmh_count);
-
- /*
- * If there are no more records in the query result set and we're not
- * in counting mode, mark the last record returned with the LAST flag.
- */
- if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
- struct fsmap __user *user_rec;
-
- last_flags |= FMR_OF_LAST;
- user_rec = &arg->fmh_recs[head.fmh_entries - 1];
-
- if (copy_to_user(&user_rec->fmr_flags, &last_flags,
- sizeof(last_flags))) {
- error = -EFAULT;
- goto out_free;
- }
- }
-
- /* copy back header */
- if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
- error = -EFAULT;
- goto out_free;
- }
-
-out_free:
- kvfree(recs);
- return error;
-}
-
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
/* Pull information for the target fd */
f = fdget((int)sxp->sx_fdtarget);
- if (!f.file) {
+ if (!fd_file(f)) {
error = -EINVAL;
goto out;
}
- if (!(f.file->f_mode & FMODE_WRITE) ||
- !(f.file->f_mode & FMODE_READ) ||
- (f.file->f_flags & O_APPEND)) {
+ if (!(fd_file(f)->f_mode & FMODE_WRITE) ||
+ !(fd_file(f)->f_mode & FMODE_READ) ||
+ (fd_file(f)->f_flags & O_APPEND)) {
error = -EBADF;
goto out_put_file;
}
tmp = fdget((int)sxp->sx_fdtmp);
- if (!tmp.file) {
+ if (!fd_file(tmp)) {
error = -EINVAL;
goto out_put_file;
}
- if (!(tmp.file->f_mode & FMODE_WRITE) ||
- !(tmp.file->f_mode & FMODE_READ) ||
- (tmp.file->f_flags & O_APPEND)) {
+ if (!(fd_file(tmp)->f_mode & FMODE_WRITE) ||
+ !(fd_file(tmp)->f_mode & FMODE_READ) ||
+ (fd_file(tmp)->f_flags & O_APPEND)) {
error = -EBADF;
goto out_put_tmp_file;
}
- if (IS_SWAPFILE(file_inode(f.file)) ||
- IS_SWAPFILE(file_inode(tmp.file))) {
+ if (IS_SWAPFILE(file_inode(fd_file(f))) ||
+ IS_SWAPFILE(file_inode(fd_file(tmp)))) {
error = -EINVAL;
goto out_put_tmp_file;
}
* before we cast and access them as XFS structures as we have no
* control over what the user passes us here.
*/
- if (f.file->f_op != &xfs_file_operations ||
- tmp.file->f_op != &xfs_file_operations) {
+ if (fd_file(f)->f_op != &xfs_file_operations ||
+ fd_file(tmp)->f_op != &xfs_file_operations) {
error = -EINVAL;
goto out_put_tmp_file;
}
- ip = XFS_I(file_inode(f.file));
- tip = XFS_I(file_inode(tmp.file));
+ ip = XFS_I(file_inode(fd_file(f)));
+ tip = XFS_I(file_inode(fd_file(tmp)));
if (ip->i_mount != tip->i_mount) {
error = -EINVAL;
case XFS_IOC_EXCHANGE_RANGE:
return xfs_ioc_exchange_range(filp, arg);
+ case XFS_IOC_START_COMMIT:
+ return xfs_ioc_start_commit(filp, arg);
+ case XFS_IOC_COMMIT_RANGE:
+ return xfs_ioc_commit_range(filp, arg);
default:
return -ENOTTY;
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>
+#include <linux/err.h>
struct file;
fput(file);
}
+ /* either a reference to struct file + flags
+ * (cloned vs. borrowed, pos locked), with
+ * flags stored in lower bits of value,
+ * or empty (represented by 0).
+ */
struct fd {
- struct file *file;
- unsigned int flags;
+ unsigned long word;
};
#define FDPUT_FPUT 1
#define FDPUT_POS_UNLOCK 2
- static inline void fdput(struct fd fd)
+ #define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK)))
+ static inline bool fd_empty(struct fd f)
{
- if (fd.flags & FDPUT_FPUT)
- fput(fd.file);
+ return unlikely(!f.word);
}
- extern struct file *fget(unsigned int fd);
- extern struct file *fget_raw(unsigned int fd);
- extern struct file *fget_task(struct task_struct *task, unsigned int fd);
- extern unsigned long __fdget(unsigned int fd);
- extern unsigned long __fdget_raw(unsigned int fd);
- extern unsigned long __fdget_pos(unsigned int fd);
- extern void __f_unlock_pos(struct file *);
-
- static inline struct fd __to_fd(unsigned long v)
+ #define EMPTY_FD (struct fd){0}
+ static inline struct fd BORROWED_FD(struct file *f)
{
- return (struct fd){(struct file *)(v & ~3),v & 3};
+ return (struct fd){(unsigned long)f};
}
-
- static inline struct fd fdget(unsigned int fd)
+ static inline struct fd CLONED_FD(struct file *f)
{
- return __to_fd(__fdget(fd));
+ return (struct fd){(unsigned long)f | FDPUT_FPUT};
}
- static inline struct fd fdget_raw(unsigned int fd)
+ static inline void fdput(struct fd fd)
{
- return __to_fd(__fdget_raw(fd));
+ if (fd.word & FDPUT_FPUT)
+ fput(fd_file(fd));
}
- static inline struct fd fdget_pos(int fd)
- {
- return __to_fd(__fdget_pos(fd));
- }
+ extern struct file *fget(unsigned int fd);
+ extern struct file *fget_raw(unsigned int fd);
+ extern struct file *fget_task(struct task_struct *task, unsigned int fd);
+ extern void __f_unlock_pos(struct file *);
+
+ struct fd fdget(unsigned int fd);
+ struct fd fdget_raw(unsigned int fd);
+ struct fd fdget_pos(unsigned int fd);
static inline void fdput_pos(struct fd f)
{
- if (f.flags & FDPUT_POS_UNLOCK)
- __f_unlock_pos(f.file);
+ if (f.word & FDPUT_POS_UNLOCK)
+ __f_unlock_pos(fd_file(f));
fdput(f);
}
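With the packed representation above, callers stop poking at f.file / f.flags and go through the accessors instead. A minimal caller-side sketch; the fd number and file_operations pointer are illustrative:

	static int use_fd_example(unsigned int ufd, const struct file_operations *expected_fops)
	{
		struct fd f = fdget(ufd);

		if (fd_empty(f))				/* was: if (!f.file) */
			return -EBADF;
		if (fd_file(f)->f_op != expected_fops) {	/* was: f.file->f_op */
			fdput(f);
			return -EINVAL;
		}
		/* ... use fd_file(f) here ... */
		fdput(f);	/* calls fput() only if FDPUT_FPUT is set in f.word */
		return 0;
	}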
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
get_unused_fd_flags(flags), unsigned flags)
+DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))
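The fput cleanup definition above enables scope-based file references. A hedged usage sketch; filp_open() is real, but the path and function name here are placeholders:

	static int read_config_example(void)
	{
		struct file *f __free(fput) = filp_open("/tmp/example", O_RDONLY, 0);

		if (IS_ERR(f))
			return PTR_ERR(f);	/* cleanup skips IS_ERR_OR_NULL, so the early return is safe */
		/* ... read from f ... */
		return 0;			/* f is fput() automatically when it leaves scope */
	}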
/*
* take_fd() will take care to set @fd to -EBADF ensuring that
*
* f = dentry_open(&path, O_RDONLY, current_cred());
* if (IS_ERR(f))
- * return PTR_ERR(fd);
+ * return PTR_ERR(f);
*
* fd_install(fd, f);
* return take_fd(fd);
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/security.h>
+#include <linux/cpuset.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
void io_sq_thread_park(struct io_sq_data *sqd)
__acquires(&sqd->lock)
{
- WARN_ON_ONCE(sqd->thread == current);
+ WARN_ON_ONCE(data_race(sqd->thread) == current);
atomic_inc(&sqd->park_pending);
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
struct fd f;
f = fdget(p->wq_fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-ENXIO);
- if (!io_is_uring_fops(f.file)) {
+ if (!io_is_uring_fops(fd_file(f))) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- ctx_attach = f.file->private_data;
+ ctx_attach = fd_file(f)->private_data;
sqd = ctx_attach->sq_data;
if (!sqd) {
fdput(f);
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
+ if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
struct fd f;
f = fdget(p->wq_fd);
- if (!f.file)
+ if (!fd_file(f))
return -ENXIO;
- if (!io_is_uring_fops(f.file)) {
+ if (!io_is_uring_fops(fd_file(f))) {
fdput(f);
return -EINVAL;
}
return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
+ struct cpumask allowed_mask;
int cpu = p->sq_thread_cpu;
ret = -EINVAL;
- if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ cpuset_cpus_allowed(current, &allowed_mask);
+ if (!cpumask_test_cpu(cpu, &allowed_mask))
goto err_sqpoll;
sqd->sq_cpu = cpu;
} else {
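From the application side, the stricter check above affects rings created with SQPOLL CPU pinning: the requested CPU must now be inside the caller's cpuset, not merely online. A hedged userspace sketch (queue depth and CPU number are illustrative; the raw syscall is used for brevity):

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int setup_pinned_sqpoll_ring(unsigned int cpu)
	{
		struct io_uring_params p = { 0 };

		p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
		p.sq_thread_cpu = cpu;	/* must be allowed by the caller's cpuset */
		return syscall(__NR_io_uring_setup, 64, &p);	/* fails with EINVAL otherwise */
	}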
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_FMODRET,
- BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_CGROUP,
BTF_KFUNC_HOOK_SCHED_ACT,
BTF_KFUNC_HOOK_SK_SKB,
BTF_KFUNC_HOOK_SOCKET_FILTER,
return NULL;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset)
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
return !*src;
}
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
-{
- return __btf_name_valid(btf, offset);
-}
-
/* Allow any printable character in DATASEC names */
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
+ if (!*src)
+ return false;
+
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
- src++;
while (*src && src < src_limit) {
if (!isprint(*src))
return false;
return -EINVAL;
}
+/* Callers have to ensure the life cycle of btf if it is program BTF */
static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
struct btf_field_info *info)
{
field->kptr.dtor = NULL;
id = info->kptr.type_id;
kptr_btf = (struct btf *)btf;
- btf_get(kptr_btf);
goto found_dtor;
}
if (id < 0)
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off)) {
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
static struct btf_struct_metas *
btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
{
- union {
- struct btf_id_set set;
- struct {
- u32 _cnt;
- u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
- } _arr;
- } aof;
struct btf_struct_metas *tab = NULL;
+ struct btf_id_set *aof;
int i, n, id, ret;
BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
- memset(&aof, 0, sizeof(aof));
+ aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN);
+ if (!aof)
+ return ERR_PTR(-ENOMEM);
+ aof->cnt = 0;
+
for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
/* Try to find whether this special type exists in user BTF, and
* if so remember its ID so we can easily find it among members
* of structs that we iterate in the next loop.
*/
+ struct btf_id_set *new_aof;
+
id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
if (id < 0)
continue;
- aof.set.ids[aof.set.cnt++] = id;
+
+ new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = id;
}
- if (!aof.set.cnt)
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ /* Try to find if there are kptrs in user BTF and remember their ID */
+ struct btf_id_set *new_aof;
+ struct btf_field_info tmp;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free_aof;
+ }
+
+ ret = btf_find_kptr(btf, t, 0, 0, &tmp);
+ if (ret != BTF_FIELD_FOUND)
+ continue;
+
+ new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = i;
+ }
+
+ if (!aof->cnt) {
+ kfree(aof);
return NULL;
- sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+ }
+ sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL);
- n = btf_nr_types(btf);
for (i = 1; i < n; i++) {
struct btf_struct_metas *new_tab;
const struct btf_member *member;
int j, tab_cnt;
t = btf_type_by_id(btf, i);
- if (!t) {
- ret = -EINVAL;
- goto free;
- }
if (!__btf_type_is_struct(t))
continue;
cond_resched();
for_each_member(j, t, member) {
- if (btf_id_set_contains(&aof.set, member->type))
+ if (btf_id_set_contains(aof, member->type))
goto parse;
}
continue;
type = &tab->types[tab->cnt];
type->btf_id = i;
record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
- BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
+ BPF_KPTR, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
type->record = record;
tab->cnt++;
}
+ kfree(aof);
return tab;
free:
btf_struct_metas_free(tab);
+free_aof:
+ kfree(aof);
return ERR_PTR(ret);
}
btf->kernel_btf = true;
snprintf(btf->name, sizeof(btf->name), "%s", module_name);
- btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
+ btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
err = -ENOMEM;
goto errout;
}
- memcpy(btf->data, data, data_size);
btf->data_size = data_size;
err = btf_parse_hdr(env);
errout:
btf_verifier_env_free(env);
- if (base_btf != vmlinux_btf)
+ if (!IS_ERR(base_btf) && base_btf != vmlinux_btf)
btf_free(base_btf);
if (btf) {
kvfree(btf->data);
if (arg == nr_args) {
switch (prog->expected_attach_type) {
- case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
+ /* mark we are accessing the return value */
+ info->is_retval = true;
+ fallthrough;
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
if (prog_args_trusted(prog))
info->reg_type |= PTR_TRUSTED;
+ if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+ info->reg_type |= PTR_MAYBE_NULL;
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &btf_fops) {
+ if (fd_file(f)->f_op != &btf_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- btf = f.file->private_data;
+ btf = fd_file(f)->private_data;
refcount_inc(&btf->refcnt);
fdput(f);
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
+/* Validate well-formedness of iter argument type.
+ * On success, return positive BTF ID of iter state's STRUCT type.
+ * On error, negative error is returned.
+ */
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ const char *name;
+ int btf_id;
+
+ if (btf_type_vlen(func) <= arg_idx)
+ return -EINVAL;
+
+ arg = &btf_params(func)[arg_idx];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, &btf_id);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ return btf_id;
+}
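For context, the shape of the open-coded iterator kfuncs whose first argument this helper validates; bpf_iter_num is the canonical in-tree example, reproduced here from memory as a hedged sketch rather than copied from this series:

	struct bpf_iter_num {
		__u64 __opaque[1];	/* opaque state: 8 bytes, 8-byte aligned, matching the size checks below */
	} __attribute__((aligned(8)));

	int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end);
	int *bpf_iter_num_next(struct bpf_iter_num *it);
	void bpf_iter_num_destroy(struct bpf_iter_num *it);

The struct name must begin with ITER_PREFIX ("bpf_iter_"), which is exactly what the strncmp() above enforces.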
+
static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
const struct btf_type *func, u32 func_flags)
{
u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
- const char *name, *sfx, *iter_name;
- const struct btf_param *arg;
+ const char *sfx, *iter_name;
const struct btf_type *t;
char exp_name[128];
u32 nr_args;
+ int btf_id;
/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
if (!flags || (flags & (flags - 1)))
if (nr_args < 1)
return -EINVAL;
- arg = &btf_params(func)[0];
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!t || !btf_type_is_ptr(t))
- return -EINVAL;
- t = btf_type_skip_modifiers(btf, t->type, NULL);
- if (!t || !__btf_type_is_struct(t))
- return -EINVAL;
-
- name = btf_name_by_offset(btf, t->name_off);
- if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
- return -EINVAL;
+ btf_id = btf_check_iter_arg(btf, func, 0);
+ if (btf_id < 0)
+ return btf_id;
/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
* fit nicely in stack slots
*/
+ t = btf_type_by_id(btf, btf_id);
if (t->size == 0 || (t->size % 8))
return -EINVAL;
/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
* naming pattern
*/
- iter_name = name + sizeof(ITER_PREFIX) - 1;
+ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1;
if (flags & KF_ITER_NEW)
sfx = "new";
else if (flags & KF_ITER_NEXT)
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
case BPF_PROG_TYPE_SK_SKB:
struct bpf_core_cand_list cands = {};
struct bpf_core_relo_res targ_res;
struct bpf_core_spec *specs;
+ const struct btf_type *type;
int err;
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
if (!specs)
return -ENOMEM;
+ type = btf_type_by_id(ctx->btf, relo->type_id);
+ if (!type) {
+ bpf_log(ctx->log, "relo #%u: bad type id %u\n",
+ relo_idx, relo->type_id);
+ return -EINVAL;
+ }
+
if (need_cands) {
struct bpf_cand_cache *cc;
int i;
case BPF_KPTR_PERCPU:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
- btf_put(rec->fields[i].kptr.btf);
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ btf_put(rec->fields[i].kptr.btf);
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
- btf_get(fields[i].kptr.btf);
+ if (btf_is_kernel(fields[i].kptr.btf))
+ btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
ret = -ENXIO;
goto free;
}
}
-/* called from workqueue */
-static void bpf_map_free_deferred(struct work_struct *work)
+static void bpf_map_free(struct bpf_map *map)
{
- struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
struct btf *btf = map->btf;
- security_bpf_map_free(map);
- bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
/* Delay freeing of btf_record for maps, as map_free
btf_put(btf);
}
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ security_bpf_map_free(map);
+ bpf_map_release_memcg(map);
+ bpf_map_free(map);
+}
+
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->usercnt)) {
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
- fmode_t mode = f.file->f_mode;
+ fmode_t mode = fd_file(f)->f_mode;
/* Our file permissions may have been overridden by global
* map permissions facing syscall side.
free_map_sec:
security_bpf_map_free(map);
free_map:
- btf_put(map->btf);
- map->ops->map_free(map);
+ bpf_map_free(map);
put_token:
bpf_token_put(token);
return err;
*/
struct bpf_map *__bpf_map_get(struct fd f)
{
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_map_fops) {
+ if (fd_file(f)->f_op != &bpf_map_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- return f.file->private_data;
+ return fd_file(f)->private_data;
}
void bpf_map_inc(struct bpf_map *map)
goto free_key;
}
- err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+ err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
if (!err)
maybe_wait_bpf_programs(map);
static struct bpf_prog *____bpf_prog_get(struct fd f)
{
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_prog_fops) {
+ if (fd_file(f)->f_op != &bpf_prog_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- return f.file->private_data;
+ return fd_file(f)->private_data;
}
void bpf_prog_add(struct bpf_prog *prog, int i)
struct fd f = fdget(ufd);
struct bpf_link *link;
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) {
+ if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- link = f.file->private_data;
+ link = fd_file(f)->private_data;
bpf_link_inc(link);
fdput(f);
return -EINVAL;
f = fdget(ufd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADFD;
- if (f.file->f_op == &bpf_prog_fops)
- err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
+ if (fd_file(f)->f_op == &bpf_prog_fops)
+ err = bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &bpf_map_fops)
- err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
+ else if (fd_file(f)->f_op == &bpf_map_fops)
+ err = bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &btf_fops)
- err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
- else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll)
- err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
+ else if (fd_file(f)->f_op == &btf_fops)
+ err = bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
+ err = bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
attr, uattr);
else
err = -EINVAL;
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr);
else
BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
return bpf_token_create(attr);
}
-static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
+ *res = 0;
if (flags)
return -EINVAL;
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
static const struct bpf_func_proto *
return -EINVAL;
}
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return &ctx->peak;
+}
+
static void apply_cgroup_root_flags(unsigned int root_flags)
{
if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
.fs_flags = FS_USERNS_MOUNT,
};
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct cgroup_subsys_state *css;
+ int dying_cnt[CGROUP_SUBSYS_COUNT];
+ int ssid;
seq_printf(seq, "nr_descendants %d\n",
cgroup->nr_descendants);
+
+ /*
+ * Show the number of live and dying csses associated with each of
+ * non-inhibited cgroup subsystems that is bound to cgroup v2.
+ *
+ * Without proper lock protection, racing is possible. So the
+ * numbers may not be consistent when that happens.
+ */
+ rcu_read_lock();
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ dying_cnt[ssid] = -1;
+ if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+ (cgroup_subsys[ssid]->root != &cgrp_dfl_root))
+ continue;
+ css = rcu_dereference_raw(cgroup->subsys[ssid]);
+ dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+ seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+ css ? (css->nr_descendants + 1) : 0);
+ }
+
seq_printf(seq, "nr_dying_descendants %d\n",
cgroup->nr_dying_descendants);
-
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ if (dying_cnt[ssid] >= 0)
+ seq_printf(seq, "nr_dying_subsys_%s %d\n",
+ cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+ }
+ rcu_read_unlock();
return 0;
}
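Given the seq_printf() formats above, a cgroup.stat read on a v2 hierarchy now gains per-subsystem live and dying css counts; roughly like the following, where the counts are placeholders and only controllers bound to the default hierarchy appear:

	nr_descendants 4
	nr_subsys_memory 5
	nr_subsys_pids 5
	nr_dying_descendants 1
	nr_dying_subsys_memory 1
	nr_dying_subsys_pids 0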
* If namespaces are delegation boundaries, disallow writes to
* files in a non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
- * cgroup.procs and cgroup.subtree_control.
+ * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @root are accessible and @pos is a descendant of @root.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct rightmost descendant as
- * long as @pos is accessible.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct rightmost descendant as long as @pos
+ * is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @cgroup are accessible and @pos is a descendant of
- * @cgroup.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
list_del_rcu(&css->sibling);
if (ss) {
+ struct cgroup *parent_cgrp;
+
/* css release path */
if (!list_empty(&css->rstat_css_node)) {
cgroup_rstat_flush(cgrp);
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
+
+ cgrp->nr_dying_subsys[ss->id]--;
+ /*
+ * When a css is released and ready to be freed, its
+ * nr_descendants must be zero. However, the corresponding
+ * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem
+ * is activated and deactivated multiple times with one or
+ * more of its previous activations leaving behind dying csses.
+ */
+ WARN_ON_ONCE(css->nr_descendants);
+ parent_cgrp = cgroup_parent(cgrp);
+ while (parent_cgrp) {
+ parent_cgrp->nr_dying_subsys[ss->id]--;
+ parent_cgrp = cgroup_parent(parent_cgrp);
+ }
} else {
struct cgroup *tcgrp;
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
atomic_inc(&css->online_cnt);
- if (css->parent)
+ if (css->parent) {
atomic_inc(&css->parent->online_cnt);
+ while ((css = css->parent))
+ css->nr_descendants++;
+ }
}
return ret;
}
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
}
/**
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
{
struct cgroup *cgrp;
struct fd f = fdget_raw(fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(f.file);
+ cgrp = cgroup_v1v2_get_from_file(fd_file(f));
fdput(f);
return cgrp;
}
return data.ret;
}
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x01,
+ EVENT_PINNED = 0x02,
+ EVENT_TIME = 0x04,
+ EVENT_FROZEN = 0x08,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x10,
+ EVENT_CGROUP = 0x20,
+
+ /* compound helpers */
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+ EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+ raw_spin_lock(&ctx->lock);
+ WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- raw_spin_lock(&cpuctx->ctx.lock);
+ __perf_ctx_lock(&cpuctx->ctx);
if (ctx)
- raw_spin_lock(&ctx->lock);
+ __perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx);
}
#define TASK_TOMBSTONE ((void *)-1L)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct perf_cpu_context *cpuctx;
struct event_function_struct efs = {
.event = event,
.func = func,
if (!task_function_call(task, event_function, &efs))
return;
- raw_spin_lock_irq(&ctx->lock);
+ local_irq_disable();
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
- if (task == TASK_TOMBSTONE) {
- raw_spin_unlock_irq(&ctx->lock);
- return;
- }
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
goto again;
}
func(event, NULL, ctx, data);
- raw_spin_unlock_irq(&ctx->lock);
+unlock:
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
}
/*
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_TIME = 0x4,
- /* see ctx_resched() for details */
- EVENT_CPU = 0x8,
- EVENT_CGROUP = 0x10,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
/*
* perf_sched_events : >0 events exist
*/
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
___p; \
})
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
- }
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
- }
}
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
struct fd f = fdget(fd);
int ret = 0;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_path.dentry,
+ css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
* perf_event_context::mutex
* perf_event::child_mutex;
* perf_event_context::lock
- * perf_event::mmap_mutex
* mmap_lock
+ * perf_event::mmap_mutex
+ * perf_buffer::aux_mutex
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
event = rb_entry_safe(rb_next(&event->group_node), \
typeof(*event), group_node))
+/*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
event_sched_out(event, ctx);
}
+static inline void
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, final);
+ }
+}
+
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ __ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ ctx_time_update(cpuctx, ctx);
+ if (ctx->is_active & EVENT_TIME)
+ ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+}
+
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
- }
+ ctx_time_update(cpuctx, ctx);
/*
* Ensure event_sched_out() switches to OFF, at the very least
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
-
perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_time_update_event(ctx, event);
if (event == event->group_leader)
group_sched_out(event, ctx);
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
- enum event_type_t event_type)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, event_type);
+ ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
if (task_ctx) {
- perf_ctx_disable(task_ctx, false);
- task_ctx_sched_out(task_ctx, event_type);
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
* - otherwise, do nothing more.
*/
if (cpu_event)
- ctx_sched_out(&cpuctx->ctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx);
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
- perf_ctx_enable(&cpuctx->ctx, false);
- if (task_ctx)
- perf_ctx_enable(task_ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+ }
}
void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
#endif
if (reprogram) {
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
event->state <= PERF_EVENT_STATE_ERROR)
return;
- if (ctx->is_active)
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
if (!ctx->is_active)
return;
- if (!event_filter_match(event)) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (!event_filter_match(event))
return;
- }
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
- }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
struct perf_event *event, *tmp;
struct pmu *pmu = pmu_ctx->pmu;
- if (ctx->task && !ctx->is_active) {
+ if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
struct perf_cpu_pmu_context *cpc;
cpc = this_cpu_ptr(pmu->cpu_pmu_context);
cpc->task_epc = NULL;
}
- if (!event_type)
+ if (!(event_type & EVENT_ALL))
return;
perf_pmu_disable(pmu);
perf_pmu_enable(pmu);
}
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
*
* would only update time for the pinned events.
*/
- if (is_active & EVENT_TIME) {
- /* update (and stop) ctx time */
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ ctx->is_active &= ~event_type;
+
+ if (!(ctx->is_active & EVENT_ALL)) {
/*
- * CPU-release for the below ->is_active store,
- * see __load_acquire() in perf_event_time_now()
+ * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+ * does not observe a hole. perf_ctx_unlock() will clean up.
*/
- barrier();
+ if (ctx->is_active & EVENT_FROZEN)
+ ctx->is_active &= EVENT_TIME_FROZEN;
+ else
+ ctx->is_active = 0;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
+ if (!(ctx->is_active & EVENT_ALL))
cpuctx->task_ctx = NULL;
}
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
- }
}
/*
perf_ctx_disable(ctx, false);
- /* PMIs are disabled; ctx->nr_pending is stable. */
- if (local_read(&ctx->nr_pending) ||
- local_read(&next_ctx->nr_pending)) {
+ /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
+ if (local_read(&ctx->nr_no_switch_fast) ||
+ local_read(&next_ctx->nr_no_switch_fast)) {
/*
* Must not swap out ctx when there's pending
* events that rely on the ctx->task relation.
+ *
+ * Likewise, when a context contains inherit +
+ * SAMPLE_READ events they should be switched
+ * out using the slow path so that they are
+ * treated as if they were distinct contexts.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock();
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
- task_ctx_sched_out(ctx, EVENT_ALL);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
merge_sched_in, &can_add_hw);
}
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
- struct perf_event_groups *groups,
- bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- struct perf_event_pmu_context *pmu_ctx;
-
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
- pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
- }
-}
+ struct perf_event_context *ctx = pmu_ctx->ctx;
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
- struct pmu *pmu)
-{
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
- if (!is_active)
+ if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
static void perf_event_context_sched_in(struct task_struct *task)
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx);
+ perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period);
- delta = (delta + 7) / 8; /* low pass filter */
+ if (delta >= 0)
+ delta += 7;
+ else
+ delta -= 7;
+ delta /= 8; /* low pass filter */
sample_period = hwc->sample_period + delta;
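A quick sanity check of the sign handling above: with delta = -3 the old expression computed (-3 + 7) / 8 = 0, so a small negative adjustment was silently dropped and the sample period never shrank, whereas the new form computes (-3 - 7) / 8 = -1 and the adjustment takes effect; positive deltas round exactly as before, e.g. (3 + 7) / 8 = 1 in both versions.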
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
- __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
- __pmu_ctx_sched_in(task_epc->ctx, pmu);
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
- } else {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
int ret;
};
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
+
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
+ int local_cpu = smp_processor_id();
u16 local_pkg, event_pkg;
if ((unsigned)event_cpu >= nr_cpu_ids)
return event_cpu;
- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
- int local_cpu = smp_processor_id();
+ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
+
+ if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
+ return local_cpu;
+ }
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
event_pkg = topology_physical_package_id(event_cpu);
local_pkg = topology_physical_package_id(local_cpu);
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (data->group)
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
{
+ if (self)
+ return local64_read(&event->count);
+
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (group)
*/
if (task_work_cancel(current, head)) {
event->pending_work = 0;
- local_dec(&event->ctx->nr_pending);
+ local_dec(&event->ctx->nr_no_switch_fast);
return;
}
mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false);
- total += perf_event_count(event);
+ total += perf_event_count(event, false);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
list_for_each_entry(child, &event->child_list, child_list) {
(void)perf_event_read(child, false);
- total += perf_event_count(child);
+ total += perf_event_count(child, false);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
/*
* Write {count,id} tuples for every sibling.
*/
- values[n++] += perf_event_count(leader);
+ values[n++] += perf_event_count(leader, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
- values[n++] += perf_event_count(sub);
+ values[n++] += perf_event_count(sub, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
static inline int perf_fget_light(int fd, struct fd *p)
{
struct fd f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (f.file->f_op != &perf_fops) {
+ if (fd_file(f)->f_op != &perf_fops) {
fdput(f);
return -EBADF;
}
ret = perf_fget_light(arg, &output);
if (ret)
return ret;
- output_event = output.file->private_data;
+ output_event = fd_file(output)->private_data;
ret = perf_event_set_output(event, output_event);
fdput(output);
} else {
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = perf_event_count(event);
+ userpg->offset = perf_event_count(event, false);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
event->pmu->event_unmapped(event, vma->vm_mm);
/*
- * rb->aux_mmap_count will always drop before rb->mmap_count and
- * event->mmap_count, so it is ok to use event->mmap_mutex to
- * serialize with perf_mmap here.
+ * The AUX buffer is strictly a sub-buffer; serialize using aux_mutex
+ * to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
rb_free_aux(rb);
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
- mutex_unlock(&event->mmap_mutex);
+ mutex_unlock(&rb->aux_mutex);
}
if (atomic_dec_and_test(&rb->mmap_count))
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct mutex *aux_mutex = NULL;
struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
unsigned long vma_size;
if (!rb)
goto aux_unlock;
+ aux_mutex = &rb->aux_mutex;
+ mutex_lock(aux_mutex);
+
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
atomic_dec(&rb->mmap_count);
}
aux_unlock:
+ if (aux_mutex)
+ mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
/*
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
- local_dec(&event->ctx->nr_pending);
+ local_dec(&event->ctx->nr_no_switch_fast);
rcuwait_wake_up(&event->pending_work_wait);
}
rcu_read_unlock();
u64 values[5];
int n = 0;
- values[n++] = perf_event_count(event);
+ values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
}
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event,
- u64 enabled, u64 running)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
unsigned long flags;
u64 values[6];
int n = 0;
+ bool self = has_inherit_and_sample_read(&event->attr);
/*
* Disabling interrupts avoids all counter scheduling
(leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
- values[n++] = perf_event_count(leader);
+ values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
- values[n++] = perf_event_count(sub);
+ values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
* The problem is that it's both hard and excessively expensive to iterate the
* child list, not to mention that it's impossible to IPI the children running
* on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
if (atomic_read(&nr_build_id_events))
- build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+ build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
perf_iterate_sb(perf_event_mmap_output,
mmap_event,
ret = __perf_event_account_interrupt(event, throttle);
- if (event->prog && !bpf_overflow_handler(event, data, regs))
+ if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ !bpf_overflow_handler(event, data, regs))
return ret;
/*
if (!event->pending_work &&
!task_work_add(current, &event->pending_task, notify_mode)) {
event->pending_work = pending_id;
- local_inc(&event->ctx->nr_pending);
+ local_inc(&event->ctx->nr_no_switch_fast);
event->pending_addr = 0;
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
NULL,
};
if (n == 2 && !pmu->nr_addr_filters)
return 0;
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
return a->mode;
}
goto free_pdc;
}
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
if (type >= 0)
event_has_any_exclude_flag(event))
ret = -EINVAL;
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+ int cpu;
+
+ if (pmu_cpumask && cpumask) {
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ ret = -ENODEV;
+ else
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
+ } else {
+ ret = -ENODEV;
+ }
+ }
+
if (ret && event->destroy)
event->destroy(event);
}
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * We do not support PERF_SAMPLE_READ on inherited events unless
+ * PERF_SAMPLE_TID is also selected, which allows inherited events to
+ * collect per-thread samples.
* See perf_output_read().
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
goto err_ns;
if (!has_branch_stack(event))
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct fd group = {NULL, 0};
+ struct fd group = EMPTY_FD;
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
err = perf_fget_light(group_fd, &group);
if (err)
goto err_fd;
- group_leader = group.file->private_data;
+ group_leader = fd_file(group)->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
perf_event_read_event(child_event, task);
}
- child_val = perf_event_count(child_event);
+ child_val = perf_event_count(child_event, false);
/*
* Add back the child's count to the parent's count:
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
return &event->attr;
}
+int perf_allow_kernel(struct perf_event_attr *attr)
+{
+ if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
+ return -EACCES;
+
+ return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
+}
+EXPORT_SYMBOL_GPL(perf_allow_kernel);
+
/*
* Inherit an event from parent task to child task.
*
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
}
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
+ struct pmu *pmu;
+
+ cpumask_clear_cpu(cpu, perf_online_mask);
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before __perf_event_exit_context().
+ */
+ perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
- cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
#else
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ /*
+ * Early boot stage: the cpumasks haven't been set up yet.
+ * The perf_online_<domain>_masks include the first CPU of each domain,
+ * so unconditionally add the boot CPU to each of them.
+ */
+ if (cpumask_empty(perf_online_mask)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ goto end;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+end:
+ cpumask_set_cpu(cpu, perf_online_mask);
+}
+
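/*
 * Standalone illustrative sketch, not from the patch: the setup/clear helpers
 * above keep one representative online CPU per topology domain in each
 * perf_online_<scope> mask, migrating the representative on hot-unplug.
 * Hypothetical fixed topology (4 CPUs per package) and plain 64-bit bitmasks
 * stand in for the kernel's cpumask API.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t online;		/* all online CPUs */
static uint64_t scope_pkg;	/* one representative CPU per package */

static uint64_t pkg_mask(int cpu)
{
	return 0xfULL << (cpu & ~3);	/* CPUs 0-3: pkg 0, 4-7: pkg 1, ... */
}

static void cpu_online_event(int cpu)
{
	if (!(scope_pkg & pkg_mask(cpu)))	/* first CPU of its package */
		scope_pkg |= 1ULL << cpu;
	online |= 1ULL << cpu;
}

static void cpu_offline_event(int cpu)
{
	uint64_t siblings;

	online &= ~(1ULL << cpu);
	if (!(scope_pkg & (1ULL << cpu)))
		return;				/* not the representative */
	scope_pkg &= ~(1ULL << cpu);
	siblings = pkg_mask(cpu) & online;
	if (siblings)				/* migrate to any remaining sibling */
		scope_pkg |= siblings & -siblings;
}

int main(void)
{
	for (int cpu = 0; cpu < 8; cpu++)
		cpu_online_event(cpu);
	printf("scope_pkg=%#llx\n", (unsigned long long)scope_pkg);	/* 0x11 */
	cpu_offline_event(0);
	printf("scope_pkg=%#llx\n", (unsigned long long)scope_pkg);	/* 0x12 */
	return 0;
}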
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
- cpumask_set_cpu(cpu, perf_online_mask);
+ perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
struct idempotent *existing;
bool first;
- u->ret = 0;
+ u->ret = -EINTR;
u->cookie = cookie;
init_completion(&u->complete);
hlist_for_each_entry_safe(pos, next, head, entry) {
if (pos->cookie != cookie)
continue;
- hlist_del(&pos->entry);
+ hlist_del_init(&pos->entry);
pos->ret = ret;
complete(&pos->complete);
}
return ret;
}
+/*
+ * Wait for the idempotent worker.
+ *
+ * If we get interrupted, we need to remove ourselves from the
+ * idempotent list, and the completion may still come in.
+ *
+ * The 'idem_lock' protects against the race, and 'idem.ret' was
+ * initialized to -EINTR and is thus always the right return
+ * value even if the idempotent work then completes between
+ * the wait_for_completion and the cleanup.
+ */
+static int idempotent_wait_for_completion(struct idempotent *u)
+{
+ if (wait_for_completion_interruptible(&u->complete)) {
+ spin_lock(&idem_lock);
+ if (!hlist_unhashed(&u->entry))
+ hlist_del(&u->entry);
+ spin_unlock(&idem_lock);
+ }
+ return u->ret;
+}
+
static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
{
struct load_info info = { };
if (!f || !(f->f_mode & FMODE_READ))
return -EBADF;
- /* See if somebody else is doing the operation? */
- if (idempotent(&idem, file_inode(f))) {
- wait_for_completion(&idem.complete);
- return idem.ret;
+ /* Are we the winners of the race and get to do this? */
+ if (!idempotent(&idem, file_inode(f))) {
+ int ret = init_module_from_file(f, uargs, flags);
+ return idempotent_complete(&idem, ret);
}
- /* Otherwise, we'll do it and complete others */
- return idempotent_complete(&idem,
- init_module_from_file(f, uargs, flags));
+ /*
+ * Somebody else won the race and is loading the module.
+ */
+ return idempotent_wait_for_completion(&idem);
}
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
return -EINVAL;
f = fdget(fd);
- err = idempotent_init_module(f.file, uargs, flags);
+ err = idempotent_init_module(fd_file(f), uargs, flags);
fdput(f);
return err;
}
}
/*
- * Dequeue a signal and return the element to the caller, which is
- * expected to free it.
- *
- * All callers have to hold the siglock.
+ * Try to dequeue a signal. If a deliverable signal is found, fill in the
+ * caller-provided siginfo and return the signal number. Otherwise return
+ * 0.
*/
-int dequeue_signal(struct task_struct *tsk, sigset_t *mask,
- kernel_siginfo_t *info, enum pid_type *type)
+int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
{
+ struct task_struct *tsk = current;
bool resched_timer = false;
int signr;
- /* We only dequeue private signals from ourselves, we don't let
- * signalfd steal them
- */
+ lockdep_assert_held(&tsk->sighand->siglock);
+
*type = PIDTYPE_PID;
signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
void sigqueue_free(struct sigqueue *q)
{
- unsigned long flags;
spinlock_t *lock = ¤t->sighand->siglock;
+ unsigned long flags;
- BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+ if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
+ return;
/*
* We must hold ->siglock while testing q->list
* to serialize with collect_signal() or with
unsigned long flags;
int ret, result;
- BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+ if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
+ return 0;
+ if (WARN_ON_ONCE(q->info.si_code != SI_TIMER))
+ return 0;
ret = -1;
rcu_read_lock();
* If an SI_TIMER entry is already queued, just increment
* the overrun count.
*/
- BUG_ON(q->info.si_code != SI_TIMER);
q->info.si_overrun++;
result = TRACE_SIGNAL_ALREADY_PENDING;
goto out;
type = PIDTYPE_PID;
signr = dequeue_synchronous_signal(&ksig->info);
if (!signr)
- signr = dequeue_signal(current, ¤t->blocked,
- &ksig->info, &type);
+ signr = dequeue_signal(¤t->blocked, &ksig->info, &type);
if (!signr)
break; /* will return 0 */
current->flags |= PF_SIGNALED;
if (sig_kernel_coredump(signr)) {
+ int ret;
+
if (print_fatal_signals)
print_fatal_signal(signr);
proc_coredump_connector(current);
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(&ksig->info);
+ ret = do_coredump(&ksig->info);
+ if (ret)
+ coredump_report_failure("coredump has not been created, error %d",
+ ret);
+ else if (!IS_ENABLED(CONFIG_COREDUMP)) {
+ /*
+ * Coredump support is compiled out, so collecting a coredump
+ * cannot fail here.
+ *
+ * Still leave a note that the coredump will not be created. This
+ * is neither an error nor a warning: disabling coredump support
+ * is a deliberate configuration choice, so simply let the user
+ * know that everything works as configured.
+ */
+ coredump_report("no coredump collected as "
+ "that is disabled in the kernel configuration");
+ }
}
/*
signotset(&mask);
spin_lock_irq(&tsk->sighand->siglock);
- sig = dequeue_signal(tsk, &mask, info, &type);
+ sig = dequeue_signal(&mask, info, &type);
if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
- sig = dequeue_signal(tsk, &mask, info, &type);
+ sig = dequeue_signal(&mask, info, &type);
}
spin_unlock_irq(&tsk->sighand->siglock);
return -EINVAL;
f = fdget(pidfd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
/* Is this a pidfd? */
- pid = pidfd_to_pid(f.file);
+ pid = pidfd_to_pid(fd_file(f));
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto err;
switch (flags) {
case 0:
/* Infer scope from the type of pidfd. */
- if (f.file->f_flags & PIDFD_THREAD)
+ if (fd_file(f)->f_flags & PIDFD_THREAD)
type = PIDTYPE_PID;
else
type = PIDTYPE_TGID;
int err;
exe = fdget(fd);
- if (!exe.file)
+ if (!fd_file(exe))
return -EBADF;
- inode = file_inode(exe.file);
+ inode = file_inode(fd_file(exe));
/*
* Because the original mm->exe_file points to executable file, make
* overall picture.
*/
err = -EACCES;
- if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
+ if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path))
goto exit;
- err = file_permission(exe.file, MAY_EXEC);
+ err = file_permission(fd_file(exe), MAY_EXEC);
if (err)
goto exit;
- err = replace_mm_exe_file(mm, exe.file);
+ err = replace_mm_exe_file(mm, fd_file(exe));
exit:
fdput(exe);
return err;
error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
+ if (rt_or_dl_task_policy(current))
+ break;
if (arg2 <= 0)
current->timer_slack_ns =
current->default_timer_slack_ns;
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
+#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
- * ->lruvec->lru_lock (follow_page->mark_page_accessed)
- * ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
+ * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed)
+ * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru)
* ->private_lock (folio_remove_rmap_pte->set_page_dirty)
* ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
* bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
struct folio *folio = fbatch.folios[i];
folio_wait_writeback(folio);
- folio_clear_error(folio);
}
folio_batch_release(&fbatch);
cond_resched();
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
+ folio);
mapping_set_update(&xas, mapping);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
folio_wait_stable(folio);
no_page:
if (!folio && (fgp_flags & FGP_CREAT)) {
- unsigned order = FGF_GET_ORDER(fgp_flags);
+ unsigned int min_order = mapping_min_folio_order(mapping);
+ unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
int err;
+ index = mapping_align_index(mapping, index);
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp |= __GFP_WRITE;
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
- if (!mapping_large_folio_support(mapping))
- order = 0;
- if (order > MAX_PAGECACHE_ORDER)
- order = MAX_PAGECACHE_ORDER;
+ if (order > mapping_max_folio_order(mapping))
+ order = mapping_max_folio_order(mapping);
/* If we're not aligned, allocate a smaller folio */
if (index & ((1UL << order) - 1))
order = __ffs(index);
gfp_t alloc_gfp = gfp;
err = -ENOMEM;
- if (order > 0)
+ if (order > min_order)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
folio = filemap_alloc_folio(alloc_gfp, order);
if (!folio)
break;
folio_put(folio);
folio = NULL;
- } while (order-- > 0);
+ } while (order-- > min_order);
if (err == -EEXIST)
goto repeat;
if (!folio_batch_add(fbatch, folio))
break;
}
- rcu_read_unlock();
if (folio_batch_count(fbatch)) {
- unsigned long nr = 1;
+ unsigned long nr;
int idx = folio_batch_count(fbatch) - 1;
folio = fbatch->folios[idx];
if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
- *start = indices[idx] + nr;
+ else
+ nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);
+ *start = round_down(indices[idx] + nr, nr);
}
+ rcu_read_unlock();
+
return folio_batch_count(fbatch);
}
rcu_read_lock();
while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
+ unsigned long base;
+ unsigned long nr;
+
if (!xa_is_value(folio)) {
- if (folio->index < *start)
+ nr = folio_nr_pages(folio);
+ base = folio->index;
+ /* Omit large folio which begins before the start */
+ if (base < *start)
goto put;
- if (folio_next_index(folio) - 1 > end)
+ /* Omit large folio which extends beyond the end */
+ if (base + nr - 1 > end)
goto put;
if (!folio_trylock(folio))
goto put;
goto unlock;
VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
folio);
+ } else {
+ nr = 1 << xas_get_order(&xas);
+ base = xas.xa_index & ~(nr - 1);
+ /* Omit order>0 value which begins before the start */
+ if (base < *start)
+ continue;
+ /* Omit order>0 value which extends beyond the end */
+ if (base + nr - 1 > end)
+ break;
}
+
+ /* Update start now so that last update is correct on return */
+ *start = base + nr;
indices[fbatch->nr] = xas.xa_index;
if (!folio_batch_add(fbatch, folio))
break;
}
rcu_read_unlock();
- if (folio_batch_count(fbatch)) {
- unsigned long nr = 1;
- int idx = folio_batch_count(fbatch) - 1;
-
- folio = fbatch->folios[idx];
- if (!xa_is_value(folio))
- nr = folio_nr_pages(folio);
- *start = indices[idx] + nr;
- }
return folio_batch_count(fbatch);
}
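/*
 * Standalone illustrative sketch, not from the patch: how the batched lookups
 * above derive the range covered by a multi-order entry from an arbitrary
 * index inside it.  Plain C, with "order" standing in for xas_get_order().
 */
#include <stdio.h>

int main(void)
{
	unsigned long index = 7;		/* index the XArray walk stopped at */
	unsigned int order = 2;			/* entry spans 1 << order slots */
	unsigned long nr = 1UL << order;
	unsigned long base = index & ~(nr - 1);	/* first slot of the entry */

	/* The entry covers [base, base + nr - 1]; resume the scan after it. */
	printf("entry covers [%lu, %lu], next *start = %lu\n",
	       base, base + nr - 1, base + nr);
	return 0;
}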
unsigned long pflags;
int error;
- /*
- * A previous I/O error may have been due to temporary failures,
- * eg. multipath errors. PG_error will be set again if read_folio
- * fails.
- */
- folio_clear_error(folio);
-
/* Start the actual read. The read will unlock the page. */
if (unlikely(workingset))
psi_memstall_enter(&pflags);
}
static int filemap_create_folio(struct file *file,
- struct address_space *mapping, pgoff_t index,
+ struct address_space *mapping, loff_t pos,
struct folio_batch *fbatch)
{
struct folio *folio;
int error;
+ unsigned int min_order = mapping_min_folio_order(mapping);
+ pgoff_t index;
- folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
+ folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
if (!folio)
return -ENOMEM;
* well to keep locking rules simple.
*/
filemap_invalidate_lock_shared(mapping);
+ index = (pos >> (PAGE_SHIFT + min_order)) << min_order;
error = filemap_add_folio(mapping, folio, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct folio *folio;
+ unsigned int flags;
int err = 0;
/* "last_index" is the index of the page beyond the end of the read */
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags = memalloc_noio_save();
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ memalloc_noio_restore(flags);
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
- err = filemap_create_folio(filp, mapping,
- iocb->ki_pos >> PAGE_SHIFT, fbatch);
+ err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
goto err;
}
+ trace_mm_filemap_get_pages(mapping, index, last_index - 1);
return 0;
err:
if (err < 0)
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
-int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+int filemap_invalidate_pages(struct address_space *mapping,
+ loff_t pos, loff_t end, bool nowait)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- loff_t pos = iocb->ki_pos;
- loff_t end = pos + count - 1;
int ret;
- if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (nowait) {
/* we could block if there are any pages in the range */
if (filemap_range_has_page(mapping, pos, end))
return -EAGAIN;
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
+
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ return filemap_invalidate_pages(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1,
+ iocb->ki_flags & IOCB_NOWAIT);
+}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
/**
static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
{
if (xa_is_value(folio))
- return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
+ return PAGE_SIZE << xas_get_order(xas);
return folio_size(folio);
}
if (unlikely(index >= max_idx))
return VM_FAULT_SIGBUS;
+ trace_mm_filemap_fault(mapping, index);
+
/*
* Do we have something in the page cache already?
*/
struct vm_area_struct *vma = vmf->vma;
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
- pgoff_t last_pgoff = start_pgoff;
+ pgoff_t file_end, last_pgoff = start_pgoff;
unsigned long addr;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct folio *folio;
goto out;
}
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
+ if (end_pgoff > file_end)
+ end_pgoff = file_end;
+
folio_type = mm_counter_file(folio);
do {
unsigned long end;
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
add_mm_counter(vma->vm_mm, folio_type, rss);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
out:
rcu_read_unlock();
repeat:
folio = filemap_get_folio(mapping, index);
if (IS_ERR(folio)) {
- folio = filemap_alloc_folio(gfp, 0);
+ folio = filemap_alloc_folio(gfp,
+ mapping_min_folio_order(mapping));
if (!folio)
return ERR_PTR(-ENOMEM);
+ index = mapping_align_index(mapping, index);
err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
folio_put(folio);
ssize_t written = 0;
do {
- struct page *page;
struct folio *folio;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
}
status = a_ops->write_begin(file, mapping, pos, bytes,
- &page, &fsdata);
+ &folio, &fsdata);
if (unlikely(status < 0))
break;
- folio = page_folio(page);
offset = offset_in_folio(folio, pos);
if (bytes > folio_size(folio) - offset)
bytes = folio_size(folio) - offset;
flush_dcache_folio(folio);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
- page, fsdata);
+ folio, fsdata);
if (unlikely(status != copied)) {
iov_iter_revert(i, copied - max(status, 0L));
if (unlikely(status < 0))
}
/* Wait for writeback to complete on all folios and discard. */
- truncate_inode_pages_range(mapping, start, end);
+ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
unlock:
filemap_invalidate_unlock(mapping);
if (xas_retry(&xas, folio))
continue;
- order = xa_get_order(xas.xa, xas.xa_index);
+ order = xas_get_order(&xas);
nr_pages = 1 << order;
folio_first_index = round_down(xas.xa_index, 1 << order);
folio_last_index = folio_first_index + nr_pages - 1;
struct cachestat cs;
pgoff_t first_index, last_index;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
if (copy_from_user(&csr, cstat_range,
}
/* hugetlbfs is not supported */
- if (is_file_hugepages(f.file)) {
+ if (is_file_hugepages(fd_file(f))) {
fdput(f);
return -EOPNOTSUPP;
}
last_index =
csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
memset(&cs, 0, sizeof(struct cachestat));
- mapping = f.file->f_mapping;
+ mapping = fd_file(f)->f_mapping;
filemap_cachestat(mapping, first_index, last_index, &cs);
fdput(f);
return folio_file_page(folio, index);
}
+static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
+static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
+
/**
* mem_cgroup_move_account - move account of the folio
* @folio: The folio.
nid = folio_nid(folio);
local_irq_disable();
- mem_cgroup_charge_statistics(to, nr_pages);
+ memcg1_charge_statistics(to, nr_pages);
memcg1_check_events(to, nid);
- mem_cgroup_charge_statistics(from, -nr_pages);
+ memcg1_charge_statistics(from, -nr_pages);
memcg1_check_events(from, nid);
local_irq_enable();
out:
}
}
+/* Cgroup1: threshold notifications & softlimit tree updates */
+struct memcg1_events_percpu {
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
+{
+ /* pagein of a big page is an event. So, ignore page size */
+ if (nr_pages > 0)
+ __count_memcg_events(memcg, PGPGIN, 1);
+ else {
+ __count_memcg_events(memcg, PGPGOUT, 1);
+ nr_pages = -nr_pages; /* for event */
+ }
+
+ __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
+}
+
+#define THRESHOLDS_EVENTS_TARGET 128
+#define SOFTLIMIT_EVENTS_TARGET 1024
+
+static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_target target)
+{
+ unsigned long val, next;
+
+ val = __this_cpu_read(memcg->events_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->events_percpu->targets[target]);
+ /* from time_after() in jiffies.h */
+ if ((long)(next - val) < 0) {
+ switch (target) {
+ case MEM_CGROUP_TARGET_THRESH:
+ next = val + THRESHOLDS_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
+ default:
+ break;
+ }
+ __this_cpu_write(memcg->events_percpu->targets[target], next);
+ return true;
+ }
+ return false;
+}
+
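/*
 * Standalone illustrative sketch, not from the patch: the "(long)(next - val) < 0"
 * test above is the classic time_after()-style comparison; it keeps working when
 * the per-CPU event counter wraps around, unlike a plain "val > next".
 */
#include <stdio.h>

static int target_passed(unsigned long val, unsigned long next)
{
	return (long)(next - val) < 0;
}

int main(void)
{
	unsigned long next = (unsigned long)-10;	/* target set just before wrap */
	unsigned long val = 5;				/* counter has since wrapped */

	printf("plain compare: %d, wrap-safe compare: %d\n",
	       val > next, target_passed(val, next));	/* prints 0 and 1 */
	return 0;
}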
/*
* Check events in order.
*
*/
-void memcg1_check_events(struct mem_cgroup *memcg, int nid)
+static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
return;
/* threshold event is triggered in finer grain than soft limit */
- if (unlikely(mem_cgroup_event_ratelimit(memcg,
+ if (unlikely(memcg1_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
- do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ do_softlimit = memcg1_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
}
}
+void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ memcg1_charge_statistics(memcg, folio_nr_pages(folio));
+ memcg1_check_events(memcg, folio_nid(folio));
+ local_irq_restore(flags);
+}
+
+void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
+{
+ /*
+ * Interrupts should be disabled here because the caller holds the
+ * i_pages lock which is taken with interrupts-off. It is
+ * important here to have the interrupts disabled because it is the
+ * only synchronisation we have for updating the per-CPU variables.
+ */
+ preempt_disable_nested();
+ VM_WARN_ON_IRQS_ENABLED();
+ memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
+ preempt_enable_nested();
+ memcg1_check_events(memcg, folio_nid(folio));
+}
+
+void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+ unsigned long nr_memory, int nid)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __count_memcg_events(memcg, PGPGOUT, pgpgout);
+ __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
+ memcg1_check_events(memcg, nid);
+ local_irq_restore(flags);
+}
+
static int compare_thresholds(const void *a, const void *b)
{
const struct mem_cgroup_threshold *_a = a;
buf = endp + 1;
cfd = simple_strtoul(buf, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
+ if (*endp == '\0')
+ buf = endp;
+ else if (*endp == ' ')
+ buf = endp + 1;
+ else
return -EINVAL;
- buf = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
INIT_WORK(&event->remove, memcg_event_remove);
efile = fdget(efd);
- if (!efile.file) {
+ if (!fd_file(efile)) {
ret = -EBADF;
goto out_kfree;
}
- event->eventfd = eventfd_ctx_fileget(efile.file);
+ event->eventfd = eventfd_ctx_fileget(fd_file(efile));
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
cfile = fdget(cfd);
- if (!cfile.file) {
+ if (!fd_file(cfile)) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = file_permission(cfile.file, MAY_READ);
+ ret = file_permission(fd_file(cfile), MAY_READ);
if (ret < 0)
goto out_put_cfile;
* The control file must be a regular cgroup1 file. As a regular cgroup
* file can't be renamed, it's safe to access its name afterwards.
*/
- cdentry = cfile.file->f_path.dentry;
+ cdentry = fd_file(cfile)->f_path.dentry;
if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
ret = -EINVAL;
goto out_put_cfile;
event->register_event = mem_cgroup_usage_register_event;
event->unregister_event = mem_cgroup_usage_unregister_event;
} else if (!strcmp(name, "memory.oom_control")) {
+ pr_warn_once("oom_control is deprecated and will be removed. "
+ " if you depend on this functionality. \n");
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} else if (!strcmp(name, "memory.pressure_level")) {
+ pr_warn_once("pressure_level is deprecated and will be removed. "
+ "if you depend on this functionality. \n");
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
if (ret)
goto out_put_css;
- vfs_poll(efile.file, &event->pt);
+ vfs_poll(fd_file(efile), &event->pt);
spin_lock_irq(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
ret = 0;
break;
case _TCP:
+ pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
+ "depend on this functionality.\n");
ret = memcg_update_tcp_max(memcg, nr_pages);
break;
}
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
ret = -EOPNOTSUPP;
} else {
+ pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
+ "depend on this functionality.\n");
WRITE_ONCE(memcg->soft_limit, nr_pages);
ret = 0;
}
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("oom_control is deprecated and will be removed. "
+ "depend on this functionality. \n");
+
/* cannot set to root cgroup and only 0 and 1 are allowed */
if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
return -EINVAL;
return false;
}
+bool memcg1_alloc_events(struct mem_cgroup *memcg)
+{
+ memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
+ GFP_KERNEL_ACCOUNT);
+ return !!memcg->events_percpu;
+}
+
+void memcg1_free_events(struct mem_cgroup *memcg)
+{
+ if (memcg->events_percpu)
+ free_percpu(memcg->events_percpu);
+}
+
static int __init memcg1_init(void)
{
int node;
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct address_space *mapping = ractl->mapping;
- unsigned long index = readahead_index(ractl);
+ unsigned long ra_folio_index, index = readahead_index(ractl);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- unsigned long i;
+ unsigned long mark, i = 0;
+ unsigned int min_nrpages = mapping_min_folio_nrpages(mapping);
/*
* Partway through the readahead operation, we will have added
unsigned int nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
+ index = mapping_align_index(mapping, index);
+
+ /*
+ * As iterator `i` is aligned to min_nrpages, round_up the
+ * difference between nr_to_read and lookahead_size to mark the
+ * index that only has lookahead or "async_region" to set the
+ * readahead flag.
+ */
+ ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size,
+ min_nrpages);
+ mark = ra_folio_index - index;
+ nr_to_read += readahead_index(ractl) - index;
+ ractl->_index = index;
+
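/*
 * Standalone illustrative sketch, not from the patch: the index/mark arithmetic
 * above, with hypothetical example numbers (min_nrpages = 4, i.e. minimum folio
 * order 2).  The round_up/round_down macros below behave like the kernel
 * helpers for power-of-two steps.
 */
#include <stdio.h>

#define round_down(x, y)	((x) & ~((y) - 1))
#define round_up(x, y)		round_down((x) + (y) - 1, (y))

int main(void)
{
	unsigned long min_nrpages = 4;
	unsigned long orig_index = 6, nr_to_read = 16, lookahead_size = 4;

	/* align the start of the read to a min-order folio boundary */
	unsigned long index = round_down(orig_index, min_nrpages);
	/* first index of the lookahead ("async") region, similarly aligned */
	unsigned long ra_folio_index =
		round_up(orig_index + nr_to_read - lookahead_size, min_nrpages);
	unsigned long mark = ra_folio_index - index;

	nr_to_read += orig_index - index;	/* grow to cover the alignment */

	printf("index=%lu nr_to_read=%lu mark=%lu (readahead flag on folio @%lu)\n",
	       index, nr_to_read, mark, index + mark);
	return 0;	/* prints index=4 nr_to_read=18 mark=16, flag on folio @20 */
}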
/*
* Preallocate as many pages as we will need.
*/
- for (i = 0; i < nr_to_read; i++) {
+ while (i < nr_to_read) {
struct folio *folio = xa_load(&mapping->i_pages, index + i);
int ret;
* not worth getting one just for that.
*/
read_pages(ractl);
- ractl->_index++;
- i = ractl->_index + ractl->_nr_pages - index - 1;
+ ractl->_index += min_nrpages;
+ i = ractl->_index + ractl->_nr_pages - index;
continue;
}
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask,
+ mapping_min_folio_order(mapping));
if (!folio)
break;
if (ret == -ENOMEM)
break;
read_pages(ractl);
- ractl->_index++;
- i = ractl->_index + ractl->_nr_pages - index - 1;
+ ractl->_index += min_nrpages;
+ i = ractl->_index + ractl->_nr_pages - index;
continue;
}
- if (i == nr_to_read - lookahead_size)
+ if (i == mark)
folio_set_readahead(folio);
ractl->_workingset |= folio_test_workingset(folio);
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
+ i += min_nrpages;
}
/*
struct address_space *mapping = ractl->mapping;
pgoff_t start = readahead_index(ractl);
pgoff_t index = start;
+ unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
pgoff_t mark = index + ra->size - ra->async_size;
unsigned int nofs;
int err = 0;
gfp_t gfp = readahead_gfp_mask(mapping);
+ unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping));
- if (!mapping_large_folio_support(mapping) || ra->size < 4)
+ /*
+ * Fall back when ra->size is below the minimum, since each folio
+ * must cover at least min_nrpages pages anyway.
+ */
+ if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size)
goto fallback;
limit = min(limit, index + ra->size - 1);
- if (new_order < MAX_PAGECACHE_ORDER)
+ if (new_order < mapping_max_folio_order(mapping))
new_order += 2;
- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+ new_order = min(mapping_max_folio_order(mapping), new_order);
new_order = min_t(unsigned int, new_order, ilog2(ra->size));
+ new_order = max(new_order, min_order);
/* See comment in page_cache_ra_unbounded() */
nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
+ /*
+ * If new_order is greater than min_order and index is already
+ * aligned to new_order, this is a no-op: an index aligned to
+ * new_order is also aligned to min_order.
+ */
+ ractl->_index = mapping_align_index(mapping, index);
+ index = readahead_index(ractl);
+
while (index <= limit) {
unsigned int order = new_order;
if (index & ((1UL << order) - 1))
order = __ffs(index);
/* Don't allocate pages past EOF */
- while (index + (1UL << order) - 1 > limit)
+ while (order > min_order && index + (1UL << order) - 1 > limit)
order--;
err = ra_alloc_folio(ractl, index, mark, order, gfp);
if (err)
ret = -EBADF;
f = fdget(fd);
- if (!f.file || !(f.file->f_mode & FMODE_READ))
+ if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
goto out;
/*
* on this file, then we must return -EINVAL.
*/
ret = -EINVAL;
- if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
- (!S_ISREG(file_inode(f.file)->i_mode) &&
- !S_ISBLK(file_inode(f.file)->i_mode)))
+ if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
+ (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
+ !S_ISBLK(file_inode(fd_file(f))->i_mode)))
goto out;
- ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
+ ret = vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
out:
fdput(f);
return ret;
struct file_ra_state *ra = ractl->ra;
pgoff_t new_index, new_nr_pages;
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ unsigned long min_nrpages = mapping_min_folio_nrpages(mapping);
+ unsigned int min_order = mapping_min_folio_order(mapping);
new_index = new_start / PAGE_SIZE;
+ /*
+ * Readahead code should have aligned the ractl->_index to
+ * min_nrpages before calling readahead aops.
+ */
+ VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages));
/* Expand the leading edge downwards */
while (ractl->_index > new_index) {
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask, min_order);
if (!folio)
return;
+
+ index = mapping_align_index(mapping, index);
if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
folio_put(folio);
return;
ractl->_workingset = true;
psi_memstall_enter(&ractl->_pflags);
}
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
ractl->_index = folio->index;
}
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask, min_order);
if (!folio)
return;
+
+ index = mapping_align_index(mapping, index);
if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
folio_put(folio);
return;
ractl->_workingset = true;
psi_memstall_enter(&ractl->_pflags);
}
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
if (ra) {
- ra->size++;
- ra->async_size++;
+ ra->size += min_nrpages;
+ ra->async_size += min_nrpages;
}
}
}
int err = -ENOMEM;
void *data = NULL;
- if (ops->id && ops->size) {
+ if (ops->id) {
data = kzalloc(ops->size, GFP_KERNEL);
if (!data)
goto out;
if (!err)
return 0;
- if (ops->id && ops->size) {
+ if (ops->id) {
ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&pernet_ops_rwsem));
ng->ptr[*ops->id] = NULL;
struct list_head *net_exit_list)
{
struct net *net;
- if (ops->size && ops->id) {
+
+ if (ops->id) {
list_for_each_entry(net, net_exit_list, exit_list)
kfree(net_generic(net, *ops->id));
}
}
EXPORT_SYMBOL_GPL(get_net_ns_by_id);
+static __net_init void preinit_net_sysctl(struct net *net)
+{
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ /* Limits per socket sk_omem_alloc usage.
+ * TCP zerocopy regular usage needs 128 KB.
+ */
+ net->core.sysctl_optmem_max = 128 * 1024;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+}
+
/* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net)
+static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
{
+ refcount_set(&net->passive, 1);
+ refcount_set(&net->ns.count, 1);
+ ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
+
+ get_random_bytes(&net->hash_mix, sizeof(u32));
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
+
+ idr_init(&net->netns_ids);
+ spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
+ preinit_net_sysctl(net);
}
/*
* setup_net runs the initializers for the network namespace object.
*/
-static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int setup_net(struct net *net)
{
/* Must be called with pernet_ops_rwsem held */
const struct pernet_operations *ops, *saved_ops;
LIST_HEAD(dev_kill_list);
int error = 0;
- refcount_set(&net->ns.count, 1);
- ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
-
- refcount_set(&net->passive, 1);
- get_random_bytes(&net->hash_mix, sizeof(u32));
preempt_disable();
net->net_cookie = gen_cookie_next(&net_cookie);
preempt_enable();
- net->dev_base_seq = 1;
- net->user_ns = user_ns;
- idr_init(&net->netns_ids);
- spin_lock_init(&net->nsid_lock);
- mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
goto out;
}
-static int __net_init net_defaults_init_net(struct net *net)
-{
- net->core.sysctl_somaxconn = SOMAXCONN;
- /* Limits per socket sk_omem_alloc usage.
- * TCP zerocopy regular usage needs 128 KB.
- */
- net->core.sysctl_optmem_max = 128 * 1024;
- net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
-
- return 0;
-}
-
-static struct pernet_operations net_defaults_ops = {
- .init = net_defaults_init_net,
-};
-
-static __init int net_defaults_init(void)
-{
- if (register_pernet_subsys(&net_defaults_ops))
- panic("Cannot initialize net default settings");
-
- return 0;
-}
-
-core_initcall(net_defaults_init);
-
#ifdef CONFIG_NET_NS
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
{
goto dec_ucounts;
}
- preinit_net(net);
- refcount_set(&net->passive, 1);
+ preinit_net(net, user_ns);
net->ucounts = ucounts;
get_user_ns(user_ns);
if (rv < 0)
goto put_userns;
- rv = setup_net(net, user_ns);
+ rv = setup_net(net);
up_read(&pernet_ops_rwsem);
struct fd f = fdget(fd);
struct net *net = ERR_PTR(-EINVAL);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (proc_ns_file(f.file)) {
- struct ns_common *ns = get_proc_ns(file_inode(f.file));
+ if (proc_ns_file(fd_file(f))) {
+ struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops == &netns_operations)
net = get_net(container_of(ns, struct net, ns));
}
#ifdef CONFIG_KEYS
init_net.key_domain = &init_net_key_domain;
#endif
+ preinit_net(&init_net, &init_user_ns);
+
down_write(&pernet_ops_rwsem);
- preinit_net(&init_net);
- if (setup_net(&init_net, &init_user_ns))
+ if (setup_net(&init_net))
panic("Could not setup the initial network namespace");
init_net_initialized = true;
LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
- if (ops->init || (ops->id && ops->size)) {
+ if (ops->init || ops->id) {
/* We held write locked pernet_ops_rwsem, and parallel
* setup_net() and cleanup_net() are not possible.
*/
{
int error;
+ if (WARN_ON(!!ops->id ^ !!ops->size))
+ return -EINVAL;
+
if (ops->id) {
error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
GFP_KERNEL);
struct socket *sock;
*err = -EBADF;
- if (f.file) {
- sock = sock_from_file(f.file);
+ if (fd_file(f)) {
+ sock = sock_from_file(fd_file(f));
if (likely(sock)) {
- *fput_needed = f.flags & FDPUT_FPUT;
+ *fput_needed = f.word & FDPUT_FPUT;
return sock;
}
*err = -ENOTSOCK;
memset(&tss, 0, sizeof(tss));
tsflags = READ_ONCE(sk->sk_tsflags);
- if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
+ if ((tsflags & SOF_TIMESTAMPING_SOFTWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
+ skb_is_err_queue(skb) ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
empty = 0;
if (shhwtstamps &&
- (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
+ skb_is_err_queue(skb) ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
!skb_is_swtx_tstamp(skb, false_tstamp)) {
if_index = 0;
if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
struct fd f;
f = fdget(fd);
- if (f.file) {
- ret = __sys_accept4_file(f.file, upeer_sockaddr,
+ if (fd_file(f)) {
+ ret = __sys_accept4_file(fd_file(f), upeer_sockaddr,
upeer_addrlen, flags);
fdput(f);
}
struct fd f;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
struct sockaddr_storage address;
ret = move_addr_to_kernel(uservaddr, addrlen, &address);
if (!ret)
- ret = __sys_connect_file(f.file, &address, addrlen, 0);
+ ret = __sys_connect_file(fd_file(f), &address, addrlen, 0);
fdput(f);
}
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
int optname, sockptr_t optval, sockptr_t optlen)
{
- int max_optlen __maybe_unused;
+ int max_optlen __maybe_unused = 0;
const struct proto_ops *ops;
int err;
return err;
if (!compat)
- max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+ copy_from_sockptr(&max_optlen, optlen, sizeof(int));
ops = READ_ONCE(sock->ops);
if (level == SOL_SOCKET) {
return;
f = fdget(kernel_fd);
- if (!f.file)
+ if (!fd_file(f))
return;
- process_buffer_measurement(file_mnt_idmap(f.file), file_inode(f.file),
+ process_buffer_measurement(file_mnt_idmap(fd_file(f)), file_inode(fd_file(f)),
buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
NULL, false, NULL, 0);
fdput(f);
#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
LSM_HOOK_INIT(kernel_module_request, ima_kernel_module_request),
#endif
- LSM_HOOK_INIT(inode_free_security, ima_inode_free),
+ LSM_HOOK_INIT(inode_free_security_rcu, ima_inode_free_rcu),
};
static const struct lsm_id ima_lsmid = {
params = memdup_user(_params, sizeof(*params));
if (IS_ERR(params))
- return PTR_ERR(no_free_ptr(params));
+ return PTR_ERR(params);
err = snd_pcm_hw_refine(substream, params);
if (err < 0)
params = memdup_user(_params, sizeof(*params));
if (IS_ERR(params))
- return PTR_ERR(no_free_ptr(params));
+ return PTR_ERR(params);
err = snd_pcm_hw_params(substream, params);
if (err < 0)
bool nonatomic = substream->pcm->nonatomic;
CLASS(fd, f)(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADFD;
- if (!is_pcm_file(f.file))
+ if (!is_pcm_file(fd_file(f)))
return -EBADFD;
- pcm_file = f.file->private_data;
+ pcm_file = fd_file(f)->private_data;
substream1 = pcm_file->substream;
if (substream == substream1)
return snd_interval_refine(hw_param_interval(params, rule->var), &t);
}
-#if SNDRV_PCM_RATE_5512 != 1 << 0 || SNDRV_PCM_RATE_192000 != 1 << 12
+#if SNDRV_PCM_RATE_5512 != 1 << 0 || SNDRV_PCM_RATE_192000 != 1 << 12 ||\
+ SNDRV_PCM_RATE_128000 != 1 << 19
#error "Change this table"
#endif
+/* NOTE: the list is unsorted! */
static const unsigned int rates[] = {
5512, 8000, 11025, 16000, 22050, 32000, 44100,
- 48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000
+ 48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000,
+ /* extended */
+ 12000, 24000, 128000
};
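/*
 * Standalone illustrative sketch, not from the patch: decoding an
 * SNDRV_PCM_RATE_* bitmask with the table above, assuming (as the #if guard
 * suggests) that bit i of the mask corresponds to rates[i], which is why the
 * newly added entries are appended in bit order rather than sorted by value.
 * The rates[] copy and the example mask bits are taken from this hunk.
 */
#include <stdio.h>

static const unsigned int rates[] = {
	5512, 8000, 11025, 16000, 22050, 32000, 44100,
	48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000,
	12000, 24000, 128000
};

int main(void)
{
	unsigned int mask = (1u << 7) | (1u << 19);	/* e.g. 48000 and 128000 */

	for (unsigned int i = 0; i < sizeof(rates) / sizeof(rates[0]); i++)
		if (mask & (1u << i))
			printf("%u Hz\n", rates[i]);
	return 0;
}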
const struct snd_pcm_hw_constraint_list snd_pcm_known_rates = {
bufs = memdup_user(xfern.bufs, sizeof(void *) * runtime->channels);
if (IS_ERR(bufs))
- return PTR_ERR(no_free_ptr(bufs));
+ return PTR_ERR(bufs);
if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
result = snd_pcm_lib_writev(substream, bufs, xfern.frames);
else
oparams = memdup_user(_oparams, sizeof(*oparams));
if (IS_ERR(oparams))
- return PTR_ERR(no_free_ptr(oparams));
+ return PTR_ERR(oparams);
snd_pcm_hw_convert_from_old_params(params, oparams);
err = snd_pcm_hw_refine(substream, params);
if (err < 0)
oparams = memdup_user(_oparams, sizeof(*oparams));
if (IS_ERR(oparams))
- return PTR_ERR(no_free_ptr(oparams));
+ return PTR_ERR(oparams);
snd_pcm_hw_convert_from_old_params(params, oparams);
err = snd_pcm_hw_params(substream, params);
mutex_lock(&kvm->irqfds.resampler_lock);
list_del_rcu(&irqfd->resampler_link);
- synchronize_srcu(&kvm->irq_srcu);
if (list_empty(&resampler->list)) {
list_del_rcu(&resampler->link);
kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
/*
- * synchronize_srcu(&kvm->irq_srcu) already called
+ * synchronize_srcu_expedited(&kvm->irq_srcu) already called
* in kvm_unregister_irq_ack_notifier().
*/
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
kfree(resampler);
+ } else {
+ synchronize_srcu_expedited(&kvm->irq_srcu);
}
mutex_unlock(&kvm->irqfds.resampler_lock);
u64 cnt;
/* Make sure irqfd has been initialized in assign path. */
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
/*
* Synchronize with the wait-queue and unhook ourselves to prevent
seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);
f = fdget(args->fd);
- if (!f.file) {
+ if (!fd_file(f)) {
ret = -EBADF;
goto out;
}
- eventfd = eventfd_ctx_fileget(f.file);
+ eventfd = eventfd_ctx_fileget(fd_file(f));
if (IS_ERR(eventfd)) {
ret = PTR_ERR(eventfd);
goto fail;
}
list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
mutex_unlock(&kvm->irqfds.resampler_lock);
}
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
*/
- events = vfs_poll(f.file, &irqfd->pt);
+ events = vfs_poll(fd_file(f), &irqfd->pt);
if (events & EPOLLIN)
schedule_work(&irqfd->inject);
mutex_lock(&kvm->irq_lock);
hlist_del_init_rcu(&kian->link);
mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
/*
* Take note of a change in irq routing.
- * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
+ * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
*/
void kvm_irq_routing_update(struct kvm *kvm)
{