EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
/*
- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
- * number.
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates. The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
*/
-#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_LOW_SHIFT 2
#define MMIO_SPTE_GEN_HIGH_SHIFT 52
-#define MMIO_GEN_SHIFT 19
-#define MMIO_GEN_LOW_SHIFT 9
-#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_SHIFT 20
+#define MMIO_GEN_LOW_SHIFT 10
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
{
- /*
- * Init kvm generation close to MMIO_MAX_GEN to easily test the
- * code of handling generation number wrap-around.
- */
- return (kvm_memslots(kvm)->generation +
- MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+ return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
}
static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
return likely(kvm_gen == spte_gen);
}
-static inline u64 rsvd_bits(int s, int e)
-{
- return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
* Write-protect on the specified @sptep, @pt_protect indicates whether
* spte write-protection is caused by protecting shadow page table.
*
- * Note: write protection is difference between drity logging and spte
+ * Note: write protection is difference between dirty logging and spte
* protection:
* - for dirty logging, the spte can be set to writable at anytime if
* its dirty bitmap is properly set.
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
while ((sptep = rmap_get_first(*rmapp, &iter))) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n",
+ sptep, *sptep, gfn, level);
drop_spte(kvm, sptep);
need_tlb_flush = 1;
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!is_shadow_present_pte(*sptep));
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+ sptep, *sptep, gfn, level);
need_flush = 1;
int (*handler)(struct kvm *kvm,
unsigned long *rmapp,
struct kvm_memory_slot *slot,
+ gfn_t gfn,
+ int level,
unsigned long data))
{
int j;
j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
unsigned long idx, idx_end;
unsigned long *rmapp;
+ gfn_t gfn = gfn_start;
/*
* {idx(page_j) | page_j intersects with
rmapp = __gfn_to_rmap(gfn_start, j, memslot);
- for (; idx <= idx_end; ++idx)
- ret |= handler(kvm, rmapp++, memslot, data);
+ for (; idx <= idx_end;
+ ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j)))
+ ret |= handler(kvm, rmapp++, memslot,
+ gfn, j, data);
}
}
unsigned long data,
int (*handler)(struct kvm *kvm, unsigned long *rmapp,
struct kvm_memory_slot *slot,
+ gfn_t gfn, int level,
unsigned long data))
{
return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
int young = 0;
- /*
- * In case of absence of EPT Access and Dirty Bits supports,
- * emulate the accessed bit for EPT, by checking if this page has
- * an EPT mapping, and clearing it if it does. On the next access,
- * a new EPT mapping will be established.
- * This has some overhead, but not as much as the cost of swapping
- * out actively used pages or breaking up actively used hugepages.
- */
- if (!shadow_accessed_mask) {
- young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
- goto out;
- }
+ BUG_ON(!shadow_accessed_mask);
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
(unsigned long *)sptep);
}
}
-out:
- /* @data has hva passed to kvm_age_hva(). */
- trace_kvm_age_page(data, slot, young);
+ trace_kvm_age_page(gfn, level, slot, young);
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
+ /*
+ * In case of absence of EPT Access and Dirty Bits supports,
+ * emulate the accessed bit for EPT, by checking if this page has
+ * an EPT mapping, and clearing it if it does. On the next access,
+ * a new EPT mapping will be established.
+ * This has some overhead, but not as much as the cost of swapping
+ * out actively used pages or breaking up actively used hugepages.
+ */
+ if (!shadow_accessed_mask) {
+ /*
+ * We are holding the kvm->mmu_lock, and we are blowing up
+ * shadow PTEs. MMU notifier consumers need to be kept at bay.
+ * This is correct as long as we don't decouple the mmu_lock
+ * protected regions (like invalidate_range_start|end does).
+ */
+ kvm->mmu_notifier_seq++;
+ return kvm_handle_hva_range(kvm, start, end, 0,
+ kvm_unmap_rmapp);
+ }
+
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
return 1;
}
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
return 0;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
if (flush)
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
struct mmu_page_path {
true, host_writable)) {
if (write_fault)
*emulate = 1;
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep) && emulate))
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- vcpu_clear_mmio_info(vcpu, ~0ul);
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
{
if (exception)
exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
}
static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
context->nx = false;
}
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
-
void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
context->bad_mt_xwr = 0;
exb_bit_rsvd = rsvd_bits(63, 63);
if (!guest_cpuid_has_gbpages(vcpu))
gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (guest_cpuid_is_amd(vcpu))
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
switch (context->root_level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
+ nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
if (remote_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
else if (local_flush)
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
vcpu->arch.mmu.invlpg(vcpu, gva);
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
* The very rare case: if the generation-number is round,
* zap all shadow pages.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
+ if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
if (!mmu_page_header_cache)
goto nomem;
- if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
+ if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
goto nomem;
register_shrinker(&mmu_shrinker);
*
* The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
* than an atomic_t - this is because of the way shutdown works, see
- * percpu_ref_kill()/PCPU_COUNT_BIAS.
+ * percpu_ref_kill()/PERCPU_COUNT_BIAS.
*
* Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
* refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
* calls io_destroy() or the process exits.
*
* In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
- * calls percpu_ref_kill(), then hlist_del_rcu() and sychronize_rcu() to remove
+ * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
* the kioctx from the proccess's list of kioctxs - after that, there can't be
* any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
* the initial ref with percpu_ref_put().
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
+ #include <linux/gfp.h>
struct percpu_ref;
typedef void (percpu_ref_func_t)(struct percpu_ref *);
+ /* flags set in the lower bits of percpu_ref->percpu_count_ptr */
+ enum {
+ __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */
+ __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */
+ __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,
+
+ __PERCPU_REF_FLAG_BITS = 2,
+ };
+
+ /* @flags for percpu_ref_init() */
+ enum {
+ /*
+ * Start w/ ref == 1 in atomic mode. Can be switched to percpu
+ * operation using percpu_ref_switch_to_percpu(). If initialized
+ * with this flag, the ref will stay in atomic mode until
+ * percpu_ref_switch_to_percpu() is invoked on it.
+ */
+ PERCPU_REF_INIT_ATOMIC = 1 << 0,
+
+ /*
+ * Start dead w/ ref == 0 in atomic mode. Must be revived with
+ * percpu_ref_reinit() before used. Implies INIT_ATOMIC.
+ */
+ PERCPU_REF_INIT_DEAD = 1 << 1,
+ };
+
struct percpu_ref {
- atomic_t count;
+ atomic_long_t count;
/*
* The low bit of the pointer indicates whether the ref is in percpu
* mode; if set, then get/put will manipulate the atomic_t.
*/
- unsigned long pcpu_count_ptr;
+ unsigned long percpu_count_ptr;
percpu_ref_func_t *release;
- percpu_ref_func_t *confirm_kill;
+ percpu_ref_func_t *confirm_switch;
+ bool force_atomic:1;
struct rcu_head rcu;
};
int __must_check percpu_ref_init(struct percpu_ref *ref,
- percpu_ref_func_t *release);
- void percpu_ref_reinit(struct percpu_ref *ref);
+ percpu_ref_func_t *release, unsigned int flags,
+ gfp_t gfp);
void percpu_ref_exit(struct percpu_ref *ref);
+ void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
+ percpu_ref_func_t *confirm_switch);
+ void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill);
- void __percpu_ref_kill_expedited(struct percpu_ref *ref);
+ void percpu_ref_reinit(struct percpu_ref *ref);
/**
* percpu_ref_kill - drop the initial ref
return percpu_ref_kill_and_confirm(ref, NULL);
}
- #define PCPU_REF_DEAD 1
-
/*
* Internal helper. Don't use outside percpu-refcount proper. The
* function doesn't return the pointer and let the caller test it for NULL
* because doing so forces the compiler to generate two conditional
- * branches as it can't assume that @ref->pcpu_count is not NULL.
+ * branches as it can't assume that @ref->percpu_count is not NULL.
*/
- static inline bool __pcpu_ref_alive(struct percpu_ref *ref,
- unsigned __percpu **pcpu_countp)
+ static inline bool __ref_is_percpu(struct percpu_ref *ref,
+ unsigned long __percpu **percpu_countp)
{
- unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr);
+ unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr);
/* paired with smp_store_release() in percpu_ref_reinit() */
smp_read_barrier_depends();
- if (unlikely(pcpu_ptr & PCPU_REF_DEAD))
+ if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC))
return false;
- *pcpu_countp = (unsigned __percpu *)pcpu_ptr;
+ *percpu_countp = (unsigned long __percpu *)percpu_ptr;
return true;
}
* percpu_ref_get - increment a percpu refcount
* @ref: percpu_ref to get
*
- * Analagous to atomic_inc().
- */
+ * Analagous to atomic_long_inc().
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
static inline void percpu_ref_get(struct percpu_ref *ref)
{
- unsigned __percpu *pcpu_count;
+ unsigned long __percpu *percpu_count;
rcu_read_lock_sched();
- if (__pcpu_ref_alive(ref, &pcpu_count))
- this_cpu_inc(*pcpu_count);
+ if (__ref_is_percpu(ref, &percpu_count))
+ this_cpu_inc(*percpu_count);
else
- atomic_inc(&ref->count);
+ atomic_long_inc(&ref->count);
rcu_read_unlock_sched();
}
* Increment a percpu refcount unless its count already reached zero.
* Returns %true on success; %false on failure.
*
- * The caller is responsible for ensuring that @ref stays accessible.
+ * This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
- unsigned __percpu *pcpu_count;
- int ret = false;
+ unsigned long __percpu *percpu_count;
+ int ret;
rcu_read_lock_sched();
- if (__pcpu_ref_alive(ref, &pcpu_count)) {
- this_cpu_inc(*pcpu_count);
+ if (__ref_is_percpu(ref, &percpu_count)) {
+ this_cpu_inc(*percpu_count);
ret = true;
} else {
- ret = atomic_inc_not_zero(&ref->count);
+ ret = atomic_long_inc_not_zero(&ref->count);
}
rcu_read_unlock_sched();
* Increment a percpu refcount unless it has already been killed. Returns
* %true on success; %false on failure.
*
- * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget
- * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be
- * used. After the confirm_kill callback is invoked, it's guaranteed that
- * no new reference will be given out by percpu_ref_tryget().
+ * Completion of percpu_ref_kill() in itself doesn't guarantee that this
+ * function will fail. For such guarantee, percpu_ref_kill_and_confirm()
+ * should be used. After the confirm_kill callback is invoked, it's
+ * guaranteed that no new reference will be given out by
+ * percpu_ref_tryget_live().
*
- * The caller is responsible for ensuring that @ref stays accessible.
+ * This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
- unsigned __percpu *pcpu_count;
+ unsigned long __percpu *percpu_count;
int ret = false;
rcu_read_lock_sched();
- if (__pcpu_ref_alive(ref, &pcpu_count)) {
- this_cpu_inc(*pcpu_count);
+ if (__ref_is_percpu(ref, &percpu_count)) {
+ this_cpu_inc(*percpu_count);
ret = true;
+ } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) {
+ ret = atomic_long_inc_not_zero(&ref->count);
}
rcu_read_unlock_sched();
*
* Decrement the refcount, and if 0, call the release function (which was passed
* to percpu_ref_init())
+ *
+ * This function is safe to call as long as @ref is between init and exit.
*/
static inline void percpu_ref_put(struct percpu_ref *ref)
{
- unsigned __percpu *pcpu_count;
+ unsigned long __percpu *percpu_count;
rcu_read_lock_sched();
- if (__pcpu_ref_alive(ref, &pcpu_count))
- this_cpu_dec(*pcpu_count);
- else if (unlikely(atomic_dec_and_test(&ref->count)))
+ if (__ref_is_percpu(ref, &percpu_count))
+ this_cpu_dec(*percpu_count);
+ else if (unlikely(atomic_long_dec_and_test(&ref->count)))
ref->release(ref);
rcu_read_unlock_sched();
* @ref: percpu_ref to test
*
* Returns %true if @ref reached zero.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
{
- unsigned __percpu *pcpu_count;
+ unsigned long __percpu *percpu_count;
- if (__pcpu_ref_alive(ref, &pcpu_count))
+ if (__ref_is_percpu(ref, &percpu_count))
return false;
- return !atomic_read(&ref->count);
+ return !atomic_long_read(&ref->count);
}
#endif
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
-static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
return false;
}
-static int cgroup_is_releasable(const struct cgroup *cgrp)
-{
- const int bits =
- (1 << CGRP_RELEASABLE) |
- (1 << CGRP_NOTIFY_ON_RELEASE);
- return (cgrp->flags & bits) == bits;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
; \
else
-/* the list of cgroups eligible for automatic release. Protected by
- * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
/*
return key;
}
-static void put_css_set_locked(struct css_set *cset, bool taskexit)
+static void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
/* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links)) {
cgroup_update_populated(cgrp, false);
- if (notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
+ check_for_release(cgrp);
}
kfree(link);
kfree_rcu(cset, rcu_head);
}
-static void put_css_set(struct css_set *cset, bool taskexit)
+static void put_css_set(struct css_set *cset)
{
/*
* Ensure that the refcount doesn't hit zero while any readers
return;
down_write(&css_set_rwsem);
- put_css_set_locked(cset, taskexit);
+ put_css_set_locked(cset);
up_write(&css_set_rwsem);
}
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
- * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
- * (usually) take cgroup_mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cgroup_exit(),
- * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
- * is taken, and if the cgroup count is zero, a usermode call made
- * to the release agent with the name of the cgroup (path relative to
- * the root of cgroup file system) as the argument.
- *
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
- INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
+ INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
}
static void init_cgroup_root(struct cgroup_root *root,
goto out;
root_cgrp->id = ret;
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+ GFP_KERNEL);
if (ret)
goto out;
* task. As trading it for new_cset is protected by cgroup_mutex,
* we're safe to drop it here; it will be freed under RCU.
*/
- set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
- put_css_set_locked(old_cset, false);
+ put_css_set_locked(old_cset);
}
/**
cset->mg_src_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
- put_css_set_locked(cset, false);
+ put_css_set_locked(cset);
}
up_write(&css_set_rwsem);
}
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
- put_css_set(src_cset, false);
- put_css_set(dst_cset, false);
+ put_css_set(src_cset);
+ put_css_set(dst_cset);
continue;
}
if (list_empty(&dst_cset->mg_preload_node))
list_add(&dst_cset->mg_preload_node, &csets);
else
- put_css_set(dst_cset, false);
+ put_css_set(dst_cset);
}
list_splice_tail(&csets, preloaded_csets);
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
else
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
+ cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
/*
init_and_link_css(css, ss, cgrp);
- err = percpu_ref_init(&css->refcnt, css_release);
+ err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
if (err)
goto err_free_css;
goto out_unlock;
}
- ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
if (ret)
goto out_free_cgrp;
for_each_css(css, ssid, cgrp)
kill_css(css);
- /* CSS_ONLINE is clear, remove from ->release_list for the last time */
- raw_spin_lock(&release_list_lock);
- if (!list_empty(&cgrp->release_list))
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
-
/*
* Remove @cgrp directory along with the base files. @cgrp has an
* extra ref on its kn.
*/
kernfs_remove(cgrp->kn);
- set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
check_for_release(cgroup_parent(cgrp));
/* put the base reference */
cgrp = cgroup_kn_lock_live(kn);
if (!cgrp)
return 0;
- cgroup_get(cgrp); /* for @kn->priv clearing */
ret = cgroup_destroy_locked(cgrp);
cgroup_kn_unlock(kn);
-
- cgroup_put(cgrp);
return ret;
}
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
*/
-
-/* TODO: Use a proper seq_file iterator */
-int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *path;
int retval;
struct cgroup_root *root;
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
- retval = 0;
-
mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
seq_putc(m, '\n');
}
+ retval = 0;
out_unlock:
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
- put_task_struct(tsk);
-out_free:
kfree(buf);
out:
return retval;
int i;
/*
- * This may race against cgroup_enable_task_cg_links(). As that
+ * This may race against cgroup_enable_task_cg_lists(). As that
* function sets use_task_css_set_links before grabbing
* tasklist_lock and we just went through tasklist_lock to add
* @child, it's guaranteed that either we see the set
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
- * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * Note that if we lose to cgroup_enable_task_cg_lists(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
}
if (put_cset)
- put_css_set(cset, true);
+ put_css_set(cset);
}
static void check_for_release(struct cgroup *cgrp)
{
- if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
- !css_has_online_children(&cgrp->self)) {
- /*
- * Control Group is currently removeable. If it's not
- * already queued for a userspace notification, queue
- * it now
- */
- int need_schedule_work = 0;
-
- raw_spin_lock(&release_list_lock);
- if (!cgroup_is_dead(cgrp) &&
- list_empty(&cgrp->release_list)) {
- list_add(&cgrp->release_list, &release_list);
- need_schedule_work = 1;
- }
- raw_spin_unlock(&release_list_lock);
- if (need_schedule_work)
- schedule_work(&release_agent_work);
- }
+ if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
}
/*
*/
static void cgroup_release_agent(struct work_struct *work)
{
- BUG_ON(work != &release_agent_work);
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *argv[3], *envp[3];
+
mutex_lock(&cgroup_mutex);
- raw_spin_lock(&release_list_lock);
- while (!list_empty(&release_list)) {
- char *argv[3], *envp[3];
- int i;
- char *pathbuf = NULL, *agentbuf = NULL, *path;
- struct cgroup *cgrp = list_entry(release_list.next,
- struct cgroup,
- release_list);
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!pathbuf)
- goto continue_free;
- path = cgroup_path(cgrp, pathbuf, PATH_MAX);
- if (!path)
- goto continue_free;
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!agentbuf)
- goto continue_free;
-
- i = 0;
- argv[i++] = agentbuf;
- argv[i++] = path;
- argv[i] = NULL;
-
- i = 0;
- /* minimal command environment */
- envp[i++] = "HOME=/";
- envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[i] = NULL;
-
- /* Drop the lock while we invoke the usermode helper,
- * since the exec could involve hitting disk and hence
- * be a slow process */
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- mutex_lock(&cgroup_mutex);
- continue_free:
- kfree(pathbuf);
- kfree(agentbuf);
- raw_spin_lock(&release_list_lock);
- }
- raw_spin_unlock(&release_list_lock);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = path;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
}
static int __init cgroup_disable(char *str)
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ return (!cgroup_has_tasks(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
}
static struct cftype debug_files[] = {
bdi_wb_init(&bdi->wb, bdi);
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+ err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
if (err)
goto err;
}
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;
- err = fprop_local_init_percpu(&bdi->completions);
+ err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
if (err) {
err:
* of sleeping on the congestion queue
*/
if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
- !zone_is_reclaim_congested(zone)) {
+ !test_bit(ZONE_CONGESTED, &zone->flags)) {
cond_resched();
/* In case we scheduled, work out time remaining */
* MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (yes) yes w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
+ *
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
- unsigned long rlim, retval;
+ unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
- rlim = rlimit(RLIMIT_DATA);
- if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
newbrk = PAGE_ALIGN(brk);
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
- pr_emerg("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ pr_emerg("vm_start %lx < prev %lx\n",
+ vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
- pr_emerg("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ pr_emerg("vm_start %lx < pend %lx\n",
+ vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
- pr_emerg("vm_end %lx < vm_start %lx\n",
- vma->vm_end, vma->vm_start);
+ pr_emerg("vm_start %lx > vm_end %lx\n",
+ vma->vm_start, vma->vm_end);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- BUG_ON(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
+ VM_BUG_ON_VMA(vma != ignore &&
+ vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
+ vma);
}
}
int i = 0;
unsigned long highest_address = 0;
struct vm_area_struct *vma = mm->mmap;
+
while (vma) {
struct anon_vma_chain *avc;
+
vma_lock_anon_vma(vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
}
if (highest_address != mm->highest_vm_end) {
pr_emerg("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
+ mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count) {
- pr_emerg("map_count %d rb %d\n", mm->map_count, i);
+ if (i != -1)
+ pr_emerg("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
- BUG_ON(bug);
+ VM_BUG_ON_MM(bug, mm);
}
#else
#define validate_mm_rb(root, ignore) do { } while (0)
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
- adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
+ adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
exporter = vma;
importer = next;
}
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
if (anon_vma) {
- VM_BUG_ON(adjust_next && next->anon_vma &&
- anon_vma != next->anon_vma);
+ VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma, next);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
+ struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
+ mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff)) {
/*
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
+ mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen)) {
if (prev && addr < prev->vm_end) /* case 4 */
unsigned long flags, unsigned long pgoff,
unsigned long *populate)
{
- struct mm_struct * mm = current->mm;
+ struct mm_struct *mm = current->mm;
vm_flags_t vm_flags;
*populate = 0;
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EOVERFLOW;
+ return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
info.align_mask = 0;
return vm_unmapped_area(&info);
}
-#endif
+#endif
/*
* This mmap-allocator allocates new areas top-down from below the
}
struct vm_area_struct *
-find_extend_vma(struct mm_struct * mm, unsigned long addr)
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct * vma;
+ struct vm_area_struct *vma;
unsigned long start;
addr &= PAGE_MASK;
- vma = find_vma(mm,addr);
+ vma = find_vma(mm, addr);
if (!vma)
return NULL;
if (vma->vm_start <= addr)
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
+ struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
lru_add_drain();
* __split_vma() bypasses sysctl_max_map_count checking. We use this on the
* munmap path where it doesn't make sense to fail.
*/
-static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
- if ((len = PAGE_ALIGN(len)) == 0)
+ len = PAGE_ALIGN(len);
+ if (len == 0)
return -EINVAL;
/* Find the first overlapping VMA */
if (error)
return error;
}
- vma = prev? prev->vm_next: mm->mmap;
+ vma = prev ? prev->vm_next : mm->mmap;
/*
* unlock any mlock()ed ranges before detaching vmas
*/
static unsigned long do_brk(unsigned long addr, unsigned long len)
{
- struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
unsigned long flags;
- struct rb_node ** rb_link, * rb_parent;
+ struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
* safe. It is only safe to keep the vm_pgoff
* linear if there are no pages mapped yet.
*/
- VM_BUG_ON(faulted_in_anon_vma);
+ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
*vmap = vma = new_vma;
}
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
{
int ret;
- ret = percpu_counter_init(&vm_committed_as, 0);
+ ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
}
}
if (dirty < setpoint) {
- x = min(bdi->balanced_dirty_ratelimit,
- min(balanced_dirty_ratelimit, task_ratelimit));
+ x = min3(bdi->balanced_dirty_ratelimit,
+ balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit < x)
step = x - dirty_ratelimit;
} else {
- x = max(bdi->balanced_dirty_ratelimit,
- max(balanced_dirty_ratelimit, task_ratelimit));
+ x = max3(bdi->balanced_dirty_ratelimit,
+ balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit > x)
step = dirty_ratelimit - x;
}
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
- fprop_global_init(&writeout_completions);
+ fprop_global_init(&writeout_completions, GFP_KERNEL);
}
/**
if (new_dentry->d_inode) {
(void) shmem_unlink(new_dir, new_dentry);
- if (they_are_dirs)
+ if (they_are_dirs) {
+ drop_nlink(new_dentry->d_inode);
drop_nlink(old_dir);
+ }
} else if (they_are_dirs) {
drop_nlink(old_dir);
inc_nlink(new_dir);
#endif
spin_lock_init(&sbinfo->stat_lock);
- if (percpu_counter_init(&sbinfo->used_blocks, 0))
+ if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
sbinfo->free_inodes = sbinfo->max_inodes;
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
#endif
+#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
+#endif
.error_remove_page = generic_error_remove_page,
};
default:
dccp_pr_debug("packet_type=%s\n",
dccp_packet_name(dh->dccph_type));
- sk_eat_skb(sk, skb, false);
+ sk_eat_skb(sk, skb);
}
verify_sock_status:
if (sock_flag(sk, SOCK_DONE)) {
len = skb->len;
found_fin_ok:
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb, false);
+ sk_eat_skb(sk, skb);
break;
} while (1);
out:
EXPORT_SYMBOL_GPL(dccp_shutdown);
-static inline int dccp_mib_init(void)
+static inline int __init dccp_mib_init(void)
{
dccp_statistics = alloc_percpu(struct dccp_mib);
if (!dccp_statistics)
BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
FIELD_SIZEOF(struct sk_buff, cb));
- rc = percpu_counter_init(&dccp_orphan_count, 0);
+ rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
if (rc)
goto out_fail;
rc = -ENOBUFS;
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
-#include <net/netdma.h>
#include <net/sock.h>
#include <asm/uaccess.h>
tp->reordering = sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
- icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
-static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
+static void skb_entail(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
tcb->seq = tcb->end_seq = tp->write_seq;
tcb->tcp_flags = TCPHDR_ACK;
tcb->sacked = 0;
- skb_header_release(skb);
+ __skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
skb->ip_summed = CHECKSUM_PARTIAL;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->gso_segs = 0;
+ tcp_skb_pcount_set(skb, 0);
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
- skb_shinfo(skb)->gso_segs = 0;
+ tcp_skb_pcount_set(skb, 0);
from += copy;
copied += copy;
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-void tcp_cleanup_rbuf(struct sock *sk, int copied)
+static void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
tp->ucopy.memory = 0;
}
-#ifdef CONFIG_NET_DMA
-static void tcp_service_net_dma(struct sock *sk, bool wait)
-{
- dma_cookie_t done, used;
- dma_cookie_t last_issued;
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (!tp->ucopy.dma_chan)
- return;
-
- last_issued = tp->ucopy.dma_cookie;
- dma_async_issue_pending(tp->ucopy.dma_chan);
-
- do {
- if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
- last_issued, &done,
- &used) == DMA_COMPLETE) {
- /* Safe to free early-copied skbs now */
- __skb_queue_purge(&sk->sk_async_wait_queue);
- break;
- } else {
- struct sk_buff *skb;
- while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
- (dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_COMPLETE)) {
- __skb_dequeue(&sk->sk_async_wait_queue);
- kfree_skb(skb);
- }
- }
- } while (wait);
-}
-#endif
-
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
offset = seq - TCP_SKB_CB(skb)->seq;
- if (tcp_hdr(skb)->syn)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
offset--;
- if (offset < skb->len || tcp_hdr(skb)->fin) {
+ if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
*off = offset;
return skb;
}
* splitted a fat GRO packet, while we released socket lock
* in skb_splice_bits()
*/
- sk_eat_skb(sk, skb, false);
+ sk_eat_skb(sk, skb);
}
return NULL;
}
if (offset + 1 != skb->len)
continue;
}
- if (tcp_hdr(skb)->fin) {
- sk_eat_skb(sk, skb, false);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ sk_eat_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb, false);
+ sk_eat_skb(sk, skb);
if (!desc->count)
break;
tp->copied_seq = seq;
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
- bool copied_early = false;
struct sk_buff *skb;
u32 urg_hole = 0;
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
-#ifdef CONFIG_NET_DMA
- tp->ucopy.dma_chan = NULL;
- preempt_disable();
- skb = skb_peek_tail(&sk->sk_receive_queue);
- {
- int available = 0;
-
- if (skb)
- available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
- if ((available < target) &&
- (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
- !sysctl_tcp_low_latency &&
- net_dma_find_channel()) {
- preempt_enable();
- tp->ucopy.pinned_list =
- dma_pin_iovec_pages(msg->msg_iov, len);
- } else {
- preempt_enable();
- }
- }
-#endif
-
do {
u32 offset;
break;
offset = *seq - TCP_SKB_CB(skb)->seq;
- if (tcp_hdr(skb)->syn)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
offset--;
if (offset < skb->len)
goto found_ok_skb;
- if (tcp_hdr(skb)->fin)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
WARN(!(flags & MSG_PEEK),
"recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
/* __ Set realtime policy in scheduler __ */
}
-#ifdef CONFIG_NET_DMA
- if (tp->ucopy.dma_chan) {
- if (tp->rcv_wnd == 0 &&
- !skb_queue_empty(&sk->sk_async_wait_queue)) {
- tcp_service_net_dma(sk, true);
- tcp_cleanup_rbuf(sk, copied);
- } else
- dma_async_issue_pending(tp->ucopy.dma_chan);
- }
-#endif
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
} else
sk_wait_data(sk, &timeo);
-#ifdef CONFIG_NET_DMA
- tcp_service_net_dma(sk, false); /* Don't block */
- tp->ucopy.wakeup = 0;
-#endif
-
if (user_recv) {
int chunk;
}
if (!(flags & MSG_TRUNC)) {
-#ifdef CONFIG_NET_DMA
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = net_dma_find_channel();
-
- if (tp->ucopy.dma_chan) {
- tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
- tp->ucopy.dma_chan, skb, offset,
- msg->msg_iov, used,
- tp->ucopy.pinned_list);
-
- if (tp->ucopy.dma_cookie < 0) {
-
- pr_alert("%s: dma_cookie < 0\n",
- __func__);
-
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
- break;
- }
-
- dma_async_issue_pending(tp->ucopy.dma_chan);
-
- if ((offset + used) == skb->len)
- copied_early = true;
-
- } else
-#endif
- {
- err = skb_copy_datagram_iovec(skb, offset,
- msg->msg_iov, used);
- if (err) {
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
- break;
- }
+ err = skb_copy_datagram_iovec(skb, offset,
+ msg->msg_iov, used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
}
}
if (used + offset < skb->len)
continue;
- if (tcp_hdr(skb)->fin)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
- if (!(flags & MSG_PEEK)) {
- sk_eat_skb(sk, skb, copied_early);
- copied_early = false;
- }
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
continue;
found_fin_ok:
/* Process the FIN. */
++*seq;
- if (!(flags & MSG_PEEK)) {
- sk_eat_skb(sk, skb, copied_early);
- copied_early = false;
- }
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
break;
} while (len > 0);
tp->ucopy.len = 0;
}
-#ifdef CONFIG_NET_DMA
- tcp_service_net_dma(sk, true); /* Wait for queue to drain */
- tp->ucopy.dma_chan = NULL;
-
- if (tp->ucopy.pinned_list) {
- dma_unpin_iovec_pages(tp->ucopy.pinned_list);
- tp->ucopy.pinned_list = NULL;
- }
-#endif
-
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
* reader process may not have drained the data yet!
*/
while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
- tcp_hdr(skb)->fin;
+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ len--;
data_was_unread += len;
__kfree_skb(skb);
}
__skb_queue_purge(&sk->sk_receive_queue);
tcp_write_queue_purge(sk);
__skb_queue_purge(&tp->out_of_order_queue);
-#ifdef CONFIG_NET_DMA
- __skb_queue_purge(&sk->sk_async_wait_queue);
-#endif
inet->inet_dport = 0;
break;
#endif
case TCP_USER_TIMEOUT:
- /* Cap the max timeout in ms TCP will retry/retrans
+ /* Cap the max time in ms TCP will retry or probe the window
* before giving up and aborting (ETIMEDOUT) a connection.
*/
if (val < 0)
}
__setup("thash_entries=", set_thash_entries);
-static void tcp_init_mem(void)
+static void __init tcp_init_mem(void)
{
unsigned long limit = nr_free_buffer_pages() / 8;
limit = max(limit, 128UL);
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
- percpu_counter_init(&tcp_sockets_allocated, 0);
- percpu_counter_init(&tcp_orphan_count, 0);
+ percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
+ percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
tcp_metrics_init();
-
- tcp_register_congestion_control(&tcp_reno);
-
+ BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
}
if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) &&
ret != RTN_LOCAL &&
!sp->inet.freebind &&
- !sysctl_ip_nonlocal_bind)
+ !net->ipv4.sysctl_ip_nonlocal_bind)
return 0;
if (ipv6_only_sock(sctp_opt2sk(sp)))
if (!sctp_chunk_cachep)
goto err_chunk_cachep;
- status = percpu_counter_init(&sctp_sockets_allocated, 0);
+ status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL);
if (status)
goto err_percpu_counter_init;