static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
- int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
+ int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
return &conf->stripe_hashtbl[hash];
}
-static inline int stripe_hash_locks_hash(sector_t sect)
+static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
{
- return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+ return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
- int hash = stripe_hash_locks_hash(sector);
+ int hash = stripe_hash_locks_hash(conf, sector);
int inc_empty_inactive_list_flag;
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
tmp_sec = sh->sector;
if (!sector_div(tmp_sec, conf->chunk_sectors))
return;
- head_sector = sh->sector - STRIPE_SECTORS;
+ head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
- hash = stripe_hash_locks_hash(head_sector);
+ hash = stripe_hash_locks_hash(conf, head_sector);
spin_lock_irq(conf->hash_locks + hash);
head = __find_stripe(conf, head_sector, conf->generation);
if (head && !atomic_inc_not_zero(&head->count)) {
struct bio *bio;
while ((bio = bio_list_pop(tmp)))
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
int bad_sectors;
- int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (!bad)
break;
if (rdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+ md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
else
sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+ bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
bi->bi_io_vec[0].bv_offset = 0;
- bi->bi_iter.bi_size = STRIPE_SIZE;
+ bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, bi);
else
- generic_make_request(bi);
+ submit_bio_noacct(bi);
}
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
- md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
+ md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
sh->dev[i].rvec.bv_page = sh->dev[i].page;
rbi->bi_vcnt = 1;
- rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+ rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
rbi->bi_io_vec[0].bv_offset = 0;
- rbi->bi_iter.bi_size = STRIPE_SIZE;
+ rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
rbi->bi_write_hint = sh->dev[i].write_hint;
sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, rbi);
else
- generic_make_request(rbi);
+ submit_bio_noacct(rbi);
}
if (!rdev && !rrdev) {
if (op_is_write(op))
int page_offset;
struct async_submit_ctl submit;
enum async_tx_flags flags = 0;
+ struct r5conf *conf = sh->raid_conf;
if (bio->bi_iter.bi_sector >= sector)
page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
len -= b_offset;
}
- if (len > 0 && page_offset + len > STRIPE_SIZE)
- clen = STRIPE_SIZE - page_offset;
+ if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
+ clen = RAID5_STRIPE_SIZE(conf) - page_offset;
else
clen = len;
b_offset += bvl.bv_offset;
bio_page = bvl.bv_page;
if (frombio) {
- if (sh->raid_conf->skip_copy &&
+ if (conf->skip_copy &&
b_offset == 0 && page_offset == 0 &&
- clen == STRIPE_SIZE &&
+ clen == RAID5_STRIPE_SIZE(conf) &&
!no_skipcopy)
*page = bio_page;
else
{
struct stripe_head *sh = stripe_head_ref;
int i;
+ struct r5conf *conf = sh->raid_conf;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
rbi = dev->read;
dev->read = NULL;
while (rbi && rbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
- rbi2 = r5_next_bio(rbi, dev->sector);
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
+ rbi2 = r5_next_bio(conf, rbi, dev->sector);
bio_endio(rbi);
rbi = rbi2;
}
struct dma_async_tx_descriptor *tx = NULL;
struct async_submit_ctl submit;
int i;
+ struct r5conf *conf = sh->raid_conf;
BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu\n", __func__,
dev->toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
while (rbi && rbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
tx = async_copy_data(0, rbi, &dev->page,
dev->sector, tx, sh, 0);
- rbi = r5_next_bio(rbi, dev->sector);
+ rbi = r5_next_bio(conf, rbi, dev->sector);
}
}
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
if (unlikely(count == 1))
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+ tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
else
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor(xor_dest, xor_srcs, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, 0, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
} else {
/* Compute any data- or p-drive using XOR */
count = 0;
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor(dest, blocks, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
}
return tx;
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, syndrome_disks+2,
- STRIPE_SIZE, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ &submit);
} else {
struct page *dest;
int data_target;
ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, NULL, NULL,
to_addr_conv(sh, percpu, 0));
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
+ tx = async_xor(dest, blocks, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
&submit);
count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, count+2,
- STRIPE_SIZE, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ &submit);
}
} else {
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
if (failb == syndrome_disks) {
/* We're missing D+P. */
return async_raid6_datap_recov(syndrome_disks+2,
- STRIPE_SIZE, faila,
- blocks, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ faila,
+ blocks, &submit);
} else {
/* We're missing D+D. */
return async_raid6_2data_recov(syndrome_disks+2,
- STRIPE_SIZE, faila, failb,
- blocks, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ faila, failb,
+ blocks, &submit);
}
}
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor(xor_dest, xor_srcs, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, 0, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
WARN_ON(dev->page != dev->orig_page);
while (wbi && wbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
if (wbi->bi_opf & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_opf & REQ_SYNC)
clear_bit(R5_OVERWRITE, &dev->flags);
}
}
- wbi = r5_next_bio(wbi, dev->sector);
+ wbi = r5_next_bio(conf, wbi, dev->sector);
}
if (head_sh->batch_head) {
}
if (unlikely(count == 1))
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+ tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
else
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor(xor_dest, xor_srcs, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
} else
init_async_submit(&submit, 0, tx, NULL, NULL,
to_addr_conv(sh, percpu, j));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, 0, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
init_async_submit(&submit, 0, NULL, NULL, NULL,
to_addr_conv(sh, percpu, 0));
- tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+ tx = async_xor_val(xor_dest, xor_srcs, 0, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
&sh->ops.zero_sum_result, &submit);
atomic_inc(&sh->count);
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
sh, to_addr_conv(sh, percpu, 0));
- async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
+ async_syndrome_val(srcs, 0, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
/**
* scribble_alloc - allocate percpu scribble buffer for required size
* of the scribble region
- * @percpu - from for_each_present_cpu() of the caller
- * @num - total number of disks in the array
- * @cnt - scribble objs count for required size of the scribble region
+ * @percpu: from for_each_present_cpu() of the caller
+ * @num: total number of disks in the array
+ * @cnt: scribble objs count for required size of the scribble region
*
* The scribble buffer size must be enough to contain:
* 1/ a struct page pointer for each device in the array +2
percpu = per_cpu_ptr(conf->percpu, cpu);
err = scribble_alloc(percpu, new_disks,
- new_sectors / STRIPE_SECTORS);
+ new_sectors / RAID5_STRIPE_SECTORS(conf));
if (err)
break;
}
*/
pr_info_ratelimited(
"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
- mdname(conf->mddev), STRIPE_SECTORS,
+ mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
(unsigned long long)s,
bdevname(rdev->bdev, b));
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+ atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
if (!(set_bad
&& test_bit(In_sync, &rdev->flags)
&& rdev_set_badblocks(
- rdev, sh->sector, STRIPE_SECTORS, 0)))
+ rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
md_error(conf->mddev, rdev);
}
}
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
- struct md_rdev *uninitialized_var(rdev);
+ struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int replacement = 0;
if (bi->bi_status)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors)) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags))
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
for (bi=sh->dev[dd_idx].towrite;
- sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
+ sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
bi && bi->bi_iter.bi_sector <= sector;
- bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
+ bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
if (bio_end_sector(bi) >= sector)
sector = bio_end_sector(bi);
}
- if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
+ if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
sh->overwrite_disks++;
}
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
if (!rdev_set_badblocks(
rdev,
sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
wake_up(&conf->wait_for_overlap);
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
- struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
+ struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
md_write_end(conf->mddev);
bio_io_error(bi);
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0, 0);
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
- struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
+ struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
md_write_end(conf->mddev);
bio_io_error(bi);
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *nextbi =
- r5_next_bio(bi, sh->dev[i].sector);
+ r5_next_bio(conf, bi, sh->dev[i].sector);
bio_io_error(bi);
bi = nextbi;
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0, 0);
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
}
rcu_read_unlock();
conf->recovery_disabled =
conf->mddev->recovery_disabled;
}
- md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
}
static int want_replace(struct stripe_head *sh, int disk_idx)
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
&sh->dev[s->failed_num[1]] };
int i;
+ bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
if (test_bit(R5_LOCKED, &dev->flags) ||
* devices must be read.
*/
return 1;
+
+ if (s->failed >= 2 &&
+ (fdev[i]->towrite ||
+ s->failed_num[i] == sh->pd_idx ||
+ s->failed_num[i] == sh->qd_idx) &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags))
+ /* In max degraded raid6, If the failed disk is P, Q,
+ * or we want to read the failed disk, we need to do
+ * reconstruct-write.
+ */
+ force_rcw = true;
}
- /* If we are forced to do a reconstruct-write, either because
- * the current RAID6 implementation only supports that, or
- * because parity cannot be trusted and we are currently
- * recovering it, there is extra need to be careful.
+ /* If we are forced to do a reconstruct-write, because parity
+ * cannot be trusted and we are currently recovering it, there
+ * is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
* is missing/faulty, then we need to read everything we can.
*/
- if (sh->raid_conf->level != 6 &&
+ if (!force_rcw &&
sh->sector < sh->raid_conf->mddev->recovery_cp)
/* reconstruct-write isn't being forced */
return 0;
return 0;
}
-/**
+/*
* handle_stripe_fill - read or compute data to satisfy pending requests.
*/
static void handle_stripe_fill(struct stripe_head *sh,
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
md_write_end(conf->mddev);
bio_endio(wbi);
wbi = wbi2;
}
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
if (head_sh->batch_head) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
- } else {
+ } else
set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
}
}
}
set_bit(R5_Wantread, &dev->flags);
s->locked++;
qread++;
- } else {
+ } else
set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
}
}
if (rcw && conf->mddev->queue)
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
"%llu-%llu\n", mdname(conf->mddev),
(unsigned long long) sh->sector,
(unsigned long long) sh->sector +
- STRIPE_SECTORS);
+ RAID5_STRIPE_SECTORS(conf));
} else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
*/
}
} else {
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
"%llu-%llu\n", mdname(conf->mddev),
(unsigned long long) sh->sector,
(unsigned long long) sh->sector +
- STRIPE_SECTORS);
+ RAID5_STRIPE_SECTORS(conf));
} else {
int *target = &sh->ops.target;
/* place all the copies on one channel */
init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
tx = async_memcpy(sh2->dev[dd_idx].page,
- sh->dev[i].page, 0, 0, STRIPE_SIZE,
+ sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf),
&submit);
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
*/
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev && !test_bit(Faulty, &rdev->flags) &&
- rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
- !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
+ !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
set_bit(R5_ReadRepl, &dev->flags);
else {
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev) {
- is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
}
} else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
- else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
/* in sync if before recovery_offset */
set_bit(R5_Insync, &dev->flags);
else if (test_bit(R5_UPTODATE, &dev->flags) &&
rcu_read_unlock();
}
+/*
+ * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
+ * a head which can now be handled.
+ */
static int clear_batch_ready(struct stripe_head *sh)
{
- /* Return '1' if this is a member of batch, or
- * '0' if it is a lone stripe or a head which can now be
- * handled.
- */
struct stripe_head *tmp;
if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
return (sh->batch_head && sh->batch_head != sh);
struct r5dev *pdev, *qdev;
clear_bit(STRIPE_HANDLE, &sh->state);
+
+ /*
+ * handle_stripe should not continue handle the batched stripe, only
+ * the head of batch list or lone stripe can continue. Otherwise we
+ * could see break_stripe_batch_list warns about the STRIPE_ACTIVE
+ * is set for the batched stripe.
+ */
+ if (clear_batch_ready(sh))
+ return;
+
if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
/* already being handled, ensure it gets handled
* again when current action finishes */
return;
}
- if (clear_batch_ready(sh) ) {
- clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
- return;
- }
-
if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
break_stripe_batch_list(sh, 0);
* or to load a block that is being partially written.
*/
if (s.to_read || s.non_overwrite
- || (conf->level == 6 && s.to_write && s.failed)
+ || (s.to_write && s.failed)
|| (s.syncing && (s.uptodate + s.compute < disks))
|| s.replacing
|| s.expanding)
if ((s.syncing || s.replacing) && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) {
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_ReWrite, &dev->flags);
- set_bit(R5_LOCKED, &dev->flags);
- s.locked++;
- } else {
+ } else
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
- set_bit(R5_LOCKED, &dev->flags);
- s.locked++;
- }
+ set_bit(R5_LOCKED, &dev->flags);
+ s.locked++;
}
}
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
if (s.expanding && s.locked == 0 &&
/* We own a safe reference to the rdev */
rdev = conf->disks[i].rdev;
if (!rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
/* rdev have been moved down */
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
}
}
}
-static int raid5_congested(struct mddev *mddev, int bits)
-{
- struct r5conf *conf = mddev->private;
-
- /* No difference between reads and writes. Just check
- * how busy the stripe_cache is
- */
-
- if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
- return 1;
-
- /* Also checks whether there is pressure on r5cache log space */
- if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
- return 1;
- if (conf->quiesce)
- return 1;
- if (atomic_read(&conf->empty_inactive_list_nr))
- return 1;
-
- return 0;
-}
-
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
struct r5conf *conf = mddev->private;
trace_block_bio_remap(align_bi->bi_disk->queue,
align_bi, disk_devt(mddev->gendisk),
raid_bio->bi_iter.bi_sector);
- generic_make_request(align_bi);
+ submit_bio_noacct(align_bi);
return 1;
} else {
rcu_read_unlock();
struct r5conf *conf = mddev->private;
split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
bio_chain(split, raid_bio);
- generic_make_request(raid_bio);
+ submit_bio_noacct(raid_bio);
raid_bio = split;
}
/* Skip discard while reshape is happening */
return;
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
last_sector *= conf->chunk_sectors;
for (; logical_sector < last_sector;
- logical_sector += STRIPE_SECTORS) {
+ logical_sector += RAID5_STRIPE_SECTORS(conf)) {
DEFINE_WAIT(w);
int d;
again:
d++)
md_bitmap_startwrite(mddev->bitmap,
sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
0);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
return true;
}
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+ for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
int previous;
int seq;
do_flush = false;
}
- if (!sh->batch_head || sh == sh->batch_head)
- set_bit(STRIPE_HANDLE, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
sector_div(sector_nr, new_data_disks);
if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1;
retn = sector_nr;
goto finish;
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
INIT_LIST_HEAD(&stripes);
- for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
+ for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
int j;
int skipped_disk = 0;
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
skipped_disk = 1;
continue;
}
- memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+ memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
set_bit(R5_Expanded, &sh->dev[j].flags);
set_bit(R5_UPTODATE, &sh->dev[j].flags);
}
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
- first_sector += STRIPE_SECTORS;
+ first_sector += RAID5_STRIPE_SECTORS(conf);
}
/* Now that the sources are clearly marked, we can release
* the destination stripes
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
return retn;
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
- sync_blocks >= STRIPE_SECTORS) {
+ sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
- sync_blocks /= STRIPE_SECTORS;
+ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
*skipped = 1;
- return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+ /* keep things rounded to whole stripes */
+ return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
raid5_release_stripe(sh);
- return STRIPE_SECTORS;
+ return RAID5_STRIPE_SECTORS(conf);
}
static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
- ~((sector_t)STRIPE_SECTORS-1);
+ ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
sector = raid5_compute_sector(conf, logical_sector,
0, &dd_idx, NULL);
last_sector = bio_end_sector(raid_bio);
for (; logical_sector < last_sector;
- logical_sector += STRIPE_SECTORS,
- sector += STRIPE_SECTORS,
+ logical_sector += RAID5_STRIPE_SECTORS(conf),
+ sector += RAID5_STRIPE_SECTORS(conf),
scnt++) {
if (scnt < offset)
raid5_show_rmw_level,
raid5_store_rmw_level);
+static ssize_t
+raid5_show_stripe_size(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf;
+ int ret = 0;
+
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
+ if (conf)
+ ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+static ssize_t
+raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf;
+ unsigned long new;
+ int err;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+
+ /*
+ * The value should not be bigger than PAGE_SIZE. It requires to
+ * be multiple of DEFAULT_STRIPE_SIZE.
+ */
+ if (new % DEFAULT_STRIPE_SIZE != 0 || new > PAGE_SIZE || new == 0)
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ conf = mddev->private;
+ if (!conf) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+
+ if (new == conf->stripe_size)
+ goto out_unlock;
+
+ pr_debug("md/raid: change stripe_size from %lu to %lu\n",
+ conf->stripe_size, new);
+
+ mddev_suspend(mddev);
+ conf->stripe_size = new;
+ conf->stripe_shift = ilog2(new) - 9;
+ conf->stripe_sectors = new >> 9;
+ mddev_resume(mddev);
+
+out_unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_stripe_size = __ATTR(stripe_size, 0644,
+ raid5_show_stripe_size,
+ raid5_store_stripe_size);
+#else
+static struct md_sysfs_entry
+raid5_stripe_size = __ATTR(stripe_size, 0444,
+ raid5_show_stripe_size,
+ NULL);
+#endif
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
&raid5_group_thread_cnt.attr,
&raid5_skip_copy.attr,
&raid5_rmw_level.attr,
+ &raid5_stripe_size.attr,
&r5c_journal_mode.attr,
&ppl_write_hint.attr,
NULL,
conf->previous_raid_disks),
max(conf->chunk_sectors,
conf->prev_chunk_sectors)
- / STRIPE_SECTORS)) {
+ / RAID5_STRIPE_SECTORS(conf))) {
free_scratch_buffer(conf, percpu);
return -ENOMEM;
}
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
+
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ conf->stripe_size = DEFAULT_STRIPE_SIZE;
+ conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
+ conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
+#endif
INIT_LIST_HEAD(&conf->free_list);
INIT_LIST_HEAD(&conf->pending_list);
conf->pending_data = kcalloc(PENDING_IO_MAX,
} else
goto abort;
spin_lock_init(&conf->device_lock);
- seqcount_init(&conf->gen_lock);
+ seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
mutex_init(&conf->cache_size_mutex);
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
conf->min_nr_stripes = NR_STRIPES;
if (mddev->reshape_position != MaxSector) {
int stripes = max_t(int,
- ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
+ ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
conf->min_nr_stripes = max(NR_STRIPES, stripes);
if (conf->min_nr_stripes != NR_STRIPES)
pr_info("md/raid:%s: force stripe size %d for reshape\n",
* stripe_heads first.
*/
struct r5conf *conf = mddev->private;
- if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
+ if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
> conf->min_nr_stripes ||
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
> conf->min_nr_stripes) {
pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
mdname(mddev),
((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
- / STRIPE_SIZE)*4);
+ / RAID5_STRIPE_SIZE(conf))*4);
return 0;
}
return 1;
else
rdev->recovery_offset = 0;
- if (sysfs_link_rdev(mddev, rdev))
- /* Failure here is OK */;
+ /* Failure here is OK */
+ sysfs_link_rdev(mddev, rdev);
}
} else if (rdev->raid_disk >= conf->previous_raid_disks
&& !test_bit(Faulty, &rdev->flags)) {
while (chunksect && (mddev->array_sectors & (chunksect-1)))
chunksect >>= 1;
- if ((chunksect<<9) < STRIPE_SIZE)
+ if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
/* array size does not allow a suitable chunk size */
return ERR_PTR(-EINVAL);
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
- .congested = raid5_congested,
.change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid5_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
- .congested = raid5_congested,
.change_consistency_policy = raid5_change_consistency_policy,
};
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
- .congested = raid5_congested,
.change_consistency_policy = raid5_change_consistency_policy,
};
THREAD_SIZE_ORDER);
if (likely(page)) {
- tsk->stack = page_address(page);
+ tsk->stack = kasan_reset_tag(page_address(page));
return tsk->stack;
}
return NULL;
if (vm) {
int i;
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- -(int)(PAGE_SIZE / 1024));
-
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
memcg_kmem_uncharge_page(vm->pages[i], 0);
- }
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i],
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+ stack = kasan_reset_tag(stack);
tsk->stack = stack;
return stack;
}
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new) {
- *new = *orig;
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ *new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
}
void *stack = task_stack_page(tsk);
struct vm_struct *vm = task_stack_vm_area(tsk);
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
- if (vm) {
- int i;
-
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_zone_page_state(page_zone(vm->pages[i]),
- NR_KERNEL_STACK_KB,
- PAGE_SIZE / 1024 * account);
- }
- } else {
- /*
- * All stack pages are in the same zone and belong to the
- * same memcg.
- */
- struct page *first_page = virt_to_page(stack);
-
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- THREAD_SIZE / 1024 * account);
- mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
- }
+ /* All stack pages are in the same node. */
+ if (vm)
+ mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ else
+ mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
static int memcg_charge_kernel_stack(struct task_struct *tsk)
struct vm_struct *vm = task_stack_vm_area(tsk);
int ret;
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
if (vm) {
int i;
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
/*
* If memcg_kmem_charge_page() fails, page->mem_cgroup
- * pointer is NULL, and both memcg_kmem_uncharge_page()
- * and mod_memcg_page_state() in free_thread_stack()
- * will ignore this page. So it's safe.
+ * pointer is NULL, and memcg_kmem_uncharge_page() in
+ * free_thread_stack() will ignore this page.
*/
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
0);
if (ret)
return ret;
-
- mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
- PAGE_SIZE / 1024);
}
}
#endif
#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
- put_seccomp_filter(tsk);
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
goto out;
}
- newf = dup_fd(oldf, &error);
+ newf = dup_fd(oldf, NR_OPEN_MAX, &error);
if (!newf)
goto out;
*/
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
- struct task_struct *task;
struct pid *pid = file->private_data;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
- rcu_read_lock();
- task = pid_task(pid, PIDTYPE_PID);
/*
* Inform pollers only when the whole thread group exits.
* If the thread group leader exits before all other threads in the
* group, then poll(2) should block, similar to the wait(2) family.
*/
- if (!task || (task->exit_state && thread_group_empty(task)))
+ if (thread_group_exited(pid))
poll_flags = EPOLLIN | EPOLLRDNORM;
- rcu_read_unlock();
return poll_flags;
}
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
- seqcount_init(&p->mems_allowed_seq);
+ seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
- p->irq_events = 0;
- p->hardirq_enable_ip = 0;
- p->hardirq_enable_event = 0;
- p->hardirq_disable_ip = _THIS_IP_;
- p->hardirq_disable_event = 0;
- p->softirqs_enabled = 1;
- p->softirq_enable_ip = _THIS_IP_;
- p->softirq_enable_event = 0;
- p->softirq_disable_ip = 0;
- p->softirq_disable_event = 0;
- p->softirq_context = 0;
+ memset(&p->irqtrace, 0, sizeof(p->irqtrace));
+ p->irqtrace.hardirq_disable_ip = _THIS_IP_;
+ p->irqtrace.softirq_enable_ip = _THIS_IP_;
+ p->softirqs_enabled = 1;
+ p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
- args->tls);
+ retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
if (retval)
goto bad_fork_cleanup_io;
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
int trace = 0;
long nr;
+ /*
+ * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
+ * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
+ * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
+ * field in struct clone_args and it still doesn't make sense to have
+ * them both point at the same memory location. Performing this check
+ * here has the advantage that we don't need to have a separate helper
+ * to check for legacy clone().
+ */
+ if ((args->flags & CLONE_PIDFD) &&
+ (args->flags & CLONE_PARENT_SETTID) &&
+ (args->pidfd == args->parent_tid))
+ return -EINVAL;
+
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
return nr;
}
-bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
-{
- /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
- if ((kargs->flags & CLONE_PIDFD) &&
- (kargs->flags & CLONE_PARENT_SETTID))
- return false;
-
- return true;
-}
-
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-/* For compatibility with architectures that call do_fork directly rather than
- * using the syscall entry points below. */
-long do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr)
-{
- struct kernel_clone_args args = {
- .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
- .pidfd = parent_tidptr,
- .child_tid = child_tidptr,
- .parent_tid = parent_tidptr,
- .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
- .stack = stack_start,
- .stack_size = stack_size,
- };
-
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
- return _do_fork(&args);
-}
-#endif
-
/*
* Create a kernel thread.
*/
.tls = tls,
};
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
return _do_fork(&args);
}
#endif
#ifdef __ARCH_WANT_SYS_CLONE3
-/*
- * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
- * the registers containing the syscall arguments for clone. This doesn't work
- * with clone3 since the TLS value is passed in clone_args instead.
- */
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-#error clone3 requires copy_thread_tls support in arch
-#endif
-
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
/*
* Unshare file descriptor table if it is being shared
*/
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+ struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
- *new_fdp = dup_fd(fd, &error);
+ *new_fdp = dup_fd(fd, max_fds, &error);
if (!*new_fdp)
return error;
}
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
- * functions used by do_fork() cannot be used here directly
+ * functions used by _do_fork() cannot be used here directly
* because they modify an inactive task_struct that is being
* constructed. Here we are modifying the current, active,
* task_struct.
err = unshare_fs(unshare_flags, &new_fs);
if (err)
goto bad_unshare_out;
- err = unshare_fd(unshare_flags, &new_fd);
+ err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
err = unshare_userns(unshare_flags, &new_cred);
struct files_struct *copy = NULL;
int error;
- error = unshare_fd(CLONE_FILES, ©);
+ error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©);
if (error || !copy) {
*displaced = NULL;
return error;