drivers/md/raid10.c

   1 /*
   2  * raid10.c : Multiple Devices driver for Linux
   3  *
   4  * Copyright (C) 2000-2004 Neil Brown
   5  *
   6  * RAID-10 support for md.
   7  *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
   9  *
  10  *
  11  * This program is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 2, or (at your option)
  14  * any later version.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * (for example /usr/src/linux/COPYING); if not, write to the Free
  18  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19  */
  20
  21 #include <linux/slab.h>
  22 #include <linux/delay.h>
  23 #include <linux/blkdev.h>
  24 #include <linux/module.h>
  25 #include <linux/seq_file.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/kthread.h>
  28 #include "md.h"
  29 #include "raid10.h"
  30 #include "raid0.h"
  31 #include "bitmap.h"
  32
  33 /*
  34  * RAID10 provides a combination of RAID0 and RAID1 functionality.
  35  * The layout of data is defined by
  36  *    chunk_size
  37  *    raid_disks
  38  *    near_copies (stored in low byte of layout)
  39  *    far_copies (stored in second byte of layout)
  40  *    far_offset (stored in bit 16 of layout )
  41  *    use_far_sets (stored in bit 17 of layout )
  42  *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.   In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) copies of each chunk, and each is on a
 * different drive.  near_copies and far_copies must be at least one, and
 * their product is at most raid_disks.
  51  *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
  55  *
  56  * The far and offset algorithms are handled slightly differently if
  57  * 'use_far_sets' is true.  In this case, the array's devices are grouped into
  58  * sets that are (near_copies * far_copies) in size.  The far copied stripes
  59  * are still shifted by 'near_copies' devices, but this shifting stays confined
  60  * to the set rather than the entire array.  This is done to improve the number
  61  * of device combinations that can fail without causing the array to fail.
  62  * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
  63  * on a device):
  64  *    A B C D    A B C D E
  65  *      ...         ...
  66  *    D A B C    E A B C D
  67  * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
  68  *    [A B] [C D]    [A B] [C D E]
  69  *    |...| |...|    |...| | ... |
  70  *    [B A] [D C]    [B A] [E C D]
  71  */
  72
  73 /*
  74  * Number of guaranteed r10bios in case of extreme VM load:
  75  */
  76 #define NR_RAID10_BIOS 256
  77
  78 /* when we get a read error on a read-only array, we redirect to another
  79  * device without failing the first device, or trying to over-write to
  80  * correct the read error.  To keep track of bad blocks on a per-bio
  81  * level, we store IO_BLOCKED in the appropriate 'bios' pointer
  82  */
  83 #define IO_BLOCKED ((struct bio *)1)
  84 /* When we successfully write to a known bad-block, we need to remove the
  85  * bad-block marking which must be done from process context.  So we record
  86  * the success by setting devs[n].bio to IO_MADE_GOOD
  87  */
  88 #define IO_MADE_GOOD ((struct bio *)2)
  89
  90 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
  91
  92 /* When there are this many requests queued to be written by
  93  * the raid10 thread, we become 'congested' to provide back-pressure
  94  * for writeback.
  95  */
  96 static int max_queued_requests = 1024;
  97
  98 static void allow_barrier(struct r10conf *conf);
  99 static void lower_barrier(struct r10conf *conf);
 100 static int enough(struct r10conf *conf, int ignore);
 101 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 102                                 int *skipped);
 103 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
 104 static void end_reshape_write(struct bio *bio, int error);
 105 static void end_reshape(struct r10conf *conf);
 106
 107 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 108 {
 109         struct r10conf *conf = data;
 110         int size = offsetof(struct r10bio, devs[conf->copies]);
 111
 112         /* allocate a r10bio with room for raid_disks entries in the
 113          * bios array */
 114         return kzalloc(size, gfp_flags);
 115 }
 116
 117 static void r10bio_pool_free(void *r10_bio, void *data)
 118 {
 119         kfree(r10_bio);
 120 }
 121
 122 /* Maximum size of each resync request */
 123 #define RESYNC_BLOCK_SIZE (64*1024)
 124 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 125 /* amount of memory to reserve for resync requests */
 126 #define RESYNC_WINDOW (1024*1024)
 127 /* maximum number of concurrent requests, memory permitting */
 128 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
 129
 130 /*
 131  * When performing a resync, we need to read and compare, so
 132  * we need as many pages are there are copies.
 133  * When performing a recovery, we need 2 bios, one for read,
 134  * one for write (we recover only one drive per r10buf)
 135  *
 136  */
 137 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 138 {
 139         struct r10conf *conf = data;
 140         struct page *page;
 141         struct r10bio *r10_bio;
 142         struct bio *bio;
 143         int i, j;
 144         int nalloc;
 145
 146         r10_bio = r10bio_pool_alloc(gfp_flags, conf);
 147         if (!r10_bio)
 148                 return NULL;
 149
 150         if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
 151             test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
 152                 nalloc = conf->copies; /* resync */
 153         else
 154                 nalloc = 2; /* recovery */
 155
 156         /*
 157          * Allocate bios.
 158          */
 159         for (j = nalloc ; j-- ; ) {
 160                 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
 161                 if (!bio)
 162                         goto out_free_bio;
 163                 r10_bio->devs[j].bio = bio;
 164                 if (!conf->have_replacement)
 165                         continue;
 166                 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
 167                 if (!bio)
 168                         goto out_free_bio;
 169                 r10_bio->devs[j].repl_bio = bio;
 170         }
 171         /*
 172          * Allocate RESYNC_PAGES data pages and attach them
 173          * where needed.
 174          */
 175         for (j = 0 ; j < nalloc; j++) {
 176                 struct bio *rbio = r10_bio->devs[j].repl_bio;
 177                 bio = r10_bio->devs[j].bio;
 178                 for (i = 0; i < RESYNC_PAGES; i++) {
 179                         if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
 180                                                &conf->mddev->recovery)) {
 181                                 /* we can share bv_page's during recovery
 182                                  * and reshape */
 183                                 struct bio *rbio = r10_bio->devs[0].bio;
 184                                 page = rbio->bi_io_vec[i].bv_page;
 185                                 get_page(page);
 186                         } else
 187                                 page = alloc_page(gfp_flags);
 188                         if (unlikely(!page))
 189                                 goto out_free_pages;
 190
 191                         bio->bi_io_vec[i].bv_page = page;
 192                         if (rbio)
 193                                 rbio->bi_io_vec[i].bv_page = page;
 194                 }
 195         }
 196
 197         return r10_bio;
 198
 199 out_free_pages:
 200         for ( ; i > 0 ; i--)
 201                 safe_put_page(bio->bi_io_vec[i-1].bv_page);
 202         while (j--)
 203                 for (i = 0; i < RESYNC_PAGES ; i++)
 204                         safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 205         j = 0;
 206 out_free_bio:
 207         for ( ; j < nalloc; j++) {
 208                 if (r10_bio->devs[j].bio)
 209                         bio_put(r10_bio->devs[j].bio);
 210                 if (r10_bio->devs[j].repl_bio)
 211                         bio_put(r10_bio->devs[j].repl_bio);
 212         }
 213         r10bio_pool_free(r10_bio, conf);
 214         return NULL;
 215 }
 216
 217 static void r10buf_pool_free(void *__r10_bio, void *data)
 218 {
 219         int i;
 220         struct r10conf *conf = data;
 221         struct r10bio *r10bio = __r10_bio;
 222         int j;
 223
 224         for (j=0; j < conf->copies; j++) {
 225                 struct bio *bio = r10bio->devs[j].bio;
 226                 if (bio) {
 227                         for (i = 0; i < RESYNC_PAGES; i++) {
 228                                 safe_put_page(bio->bi_io_vec[i].bv_page);
 229                                 bio->bi_io_vec[i].bv_page = NULL;
 230                         }
 231                         bio_put(bio);
 232                 }
 233                 bio = r10bio->devs[j].repl_bio;
 234                 if (bio)
 235                         bio_put(bio);
 236         }
 237         r10bio_pool_free(r10bio, conf);
 238 }
 239
 240 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 241 {
 242         int i;
 243
 244         for (i = 0; i < conf->copies; i++) {
 245                 struct bio **bio = & r10_bio->devs[i].bio;
 246                 if (!BIO_SPECIAL(*bio))
 247                         bio_put(*bio);
 248                 *bio = NULL;
 249                 bio = &r10_bio->devs[i].repl_bio;
 250                 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
 251                         bio_put(*bio);
 252                 *bio = NULL;
 253         }
 254 }
 255
 256 static void free_r10bio(struct r10bio *r10_bio)
 257 {
 258         struct r10conf *conf = r10_bio->mddev->private;
 259
 260         put_all_bios(conf, r10_bio);
 261         mempool_free(r10_bio, conf->r10bio_pool);
 262 }
 263
 264 static void put_buf(struct r10bio *r10_bio)
 265 {
 266         struct r10conf *conf = r10_bio->mddev->private;
 267
 268         mempool_free(r10_bio, conf->r10buf_pool);
 269
 270         lower_barrier(conf);
 271 }
 272
 273 static void reschedule_retry(struct r10bio *r10_bio)
 274 {
 275         unsigned long flags;
 276         struct mddev *mddev = r10_bio->mddev;
 277         struct r10conf *conf = mddev->private;
 278
 279         spin_lock_irqsave(&conf->device_lock, flags);
 280         list_add(&r10_bio->retry_list, &conf->retry_list);
 281         conf->nr_queued ++;
 282         spin_unlock_irqrestore(&conf->device_lock, flags);
 283
 284         /* wake up frozen array... */
 285         wake_up(&conf->wait_barrier);
 286
 287         md_wakeup_thread(mddev->thread);
 288 }
 289
 290 /*
 291  * raid_end_bio_io() is called when we have finished servicing a mirrored
 292  * operation and are ready to return a success/failure code to the buffer
 293  * cache layer.
 294  */
 295 static void raid_end_bio_io(struct r10bio *r10_bio)
 296 {
 297         struct bio *bio = r10_bio->master_bio;
 298         int done;
 299         struct r10conf *conf = r10_bio->mddev->private;
 300
 301         if (bio->bi_phys_segments) {
 302                 unsigned long flags;
 303                 spin_lock_irqsave(&conf->device_lock, flags);
 304                 bio->bi_phys_segments--;
 305                 done = (bio->bi_phys_segments == 0);
 306                 spin_unlock_irqrestore(&conf->device_lock, flags);
 307         } else
 308                 done = 1;
 309         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 310                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
 311         if (done) {
 312                 bio_endio(bio, 0);
 313                 /*
 314                  * Wake up any possible resync thread that waits for the device
 315                  * to go idle.
 316                  */
 317                 allow_barrier(conf);
 318         }
 319         free_r10bio(r10_bio);
 320 }
 321
 322 /*
 323  * Update disk head position estimator based on IRQ completion info.
 324  */
 325 static inline void update_head_pos(int slot, struct r10bio *r10_bio)
 326 {
 327         struct r10conf *conf = r10_bio->mddev->private;
 328
 329         conf->mirrors[r10_bio->devs[slot].devnum].head_position =
 330                 r10_bio->devs[slot].addr + (r10_bio->sectors);
 331 }
 332
 333 /*
 334  * Find the disk number which triggered given bio
 335  */
 336 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 337                          struct bio *bio, int *slotp, int *replp)
 338 {
 339         int slot;
 340         int repl = 0;
 341
 342         for (slot = 0; slot < conf->copies; slot++) {
 343                 if (r10_bio->devs[slot].bio == bio)
 344                         break;
 345                 if (r10_bio->devs[slot].repl_bio == bio) {
 346                         repl = 1;
 347                         break;
 348                 }
 349         }
 350
 351         BUG_ON(slot == conf->copies);
 352         update_head_pos(slot, r10_bio);
 353
 354         if (slotp)
 355                 *slotp = slot;
 356         if (replp)
 357                 *replp = repl;
 358         return r10_bio->devs[slot].devnum;
 359 }
 360
 361 static void raid10_end_read_request(struct bio *bio, int error)
 362 {
 363         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 364         struct r10bio *r10_bio = bio->bi_private;
 365         int slot, dev;
 366         struct md_rdev *rdev;
 367         struct r10conf *conf = r10_bio->mddev->private;
 368
 369
 370         slot = r10_bio->read_slot;
 371         dev = r10_bio->devs[slot].devnum;
 372         rdev = r10_bio->devs[slot].rdev;
 373         /*
 374          * this branch is our 'one mirror IO has finished' event handler:
 375          */
 376         update_head_pos(slot, r10_bio);
 377
 378         if (uptodate) {
 379                 /*
 380                  * Set R10BIO_Uptodate in our master bio, so that
 381                  * we will return a good error code to the higher
 382                  * levels even if IO on some other mirrored buffer fails.
 383                  *
 384                  * The 'master' represents the composite IO operation to
 385                  * user-side. So if something waits for IO, then it will
 386                  * wait for the 'master' bio.
 387                  */
 388                 set_bit(R10BIO_Uptodate, &r10_bio->state);
 389         } else {
 390                 /* If all other devices that store this block have
 391                  * failed, we want to return the error upwards rather
 392                  * than fail the last device.  Here we redefine
 393                  * "uptodate" to mean "Don't want to retry"
 394                  */
 395                 unsigned long flags;
 396                 spin_lock_irqsave(&conf->device_lock, flags);
 397                 if (!enough(conf, rdev->raid_disk))
 398                         uptodate = 1;
 399                 spin_unlock_irqrestore(&conf->device_lock, flags);
 400         }
 401         if (uptodate) {
 402                 raid_end_bio_io(r10_bio);
 403                 rdev_dec_pending(rdev, conf->mddev);
 404         } else {
 405                 /*
 406                  * oops, read error - keep the refcount on the rdev
 407                  */
 408                 char b[BDEVNAME_SIZE];
 409                 printk_ratelimited(KERN_ERR
 410                                    "md/raid10:%s: %s: rescheduling sector %llu\n",
 411                                    mdname(conf->mddev),
 412                                    bdevname(rdev->bdev, b),
 413                                    (unsigned long long)r10_bio->sector);
 414                 set_bit(R10BIO_ReadError, &r10_bio->state);
 415                 reschedule_retry(r10_bio);
 416         }
 417 }
 418
 419 static void close_write(struct r10bio *r10_bio)
 420 {
 421         /* clear the bitmap if all writes complete successfully */
 422         bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
 423                         r10_bio->sectors,
 424                         !test_bit(R10BIO_Degraded, &r10_bio->state),
 425                         0);
 426         md_write_end(r10_bio->mddev);
 427 }
 428
 429 static void one_write_done(struct r10bio *r10_bio)
 430 {
 431         if (atomic_dec_and_test(&r10_bio->remaining)) {
 432                 if (test_bit(R10BIO_WriteError, &r10_bio->state))
 433                         reschedule_retry(r10_bio);
 434                 else {
 435                         close_write(r10_bio);
 436                         if (test_bit(R10BIO_MadeGood, &r10_bio->state))
 437                                 reschedule_retry(r10_bio);
 438                         else
 439                                 raid_end_bio_io(r10_bio);
 440                 }
 441         }
 442 }
 443
 444 static void raid10_end_write_request(struct bio *bio, int error)
 445 {
 446         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 447         struct r10bio *r10_bio = bio->bi_private;
 448         int dev;
 449         int dec_rdev = 1;
 450         struct r10conf *conf = r10_bio->mddev->private;
 451         int slot, repl;
 452         struct md_rdev *rdev = NULL;
 453
 454         dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 455
 456         if (repl)
 457                 rdev = conf->mirrors[dev].replacement;
 458         if (!rdev) {
 459                 smp_rmb();
 460                 repl = 0;
 461                 rdev = conf->mirrors[dev].rdev;
 462         }
 463         /*
 464          * this branch is our 'one mirror IO has finished' event handler:
 465          */
 466         if (!uptodate) {
 467                 if (repl)
 468                         /* Never record new bad blocks to replacement,
 469                          * just fail it.
 470                          */
 471                         md_error(rdev->mddev, rdev);
 472                 else {
 473                         set_bit(WriteErrorSeen, &rdev->flags);
 474                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
 475                                 set_bit(MD_RECOVERY_NEEDED,
 476                                         &rdev->mddev->recovery);
 477                         set_bit(R10BIO_WriteError, &r10_bio->state);
 478                         dec_rdev = 0;
 479                 }
 480         } else {
 481                 /*
 482                  * Set R10BIO_Uptodate in our master bio, so that
 483                  * we will return a good error code for to the higher
 484                  * levels even if IO on some other mirrored buffer fails.
 485                  *
 486                  * The 'master' represents the composite IO operation to
 487                  * user-side. So if something waits for IO, then it will
 488                  * wait for the 'master' bio.
 489                  */
 490                 sector_t first_bad;
 491                 int bad_sectors;
 492
 493                 set_bit(R10BIO_Uptodate, &r10_bio->state);
 494
 495                 /* Maybe we can clear some bad blocks. */
 496                 if (is_badblock(rdev,
 497                                 r10_bio->devs[slot].addr,
 498                                 r10_bio->sectors,
 499                                 &first_bad, &bad_sectors)) {
 500                         bio_put(bio);
 501                         if (repl)
 502                                 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
 503                         else
 504                                 r10_bio->devs[slot].bio = IO_MADE_GOOD;
 505                         dec_rdev = 0;
 506                         set_bit(R10BIO_MadeGood, &r10_bio->state);
 507                 }
 508         }
 509
 510         /*
 511          *
 512          * Let's see if all mirrored write operations have finished
 513          * already.
 514          */
 515         one_write_done(r10_bio);
 516         if (dec_rdev)
 517                 rdev_dec_pending(rdev, conf->mddev);
 518 }
 519
 520 /*
 521  * RAID10 layout manager
 522  * As well as the chunksize and raid_disks count, there are two
 523  * parameters: near_copies and far_copies.
 524  * near_copies * far_copies must be <= raid_disks.
 525  * Normally one of these will be 1.
 526  * If both are 1, we get raid0.
 527  * If near_copies == raid_disks, we get raid1.
 528  *
 529  * Chunks are laid out in raid0 style with near_copies copies of the
 530  * first chunk, followed by near_copies copies of the next chunk and
 531  * so on.
 532  * If far_copies > 1, then after 1/far_copies of the array has been assigned
 533  * as described above, we start again with a device offset of near_copies.
 534  * So we effectively have another copy of the whole array further down all
 535  * the drives, but with blocks on different drives.
 * With this layout, no block is ever stored twice on the same device.
 537  *
 538  * raid10_find_phys finds the sector offset of a given virtual sector
 539  * on each device that it is on.
 540  *
 541  * raid10_find_virt does the reverse mapping, from a device and a
 542  * sector offset to a virtual address
 543  */
 544
 545 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 546 {
 547         int n,f;
 548         sector_t sector;
 549         sector_t chunk;
 550         sector_t stripe;
 551         int dev;
 552         int slot = 0;
 553         int last_far_set_start, last_far_set_size;
 554
 555         last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
 556         last_far_set_start *= geo->far_set_size;
 557
 558         last_far_set_size = geo->far_set_size;
 559         last_far_set_size += (geo->raid_disks % geo->far_set_size);
 560
 561         /* now calculate first sector/dev */
 562         chunk = r10bio->sector >> geo->chunk_shift;
 563         sector = r10bio->sector & geo->chunk_mask;
 564
 565         chunk *= geo->near_copies;
 566         stripe = chunk;
 567         dev = sector_div(stripe, geo->raid_disks);
 568         if (geo->far_offset)
 569                 stripe *= geo->far_copies;
 570
 571         sector += stripe << geo->chunk_shift;
 572
 573         /* and calculate all the others */
 574         for (n = 0; n < geo->near_copies; n++) {
 575                 int d = dev;
 576                 int set;
 577                 sector_t s = sector;
 578                 r10bio->devs[slot].devnum = d;
 579                 r10bio->devs[slot].addr = s;
 580                 slot++;
 581
 582                 for (f = 1; f < geo->far_copies; f++) {
 583                         set = d / geo->far_set_size;
 584                         d += geo->near_copies;
 585
 586                         if ((geo->raid_disks % geo->far_set_size) &&
 587                             (d > last_far_set_start)) {
 588                                 d -= last_far_set_start;
 589                                 d %= last_far_set_size;
 590                                 d += last_far_set_start;
 591                         } else {
 592                                 d %= geo->far_set_size;
 593                                 d += geo->far_set_size * set;
 594                         }
 595                         s += geo->stride;
 596                         r10bio->devs[slot].devnum = d;
 597                         r10bio->devs[slot].addr = s;
 598                         slot++;
 599                 }
 600                 dev++;
 601                 if (dev >= geo->raid_disks) {
 602                         dev = 0;
 603                         sector += (geo->chunk_mask + 1);
 604                 }
 605         }
 606 }
 607
 608 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
 609 {
 610         struct geom *geo = &conf->geo;
 611
 612         if (conf->reshape_progress != MaxSector &&
 613             ((r10bio->sector >= conf->reshape_progress) !=
 614              conf->mddev->reshape_backwards)) {
 615                 set_bit(R10BIO_Previous, &r10bio->state);
 616                 geo = &conf->prev;
 617         } else
 618                 clear_bit(R10BIO_Previous, &r10bio->state);
 619
 620         __raid10_find_phys(geo, r10bio);
 621 }
 622
 623 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 624 {
 625         sector_t offset, chunk, vchunk;
 626         /* Never use conf->prev as this is only called during resync
 627          * or recovery, so reshape isn't happening
 628          */
 629         struct geom *geo = &conf->geo;
 630         int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
 631         int far_set_size = geo->far_set_size;
 632         int last_far_set_start;
 633
 634         if (geo->raid_disks % geo->far_set_size) {
 635                 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
 636                 last_far_set_start *= geo->far_set_size;
 637
 638                 if (dev >= last_far_set_start) {
 639                         far_set_size = geo->far_set_size;
 640                         far_set_size += (geo->raid_disks % geo->far_set_size);
 641                         far_set_start = last_far_set_start;
 642                 }
 643         }
 644
 645         offset = sector & geo->chunk_mask;
 646         if (geo->far_offset) {
 647                 int fc;
 648                 chunk = sector >> geo->chunk_shift;
 649                 fc = sector_div(chunk, geo->far_copies);
 650                 dev -= fc * geo->near_copies;
 651                 if (dev < far_set_start)
 652                         dev += far_set_size;
 653         } else {
 654                 while (sector >= geo->stride) {
 655                         sector -= geo->stride;
 656                         if (dev < (geo->near_copies + far_set_start))
 657                                 dev += far_set_size - geo->near_copies;
 658                         else
 659                                 dev -= geo->near_copies;
 660                 }
 661                 chunk = sector >> geo->chunk_shift;
 662         }
 663         vchunk = chunk * geo->raid_disks + dev;
 664         sector_div(vchunk, geo->near_copies);
 665         return (vchunk << geo->chunk_shift) + offset;
 666 }
 667
 668 /**
 669  *      raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
 670  *      @q: request queue
 671  *      @bvm: properties of new bio
 672  *      @biovec: the request that could be merged to it.
 673  *
 674  *      Return amount of bytes we can accept at this offset
 675  *      This requires checking for end-of-chunk if near_copies != raid_disks,
 676  *      and for subordinate merge_bvec_fns if merge_check_needed.
 677  */
 678 static int raid10_mergeable_bvec(struct request_queue *q,
 679                                  struct bvec_merge_data *bvm,
 680                                  struct bio_vec *biovec)
 681 {
 682         struct mddev *mddev = q->queuedata;
 683         struct r10conf *conf = mddev->private;
 684         sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 685         int max;
 686         unsigned int chunk_sectors;
 687         unsigned int bio_sectors = bvm->bi_size >> 9;
 688         struct geom *geo = &conf->geo;
 689
 690         chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
 691         if (conf->reshape_progress != MaxSector &&
 692             ((sector >= conf->reshape_progress) !=
 693              conf->mddev->reshape_backwards))
 694                 geo = &conf->prev;
 695
 696         if (geo->near_copies < geo->raid_disks) {
 697                 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
 698                                         + bio_sectors)) << 9;
 699                 if (max < 0)
 700                         /* bio_add cannot handle a negative return */
 701                         max = 0;
 702                 if (max <= biovec->bv_len && bio_sectors == 0)
 703                         return biovec->bv_len;
 704         } else
 705                 max = biovec->bv_len;
 706
 707         if (mddev->merge_check_needed) {
 708                 struct {
 709                         struct r10bio r10_bio;
 710                         struct r10dev devs[conf->copies];
 711                 } on_stack;
 712                 struct r10bio *r10_bio = &on_stack.r10_bio;
 713                 int s;
 714                 if (conf->reshape_progress != MaxSector) {
 715                         /* Cannot give any guidance during reshape */
 716                         if (max <= biovec->bv_len && bio_sectors == 0)
 717                                 return biovec->bv_len;
 718                         return 0;
 719                 }
 720                 r10_bio->sector = sector;
 721                 raid10_find_phys(conf, r10_bio);
 722                 rcu_read_lock();
 723                 for (s = 0; s < conf->copies; s++) {
 724                         int disk = r10_bio->devs[s].devnum;
 725                         struct md_rdev *rdev = rcu_dereference(
 726                                 conf->mirrors[disk].rdev);
 727                         if (rdev && !test_bit(Faulty, &rdev->flags)) {
 728                                 struct request_queue *q =
 729                                         bdev_get_queue(rdev->bdev);
 730                                 if (q->merge_bvec_fn) {
 731                                         bvm->bi_sector = r10_bio->devs[s].addr
 732                                                 + rdev->data_offset;
 733                                         bvm->bi_bdev = rdev->bdev;
 734                                         max = min(max, q->merge_bvec_fn(
 735                                                           q, bvm, biovec));
 736                                 }
 737                         }
 738                         rdev = rcu_dereference(conf->mirrors[disk].replacement);
 739                         if (rdev && !test_bit(Faulty, &rdev->flags)) {
 740                                 struct request_queue *q =
 741                                         bdev_get_queue(rdev->bdev);
 742                                 if (q->merge_bvec_fn) {
 743                                         bvm->bi_sector = r10_bio->devs[s].addr
 744                                                 + rdev->data_offset;
 745                                         bvm->bi_bdev = rdev->bdev;
 746                                         max = min(max, q->merge_bvec_fn(
 747                                                           q, bvm, biovec));
 748                                 }
 749                         }
 750                 }
 751                 rcu_read_unlock();
 752         }
 753         return max;
 754 }
 755
 756 /*
 757  * This routine returns the disk from which the requested read should
 758  * be done. There is a per-array 'next expected sequential IO' sector
 759  * number - if this matches on the next IO then we use the last disk.
 760  * There is also a per-disk 'last known head position' sector that is
 761  * maintained from IRQ contexts, both the normal and the resync IO
 762  * completion handlers update this position correctly. If there is no
 763  * perfect sequential match then we pick the disk whose head is closest.
 764  *
 765  * If there are 2 mirrors in the same 2 devices, performance degrades
 766  * because position is mirror, not device based.
 767  *
 768  * The rdev for the device selected will have nr_pending incremented.
 769  */
 770
 771 /*
 772  * FIXME: possibly should rethink readbalancing and do it differently
 773  * depending on near_copies / far_copies geometry.
 774  */
 775 static struct md_rdev *read_balance(struct r10conf *conf,
 776                                     struct r10bio *r10_bio,
 777                                     int *max_sectors)
 778 {
             /* Pick the device that should service the read described by r10_bio.
              *
              * @conf:        array configuration (geometry, mirrors, resync state)
              * @r10_bio:     the read request; its devs[] entries are examined
              *               after raid10_find_phys() has been called on it, and
              *               read_slot is set to the chosen slot on success
              * @max_sectors: out parameter, receives best_good_sectors, i.e. how
              *               many sectors the selection found readable
              *
              * Returns the chosen rdev with its nr_pending already incremented,
              * or NULL when no slot qualified.
              */
 779         const sector_t this_sector = r10_bio->sector;
 780         int disk, slot;
 781         int sectors = r10_bio->sectors;
 782         int best_good_sectors;
 783         sector_t new_distance, best_dist;
 784         struct md_rdev *best_rdev, *rdev = NULL;
 785         int do_balance;
 786         int best_slot;
 787         struct geom *geo = &conf->geo;
 788
 789         raid10_find_phys(conf, r10_bio);
 790         rcu_read_lock();
 791 retry:
             /* All selection state is reset on every pass: we jump back here
              * when the chosen device turned Faulty before nr_pending was raised.
              */
 792         sectors = r10_bio->sectors;
 793         best_slot = -1;
 794         best_rdev = NULL;
 795         best_dist = MaxSector;
 796         best_good_sectors = 0;
 797         do_balance = 1;
 798         /*
 799          * Check if we can balance. We can balance on the whole
 800          * device if no resync is going on (recovery is ok), or below
 801          * the resync window. We take the first readable disk when
 802          * above the resync window.
 803          */
 804         if (conf->mddev->recovery_cp < MaxSector
 805             && (this_sector + sectors >= conf->next_resync))
 806                 do_balance = 0;
 807
 808         for (slot = 0; slot < conf->copies ; slot++) {
 809                 sector_t first_bad;
 810                 int bad_sectors;
 811                 sector_t dev_sector;
 812
                     /* Slots flagged IO_BLOCKED are never considered. */
 813                 if (r10_bio->devs[slot].bio == IO_BLOCKED)
 814                         continue;
 815                 disk = r10_bio->devs[slot].devnum;
                     /* Prefer the replacement device; fall back to the original
                      * rdev when the replacement is absent, Faulty, Unmerged, or
                      * not yet recovered past the end of this read.
                      */
 816                 rdev = rcu_dereference(conf->mirrors[disk].replacement);
 817                 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
 818                     test_bit(Unmerged, &rdev->flags) ||
 819                     r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 820                         rdev = rcu_dereference(conf->mirrors[disk].rdev);
 821                 if (rdev == NULL ||
 822                     test_bit(Faulty, &rdev->flags) ||
 823                     test_bit(Unmerged, &rdev->flags))
 824                         continue;
                     /* A device that is not In_sync can only serve reads that
                      * end at or below its recovery_offset.
                      */
 825                 if (!test_bit(In_sync, &rdev->flags) &&
 826                     r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 827                         continue;
 828
 829                 dev_sector = r10_bio->devs[slot].addr;
 830                 if (is_badblock(rdev, dev_sector, sectors,
 831                                 &first_bad, &bad_sectors)) {
 832                         if (best_dist < MaxSector)
 833                                 /* Already have a better slot */
 834                                 continue;
 835                         if (first_bad <= dev_sector) {
 836                                 /* Cannot read here.  If this is the
 837                                  * 'primary' device, then we must not read
 838                                  * beyond 'bad_sectors' from another device.
 839                                  */
 840                                 bad_sectors -= (dev_sector - first_bad);
 841                                 if (!do_balance && sectors > bad_sectors)
 842                                         sectors = bad_sectors;
 843                                 if (best_good_sectors > sectors)
 844                                         best_good_sectors = sectors;
 845                         } else {
                                     /* The range starts good and turns bad later:
                                      * remember this slot if it offers the longest
                                      * readable prefix seen so far.
                                      */
 846                                 sector_t good_sectors =
 847                                         first_bad - dev_sector;
 848                                 if (good_sectors > best_good_sectors) {
 849                                         best_good_sectors = good_sectors;
 850                                         best_slot = slot;
 851                                         best_rdev = rdev;
 852                                 }
 853                                 if (!do_balance)
 854                                         /* Must read from here */
 855                                         break;
 856                         }
 857                         continue;
 858                 } else
 859                         best_good_sectors = sectors;
 860
                     /* Not balancing: the first fully readable device wins. */
 861                 if (!do_balance)
 862                         break;
 863
 864                 /* This optimisation is debatable, and completely destroys
 865                  * sequential read speed for 'far copies' arrays.  So only
 866                  * keep it for 'near' arrays, and review those later.
 867                  */
 868                 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 869                         break;
 870
 871                 /* for far > 1 always use the lowest address */
 872                 if (geo->far_copies > 1)
 873                         new_distance = r10_bio->devs[slot].addr;
 874                 else
 875                         new_distance = abs(r10_bio->devs[slot].addr -
 876                                            conf->mirrors[disk].head_position);
 877                 if (new_distance < best_dist) {
 878                         best_dist = new_distance;
 879                         best_slot = slot;
 880                         best_rdev = rdev;
 881                 }
 882         }
             /* If the loop ran off the end (no 'break'), use the best candidate
              * recorded, which may be none (best_slot == -1).  After a 'break',
              * slot and rdev already name the device to use.
              */
 883         if (slot >= conf->copies) {
 884                 slot = best_slot;
 885                 rdev = best_rdev;
 886         }
 887
 888         if (slot >= 0) {
 889                 atomic_inc(&rdev->nr_pending);
 890                 if (test_bit(Faulty, &rdev->flags)) {
 891                         /* Cannot risk returning a device that failed
 892                          * before we inc'ed nr_pending
 893                          */
 894                         rdev_dec_pending(rdev, conf->mddev);
 895                         goto retry;
 896                 }
 897                 r10_bio->read_slot = slot;
 898         } else
 899                 rdev = NULL;
 900         rcu_read_unlock();
 901         *max_sectors = best_good_sectors;
 902
 903         return rdev;
 904 }
 905
 906 int md_raid10_congested(struct mddev *mddev, int bits)
 907 {
 908         struct r10conf *conf = mddev->private;
 909         struct md_rdev *rdev;
 910         int d, congested = 0;
 911
 912         /* A full queue of pending writes is async congestion by itself. */
 913         if ((bits & (1 << BDI_async_congested)) &&
 914             conf->pending_count >= max_queued_requests)
 915                 return 1;
 916
 917         rcu_read_lock();
 918         for (d = 0; !congested; d++) {  /* stop at first congested member */
 919                 if (d >= conf->geo.raid_disks && d >= conf->prev.raid_disks)
 920                         break;
 921                 rdev = rcu_dereference(conf->mirrors[d].rdev);
 922                 if (!rdev || test_bit(Faulty, &rdev->flags))
 923                         continue;
 924                 congested |= bdi_congested(
 925                         &bdev_get_queue(rdev->bdev)->backing_dev_info, bits);
 926         }
 927         rcu_read_unlock();
 928         return congested;
 929 }
 930 EXPORT_SYMBOL_GPL(md_raid10_congested);
 931
 932 static int raid10_congested(void *data, int bits)
 933 {
 934         struct mddev *md = data;
 935
 936         /* the md-level check comes first; members are asked only if it is 0 */
 937         return mddev_congested(md, bits) ? 1 : !!md_raid10_congested(md, bits);
 938 }
 939
 940 static void flush_pending_writes(struct r10conf *conf)
 941 {
 942         /* Any writes that have been queued but are awaiting
 943          * bitmap updates get flushed here.
 944          */
 945         spin_lock_irq(&conf->device_lock);
 946
 947         if (conf->pending_bio_list.head) {
 948                 struct bio *bio;
                     /* Take the queued bios and reset pending_count while
                      * device_lock is held; everything below runs unlocked.
                      */
 949                 bio = bio_list_get(&conf->pending_bio_list);
 950                 conf->pending_count = 0;
 951                 spin_unlock_irq(&conf->device_lock);
 952                 /* flush any pending bitmap writes to disk
 953                  * before proceeding w/ I/O */
 954                 bitmap_unplug(conf->mddev->bitmap);
                     /* pending_count changed: make_request sleeps on
                      * wait_barrier until it drops below max_queued_requests.
                      */
 955                 wake_up(&conf->wait_barrier);
 956
 957                 while (bio) { /* submit pending writes */
 958                         struct bio *next = bio->bi_next;
 959                         bio->bi_next = NULL;
                             /* A discard aimed at a member whose queue cannot
                              * discard is completed with status 0, not submitted.
                              */
 960                         if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 961                             !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 962                                 /* Just ignore it */
 963                                 bio_endio(bio, 0);
 964                         else
 965                                 generic_make_request(bio);
 966                         bio = next;
 967                 }
 968         } else
 969                 spin_unlock_irq(&conf->device_lock);
 970 }
 971
 972 /* Barriers....
 973  * Sometimes we need to suspend IO while we do something else,
 974  * either some resync/recovery, or reconfigure the array.
 975  * To do this we raise a 'barrier'.
 976  * The 'barrier' is a counter that can be raised multiple times
 977  * to count how many activities are happening which preclude
 978  * normal IO.
 979  * We can only raise the barrier if there is no pending IO.
 980  * i.e. if nr_pending == 0.
 981  * We choose only to raise the barrier if no-one is waiting for the
 982  * barrier to go down.  This means that as soon as an IO request
 983  * is ready, no other operations which require a barrier will start
 984  * until the IO request has had a chance.
 985  *
 986  * So: regular IO calls 'wait_barrier'.  When that returns there
 987  *    is no background IO happening.  It must arrange to call
 988  *    allow_barrier when it has finished its IO.
 989  * background IO must call raise_barrier.  Once that returns
 990  *    there is no normal IO happening.  It must arrange to call
 991  *    lower_barrier when the particular background IO completes.
 992  */
 993
 994 static void raise_barrier(struct r10conf *conf, int force)
 995 {
             /* Raise the barrier by one level and wait until no normal IO is
              * pending.
              *
              * @force: when non-zero, do not wait for nr_waiting to reach zero
              *         first.  Only valid while a barrier is already raised,
              *         which the BUG_ON below enforces.
              *
              * Sleeps; takes and releases conf->resync_lock.
              */
 996         BUG_ON(force && !conf->barrier);
 997         spin_lock_irq(&conf->resync_lock);
 998
 999         /* Wait until no block IO is waiting (unless 'force') */
1000         wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
1001                             conf->resync_lock);
1002
1003         /* block any new IO from starting */
1004         conf->barrier++;
1005
1006         /* Now wait for all pending IO to complete */
             /* ... and for the barrier count to be below RESYNC_DEPTH */
1007         wait_event_lock_irq(conf->wait_barrier,
1008                             !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
1009                             conf->resync_lock);
1010
1011         spin_unlock_irq(&conf->resync_lock);
1012 }
1013
1014 static void lower_barrier(struct r10conf *conf)
1015 {
             /* Drop one barrier level (the counterpart of the barrier++ in
              * raise_barrier) and wake everything sleeping on wait_barrier so
              * the waiters re-evaluate their conditions.
              * NOTE(review): the irqsave lock variant suggests this may be
              * called with interrupts already disabled - confirm at call sites.
              */
1016         unsigned long flags;
1017         spin_lock_irqsave(&conf->resync_lock, flags);
1018         conf->barrier--;
1019         spin_unlock_irqrestore(&conf->resync_lock, flags);
1020         wake_up(&conf->wait_barrier);
1021 }
1022
1023 static void wait_barrier(struct r10conf *conf)
1024 {
             /* Entry gate for normal IO: sleep while a barrier is raised, then
              * count this request in nr_pending.  The caller must balance this
              * with allow_barrier(), which decrements nr_pending again.
              */
1025         spin_lock_irq(&conf->resync_lock);
1026         if (conf->barrier) {
                     /* nr_waiting is what raise_barrier(force == 0) waits on */
1027                 conf->nr_waiting++;
1028                 /* Wait for the barrier to drop.
1029                  * However if there are already pending
1030                  * requests (preventing the barrier from
1031                  * rising completely), and the
1032                  * pre-process bio queue isn't empty,
1033                  * then don't wait, as we need to empty
1034                  * that queue to get the nr_pending
1035                  * count down.
1036                  */
1037                 wait_event_lock_irq(conf->wait_barrier,
1038                                     !conf->barrier ||
1039                                     (conf->nr_pending &&
1040                                      current->bio_list &&
1041                                      !bio_list_empty(current->bio_list)),
1042                                     conf->resync_lock);
1043                 conf->nr_waiting--;
1044         }
1045         conf->nr_pending++;
1046         spin_unlock_irq(&conf->resync_lock);
1047 }
1048
1049 static void allow_barrier(struct r10conf *conf)
1050 {
             /* Counterpart of wait_barrier(): this request no longer counts as
              * pending.  Waiters on wait_barrier are woken because
              * raise_barrier() and freeze_array() both sleep on nr_pending.
              */
1051         unsigned long flags;
1052         spin_lock_irqsave(&conf->resync_lock, flags);
1053         conf->nr_pending--;
1054         spin_unlock_irqrestore(&conf->resync_lock, flags);
1055         wake_up(&conf->wait_barrier);
1056 }
1057
1058 static void freeze_array(struct r10conf *conf)
1059 {
1060         /* stop syncio and normal IO and wait for everything to
1061          * go quiet.
1062          * We increment barrier and nr_waiting, and then
1063          * wait until nr_pending match nr_queued+1
1064          * This is called in the context of one normal IO request
1065          * that has failed. Thus any sync request that might be pending
1066          * will be blocked by nr_pending, and we need to wait for
1067          * pending IO requests to complete or be queued for re-try.
1068          * Thus the number queued (nr_queued) plus this request (1)
1069          * must match the number of pending IOs (nr_pending) before
1070          * we continue.
1071          */
             /* The barrier and nr_waiting increments made here stay in place
              * on return; unfreeze_array() undoes both.
              */
1072         spin_lock_irq(&conf->resync_lock);
1073         conf->barrier++;
1074         conf->nr_waiting++;
             /* NOTE(review): flush_pending_writes(conf) is passed as the 'cmd'
              * argument of wait_event_lock_irq_cmd - presumably it is run with
              * resync_lock released before each sleep, so queued writes keep
              * draining while we wait; confirm against the macro definition.
              */
1075         wait_event_lock_irq_cmd(conf->wait_barrier,
1076                                 conf->nr_pending == conf->nr_queued+1,
1077                                 conf->resync_lock,
1078                                 flush_pending_writes(conf));
1079
1080         spin_unlock_irq(&conf->resync_lock);
1081 }
1082
1083 static void unfreeze_array(struct r10conf *conf)
1084 {
1085         /* reverse the effect of the freeze */
             /* i.e. undo the barrier++ and nr_waiting++ done by freeze_array()
              * and wake the waiters; here the wake_up is issued while
              * resync_lock is still held.
              */
1086         spin_lock_irq(&conf->resync_lock);
1087         conf->barrier--;
1088         conf->nr_waiting--;
1089         wake_up(&conf->wait_barrier);
1090         spin_unlock_irq(&conf->resync_lock);
1091 }
1092
1093 static sector_t choose_data_offset(struct r10bio *r10_bio,
1094                                    struct md_rdev *rdev)
1095 {
1096         /* new_data_offset: reshape flagged and request not R10BIO_Previous */
1097         if (test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) &&
1098             !test_bit(R10BIO_Previous, &r10_bio->state))
1099                 return rdev->new_data_offset;
1100         return rdev->data_offset;
1101 }
1102
1103 struct raid10_plug_cb {
             /* Per-plug state; raid10_unplug() recovers it from 'cb' with
              * container_of() and frees it with kfree().
              */
1104         struct blk_plug_cb      cb;             /* embedded callback; cb.data is the mddev */
1105         struct bio_list         pending;        /* bios that raid10_unplug() queues or submits */
1106         int                     pending_cnt;    /* added to conf->pending_count on deferral */
1107 };
1108
1109 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1110 {
             /* Unplug callback: dispose of the bios gathered in a raid10_plug_cb.
              *
              * @cb:            callback embedded in the raid10_plug_cb
              * @from_schedule: non-zero forces the deferred path below
              *
              * If from_schedule is set, or current->bio_list is non-NULL, the
              * bios are moved to conf->pending_bio_list and the md thread is
              * woken.  Otherwise they are submitted here, after
              * bitmap_unplug().  The plug structure is freed on both paths.
              */
1111         struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1112                                                    cb);
1113         struct mddev *mddev = plug->cb.data;
1114         struct r10conf *conf = mddev->private;
1115         struct bio *bio;
1116
1117         if (from_schedule || current->bio_list) {
                     /* Deferred path: hand the bios and their count to the array */
1118                 spin_lock_irq(&conf->device_lock);
1119                 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1120                 conf->pending_count += plug->pending_cnt;
1121                 spin_unlock_irq(&conf->device_lock);
1122                 wake_up(&conf->wait_barrier);
1123                 md_wakeup_thread(mddev->thread);
1124                 kfree(plug);
1125                 return;
1126         }
1127
1128         /* we aren't scheduling, so we can do the write-out directly. */
1129         bio = bio_list_get(&plug->pending);
1130         bitmap_unplug(mddev->bitmap);
1131         wake_up(&conf->wait_barrier);
1132
1133         while (bio) { /* submit pending writes */
1134                 struct bio *next = bio->bi_next;
1135                 bio->bi_next = NULL;
                     /* Same rule as flush_pending_writes(): a discard aimed at
                      * a member whose queue does not support discard must not
                      * be submitted; complete it with status 0 instead.
                      */
                     if (unlikely((bio->bi_rw & REQ_DISCARD) &&
                         !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
                             bio_endio(bio, 0);
                     else
1136                         generic_make_request(bio);
1137                 bio = next;
1138         }
1139         kfree(plug);
1140 }
1141
1142 static void make_request(struct mddev *mddev, struct bio * bio)
1143 {
1144         struct r10conf *conf = mddev->private;
1145         struct r10bio *r10_bio;
1146         struct bio *read_bio;
1147         int i;
1148         sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1149         int chunk_sects = chunk_mask + 1;
1150         const int rw = bio_data_dir(bio);
1151         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1152         const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1153         const unsigned long do_discard = (bio->bi_rw
1154                                           & (REQ_DISCARD | REQ_SECURE));
1155         const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1156         unsigned long flags;
1157         struct md_rdev *blocked_rdev;
1158         struct blk_plug_cb *cb;
1159         struct raid10_plug_cb *plug = NULL;
1160         int sectors_handled;
1161         int max_sectors;
1162         int sectors;
1163
1164         if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1165                 md_flush_request(mddev, bio);
1166                 return;
1167         }
1168
1169         /* If this request crosses a chunk boundary, we need to
1170          * split it.  This will only happen for 1 PAGE (or less) requests.
1171          */
1172         if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
1173                      > chunk_sects
1174                      && (conf->geo.near_copies < conf->geo.raid_disks
1175                          || conf->prev.near_copies < conf->prev.raid_disks))) {
1176                 struct bio_pair *bp;
1177                 /* Sanity check -- queue functions should prevent this happening */
1178                 if (bio_segments(bio) > 1)
1179                         goto bad_map;
1180                 /* This is a one page bio that upper layers
1181                  * refuse to split for us, so we need to split it.
1182                  */
1183                 bp = bio_split(bio,
1184                                chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1185
1186                 /* Each of these 'make_request' calls will call 'wait_barrier'.
1187                  * If the first succeeds but the second blocks due to the resync
1188                  * thread raising the barrier, we will deadlock because the
1189                  * IO to the underlying device will be queued in generic_make_request
1190                  * and will never complete, so will never reduce nr_pending.
1191                  * So increment nr_waiting here so no new raise_barriers will
1192                  * succeed, and so the second wait_barrier cannot block.
1193                  */
1194                 spin_lock_irq(&conf->resync_lock);
1195                 conf->nr_waiting++;
1196                 spin_unlock_irq(&conf->resync_lock);
1197
1198                 make_request(mddev, &bp->bio1);
1199                 make_request(mddev, &bp->bio2);
1200
1201                 spin_lock_irq(&conf->resync_lock);
1202                 conf->nr_waiting--;
1203                 wake_up(&conf->wait_barrier);
1204                 spin_unlock_irq(&conf->resync_lock);
1205
1206                 bio_pair_release(bp);
1207                 return;
1208         bad_map:
1209                 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1210                        " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1211                        (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
1212
1213                 bio_io_error(bio);
1214                 return;
1215         }
1216
1217         md_write_start(mddev, bio);
1218
1219         /*
1220          * Register the new request and wait if the reconstruction
1221          * thread has put up a bar for new requests.
1222          * Continue immediately if no resync is active currently.
1223          */
1224         wait_barrier(conf);
1225
1226         sectors = bio_sectors(bio);
1227         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1228             bio->bi_sector < conf->reshape_progress &&
1229             bio->bi_sector + sectors > conf->reshape_progress) {
1230                 /* IO spans the reshape position.  Need to wait for
1231                  * reshape to pass
1232                  */
1233                 allow_barrier(conf);
1234                 wait_event(conf->wait_barrier,
1235                            conf->reshape_progress <= bio->bi_sector ||
1236                            conf->reshape_progress >= bio->bi_sector + sectors);
1237                 wait_barrier(conf);
1238         }
1239         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1240             bio_data_dir(bio) == WRITE &&
1241             (mddev->reshape_backwards
1242              ? (bio->bi_sector < conf->reshape_safe &&
1243                 bio->bi_sector + sectors > conf->reshape_progress)
1244              : (bio->bi_sector + sectors > conf->reshape_safe &&
1245                 bio->bi_sector < conf->reshape_progress))) {
1246                 /* Need to update reshape_position in metadata */
1247                 mddev->reshape_position = conf->reshape_progress;
1248                 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1249                 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1250                 md_wakeup_thread(mddev->thread);
1251                 wait_event(mddev->sb_wait,
1252                            !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1253
1254                 conf->reshape_safe = mddev->reshape_position;
1255         }
1256
1257         r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1258
1259         r10_bio->master_bio = bio;
1260         r10_bio->sectors = sectors;
1261
1262         r10_bio->mddev = mddev;
1263         r10_bio->sector = bio->bi_sector;
1264         r10_bio->state = 0;
1265
1266         /* We might need to issue multiple reads to different
1267          * devices if there are bad blocks around, so we keep
1268          * track of the number of reads in bio->bi_phys_segments.
1269          * If this is 0, there is only one r10_bio and no locking
1270          * will be needed when the request completes.  If it is
1271          * non-zero, then it is the number of not-completed requests.
1272          */
1273         bio->bi_phys_segments = 0;
1274         clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1275
1276         if (rw == READ) {
1277                 /*
1278                  * read balancing logic:
1279                  */
1280                 struct md_rdev *rdev;
1281                 int slot;
1282
1283 read_again:
1284                 rdev = read_balance(conf, r10_bio, &max_sectors);
1285                 if (!rdev) {
1286                         raid_end_bio_io(r10_bio);
1287                         return;
1288                 }
1289                 slot = r10_bio->read_slot;
1290
1291                 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1292                 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1293                             max_sectors);
1294
1295                 r10_bio->devs[slot].bio = read_bio;
1296                 r10_bio->devs[slot].rdev = rdev;
1297
1298                 read_bio->bi_sector = r10_bio->devs[slot].addr +
1299                         choose_data_offset(r10_bio, rdev);
1300                 read_bio->bi_bdev = rdev->bdev;
1301                 read_bio->bi_end_io = raid10_end_read_request;
1302                 read_bio->bi_rw = READ | do_sync;
1303                 read_bio->bi_private = r10_bio;
1304
1305                 if (max_sectors < r10_bio->sectors) {
1306                         /* Could not read all from this device, so we will
1307                          * need another r10_bio.
1308                          */
1309                         sectors_handled = (r10_bio->sectors + max_sectors
1310                                            - bio->bi_sector);
1311                         r10_bio->sectors = max_sectors;
1312                         spin_lock_irq(&conf->device_lock);
1313                         if (bio->bi_phys_segments == 0)
1314                                 bio->bi_phys_segments = 2;
1315                         else
1316                                 bio->bi_phys_segments++;
1317                         spin_unlock(&conf->device_lock);
1318                         /* Cannot call generic_make_request directly
1319                          * as that will be queued in __generic_make_request
1320                          * and subsequent mempool_alloc might block
1321                          * waiting for it.  so hand bio over to raid10d.
1322                          */
1323                         reschedule_retry(r10_bio);
1324
1325                         r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1326
1327                         r10_bio->master_bio = bio;
1328                         r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1329                         r10_bio->state = 0;
1330                         r10_bio->mddev = mddev;
1331                         r10_bio->sector = bio->bi_sector + sectors_handled;
1332                         goto read_again;
1333                 } else
1334                         generic_make_request(read_bio);
1335                 return;
1336         }
1337
1338         /*
1339          * WRITE:
1340          */
1341         if (conf->pending_count >= max_queued_requests) {
1342                 md_wakeup_thread(mddev->thread);
1343                 wait_event(conf->wait_barrier,
1344                            conf->pending_count < max_queued_requests);
1345         }
1346         /* first select target devices under rcu_lock and
1347          * inc refcount on their rdev.  Record them by setting
1348          * bios[x] to bio
1349          * If there are known/acknowledged bad blocks on any device
1350          * on which we have seen a write error, we want to avoid
1351          * writing to those blocks.  This potentially requires several
1352          * writes to write around the bad blocks.  Each set of writes
1353          * gets its own r10_bio with a set of bios attached.  The number
1354          * of r10_bios is recored in bio->bi_phys_segments just as with
1355          * the read case.
1356          */
1357
1358         r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1359         raid10_find_phys(conf, r10_bio);
1360 retry_write:
1361         blocked_rdev = NULL;
1362         rcu_read_lock();
1363         max_sectors = r10_bio->sectors;
1364
1365         for (i = 0;  i < conf->copies; i++) {
1366                 int d = r10_bio->devs[i].devnum;
1367                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1368                 struct md_rdev *rrdev = rcu_dereference(
1369                         conf->mirrors[d].replacement);
1370                 if (rdev == rrdev)
1371                         rrdev = NULL;
1372                 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1373                         atomic_inc(&rdev->nr_pending);
1374                         blocked_rdev = rdev;
1375                         break;
1376                 }
1377                 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1378                         atomic_inc(&rrdev->nr_pending);
1379                         blocked_rdev = rrdev;
1380                         break;
1381                 }
1382                 if (rdev && (test_bit(Faulty, &rdev->flags)
1383                              || test_bit(Unmerged, &rdev->flags)))
1384                         rdev = NULL;
1385                 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1386                               || test_bit(Unmerged, &rrdev->flags)))
1387                         rrdev = NULL;
1388
1389                 r10_bio->devs[i].bio = NULL;
1390                 r10_bio->devs[i].repl_bio = NULL;
1391
1392                 if (!rdev && !rrdev) {
1393                         set_bit(R10BIO_Degraded, &r10_bio->state);
1394                         continue;
1395                 }
1396                 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1397                         sector_t first_bad;
1398                         sector_t dev_sector = r10_bio->devs[i].addr;
1399                         int bad_sectors;
1400                         int is_bad;
1401
1402                         is_bad = is_badblock(rdev, dev_sector,
1403                                              max_sectors,
1404                                              &first_bad, &bad_sectors);
1405                         if (is_bad < 0) {
1406                                 /* Mustn't write here until the bad block
1407                                  * is acknowledged
1408                                  */
1409                                 atomic_inc(&rdev->nr_pending);
1410                                 set_bit(BlockedBadBlocks, &rdev->flags);
1411                                 blocked_rdev = rdev;
1412                                 break;
1413                         }
1414                         if (is_bad && first_bad <= dev_sector) {
1415                                 /* Cannot write here at all */
1416                                 bad_sectors -= (dev_sector - first_bad);
1417                                 if (bad_sectors < max_sectors)
1418                                         /* Mustn't write more than bad_sectors
1419                                          * to other devices yet
1420                                          */
1421                                         max_sectors = bad_sectors;
1422                                 /* We don't set R10BIO_Degraded as that
1423                                  * only applies if the disk is missing,
1424                                  * so it might be re-added, and we want to
1425                                  * know to recover this chunk.
1426                                  * In this case the device is here, and the
1427                                  * fact that this chunk is not in-sync is
1428                                  * recorded in the bad block log.
1429                                  */
1430                                 continue;
1431                         }
1432                         if (is_bad) {
1433                                 int good_sectors = first_bad - dev_sector;
1434                                 if (good_sectors < max_sectors)
1435                                         max_sectors = good_sectors;
1436                         }
1437                 }
1438                 if (rdev) {
1439                         r10_bio->devs[i].bio = bio;
1440                         atomic_inc(&rdev->nr_pending);
1441                 }
1442                 if (rrdev) {
1443                         r10_bio->devs[i].repl_bio = bio;
1444                         atomic_inc(&rrdev->nr_pending);
1445                 }
1446         }
1447         rcu_read_unlock();
1448
1449         if (unlikely(blocked_rdev)) {
1450                 /* Have to wait for this device to get unblocked, then retry */
1451                 int j;
1452                 int d;
1453
1454                 for (j = 0; j < i; j++) {
1455                         if (r10_bio->devs[j].bio) {
1456                                 d = r10_bio->devs[j].devnum;
1457                                 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1458                         }
1459                         if (r10_bio->devs[j].repl_bio) {
1460                                 struct md_rdev *rdev;
1461                                 d = r10_bio->devs[j].devnum;
1462                                 rdev = conf->mirrors[d].replacement;
1463                                 if (!rdev) {
1464                                         /* Race with remove_disk */
1465                                         smp_mb();
1466                                         rdev = conf->mirrors[d].rdev;
1467                                 }
1468                                 rdev_dec_pending(rdev, mddev);
1469                         }
1470                 }
1471                 allow_barrier(conf);
1472                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1473                 wait_barrier(conf);
1474                 goto retry_write;
1475         }
1476
1477         if (max_sectors < r10_bio->sectors) {
1478                 /* We are splitting this into multiple parts, so
1479                  * we need to prepare for allocating another r10_bio.
1480                  */
1481                 r10_bio->sectors = max_sectors;
1482                 spin_lock_irq(&conf->device_lock);
1483                 if (bio->bi_phys_segments == 0)
1484                         bio->bi_phys_segments = 2;
1485                 else
1486                         bio->bi_phys_segments++;
1487                 spin_unlock_irq(&conf->device_lock);
1488         }
1489         sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1490
1491         atomic_set(&r10_bio->remaining, 1);
1492         bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1493
1494         for (i = 0; i < conf->copies; i++) {
1495                 struct bio *mbio;
1496                 int d = r10_bio->devs[i].devnum;
1497                 if (r10_bio->devs[i].bio) {
1498                         struct md_rdev *rdev = conf->mirrors[d].rdev;
1499                         mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1500                         md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1501                                     max_sectors);
1502                         r10_bio->devs[i].bio = mbio;
1503
1504                         mbio->bi_sector = (r10_bio->devs[i].addr+
1505                                            choose_data_offset(r10_bio,
1506                                                               rdev));
1507                         mbio->bi_bdev = rdev->bdev;
1508                         mbio->bi_end_io = raid10_end_write_request;
1509                         mbio->bi_rw =
1510                                 WRITE | do_sync | do_fua | do_discard | do_same;
1511                         mbio->bi_private = r10_bio;
1512
1513                         atomic_inc(&r10_bio->remaining);
1514
1515                         cb = blk_check_plugged(raid10_unplug, mddev,
1516                                                sizeof(*plug));
1517                         if (cb)
1518                                 plug = container_of(cb, struct raid10_plug_cb,
1519                                                     cb);
1520                         else
1521                                 plug = NULL;
1522                         spin_lock_irqsave(&conf->device_lock, flags);
1523                         if (plug) {
1524                                 bio_list_add(&plug->pending, mbio);
1525                                 plug->pending_cnt++;
1526                         } else {
1527                                 bio_list_add(&conf->pending_bio_list, mbio);
1528                                 conf->pending_count++;
1529                         }
1530                         spin_unlock_irqrestore(&conf->device_lock, flags);
1531                         if (!plug)
1532                                 md_wakeup_thread(mddev->thread);
1533                 }
1534
1535                 if (r10_bio->devs[i].repl_bio) {
1536                         struct md_rdev *rdev = conf->mirrors[d].replacement;
1537                         if (rdev == NULL) {
1538                                 /* Replacement just got moved to main 'rdev' */
1539                                 smp_mb();
1540                                 rdev = conf->mirrors[d].rdev;
1541                         }
1542                         mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1543                         md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1544                                     max_sectors);
1545                         r10_bio->devs[i].repl_bio = mbio;
1546
1547                         mbio->bi_sector = (r10_bio->devs[i].addr +
1548                                            choose_data_offset(
1549                                                    r10_bio, rdev));
1550                         mbio->bi_bdev = rdev->bdev;
1551                         mbio->bi_end_io = raid10_end_write_request;
1552                         mbio->bi_rw =
1553                                 WRITE | do_sync | do_fua | do_discard | do_same;
1554                         mbio->bi_private = r10_bio;
1555
1556                         atomic_inc(&r10_bio->remaining);
1557                         spin_lock_irqsave(&conf->device_lock, flags);
1558                         bio_list_add(&conf->pending_bio_list, mbio);
1559                         conf->pending_count++;
1560                         spin_unlock_irqrestore(&conf->device_lock, flags);
1561                         if (!mddev_check_plugged(mddev))
1562                                 md_wakeup_thread(mddev->thread);
1563                 }
1564         }
1565
1566         /* Don't remove the bias on 'remaining' (one_write_done) until
1567          * after checking if we need to go around again.
1568          */
1569
1570         if (sectors_handled < bio_sectors(bio)) {
1571                 one_write_done(r10_bio);
1572                 /* We need another r10_bio.  It has already been counted
1573                  * in bio->bi_phys_segments.
1574                  */
1575                 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1576
1577                 r10_bio->master_bio = bio;
1578                 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1579
1580                 r10_bio->mddev = mddev;
1581                 r10_bio->sector = bio->bi_sector + sectors_handled;
1582                 r10_bio->state = 0;
1583                 goto retry_write;
1584         }
1585         one_write_done(r10_bio);
1586
1587         /* In case raid10d snuck in to freeze_array */
1588         wake_up(&conf->wait_barrier);
1589 }
1590
1591 static void status(struct seq_file *seq, struct mddev *mddev)
1592 {
1593         struct r10conf *conf = mddev->private;
1594         int i;
1595
1596         if (conf->geo.near_copies < conf->geo.raid_disks)
1597                 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1598         if (conf->geo.near_copies > 1)
1599                 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1600         if (conf->geo.far_copies > 1) {
1601                 if (conf->geo.far_offset)
1602                         seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1603                 else
1604                         seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1605         }
1606         seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1607                                         conf->geo.raid_disks - mddev->degraded);
1608         for (i = 0; i < conf->geo.raid_disks; i++)
1609                 seq_printf(seq, "%s",
1610                               conf->mirrors[i].rdev &&
1611                               test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1612         seq_printf(seq, "]");
1613 }
1614
1615 /* check if there are enough drives for
1616  * every block to appear on atleast one.
1617  * Don't consider the device numbered 'ignore'
1618  * as we might be about to remove it.
1619  */
1620 static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1621 {
1622         int first = 0;
1623
1624         do {
1625                 int n = conf->copies;
1626                 int cnt = 0;
1627                 int this = first;
1628                 while (n--) {
1629                         if (conf->mirrors[this].rdev &&
1630                             this != ignore)
1631                                 cnt++;
1632                         this = (this+1) % geo->raid_disks;
1633                 }
1634                 if (cnt == 0)
1635                         return 0;
1636                 first = (first + geo->near_copies) % geo->raid_disks;
1637         } while (first != 0);
1638         return 1;
1639 }
1640
1641 static int enough(struct r10conf *conf, int ignore)
1642 {
1643         return _enough(conf, &conf->geo, ignore) &&
1644                 _enough(conf, &conf->prev, ignore);
1645 }
1646
1647 static void error(struct mddev *mddev, struct md_rdev *rdev)
1648 {
1649         char b[BDEVNAME_SIZE];
1650         struct r10conf *conf = mddev->private;
1651
1652         /*
1653          * If it is not operational, then we have already marked it as dead
1654          * else if it is the last working disks, ignore the error, let the
1655          * next level up know.
1656          * else mark the drive as failed
1657          */
1658         if (test_bit(In_sync, &rdev->flags)
1659             && !enough(conf, rdev->raid_disk))
1660                 /*
1661                  * Don't fail the drive, just return an IO error.
1662                  */
1663                 return;
1664         if (test_and_clear_bit(In_sync, &rdev->flags)) {
1665                 unsigned long flags;
1666                 spin_lock_irqsave(&conf->device_lock, flags);
1667                 mddev->degraded++;
1668                 spin_unlock_irqrestore(&conf->device_lock, flags);
1669                 /*
1670                  * if recovery is running, make sure it aborts.
1671                  */
1672                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1673         }
1674         set_bit(Blocked, &rdev->flags);
1675         set_bit(Faulty, &rdev->flags);
1676         set_bit(MD_CHANGE_DEVS, &mddev->flags);
1677         printk(KERN_ALERT
1678                "md/raid10:%s: Disk failure on %s, disabling device.\n"
1679                "md/raid10:%s: Operation continuing on %d devices.\n",
1680                mdname(mddev), bdevname(rdev->bdev, b),
1681                mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1682 }
1683
1684 static void print_conf(struct r10conf *conf)
1685 {
1686         int i;
1687         struct raid10_info *tmp;
1688
1689         printk(KERN_DEBUG "RAID10 conf printout:\n");
1690         if (!conf) {
1691                 printk(KERN_DEBUG "(!conf)\n");
1692                 return;
1693         }
1694         printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1695                 conf->geo.raid_disks);
1696
1697         for (i = 0; i < conf->geo.raid_disks; i++) {
1698                 char b[BDEVNAME_SIZE];
1699                 tmp = conf->mirrors + i;
1700                 if (tmp->rdev)
1701                         printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1702                                 i, !test_bit(In_sync, &tmp->rdev->flags),
1703                                 !test_bit(Faulty, &tmp->rdev->flags),
1704                                 bdevname(tmp->rdev->bdev,b));
1705         }
1706 }
1707
1708 static void close_sync(struct r10conf *conf)
1709 {
1710         wait_barrier(conf);
1711         allow_barrier(conf);
1712
1713         mempool_destroy(conf->r10buf_pool);
1714         conf->r10buf_pool = NULL;
1715 }
1716
1717 static int raid10_spare_active(struct mddev *mddev)
1718 {
1719         int i;
1720         struct r10conf *conf = mddev->private;
1721         struct raid10_info *tmp;
1722         int count = 0;
1723         unsigned long flags;
1724
1725         /*
1726          * Find all non-in_sync disks within the RAID10 configuration
1727          * and mark them in_sync
1728          */
1729         for (i = 0; i < conf->geo.raid_disks; i++) {
1730                 tmp = conf->mirrors + i;
1731                 if (tmp->replacement
1732                     && tmp->replacement->recovery_offset == MaxSector
1733                     && !test_bit(Faulty, &tmp->replacement->flags)
1734                     && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1735                         /* Replacement has just become active */
1736                         if (!tmp->rdev
1737                             || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1738                                 count++;
1739                         if (tmp->rdev) {
1740                                 /* Replaced device not technically faulty,
1741                                  * but we need to be sure it gets removed
1742                                  * and never re-added.
1743                                  */
1744                                 set_bit(Faulty, &tmp->rdev->flags);
1745                                 sysfs_notify_dirent_safe(
1746                                         tmp->rdev->sysfs_state);
1747                         }
1748                         sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1749                 } else if (tmp->rdev
1750                            && !test_bit(Faulty, &tmp->rdev->flags)
1751                            && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1752                         count++;
1753                         sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1754                 }
1755         }
1756         spin_lock_irqsave(&conf->device_lock, flags);
1757         mddev->degraded -= count;
1758         spin_unlock_irqrestore(&conf->device_lock, flags);
1759
1760         print_conf(conf);
1761         return count;
1762 }
1763
1764
1765 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1766 {
1767         struct r10conf *conf = mddev->private;
1768         int err = -EEXIST;
1769         int mirror;
1770         int first = 0;
1771         int last = conf->geo.raid_disks - 1;
1772         struct request_queue *q = bdev_get_queue(rdev->bdev);
1773
1774         if (mddev->recovery_cp < MaxSector)
1775                 /* only hot-add to in-sync arrays, as recovery is
1776                  * very different from resync
1777                  */
1778                 return -EBUSY;
1779         if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1780                 return -EINVAL;
1781
1782         if (rdev->raid_disk >= 0)
1783                 first = last = rdev->raid_disk;
1784
1785         if (q->merge_bvec_fn) {
1786                 set_bit(Unmerged, &rdev->flags);
1787                 mddev->merge_check_needed = 1;
1788         }
1789
1790         if (rdev->saved_raid_disk >= first &&
1791             conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1792                 mirror = rdev->saved_raid_disk;
1793         else
1794                 mirror = first;
1795         for ( ; mirror <= last ; mirror++) {
1796                 struct raid10_info *p = &conf->mirrors[mirror];
1797                 if (p->recovery_disabled == mddev->recovery_disabled)
1798                         continue;
1799                 if (p->rdev) {
1800                         if (!test_bit(WantReplacement, &p->rdev->flags) ||
1801                             p->replacement != NULL)
1802                                 continue;
1803                         clear_bit(In_sync, &rdev->flags);
1804                         set_bit(Replacement, &rdev->flags);
1805                         rdev->raid_disk = mirror;
1806                         err = 0;
1807                         disk_stack_limits(mddev->gendisk, rdev->bdev,
1808                                           rdev->data_offset << 9);
1809                         conf->fullsync = 1;
1810                         rcu_assign_pointer(p->replacement, rdev);
1811                         break;
1812                 }
1813
1814                 disk_stack_limits(mddev->gendisk, rdev->bdev,
1815                                   rdev->data_offset << 9);
1816
1817                 p->head_position = 0;
1818                 p->recovery_disabled = mddev->recovery_disabled - 1;
1819                 rdev->raid_disk = mirror;
1820                 err = 0;
1821                 if (rdev->saved_raid_disk != mirror)
1822                         conf->fullsync = 1;
1823                 rcu_assign_pointer(p->rdev, rdev);
1824                 break;
1825         }
1826         if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1827                 /* Some requests might not have seen this new
1828                  * merge_bvec_fn.  We must wait for them to complete
1829                  * before merging the device fully.
1830                  * First we make sure any code which has tested
1831                  * our function has submitted the request, then
1832                  * we wait for all outstanding requests to complete.
1833                  */
1834                 synchronize_sched();
1835                 raise_barrier(conf, 0);
1836                 lower_barrier(conf);
1837                 clear_bit(Unmerged, &rdev->flags);
1838         }
1839         md_integrity_add_rdev(rdev, mddev);
1840         if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1841                 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1842
1843         print_conf(conf);
1844         return err;
1845 }
1846
1847 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1848 {
1849         struct r10conf *conf = mddev->private;
1850         int err = 0;
1851         int number = rdev->raid_disk;
1852         struct md_rdev **rdevp;
1853         struct raid10_info *p = conf->mirrors + number;
1854
1855         print_conf(conf);
1856         if (rdev == p->rdev)
1857                 rdevp = &p->rdev;
1858         else if (rdev == p->replacement)
1859                 rdevp = &p->replacement;
1860         else
1861                 return 0;
1862
1863         if (test_bit(In_sync, &rdev->flags) ||
1864             atomic_read(&rdev->nr_pending)) {
1865                 err = -EBUSY;
1866                 goto abort;
1867         }
1868         /* Only remove faulty devices if recovery
1869          * is not possible.
1870          */
1871         if (!test_bit(Faulty, &rdev->flags) &&
1872             mddev->recovery_disabled != p->recovery_disabled &&
1873             (!p->replacement || p->replacement == rdev) &&
1874             number < conf->geo.raid_disks &&
1875             enough(conf, -1)) {
1876                 err = -EBUSY;
1877                 goto abort;
1878         }
1879         *rdevp = NULL;
1880         synchronize_rcu();
1881         if (atomic_read(&rdev->nr_pending)) {
1882                 /* lost the race, try later */
1883                 err = -EBUSY;
1884                 *rdevp = rdev;
1885                 goto abort;
1886         } else if (p->replacement) {
1887                 /* We must have just cleared 'rdev' */
1888                 p->rdev = p->replacement;
1889                 clear_bit(Replacement, &p->replacement->flags);
1890                 smp_mb(); /* Make sure other CPUs may see both as identical
1891                            * but will never see neither -- if they are careful.
1892                            */
1893                 p->replacement = NULL;
1894                 clear_bit(WantReplacement, &rdev->flags);
1895         } else
1896                 /* We might have just remove the Replacement as faulty
1897                  * Clear the flag just in case
1898                  */
1899                 clear_bit(WantReplacement, &rdev->flags);
1900
1901         err = md_integrity_register(mddev);
1902
1903 abort:
1904
1905         print_conf(conf);
1906         return err;
1907 }
1908
1909
1910 static void end_sync_read(struct bio *bio, int error)
1911 {
1912         struct r10bio *r10_bio = bio->bi_private;
1913         struct r10conf *conf = r10_bio->mddev->private;
1914         int d;
1915
1916         if (bio == r10_bio->master_bio) {
1917                 /* this is a reshape read */
1918                 d = r10_bio->read_slot; /* really the read dev */
1919         } else
1920                 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1921
1922         if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1923                 set_bit(R10BIO_Uptodate, &r10_bio->state);
1924         else
1925                 /* The write handler will notice the lack of
1926                  * R10BIO_Uptodate and record any errors etc
1927                  */
1928                 atomic_add(r10_bio->sectors,
1929                            &conf->mirrors[d].rdev->corrected_errors);
1930
1931         /* for reconstruct, we always reschedule after a read.
1932          * for resync, only after all reads
1933          */
1934         rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1935         if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1936             atomic_dec_and_test(&r10_bio->remaining)) {
1937                 /* we have read all the blocks,
1938                  * do the comparison in process context in raid10d
1939                  */
1940                 reschedule_retry(r10_bio);
1941         }
1942 }
1943
1944 static void end_sync_request(struct r10bio *r10_bio)
1945 {
1946         struct mddev *mddev = r10_bio->mddev;
1947
1948         while (atomic_dec_and_test(&r10_bio->remaining)) {
1949                 if (r10_bio->master_bio == NULL) {
1950                         /* the primary of several recovery bios */
1951                         sector_t s = r10_bio->sectors;
1952                         if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1953                             test_bit(R10BIO_WriteError, &r10_bio->state))
1954                                 reschedule_retry(r10_bio);
1955                         else
1956                                 put_buf(r10_bio);
1957                         md_done_sync(mddev, s, 1);
1958                         break;
1959                 } else {
1960                         struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1961                         if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1962                             test_bit(R10BIO_WriteError, &r10_bio->state))
1963                                 reschedule_retry(r10_bio);
1964                         else
1965                                 put_buf(r10_bio);
1966                         r10_bio = r10_bio2;
1967                 }
1968         }
1969 }
1970
1971 static void end_sync_write(struct bio *bio, int error)
1972 {
1973         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1974         struct r10bio *r10_bio = bio->bi_private;
1975         struct mddev *mddev = r10_bio->mddev;
1976         struct r10conf *conf = mddev->private;
1977         int d;
1978         sector_t first_bad;
1979         int bad_sectors;
1980         int slot;
1981         int repl;
1982         struct md_rdev *rdev = NULL;
1983
1984         d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1985         if (repl)
1986                 rdev = conf->mirrors[d].replacement;
1987         else
1988                 rdev = conf->mirrors[d].rdev;
1989
1990         if (!uptodate) {
1991                 if (repl)
1992                         md_error(mddev, rdev);
1993                 else {
1994                         set_bit(WriteErrorSeen, &rdev->flags);
1995                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
1996                                 set_bit(MD_RECOVERY_NEEDED,
1997                                         &rdev->mddev->recovery);
1998                         set_bit(R10BIO_WriteError, &r10_bio->state);
1999                 }
2000         } else if (is_badblock(rdev,
2001                              r10_bio->devs[slot].addr,
2002                              r10_bio->sectors,
2003                              &first_bad, &bad_sectors))
2004                 set_bit(R10BIO_MadeGood, &r10_bio->state);
2005
2006         rdev_dec_pending(rdev, mddev);
2007
2008         end_sync_request(r10_bio);
2009 }
2010
/*
 * Note: sync and recover are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical addresses, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
2023 /*
2024  * We check if all blocks are in-sync and only write to blocks that
2025  * aren't in sync
2026  */
2027 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2028 {
2029         struct r10conf *conf = mddev->private;
2030         int i, first;
2031         struct bio *tbio, *fbio;
2032         int vcnt;
2033
2034         atomic_set(&r10_bio->remaining, 1);
2035
2036         /* find the first device with a block */
2037         for (i=0; i<conf->copies; i++)
2038                 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
2039                         break;
2040
2041         if (i == conf->copies)
2042                 goto done;
2043
2044         first = i;
2045         fbio = r10_bio->devs[i].bio;
2046
2047         vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2048         /* now find blocks with errors */
2049         for (i=0 ; i < conf->copies ; i++) {
2050                 int  j, d;
2051
2052                 tbio = r10_bio->devs[i].bio;
2053
2054                 if (tbio->bi_end_io != end_sync_read)
2055                         continue;
2056                 if (i == first)
2057                         continue;
2058                 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
2059                         /* We know that the bi_io_vec layout is the same for
2060                          * both 'first' and 'i', so we just compare them.
2061                          * All vec entries are PAGE_SIZE;
2062                          */
2063                         for (j = 0; j < vcnt; j++)
2064                                 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2065                                            page_address(tbio->bi_io_vec[j].bv_page),
2066                                            fbio->bi_io_vec[j].bv_len))
2067                                         break;
2068                         if (j == vcnt)
2069                                 continue;
2070                         atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2071                         if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2072                                 /* Don't fix anything. */
2073                                 continue;
2074                 }
2075                 /* Ok, we need to write this bio, either to correct an
2076                  * inconsistency or to correct an unreadable block.
2077                  * First we need to fixup bv_offset, bv_len and
2078                  * bi_vecs, as the read request might have corrupted these
2079                  */
2080                 tbio->bi_vcnt = vcnt;
2081                 tbio->bi_size = r10_bio->sectors << 9;
2082                 tbio->bi_idx = 0;
2083                 tbio->bi_phys_segments = 0;
2084                 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
2085                 tbio->bi_flags |= 1 << BIO_UPTODATE;
2086                 tbio->bi_next = NULL;
2087                 tbio->bi_rw = WRITE;
2088                 tbio->bi_private = r10_bio;
2089                 tbio->bi_sector = r10_bio->devs[i].addr;
2090
2091                 for (j=0; j < vcnt ; j++) {
2092                         tbio->bi_io_vec[j].bv_offset = 0;
2093                         tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
2094
2095                         memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2096                                page_address(fbio->bi_io_vec[j].bv_page),
2097                                PAGE_SIZE);
2098                 }
2099                 tbio->bi_end_io = end_sync_write;
2100
2101                 d = r10_bio->devs[i].devnum;
2102                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2103                 atomic_inc(&r10_bio->remaining);
2104                 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2105
2106                 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
2107                 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2108                 generic_make_request(tbio);
2109         }
2110
2111         /* Now write out to any replacement devices
2112          * that are active
2113          */
2114         for (i = 0; i < conf->copies; i++) {
2115                 int j, d;
2116
2117                 tbio = r10_bio->devs[i].repl_bio;
2118                 if (!tbio || !tbio->bi_end_io)
2119                         continue;
2120                 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2121                     && r10_bio->devs[i].bio != fbio)
2122                         for (j = 0; j < vcnt; j++)
2123                                 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2124                                        page_address(fbio->bi_io_vec[j].bv_page),
2125                                        PAGE_SIZE);
2126                 d = r10_bio->devs[i].devnum;
2127                 atomic_inc(&r10_bio->remaining);
2128                 md_sync_acct(conf->mirrors[d].replacement->bdev,
2129                              bio_sectors(tbio));
2130                 generic_make_request(tbio);
2131         }
2132
2133 done:
2134         if (atomic_dec_and_test(&r10_bio->remaining)) {
2135                 md_done_sync(mddev, r10_bio->sectors, 1);
2136                 put_buf(r10_bio);
2137         }
2138 }
2139
/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use: the first for reading,
 * the second for writing.
 *
 */
2150 static void fix_recovery_read_error(struct r10bio *r10_bio)
2151 {
2152         /* We got a read error during recovery.
2153          * We repeat the read in smaller page-sized sections.
2154          * If a read succeeds, write it to the new device or record
2155          * a bad block if we cannot.
2156          * If a read fails, record a bad block on both old and
2157          * new devices.
2158          */
2159         struct mddev *mddev = r10_bio->mddev;
2160         struct r10conf *conf = mddev->private;
2161         struct bio *bio = r10_bio->devs[0].bio;
2162         sector_t sect = 0;
2163         int sectors = r10_bio->sectors;
2164         int idx = 0;
2165         int dr = r10_bio->devs[0].devnum;
2166         int dw = r10_bio->devs[1].devnum;
2167
2168         while (sectors) {
2169                 int s = sectors;
2170                 struct md_rdev *rdev;
2171                 sector_t addr;
2172                 int ok;
2173
2174                 if (s > (PAGE_SIZE>>9))
2175                         s = PAGE_SIZE >> 9;
2176
2177                 rdev = conf->mirrors[dr].rdev;
2178                 addr = r10_bio->devs[0].addr + sect,
2179                 ok = sync_page_io(rdev,
2180                                   addr,
2181                                   s << 9,
2182                                   bio->bi_io_vec[idx].bv_page,
2183                                   READ, false);
2184                 if (ok) {
2185                         rdev = conf->mirrors[dw].rdev;
2186                         addr = r10_bio->devs[1].addr + sect;
2187                         ok = sync_page_io(rdev,
2188                                           addr,
2189                                           s << 9,
2190                                           bio->bi_io_vec[idx].bv_page,
2191                                           WRITE, false);
2192                         if (!ok) {
2193                                 set_bit(WriteErrorSeen, &rdev->flags);
2194                                 if (!test_and_set_bit(WantReplacement,
2195                                                       &rdev->flags))
2196                                         set_bit(MD_RECOVERY_NEEDED,
2197                                                 &rdev->mddev->recovery);
2198                         }
2199                 }
2200                 if (!ok) {
2201                         /* We don't worry if we cannot set a bad block -
2202                          * it really is bad so there is no loss in not
2203                          * recording it yet
2204                          */
2205                         rdev_set_badblocks(rdev, addr, s, 0);
2206
2207                         if (rdev != conf->mirrors[dw].rdev) {
2208                                 /* need bad block on destination too */
2209                                 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2210                                 addr = r10_bio->devs[1].addr + sect;
2211                                 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2212                                 if (!ok) {
2213                                         /* just abort the recovery */
2214                                         printk(KERN_NOTICE
2215                                                "md/raid10:%s: recovery aborted"
2216                                                " due to read error\n",
2217                                                mdname(mddev));
2218
2219                                         conf->mirrors[dw].recovery_disabled
2220                                                 = mddev->recovery_disabled;
2221                                         set_bit(MD_RECOVERY_INTR,
2222                                                 &mddev->recovery);
2223                                         break;
2224                                 }
2225                         }
2226                 }
2227
2228                 sectors -= s;
2229                 sect += s;
2230                 idx++;
2231         }
2232 }
2233
2234 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2235 {
2236         struct r10conf *conf = mddev->private;
2237         int d;
2238         struct bio *wbio, *wbio2;
2239
2240         if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2241                 fix_recovery_read_error(r10_bio);
2242                 end_sync_request(r10_bio);
2243                 return;
2244         }
2245
2246         /*
2247          * share the pages with the first bio
2248          * and submit the write request
2249          */
2250         d = r10_bio->devs[1].devnum;
2251         wbio = r10_bio->devs[1].bio;
2252         wbio2 = r10_bio->devs[1].repl_bio;
2253         if (wbio->bi_end_io) {
2254                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2255                 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2256                 generic_make_request(wbio);
2257         }
2258         if (wbio2 && wbio2->bi_end_io) {
2259                 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2260                 md_sync_acct(conf->mirrors[d].replacement->bdev,
2261                              bio_sectors(wbio2));
2262                 generic_make_request(wbio2);
2263         }
2264 }
2265
2266
2267 /*
2268  * Used by fix_read_error() to decay the per rdev read_errors.
2269  * We halve the read error count for every hour that has elapsed
2270  * since the last recorded read error.
2271  *
2272  */
2273 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2274 {
2275         struct timespec cur_time_mon;
2276         unsigned long hours_since_last;
2277         unsigned int read_errors = atomic_read(&rdev->read_errors);
2278
2279         ktime_get_ts(&cur_time_mon);
2280
2281         if (rdev->last_read_error.tv_sec == 0 &&
2282             rdev->last_read_error.tv_nsec == 0) {
2283                 /* first time we've seen a read error */
2284                 rdev->last_read_error = cur_time_mon;
2285                 return;
2286         }
2287
2288         hours_since_last = (cur_time_mon.tv_sec -
2289                             rdev->last_read_error.tv_sec) / 3600;
2290
2291         rdev->last_read_error = cur_time_mon;
2292
2293         /*
2294          * if hours_since_last is > the number of bits in read_errors
2295          * just set read errors to 0. We do this to avoid
2296          * overflowing the shift of read_errors by hours_since_last.
2297          */
2298         if (hours_since_last >= 8 * sizeof(read_errors))
2299                 atomic_set(&rdev->read_errors, 0);
2300         else
2301                 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2302 }
2303
2304 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2305                             int sectors, struct page *page, int rw)
2306 {
2307         sector_t first_bad;
2308         int bad_sectors;
2309
2310         if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2311             && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2312                 return -1;
2313         if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2314                 /* success */
2315                 return 1;
2316         if (rw == WRITE) {
2317                 set_bit(WriteErrorSeen, &rdev->flags);
2318                 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2319                         set_bit(MD_RECOVERY_NEEDED,
2320                                 &rdev->mddev->recovery);
2321         }
2322         /* need to record an error - either for the block or the device */
2323         if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2324                 md_error(rdev->mddev, rdev);
2325         return 0;
2326 }
2327
2328 /*
2329  * This is a kernel thread which:
2330  *
2331  *      1.      Retries failed read operations on working mirrors.
2332  *      2.      Updates the raid superblock when problems are encountered.
2333  *      3.      Performs writes following reads for array synchronising.
2334  */
2335
2336 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2337 {
2338         int sect = 0; /* Offset from r10_bio->sector */
2339         int sectors = r10_bio->sectors;
2340         struct md_rdev*rdev;
2341         int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2342         int d = r10_bio->devs[r10_bio->read_slot].devnum;
2343
2344         /* still own a reference to this rdev, so it cannot
2345          * have been cleared recently.
2346          */
2347         rdev = conf->mirrors[d].rdev;
2348
2349         if (test_bit(Faulty, &rdev->flags))
2350                 /* drive has already been failed, just ignore any
2351                    more fix_read_error() attempts */
2352                 return;
2353
2354         check_decay_read_errors(mddev, rdev);
2355         atomic_inc(&rdev->read_errors);
2356         if (atomic_read(&rdev->read_errors) > max_read_errors) {
2357                 char b[BDEVNAME_SIZE];
2358                 bdevname(rdev->bdev, b);
2359
2360                 printk(KERN_NOTICE
2361                        "md/raid10:%s: %s: Raid device exceeded "
2362                        "read_error threshold [cur %d:max %d]\n",
2363                        mdname(mddev), b,
2364                        atomic_read(&rdev->read_errors), max_read_errors);
2365                 printk(KERN_NOTICE
2366                        "md/raid10:%s: %s: Failing raid device\n",
2367                        mdname(mddev), b);
2368                 md_error(mddev, conf->mirrors[d].rdev);
2369                 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2370                 return;
2371         }
2372
2373         while(sectors) {
2374                 int s = sectors;
2375                 int sl = r10_bio->read_slot;
2376                 int success = 0;
2377                 int start;
2378
2379                 if (s > (PAGE_SIZE>>9))
2380                         s = PAGE_SIZE >> 9;
2381
2382                 rcu_read_lock();
2383                 do {
2384                         sector_t first_bad;
2385                         int bad_sectors;
2386
2387                         d = r10_bio->devs[sl].devnum;
2388                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2389                         if (rdev &&
2390                             !test_bit(Unmerged, &rdev->flags) &&
2391                             test_bit(In_sync, &rdev->flags) &&
2392                             is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2393                                         &first_bad, &bad_sectors) == 0) {
2394                                 atomic_inc(&rdev->nr_pending);
2395                                 rcu_read_unlock();
2396                                 success = sync_page_io(rdev,
2397                                                        r10_bio->devs[sl].addr +
2398                                                        sect,
2399                                                        s<<9,
2400                                                        conf->tmppage, READ, false);
2401                                 rdev_dec_pending(rdev, mddev);
2402                                 rcu_read_lock();
2403                                 if (success)
2404                                         break;
2405                         }
2406                         sl++;
2407                         if (sl == conf->copies)
2408                                 sl = 0;
2409                 } while (!success && sl != r10_bio->read_slot);
2410                 rcu_read_unlock();
2411
2412                 if (!success) {
2413                         /* Cannot read from anywhere, just mark the block
2414                          * as bad on the first device to discourage future
2415                          * reads.
2416                          */
2417                         int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2418                         rdev = conf->mirrors[dn].rdev;
2419
2420                         if (!rdev_set_badblocks(
2421                                     rdev,
2422                                     r10_bio->devs[r10_bio->read_slot].addr
2423                                     + sect,
2424                                     s, 0)) {
2425                                 md_error(mddev, rdev);
2426                                 r10_bio->devs[r10_bio->read_slot].bio
2427                                         = IO_BLOCKED;
2428                         }
2429                         break;
2430                 }
2431
2432                 start = sl;
2433                 /* write it back and re-read */
2434                 rcu_read_lock();
2435                 while (sl != r10_bio->read_slot) {
2436                         char b[BDEVNAME_SIZE];
2437
2438                         if (sl==0)
2439                                 sl = conf->copies;
2440                         sl--;
2441                         d = r10_bio->devs[sl].devnum;
2442                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2443                         if (!rdev ||
2444                             test_bit(Unmerged, &rdev->flags) ||
2445                             !test_bit(In_sync, &rdev->flags))
2446                                 continue;
2447
2448                         atomic_inc(&rdev->nr_pending);
2449                         rcu_read_unlock();
2450                         if (r10_sync_page_io(rdev,
2451                                              r10_bio->devs[sl].addr +
2452                                              sect,
2453                                              s, conf->tmppage, WRITE)
2454                             == 0) {
2455                                 /* Well, this device is dead */
2456                                 printk(KERN_NOTICE
2457                                        "md/raid10:%s: read correction "
2458                                        "write failed"
2459                                        " (%d sectors at %llu on %s)\n",
2460                                        mdname(mddev), s,
2461                                        (unsigned long long)(
2462                                                sect +
2463                                                choose_data_offset(r10_bio,
2464                                                                   rdev)),
2465                                        bdevname(rdev->bdev, b));
2466                                 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2467                                        "drive\n",
2468                                        mdname(mddev),
2469                                        bdevname(rdev->bdev, b));
2470                         }
2471                         rdev_dec_pending(rdev, mddev);
2472                         rcu_read_lock();
2473                 }
2474                 sl = start;
2475                 while (sl != r10_bio->read_slot) {
2476                         char b[BDEVNAME_SIZE];
2477
2478                         if (sl==0)
2479                                 sl = conf->copies;
2480                         sl--;
2481                         d = r10_bio->devs[sl].devnum;
2482                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2483                         if (!rdev ||
2484                             !test_bit(In_sync, &rdev->flags))
2485                                 continue;
2486
2487                         atomic_inc(&rdev->nr_pending);
2488                         rcu_read_unlock();
2489                         switch (r10_sync_page_io(rdev,
2490                                              r10_bio->devs[sl].addr +
2491                                              sect,
2492                                              s, conf->tmppage,
2493                                                  READ)) {
2494                         case 0:
2495                                 /* Well, this device is dead */
2496                                 printk(KERN_NOTICE
2497                                        "md/raid10:%s: unable to read back "
2498                                        "corrected sectors"
2499                                        " (%d sectors at %llu on %s)\n",
2500                                        mdname(mddev), s,
2501                                        (unsigned long long)(
2502                                                sect +
2503                                                choose_data_offset(r10_bio, rdev)),
2504                                        bdevname(rdev->bdev, b));
2505                                 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2506                                        "drive\n",
2507                                        mdname(mddev),
2508                                        bdevname(rdev->bdev, b));
2509                                 break;
2510                         case 1:
2511                                 printk(KERN_INFO
2512                                        "md/raid10:%s: read error corrected"
2513                                        " (%d sectors at %llu on %s)\n",
2514                                        mdname(mddev), s,
2515                                        (unsigned long long)(
2516                                                sect +
2517                                                choose_data_offset(r10_bio, rdev)),
2518                                        bdevname(rdev->bdev, b));
2519                                 atomic_add(s, &rdev->corrected_errors);
2520                         }
2521
2522                         rdev_dec_pending(rdev, mddev);
2523                         rcu_read_lock();
2524                 }
2525                 rcu_read_unlock();
2526
2527                 sectors -= s;
2528                 sect += s;
2529         }
2530 }
2531
2532 static void bi_complete(struct bio *bio, int error)
2533 {
2534         complete((struct completion *)bio->bi_private);
2535 }
2536
2537 static int submit_bio_wait(int rw, struct bio *bio)
2538 {
2539         struct completion event;
2540         rw |= REQ_SYNC;
2541
2542         init_completion(&event);
2543         bio->bi_private = &event;
2544         bio->bi_end_io = bi_complete;
2545         submit_bio(rw, bio);
2546         wait_for_completion(&event);
2547
2548         return test_bit(BIO_UPTODATE, &bio->bi_flags);
2549 }
2550
2551 static int narrow_write_error(struct r10bio *r10_bio, int i)
2552 {
2553         struct bio *bio = r10_bio->master_bio;
2554         struct mddev *mddev = r10_bio->mddev;
2555         struct r10conf *conf = mddev->private;
2556         struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2557         /* bio has the data to be written to slot 'i' where
2558          * we just recently had a write error.
2559          * We repeatedly clone the bio and trim down to one block,
2560          * then try the write.  Where the write fails we record
2561          * a bad block.
2562          * It is conceivable that the bio doesn't exactly align with
2563          * blocks.  We must handle this.
2564          *
2565          * We currently own a reference to the rdev.
2566          */
2567
2568         int block_sectors;
2569         sector_t sector;
2570         int sectors;
2571         int sect_to_write = r10_bio->sectors;
2572         int ok = 1;
2573
2574         if (rdev->badblocks.shift < 0)
2575                 return 0;
2576
2577         block_sectors = 1 << rdev->badblocks.shift;
2578         sector = r10_bio->sector;
2579         sectors = ((r10_bio->sector + block_sectors)
2580                    & ~(sector_t)(block_sectors - 1))
2581                 - sector;
2582
2583         while (sect_to_write) {
2584                 struct bio *wbio;
2585                 if (sectors > sect_to_write)
2586                         sectors = sect_to_write;
2587                 /* Write at 'sector' for 'sectors' */
2588                 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2589                 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2590                 wbio->bi_sector = (r10_bio->devs[i].addr+
2591                                    choose_data_offset(r10_bio, rdev) +
2592                                    (sector - r10_bio->sector));
2593                 wbio->bi_bdev = rdev->bdev;
2594                 if (submit_bio_wait(WRITE, wbio) == 0)
2595                         /* Failure! */
2596                         ok = rdev_set_badblocks(rdev, sector,
2597                                                 sectors, 0)
2598                                 && ok;
2599
2600                 bio_put(wbio);
2601                 sect_to_write -= sectors;
2602                 sector += sectors;
2603                 sectors = block_sectors;
2604         }
2605         return ok;
2606 }
2607
2608 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2609 {
2610         int slot = r10_bio->read_slot;
2611         struct bio *bio;
2612         struct r10conf *conf = mddev->private;
2613         struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2614         char b[BDEVNAME_SIZE];
2615         unsigned long do_sync;
2616         int max_sectors;
2617
2618         /* we got a read error. Maybe the drive is bad.  Maybe just
2619          * the block and we can fix it.
2620          * We freeze all other IO, and try reading the block from
2621          * other devices.  When we find one, we re-write
2622          * and check it that fixes the read error.
2623          * This is all done synchronously while the array is
2624          * frozen.
2625          */
2626         bio = r10_bio->devs[slot].bio;
2627         bdevname(bio->bi_bdev, b);
2628         bio_put(bio);
2629         r10_bio->devs[slot].bio = NULL;
2630
2631         if (mddev->ro == 0) {
2632                 freeze_array(conf);
2633                 fix_read_error(conf, mddev, r10_bio);
2634                 unfreeze_array(conf);
2635         } else
2636                 r10_bio->devs[slot].bio = IO_BLOCKED;
2637
2638         rdev_dec_pending(rdev, mddev);
2639
2640 read_more:
2641         rdev = read_balance(conf, r10_bio, &max_sectors);
2642         if (rdev == NULL) {
2643                 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2644                        " read error for block %llu\n",
2645                        mdname(mddev), b,
2646                        (unsigned long long)r10_bio->sector);
2647                 raid_end_bio_io(r10_bio);
2648                 return;
2649         }
2650
2651         do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2652         slot = r10_bio->read_slot;
2653         printk_ratelimited(
2654                 KERN_ERR
2655                 "md/raid10:%s: %s: redirecting "
2656                 "sector %llu to another mirror\n",
2657                 mdname(mddev),
2658                 bdevname(rdev->bdev, b),
2659                 (unsigned long long)r10_bio->sector);
2660         bio = bio_clone_mddev(r10_bio->master_bio,
2661                               GFP_NOIO, mddev);
2662         md_trim_bio(bio,
2663                     r10_bio->sector - bio->bi_sector,
2664                     max_sectors);
2665         r10_bio->devs[slot].bio = bio;
2666         r10_bio->devs[slot].rdev = rdev;
2667         bio->bi_sector = r10_bio->devs[slot].addr
2668                 + choose_data_offset(r10_bio, rdev);
2669         bio->bi_bdev = rdev->bdev;
2670         bio->bi_rw = READ | do_sync;
2671         bio->bi_private = r10_bio;
2672         bio->bi_end_io = raid10_end_read_request;
2673         if (max_sectors < r10_bio->sectors) {
2674                 /* Drat - have to split this up more */
2675                 struct bio *mbio = r10_bio->master_bio;
2676                 int sectors_handled =
2677                         r10_bio->sector + max_sectors
2678                         - mbio->bi_sector;
2679                 r10_bio->sectors = max_sectors;
2680                 spin_lock_irq(&conf->device_lock);
2681                 if (mbio->bi_phys_segments == 0)
2682                         mbio->bi_phys_segments = 2;
2683                 else
2684                         mbio->bi_phys_segments++;
2685                 spin_unlock_irq(&conf->device_lock);
2686                 generic_make_request(bio);
2687
2688                 r10_bio = mempool_alloc(conf->r10bio_pool,
2689                                         GFP_NOIO);
2690                 r10_bio->master_bio = mbio;
2691                 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2692                 r10_bio->state = 0;
2693                 set_bit(R10BIO_ReadError,
2694                         &r10_bio->state);
2695                 r10_bio->mddev = mddev;
2696                 r10_bio->sector = mbio->bi_sector
2697                         + sectors_handled;
2698
2699                 goto read_more;
2700         } else
2701                 generic_make_request(bio);
2702 }
2703
2704 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2705 {
2706         /* Some sort of write request has finished and it
2707          * succeeded in writing where we thought there was a
2708          * bad block.  So forget the bad block.
2709          * Or possibly if failed and we need to record
2710          * a bad block.
2711          */
2712         int m;
2713         struct md_rdev *rdev;
2714
2715         if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2716             test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2717                 for (m = 0; m < conf->copies; m++) {
2718                         int dev = r10_bio->devs[m].devnum;
2719                         rdev = conf->mirrors[dev].rdev;
2720                         if (r10_bio->devs[m].bio == NULL)
2721                                 continue;
2722                         if (test_bit(BIO_UPTODATE,
2723                                      &r10_bio->devs[m].bio->bi_flags)) {
2724                                 rdev_clear_badblocks(
2725                                         rdev,
2726                                         r10_bio->devs[m].addr,
2727                                         r10_bio->sectors, 0);
2728                         } else {
2729                                 if (!rdev_set_badblocks(
2730                                             rdev,
2731                                             r10_bio->devs[m].addr,
2732                                             r10_bio->sectors, 0))
2733                                         md_error(conf->mddev, rdev);
2734                         }
2735                         rdev = conf->mirrors[dev].replacement;
2736                         if (r10_bio->devs[m].repl_bio == NULL)
2737                                 continue;
2738                         if (test_bit(BIO_UPTODATE,
2739                                      &r10_bio->devs[m].repl_bio->bi_flags)) {
2740                                 rdev_clear_badblocks(
2741                                         rdev,
2742                                         r10_bio->devs[m].addr,
2743                                         r10_bio->sectors, 0);
2744                         } else {
2745                                 if (!rdev_set_badblocks(
2746                                             rdev,
2747                                             r10_bio->devs[m].addr,
2748                                             r10_bio->sectors, 0))
2749                                         md_error(conf->mddev, rdev);
2750                         }
2751                 }
2752                 put_buf(r10_bio);
2753         } else {
2754                 for (m = 0; m < conf->copies; m++) {
2755                         int dev = r10_bio->devs[m].devnum;
2756                         struct bio *bio = r10_bio->devs[m].bio;
2757                         rdev = conf->mirrors[dev].rdev;
2758                         if (bio == IO_MADE_GOOD) {
2759                                 rdev_clear_badblocks(
2760                                         rdev,
2761                                         r10_bio->devs[m].addr,
2762                                         r10_bio->sectors, 0);
2763                                 rdev_dec_pending(rdev, conf->mddev);
2764                         } else if (bio != NULL &&
2765                                    !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2766                                 if (!narrow_write_error(r10_bio, m)) {
2767                                         md_error(conf->mddev, rdev);
2768                                         set_bit(R10BIO_Degraded,
2769                                                 &r10_bio->state);
2770                                 }
2771                                 rdev_dec_pending(rdev, conf->mddev);
2772                         }
2773                         bio = r10_bio->devs[m].repl_bio;
2774                         rdev = conf->mirrors[dev].replacement;
2775                         if (rdev && bio == IO_MADE_GOOD) {
2776                                 rdev_clear_badblocks(
2777                                         rdev,
2778                                         r10_bio->devs[m].addr,
2779                                         r10_bio->sectors, 0);
2780                                 rdev_dec_pending(rdev, conf->mddev);
2781                         }
2782                 }
2783                 if (test_bit(R10BIO_WriteError,
2784                              &r10_bio->state))
2785                         close_write(r10_bio);
2786                 raid_end_bio_io(r10_bio);
2787         }
2788 }
2789
2790 static void raid10d(struct md_thread *thread)
2791 {
2792         struct mddev *mddev = thread->mddev;
2793         struct r10bio *r10_bio;
2794         unsigned long flags;
2795         struct r10conf *conf = mddev->private;
2796         struct list_head *head = &conf->retry_list;
2797         struct blk_plug plug;
2798
2799         md_check_recovery(mddev);
2800
2801         blk_start_plug(&plug);
2802         for (;;) {
2803
2804                 flush_pending_writes(conf);
2805
2806                 spin_lock_irqsave(&conf->device_lock, flags);
2807                 if (list_empty(head)) {
2808                         spin_unlock_irqrestore(&conf->device_lock, flags);
2809                         break;
2810                 }
2811                 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2812                 list_del(head->prev);
2813                 conf->nr_queued--;
2814                 spin_unlock_irqrestore(&conf->device_lock, flags);
2815
2816                 mddev = r10_bio->mddev;
2817                 conf = mddev->private;
2818                 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2819                     test_bit(R10BIO_WriteError, &r10_bio->state))
2820                         handle_write_completed(conf, r10_bio);
2821                 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2822                         reshape_request_write(mddev, r10_bio);
2823                 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2824                         sync_request_write(mddev, r10_bio);
2825                 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2826                         recovery_request_write(mddev, r10_bio);
2827                 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2828                         handle_read_error(mddev, r10_bio);
2829                 else {
2830                         /* just a partial read to be scheduled from a
2831                          * separate context
2832                          */
2833                         int slot = r10_bio->read_slot;
2834                         generic_make_request(r10_bio->devs[slot].bio);
2835                 }
2836
2837                 cond_resched();
2838                 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2839                         md_check_recovery(mddev);
2840         }
2841         blk_finish_plug(&plug);
2842 }
2843
2844
2845 static int init_resync(struct r10conf *conf)
2846 {
2847         int buffs;
2848         int i;
2849
2850         buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2851         BUG_ON(conf->r10buf_pool);
2852         conf->have_replacement = 0;
2853         for (i = 0; i < conf->geo.raid_disks; i++)
2854                 if (conf->mirrors[i].replacement)
2855                         conf->have_replacement = 1;
2856         conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2857         if (!conf->r10buf_pool)
2858                 return -ENOMEM;
2859         conf->next_resync = 0;
2860         return 0;
2861 }
2862
2863 /*
2864  * perform a "sync" on one "block"
2865  *
2866  * We need to make sure that no normal I/O request - particularly write
2867  * requests - conflict with active sync requests.
2868  *
2869  * This is achieved by tracking pending requests and a 'barrier' concept
2870  * that can be installed to exclude normal IO requests.
2871  *
2872  * Resync and recovery are handled very differently.
2873  * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
2874  *
2875  * For resync, we iterate over virtual addresses, read all copies,
2876  * and update if there are differences.  If only one copy is live,
2877  * skip it.
2878  * For recovery, we iterate over physical addresses, read a good
2879  * value for each non-in_sync drive, and over-write.
2880  *
2881  * So, for recovery we may have several outstanding complex requests for a
2882  * given address, one for each out-of-sync device.  We model this by allocating
2883  * a number of r10_bio structures, one for each out-of-sync device.
2884  * As we setup these structures, we collect all bio's together into a list
2885  * which we then process collectively to add pages, and then process again
2886  * to pass to generic_make_request.
2887  *
2888  * The r10_bio structures are linked using a borrowed master_bio pointer.
2889  * This link is counted in ->remaining.  When the r10_bio that points to NULL
2890  * has its remaining count decremented to 0, the whole complex operation
2891  * is complete.
2892  *
2893  */
2894
2895 static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2896                              int *skipped, int go_faster)
2897 {
2898         struct r10conf *conf = mddev->private;
2899         struct r10bio *r10_bio;
2900         struct bio *biolist = NULL, *bio;
2901         sector_t max_sector, nr_sectors;
2902         int i;
2903         int max_sync;
2904         sector_t sync_blocks;
2905         sector_t sectors_skipped = 0;
2906         int chunks_skipped = 0;
2907         sector_t chunk_mask = conf->geo.chunk_mask;
2908
2909         if (!conf->r10buf_pool)
2910                 if (init_resync(conf))
2911                         return 0;
2912
2913  skipped:
2914         max_sector = mddev->dev_sectors;
2915         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2916             test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2917                 max_sector = mddev->resync_max_sectors;
2918         if (sector_nr >= max_sector) {
2919                 /* If we aborted, we need to abort the
2920                  * sync on the 'current' bitmap chucks (there can
2921                  * be several when recovering multiple devices).
2922                  * as we may have started syncing it but not finished.
2923                  * We can find the current address in
2924                  * mddev->curr_resync, but for recovery,
2925                  * we need to convert that to several
2926                  * virtual addresses.
2927                  */
2928                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2929                         end_reshape(conf);
2930                         return 0;
2931                 }
2932
2933                 if (mddev->curr_resync < max_sector) { /* aborted */
2934                         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2935                                 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2936                                                 &sync_blocks, 1);
2937                         else for (i = 0; i < conf->geo.raid_disks; i++) {
2938                                 sector_t sect =
2939                                         raid10_find_virt(conf, mddev->curr_resync, i);
2940                                 bitmap_end_sync(mddev->bitmap, sect,
2941                                                 &sync_blocks, 1);
2942                         }
2943                 } else {
2944                         /* completed sync */
2945                         if ((!mddev->bitmap || conf->fullsync)
2946                             && conf->have_replacement
2947                             && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2948                                 /* Completed a full sync so the replacements
2949                                  * are now fully recovered.
2950                                  */
2951                                 for (i = 0; i < conf->geo.raid_disks; i++)
2952                                         if (conf->mirrors[i].replacement)
2953                                                 conf->mirrors[i].replacement
2954                                                         ->recovery_offset
2955                                                         = MaxSector;
2956                         }
2957                         conf->fullsync = 0;
2958                 }
2959                 bitmap_close_sync(mddev->bitmap);
2960                 close_sync(conf);
2961                 *skipped = 1;
2962                 return sectors_skipped;
2963         }
2964
2965         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2966                 return reshape_request(mddev, sector_nr, skipped);
2967
2968         if (chunks_skipped >= conf->geo.raid_disks) {
2969                 /* if there has been nothing to do on any drive,
2970                  * then there is nothing to do at all..
2971                  */
2972                 *skipped = 1;
2973                 return (max_sector - sector_nr) + sectors_skipped;
2974         }
2975
2976         if (max_sector > mddev->resync_max)
2977                 max_sector = mddev->resync_max; /* Don't do IO beyond here */
2978
2979         /* make sure whole request will fit in a chunk - if chunks
2980          * are meaningful
2981          */
2982         if (conf->geo.near_copies < conf->geo.raid_disks &&
2983             max_sector > (sector_nr | chunk_mask))
2984                 max_sector = (sector_nr | chunk_mask) + 1;
2985         /*
2986          * If there is non-resync activity waiting for us then
2987          * put in a delay to throttle resync.
2988          */
2989         if (!go_faster && conf->nr_waiting)
2990                 msleep_interruptible(1000);
2991
2992         /* Again, very different code for resync and recovery.
2993          * Both must result in an r10bio with a list of bios that
2994          * have bi_end_io, bi_sector, bi_bdev set,
2995          * and bi_private set to the r10bio.
2996          * For recovery, we may actually create several r10bios
2997          * with 2 bios in each, that correspond to the bios in the main one.
2998          * In this case, the subordinate r10bios link back through a
2999          * borrowed master_bio pointer, and the counter in the master
3000          * includes a ref from each subordinate.
3001          */
3002         /* First, we decide what to do and set ->bi_end_io
3003          * To end_sync_read if we want to read, and
3004          * end_sync_write if we will want to write.
3005          */
3006
3007         max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3008         if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3009                 /* recovery... the complicated one */
3010                 int j;
3011                 r10_bio = NULL;
3012
3013                 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3014                         int still_degraded;
3015                         struct r10bio *rb2;
3016                         sector_t sect;
3017                         int must_sync;
3018                         int any_working;
3019                         struct raid10_info *mirror = &conf->mirrors[i];
3020
3021                         if ((mirror->rdev == NULL ||
3022                              test_bit(In_sync, &mirror->rdev->flags))
3023                             &&
3024                             (mirror->replacement == NULL ||
3025                              test_bit(Faulty,
3026                                       &mirror->replacement->flags)))
3027                                 continue;
3028
3029                         still_degraded = 0;
3030                         /* want to reconstruct this device */
3031                         rb2 = r10_bio;
3032                         sect = raid10_find_virt(conf, sector_nr, i);
3033                         if (sect >= mddev->resync_max_sectors) {
3034                                 /* last stripe is not complete - don't
3035                                  * try to recover this sector.
3036                                  */
3037                                 continue;
3038                         }
3039                         /* Unless we are doing a full sync, or a replacement
3040                          * we only need to recover the block if it is set in
3041                          * the bitmap
3042                          */
3043                         must_sync = bitmap_start_sync(mddev->bitmap, sect,
3044                                                       &sync_blocks, 1);
3045                         if (sync_blocks < max_sync)
3046                                 max_sync = sync_blocks;
3047                         if (!must_sync &&
3048                             mirror->replacement == NULL &&
3049                             !conf->fullsync) {
3050                                 /* yep, skip the sync_blocks here, but don't assume
3051                                  * that there will never be anything to do here
3052                                  */
3053                                 chunks_skipped = -1;
3054                                 continue;
3055                         }
3056
3057                         r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3058                         raise_barrier(conf, rb2 != NULL);
3059                         atomic_set(&r10_bio->remaining, 0);
3060
3061                         r10_bio->master_bio = (struct bio*)rb2;
3062                         if (rb2)
3063                                 atomic_inc(&rb2->remaining);
3064                         r10_bio->mddev = mddev;
3065                         set_bit(R10BIO_IsRecover, &r10_bio->state);
3066                         r10_bio->sector = sect;
3067
3068                         raid10_find_phys(conf, r10_bio);
3069
3070                         /* Need to check if the array will still be
3071                          * degraded
3072                          */
3073                         for (j = 0; j < conf->geo.raid_disks; j++)
3074                                 if (conf->mirrors[j].rdev == NULL ||
3075                                     test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3076                                         still_degraded = 1;
3077                                         break;
3078                                 }
3079
3080                         must_sync = bitmap_start_sync(mddev->bitmap, sect,
3081                                                       &sync_blocks, still_degraded);
3082
3083                         any_working = 0;
3084                         for (j=0; j<conf->copies;j++) {
3085                                 int k;
3086                                 int d = r10_bio->devs[j].devnum;
3087                                 sector_t from_addr, to_addr;
3088                                 struct md_rdev *rdev;
3089                                 sector_t sector, first_bad;
3090                                 int bad_sectors;
3091                                 if (!conf->mirrors[d].rdev ||
3092                                     !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3093                                         continue;
3094                                 /* This is where we read from */
3095                                 any_working = 1;
3096                                 rdev = conf->mirrors[d].rdev;
3097                                 sector = r10_bio->devs[j].addr;
3098
3099                                 if (is_badblock(rdev, sector, max_sync,
3100                                                 &first_bad, &bad_sectors)) {
3101                                         if (first_bad > sector)
3102                                                 max_sync = first_bad - sector;
3103                                         else {
3104                                                 bad_sectors -= (sector
3105                                                                 - first_bad);
3106                                                 if (max_sync > bad_sectors)
3107                                                         max_sync = bad_sectors;
3108                                                 continue;
3109                                         }
3110                                 }
3111                                 bio = r10_bio->devs[0].bio;
3112                                 bio->bi_next = biolist;
3113                                 biolist = bio;
3114                                 bio->bi_private = r10_bio;
3115                                 bio->bi_end_io = end_sync_read;
3116                                 bio->bi_rw = READ;
3117                                 from_addr = r10_bio->devs[j].addr;
3118                                 bio->bi_sector = from_addr + rdev->data_offset;
3119                                 bio->bi_bdev = rdev->bdev;
3120                                 atomic_inc(&rdev->nr_pending);
3121                                 /* and we write to 'i' (if not in_sync) */
3122
3123                                 for (k=0; k<conf->copies; k++)
3124                                         if (r10_bio->devs[k].devnum == i)
3125                                                 break;
3126                                 BUG_ON(k == conf->copies);
3127                                 to_addr = r10_bio->devs[k].addr;
3128                                 r10_bio->devs[0].devnum = d;
3129                                 r10_bio->devs[0].addr = from_addr;
3130                                 r10_bio->devs[1].devnum = i;
3131                                 r10_bio->devs[1].addr = to_addr;
3132
3133                                 rdev = mirror->rdev;
3134                                 if (!test_bit(In_sync, &rdev->flags)) {
3135                                         bio = r10_bio->devs[1].bio;
3136                                         bio->bi_next = biolist;
3137                                         biolist = bio;
3138                                         bio->bi_private = r10_bio;
3139                                         bio->bi_end_io = end_sync_write;
3140                                         bio->bi_rw = WRITE;
3141                                         bio->bi_sector = to_addr
3142                                                 + rdev->data_offset;
3143                                         bio->bi_bdev = rdev->bdev;
3144                                         atomic_inc(&r10_bio->remaining);
3145                                 } else
3146                                         r10_bio->devs[1].bio->bi_end_io = NULL;
3147
3148                                 /* and maybe write to replacement */
3149                                 bio = r10_bio->devs[1].repl_bio;
3150                                 if (bio)
3151                                         bio->bi_end_io = NULL;
3152                                 rdev = mirror->replacement;
3153                                 /* Note: if rdev != NULL, then bio
3154                                  * cannot be NULL as r10buf_pool_alloc will
3155                                  * have allocated it.
3156                                  * So the second test here is pointless.
3157                                  * But it keeps semantic-checkers happy, and
3158                                  * this comment keeps human reviewers
3159                                  * happy.
3160                                  */
3161                                 if (rdev == NULL || bio == NULL ||
3162                                     test_bit(Faulty, &rdev->flags))
3163                                         break;
3164                                 bio->bi_next = biolist;
3165                                 biolist = bio;
3166                                 bio->bi_private = r10_bio;
3167                                 bio->bi_end_io = end_sync_write;
3168                                 bio->bi_rw = WRITE;
3169                                 bio->bi_sector = to_addr + rdev->data_offset;
3170                                 bio->bi_bdev = rdev->bdev;
3171                                 atomic_inc(&r10_bio->remaining);
3172                                 break;
3173                         }
3174                         if (j == conf->copies) {
3175                                 /* Cannot recover, so abort the recovery or
3176                                  * record a bad block */
3177                                 put_buf(r10_bio);
3178                                 if (rb2)
3179                                         atomic_dec(&rb2->remaining);
3180                                 r10_bio = rb2;
3181                                 if (any_working) {
3182                                         /* problem is that there are bad blocks
3183                                          * on other device(s)
3184                                          */
3185                                         int k;
3186                                         for (k = 0; k < conf->copies; k++)
3187                                                 if (r10_bio->devs[k].devnum == i)
3188                                                         break;
3189                                         if (!test_bit(In_sync,
3190                                                       &mirror->rdev->flags)
3191                                             && !rdev_set_badblocks(
3192                                                     mirror->rdev,
3193                                                     r10_bio->devs[k].addr,
3194                                                     max_sync, 0))
3195                                                 any_working = 0;
3196                                         if (mirror->replacement &&
3197                                             !rdev_set_badblocks(
3198                                                     mirror->replacement,
3199                                                     r10_bio->devs[k].addr,
3200                                                     max_sync, 0))
3201                                                 any_working = 0;
3202                                 }
3203                                 if (!any_working)  {
3204                                         if (!test_and_set_bit(MD_RECOVERY_INTR,
3205                                                               &mddev->recovery))
3206                                                 printk(KERN_INFO "md/raid10:%s: insufficient "
3207                                                        "working devices for recovery.\n",
3208                                                        mdname(mddev));
3209                                         mirror->recovery_disabled
3210                                                 = mddev->recovery_disabled;
3211                                 }
3212                                 break;
3213                         }
3214                 }
3215                 if (biolist == NULL) {
3216                         while (r10_bio) {
3217                                 struct r10bio *rb2 = r10_bio;
3218                                 r10_bio = (struct r10bio*) rb2->master_bio;
3219                                 rb2->master_bio = NULL;
3220                                 put_buf(rb2);
3221                         }
3222                         goto giveup;
3223                 }
3224         } else {
3225                 /* resync. Schedule a read for every block at this virt offset */
3226                 int count = 0;
3227
3228                 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3229
3230                 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3231                                        &sync_blocks, mddev->degraded) &&
3232                     !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3233                                                  &mddev->recovery)) {
3234                         /* We can skip this block */
3235                         *skipped = 1;
3236                         return sync_blocks + sectors_skipped;
3237                 }
3238                 if (sync_blocks < max_sync)
3239                         max_sync = sync_blocks;
3240                 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3241
3242                 r10_bio->mddev = mddev;
3243                 atomic_set(&r10_bio->remaining, 0);
3244                 raise_barrier(conf, 0);
3245                 conf->next_resync = sector_nr;
3246
3247                 r10_bio->master_bio = NULL;
3248                 r10_bio->sector = sector_nr;
3249                 set_bit(R10BIO_IsSync, &r10_bio->state);
3250                 raid10_find_phys(conf, r10_bio);
3251                 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3252
3253                 for (i = 0; i < conf->copies; i++) {
3254                         int d = r10_bio->devs[i].devnum;
3255                         sector_t first_bad, sector;
3256                         int bad_sectors;
3257
3258                         if (r10_bio->devs[i].repl_bio)
3259                                 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3260
3261                         bio = r10_bio->devs[i].bio;
3262                         bio->bi_end_io = NULL;
3263                         clear_bit(BIO_UPTODATE, &bio->bi_flags);
3264                         if (conf->mirrors[d].rdev == NULL ||
3265                             test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3266                                 continue;
3267                         sector = r10_bio->devs[i].addr;
3268                         if (is_badblock(conf->mirrors[d].rdev,
3269                                         sector, max_sync,
3270                                         &first_bad, &bad_sectors)) {
3271                                 if (first_bad > sector)
3272                                         max_sync = first_bad - sector;
3273                                 else {
3274                                         bad_sectors -= (sector - first_bad);
3275                                         if (max_sync > bad_sectors)
3276                                                 max_sync = bad_sectors;
3277                                         continue;
3278                                 }
3279                         }
3280                         atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3281                         atomic_inc(&r10_bio->remaining);
3282                         bio->bi_next = biolist;
3283                         biolist = bio;
3284                         bio->bi_private = r10_bio;
3285                         bio->bi_end_io = end_sync_read;
3286                         bio->bi_rw = READ;
3287                         bio->bi_sector = sector +
3288                                 conf->mirrors[d].rdev->data_offset;
3289                         bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3290                         count++;
3291
3292                         if (conf->mirrors[d].replacement == NULL ||
3293                             test_bit(Faulty,
3294                                      &conf->mirrors[d].replacement->flags))
3295                                 continue;
3296
3297                         /* Need to set up for writing to the replacement */
3298                         bio = r10_bio->devs[i].repl_bio;
3299                         clear_bit(BIO_UPTODATE, &bio->bi_flags);
3300
3301                         sector = r10_bio->devs[i].addr;
3302                         atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3303                         bio->bi_next = biolist;
3304                         biolist = bio;
3305                         bio->bi_private = r10_bio;
3306                         bio->bi_end_io = end_sync_write;
3307                         bio->bi_rw = WRITE;
3308                         bio->bi_sector = sector +
3309                                 conf->mirrors[d].replacement->data_offset;
3310                         bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3311                         count++;
3312                 }
3313
3314                 if (count < 2) {
3315                         for (i=0; i<conf->copies; i++) {
3316                                 int d = r10_bio->devs[i].devnum;
3317                                 if (r10_bio->devs[i].bio->bi_end_io)
3318                                         rdev_dec_pending(conf->mirrors[d].rdev,
3319                                                          mddev);
3320                                 if (r10_bio->devs[i].repl_bio &&
3321                                     r10_bio->devs[i].repl_bio->bi_end_io)
3322                                         rdev_dec_pending(
3323                                                 conf->mirrors[d].replacement,
3324                                                 mddev);
3325                         }
3326                         put_buf(r10_bio);
3327                         biolist = NULL;
3328                         goto giveup;
3329                 }
3330         }
3331
3332         for (bio = biolist; bio ; bio=bio->bi_next) {
3333
3334                 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3335                 if (bio->bi_end_io)
3336                         bio->bi_flags |= 1 << BIO_UPTODATE;
3337                 bio->bi_vcnt = 0;
3338                 bio->bi_idx = 0;
3339                 bio->bi_phys_segments = 0;
3340                 bio->bi_size = 0;
3341         }
3342
3343         nr_sectors = 0;
3344         if (sector_nr + max_sync < max_sector)
3345                 max_sector = sector_nr + max_sync;
3346         do {
3347                 struct page *page;
3348                 int len = PAGE_SIZE;
3349                 if (sector_nr + (len>>9) > max_sector)
3350                         len = (max_sector - sector_nr) << 9;
3351                 if (len == 0)
3352                         break;
3353                 for (bio= biolist ; bio ; bio=bio->bi_next) {
3354                         struct bio *bio2;
3355                         page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3356                         if (bio_add_page(bio, page, len, 0))
3357                                 continue;
3358
3359                         /* stop here */
3360                         bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3361                         for (bio2 = biolist;
3362                              bio2 && bio2 != bio;
3363                              bio2 = bio2->bi_next) {
3364                                 /* remove last page from this bio */
3365                                 bio2->bi_vcnt--;
3366                                 bio2->bi_size -= len;
3367                                 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3368                         }
3369                         goto bio_full;
3370                 }
3371                 nr_sectors += len>>9;
3372                 sector_nr += len>>9;
3373         } while (biolist->bi_vcnt < RESYNC_PAGES);
3374  bio_full:
3375         r10_bio->sectors = nr_sectors;
3376
3377         while (biolist) {
3378                 bio = biolist;
3379                 biolist = biolist->bi_next;
3380
3381                 bio->bi_next = NULL;
3382                 r10_bio = bio->bi_private;
3383                 r10_bio->sectors = nr_sectors;
3384
3385                 if (bio->bi_end_io == end_sync_read) {
3386                         md_sync_acct(bio->bi_bdev, nr_sectors);
3387                         generic_make_request(bio);
3388                 }
3389         }
3390
3391         if (sectors_skipped)
3392                 /* pretend they weren't skipped, it makes
3393                  * no important difference in this case
3394                  */
3395                 md_done_sync(mddev, sectors_skipped, 1);
3396
3397         return sectors_skipped + nr_sectors;
3398  giveup:
3399         /* There is nowhere to write, so all non-sync
3400          * drives must be failed or in resync, all drives
3401          * have a bad block, so try the next chunk...
3402          */
3403         if (sector_nr + max_sync < max_sector)
3404                 max_sector = sector_nr + max_sync;
3405
3406         sectors_skipped += (max_sector - sector_nr);
3407         chunks_skipped ++;
3408         sector_nr = max_sector;
3409         goto skipped;
3410 }
3411
3412 static sector_t
3413 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3414 {
3415         sector_t size;
3416         struct r10conf *conf = mddev->private;
3417
3418         if (!raid_disks)
3419                 raid_disks = min(conf->geo.raid_disks,
3420                                  conf->prev.raid_disks);
3421         if (!sectors)
3422                 sectors = conf->dev_sectors;
3423
3424         size = sectors >> conf->geo.chunk_shift;
3425         sector_div(size, conf->geo.far_copies);
3426         size = size * raid_disks;
3427         sector_div(size, conf->geo.near_copies);
3428
3429         return size << conf->geo.chunk_shift;
3430 }
3431
3432 static void calc_sectors(struct r10conf *conf, sector_t size)
3433 {
3434         /* Calculate the number of sectors-per-device that will
3435          * actually be used, and set conf->dev_sectors and
3436          * conf->stride
3437          */
3438
3439         size = size >> conf->geo.chunk_shift;
3440         sector_div(size, conf->geo.far_copies);
3441         size = size * conf->geo.raid_disks;
3442         sector_div(size, conf->geo.near_copies);
3443         /* 'size' is now the number of chunks in the array */
3444         /* calculate "used chunks per device" */
3445         size = size * conf->copies;
3446
3447         /* We need to round up when dividing by raid_disks to
3448          * get the stride size.
3449          */
3450         size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3451
3452         conf->dev_sectors = size << conf->geo.chunk_shift;
3453
3454         if (conf->geo.far_offset)
3455                 conf->geo.stride = 1 << conf->geo.chunk_shift;
3456         else {
3457                 sector_div(size, conf->geo.far_copies);
3458                 conf->geo.stride = size << conf->geo.chunk_shift;
3459         }
3460 }
3461
3462 enum geo_type {geo_new, geo_old, geo_start};
3463 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3464 {
3465         int nc, fc, fo;
3466         int layout, chunk, disks;
3467         switch (new) {
3468         case geo_old:
3469                 layout = mddev->layout;
3470                 chunk = mddev->chunk_sectors;
3471                 disks = mddev->raid_disks - mddev->delta_disks;
3472                 break;
3473         case geo_new:
3474                 layout = mddev->new_layout;
3475                 chunk = mddev->new_chunk_sectors;
3476                 disks = mddev->raid_disks;
3477                 break;
3478         default: /* avoid 'may be unused' warnings */
3479         case geo_start: /* new when starting reshape - raid_disks not
3480                          * updated yet. */
3481                 layout = mddev->new_layout;
3482                 chunk = mddev->new_chunk_sectors;
3483                 disks = mddev->raid_disks + mddev->delta_disks;
3484                 break;
3485         }
3486         if (layout >> 18)
3487                 return -1;
3488         if (chunk < (PAGE_SIZE >> 9) ||
3489             !is_power_of_2(chunk))
3490                 return -2;
3491         nc = layout & 255;
3492         fc = (layout >> 8) & 255;
3493         fo = layout & (1<<16);
3494         geo->raid_disks = disks;
3495         geo->near_copies = nc;
3496         geo->far_copies = fc;
3497         geo->far_offset = fo;
3498         geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3499         geo->chunk_mask = chunk - 1;
3500         geo->chunk_shift = ffz(~chunk);
3501         return nc*fc;
3502 }
3503
3504 static struct r10conf *setup_conf(struct mddev *mddev)
3505 {
3506         struct r10conf *conf = NULL;
3507         int err = -EINVAL;
3508         struct geom geo;
3509         int copies;
3510
3511         copies = setup_geo(&geo, mddev, geo_new);
3512
3513         if (copies == -2) {
3514                 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3515                        "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3516                        mdname(mddev), PAGE_SIZE);
3517                 goto out;
3518         }
3519
3520         if (copies < 2 || copies > mddev->raid_disks) {
3521                 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3522                        mdname(mddev), mddev->new_layout);
3523                 goto out;
3524         }
3525
3526         err = -ENOMEM;
3527         conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3528         if (!conf)
3529                 goto out;
3530
3531         /* FIXME calc properly */
3532         conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3533                                                             max(0,mddev->delta_disks)),
3534                                 GFP_KERNEL);
3535         if (!conf->mirrors)
3536                 goto out;
3537
3538         conf->tmppage = alloc_page(GFP_KERNEL);
3539         if (!conf->tmppage)
3540                 goto out;
3541
3542         conf->geo = geo;
3543         conf->copies = copies;
3544         conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3545                                            r10bio_pool_free, conf);
3546         if (!conf->r10bio_pool)
3547                 goto out;
3548
3549         calc_sectors(conf, mddev->dev_sectors);
3550         if (mddev->reshape_position == MaxSector) {
3551                 conf->prev = conf->geo;
3552                 conf->reshape_progress = MaxSector;
3553         } else {
3554                 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3555                         err = -EINVAL;
3556                         goto out;
3557                 }
3558                 conf->reshape_progress = mddev->reshape_position;
3559                 if (conf->prev.far_offset)
3560                         conf->prev.stride = 1 << conf->prev.chunk_shift;
3561                 else
3562                         /* far_copies must be 1 */
3563                         conf->prev.stride = conf->dev_sectors;
3564         }
3565         spin_lock_init(&conf->device_lock);
3566         INIT_LIST_HEAD(&conf->retry_list);
3567
3568         spin_lock_init(&conf->resync_lock);
3569         init_waitqueue_head(&conf->wait_barrier);
3570
3571         conf->thread = md_register_thread(raid10d, mddev, "raid10");
3572         if (!conf->thread)
3573                 goto out;
3574
3575         conf->mddev = mddev;
3576         return conf;
3577
3578  out:
3579         if (err == -ENOMEM)
3580                 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3581                        mdname(mddev));
3582         if (conf) {
3583                 if (conf->r10bio_pool)
3584                         mempool_destroy(conf->r10bio_pool);
3585                 kfree(conf->mirrors);
3586                 safe_put_page(conf->tmppage);
3587                 kfree(conf);
3588         }
3589         return ERR_PTR(err);
3590 }
3591
3592 static int run(struct mddev *mddev)
3593 {
3594         struct r10conf *conf;
3595         int i, disk_idx, chunk_size;
3596         struct raid10_info *disk;
3597         struct md_rdev *rdev;
3598         sector_t size;
3599         sector_t min_offset_diff = 0;
3600         int first = 1;
3601         bool discard_supported = false;
3602
3603         if (mddev->private == NULL) {
3604                 conf = setup_conf(mddev);
3605                 if (IS_ERR(conf))
3606                         return PTR_ERR(conf);
3607                 mddev->private = conf;
3608         }
3609         conf = mddev->private;
3610         if (!conf)
3611                 goto out;
3612
3613         mddev->thread = conf->thread;
3614         conf->thread = NULL;
3615
3616         chunk_size = mddev->chunk_sectors << 9;
3617         if (mddev->queue) {
3618                 blk_queue_max_discard_sectors(mddev->queue,
3619                                               mddev->chunk_sectors);
3620                 blk_queue_max_write_same_sectors(mddev->queue,
3621                                                  mddev->chunk_sectors);
3622                 blk_queue_io_min(mddev->queue, chunk_size);
3623                 if (conf->geo.raid_disks % conf->geo.near_copies)
3624                         blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3625                 else
3626                         blk_queue_io_opt(mddev->queue, chunk_size *
3627                                          (conf->geo.raid_disks / conf->geo.near_copies));
3628         }
3629
3630         rdev_for_each(rdev, mddev) {
3631                 long long diff;
3632                 struct request_queue *q;
3633
3634                 disk_idx = rdev->raid_disk;
3635                 if (disk_idx < 0)
3636                         continue;
3637                 if (disk_idx >= conf->geo.raid_disks &&
3638                     disk_idx >= conf->prev.raid_disks)
3639                         continue;
3640                 disk = conf->mirrors + disk_idx;
3641
3642                 if (test_bit(Replacement, &rdev->flags)) {
3643                         if (disk->replacement)
3644                                 goto out_free_conf;
3645                         disk->replacement = rdev;
3646                 } else {
3647                         if (disk->rdev)
3648                                 goto out_free_conf;
3649                         disk->rdev = rdev;
3650                 }
3651                 q = bdev_get_queue(rdev->bdev);
3652                 if (q->merge_bvec_fn)
3653                         mddev->merge_check_needed = 1;
3654                 diff = (rdev->new_data_offset - rdev->data_offset);
3655                 if (!mddev->reshape_backwards)
3656                         diff = -diff;
3657                 if (diff < 0)
3658                         diff = 0;
3659                 if (first || diff < min_offset_diff)
3660                         min_offset_diff = diff;
3661
3662                 if (mddev->gendisk)
3663                         disk_stack_limits(mddev->gendisk, rdev->bdev,
3664                                           rdev->data_offset << 9);
3665
3666                 disk->head_position = 0;
3667
3668                 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3669                         discard_supported = true;
3670         }
3671
3672         if (mddev->queue) {
3673                 if (discard_supported)
3674                         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3675                                                 mddev->queue);
3676                 else
3677                         queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3678                                                   mddev->queue);
3679         }
3680         /* need to check that every block has at least one working mirror */
3681         if (!enough(conf, -1)) {
3682                 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3683                        mdname(mddev));
3684                 goto out_free_conf;
3685         }
3686
3687         if (conf->reshape_progress != MaxSector) {
3688                 /* must ensure that shape change is supported */
3689                 if (conf->geo.far_copies != 1 &&
3690                     conf->geo.far_offset == 0)
3691                         goto out_free_conf;
3692                 if (conf->prev.far_copies != 1 &&
3693                     conf->geo.far_offset == 0)
3694                         goto out_free_conf;
3695         }
3696
3697         mddev->degraded = 0;
3698         for (i = 0;
3699              i < conf->geo.raid_disks
3700                      || i < conf->prev.raid_disks;
3701              i++) {
3702
3703                 disk = conf->mirrors + i;
3704
3705                 if (!disk->rdev && disk->replacement) {
3706                         /* The replacement is all we have - use it */
3707                         disk->rdev = disk->replacement;
3708                         disk->replacement = NULL;
3709                         clear_bit(Replacement, &disk->rdev->flags);
3710                 }
3711
3712                 if (!disk->rdev ||
3713                     !test_bit(In_sync, &disk->rdev->flags)) {
3714                         disk->head_position = 0;
3715                         mddev->degraded++;
3716                         if (disk->rdev)
3717                                 conf->fullsync = 1;
3718                 }
3719                 disk->recovery_disabled = mddev->recovery_disabled - 1;
3720         }
3721
3722         if (mddev->recovery_cp != MaxSector)
3723                 printk(KERN_NOTICE "md/raid10:%s: not clean"
3724                        " -- starting background reconstruction\n",
3725                        mdname(mddev));
3726         printk(KERN_INFO
3727                 "md/raid10:%s: active with %d out of %d devices\n",
3728                 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3729                 conf->geo.raid_disks);
3730         /*
3731          * Ok, everything is just fine now
3732          */
3733         mddev->dev_sectors = conf->dev_sectors;
3734         size = raid10_size(mddev, 0, 0);
3735         md_set_array_sectors(mddev, size);
3736         mddev->resync_max_sectors = size;
3737
3738         if (mddev->queue) {
3739                 int stripe = conf->geo.raid_disks *
3740                         ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3741                 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3742                 mddev->queue->backing_dev_info.congested_data = mddev;
3743
3744                 /* Calculate max read-ahead size.
3745                  * We need to readahead at least twice a whole stripe....
3746                  * maybe...
3747                  */
3748                 stripe /= conf->geo.near_copies;
3749                 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3750                         mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3751                 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3752         }
3753
3754
3755         if (md_integrity_register(mddev))
3756                 goto out_free_conf;
3757
3758         if (conf->reshape_progress != MaxSector) {
3759                 unsigned long before_length, after_length;
3760
3761                 before_length = ((1 << conf->prev.chunk_shift) *
3762                                  conf->prev.far_copies);
3763                 after_length = ((1 << conf->geo.chunk_shift) *
3764                                 conf->geo.far_copies);
3765
3766                 if (max(before_length, after_length) > min_offset_diff) {
3767                         /* This cannot work */
3768                         printk("md/raid10: offset difference not enough to continue reshape\n");
3769                         goto out_free_conf;
3770                 }
3771                 conf->offset_diff = min_offset_diff;
3772
3773                 conf->reshape_safe = conf->reshape_progress;
3774                 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3775                 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3776                 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3777                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3778                 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3779                                                         "reshape");
3780         }
3781
3782         return 0;
3783
3784 out_free_conf:
3785         md_unregister_thread(&mddev->thread);
3786         if (conf->r10bio_pool)
3787                 mempool_destroy(conf->r10bio_pool);
3788         safe_put_page(conf->tmppage);
3789         kfree(conf->mirrors);
3790         kfree(conf);
3791         mddev->private = NULL;
3792 out:
3793         return -EIO;
3794 }
3795
3796 static int stop(struct mddev *mddev)
3797 {
3798         struct r10conf *conf = mddev->private;
3799
3800         raise_barrier(conf, 0);
3801         lower_barrier(conf);
3802
3803         md_unregister_thread(&mddev->thread);
3804         if (mddev->queue)
3805                 /* the unplug fn references 'conf'*/
3806                 blk_sync_queue(mddev->queue);
3807
3808         if (conf->r10bio_pool)
3809                 mempool_destroy(conf->r10bio_pool);
3810         kfree(conf->mirrors);
3811         kfree(conf);
3812         mddev->private = NULL;
3813         return 0;
3814 }
3815
3816 static void raid10_quiesce(struct mddev *mddev, int state)
3817 {
3818         struct r10conf *conf = mddev->private;
3819
3820         switch(state) {
3821         case 1:
3822                 raise_barrier(conf, 0);
3823                 break;
3824         case 0:
3825                 lower_barrier(conf);
3826                 break;
3827         }
3828 }
3829
3830 static int raid10_resize(struct mddev *mddev, sector_t sectors)
3831 {
3832         /* Resize of 'far' arrays is not supported.
3833          * For 'near' and 'offset' arrays we can set the
3834          * number of sectors used to be an appropriate multiple
3835          * of the chunk size.
3836          * For 'offset', this is far_copies*chunksize.
3837          * For 'near' the multiplier is the LCM of
3838          * near_copies and raid_disks.
3839          * So if far_copies > 1 && !far_offset, fail.
3840          * Else find LCM(raid_disks, near_copy)*far_copies and
3841          * multiply by chunk_size.  Then round to this number.
3842          * This is mostly done by raid10_size()
3843          */
3844         struct r10conf *conf = mddev->private;
3845         sector_t oldsize, size;
3846
3847         if (mddev->reshape_position != MaxSector)
3848                 return -EBUSY;
3849
3850         if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3851                 return -EINVAL;
3852
3853         oldsize = raid10_size(mddev, 0, 0);
3854         size = raid10_size(mddev, sectors, 0);
3855         if (mddev->external_size &&
3856             mddev->array_sectors > size)
3857                 return -EINVAL;
3858         if (mddev->bitmap) {
3859                 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3860                 if (ret)
3861                         return ret;
3862         }
3863         md_set_array_sectors(mddev, size);
3864         set_capacity(mddev->gendisk, mddev->array_sectors);
3865         revalidate_disk(mddev->gendisk);
3866         if (sectors > mddev->dev_sectors &&
3867             mddev->recovery_cp > oldsize) {
3868                 mddev->recovery_cp = oldsize;
3869                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3870         }
3871         calc_sectors(conf, sectors);
3872         mddev->dev_sectors = conf->dev_sectors;
3873         mddev->resync_max_sectors = size;
3874         return 0;
3875 }
3876
3877 static void *raid10_takeover_raid0(struct mddev *mddev)
3878 {
3879         struct md_rdev *rdev;
3880         struct r10conf *conf;
3881
3882         if (mddev->degraded > 0) {
3883                 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3884                        mdname(mddev));
3885                 return ERR_PTR(-EINVAL);
3886         }
3887
3888         /* Set new parameters */
3889         mddev->new_level = 10;
3890         /* new layout: far_copies = 1, near_copies = 2 */
3891         mddev->new_layout = (1<<8) + 2;
3892         mddev->new_chunk_sectors = mddev->chunk_sectors;
3893         mddev->delta_disks = mddev->raid_disks;
3894         mddev->raid_disks *= 2;
3895         /* make sure it will be not marked as dirty */
3896         mddev->recovery_cp = MaxSector;
3897
3898         conf = setup_conf(mddev);
3899         if (!IS_ERR(conf)) {
3900                 rdev_for_each(rdev, mddev)
3901                         if (rdev->raid_disk >= 0)
3902                                 rdev->new_raid_disk = rdev->raid_disk * 2;
3903                 conf->barrier = 1;
3904         }
3905
3906         return conf;
3907 }
3908
3909 static void *raid10_takeover(struct mddev *mddev)
3910 {
3911         struct r0conf *raid0_conf;
3912
3913         /* raid10 can take over:
3914          *  raid0 - providing it has only two drives
3915          */
3916         if (mddev->level == 0) {
3917                 /* for raid0 takeover only one zone is supported */
3918                 raid0_conf = mddev->private;
3919                 if (raid0_conf->nr_strip_zones > 1) {
3920                         printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3921                                " with more than one zone.\n",
3922                                mdname(mddev));
3923                         return ERR_PTR(-EINVAL);
3924                 }
3925                 return raid10_takeover_raid0(mddev);
3926         }
3927         return ERR_PTR(-EINVAL);
3928 }
3929
3930 static int raid10_check_reshape(struct mddev *mddev)
3931 {
3932         /* Called when there is a request to change
3933          * - layout (to ->new_layout)
3934          * - chunk size (to ->new_chunk_sectors)
3935          * - raid_disks (by delta_disks)
3936          * or when trying to restart a reshape that was ongoing.
3937          *
3938          * We need to validate the request and possibly allocate
3939          * space if that might be an issue later.
3940          *
3941          * Currently we reject any reshape of a 'far' mode array,
3942          * allow chunk size to change if new is generally acceptable,
3943          * allow raid_disks to increase, and allow
3944          * a switch between 'near' mode and 'offset' mode.
3945          */
3946         struct r10conf *conf = mddev->private;
3947         struct geom geo;
3948
3949         if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3950                 return -EINVAL;
3951
3952         if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3953                 /* mustn't change number of copies */
3954                 return -EINVAL;
3955         if (geo.far_copies > 1 && !geo.far_offset)
3956                 /* Cannot switch to 'far' mode */
3957                 return -EINVAL;
3958
3959         if (mddev->array_sectors & geo.chunk_mask)
3960                         /* not factor of array size */
3961                         return -EINVAL;
3962
3963         if (!enough(conf, -1))
3964                 return -EINVAL;
3965
3966         kfree(conf->mirrors_new);
3967         conf->mirrors_new = NULL;
3968         if (mddev->delta_disks > 0) {
3969                 /* allocate new 'mirrors' list */
3970                 conf->mirrors_new = kzalloc(
3971                         sizeof(struct raid10_info)
3972                         *(mddev->raid_disks +
3973                           mddev->delta_disks),
3974                         GFP_KERNEL);
3975                 if (!conf->mirrors_new)
3976                         return -ENOMEM;
3977         }
3978         return 0;
3979 }
3980
3981 /*
3982  * Need to check if array has failed when deciding whether to:
3983  *  - start an array
3984  *  - remove non-faulty devices
3985  *  - add a spare
3986  *  - allow a reshape
3987  * This determination is simple when no reshape is happening.
3988  * However if there is a reshape, we need to carefully check
3989  * both the before and after sections.
3990  * This is because some failed devices may only affect one
3991  * of the two sections, and some non-in_sync devices may
3992  * be insync in the section most affected by failed devices.
3993  */
3994 static int calc_degraded(struct r10conf *conf)
3995 {
3996         int degraded, degraded2;
3997         int i;
3998
3999         rcu_read_lock();
4000         degraded = 0;
4001         /* 'prev' section first */
4002         for (i = 0; i < conf->prev.raid_disks; i++) {
4003                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4004                 if (!rdev || test_bit(Faulty, &rdev->flags))
4005                         degraded++;
4006                 else if (!test_bit(In_sync, &rdev->flags))
4007                         /* When we can reduce the number of devices in
4008                          * an array, this might not contribute to
4009                          * 'degraded'.  It does now.
4010                          */
4011                         degraded++;
4012         }
4013         rcu_read_unlock();
4014         if (conf->geo.raid_disks == conf->prev.raid_disks)
4015                 return degraded;
4016         rcu_read_lock();
4017         degraded2 = 0;
4018         for (i = 0; i < conf->geo.raid_disks; i++) {
4019                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4020                 if (!rdev || test_bit(Faulty, &rdev->flags))
4021                         degraded2++;
4022                 else if (!test_bit(In_sync, &rdev->flags)) {
4023                         /* If reshape is increasing the number of devices,
4024                          * this section has already been recovered, so
4025                          * it doesn't contribute to degraded.
4026                          * else it does.
4027                          */
4028                         if (conf->geo.raid_disks <= conf->prev.raid_disks)
4029                                 degraded2++;
4030                 }
4031         }
4032         rcu_read_unlock();
4033         if (degraded2 > degraded)
4034                 return degraded2;
4035         return degraded;
4036 }
4037
4038 static int raid10_start_reshape(struct mddev *mddev)
4039 {
4040         /* A 'reshape' has been requested. This commits
4041          * the various 'new' fields and sets MD_RECOVER_RESHAPE
4042          * This also checks if there are enough spares and adds them
4043          * to the array.
4044          * We currently require enough spares to make the final
4045          * array non-degraded.  We also require that the difference
4046          * between old and new data_offset - on each device - is
4047          * enough that we never risk over-writing.
4048          */
4049
4050         unsigned long before_length, after_length;
4051         sector_t min_offset_diff = 0;
4052         int first = 1;
4053         struct geom new;
4054         struct r10conf *conf = mddev->private;
4055         struct md_rdev *rdev;
4056         int spares = 0;
4057         int ret;
4058
4059         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4060                 return -EBUSY;
4061
4062         if (setup_geo(&new, mddev, geo_start) != conf->copies)
4063                 return -EINVAL;
4064
4065         before_length = ((1 << conf->prev.chunk_shift) *
4066                          conf->prev.far_copies);
4067         after_length = ((1 << conf->geo.chunk_shift) *
4068                         conf->geo.far_copies);
4069
4070         rdev_for_each(rdev, mddev) {
4071                 if (!test_bit(In_sync, &rdev->flags)
4072                     && !test_bit(Faulty, &rdev->flags))
4073                         spares++;
4074                 if (rdev->raid_disk >= 0) {
4075                         long long diff = (rdev->new_data_offset
4076                                           - rdev->data_offset);
4077                         if (!mddev->reshape_backwards)
4078                                 diff = -diff;
4079                         if (diff < 0)
4080                                 diff = 0;
4081                         if (first || diff < min_offset_diff)
4082                                 min_offset_diff = diff;
4083                 }
4084         }
4085
4086         if (max(before_length, after_length) > min_offset_diff)
4087                 return -EINVAL;
4088
4089         if (spares < mddev->delta_disks)
4090                 return -EINVAL;
4091
4092         conf->offset_diff = min_offset_diff;
4093         spin_lock_irq(&conf->device_lock);
4094         if (conf->mirrors_new) {
4095                 memcpy(conf->mirrors_new, conf->mirrors,
4096                        sizeof(struct raid10_info)*conf->prev.raid_disks);
4097                 smp_mb();
4098                 kfree(conf->mirrors_old); /* FIXME and elsewhere */
4099                 conf->mirrors_old = conf->mirrors;
4100                 conf->mirrors = conf->mirrors_new;
4101                 conf->mirrors_new = NULL;
4102         }
4103         setup_geo(&conf->geo, mddev, geo_start);
4104         smp_mb();
4105         if (mddev->reshape_backwards) {
4106                 sector_t size = raid10_size(mddev, 0, 0);
4107                 if (size < mddev->array_sectors) {
4108                         spin_unlock_irq(&conf->device_lock);
4109                         printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
4110                                mdname(mddev));
4111                         return -EINVAL;
4112                 }
4113                 mddev->resync_max_sectors = size;
4114                 conf->reshape_progress = size;
4115         } else
4116                 conf->reshape_progress = 0;
4117         spin_unlock_irq(&conf->device_lock);
4118
4119         if (mddev->delta_disks && mddev->bitmap) {
4120                 ret = bitmap_resize(mddev->bitmap,
4121                                     raid10_size(mddev, 0,
4122                                                 conf->geo.raid_disks),
4123                                     0, 0);
4124                 if (ret)
4125                         goto abort;
4126         }
4127         if (mddev->delta_disks > 0) {
4128                 rdev_for_each(rdev, mddev)
4129                         if (rdev->raid_disk < 0 &&
4130                             !test_bit(Faulty, &rdev->flags)) {
4131                                 if (raid10_add_disk(mddev, rdev) == 0) {
4132                                         if (rdev->raid_disk >=
4133                                             conf->prev.raid_disks)
4134                                                 set_bit(In_sync, &rdev->flags);
4135                                         else
4136                                                 rdev->recovery_offset = 0;
4137
4138                                         if (sysfs_link_rdev(mddev, rdev))
4139                                                 /* Failure here  is OK */;
4140                                 }
4141                         } else if (rdev->raid_disk >= conf->prev.raid_disks
4142                                    && !test_bit(Faulty, &rdev->flags)) {
4143                                 /* This is a spare that was manually added */
4144                                 set_bit(In_sync, &rdev->flags);
4145                         }
4146         }
4147         /* When a reshape changes the number of devices,
4148          * ->degraded is measured against the larger of the
4149          * pre and  post numbers.
4150          */
4151         spin_lock_irq(&conf->device_lock);
4152         mddev->degraded = calc_degraded(conf);
4153         spin_unlock_irq(&conf->device_lock);
4154         mddev->raid_disks = conf->geo.raid_disks;
4155         mddev->reshape_position = conf->reshape_progress;
4156         set_bit(MD_CHANGE_DEVS, &mddev->flags);
4157
4158         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4159         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4160         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4161         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4162
4163         mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4164                                                 "reshape");
4165         if (!mddev->sync_thread) {
4166                 ret = -EAGAIN;
4167                 goto abort;
4168         }
4169         conf->reshape_checkpoint = jiffies;
4170         md_wakeup_thread(mddev->sync_thread);
4171         md_new_event(mddev);
4172         return 0;
4173
4174 abort:
4175         mddev->recovery = 0;
4176         spin_lock_irq(&conf->device_lock);
4177         conf->geo = conf->prev;
4178         mddev->raid_disks = conf->geo.raid_disks;
4179         rdev_for_each(rdev, mddev)
4180                 rdev->new_data_offset = rdev->data_offset;
4181         smp_wmb();
4182         conf->reshape_progress = MaxSector;
4183         mddev->reshape_position = MaxSector;
4184         spin_unlock_irq(&conf->device_lock);
4185         return ret;
4186 }
4187
4188 /* Calculate the last device-address that could contain
4189  * any block from the chunk that includes the array-address 's'
4190  * and report the next address.
4191  * i.e. the address returned will be chunk-aligned and after
4192  * any data that is in the chunk containing 's'.
4193  */
4194 static sector_t last_dev_address(sector_t s, struct geom *geo)
4195 {
4196         s = (s | geo->chunk_mask) + 1;
4197         s >>= geo->chunk_shift;
4198         s *= geo->near_copies;
4199         s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4200         s *= geo->far_copies;
4201         s <<= geo->chunk_shift;
4202         return s;
4203 }
4204
4205 /* Calculate the first device-address that could contain
4206  * any block from the chunk that includes the array-address 's'.
4207  * This too will be the start of a chunk
4208  */
4209 static sector_t first_dev_address(sector_t s, struct geom *geo)
4210 {
4211         s >>= geo->chunk_shift;
4212         s *= geo->near_copies;
4213         sector_div(s, geo->raid_disks);
4214         s *= geo->far_copies;
4215         s <<= geo->chunk_shift;
4216         return s;
4217 }
4218
4219 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4220                                 int *skipped)
4221 {
4222         /* We simply copy at most one chunk (smallest of old and new)
4223          * at a time, possibly less if that exceeds RESYNC_PAGES,
4224          * or we hit a bad block or something.
4225          * This might mean we pause for normal IO in the middle of
4226          * a chunk, but that is not a problem was mddev->reshape_position
4227          * can record any location.
4228          *
4229          * If we will want to write to a location that isn't
4230          * yet recorded as 'safe' (i.e. in metadata on disk) then
4231          * we need to flush all reshape requests and update the metadata.
4232          *
4233          * When reshaping forwards (e.g. to more devices), we interpret
4234          * 'safe' as the earliest block which might not have been copied
4235          * down yet.  We divide this by previous stripe size and multiply
4236          * by previous stripe length to get lowest device offset that we
4237          * cannot write to yet.
4238          * We interpret 'sector_nr' as an address that we want to write to.
4239          * From this we use last_device_address() to find where we might
4240          * write to, and first_device_address on the  'safe' position.
4241          * If this 'next' write position is after the 'safe' position,
4242          * we must update the metadata to increase the 'safe' position.
4243          *
4244          * When reshaping backwards, we round in the opposite direction
4245          * and perform the reverse test:  next write position must not be
4246          * less than current safe position.
4247          *
4248          * In all this the minimum difference in data offsets
4249          * (conf->offset_diff - always positive) allows a bit of slack,
4250          * so next can be after 'safe', but not by more than offset_disk
4251          *
4252          * We need to prepare all the bios here before we start any IO
4253          * to ensure the size we choose is acceptable to all devices.
4254          * The means one for each copy for write-out and an extra one for
4255          * read-in.
4256          * We store the read-in bio in ->master_bio and the others in
4257          * ->devs[x].bio and ->devs[x].repl_bio.
4258          */
4259         struct r10conf *conf = mddev->private;
4260         struct r10bio *r10_bio;
4261         sector_t next, safe, last;
4262         int max_sectors;
4263         int nr_sectors;
4264         int s;
4265         struct md_rdev *rdev;
4266         int need_flush = 0;
4267         struct bio *blist;
4268         struct bio *bio, *read_bio;
4269         int sectors_done = 0;
4270
4271         if (sector_nr == 0) {
4272                 /* If restarting in the middle, skip the initial sectors */
4273                 if (mddev->reshape_backwards &&
4274                     conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4275                         sector_nr = (raid10_size(mddev, 0, 0)
4276                                      - conf->reshape_progress);
4277                 } else if (!mddev->reshape_backwards &&
4278                            conf->reshape_progress > 0)
4279                         sector_nr = conf->reshape_progress;
4280                 if (sector_nr) {
4281                         mddev->curr_resync_completed = sector_nr;
4282                         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4283                         *skipped = 1;
4284                         return sector_nr;
4285                 }
4286         }
4287
4288         /* We don't use sector_nr to track where we are up to
4289          * as that doesn't work well for ->reshape_backwards.
4290          * So just use ->reshape_progress.
4291          */
4292         if (mddev->reshape_backwards) {
4293                 /* 'next' is the earliest device address that we might
4294                  * write to for this chunk in the new layout
4295                  */
4296                 next = first_dev_address(conf->reshape_progress - 1,
4297                                          &conf->geo);
4298
4299                 /* 'safe' is the last device address that we might read from
4300                  * in the old layout after a restart
4301                  */
4302                 safe = last_dev_address(conf->reshape_safe - 1,
4303                                         &conf->prev);
4304
4305                 if (next + conf->offset_diff < safe)
4306                         need_flush = 1;
4307
4308                 last = conf->reshape_progress - 1;
4309                 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4310                                                & conf->prev.chunk_mask);
4311                 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4312                         sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4313         } else {
4314                 /* 'next' is after the last device address that we
4315                  * might write to for this chunk in the new layout
4316                  */
4317                 next = last_dev_address(conf->reshape_progress, &conf->geo);
4318
4319                 /* 'safe' is the earliest device address that we might
4320                  * read from in the old layout after a restart
4321                  */
4322                 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4323
4324                 /* Need to update metadata if 'next' might be beyond 'safe'
4325                  * as that would possibly corrupt data
4326                  */
4327                 if (next > safe + conf->offset_diff)
4328                         need_flush = 1;
4329
4330                 sector_nr = conf->reshape_progress;
4331                 last  = sector_nr | (conf->geo.chunk_mask
4332                                      & conf->prev.chunk_mask);
4333
4334                 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4335                         last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4336         }
4337
4338         if (need_flush ||
4339             time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4340                 /* Need to update reshape_position in metadata */
4341                 wait_barrier(conf);
4342                 mddev->reshape_position = conf->reshape_progress;
4343                 if (mddev->reshape_backwards)
4344                         mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4345                                 - conf->reshape_progress;
4346                 else
4347                         mddev->curr_resync_completed = conf->reshape_progress;
4348                 conf->reshape_checkpoint = jiffies;
4349                 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4350                 md_wakeup_thread(mddev->thread);
4351                 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4352                            kthread_should_stop());
4353                 conf->reshape_safe = mddev->reshape_position;
4354                 allow_barrier(conf);
4355         }
4356
4357 read_more:
4358         /* Now schedule reads for blocks from sector_nr to last */
4359         r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4360         raise_barrier(conf, sectors_done != 0);
4361         atomic_set(&r10_bio->remaining, 0);
4362         r10_bio->mddev = mddev;
4363         r10_bio->sector = sector_nr;
4364         set_bit(R10BIO_IsReshape, &r10_bio->state);
4365         r10_bio->sectors = last - sector_nr + 1;
4366         rdev = read_balance(conf, r10_bio, &max_sectors);
4367         BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4368
4369         if (!rdev) {
4370                 /* Cannot read from here, so need to record bad blocks
4371                  * on all the target devices.
4372                  */
4373                 // FIXME
4374                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4375                 return sectors_done;
4376         }
4377
4378         read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4379
4380         read_bio->bi_bdev = rdev->bdev;
4381         read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4382                                + rdev->data_offset);
4383         read_bio->bi_private = r10_bio;
4384         read_bio->bi_end_io = end_sync_read;
4385         read_bio->bi_rw = READ;
4386         read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4387         read_bio->bi_flags |= 1 << BIO_UPTODATE;
4388         read_bio->bi_vcnt = 0;
4389         read_bio->bi_size = 0;
4390         r10_bio->master_bio = read_bio;
4391         r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4392
4393         /* Now find the locations in the new layout */
4394         __raid10_find_phys(&conf->geo, r10_bio);
4395
4396         blist = read_bio;
4397         read_bio->bi_next = NULL;
4398
4399         for (s = 0; s < conf->copies*2; s++) {
4400                 struct bio *b;
4401                 int d = r10_bio->devs[s/2].devnum;
4402                 struct md_rdev *rdev2;
4403                 if (s&1) {
4404                         rdev2 = conf->mirrors[d].replacement;
4405                         b = r10_bio->devs[s/2].repl_bio;
4406                 } else {
4407                         rdev2 = conf->mirrors[d].rdev;
4408                         b = r10_bio->devs[s/2].bio;
4409                 }
4410                 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4411                         continue;
4412                 b->bi_bdev = rdev2->bdev;
4413                 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4414                 b->bi_private = r10_bio;
4415                 b->bi_end_io = end_reshape_write;
4416                 b->bi_rw = WRITE;
4417                 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4418                 b->bi_flags |= 1 << BIO_UPTODATE;
4419                 b->bi_next = blist;
4420                 b->bi_vcnt = 0;
4421                 b->bi_idx = 0;
4422                 b->bi_size = 0;
4423                 blist = b;
4424         }
4425
4426         /* Now add as many pages as possible to all of these bios. */
4427
4428         nr_sectors = 0;
4429         for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4430                 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4431                 int len = (max_sectors - s) << 9;
4432                 if (len > PAGE_SIZE)
4433                         len = PAGE_SIZE;
4434                 for (bio = blist; bio ; bio = bio->bi_next) {
4435                         struct bio *bio2;
4436                         if (bio_add_page(bio, page, len, 0))
4437                                 continue;
4438
4439                         /* Didn't fit, must stop */
4440                         for (bio2 = blist;
4441                              bio2 && bio2 != bio;
4442                              bio2 = bio2->bi_next) {
4443                                 /* Remove last page from this bio */
4444                                 bio2->bi_vcnt--;
4445                                 bio2->bi_size -= len;
4446                                 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4447                         }
4448                         goto bio_full;
4449                 }
4450                 sector_nr += len >> 9;
4451                 nr_sectors += len >> 9;
4452         }
4453 bio_full:
4454         r10_bio->sectors = nr_sectors;
4455
4456         /* Now submit the read */
4457         md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4458         atomic_inc(&r10_bio->remaining);
4459         read_bio->bi_next = NULL;
4460         generic_make_request(read_bio);
4461         sector_nr += nr_sectors;
4462         sectors_done += nr_sectors;
4463         if (sector_nr <= last)
4464                 goto read_more;
4465
4466         /* Now that we have done the whole section we can
4467          * update reshape_progress
4468          */
4469         if (mddev->reshape_backwards)
4470                 conf->reshape_progress -= sectors_done;
4471         else
4472                 conf->reshape_progress += sectors_done;
4473
4474         return sectors_done;
4475 }
4476
4477 static void end_reshape_request(struct r10bio *r10_bio);
4478 static int handle_reshape_read_error(struct mddev *mddev,
4479                                      struct r10bio *r10_bio);
4480 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4481 {
4482         /* Reshape read completed.  Hopefully we have a block
4483          * to write out.
4484          * If we got a read error then we do sync 1-page reads from
4485          * elsewhere until we find the data - or give up.
4486          */
4487         struct r10conf *conf = mddev->private;
4488         int s;
4489
4490         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4491                 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4492                         /* Reshape has been aborted */
4493                         md_done_sync(mddev, r10_bio->sectors, 0);
4494                         return;
4495                 }
4496
4497         /* We definitely have the data in the pages, schedule the
4498          * writes.
4499          */
4500         atomic_set(&r10_bio->remaining, 1);
4501         for (s = 0; s < conf->copies*2; s++) {
4502                 struct bio *b;
4503                 int d = r10_bio->devs[s/2].devnum;
4504                 struct md_rdev *rdev;
4505                 if (s&1) {
4506                         rdev = conf->mirrors[d].replacement;
4507                         b = r10_bio->devs[s/2].repl_bio;
4508                 } else {
4509                         rdev = conf->mirrors[d].rdev;
4510                         b = r10_bio->devs[s/2].bio;
4511                 }
4512                 if (!rdev || test_bit(Faulty, &rdev->flags))
4513                         continue;
4514                 atomic_inc(&rdev->nr_pending);
4515                 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4516                 atomic_inc(&r10_bio->remaining);
4517                 b->bi_next = NULL;
4518                 generic_make_request(b);
4519         }
4520         end_reshape_request(r10_bio);
4521 }
4522
4523 static void end_reshape(struct r10conf *conf)
4524 {
4525         if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4526                 return;
4527
4528         spin_lock_irq(&conf->device_lock);
4529         conf->prev = conf->geo;
4530         md_finish_reshape(conf->mddev);
4531         smp_wmb();
4532         conf->reshape_progress = MaxSector;
4533         spin_unlock_irq(&conf->device_lock);
4534
4535         /* read-ahead size must cover two whole stripes, which is
4536          * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4537          */
4538         if (conf->mddev->queue) {
4539                 int stripe = conf->geo.raid_disks *
4540                         ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4541                 stripe /= conf->geo.near_copies;
4542                 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4543                         conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4544         }
4545         conf->fullsync = 0;
4546 }
4547
4548
4549 static int handle_reshape_read_error(struct mddev *mddev,
4550                                      struct r10bio *r10_bio)
4551 {
4552         /* Use sync reads to get the blocks from somewhere else */
4553         int sectors = r10_bio->sectors;
4554         struct r10conf *conf = mddev->private;
4555         struct {
4556                 struct r10bio r10_bio;
4557                 struct r10dev devs[conf->copies];
4558         } on_stack;
4559         struct r10bio *r10b = &on_stack.r10_bio;
4560         int slot = 0;
4561         int idx = 0;
4562         struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4563
4564         r10b->sector = r10_bio->sector;
4565         __raid10_find_phys(&conf->prev, r10b);
4566
4567         while (sectors) {
4568                 int s = sectors;
4569                 int success = 0;
4570                 int first_slot = slot;
4571
4572                 if (s > (PAGE_SIZE >> 9))
4573                         s = PAGE_SIZE >> 9;
4574
4575                 while (!success) {
4576                         int d = r10b->devs[slot].devnum;
4577                         struct md_rdev *rdev = conf->mirrors[d].rdev;
4578                         sector_t addr;
4579                         if (rdev == NULL ||
4580                             test_bit(Faulty, &rdev->flags) ||
4581                             !test_bit(In_sync, &rdev->flags))
4582                                 goto failed;
4583
4584                         addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4585                         success = sync_page_io(rdev,
4586                                                addr,
4587                                                s << 9,
4588                                                bvec[idx].bv_page,
4589                                                READ, false);
4590                         if (success)
4591                                 break;
4592                 failed:
4593                         slot++;
4594                         if (slot >= conf->copies)
4595                                 slot = 0;
4596                         if (slot == first_slot)
4597                                 break;
4598                 }
4599                 if (!success) {
4600                         /* couldn't read this block, must give up */
4601                         set_bit(MD_RECOVERY_INTR,
4602                                 &mddev->recovery);
4603                         return -EIO;
4604                 }
4605                 sectors -= s;
4606                 idx++;
4607         }
4608         return 0;
4609 }
4610
4611 static void end_reshape_write(struct bio *bio, int error)
4612 {
4613         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4614         struct r10bio *r10_bio = bio->bi_private;
4615         struct mddev *mddev = r10_bio->mddev;
4616         struct r10conf *conf = mddev->private;
4617         int d;
4618         int slot;
4619         int repl;
4620         struct md_rdev *rdev = NULL;
4621
4622         d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4623         if (repl)
4624                 rdev = conf->mirrors[d].replacement;
4625         if (!rdev) {
4626                 smp_mb();
4627                 rdev = conf->mirrors[d].rdev;
4628         }
4629
4630         if (!uptodate) {
4631                 /* FIXME should record badblock */
4632                 md_error(mddev, rdev);
4633         }
4634
4635         rdev_dec_pending(rdev, mddev);
4636         end_reshape_request(r10_bio);
4637 }
4638
4639 static void end_reshape_request(struct r10bio *r10_bio)
4640 {
4641         if (!atomic_dec_and_test(&r10_bio->remaining))
4642                 return;
4643         md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4644         bio_put(r10_bio->master_bio);
4645         put_buf(r10_bio);
4646 }
4647
4648 static void raid10_finish_reshape(struct mddev *mddev)
4649 {
4650         struct r10conf *conf = mddev->private;
4651
4652         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4653                 return;
4654
4655         if (mddev->delta_disks > 0) {
4656                 sector_t size = raid10_size(mddev, 0, 0);
4657                 md_set_array_sectors(mddev, size);
4658                 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4659                         mddev->recovery_cp = mddev->resync_max_sectors;
4660                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4661                 }
4662                 mddev->resync_max_sectors = size;
4663                 set_capacity(mddev->gendisk, mddev->array_sectors);
4664                 revalidate_disk(mddev->gendisk);
4665         } else {
4666                 int d;
4667                 for (d = conf->geo.raid_disks ;
4668                      d < conf->geo.raid_disks - mddev->delta_disks;
4669                      d++) {
4670                         struct md_rdev *rdev = conf->mirrors[d].rdev;
4671                         if (rdev)
4672                                 clear_bit(In_sync, &rdev->flags);
4673                         rdev = conf->mirrors[d].replacement;
4674                         if (rdev)
4675                                 clear_bit(In_sync, &rdev->flags);
4676                 }
4677         }
4678         mddev->layout = mddev->new_layout;
4679         mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4680         mddev->reshape_position = MaxSector;
4681         mddev->delta_disks = 0;
4682         mddev->reshape_backwards = 0;
4683 }
4684
4685 static struct md_personality raid10_personality =
4686 {
4687         .name           = "raid10",
4688         .level          = 10,
4689         .owner          = THIS_MODULE,
4690         .make_request   = make_request,
4691         .run            = run,
4692         .stop           = stop,
4693         .status         = status,
4694         .error_handler  = error,
4695         .hot_add_disk   = raid10_add_disk,
4696         .hot_remove_disk= raid10_remove_disk,
4697         .spare_active   = raid10_spare_active,
4698         .sync_request   = sync_request,
4699         .quiesce        = raid10_quiesce,
4700         .size           = raid10_size,
4701         .resize         = raid10_resize,
4702         .takeover       = raid10_takeover,
4703         .check_reshape  = raid10_check_reshape,
4704         .start_reshape  = raid10_start_reshape,
4705         .finish_reshape = raid10_finish_reshape,
4706 };
4707
4708 static int __init raid_init(void)
4709 {
4710         return register_md_personality(&raid10_personality);
4711 }
4712
4713 static void raid_exit(void)
4714 {
4715         unregister_md_personality(&raid10_personality);
4716 }
4717
4718 module_init(raid_init);
4719 module_exit(raid_exit);
4720 MODULE_LICENSE("GPL");
4721 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4722 MODULE_ALIAS("md-personality-9"); /* RAID10 */
4723 MODULE_ALIAS("md-raid10");
4724 MODULE_ALIAS("md-level-10");
4725
4726 module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);