Merge branch 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block
author Linus Torvalds <[email protected]>
Sat, 23 Oct 2010 00:07:18 +0000 (17:07 -0700)
committer Linus Torvalds <[email protected]>
Sat, 23 Oct 2010 00:07:18 +0000 (17:07 -0700)
* 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block: (46 commits)
  xen-blkfront: disable barrier/flush write support
  Added blk-lib.c and blk-barrier.c was renamed to blk-flush.c
  block: remove BLKDEV_IFL_WAIT
  aic7xxx_old: removed unused 'req' variable
  block: remove the BH_Eopnotsupp flag
  block: remove the BLKDEV_IFL_BARRIER flag
  block: remove the WRITE_BARRIER flag
  swap: do not send discards as barriers
  fat: do not send discards as barriers
  ext4: do not send discards as barriers
  jbd2: replace barriers with explicit flush / FUA usage
  jbd2: Modify ASYNC_COMMIT code to not rely on queue draining on barrier
  jbd: replace barriers with explicit flush / FUA usage
  nilfs2: replace barriers with explicit flush / FUA usage
  reiserfs: replace barriers with explicit flush / FUA usage
  gfs2: replace barriers with explicit flush / FUA usage
  btrfs: replace barriers with explicit flush / FUA usage
  xfs: replace barriers with explicit flush / FUA usage
  block: pass gfp_mask and flags to sb_issue_discard
  dm: convey that all flushes are processed as empty
  ...

24 files changed:
block/Makefile
block/blk-core.c
block/blk-settings.c
block/blk.h
block/ioctl.c
drivers/block/brd.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_receiver.c
drivers/block/loop.c
drivers/block/pktcdvd.c
drivers/block/ps3disk.c
drivers/block/virtio_blk.c
drivers/block/xen-blkfront.c
drivers/md/dm-snap.c
drivers/md/dm.c
drivers/md/md.c
drivers/s390/block/dasd.c
fs/gfs2/rgrp.c
fs/jbd/commit.c
fs/jbd2/commit.c
fs/nilfs2/super.c
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/fs.h
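
The series above replaces the old barrier machinery (REQ_HARDBARRIER and the QUEUE_ORDERED_* drain states) with explicit cache flush / FUA semantics. A rough sketch of the resulting convention, not taken from any file in this merge: a caller either tags its write with REQ_FLUSH/REQ_FUA, or requests a standalone flush with blkdev_issue_flush(), whose BLKDEV_IFL_WAIT flag argument is dropped by this series so the call simply waits.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/*
 * Sketch only: issue a standalone, synchronous cache flush using the
 * post-series calling convention (see the drbd hunks below); the flags
 * argument that carried BLKDEV_IFL_WAIT no longer exists.
 */
static int example_flush_volatile_cache(struct block_device *bdev)
{
        return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
}

/*
 * Sketch only: a write that used to rely on a barrier is now tagged
 * explicitly.  REQ_FLUSH flushes the device cache before the write,
 * REQ_FUA forces the written data itself to stable storage.
 */
static void example_flush_fua_write(struct bio *bio)
{
        submit_bio(WRITE | REQ_FLUSH | REQ_FUA, bio);
}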

diff --combined block/Makefile
index c850d5ef80a22eccd057d76c0d8acab6f5f4d953,f627e4b1a9da3e04f9da21119980f1a12ef08e42..0fec4b3fab511bc065261121f279e6f64038c35c
@@@ -3,13 -3,12 +3,13 @@@
  #
  
  obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
-                       blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
+                       blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
                        blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
  
  obj-$(CONFIG_BLK_DEV_BSG)     += bsg.o
  obj-$(CONFIG_BLK_CGROUP)      += blk-cgroup.o
 +obj-$(CONFIG_BLK_DEV_THROTTLING)      += blk-throttle.o
  obj-$(CONFIG_IOSCHED_NOOP)    += noop-iosched.o
  obj-$(CONFIG_IOSCHED_DEADLINE)        += deadline-iosched.o
  obj-$(CONFIG_IOSCHED_CFQ)     += cfq-iosched.o
diff --combined block/blk-core.c
index 500eb859886e7ec989b2b1dc731eccc8d58dc85b,a840523e3b409d74c075ddc8cb8c5e5a0383dede..45141469e89eba5f24c9594afdfa5218e5688f46
@@@ -64,15 -64,13 +64,15 @@@ static void drive_stat_acct(struct requ
                return;
  
        cpu = part_stat_lock();
 -      part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
  
 -      if (!new_io)
 +      if (!new_io) {
 +              part = rq->part;
                part_stat_inc(cpu, part, merges[rw]);
 -      else {
 +      } else {
 +              part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
                part_round_stats(cpu, part);
                part_inc_in_flight(part, rw);
 +              rq->part = part;
        }
  
        part_stat_unlock();
@@@ -130,7 -128,6 +130,7 @@@ void blk_rq_init(struct request_queue *
        rq->ref_count = 1;
        rq->start_time = jiffies;
        set_start_time_ns(rq);
 +      rq->part = NULL;
  }
  EXPORT_SYMBOL(blk_rq_init);
  
@@@ -139,7 -136,7 +139,7 @@@ static void req_bio_endio(struct reques
  {
        struct request_queue *q = rq->q;
  
-       if (&q->bar_rq != rq) {
+       if (&q->flush_rq != rq) {
                if (error)
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
                else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                if (bio->bi_size == 0)
                        bio_endio(bio, error);
        } else {
                /*
-                * Okay, this is the barrier request in progress, just
-                * record the error;
+                * Okay, this is the sequenced flush request in
+                * progress, just record the error;
                 */
-               if (error && !q->orderr)
-                       q->orderr = error;
+               if (error && !q->flush_err)
+                       q->flush_err = error;
        }
  }
  
@@@ -385,7 -381,6 +384,7 @@@ void blk_sync_queue(struct request_queu
        del_timer_sync(&q->unplug_timer);
        del_timer_sync(&q->timeout);
        cancel_work_sync(&q->unplug_work);
 +      throtl_shutdown_timer_wq(q);
  }
  EXPORT_SYMBOL(blk_sync_queue);
  
@@@ -463,8 -458,6 +462,8 @@@ void blk_cleanup_queue(struct request_q
        if (q->elevator)
                elevator_exit(q->elevator);
  
 +      blk_throtl_exit(q);
 +
        blk_put_queue(q);
  }
  EXPORT_SYMBOL(blk_cleanup_queue);
@@@ -521,16 -514,12 +520,17 @@@ struct request_queue *blk_alloc_queue_n
                return NULL;
        }
  
 +      if (blk_throtl_init(q)) {
 +              kmem_cache_free(blk_requestq_cachep, q);
 +              return NULL;
 +      }
 +
        setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                    laptop_mode_timer_fn, (unsigned long) q);
        init_timer(&q->unplug_timer);
        setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
        INIT_LIST_HEAD(&q->timeout_list);
+       INIT_LIST_HEAD(&q->pending_flushes);
        INIT_WORK(&q->unplug_work, blk_unplug_work);
  
        kobject_init(&q->kobj, &blk_queue_ktype);
@@@ -807,16 -796,11 +807,16 @@@ static struct request *get_request(stru
        rl->starved[is_sync] = 0;
  
        priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
 -      if (priv)
 +      if (priv) {
                rl->elvpriv++;
  
 -      if (blk_queue_io_stat(q))
 -              rw_flags |= REQ_IO_STAT;
 +              /*
 +               * Don't do stats for non-priv requests
 +               */
 +              if (blk_queue_io_stat(q))
 +                      rw_flags |= REQ_IO_STAT;
 +      }
 +
        spin_unlock_irq(q->queue_lock);
  
        rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@@ -1053,22 -1037,6 +1053,6 @@@ void blk_insert_request(struct request_
  }
  EXPORT_SYMBOL(blk_insert_request);
  
- /*
-  * add-request adds a request to the linked list.
-  * queue lock is held and interrupts disabled, as we muck with the
-  * request queue list.
-  */
- static inline void add_request(struct request_queue *q, struct request *req)
- {
-       drive_stat_acct(req, 1);
-       /*
-        * elevator indicated where it wants this request to be
-        * inserted at elevator_merge time
-        */
-       __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
- }
  static void part_round_stats_single(int cpu, struct hd_struct *part,
                                    unsigned long now)
  {
@@@ -1217,13 -1185,16 +1201,16 @@@ static int __make_request(struct reques
        const bool sync = !!(bio->bi_rw & REQ_SYNC);
        const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
        const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
+       int where = ELEVATOR_INSERT_SORT;
        int rw_flags;
  
-       if ((bio->bi_rw & REQ_HARDBARRIER) &&
-           (q->next_ordered == QUEUE_ORDERED_NONE)) {
+       /* REQ_HARDBARRIER is no more */
+       if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
+               "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
                bio_endio(bio, -EOPNOTSUPP);
                return 0;
        }
        /*
         * low level driver can indicate that it wants pages above a
         * certain limit bounced to low memory (ie for highmem, or even
  
        spin_lock_irq(q->queue_lock);
  
-       if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
+       if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+               where = ELEVATOR_INSERT_FRONT;
+               goto get_rq;
+       }
+       if (elv_queue_empty(q))
                goto get_rq;
  
        el_ret = elv_merge(q, &req, bio);
@@@ -1330,7 -1306,10 +1322,10 @@@ get_rq
                req->cpu = blk_cpu_to_group(smp_processor_id());
        if (queue_should_plug(q) && elv_queue_empty(q))
                blk_plug_device(q);
-       add_request(q, req);
+       /* insert the request into the elevator */
+       drive_stat_acct(req, 1);
+       __elv_add_request(q, req, where, 0);
  out:
        if (unplug || !queue_should_plug(q))
                __generic_unplug_device(q);
@@@ -1530,6 -1509,19 +1525,19 @@@ static inline void __generic_make_reque
                if (bio_check_eod(bio, nr_sectors))
                        goto end_io;
  
+               /*
+                * Filter flush bio's early so that make_request based
+                * drivers without flush support don't have to worry
+                * about them.
+                */
+               if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+                       bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+                       if (!nr_sectors) {
+                               err = 0;
+                               goto end_io;
+                       }
+               }
                if ((bio->bi_rw & REQ_DISCARD) &&
                    (!blk_queue_discard(q) ||
                     ((bio->bi_rw & REQ_SECURE) &&
                        goto end_io;
                }
  
 +              blk_throtl_bio(q, &bio);
 +
 +              /*
 +               * If bio = NULL, bio has been throttled and will be submitted
 +               * later.
 +               */
 +              if (!bio)
 +                      break;
 +
                trace_block_bio_queue(q, bio);
  
                ret = q->make_request_fn(q, bio);
@@@ -1637,12 -1620,11 +1645,12 @@@ void submit_bio(int rw, struct bio *bio
  
                if (unlikely(block_dump)) {
                        char b[BDEVNAME_SIZE];
 -                      printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
 +                      printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
                        current->comm, task_pid_nr(current),
                                (rw & WRITE) ? "WRITE" : "READ",
                                (unsigned long long)bio->bi_sector,
 -                              bdevname(bio->bi_bdev, b));
 +                              bdevname(bio->bi_bdev, b),
 +                              count);
                }
        }
  
@@@ -1785,7 -1767,7 +1793,7 @@@ static void blk_account_io_completion(s
                int cpu;
  
                cpu = part_stat_lock();
 -              part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 +              part = req->part;
                part_stat_add(cpu, part, sectors[rw], bytes >> 9);
                part_stat_unlock();
        }
  static void blk_account_io_done(struct request *req)
  {
        /*
-        * Account IO completion.  bar_rq isn't accounted as a normal
-        * IO on queueing nor completion.  Accounting the containing
-        * request is enough.
+        * Account IO completion.  flush_rq isn't accounted as a
+        * normal IO on queueing nor completion.  Accounting the
+        * containing request is enough.
         */
-       if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
+       if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
                unsigned long duration = jiffies - req->start_time;
                const int rw = rq_data_dir(req);
                struct hd_struct *part;
                int cpu;
  
                cpu = part_stat_lock();
 -              part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 +              part = req->part;
  
                part_stat_inc(cpu, part, ios[rw]);
                part_stat_add(cpu, part, ticks[rw], duration);
@@@ -2523,9 -2505,7 +2531,7 @@@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone)
  static void __blk_rq_prep_clone(struct request *dst, struct request *src)
  {
        dst->cpu = src->cpu;
-       dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
-       if (src->cmd_flags & REQ_DISCARD)
-               dst->cmd_flags |= REQ_DISCARD;
+       dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
        dst->cmd_type = src->cmd_type;
        dst->__sector = blk_rq_pos(src);
        dst->__data_len = blk_rq_bytes(src);
@@@ -2605,13 -2585,6 +2611,13 @@@ int kblockd_schedule_work(struct reques
  }
  EXPORT_SYMBOL(kblockd_schedule_work);
  
 +int kblockd_schedule_delayed_work(struct request_queue *q,
 +                      struct delayed_work *dwork, unsigned long delay)
 +{
 +      return queue_delayed_work(kblockd_workqueue, dwork, delay);
 +}
 +EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 +
  int __init blk_dev_init(void)
  {
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --combined block/blk-settings.c
index 315b88c8cbbbbc5e6cefd00361d988a4c0b58a8a,9b18afcfe9257145ed9ec7abeb1298929f854ec4..701859fb9647c31a505f0218800744e3e6d0a775
@@@ -111,7 -111,6 +111,7 @@@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy)
  void blk_set_default_limits(struct queue_limits *lim)
  {
        lim->max_segments = BLK_MAX_SEGMENTS;
 +      lim->max_integrity_segments = 0;
        lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
        lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
        lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@@ -214,7 -213,7 +214,7 @@@ void blk_queue_bounce_limit(struct requ
         */
        if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
                dma = 1;
 -      q->limits.bounce_pfn = max_low_pfn;
 +      q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
  #else
        if (b_pfn < blk_max_low_pfn)
                dma = 1;
@@@ -344,7 -343,7 +344,7 @@@ EXPORT_SYMBOL(blk_queue_logical_block_s
   *   hardware can operate on without reverting to read-modify-write
   *   operations.
   */
 -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
 +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
  {
        q->limits.physical_block_size = size;
  
@@@ -456,6 -455,11 +456,6 @@@ void blk_queue_io_opt(struct request_qu
  }
  EXPORT_SYMBOL(blk_queue_io_opt);
  
 -/*
 - * Returns the minimum that is _not_ zero, unless both are zero.
 - */
 -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 -
  /**
   * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
   * @t:        the stacking driver (top)
@@@ -510,8 -514,6 +510,8 @@@ int blk_stack_limits(struct queue_limit
                                            b->seg_boundary_mask);
  
        t->max_segments = min_not_zero(t->max_segments, b->max_segments);
 +      t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
 +                                               b->max_integrity_segments);
  
        t->max_segment_size = min_not_zero(t->max_segment_size,
                                           b->max_segment_size);
@@@ -792,6 -794,26 +792,26 @@@ void blk_queue_update_dma_alignment(str
  }
  EXPORT_SYMBOL(blk_queue_update_dma_alignment);
  
+ /**
+  * blk_queue_flush - configure queue's cache flush capability
+  * @q:                the request queue for the device
+  * @flush:    0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
+  *
+  * Tell block layer cache flush capability of @q.  If it supports
+  * flushing, REQ_FLUSH should be set.  If it supports bypassing
+  * write cache for individual writes, REQ_FUA should be set.
+  */
+ void blk_queue_flush(struct request_queue *q, unsigned int flush)
+ {
+       WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
+       if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
+               flush &= ~REQ_FUA;
+       q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
+ }
+ EXPORT_SYMBOL_GPL(blk_queue_flush);
  static int __init blk_settings_init(void)
  {
        blk_max_low_pfn = max_low_pfn - 1;
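
The blk_queue_flush() helper documented above is how a driver now advertises its cache behaviour, in place of the removed blk_queue_ordered() calls (see the brd.c hunk below). A minimal, hypothetical driver-side sketch; the function name and parameters are invented for illustration:

#include <linux/blkdev.h>

/*
 * Hypothetical example: tell the block layer what the device can do.
 * A device with a volatile write cache that also supports forced unit
 * access advertises both flags.
 */
static void example_setup_cache_flags(struct request_queue *q,
                                      bool volatile_cache, bool supports_fua)
{
        unsigned int flush = 0;

        if (volatile_cache) {
                flush |= REQ_FLUSH;
                if (supports_fua)
                        flush |= REQ_FUA;
        }
        /* REQ_FUA without REQ_FLUSH trips the WARN_ON_ONCE above. */
        blk_queue_flush(q, flush);
}

A driver with no volatile cache passes 0 (or never calls this at all); the filtering added to __generic_make_request() above then strips REQ_FLUSH/REQ_FUA from incoming bios when q->flush_flags is zero.
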
diff --combined block/blk.h
index f864012ec300d632d34c96ae5d0a2e991873cf8c,faf94f2acb12d0b6d72f8ba6e172c4fbc48ad831..1e675e5ade02615ee5eba4293a6f6b07cbf3c9c1
@@@ -51,6 -51,8 +51,8 @@@ static inline void blk_clear_rq_complet
   */
  #define ELV_ON_HASH(rq)               (!hlist_unhashed(&(rq)->hash))
  
+ struct request *blk_do_flush(struct request_queue *q, struct request *rq);
  static inline struct request *__elv_next_request(struct request_queue *q)
  {
        struct request *rq;
        while (1) {
                while (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
-                       if (blk_do_ordered(q, &rq))
+                       if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
+                           rq == &q->flush_rq)
+                               return rq;
+                       rq = blk_do_flush(q, rq);
+                       if (rq)
                                return rq;
                }
  
@@@ -110,6 -116,10 +116,6 @@@ void blk_queue_congestion_threshold(str
  
  int blk_dev_init(void);
  
 -void elv_quiesce_start(struct request_queue *q);
 -void elv_quiesce_end(struct request_queue *q);
 -
 -
  /*
   * Return the threshold (number of used requests) at which the queue is
   * considered to be congested.  It include a little hysteresis to keep the
@@@ -128,6 -138,14 +134,6 @@@ static inline int queue_congestion_off_
        return q->nr_congestion_off;
  }
  
 -#if defined(CONFIG_BLK_DEV_INTEGRITY)
 -
 -#define rq_for_each_integrity_segment(bvl, _rq, _iter)                \
 -      __rq_for_each_bio(_iter.bio, _rq)                       \
 -              bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
 -
 -#endif /* BLK_DEV_INTEGRITY */
 -
  static inline int blk_cpu_to_group(int cpu)
  {
        int group = NR_CPUS;
diff --combined block/ioctl.c
index 2c15fe0912c4c4287b0fbde0c08933d6ad27aa64,cb2b9099862be9c526972b3772235ec54b1fb4e2..d724ceb1d46535fee2fa3e65ca57cb09cb88997d
@@@ -62,7 -62,7 +62,7 @@@ static int blkpg_ioctl(struct block_dev
  
                        /* all seems OK */
                        part = add_partition(disk, partno, start, length,
 -                                           ADDPART_FLAG_NONE);
 +                                           ADDPART_FLAG_NONE, NULL);
                        mutex_unlock(&bdev->bd_mutex);
                        return IS_ERR(part) ? PTR_ERR(part) : 0;
                case BLKPG_DEL_PARTITION:
@@@ -116,7 -116,7 +116,7 @@@ static int blkdev_reread_part(struct bl
  static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
                             uint64_t len, int secure)
  {
-       unsigned long flags = BLKDEV_IFL_WAIT;
+       unsigned long flags = 0;
  
        if (start & 511)
                return -EINVAL;
        if (start + len > (bdev->bd_inode->i_size >> 9))
                return -EINVAL;
        if (secure)
-               flags |= BLKDEV_IFL_SECURE;
+               flags |= BLKDEV_DISCARD_SECURE;
        return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
  }
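
With the BLKDEV_IFL_* flags gone, waiting on discard completion is implicit and the only modifier left in this path is BLKDEV_DISCARD_SECURE. A hedged sketch of a caller under the new convention; the wrapper name is invented, and the range is given in 512-byte sectors as blkdev_issue_discard() expects:

#include <linux/blkdev.h>

/*
 * Illustrative only: discard a sector range, optionally as a secure
 * discard.  There is no wait flag any more; the call blocks until the
 * discard has been issued and completed.
 */
static int example_discard_sectors(struct block_device *bdev,
                                   sector_t sector, sector_t nr_sects,
                                   bool secure)
{
        unsigned long flags = secure ? BLKDEV_DISCARD_SECURE : 0;

        return blkdev_issue_discard(bdev, sector, nr_sects,
                                    GFP_KERNEL, flags);
}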
  
diff --combined drivers/block/brd.c
index 82bfd5bb4a973f316de41fb6fd05bd41226c322a,fa33f97722babc52ca0a9405100e5a8dfc899ca2..b7f51e4594f8660f0a54472d264d957dbe533312
@@@ -15,7 -15,7 +15,7 @@@
  #include <linux/blkdev.h>
  #include <linux/bio.h>
  #include <linux/highmem.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  #include <linux/radix-tree.h>
  #include <linux/buffer_head.h> /* invalidate_bh_lrus() */
  #include <linux/slab.h>
@@@ -55,7 -55,6 +55,7 @@@ struct brd_device 
  /*
   * Look up and return a brd's page for a given sector.
   */
 +static DEFINE_MUTEX(brd_mutex);
  static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
  {
        pgoff_t idx;
@@@ -403,7 -402,7 +403,7 @@@ static int brd_ioctl(struct block_devic
         * ram device BLKFLSBUF has special semantics, we want to actually
         * release and destroy the ramdisk data.
         */
 -      lock_kernel();
 +      mutex_lock(&brd_mutex);
        mutex_lock(&bdev->bd_mutex);
        error = -EBUSY;
        if (bdev->bd_openers <= 1) {
                error = 0;
        }
        mutex_unlock(&bdev->bd_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&brd_mutex);
  
        return error;
  }
@@@ -483,7 -482,6 +483,6 @@@ static struct brd_device *brd_alloc(in
        if (!brd->brd_queue)
                goto out_free_dev;
        blk_queue_make_request(brd->brd_queue, brd_make_request);
-       blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
        blk_queue_max_hw_sectors(brd->brd_queue, 1024);
        blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
  
diff --combined drivers/block/drbd/drbd_int.h
index c07c370c4c822d53614574b3256a2a922cb1e4c0,c2ef476f57119f87e58b0181539bd985de2b2f22..9bdcf4393c0aa9525c9c7918355cb9b985809a73
@@@ -337,25 -337,13 +337,25 @@@ static inline void bm_xfer_ctx_bit_to_w
   * NOTE that the payload starts at a long aligned offset,
   * regardless of 32 or 64 bit arch!
   */
 -struct p_header {
 +struct p_header80 {
        u32       magic;
        u16       command;
        u16       length;       /* bytes of data after this header */
        u8        payload[0];
  } __packed;
 -/* 8 bytes. packet FIXED for the next century! */
 +
 +/* Header for big packets, Used for data packets exceeding 64kB */
 +struct p_header95 {
 +      u16       magic;        /* use DRBD_MAGIC_BIG here */
 +      u16       command;
 +      u32       length;       /* Use only 24 bits of that. Ignore the highest 8 bit. */
 +      u8        payload[0];
 +} __packed;
 +
 +union p_header {
 +      struct p_header80 h80;
 +      struct p_header95 h95;
 +};
  
  /*
   * short commands, packets without payload, plain p_header:
   */
  
  /* these defines must not be changed without changing the protocol version */
 -#define DP_HARDBARRIER              1
 -#define DP_RW_SYNC          2
 +#define DP_HARDBARRIER              1 /* depricated */
 +#define DP_RW_SYNC          2 /* equals REQ_SYNC    */
  #define DP_MAY_SET_IN_SYNC    4
 +#define DP_UNPLUG             8 /* equals REQ_UNPLUG  */
 +#define DP_FUA               16 /* equals REQ_FUA     */
 +#define DP_FLUSH             32 /* equals REQ_FLUSH   */
 +#define DP_DISCARD           64 /* equals REQ_DISCARD */
  
  struct p_data {
 -      struct p_header head;
 +      union p_header head;
        u64         sector;    /* 64 bits sector number */
        u64         block_id;  /* to identify the request in protocol B&C */
        u32         seq_num;
   *   P_DATA_REQUEST, P_RS_DATA_REQUEST
   */
  struct p_block_ack {
 -      struct p_header head;
 +      struct p_header80 head;
        u64         sector;
        u64         block_id;
        u32         blksize;
  
  
  struct p_block_req {
 -      struct p_header head;
 +      struct p_header80 head;
        u64 sector;
        u64 block_id;
        u32 blksize;
   */
  
  struct p_handshake {
 -      struct p_header head;   /* 8 bytes */
 +      struct p_header80 head; /* 8 bytes */
        u32 protocol_min;
        u32 feature_flags;
        u32 protocol_max;
  /* 80 bytes, FIXED for the next century */
  
  struct p_barrier {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 barrier;    /* barrier number _handle_ only */
        u32 pad;        /* to multiple of 8 Byte */
  } __packed;
  
  struct p_barrier_ack {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 barrier;
        u32 set_size;
  } __packed;
  
  struct p_rs_param {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 rate;
  
              /* Since protocol version 88 and higher. */
  } __packed;
  
  struct p_rs_param_89 {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 rate;
          /* protocol version 89: */
        char verify_alg[SHARED_SECRET_MAX];
        char csums_alg[SHARED_SECRET_MAX];
  } __packed;
  
 +struct p_rs_param_95 {
 +      struct p_header80 head;
 +      u32 rate;
 +      char verify_alg[SHARED_SECRET_MAX];
 +      char csums_alg[SHARED_SECRET_MAX];
 +      u32 c_plan_ahead;
 +      u32 c_delay_target;
 +      u32 c_fill_target;
 +      u32 c_max_rate;
 +} __packed;
 +
  enum drbd_conn_flags {
        CF_WANT_LOSE = 1,
        CF_DRY_RUN = 2,
  };
  
  struct p_protocol {
 -      struct p_header head;
 +      struct p_header80 head;
        u32 protocol;
        u32 after_sb_0p;
        u32 after_sb_1p;
  } __packed;
  
  struct p_uuids {
 -      struct p_header head;
 +      struct p_header80 head;
        u64 uuid[UI_EXTENDED_SIZE];
  } __packed;
  
  struct p_rs_uuid {
 -      struct p_header head;
 +      struct p_header80 head;
        u64         uuid;
  } __packed;
  
  struct p_sizes {
 -      struct p_header head;
 +      struct p_header80 head;
        u64         d_size;  /* size of disk */
        u64         u_size;  /* user requested size */
        u64         c_size;  /* current exported size */
  } __packed;
  
  struct p_state {
 -      struct p_header head;
 +      struct p_header80 head;
        u32         state;
  } __packed;
  
  struct p_req_state {
 -      struct p_header head;
 +      struct p_header80 head;
        u32         mask;
        u32         val;
  } __packed;
  
  struct p_req_state_reply {
 -      struct p_header head;
 +      struct p_header80 head;
        u32         retcode;
  } __packed;
  
@@@ -544,7 -517,7 +544,7 @@@ struct p_drbd06_param 
  } __packed;
  
  struct p_discard {
 -      struct p_header head;
 +      struct p_header80 head;
        u64         block_id;
        u32         seq_num;
        u32         pad;
@@@ -560,7 -533,7 +560,7 @@@ enum drbd_bitmap_code 
  };
  
  struct p_compressed_bm {
 -      struct p_header head;
 +      struct p_header80 head;
        /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
         * (encoding & 0x80): polarity (set/unset) of first runlength
         * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
        u8 code[0];
  } __packed;
  
 -struct p_delay_probe {
 -      struct p_header head;
 -      u32     seq_num; /* sequence number to match the two probe packets */
 -      u32     offset;  /* usecs the probe got sent after the reference time point */
 +struct p_delay_probe93 {
 +      struct p_header80 head;
 +      u32     seq_num; /* sequence number to match the two probe packets */
 +      u32     offset;  /* usecs the probe got sent after the reference time point */
  } __packed;
  
  /* DCBP: Drbd Compressed Bitmap Packet ... */
@@@ -621,7 -594,7 +621,7 @@@ DCBP_set_pad_bits(struct p_compressed_b
   * so we need to use the fixed size 4KiB page size
   * most architechtures have used for a long time.
   */
 -#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
 +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
  #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
  #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
  #if (PAGE_SIZE < 4096)
  #endif
  
  union p_polymorph {
 -        struct p_header          header;
 +        union p_header           header;
          struct p_handshake       handshake;
          struct p_data            data;
          struct p_block_ack       block_ack;
          struct p_barrier         barrier;
          struct p_barrier_ack     barrier_ack;
          struct p_rs_param_89     rs_param_89;
 +        struct p_rs_param_95     rs_param_95;
          struct p_protocol        protocol;
          struct p_sizes           sizes;
          struct p_uuids           uuids;
          struct p_req_state       req_state;
          struct p_req_state_reply req_state_reply;
          struct p_block_req       block_req;
 +      struct p_delay_probe93   delay_probe93;
 +      struct p_rs_uuid         rs_uuid;
  } __packed;
  
  /**********************************************************************/
@@@ -727,7 -697,7 +727,7 @@@ struct drbd_tl_epoch 
        struct list_head requests; /* requests before */
        struct drbd_tl_epoch *next; /* pointer to the next barrier */
        unsigned int br_number;  /* the barriers identifier. */
 -      int n_req;      /* number of requests attached before this barrier */
 +      int n_writes;   /* number of requests attached before this barrier */
  };
  
  struct drbd_request;
@@@ -777,7 -747,7 +777,7 @@@ struct digest_info 
  struct drbd_epoch_entry {
        struct drbd_work w;
        struct hlist_node colision;
 -      struct drbd_epoch *epoch;
 +      struct drbd_epoch *epoch; /* for writes */
        struct drbd_conf *mdev;
        struct page *pages;
        atomic_t pending_bios;
        /* see comments on ee flag bits below */
        unsigned long flags;
        sector_t sector;
 -      u64 block_id;
 +      union {
 +              u64 block_id;
 +              struct digest_info *digest;
 +      };
  };
  
  /* ee flag bits.
@@@ -814,16 -781,12 +814,16 @@@ enum 
         * if any of those fail, we set this flag atomically
         * from the endio callback */
        __EE_WAS_ERROR,
 +
 +      /* This ee has a pointer to a digest instead of a block id */
 +      __EE_HAS_DIGEST,
  };
  #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
  #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
  #define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
  #define       EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
  #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 +#define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
  
  /* global flag bits */
  enum {
        SIGNAL_ASENDER,         /* whether asender wants to be interrupted */
        SEND_PING,              /* whether asender should send a ping asap */
  
 -      STOP_SYNC_TIMER,        /* tell timer to cancel itself */
        UNPLUG_QUEUED,          /* only relevant with kernel 2.4 */
        UNPLUG_REMOTE,          /* sending a "UnplugRemote" could help */
        MD_DIRTY,               /* current uuids and flags not yet on disk */
        BITMAP_IO,              /* suspend application io;
                                   once no more io in flight, start bitmap io */
        BITMAP_IO_QUEUED,       /* Started bitmap IO */
 +      GO_DISKLESS,            /* Disk failed, local_cnt reached zero, we are going diskless */
        RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
        NET_CONGESTED,          /* The data socket is congested */
  
                                 * the peer, if it changed there as well. */
        CONN_DRY_RUN,           /* Expect disconnect after resync handshake. */
        GOT_PING_ACK,           /* set when we receive a ping_ack packet, misc wait gets woken */
 +      NEW_CUR_UUID,           /* Create new current UUID when thawing IO */
 +      AL_SUSPENDED,           /* Activity logging is currently suspended. */
  };
  
  struct drbd_bitmap; /* opaque for drbd_conf */
  
  /* THINK maybe we actually want to use the default "event/%s" worker threads
   * or similar in linux 2.6, which uses per cpu data and threads.
 - *
 - * To be general, this might need a spin_lock member.
 - * For now, please use the mdev->req_lock to protect list_head,
 - * see drbd_queue_work below.
   */
  struct drbd_work_queue {
        struct list_head q;
@@@ -950,12 -915,6 +950,12 @@@ enum write_ordering_e 
        WO_bio_barrier
  };
  
 +struct fifo_buffer {
 +      int *values;
 +      unsigned int head_index;
 +      unsigned int size;
 +};
 +
  struct drbd_conf {
        /* things that are stored as / read from meta data on disk */
        unsigned long flags;
        unsigned int ko_count;
        struct drbd_work  resync_work,
                          unplug_work,
 +                        go_diskless,
                          md_sync_work;
        struct timer_list resync_timer;
        struct timer_list md_sync_timer;
 +#ifdef DRBD_DEBUG_MD_SYNC
 +      struct {
 +              unsigned int line;
 +              const char* func;
 +      } last_md_mark_dirty;
 +#endif
  
        /* Used after attach while negotiating new disk state. */
        union drbd_state new_state_tmp;
        union drbd_state state;
        wait_queue_head_t misc_wait;
        wait_queue_head_t state_wait;  /* upon each state change. */
 +      wait_queue_head_t net_cnt_wait;
        unsigned int send_cnt;
        unsigned int recv_cnt;
        unsigned int read_cnt;
        unsigned long rs_start;
        /* cumulated time in PausedSyncX state [unit jiffies] */
        unsigned long rs_paused;
 +      /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
 +      unsigned long rs_same_csum;
 +#define DRBD_SYNC_MARKS 8
 +#define DRBD_SYNC_MARK_STEP (3*HZ)
        /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
 -      unsigned long rs_mark_left;
 +      unsigned long rs_mark_left[DRBD_SYNC_MARKS];
        /* marks's time [unit jiffies] */
 -      unsigned long rs_mark_time;
 -      /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
 -      unsigned long rs_same_csum;
 +      unsigned long rs_mark_time[DRBD_SYNC_MARKS];
 +      /* current index into rs_mark_{left,time} */
 +      int rs_last_mark;
  
        /* where does the admin want us to start? (sector) */
        sector_t ov_start_sector;
        spinlock_t epoch_lock;
        unsigned int epochs;
        enum write_ordering_e write_ordering;
 -      struct list_head active_ee; /* IO in progress */
 -      struct list_head sync_ee;   /* IO in progress */
 +      struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
 +      struct list_head sync_ee;   /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
        struct list_head done_ee;   /* send ack */
 -      struct list_head read_ee;   /* IO in progress */
 +      struct list_head read_ee;   /* IO in progress (any read) */
        struct list_head net_ee;    /* zero-copy network send in progress */
        struct hlist_head *ee_hash; /* is proteced by req_lock! */
        unsigned int ee_hash_s;
        int next_barrier_nr;
        struct hlist_head *app_reads_hash; /* is proteced by req_lock */
        struct list_head resync_reads;
 -      atomic_t pp_in_use;
 +      atomic_t pp_in_use;             /* allocated from page pool */
 +      atomic_t pp_in_use_by_net;      /* sendpage()d, still referenced by tcp */
        wait_queue_head_t ee_wait;
        struct page *md_io_page;        /* one page buffer for md_io */
        struct page *md_io_tmpp;        /* for logical_block_size != 512 */
        u64 ed_uuid; /* UUID of the exposed data */
        struct mutex state_mutex;
        char congestion_reason;  /* Why we where congested... */
 +      atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
 +      atomic_t rs_sect_ev; /* for submitted resync data rate, both */
 +      int rs_last_sect_ev; /* counter to compare with */
 +      int rs_last_events;  /* counter of read or write "events" (unit sectors)
 +                            * on the lower level device when we last looked. */
 +      int c_sync_rate; /* current resync rate after syncer throttle magic */
 +      struct fifo_buffer rs_plan_s; /* correction values of resync planer */
 +      int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
 +      int rs_planed;    /* resync sectors already planed */
  };
  
  static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@@ -1201,8 -1138,6 +1201,8 @@@ extern void drbd_free_resources(struct 
  extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
                       unsigned int set_size);
  extern void tl_clear(struct drbd_conf *mdev);
 +enum drbd_req_event;
 +extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
  extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
  extern void drbd_free_sock(struct drbd_conf *mdev);
  extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
@@@ -1215,12 -1150,12 +1215,12 @@@ extern int drbd_send_sizes(struct drbd_
  extern int _drbd_send_state(struct drbd_conf *mdev);
  extern int drbd_send_state(struct drbd_conf *mdev);
  extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
 -                      enum drbd_packets cmd, struct p_header *h,
 +                      enum drbd_packets cmd, struct p_header80 *h,
                        size_t size, unsigned msg_flags);
  #define USE_DATA_SOCKET 1
  #define USE_META_SOCKET 0
  extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
 -                      enum drbd_packets cmd, struct p_header *h,
 +                      enum drbd_packets cmd, struct p_header80 *h,
                        size_t size);
  extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
                        char *data, size_t size);
@@@ -1232,7 -1167,7 +1232,7 @@@ extern int drbd_send_ack(struct drbd_co
  extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
                        struct p_block_req *rp);
  extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
 -                      struct p_data *dp);
 +                      struct p_data *dp, int data_size);
  extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
                            sector_t sector, int blksize, u64 block_id);
  extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
@@@ -1266,13 -1201,7 +1266,13 @@@ extern void drbd_uuid_set_bm(struct drb
  extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
  extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
  extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
 +#ifndef DRBD_DEBUG_MD_SYNC
  extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
 +#else
 +#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
 +extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
 +              unsigned int line, const char *func);
 +#endif
  extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
                                 int (*io_fn)(struct drbd_conf *),
                                 void (*done)(struct drbd_conf *, int),
  extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
  extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
  extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
 +extern void drbd_go_diskless(struct drbd_conf *mdev);
  
  
  /* Meta data layout
@@@ -1336,8 -1264,6 +1336,8 @@@ struct bm_extent 
   * Bit 1 ==> local node thinks this block needs to be synced.
   */
  
 +#define SLEEP_TIME (HZ/10)
 +
  #define BM_BLOCK_SHIFT  12                     /* 4k per bit */
  #define BM_BLOCK_SIZE  (1<<BM_BLOCK_SHIFT)
  /* (9+3) : 512 bytes @ 8 bits; representing 16M storage
  #endif
  
  /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
 - * With a value of 6 all IO in one 32K block make it to the same slot of the
 + * With a value of 8 all IO in one 128K block make it to the same slot of the
   * hash table. */
 -#define HT_SHIFT 6
 +#define HT_SHIFT 8
  #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
  
 +#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
 +
  /* Number of elements in the app_reads_hash */
  #define APP_R_HSIZE 15
  
@@@ -1445,7 -1369,6 +1445,7 @@@ extern unsigned long drbd_bm_find_next(
  /* bm_find_next variants for use while you hold drbd_bm_lock() */
  extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
  extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
 +extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
  extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
  extern int drbd_bm_rs_done(struct drbd_conf *mdev);
  /* for receive_bitmap */
@@@ -1498,8 -1421,7 +1498,8 @@@ extern void resync_after_online_grow(st
  extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
  extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
                int force);
 -enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
 +extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
 +extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
  extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
  
  /* drbd_worker.c */
@@@ -1545,12 -1467,10 +1545,12 @@@ extern int w_send_barrier(struct drbd_c
  extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
  extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
  extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
 +extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
  
  extern void resync_timer_fn(unsigned long data);
  
  /* drbd_receiver.c */
 +extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
  extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
                const unsigned rw, const int fault_type);
  extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@@ -1559,10 -1479,7 +1559,10 @@@ extern struct drbd_epoch_entry *drbd_al
                                            sector_t sector,
                                            unsigned int data_size,
                                            gfp_t gfp_mask) __must_hold(local);
 -extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
 +extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 +              int is_net);
 +#define drbd_free_ee(m,e)     drbd_free_some_ee(m, e, 0)
 +#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
  extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
                struct list_head *head);
  extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
  extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
  extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
  extern void drbd_flush_workqueue(struct drbd_conf *mdev);
 +extern void drbd_free_tl_hash(struct drbd_conf *mdev);
  
  /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
   * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
@@@ -1684,8 -1600,6 +1684,8 @@@ void drbd_bcast_ee(struct drbd_conf *md
  #define susp_MASK 1
  #define user_isp_MASK 1
  #define aftr_isp_MASK 1
 +#define susp_nod_MASK 1
 +#define susp_fen_MASK 1
  
  #define NS(T, S) \
        ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
@@@ -1941,6 -1855,13 +1941,6 @@@ static inline sector_t drbd_md_ss__(str
        }
  }
  
 -static inline void
 -_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
 -{
 -      list_add_tail(&w->list, &q->q);
 -      up(&q->s);
 -}
 -
  static inline void
  drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
  {
@@@ -1978,19 -1899,19 +1978,19 @@@ static inline void request_ping(struct 
  static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
        enum drbd_packets cmd)
  {
 -      struct p_header h;
 +      struct p_header80 h;
        return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
  }
  
  static inline int drbd_send_ping(struct drbd_conf *mdev)
  {
 -      struct p_header h;
 +      struct p_header80 h;
        return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
  }
  
  static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
  {
 -      struct p_header h;
 +      struct p_header80 h;
        return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
  }
  
@@@ -2092,7 -2013,7 +2092,7 @@@ static inline void inc_unacked(struct d
  static inline void put_net_conf(struct drbd_conf *mdev)
  {
        if (atomic_dec_and_test(&mdev->net_cnt))
 -              wake_up(&mdev->misc_wait);
 +              wake_up(&mdev->net_cnt_wait);
  }
  
  /**
@@@ -2123,14 -2044,10 +2123,14 @@@ static inline int get_net_conf(struct d
  
  static inline void put_ldev(struct drbd_conf *mdev)
  {
 +      int i = atomic_dec_return(&mdev->local_cnt);
        __release(local);
 -      if (atomic_dec_and_test(&mdev->local_cnt))
 +      D_ASSERT(i >= 0);
 +      if (i == 0) {
 +              if (mdev->state.disk == D_FAILED)
 +                      drbd_go_diskless(mdev);
                wake_up(&mdev->misc_wait);
 -      D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
 +      }
  }
  
  #ifndef __CHECKER__
@@@ -2262,16 -2179,11 +2262,16 @@@ static inline int drbd_state_is_stable(
        return 1;
  }
  
 +static inline int is_susp(union drbd_state s)
 +{
 +      return s.susp || s.susp_nod || s.susp_fen;
 +}
 +
  static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
  {
        int mxb = drbd_get_max_buffers(mdev);
  
 -      if (mdev->state.susp)
 +      if (is_susp(mdev->state))
                return 0;
        if (test_bit(SUSPEND_IO, &mdev->flags))
                return 0;
@@@ -2409,8 -2321,7 +2409,7 @@@ static inline void drbd_md_flush(struc
        if (test_bit(MD_NO_BARRIER, &mdev->flags))
                return;
  
-       r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
-                       BLKDEV_IFL_WAIT);
+       r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
        if (r) {
                set_bit(MD_NO_BARRIER, &mdev->flags);
                dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --combined drivers/block/drbd/drbd_receiver.c
index 760ae0df92516994c368b3a138d62c3f8a91893b,df15e7f0e7b766e0a031c71a8f3bf7275a7cfc47..efd6169acf2f04bf758c68ceee419549351c7e64
@@@ -241,7 -241,7 +241,7 @@@ static void drbd_kick_lo_and_reclaim_ne
        spin_unlock_irq(&mdev->req_lock);
  
        list_for_each_entry_safe(e, t, &reclaimed, w.list)
 -              drbd_free_ee(mdev, e);
 +              drbd_free_net_ee(mdev, e);
  }
  
  /**
@@@ -298,11 -298,9 +298,11 @@@ static struct page *drbd_pp_alloc(struc
   * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
   * Either links the page chain back to the global pool,
   * or returns all pages to the system. */
 -static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
 +static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
  {
 +      atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
        int i;
 +
        if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
                i = page_chain_free(page);
        else {
                drbd_pp_vacant += i;
                spin_unlock(&drbd_pp_lock);
        }
 -      atomic_sub(i, &mdev->pp_in_use);
 -      i = atomic_read(&mdev->pp_in_use);
 +      i = atomic_sub_return(i, a);
        if (i < 0)
 -              dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
 +              dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
 +                      is_net ? "pp_in_use_by_net" : "pp_in_use", i);
        wake_up(&drbd_pp_wait);
  }
  
@@@ -367,6 -365,7 +367,6 @@@ struct drbd_epoch_entry *drbd_alloc_ee(
        e->size = data_size;
        e->flags = 0;
        e->sector = sector;
 -      e->sector = sector;
        e->block_id = id;
  
        return e;
        return NULL;
  }
  
 -void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 +void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
  {
 -      drbd_pp_free(mdev, e->pages);
 +      if (e->flags & EE_HAS_DIGEST)
 +              kfree(e->digest);
 +      drbd_pp_free(mdev, e->pages, is_net);
        D_ASSERT(atomic_read(&e->pending_bios) == 0);
        D_ASSERT(hlist_unhashed(&e->colision));
        mempool_free(e, drbd_ee_mempool);
@@@ -391,14 -388,13 +391,14 @@@ int drbd_release_ee(struct drbd_conf *m
        LIST_HEAD(work_list);
        struct drbd_epoch_entry *e, *t;
        int count = 0;
 +      int is_net = list == &mdev->net_ee;
  
        spin_lock_irq(&mdev->req_lock);
        list_splice_init(list, &work_list);
        spin_unlock_irq(&mdev->req_lock);
  
        list_for_each_entry_safe(e, t, &work_list, w.list) {
 -              drbd_free_ee(mdev, e);
 +              drbd_free_some_ee(mdev, e, is_net);
                count++;
        }
        return count;
@@@ -427,7 -423,7 +427,7 @@@ static int drbd_process_done_ee(struct 
        spin_unlock_irq(&mdev->req_lock);
  
        list_for_each_entry_safe(e, t, &reclaimed, w.list)
 -              drbd_free_ee(mdev, e);
 +              drbd_free_net_ee(mdev, e);
  
        /* possible callbacks here:
         * e_end_block, and e_end_resync_block, e_send_discard_ack.
@@@ -723,14 -719,14 +723,14 @@@ out
  static int drbd_send_fp(struct drbd_conf *mdev,
        struct socket *sock, enum drbd_packets cmd)
  {
 -      struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
 +      struct p_header80 *h = &mdev->data.sbuf.header.h80;
  
        return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
  }
  
  static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
  {
 -      struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
 +      struct p_header80 *h = &mdev->data.rbuf.header.h80;
        int rr;
  
        rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
@@@ -780,6 -776,9 +780,6 @@@ static int drbd_connect(struct drbd_con
  
        D_ASSERT(!mdev->data.socket);
  
 -      if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
 -              dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
 -
        if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
                return -2;
  
@@@ -928,11 -927,6 +928,11 @@@ retry
  
        drbd_thread_start(&mdev->asender);
  
 +      if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
 +              drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
 +              put_ldev(mdev);
 +      }
 +
        if (!drbd_send_protocol(mdev))
                return -1;
        drbd_send_sync_param(mdev, &mdev->sync_conf);
@@@ -952,28 -946,22 +952,28 @@@ out_release_sockets
        return -1;
  }
  
 -static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
 +static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
  {
 +      union p_header *h = &mdev->data.rbuf.header;
        int r;
  
        r = drbd_recv(mdev, h, sizeof(*h));
 -
        if (unlikely(r != sizeof(*h))) {
                dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
                return FALSE;
 -      };
 -      h->command = be16_to_cpu(h->command);
 -      h->length  = be16_to_cpu(h->length);
 -      if (unlikely(h->magic != BE_DRBD_MAGIC)) {
 -              dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
 -                  (long)be32_to_cpu(h->magic),
 -                  h->command, h->length);
 +      }
 +
 +      if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
 +              *cmd = be16_to_cpu(h->h80.command);
 +              *packet_size = be16_to_cpu(h->h80.length);
 +      } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
 +              *cmd = be16_to_cpu(h->h95.command);
 +              *packet_size = be32_to_cpu(h->h95.length);
 +      } else {
 +              dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
 +                  be32_to_cpu(h->h80.magic),
 +                  be16_to_cpu(h->h80.command),
 +                  be16_to_cpu(h->h80.length));
                return FALSE;
        }
        mdev->last_received = jiffies;
@@@ -987,7 -975,7 +987,7 @@@ static enum finish_epoch drbd_flush_aft
  
        if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
                rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
-                                       NULL, BLKDEV_IFL_WAIT);
+                                       NULL);
                if (rv) {
                        dev_err(DEV, "local disk flush failed with status %d\n", rv);
                        /* would rather check on EOPNOTSUPP, but that is not reliable.
@@@ -1280,12 -1268,17 +1280,12 @@@ int w_e_reissue(struct drbd_conf *mdev
        return 1;
  }
  
 -static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        int rv, issue_flush;
 -      struct p_barrier *p = (struct p_barrier *)h;
 +      struct p_barrier *p = &mdev->data.rbuf.barrier;
        struct drbd_epoch *epoch;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -
 -      rv = drbd_recv(mdev, h->payload, h->length);
 -      ERR_IF(rv != h->length) return FALSE;
 -
        inc_unacked(mdev);
  
        if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
@@@ -1464,7 -1457,7 +1464,7 @@@ static int drbd_drain_block(struct drbd
                data_size -= rr;
        }
        kunmap(page);
 -      drbd_pp_free(mdev, page);
 +      drbd_pp_free(mdev, page, 0);
        return rv;
  }
  
@@@ -1569,29 -1562,30 +1569,29 @@@ static int recv_resync_read(struct drbd
        list_add(&e->w.list, &mdev->sync_ee);
        spin_unlock_irq(&mdev->req_lock);
  
 +      atomic_add(data_size >> 9, &mdev->rs_sect_ev);
        if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
                return TRUE;
  
 +      /* drbd_submit_ee currently fails for one reason only:
 +       * not being able to allocate enough bios.
 +       * Is dropping the connection going to help? */
 +      spin_lock_irq(&mdev->req_lock);
 +      list_del(&e->w.list);
 +      spin_unlock_irq(&mdev->req_lock);
 +
        drbd_free_ee(mdev, e);
  fail:
        put_ldev(mdev);
        return FALSE;
  }
  
 -static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        struct drbd_request *req;
        sector_t sector;
 -      unsigned int header_size, data_size;
        int ok;
 -      struct p_data *p = (struct p_data *)h;
 -
 -      header_size = sizeof(*p) - sizeof(*h);
 -      data_size   = h->length  - header_size;
 -
 -      ERR_IF(data_size == 0) return FALSE;
 -
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 -              return FALSE;
 +      struct p_data *p = &mdev->data.rbuf.data;
  
        sector = be64_to_cpu(p->sector);
  
        return ok;
  }
  
 -static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        sector_t sector;
 -      unsigned int header_size, data_size;
        int ok;
 -      struct p_data *p = (struct p_data *)h;
 -
 -      header_size = sizeof(*p) - sizeof(*h);
 -      data_size   = h->length  - header_size;
 -
 -      ERR_IF(data_size == 0) return FALSE;
 -
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 -              return FALSE;
 +      struct p_data *p = &mdev->data.rbuf.data;
  
        sector = be64_to_cpu(p->sector);
        D_ASSERT(p->block_id == ID_SYNCER);
  
                ok = drbd_drain_block(mdev, data_size);
  
 -              drbd_send_ack_dp(mdev, P_NEG_ACK, p);
 +              drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
        }
  
 +      atomic_add(data_size >> 9, &mdev->rs_sect_in);
 +
        return ok;
  }
  
@@@ -1764,27 -1765,24 +1764,27 @@@ static int drbd_wait_peer_seq(struct dr
        return ret;
  }
  
 +static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
 +{
 +      if (mdev->agreed_pro_version >= 95)
 +              return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
 +                      (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
 +                      (dpf & DP_FUA ? REQ_FUA : 0) |
 +                      (dpf & DP_FLUSH ? REQ_FUA : 0) |
 +                      (dpf & DP_DISCARD ? REQ_DISCARD : 0);
 +      else
 +              return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
 +}
 +
  /* mirrored write */
 -static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        sector_t sector;
        struct drbd_epoch_entry *e;
 -      struct p_data *p = (struct p_data *)h;
 -      int header_size, data_size;
 +      struct p_data *p = &mdev->data.rbuf.data;
        int rw = WRITE;
        u32 dp_flags;
  
 -      header_size = sizeof(*p) - sizeof(*h);
 -      data_size   = h->length  - header_size;
 -
 -      ERR_IF(data_size == 0) return FALSE;
 -
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 -              return FALSE;
 -
        if (!get_ldev(mdev)) {
                if (__ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "Can not write mirrored data block "
                        mdev->peer_seq++;
                spin_unlock(&mdev->peer_seq_lock);
  
 -              drbd_send_ack_dp(mdev, P_NEG_ACK, p);
 +              drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
                atomic_inc(&mdev->current_epoch->epoch_size);
                return drbd_drain_block(mdev, data_size);
        }
        spin_unlock(&mdev->epoch_lock);
  
        dp_flags = be32_to_cpu(p->dp_flags);
 -      if (dp_flags & DP_HARDBARRIER) {
 -              dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
 -              /* rw |= REQ_HARDBARRIER; */
 -      }
 -      if (dp_flags & DP_RW_SYNC)
 -              rw |= REQ_SYNC | REQ_UNPLUG;
 +      rw |= write_flags_to_bio(mdev, dp_flags);
 +
        if (dp_flags & DP_MAY_SET_IN_SYNC)
                e->flags |= EE_MAY_SET_IN_SYNC;
  
        if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
                return TRUE;
  
 +      /* drbd_submit_ee currently fails for one reason only:
 +       * not being able to allocate enough bios.
 +       * Is dropping the connection going to help? */
 +      spin_lock_irq(&mdev->req_lock);
 +      list_del(&e->w.list);
 +      hlist_del_init(&e->colision);
 +      spin_unlock_irq(&mdev->req_lock);
 +      if (e->flags & EE_CALL_AL_COMPLETE_IO)
 +              drbd_al_complete_io(mdev, e->sector);
 +
  out_interrupted:
        /* yes, the epoch_size now is imbalanced.
         * but we drop the connection anyways, so we don't have a chance to
        return FALSE;
  }
  
 -static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 +/* We may throttle resync, if the lower device seems to be busy,
 + * and current sync rate is above c_min_rate.
 + *
 + * To decide whether or not the lower device is busy, we use a scheme similar
 + * to MD RAID is_mddev_idle(): if the partition stats reveal a "significant"
 + * amount (more than 64 sectors) of activity that we cannot account for with
 + * our own resync activity, it obviously is "busy".
 + *
 + * The current sync rate used here uses only the most recent two step marks,
 + * to have a short time average so we can react faster.
 + */
 +int drbd_rs_should_slow_down(struct drbd_conf *mdev)
 +{
 +      struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
 +      unsigned long db, dt, dbdt;
 +      int curr_events;
 +      int throttle = 0;
 +
 +      /* feature disabled? */
 +      if (mdev->sync_conf.c_min_rate == 0)
 +              return 0;
 +
 +      curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
 +                    (int)part_stat_read(&disk->part0, sectors[1]) -
 +                      atomic_read(&mdev->rs_sect_ev);
 +      if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
 +              unsigned long rs_left;
 +              int i;
 +
 +              mdev->rs_last_events = curr_events;
 +
 +              /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
 +               * approx. */
 +              i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
 +              rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
 +
 +              dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
 +              if (!dt)
 +                      dt++;
 +              db = mdev->rs_mark_left[i] - rs_left;
 +              dbdt = Bit2KB(db/dt);
 +
 +              if (dbdt > mdev->sync_conf.c_min_rate)
 +                      throttle = 1;
 +      }
 +      return throttle;
 +}
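
  [Editorial note: the units in drbd_rs_should_slow_down() above are easy to
  misread, so here is a small standalone sketch, not part of the commit, of the
  same throttle arithmetic with invented numbers. It assumes one bitmap bit
  covers 4 KiB (BM_BLOCK_SHIFT == 12), so Bit2KB(bits) is bits << 2, and that
  c_min_rate is expressed in KiB/s.]

  /* Standalone sketch (not kernel code) of the throttle decision above.
   * Assumption: one bitmap bit covers 4 KiB, so Bit2KB(bits) == bits << 2;
   * c_min_rate is assumed to be in KiB/s.  All numbers are made up. */
  #include <stdio.h>

  #define BM_BLOCK_SHIFT 12
  #define Bit2KB(bits) ((bits) << (BM_BLOCK_SHIFT - 10))

  int main(void)
  {
          unsigned long db = 25600;        /* bitmap bits cleared since mark i */
          unsigned long dt = 4;            /* seconds since mark i (>= 1)      */
          unsigned long c_min_rate = 4000; /* throttle threshold, KiB/s        */

          unsigned long dbdt = Bit2KB(db / dt);   /* current resync rate, KiB/s */

          printf("resync rate: %lu KiB/s, threshold: %lu KiB/s -> %s\n",
                 dbdt, c_min_rate,
                 dbdt > c_min_rate ? "throttle" : "don't throttle");
          return 0;
  }
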
 +
 +
 +static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
  {
        sector_t sector;
        const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
        struct drbd_epoch_entry *e;
        struct digest_info *di = NULL;
 -      int size, digest_size;
 +      int size, verb;
        unsigned int fault_type;
 -      struct p_block_req *p =
 -              (struct p_block_req *)h;
 -      const int brps = sizeof(*p)-sizeof(*h);
 -
 -      if (drbd_recv(mdev, h->payload, brps) != brps)
 -              return FALSE;
 +      struct p_block_req *p = &mdev->data.rbuf.block_req;
  
        sector = be64_to_cpu(p->sector);
        size   = be32_to_cpu(p->blksize);
        }
  
        if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
 -              if (__ratelimit(&drbd_ratelimit_state))
 +              verb = 1;
 +              switch (cmd) {
 +              case P_DATA_REQUEST:
 +                      drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
 +                      break;
 +              case P_RS_DATA_REQUEST:
 +              case P_CSUM_RS_REQUEST:
 +              case P_OV_REQUEST:
 +                      drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
 +                      break;
 +              case P_OV_REPLY:
 +                      verb = 0;
 +                      dec_rs_pending(mdev);
 +                      drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
 +                      break;
 +              default:
 +                      dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
 +                              cmdname(cmd));
 +              }
 +              if (verb && __ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "Can not satisfy peer's read request, "
                            "no local data.\n");
 -              drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
 -                               P_NEG_RS_DREPLY , p);
 -              return drbd_drain_block(mdev, h->length - brps);
 +
 +              /* drain possible payload */
 +              return drbd_drain_block(mdev, digest_size);
        }
  
        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
                return FALSE;
        }
  
 -      switch (h->command) {
 +      switch (cmd) {
        case P_DATA_REQUEST:
                e->w.cb = w_e_end_data_req;
                fault_type = DRBD_FAULT_DT_RD;
 -              break;
 +              /* application IO, don't drbd_rs_begin_io */
 +              goto submit;
 +
        case P_RS_DATA_REQUEST:
                e->w.cb = w_e_end_rsdata_req;
                fault_type = DRBD_FAULT_RS_RD;
 -              /* Eventually this should become asynchronously. Currently it
 -               * blocks the whole receiver just to delay the reading of a
 -               * resync data block.
 -               * the drbd_work_queue mechanism is made for this...
 -               */
 -              if (!drbd_rs_begin_io(mdev, sector)) {
 -                      /* we have been interrupted,
 -                       * probably connection lost! */
 -                      D_ASSERT(signal_pending(current));
 -                      goto out_free_e;
 -              }
                break;
  
        case P_OV_REPLY:
        case P_CSUM_RS_REQUEST:
                fault_type = DRBD_FAULT_RS_RD;
 -              digest_size = h->length - brps ;
                di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
                if (!di)
                        goto out_free_e;
                di->digest_size = digest_size;
                di->digest = (((char *)di)+sizeof(struct digest_info));
  
 +              e->digest = di;
 +              e->flags |= EE_HAS_DIGEST;
 +
                if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
                        goto out_free_e;
  
 -              e->block_id = (u64)(unsigned long)di;
 -              if (h->command == P_CSUM_RS_REQUEST) {
 +              if (cmd == P_CSUM_RS_REQUEST) {
                        D_ASSERT(mdev->agreed_pro_version >= 89);
                        e->w.cb = w_e_end_csum_rs_req;
 -              } else if (h->command == P_OV_REPLY) {
 +              } else if (cmd == P_OV_REPLY) {
                        e->w.cb = w_e_end_ov_reply;
                        dec_rs_pending(mdev);
 -                      break;
 -              }
 -
 -              if (!drbd_rs_begin_io(mdev, sector)) {
 -                      /* we have been interrupted, probably connection lost! */
 -                      D_ASSERT(signal_pending(current));
 -                      goto out_free_e;
 +                      /* drbd_rs_begin_io done when we sent this request,
 +                       * but accounting still needs to be done. */
 +                      goto submit_for_resync;
                }
                break;
  
        case P_OV_REQUEST:
 -              if (mdev->state.conn >= C_CONNECTED &&
 -                  mdev->state.conn != C_VERIFY_T)
 -                      dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
 -                              drbd_conn_str(mdev->state.conn));
                if (mdev->ov_start_sector == ~(sector_t)0 &&
                    mdev->agreed_pro_version >= 90) {
                        mdev->ov_start_sector = sector;
                }
                e->w.cb = w_e_end_ov_req;
                fault_type = DRBD_FAULT_RS_RD;
 -              /* Eventually this should become asynchronous. Currently it
 -               * blocks the whole receiver just to delay the reading of a
 -               * resync data block.
 -               * the drbd_work_queue mechanism is made for this...
 -               */
 -              if (!drbd_rs_begin_io(mdev, sector)) {
 -                      /* we have been interrupted,
 -                       * probably connection lost! */
 -                      D_ASSERT(signal_pending(current));
 -                      goto out_free_e;
 -              }
                break;
  
 -
        default:
                dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
 -                  cmdname(h->command));
 +                  cmdname(cmd));
                fault_type = DRBD_FAULT_MAX;
 +              goto out_free_e;
        }
  
 -      spin_lock_irq(&mdev->req_lock);
 -      list_add(&e->w.list, &mdev->read_ee);
 -      spin_unlock_irq(&mdev->req_lock);
 +      /* Throttle, drbd_rs_begin_io and submit should become asynchronous
 +       * wrt the receiver, but it is not as straightforward as it may seem.
 +       * Various places in the resync start and stop logic assume resync
 +       * requests are processed in order, requeuing this on the worker thread
 +       * introduces a bunch of new code for synchronization between threads.
 +       *
 +       * Unlimited throttling before drbd_rs_begin_io may stall the resync
 +       * "forever", throttling after drbd_rs_begin_io will lock that extent
 +       * for application writes for the same time.  For now, just throttle
 +       * here, where the rest of the code expects the receiver to sleep for
 +       * a while, anyways.
 +       */
 +
 +      /* Throttle before drbd_rs_begin_io, as that locks out application IO;
 +       * this defers syncer requests for some time, before letting at least
 +       * one request through.  The resync controller on the receiving side
 +       * will adapt to the incoming rate accordingly.
 +       *
 +       * We cannot throttle here if remote is Primary/SyncTarget:
 +       * we would also throttle its application reads.
 +       * In that case, throttling is done on the SyncTarget only.
 +       */
 +      if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
 +              msleep(100);
 +      if (drbd_rs_begin_io(mdev, e->sector))
 +              goto out_free_e;
  
 +submit_for_resync:
 +      atomic_add(size >> 9, &mdev->rs_sect_ev);
 +
 +submit:
        inc_unacked(mdev);
 +      spin_lock_irq(&mdev->req_lock);
 +      list_add_tail(&e->w.list, &mdev->read_ee);
 +      spin_unlock_irq(&mdev->req_lock);
  
        if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
                return TRUE;
  
 +      /* drbd_submit_ee currently fails for one reason only:
 +       * not being able to allocate enough bios.
 +       * Is dropping the connection going to help? */
 +      spin_lock_irq(&mdev->req_lock);
 +      list_del(&e->w.list);
 +      spin_unlock_irq(&mdev->req_lock);
 +      /* no drbd_rs_complete_io(), we are dropping the connection anyways */
 +
  out_free_e:
 -      kfree(di);
        put_ldev(mdev);
        drbd_free_ee(mdev, e);
        return FALSE;
@@@ -2780,13 -2699,20 +2780,13 @@@ static int cmp_after_sb(enum drbd_after
        return 1;
  }
  
 -static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_protocol *p = (struct p_protocol *)h;
 -      int header_size, data_size;
 +      struct p_protocol *p = &mdev->data.rbuf.protocol;
        int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
        int p_want_lose, p_two_primaries, cf;
        char p_integrity_alg[SHARED_SECRET_MAX] = "";
  
 -      header_size = sizeof(*p) - sizeof(*h);
 -      data_size   = h->length  - header_size;
 -
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 -              return FALSE;
 -
        p_proto         = be32_to_cpu(p->protocol);
        p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
        p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
@@@ -2879,46 -2805,39 +2879,46 @@@ struct crypto_hash *drbd_crypto_alloc_d
        return tfm;
  }
  
 -static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
  {
        int ok = TRUE;
 -      struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
 +      struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
        unsigned int header_size, data_size, exp_max_sz;
        struct crypto_hash *verify_tfm = NULL;
        struct crypto_hash *csums_tfm = NULL;
        const int apv = mdev->agreed_pro_version;
 +      int *rs_plan_s = NULL;
 +      int fifo_size = 0;
  
        exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
                    : apv == 88 ? sizeof(struct p_rs_param)
                                        + SHARED_SECRET_MAX
 -                  : /* 89 */    sizeof(struct p_rs_param_89);
 +                  : apv <= 94 ? sizeof(struct p_rs_param_89)
 +                  : /* apv >= 95 */ sizeof(struct p_rs_param_95);
  
 -      if (h->length > exp_max_sz) {
 +      if (packet_size > exp_max_sz) {
                dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
 -                  h->length, exp_max_sz);
 +                  packet_size, exp_max_sz);
                return FALSE;
        }
  
        if (apv <= 88) {
 -              header_size = sizeof(struct p_rs_param) - sizeof(*h);
 -              data_size   = h->length  - header_size;
 -      } else /* apv >= 89 */ {
 -              header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
 -              data_size   = h->length  - header_size;
 +              header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
 +              data_size   = packet_size  - header_size;
 +      } else if (apv <= 94) {
 +              header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
 +              data_size   = packet_size  - header_size;
 +              D_ASSERT(data_size == 0);
 +      } else {
 +              header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
 +              data_size   = packet_size  - header_size;
                D_ASSERT(data_size == 0);
        }
  
        /* initialize verify_alg and csums_alg */
        memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
  
 -      if (drbd_recv(mdev, h->payload, header_size) != header_size)
 +      if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
                return FALSE;
  
        mdev->sync_conf.rate      = be32_to_cpu(p->rate);
                        }
                }
  
 +              if (apv > 94) {
 +                      mdev->sync_conf.rate      = be32_to_cpu(p->rate);
 +                      mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
 +                      mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
 +                      mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
 +                      mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
 +
 +                      fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
 +                      if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
 +                              rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
 +                              if (!rs_plan_s) {
 +                                      dev_err(DEV, "kmalloc of fifo_buffer failed");
 +                                      goto disconnect;
 +                              }
 +                      }
 +              }
  
                spin_lock(&mdev->peer_seq_lock);
                /* lock against drbd_nl_syncer_conf() */
                        mdev->csums_tfm = csums_tfm;
                        dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
                }
 +              if (fifo_size != mdev->rs_plan_s.size) {
 +                      kfree(mdev->rs_plan_s.values);
 +                      mdev->rs_plan_s.values = rs_plan_s;
 +                      mdev->rs_plan_s.size   = fifo_size;
 +                      mdev->rs_planed = 0;
 +              }
                spin_unlock(&mdev->peer_seq_lock);
        }
  
@@@ -3049,15 -2946,19 +3049,15 @@@ static void warn_if_differ_considerably
                     (unsigned long long)a, (unsigned long long)b);
  }
  
 -static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_sizes *p = (struct p_sizes *)h;
 +      struct p_sizes *p = &mdev->data.rbuf.sizes;
        enum determine_dev_size dd = unchanged;
        unsigned int max_seg_s;
        sector_t p_size, p_usize, my_usize;
        int ldsc = 0; /* local disk size changed */
        enum dds_flags ddsf;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        p_size = be64_to_cpu(p->d_size);
        p_usize = be64_to_cpu(p->u_size);
  
         * we still need to figure out whether we accept that. */
        mdev->p_size = p_size;
  
 -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
        if (get_ldev(mdev)) {
                warn_if_differ_considerably(mdev, "lower level device sizes",
                           p_size, drbd_get_max_capacity(mdev->ldev));
  
                if (mdev->agreed_pro_version < 94)
                        max_seg_s = be32_to_cpu(p->max_segment_size);
 +              else if (mdev->agreed_pro_version == 94)
 +                      max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
                else /* drbd 8.3.8 onwards */
                        max_seg_s = DRBD_MAX_SEGMENT_SIZE;
  
        return TRUE;
  }
  
 -static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_uuids *p = (struct p_uuids *)h;
 +      struct p_uuids *p = &mdev->data.rbuf.uuids;
        u64 *p_uuid;
        int i;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
  
        for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
                        drbd_md_sync(mdev);
                }
                put_ldev(mdev);
 +      } else if (mdev->state.disk < D_INCONSISTENT &&
 +                 mdev->state.role == R_PRIMARY) {
 +              /* I am a diskless primary, the peer just created a new current UUID
 +                 for me. */
 +              drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
        }
  
        /* Before we test for the disk state, we should wait until an eventually
@@@ -3251,12 -3150,16 +3251,12 @@@ static union drbd_state convert_state(u
        return ms;
  }
  
 -static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_req_state *p = (struct p_req_state *)h;
 +      struct p_req_state *p = &mdev->data.rbuf.req_state;
        union drbd_state mask, val;
        int rv;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        mask.i = be32_to_cpu(p->mask);
        val.i = be32_to_cpu(p->val);
  
        return TRUE;
  }
  
 -static int receive_state(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_state *p = (struct p_state *)h;
 -      enum drbd_conns nconn, oconn;
 -      union drbd_state ns, peer_state;
 +      struct p_state *p = &mdev->data.rbuf.state;
 +      union drbd_state os, ns, peer_state;
        enum drbd_disk_state real_peer_disk;
 +      enum chg_state_flags cs_flags;
        int rv;
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
 -              return FALSE;
 -
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        peer_state.i = be32_to_cpu(p->state);
  
        real_peer_disk = peer_state.disk;
  
        spin_lock_irq(&mdev->req_lock);
   retry:
 -      oconn = nconn = mdev->state.conn;
 +      os = ns = mdev->state;
        spin_unlock_irq(&mdev->req_lock);
  
 -      if (nconn == C_WF_REPORT_PARAMS)
 -              nconn = C_CONNECTED;
 +      /* peer says his disk is uptodate, while we think it is inconsistent,
 +       * and this happens while we think we have a sync going on. */
 +      if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
 +          os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
 +              /* If we are (becoming) SyncSource, but peer is still in sync
 +               * preparation, ignore its uptodate-ness to avoid flapping, it
 +               * will change to inconsistent once the peer reaches active
 +               * syncing states.
 +               * It may have changed syncer-paused flags, however, so we
 +               * cannot ignore this completely. */
 +              if (peer_state.conn > C_CONNECTED &&
 +                  peer_state.conn < C_SYNC_SOURCE)
 +                      real_peer_disk = D_INCONSISTENT;
 +
 +              /* if peer_state changes to connected at the same time,
 +               * it explicitly notifies us that it finished resync.
 +               * Maybe we should finish it up, too? */
 +              else if (os.conn >= C_SYNC_SOURCE &&
 +                       peer_state.conn == C_CONNECTED) {
 +                      if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
 +                              drbd_resync_finished(mdev);
 +                      return TRUE;
 +              }
 +      }
 +
 +      /* peer says his disk is inconsistent, while we think it is uptodate,
 +       * and this happens while the peer still thinks we have a sync going on,
 +       * but we think we are already done with the sync.
 +       * We ignore this to avoid flapping pdsk.
 +       * This should not happen, if the peer is a recent version of drbd. */
 +      if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
 +          os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
 +              real_peer_disk = D_UP_TO_DATE;
 +
 +      if (ns.conn == C_WF_REPORT_PARAMS)
 +              ns.conn = C_CONNECTED;
  
        if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
            get_ldev_if_state(mdev, D_NEGOTIATING)) {
                int cr; /* consider resync */
  
                /* if we established a new connection */
 -              cr  = (oconn < C_CONNECTED);
 +              cr  = (os.conn < C_CONNECTED);
                /* if we had an established connection
                 * and one of the nodes newly attaches a disk */
 -              cr |= (oconn == C_CONNECTED &&
 +              cr |= (os.conn == C_CONNECTED &&
                       (peer_state.disk == D_NEGOTIATING ||
 -                      mdev->state.disk == D_NEGOTIATING));
 +                      os.disk == D_NEGOTIATING));
                /* if we have both been inconsistent, and the peer has been
                 * forced to be UpToDate with --overwrite-data */
                cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
                /* if we had been plain connected, and the admin requested to
                 * start a sync by "invalidate" or "invalidate-remote" */
 -              cr |= (oconn == C_CONNECTED &&
 +              cr |= (os.conn == C_CONNECTED &&
                                (peer_state.conn >= C_STARTING_SYNC_S &&
                                 peer_state.conn <= C_WF_BITMAP_T));
  
                if (cr)
 -                      nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
 +                      ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
  
                put_ldev(mdev);
 -              if (nconn == C_MASK) {
 -                      nconn = C_CONNECTED;
 +              if (ns.conn == C_MASK) {
 +                      ns.conn = C_CONNECTED;
                        if (mdev->state.disk == D_NEGOTIATING) {
                                drbd_force_state(mdev, NS(disk, D_DISKLESS));
                        } else if (peer_state.disk == D_NEGOTIATING) {
                        } else {
                                if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
                                        return FALSE;
 -                              D_ASSERT(oconn == C_WF_REPORT_PARAMS);
 +                              D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
                                drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
                                return FALSE;
                        }
        }
  
        spin_lock_irq(&mdev->req_lock);
 -      if (mdev->state.conn != oconn)
 +      if (mdev->state.i != os.i)
                goto retry;
        clear_bit(CONSIDER_RESYNC, &mdev->flags);
 -      ns.i = mdev->state.i;
 -      ns.conn = nconn;
        ns.peer = peer_state.role;
        ns.pdsk = real_peer_disk;
        ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
 -      if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
 +      if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
                ns.disk = mdev->new_state_tmp.disk;
 -
 -      rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
 +      cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
 +      if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
 +          test_bit(NEW_CUR_UUID, &mdev->flags)) {
 +              /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
 +                 for temporary network outages! */
 +              spin_unlock_irq(&mdev->req_lock);
 +              dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
 +              tl_clear(mdev);
 +              drbd_uuid_new_current(mdev);
 +              clear_bit(NEW_CUR_UUID, &mdev->flags);
 +              drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
 +              return FALSE;
 +      }
 +      rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
        ns = mdev->state;
        spin_unlock_irq(&mdev->req_lock);
  
                return FALSE;
        }
  
 -      if (oconn > C_WF_REPORT_PARAMS) {
 -              if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
 +      if (os.conn > C_WF_REPORT_PARAMS) {
 +              if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
                    peer_state.disk != D_NEGOTIATING ) {
                        /* we want resync, peer has not yet decided to sync... */
                        /* Nowadays only used when forcing a node into primary role and
        return TRUE;
  }
  
 -static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
 -      struct p_rs_uuid *p = (struct p_rs_uuid *)h;
 +      struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
  
        wait_event(mdev->misc_wait,
                   mdev->state.conn == C_WF_SYNC_UUID ||
  
        /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
  
 -      ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 -      if (drbd_recv(mdev, h->payload, h->length) != h->length)
 -              return FALSE;
 -
        /* Here the _drbd_uuid_ functions are right, current should
           _not_ be rotated into the history */
        if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
  enum receive_bitmap_ret { OK, DONE, FAILED };
  
  static enum receive_bitmap_ret
 -receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
 -      unsigned long *buffer, struct bm_xfer_ctx *c)
 +receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
 +                   unsigned long *buffer, struct bm_xfer_ctx *c)
  {
        unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
        unsigned want = num_words * sizeof(long);
  
 -      if (want != h->length) {
 -              dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
 +      if (want != data_size) {
 +              dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
                return FAILED;
        }
        if (want == 0)
@@@ -3491,7 -3360,7 +3491,7 @@@ recv_bm_rle_bits(struct drbd_conf *mdev
        u64 tmp;
        unsigned long s = c->bit_offset;
        unsigned long e;
 -      int len = p->head.length - (sizeof(*p) - sizeof(p->head));
 +      int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
        int toggle = DCBP_get_start(p);
        int have;
        int bits;
@@@ -3560,7 -3429,7 +3560,7 @@@ void INFO_bm_xfer_stats(struct drbd_con
                const char *direction, struct bm_xfer_ctx *c)
  {
        /* what would it take to transfer it "plaintext" */
 -      unsigned plain = sizeof(struct p_header) *
 +      unsigned plain = sizeof(struct p_header80) *
                ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
                + c->bm_words * sizeof(long);
        unsigned total = c->bytes[0] + c->bytes[1];
     in order to be agnostic to the 32 vs 64 bits issue.
  
     returns 0 on failure, 1 if we successfully received it. */
 -static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        struct bm_xfer_ctx c;
        void *buffer;
        enum receive_bitmap_ret ret;
        int ok = FALSE;
 +      struct p_header80 *h = &mdev->data.rbuf.header.h80;
  
        wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
  
        };
  
        do {
 -              if (h->command == P_BITMAP) {
 -                      ret = receive_bitmap_plain(mdev, h, buffer, &c);
 -              } else if (h->command == P_COMPRESSED_BITMAP) {
 +              if (cmd == P_BITMAP) {
 +                      ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
 +              } else if (cmd == P_COMPRESSED_BITMAP) {
                        /* MAYBE: sanity check that we speak proto >= 90,
                         * and the feature is enabled! */
                        struct p_compressed_bm *p;
  
 -                      if (h->length > BM_PACKET_PAYLOAD_BYTES) {
 +                      if (data_size > BM_PACKET_PAYLOAD_BYTES) {
                                dev_err(DEV, "ReportCBitmap packet too large\n");
                                goto out;
                        }
                        /* use the page buff */
                        p = buffer;
                        memcpy(p, h, sizeof(*h));
 -                      if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
 +                      if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
                                goto out;
 -                      if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
 -                              dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
 +                      if (data_size <= (sizeof(*p) - sizeof(p->head))) {
 +                              dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
                                return FAILED;
                        }
                        ret = decode_bitmap_c(mdev, p, &c);
                } else {
 -                      dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
 +                      dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
                        goto out;
                }
  
 -              c.packets[h->command == P_BITMAP]++;
 -              c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
 +              c.packets[cmd == P_BITMAP]++;
 +              c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
  
                if (ret != OK)
                        break;
  
 -              if (!drbd_recv_header(mdev, h))
 +              if (!drbd_recv_header(mdev, &cmd, &data_size))
                        goto out;
        } while (ret == OK);
        if (ret == FAILED)
        return ok;
  }
  
 -static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
 +static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        /* TODO zero copy sink :) */
        static char sink[128];
        int size, want, r;
  
 -      if (!silent)
 -              dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
 -                   h->command, h->length);
 +      dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
 +               cmd, data_size);
  
 -      size = h->length;
 +      size = data_size;
        while (size > 0) {
                want = min_t(int, size, sizeof(sink));
                r = drbd_recv(mdev, sink, want);
        return size == 0;
  }
  
 -static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
 -{
 -      return receive_skip_(mdev, h, 0);
 -}
 -
 -static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
 -{
 -      return receive_skip_(mdev, h, 1);
 -}
 -
 -static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
 +static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
  {
        if (mdev->state.disk >= D_INCONSISTENT)
                drbd_kick_lo(mdev);
        return TRUE;
  }
  
 -typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
 -
 -static drbd_cmd_handler_f drbd_default_handler[] = {
 -      [P_DATA]            = receive_Data,
 -      [P_DATA_REPLY]      = receive_DataReply,
 -      [P_RS_DATA_REPLY]   = receive_RSDataReply,
 -      [P_BARRIER]         = receive_Barrier,
 -      [P_BITMAP]          = receive_bitmap,
 -      [P_COMPRESSED_BITMAP]    = receive_bitmap,
 -      [P_UNPLUG_REMOTE]   = receive_UnplugRemote,
 -      [P_DATA_REQUEST]    = receive_DataRequest,
 -      [P_RS_DATA_REQUEST] = receive_DataRequest,
 -      [P_SYNC_PARAM]      = receive_SyncParam,
 -      [P_SYNC_PARAM89]           = receive_SyncParam,
 -      [P_PROTOCOL]        = receive_protocol,
 -      [P_UUIDS]           = receive_uuids,
 -      [P_SIZES]           = receive_sizes,
 -      [P_STATE]           = receive_state,
 -      [P_STATE_CHG_REQ]   = receive_req_state,
 -      [P_SYNC_UUID]       = receive_sync_uuid,
 -      [P_OV_REQUEST]      = receive_DataRequest,
 -      [P_OV_REPLY]        = receive_DataRequest,
 -      [P_CSUM_RS_REQUEST]    = receive_DataRequest,
 -      [P_DELAY_PROBE]     = receive_skip_silent,
 +typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
 +
 +struct data_cmd {
 +      int expect_payload;
 +      size_t pkt_size;
 +      drbd_cmd_handler_f function;
 +};
 +
 +static struct data_cmd drbd_cmd_handler[] = {
 +      [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
 +      [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
 +      [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
 +      [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier } ,
 +      [P_BITMAP]          = { 1, sizeof(struct p_header80), receive_bitmap } ,
 +      [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
 +      [P_UNPLUG_REMOTE]   = { 0, sizeof(struct p_header80), receive_UnplugRemote },
 +      [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_SYNC_PARAM]      = { 1, sizeof(struct p_header80), receive_SyncParam },
 +      [P_SYNC_PARAM89]    = { 1, sizeof(struct p_header80), receive_SyncParam },
 +      [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
 +      [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
 +      [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
 +      [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
 +      [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
 +      [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
 +      [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
 +      [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
        /* anything missing from this table is in
         * the asender_tbl, see get_asender_cmd */
 -      [P_MAX_CMD]         = NULL,
 +      [P_MAX_CMD]         = { 0, 0, NULL },
  };
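
  [Editorial note: a simplified, self-contained model, not the kernel code, of
  the dispatch-table pattern that drbd_cmd_handler[] above and drbdd() below
  implement: each command carries its fixed packet size and whether a trailing
  payload is allowed, so the receive loop can validate sizes generically before
  calling the handler. All names, sizes, and commands here are invented.]

  #include <stdio.h>
  #include <stddef.h>

  enum cmd { CMD_PING, CMD_DATA, CMD_MAX };

  typedef int (*handler_f)(unsigned int payload_size);

  struct cmd_entry {
          int expect_payload;      /* may a payload follow the sub-header? */
          size_t pkt_size;         /* fixed size: header + sub-header      */
          handler_f function;
  };

  static int handle_ping(unsigned int payload_size)
  {
          printf("ping, payload %u\n", payload_size);
          return 1;
  }

  static int handle_data(unsigned int payload_size)
  {
          printf("data, payload %u\n", payload_size);
          return 1;
  }

  static const struct cmd_entry table[CMD_MAX] = {
          [CMD_PING] = { 0, 8,  handle_ping },
          [CMD_DATA] = { 1, 32, handle_data },
  };

  /* In this model, an unknown command, a packet shorter than its sub-header,
   * or an unexpected payload are all rejected, loosely mirroring drbdd(). */
  static int dispatch(enum cmd cmd, size_t packet_size)
  {
          const size_t header_size = 8;   /* invented on-wire header size */
          size_t shs;

          if (cmd >= CMD_MAX || !table[cmd].function)
                  return 0;                         /* unknown packet type */
          shs = table[cmd].pkt_size - header_size;  /* sub-header to read  */
          if (packet_size < shs)
                  return 0;                         /* short packet        */
          if (packet_size - shs > 0 && !table[cmd].expect_payload)
                  return 0;                         /* no payload expected */
          return table[cmd].function((unsigned int)(packet_size - shs));
  }

  int main(void)
  {
          dispatch(CMD_PING, 0);    /* ok: no sub-header, no payload          */
          dispatch(CMD_DATA, 4120); /* ok: 24-byte sub-header + 4096 payload  */
          return 0;
  }
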
  
 -static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
 -static drbd_cmd_handler_f *drbd_opt_cmd_handler;
 +/* All handler functions that expect a sub-header get that sub-header in
 +   mdev->data.rbuf.header.head.payload.
 +
 +   Usually the callback can find the usual p_header in
 +   mdev->data.rbuf.header.head, but it may not rely on that, since there is
 +   also p_header95.
 + */
  
  static void drbdd(struct drbd_conf *mdev)
  {
 -      drbd_cmd_handler_f handler;
 -      struct p_header *header = &mdev->data.rbuf.header;
 +      union p_header *header = &mdev->data.rbuf.header;
 +      unsigned int packet_size;
 +      enum drbd_packets cmd;
 +      size_t shs; /* sub header size */
 +      int rv;
  
        while (get_t_state(&mdev->receiver) == Running) {
                drbd_thread_current_set_cpu(mdev);
 -              if (!drbd_recv_header(mdev, header)) {
 -                      drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 -                      break;
 -              }
 +              if (!drbd_recv_header(mdev, &cmd, &packet_size))
 +                      goto err_out;
  
 -              if (header->command < P_MAX_CMD)
 -                      handler = drbd_cmd_handler[header->command];
 -              else if (P_MAY_IGNORE < header->command
 -                   && header->command < P_MAX_OPT_CMD)
 -                      handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
 -              else if (header->command > P_MAX_OPT_CMD)
 -                      handler = receive_skip;
 -              else
 -                      handler = NULL;
 +              if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
 +                      dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
 +                      goto err_out;
 +              }
  
 -              if (unlikely(!handler)) {
 -                      dev_err(DEV, "unknown packet type %d, l: %d!\n",
 -                          header->command, header->length);
 -                      drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 -                      break;
 +              shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
 +              rv = drbd_recv(mdev, &header->h80.payload, shs);
 +              if (unlikely(rv != shs)) {
 +                      dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
 +                      goto err_out;
                }
 -              if (unlikely(!handler(mdev, header))) {
 -                      dev_err(DEV, "error receiving %s, l: %d!\n",
 -                          cmdname(header->command), header->length);
 -                      drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 -                      break;
 +
 +              if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
 +                      dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
 +                      goto err_out;
                }
 -      }
 -}
  
 -static void drbd_fail_pending_reads(struct drbd_conf *mdev)
 -{
 -      struct hlist_head *slot;
 -      struct hlist_node *pos;
 -      struct hlist_node *tmp;
 -      struct drbd_request *req;
 -      int i;
 +              rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
  
 -      /*
 -       * Application READ requests
 -       */
 -      spin_lock_irq(&mdev->req_lock);
 -      for (i = 0; i < APP_R_HSIZE; i++) {
 -              slot = mdev->app_reads_hash+i;
 -              hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
 -                      /* it may (but should not any longer!)
 -                       * be on the work queue; if that assert triggers,
 -                       * we need to also grab the
 -                       * spin_lock_irq(&mdev->data.work.q_lock);
 -                       * and list_del_init here. */
 -                      D_ASSERT(list_empty(&req->w.list));
 -                      /* It would be nice to complete outside of spinlock.
 -                       * But this is easier for now. */
 -                      _req_mod(req, connection_lost_while_pending);
 +              if (unlikely(!rv)) {
 +                      dev_err(DEV, "error receiving %s, l: %d!\n",
 +                          cmdname(cmd), packet_size);
 +                      goto err_out;
                }
        }
 -      for (i = 0; i < APP_R_HSIZE; i++)
 -              if (!hlist_empty(mdev->app_reads_hash+i))
 -                      dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
 -                              "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
  
 -      memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 -      spin_unlock_irq(&mdev->req_lock);
 +      if (0) {
 +      err_out:
 +              drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 +      }
 +      /* If we leave here, we probably want to update at least the
 +       * "Connected" indicator on stable storage. Do so explicitly here. */
 +      drbd_md_sync(mdev);
  }
  
  void drbd_flush_workqueue(struct drbd_conf *mdev)
        wait_for_completion(&barr.done);
  }
  
 +void drbd_free_tl_hash(struct drbd_conf *mdev)
 +{
 +      struct hlist_head *h;
 +
 +      spin_lock_irq(&mdev->req_lock);
 +
 +      if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
 +              spin_unlock_irq(&mdev->req_lock);
 +              return;
 +      }
 +      /* paranoia code */
 +      for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
 +              if (h->first)
 +                      dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
 +                              (int)(h - mdev->ee_hash), h->first);
 +      kfree(mdev->ee_hash);
 +      mdev->ee_hash = NULL;
 +      mdev->ee_hash_s = 0;
 +
 +      /* paranoia code */
 +      for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
 +              if (h->first)
 +                      dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
 +                              (int)(h - mdev->tl_hash), h->first);
 +      kfree(mdev->tl_hash);
 +      mdev->tl_hash = NULL;
 +      mdev->tl_hash_s = 0;
 +      spin_unlock_irq(&mdev->req_lock);
 +}
 +
  static void drbd_disconnect(struct drbd_conf *mdev)
  {
        enum drbd_fencing_p fp;
        drbd_thread_stop(&mdev->asender);
        drbd_free_sock(mdev);
  
 +      /* wait for current activity to cease. */
        spin_lock_irq(&mdev->req_lock);
        _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
        _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
  
        /* make sure syncer is stopped and w_resume_next_sg queued */
        del_timer_sync(&mdev->resync_timer);
 -      set_bit(STOP_SYNC_TIMER, &mdev->flags);
        resync_timer_fn((unsigned long)mdev);
  
        /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
        kfree(mdev->p_uuid);
        mdev->p_uuid = NULL;
  
 -      if (!mdev->state.susp)
 +      if (!is_susp(mdev->state))
                tl_clear(mdev);
  
 -      drbd_fail_pending_reads(mdev);
 -
        dev_info(DEV, "Connection closed\n");
  
        drbd_md_sync(mdev);
                put_ldev(mdev);
        }
  
 -      if (mdev->state.role == R_PRIMARY) {
 -              if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
 -                      enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
 -                      drbd_request_state(mdev, NS(pdsk, nps));
 -              }
 -      }
 +      if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
 +              drbd_try_outdate_peer_async(mdev);
  
        spin_lock_irq(&mdev->req_lock);
        os = mdev->state;
        spin_unlock_irq(&mdev->req_lock);
  
        if (os.conn == C_DISCONNECTING) {
 -              struct hlist_head *h;
 -              wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
 +              wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
  
 -              /* we must not free the tl_hash
 -               * while application io is still on the fly */
 -              wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
 -
 -              spin_lock_irq(&mdev->req_lock);
 -              /* paranoia code */
 -              for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
 -                      if (h->first)
 -                              dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
 -                                              (int)(h - mdev->ee_hash), h->first);
 -              kfree(mdev->ee_hash);
 -              mdev->ee_hash = NULL;
 -              mdev->ee_hash_s = 0;
 -
 -              /* paranoia code */
 -              for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
 -                      if (h->first)
 -                              dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
 -                                              (int)(h - mdev->tl_hash), h->first);
 -              kfree(mdev->tl_hash);
 -              mdev->tl_hash = NULL;
 -              mdev->tl_hash_s = 0;
 -              spin_unlock_irq(&mdev->req_lock);
 +              if (!is_susp(mdev->state)) {
 +                      /* we must not free the tl_hash
 +                       * while application io is still on the fly */
 +                      wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
 +                      drbd_free_tl_hash(mdev);
 +              }
  
                crypto_free_hash(mdev->cram_hmac_tfm);
                mdev->cram_hmac_tfm = NULL;
        i = drbd_release_ee(mdev, &mdev->net_ee);
        if (i)
                dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
 +      i = atomic_read(&mdev->pp_in_use_by_net);
 +      if (i)
 +              dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
        i = atomic_read(&mdev->pp_in_use);
        if (i)
                dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
@@@ -4004,7 -3888,7 +4004,7 @@@ static int drbd_send_handshake(struct d
        p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
        p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
        ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
 -                           (struct p_header *)p, sizeof(*p), 0 );
 +                           (struct p_header80 *)p, sizeof(*p), 0 );
        mutex_unlock(&mdev->data.mutex);
        return ok;
  }
@@@ -4020,28 -3904,27 +4020,28 @@@ static int drbd_do_handshake(struct drb
  {
        /* ASSERT current == mdev->receiver ... */
        struct p_handshake *p = &mdev->data.rbuf.handshake;
 -      const int expect = sizeof(struct p_handshake)
 -                        -sizeof(struct p_header);
 +      const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
 +      unsigned int length;
 +      enum drbd_packets cmd;
        int rv;
  
        rv = drbd_send_handshake(mdev);
        if (!rv)
                return 0;
  
 -      rv = drbd_recv_header(mdev, &p->head);
 +      rv = drbd_recv_header(mdev, &cmd, &length);
        if (!rv)
                return 0;
  
 -      if (p->head.command != P_HAND_SHAKE) {
 +      if (cmd != P_HAND_SHAKE) {
                dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
 -                   cmdname(p->head.command), p->head.command);
 +                   cmdname(cmd), cmd);
                return -1;
        }
  
 -      if (p->head.length != expect) {
 +      if (length != expect) {
                dev_err(DEV, "expected HandShake length: %u, received: %u\n",
 -                   expect, p->head.length);
 +                   expect, length);
                return -1;
        }
  
@@@ -4099,11 -3982,10 +4099,11 @@@ static int drbd_do_auth(struct drbd_con
        char *response = NULL;
        char *right_response = NULL;
        char *peers_ch = NULL;
 -      struct p_header p;
        unsigned int key_len = strlen(mdev->net_conf->shared_secret);
        unsigned int resp_size;
        struct hash_desc desc;
 +      enum drbd_packets cmd;
 +      unsigned int length;
        int rv;
  
        desc.tfm = mdev->cram_hmac_tfm;
        if (!rv)
                goto fail;
  
 -      rv = drbd_recv_header(mdev, &p);
 +      rv = drbd_recv_header(mdev, &cmd, &length);
        if (!rv)
                goto fail;
  
 -      if (p.command != P_AUTH_CHALLENGE) {
 +      if (cmd != P_AUTH_CHALLENGE) {
                dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
 -                  cmdname(p.command), p.command);
 +                  cmdname(cmd), cmd);
                rv = 0;
                goto fail;
        }
  
 -      if (p.length > CHALLENGE_LEN*2) {
 +      if (length > CHALLENGE_LEN * 2) {
                dev_err(DEV, "expected AuthChallenge payload too big.\n");
                rv = -1;
                goto fail;
        }
  
 -      peers_ch = kmalloc(p.length, GFP_NOIO);
 +      peers_ch = kmalloc(length, GFP_NOIO);
        if (peers_ch == NULL) {
                dev_err(DEV, "kmalloc of peers_ch failed\n");
                rv = -1;
                goto fail;
        }
  
 -      rv = drbd_recv(mdev, peers_ch, p.length);
 +      rv = drbd_recv(mdev, peers_ch, length);
  
 -      if (rv != p.length) {
 +      if (rv != length) {
                dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
                rv = 0;
                goto fail;
        }
  
        sg_init_table(&sg, 1);
 -      sg_set_buf(&sg, peers_ch, p.length);
 +      sg_set_buf(&sg, peers_ch, length);
  
        rv = crypto_hash_digest(&desc, &sg, sg.length, response);
        if (rv) {
        if (!rv)
                goto fail;
  
 -      rv = drbd_recv_header(mdev, &p);
 +      rv = drbd_recv_header(mdev, &cmd, &length);
        if (!rv)
                goto fail;
  
 -      if (p.command != P_AUTH_RESPONSE) {
 +      if (cmd != P_AUTH_RESPONSE) {
                dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
 -                  cmdname(p.command), p.command);
 +                      cmdname(cmd), cmd);
                rv = 0;
                goto fail;
        }
  
 -      if (p.length != resp_size) {
 +      if (length != resp_size) {
                dev_err(DEV, "expected AuthResponse payload of wrong size\n");
                rv = 0;
                goto fail;
@@@ -4273,7 -4155,7 +4273,7 @@@ int drbdd_init(struct drbd_thread *thi
  
  /* ********* acknowledge sender ******** */
  
 -static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
 +static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_req_state_reply *p = (struct p_req_state_reply *)h;
  
        return TRUE;
  }
  
 -static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
 +static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
  {
        return drbd_send_ping_ack(mdev);
  
  }
  
 -static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
 +static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
  {
        /* restore idle timeout */
        mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
        return TRUE;
  }
  
 -static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
 +static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        sector_t sector = be64_to_cpu(p->sector);
  
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
  
 -      drbd_rs_complete_io(mdev, sector);
 -      drbd_set_in_sync(mdev, sector, blksize);
 -      /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
 -      mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
 +      if (get_ldev(mdev)) {
 +              drbd_rs_complete_io(mdev, sector);
 +              drbd_set_in_sync(mdev, sector, blksize);
 +              /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
 +              mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
 +              put_ldev(mdev);
 +      }
        dec_rs_pending(mdev);
 +      atomic_add(blksize >> 9, &mdev->rs_sect_in);
  
        return TRUE;
  }
@@@ -4381,7 -4259,7 +4381,7 @@@ static int validate_req_change_req_stat
        return TRUE;
  }
  
 -static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
 +static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        sector_t sector = be64_to_cpu(p->sector);
                _ack_id_to_req, __func__ , what);
  }
  
 -static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
 +static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        sector_t sector = be64_to_cpu(p->sector);
                _ack_id_to_req, __func__ , neg_acked);
  }
  
 -static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
 +static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        sector_t sector = be64_to_cpu(p->sector);
                _ar_id_to_req, __func__ , neg_acked);
  }
  
 -static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
 +static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
  {
        sector_t sector;
        int size;
        return TRUE;
  }
  
 -static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
 +static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_barrier_ack *p = (struct p_barrier_ack *)h;
  
        return TRUE;
  }
  
 -static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
 +static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
  {
        struct p_block_ack *p = (struct p_block_ack *)h;
        struct drbd_work *w;
        else
                ov_oos_print(mdev);
  
 +      if (!get_ldev(mdev))
 +              return TRUE;
 +
        drbd_rs_complete_io(mdev, sector);
        dec_rs_pending(mdev);
  
                        drbd_resync_finished(mdev);
                }
        }
 +      put_ldev(mdev);
        return TRUE;
  }
  
 -static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
 +static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
  {
 -      /* IGNORE */
        return TRUE;
  }
  
  struct asender_cmd {
        size_t pkt_size;
 -      int (*process)(struct drbd_conf *mdev, struct p_header *h);
 +      int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
  };
  
  static struct asender_cmd *get_asender_cmd(int cmd)
                /* anything missing from this table is in
                 * the drbd_cmd_handler (drbd_default_handler) table,
                 * see the beginning of drbdd() */
 -      [P_PING]            = { sizeof(struct p_header), got_Ping },
 -      [P_PING_ACK]        = { sizeof(struct p_header), got_PingAck },
 +      [P_PING]            = { sizeof(struct p_header80), got_Ping },
 +      [P_PING_ACK]        = { sizeof(struct p_header80), got_PingAck },
        [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
        [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
        [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
        [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
        [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
        [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
 -      [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
 +      [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
        [P_MAX_CMD]         = { 0, NULL },
        };
        if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
  int drbd_asender(struct drbd_thread *thi)
  {
        struct drbd_conf *mdev = thi->mdev;
 -      struct p_header *h = &mdev->meta.rbuf.header;
 +      struct p_header80 *h = &mdev->meta.rbuf.header.h80;
        struct asender_cmd *cmd = NULL;
  
        int rv, len;
        void *buf    = h;
        int received = 0;
 -      int expect   = sizeof(struct p_header);
 +      int expect   = sizeof(struct p_header80);
        int empty;
  
        sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
                while (1) {
                        clear_bit(SIGNAL_ASENDER, &mdev->flags);
                        flush_signals(current);
 -                      if (!drbd_process_done_ee(mdev)) {
 -                              dev_err(DEV, "process_done_ee() = NOT_OK\n");
 +                      if (!drbd_process_done_ee(mdev))
                                goto reconnect;
 -                      }
                        /* to avoid race with newly queued ACKs */
                        set_bit(SIGNAL_ASENDER, &mdev->flags);
                        spin_lock_irq(&mdev->req_lock);
  
                if (received == expect && cmd == NULL) {
                        if (unlikely(h->magic != BE_DRBD_MAGIC)) {
 -                              dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
 -                                  (long)be32_to_cpu(h->magic),
 -                                  h->command, h->length);
 +                              dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
 +                                  be32_to_cpu(h->magic),
 +                                  be16_to_cpu(h->command),
 +                                  be16_to_cpu(h->length));
                                goto reconnect;
                        }
                        cmd = get_asender_cmd(be16_to_cpu(h->command));
                        len = be16_to_cpu(h->length);
                        if (unlikely(cmd == NULL)) {
 -                              dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
 -                                  (long)be32_to_cpu(h->magic),
 -                                  h->command, h->length);
 +                              dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
 +                                  be32_to_cpu(h->magic),
 +                                  be16_to_cpu(h->command),
 +                                  be16_to_cpu(h->length));
                                goto disconnect;
                        }
                        expect = cmd->pkt_size;
 -                      ERR_IF(len != expect-sizeof(struct p_header))
 +                      ERR_IF(len != expect-sizeof(struct p_header80))
                                goto reconnect;
                }
                if (received == expect) {
  
                        buf      = h;
                        received = 0;
 -                      expect   = sizeof(struct p_header);
 +                      expect   = sizeof(struct p_header80);
                        cmd      = NULL;
                }
        }
        if (0) {
  reconnect:
                drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
 +              drbd_md_sync(mdev);
        }
        if (0) {
  disconnect:
                drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
 +              drbd_md_sync(mdev);
        }
        clear_bit(SIGNAL_ASENDER, &mdev->flags);
  
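The asender hunks above keep DRBD's table-driven packet handling while switching the header type to struct p_header80. Below is a minimal userspace sketch of that pattern (read a fixed header, check the magic, look up the expected size for the command, then dispatch); the magic value, command numbers, and payload sizes are invented for illustration and are not DRBD's wire constants.

/*
 * Userspace sketch of the dispatch-table pattern used by the asender
 * loop: read a fixed-size header, validate the magic, look up the
 * expected packet size for the command, then call the handler.  The
 * magic, commands and sizes below are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>		/* ntohl()/ntohs() stand in for be*_to_cpu() */

#define FAKE_MAGIC 0x83740267u	/* hypothetical, not BE_DRBD_MAGIC */

enum { CMD_PING, CMD_PING_ACK, CMD_BLOCK_ACK, CMD_MAX };

struct wire_header {		/* plays the role of struct p_header80 */
	uint32_t magic;
	uint16_t command;
	uint16_t length;	/* payload bytes following the header */
} __attribute__((packed));

struct cmd_entry {
	size_t pkt_size;			/* header + payload */
	int (*process)(const void *payload);	/* returns 1 on success */
};

static int got_ping(const void *p)      { (void)p; puts("ping");      return 1; }
static int got_ping_ack(const void *p)  { (void)p; puts("ping ack");  return 1; }
static int got_block_ack(const void *p) { (void)p; puts("block ack"); return 1; }

static const struct cmd_entry cmd_table[CMD_MAX] = {
	[CMD_PING]      = { sizeof(struct wire_header),      got_ping },
	[CMD_PING_ACK]  = { sizeof(struct wire_header),      got_ping_ack },
	[CMD_BLOCK_ACK] = { sizeof(struct wire_header) + 24, got_block_ack },
};

static int dispatch(const struct wire_header *h, const void *payload)
{
	unsigned int cmd = ntohs(h->command);
	unsigned int len = ntohs(h->length);

	if (ntohl(h->magic) != FAKE_MAGIC) {
		fprintf(stderr, "bad magic\n");
		return 0;
	}
	if (cmd >= CMD_MAX || !cmd_table[cmd].process) {
		fprintf(stderr, "unknown command %u\n", cmd);
		return 0;
	}
	/* mirrors the ERR_IF(len != expect - sizeof(struct p_header80)) check */
	if (len != cmd_table[cmd].pkt_size - sizeof(struct wire_header)) {
		fprintf(stderr, "unexpected payload length %u\n", len);
		return 0;
	}
	return cmd_table[cmd].process(payload);
}

int main(void)
{
	struct wire_header h = {
		.magic   = htonl(FAKE_MAGIC),
		.command = htons(CMD_PING),
		.length  = htons(0),
	};

	return dispatch(&h, NULL) ? 0 : 1;
}
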
diff --combined drivers/block/loop.c
index de3083b0a4f5de1f246ffdfa469fcb31840e35ee,5d27bc6596de01f179251b951ec292cfd181a917..6c48b3545f84583d0e32f1e87308c39a42b2133c
  #include <linux/compat.h>
  #include <linux/suspend.h>
  #include <linux/freezer.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  #include <linux/writeback.h>
  #include <linux/buffer_head.h>                /* for invalidate_bdev() */
  #include <linux/completion.h>
  #include <linux/highmem.h>
  #include <linux/kthread.h>
  #include <linux/splice.h>
 +#include <linux/sysfs.h>
  
  #include <asm/uaccess.h>
  
 +static DEFINE_MUTEX(loop_mutex);
  static LIST_HEAD(loop_devices);
  static DEFINE_MUTEX(loop_devices_mutex);
  
@@@ -479,17 -477,17 +479,17 @@@ static int do_bio_filebacked(struct loo
        pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
  
        if (bio_rw(bio) == WRITE) {
-               bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
                struct file *file = lo->lo_backing_file;
  
-               if (barrier) {
-                       if (unlikely(!file->f_op->fsync)) {
-                               ret = -EOPNOTSUPP;
-                               goto out;
-                       }
+               /* REQ_HARDBARRIER is deprecated */
+               if (bio->bi_rw & REQ_HARDBARRIER) {
+                       ret = -EOPNOTSUPP;
+                       goto out;
+               }
  
+               if (bio->bi_rw & REQ_FLUSH) {
                        ret = vfs_fsync(file, 0);
-                       if (unlikely(ret)) {
+                       if (unlikely(ret && ret != -EINVAL)) {
                                ret = -EIO;
                                goto out;
                        }
  
                ret = lo_send(lo, bio, pos);
  
-               if (barrier && !ret) {
+               if ((bio->bi_rw & REQ_FUA) && !ret) {
                        ret = vfs_fsync(file, 0);
-                       if (unlikely(ret))
+                       if (unlikely(ret && ret != -EINVAL))
                                ret = -EIO;
                }
        } else
@@@ -739,103 -737,6 +739,103 @@@ static inline int is_loop_device(struc
        return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
  }
  
 +/* loop sysfs attributes */
 +
 +static ssize_t loop_attr_show(struct device *dev, char *page,
 +                            ssize_t (*callback)(struct loop_device *, char *))
 +{
 +      struct loop_device *l, *lo = NULL;
 +
 +      mutex_lock(&loop_devices_mutex);
 +      list_for_each_entry(l, &loop_devices, lo_list)
 +              if (disk_to_dev(l->lo_disk) == dev) {
 +                      lo = l;
 +                      break;
 +              }
 +      mutex_unlock(&loop_devices_mutex);
 +
 +      return lo ? callback(lo, page) : -EIO;
 +}
 +
 +#define LOOP_ATTR_RO(_name)                                           \
 +static ssize_t loop_attr_##_name##_show(struct loop_device *, char *);        \
 +static ssize_t loop_attr_do_show_##_name(struct device *d,            \
 +                              struct device_attribute *attr, char *b) \
 +{                                                                     \
 +      return loop_attr_show(d, b, loop_attr_##_name##_show);          \
 +}                                                                     \
 +static struct device_attribute loop_attr_##_name =                    \
 +      __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
 +
 +static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
 +{
 +      ssize_t ret;
 +      char *p = NULL;
 +
 +      mutex_lock(&lo->lo_ctl_mutex);
 +      if (lo->lo_backing_file)
 +              p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
 +      mutex_unlock(&lo->lo_ctl_mutex);
 +
 +      if (IS_ERR_OR_NULL(p))
 +              ret = PTR_ERR(p);
 +      else {
 +              ret = strlen(p);
 +              memmove(buf, p, ret);
 +              buf[ret++] = '\n';
 +              buf[ret] = 0;
 +      }
 +
 +      return ret;
 +}
 +
 +static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
 +{
 +      return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
 +}
 +
 +static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
 +{
 +      return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
 +}
 +
 +static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
 +{
 +      int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
 +
 +      return sprintf(buf, "%s\n", autoclear ? "1" : "0");
 +}
 +
 +LOOP_ATTR_RO(backing_file);
 +LOOP_ATTR_RO(offset);
 +LOOP_ATTR_RO(sizelimit);
 +LOOP_ATTR_RO(autoclear);
 +
 +static struct attribute *loop_attrs[] = {
 +      &loop_attr_backing_file.attr,
 +      &loop_attr_offset.attr,
 +      &loop_attr_sizelimit.attr,
 +      &loop_attr_autoclear.attr,
 +      NULL,
 +};
 +
 +static struct attribute_group loop_attribute_group = {
 +      .name = "loop",
 +      .attrs= loop_attrs,
 +};
 +
 +static int loop_sysfs_init(struct loop_device *lo)
 +{
 +      return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
 +                                &loop_attribute_group);
 +}
 +
 +static void loop_sysfs_exit(struct loop_device *lo)
 +{
 +      sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
 +                         &loop_attribute_group);
 +}
 +
  static int loop_set_fd(struct loop_device *lo, fmode_t mode,
                       struct block_device *bdev, unsigned int arg)
  {
        lo->lo_queue->unplug_fn = loop_unplug;
  
        if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-               blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
+               blk_queue_flush(lo->lo_queue, REQ_FLUSH);
  
        set_capacity(lo->lo_disk, size);
        bd_set_size(bdev, size << 9);
 +      loop_sysfs_init(lo);
        /* let user-space know about the new size */
        kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
  
        return 0;
  
  out_clr:
 +      loop_sysfs_exit(lo);
        lo->lo_thread = NULL;
        lo->lo_device = NULL;
        lo->lo_backing_file = NULL;
@@@ -1051,7 -950,6 +1051,7 @@@ static int loop_clr_fd(struct loop_devi
        set_capacity(lo->lo_disk, 0);
        if (bdev) {
                bd_set_size(bdev, 0);
 +              loop_sysfs_exit(lo);
                /* let user-space know about this change */
                kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
        }
@@@ -1511,11 -1409,11 +1511,11 @@@ static int lo_open(struct block_device 
  {
        struct loop_device *lo = bdev->bd_disk->private_data;
  
 -      lock_kernel();
 +      mutex_lock(&loop_mutex);
        mutex_lock(&lo->lo_ctl_mutex);
        lo->lo_refcnt++;
        mutex_unlock(&lo->lo_ctl_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&loop_mutex);
  
        return 0;
  }
@@@ -1525,7 -1423,7 +1525,7 @@@ static int lo_release(struct gendisk *d
        struct loop_device *lo = disk->private_data;
        int err;
  
 -      lock_kernel();
 +      mutex_lock(&loop_mutex);
        mutex_lock(&lo->lo_ctl_mutex);
  
        if (--lo->lo_refcnt)
  out:
        mutex_unlock(&lo->lo_ctl_mutex);
  out_unlocked:
 -      lock_kernel();
 +      mutex_unlock(&loop_mutex);
        return 0;
  }
  
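For the file-backed loop device, the hunks above translate the new flags directly into fsync() calls on the backing file: REQ_FLUSH means sync before the write, REQ_FUA means sync again afterwards, and the deprecated REQ_HARDBARRIER is rejected outright. A userspace sketch of that mapping, using stand-in flag values rather than the kernel's:

/*
 * REQ_FLUSH -> fsync() the backing file before the write,
 * REQ_FUA   -> fsync() again afterwards,
 * REQ_HARDBARRIER -> reject.  The X_* values are local stand-ins.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define X_REQ_FLUSH		(1u << 0)
#define X_REQ_FUA		(1u << 1)
#define X_REQ_HARDBARRIER	(1u << 2)

static int backed_write(int fd, const void *buf, size_t len, off_t pos,
			unsigned int flags)
{
	if (flags & X_REQ_HARDBARRIER)
		return -EOPNOTSUPP;		/* barriers are gone */

	if ((flags & X_REQ_FLUSH) && fsync(fd))	/* pre-flush the cache */
		return -EIO;

	if (pwrite(fd, buf, len, pos) != (ssize_t)len)
		return -EIO;

	if ((flags & X_REQ_FUA) && fsync(fd))	/* make this write durable */
		return -EIO;

	return 0;
}

int main(void)
{
	int fd = open("backing.img", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (backed_write(fd, "data", 4, 0, X_REQ_FLUSH | X_REQ_FUA))
		fprintf(stderr, "write failed\n");
	close(fd);
	return 0;
}
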
diff --combined drivers/block/pktcdvd.c
index ef58fccadad3ada40ad07f6891b2cd3de0234c17,1b5cfcccd6543d5c445d6aa5f224f554ec701cd3..19b3568e9326303543c8ace464361637aadc6ef8
@@@ -57,6 -57,7 +57,6 @@@
  #include <linux/seq_file.h>
  #include <linux/miscdevice.h>
  #include <linux/freezer.h>
 -#include <linux/smp_lock.h>
  #include <linux/mutex.h>
  #include <linux/slab.h>
  #include <scsi/scsi_cmnd.h>
@@@ -85,7 -86,6 +85,7 @@@
  
  #define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1))
  
 +static DEFINE_MUTEX(pktcdvd_mutex);
  static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
  static struct proc_dir_entry *pkt_proc;
  static int pktdev_major;
@@@ -753,7 -753,6 +753,6 @@@ static int pkt_generic_packet(struct pk
  
        rq->timeout = 60*HZ;
        rq->cmd_type = REQ_TYPE_BLOCK_PC;
-       rq->cmd_flags |= REQ_HARDBARRIER;
        if (cgc->quiet)
                rq->cmd_flags |= REQ_QUIET;
  
@@@ -2383,7 -2382,7 +2382,7 @@@ static int pkt_open(struct block_devic
  
        VPRINTK(DRIVER_NAME": entering open\n");
  
 -      lock_kernel();
 +      mutex_lock(&pktcdvd_mutex);
        mutex_lock(&ctl_mutex);
        pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
        if (!pd) {
        }
  
        mutex_unlock(&ctl_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&pktcdvd_mutex);
        return 0;
  
  out_dec:
  out:
        VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
        mutex_unlock(&ctl_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&pktcdvd_mutex);
        return ret;
  }
  
@@@ -2428,7 -2427,7 +2427,7 @@@ static int pkt_close(struct gendisk *di
        struct pktcdvd_device *pd = disk->private_data;
        int ret = 0;
  
 -      lock_kernel();
 +      mutex_lock(&pktcdvd_mutex);
        mutex_lock(&ctl_mutex);
        pd->refcnt--;
        BUG_ON(pd->refcnt < 0);
                pkt_release_dev(pd, flush);
        }
        mutex_unlock(&ctl_mutex);
 -      unlock_kernel();
 +      mutex_unlock(&pktcdvd_mutex);
        return ret;
  }
  
@@@ -2773,7 -2772,7 +2772,7 @@@ static int pkt_ioctl(struct block_devic
        VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
                MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
  
 -      lock_kernel();
 +      mutex_lock(&pktcdvd_mutex);
        switch (cmd) {
        case CDROMEJECT:
                /*
                VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
                ret = -ENOTTY;
        }
 -      unlock_kernel();
 +      mutex_unlock(&pktcdvd_mutex);
  
        return ret;
  }
@@@ -3046,7 -3045,6 +3045,7 @@@ static const struct file_operations pkt
        .compat_ioctl   = pkt_ctl_compat_ioctl,
  #endif
        .owner          = THIS_MODULE,
 +      .llseek         = no_llseek,
  };
  
  static struct miscdevice pkt_misc = {
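Both loop and pktcdvd also drop lock_kernel()/unlock_kernel() in favour of a driver-local mutex around open/release. A small pthread analogue of that pattern, with illustrative names:

/*
 * pthread analogue of replacing the Big Kernel Lock with a per-driver
 * mutex around open/release.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pkt_mutex = PTHREAD_MUTEX_INITIALIZER;
static int refcnt;

static int dev_open(void)
{
	pthread_mutex_lock(&pkt_mutex);		/* was: lock_kernel() */
	refcnt++;
	pthread_mutex_unlock(&pkt_mutex);	/* was: unlock_kernel() */
	return 0;
}

static void dev_release(void)
{
	pthread_mutex_lock(&pkt_mutex);
	if (--refcnt == 0)
		puts("last closer: tear down device state");
	pthread_mutex_unlock(&pkt_mutex);
}

int main(void)
{
	dev_open();
	dev_release();
	return 0;
}
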
diff --combined drivers/block/ps3disk.c
index 03688c2da319c007f4923c4ffd989e4f9666b755,4911f9e57bc70e11e782460bd9604721e4f80810..8e1ce2e2916a72cdeff49a8fa46f2179d04691dd
@@@ -113,7 -113,7 +113,7 @@@ static void ps3disk_scatter_gather(stru
                        memcpy(buf, dev->bounce_buf+offset, size);
                offset += size;
                flush_kernel_dcache_page(bvec->bv_page);
 -              bvec_kunmap_irq(bvec, &flags);
 +              bvec_kunmap_irq(buf, &flags);
                i++;
        }
  }
@@@ -468,7 -468,7 +468,7 @@@ static int __devinit ps3disk_probe(stru
        blk_queue_dma_alignment(queue, dev->blk_size-1);
        blk_queue_logical_block_size(queue, dev->blk_size);
  
-       blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+       blk_queue_flush(queue, REQ_FLUSH);
  
        blk_queue_max_segments(queue, -1);
        blk_queue_max_segment_size(queue, dev->bounce_size);
diff --combined drivers/block/virtio_blk.c
index 8320490226b78145f95c7e7801a15080419adc19,831e75caea3d07f7c0269618f5238b3f2aaf4cab..6ecf89cdf006a3f6605d15fb062f134a2d3a0621
@@@ -2,6 -2,7 +2,6 @@@
  #include <linux/spinlock.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
 -#include <linux/smp_lock.h>
  #include <linux/hdreg.h>
  #include <linux/virtio.h>
  #include <linux/virtio_blk.h>
@@@ -127,9 -128,6 +127,6 @@@ static bool do_req(struct request_queu
                }
        }
  
-       if (vbr->req->cmd_flags & REQ_HARDBARRIER)
-               vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
        sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
  
        /*
@@@ -221,8 -219,8 +218,8 @@@ static int virtblk_get_id(struct gendis
        return err;
  }
  
 -static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
 -                       unsigned cmd, unsigned long data)
 +static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 +                           unsigned int cmd, unsigned long data)
  {
        struct gendisk *disk = bdev->bd_disk;
        struct virtio_blk *vblk = disk->private_data;
                              (void __user *)data);
  }
  
 -static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 -                           unsigned int cmd, unsigned long param)
 -{
 -      int ret;
 -
 -      lock_kernel();
 -      ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
 -      unlock_kernel();
 -
 -      return ret;
 -}
 -
  /* We provide getgeo only to please some old bootloader/partitioning tools */
  static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
  {
@@@ -379,31 -389,9 +376,9 @@@ static int __devinit virtblk_probe(stru
        vblk->disk->driverfs_dev = &vdev->dev;
        index++;
  
-       if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
-               /*
-                * If the FLUSH feature is supported we do have support for
-                * flushing a volatile write cache on the host.  Use that
-                * to implement write barrier support.
-                */
-               blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
-       } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
-               /*
-                * If the BARRIER feature is supported the host expects us
-                * to order request by tags.  This implies there is not
-                * volatile write cache on the host, and that the host
-                * never re-orders outstanding I/O.  This feature is not
-                * useful for real life scenarious and deprecated.
-                */
-               blk_queue_ordered(q, QUEUE_ORDERED_TAG);
-       } else {
-               /*
-                * If the FLUSH feature is not supported we must assume that
-                * the host does not perform any kind of volatile write
-                * caching. We still need to drain the queue to provider
-                * proper barrier semantics.
-                */
-               blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
-       }
+       /* configure queue flush support */
+       if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+               blk_queue_flush(q, REQ_FLUSH);
  
        /* If disk is read-only in the host, the guest should obey */
        if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@@ -522,9 -510,9 +497,9 @@@ static const struct virtio_device_id id
  };
  
  static unsigned int features[] = {
-       VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
-       VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-       VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+       VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+       VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+       VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
  };
  
  /*
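virtio-blk no longer picks between barrier/ordered modes; it only checks whether the device advertises the FLUSH feature (a volatile write cache on the host) and, if so, enables flush support on the queue. A compact sketch with stand-in feature and flag values:

/*
 * Only the FLUSH feature matters now, and it maps directly to enabling
 * flush support on the queue.  Feature bit, flag and queue type are
 * stand-ins, not the virtio or block-layer definitions.
 */
#include <stdio.h>

#define X_FEAT_FLUSH	(1u << 9)	/* not VIRTIO_BLK_F_FLUSH */
#define X_REQ_FLUSH	(1u << 0)	/* not the kernel's REQ_FLUSH */

struct queue { unsigned int flush_flags; };

/* stand-in for blk_queue_flush(): record what the device can honour */
static void queue_set_flush(struct queue *q, unsigned int flags)
{
	q->flush_flags = flags;
}

static void configure_flush(struct queue *q, unsigned int features)
{
	if (features & X_FEAT_FLUSH)	/* host has a volatile write cache */
		queue_set_flush(q, X_REQ_FLUSH);
	else
		queue_set_flush(q, 0);	/* assume write-through: nothing to flush */
}

int main(void)
{
	struct queue q = { 0 };

	configure_flush(&q, X_FEAT_FLUSH);
	printf("flush %s\n", q.flush_flags ? "enabled" : "disabled");
	return 0;
}
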
diff --combined drivers/block/xen-blkfront.c
index 3ff06f475eef47bc841b5c0877527ce9d47ea32b,739b4c1416eab3b4aac7cc2389bec131b4a09752..4b33a18c32e0c91959442deab466741b7e245ddf
@@@ -41,7 -41,7 +41,7 @@@
  #include <linux/cdrom.h>
  #include <linux/module.h>
  #include <linux/slab.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  #include <linux/scatterlist.h>
  
  #include <xen/xen.h>
@@@ -69,7 -69,6 +69,7 @@@ struct blk_shadow 
        unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  };
  
 +static DEFINE_MUTEX(blkfront_mutex);
  static const struct block_device_operations xlvbd_block_fops;
  
  #define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
@@@ -96,7 -95,7 +96,7 @@@ struct blkfront_inf
        struct gnttab_free_callback callback;
        struct blk_shadow shadow[BLK_RING_SIZE];
        unsigned long shadow_free;
-       int feature_barrier;
+       unsigned int feature_flush;
        int is_ready;
  };
  
@@@ -419,26 -418,12 +419,12 @@@ static int xlvbd_init_blk_queue(struct 
  }
  
  
- static int xlvbd_barrier(struct blkfront_info *info)
+ static void xlvbd_flush(struct blkfront_info *info)
  {
-       int err;
-       const char *barrier;
-       switch (info->feature_barrier) {
-       case QUEUE_ORDERED_DRAIN:       barrier = "enabled (drain)"; break;
-       case QUEUE_ORDERED_TAG:         barrier = "enabled (tag)"; break;
-       case QUEUE_ORDERED_NONE:        barrier = "disabled"; break;
-       default:                        return -EINVAL;
-       }
-       err = blk_queue_ordered(info->rq, info->feature_barrier);
-       if (err)
-               return err;
+       blk_queue_flush(info->rq, info->feature_flush);
        printk(KERN_INFO "blkfront: %s: barriers %s\n",
-              info->gd->disk_name, barrier);
-       return 0;
+              info->gd->disk_name,
+              info->feature_flush ? "enabled" : "disabled");
  }
  
  
@@@ -517,7 -502,7 +503,7 @@@ static int xlvbd_alloc_gendisk(blkif_se
        info->rq = gd->queue;
        info->gd = gd;
  
-       xlvbd_barrier(info);
+       xlvbd_flush(info);
  
        if (vdisk_info & VDISK_READONLY)
                set_disk_ro(gd, 1);
@@@ -663,8 -648,8 +649,8 @@@ static irqreturn_t blkif_interrupt(int 
                                printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
                                       info->gd->disk_name);
                                error = -EOPNOTSUPP;
-                               info->feature_barrier = QUEUE_ORDERED_NONE;
-                               xlvbd_barrier(info);
+                               info->feature_flush = 0;
+                               xlvbd_flush(info);
                        }
                        /* fall through */
                case BLKIF_OP_READ:
@@@ -1077,20 -1062,20 +1063,20 @@@ static void blkfront_connect(struct blk
        /*
         * If there's no "feature-barrier" defined, then it means
         * we're dealing with a very old backend which writes
-        * synchronously; draining will do what needs to get done.
+        * synchronously; nothing to do.
         *
-        * If there are barriers, then we can do full queued writes
-        * with tagged barriers.
-        *
-        * If barriers are not supported, then there's no much we can
-        * do, so just set ordering to NONE.
+        * If there are barriers, then we use flush.
         */
-       if (err)
-               info->feature_barrier = QUEUE_ORDERED_DRAIN;
-       else if (barrier)
-               info->feature_barrier = QUEUE_ORDERED_TAG;
-       else
-               info->feature_barrier = QUEUE_ORDERED_NONE;
+       info->feature_flush = 0;
+       /*
+        * The driver doesn't properly handle empty flushes, so
+        * let's disable barrier support for now.
+        */
+ #if 0
+       if (!err && barrier)
+               info->feature_flush = REQ_FLUSH;
+ #endif
  
        err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
        if (err) {
@@@ -1202,7 -1187,7 +1188,7 @@@ static int blkif_open(struct block_devi
        struct blkfront_info *info;
        int err = 0;
  
 -      lock_kernel();
 +      mutex_lock(&blkfront_mutex);
  
        info = disk->private_data;
        if (!info) {
        mutex_unlock(&info->mutex);
  
  out:
 -      unlock_kernel();
 +      mutex_unlock(&blkfront_mutex);
        return err;
  }
  
@@@ -1230,7 -1215,7 +1216,7 @@@ static int blkif_release(struct gendis
        struct block_device *bdev;
        struct xenbus_device *xbdev;
  
 -      lock_kernel();
 +      mutex_lock(&blkfront_mutex);
  
        bdev = bdget_disk(disk, 0);
        bdput(bdev);
        }
  
  out:
 -      unlock_kernel();
 +      mutex_unlock(&blkfront_mutex);
        return 0;
  }
  
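blkfront collapses the old three-way feature_barrier setting into a single feature_flush value and, when the backend fails a flush write, clears it and reprograms the queue so no further flushes are issued. A sketch of that downgrade path, with stand-in types and error value:

/*
 * A backend that fails a flush write simply has the feature cleared and
 * the queue reprogrammed.
 */
#include <stdio.h>

#define X_REQ_FLUSH	(1u << 0)	/* stand-in flag */
#define X_EOPNOTSUPP	95		/* illustrative error value */

struct blkfront { unsigned int feature_flush; };

static void reconfigure_queue(struct blkfront *info)
{
	printf("blkfront: flush %s\n",
	       info->feature_flush ? "enabled" : "disabled");
}

/* called from completion handling when a flush write comes back */
static void complete_flush_write(struct blkfront *info, int status)
{
	if (status == -X_EOPNOTSUPP && info->feature_flush) {
		/* backend cannot honour it: stop asking */
		info->feature_flush = 0;
		reconfigure_queue(info);
	}
}

int main(void)
{
	struct blkfront info = { .feature_flush = X_REQ_FLUSH };

	reconfigure_queue(&info);
	complete_flush_write(&info, -X_EOPNOTSUPP);
	return 0;
}
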
diff --combined drivers/md/dm-snap.c
index f30f6e8d594e1cc90b52d6d73a4b110367ae2d6f,eed210152b75802930369d1953d6006aaedc993a..53cf79d8bcbc5aa24c7e004d588aff0e2b4676bc
@@@ -706,6 -706,8 +706,6 @@@ static int dm_add_exception(void *conte
        return 0;
  }
  
 -#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
 -
  /*
   * Return a minimum chunk size of all snapshots that have the specified origin.
   * Return zero if the origin has no snapshots.
@@@ -1585,7 -1587,7 +1585,7 @@@ static int snapshot_map(struct dm_targe
        chunk_t chunk;
        struct dm_snap_pending_exception *pe = NULL;
  
-       if (unlikely(bio_empty_barrier(bio))) {
+       if (bio->bi_rw & REQ_FLUSH) {
                bio->bi_bdev = s->cow->bdev;
                return DM_MAPIO_REMAPPED;
        }
@@@ -1689,7 -1691,7 +1689,7 @@@ static int snapshot_merge_map(struct dm
        int r = DM_MAPIO_REMAPPED;
        chunk_t chunk;
  
-       if (unlikely(bio_empty_barrier(bio))) {
+       if (bio->bi_rw & REQ_FLUSH) {
                if (!map_context->target_request_nr)
                        bio->bi_bdev = s->origin->bdev;
                else
@@@ -2133,7 -2135,7 +2133,7 @@@ static int origin_map(struct dm_target 
        struct dm_dev *dev = ti->private;
        bio->bi_bdev = dev->bdev;
  
-       if (unlikely(bio_empty_barrier(bio)))
+       if (bio->bi_rw & REQ_FLUSH)
                return DM_MAPIO_REMAPPED;
  
        /* Only tell snapshots if this is a write */
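In dm-snap the empty-barrier test becomes a plain REQ_FLUSH check: a flush carries no data, so it is redirected to the underlying device without consulting the exception store. A minimal sketch with made-up types:

/*
 * A FLUSH bio is routed straight to the underlying device and never
 * consults the exception store.
 */
#include <stdio.h>

#define X_REQ_FLUSH (1u << 0)

struct bio { unsigned int rw; unsigned long long sector; };
struct snap { const char *cow_dev; const char *origin_dev; };

static const char *snapshot_map_sketch(struct snap *s, struct bio *bio)
{
	if (bio->rw & X_REQ_FLUSH)
		return s->cow_dev;	/* bypass chunk/exception handling */

	/* data I/O would compute the chunk and look up exceptions here */
	return s->origin_dev;
}

int main(void)
{
	struct snap s = { "cow", "origin" };
	struct bio flush = { .rw = X_REQ_FLUSH };

	printf("flush routed to %s device\n", snapshot_map_sketch(&s, &flush));
	return 0;
}
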
diff --combined drivers/md/dm.c
index 7967eca5a2d5fdaf3a009dde0eee189437103dc9,f934e9878436300252483632b31693bb6e497ea8..7cb1352f7e7a5e2b4b5e400319b981406ec69005
@@@ -15,6 -15,7 +15,6 @@@
  #include <linux/blkpg.h>
  #include <linux/bio.h>
  #include <linux/buffer_head.h>
 -#include <linux/smp_lock.h>
  #include <linux/mempool.h>
  #include <linux/slab.h>
  #include <linux/idr.h>
@@@ -32,7 -33,6 +32,7 @@@
  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  #define DM_COOKIE_LENGTH 24
  
 +static DEFINE_MUTEX(dm_mutex);
  static const char *_name = DM_NAME;
  
  static unsigned int major = 0;
@@@ -110,7 -110,6 +110,6 @@@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo)
  #define DMF_FREEING 3
  #define DMF_DELETING 4
  #define DMF_NOFLUSH_SUSPENDING 5
- #define DMF_QUEUE_IO_TO_THREAD 6
  
  /*
   * Work processed by per-device workqueue.
@@@ -144,24 -143,9 +143,9 @@@ struct mapped_device 
        spinlock_t deferred_lock;
  
        /*
-        * An error from the barrier request currently being processed.
-        */
-       int barrier_error;
-       /*
-        * Protect barrier_error from concurrent endio processing
-        * in request-based dm.
-        */
-       spinlock_t barrier_error_lock;
-       /*
-        * Processing queue (flush/barriers)
+        * Processing queue (flush)
         */
        struct workqueue_struct *wq;
-       struct work_struct barrier_work;
-       /* A pointer to the currently processing pre/post flush request */
-       struct request *flush_request;
  
        /*
         * The current mapping.
        /* sysfs handle */
        struct kobject kobj;
  
-       /* zero-length barrier that will be cloned and submitted to targets */
-       struct bio barrier_bio;
+       /* zero-length flush that will be cloned and submitted to targets */
+       struct bio flush_bio;
  };
  
  /*
@@@ -344,7 -328,7 +328,7 @@@ static int dm_blk_open(struct block_dev
  {
        struct mapped_device *md;
  
 -      lock_kernel();
 +      mutex_lock(&dm_mutex);
        spin_lock(&_minor_lock);
  
        md = bdev->bd_disk->private_data;
  
  out:
        spin_unlock(&_minor_lock);
 -      unlock_kernel();
 +      mutex_unlock(&dm_mutex);
  
        return md ? 0 : -ENXIO;
  }
@@@ -371,10 -355,10 +355,10 @@@ static int dm_blk_close(struct gendisk 
  {
        struct mapped_device *md = disk->private_data;
  
 -      lock_kernel();
 +      mutex_lock(&dm_mutex);
        atomic_dec(&md->open_count);
        dm_put(md);
 -      unlock_kernel();
 +      mutex_unlock(&dm_mutex);
  
        return 0;
  }
@@@ -512,7 -496,7 +496,7 @@@ static void end_io_acct(struct dm_io *i
  
        /*
         * After this is decremented the bio must not be touched if it is
-        * a barrier.
+        * a flush.
         */
        dm_disk(md)->part0.in_flight[rw] = pending =
                atomic_dec_return(&md->pending[rw]);
   */
  static void queue_io(struct mapped_device *md, struct bio *bio)
  {
-       down_write(&md->io_lock);
+       unsigned long flags;
  
-       spin_lock_irq(&md->deferred_lock);
+       spin_lock_irqsave(&md->deferred_lock, flags);
        bio_list_add(&md->deferred, bio);
-       spin_unlock_irq(&md->deferred_lock);
-       if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
-               queue_work(md->wq, &md->work);
-       up_write(&md->io_lock);
+       spin_unlock_irqrestore(&md->deferred_lock, flags);
+       queue_work(md->wq, &md->work);
  }
  
  /*
@@@ -625,11 -605,9 +605,9 @@@ static void dec_pending(struct dm_io *i
                         * Target requested pushing back the I/O.
                         */
                        spin_lock_irqsave(&md->deferred_lock, flags);
-                       if (__noflush_suspending(md)) {
-                               if (!(io->bio->bi_rw & REQ_HARDBARRIER))
-                                       bio_list_add_head(&md->deferred,
-                                                         io->bio);
-                       } else
+                       if (__noflush_suspending(md))
+                               bio_list_add_head(&md->deferred, io->bio);
+                       else
                                /* noflush suspend was interrupted. */
                                io->error = -EIO;
                        spin_unlock_irqrestore(&md->deferred_lock, flags);
  
                io_error = io->error;
                bio = io->bio;
+               end_io_acct(io);
+               free_io(md, io);
+               if (io_error == DM_ENDIO_REQUEUE)
+                       return;
  
-               if (bio->bi_rw & REQ_HARDBARRIER) {
+               if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
                        /*
-                        * There can be just one barrier request so we use
-                        * a per-device variable for error reporting.
-                        * Note that you can't touch the bio after end_io_acct
-                        *
-                        * We ignore -EOPNOTSUPP for empty flush reported by
-                        * underlying devices. We assume that if the device
-                        * doesn't support empty barriers, it doesn't need
-                        * cache flushing commands.
+                        * Preflush done for flush with data, reissue
+                        * without REQ_FLUSH.
                         */
-                       if (!md->barrier_error &&
-                           !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
-                               md->barrier_error = io_error;
-                       end_io_acct(io);
-                       free_io(md, io);
+                       bio->bi_rw &= ~REQ_FLUSH;
+                       queue_io(md, bio);
                } else {
-                       end_io_acct(io);
-                       free_io(md, io);
-                       if (io_error != DM_ENDIO_REQUEUE) {
-                               trace_block_bio_complete(md->queue, bio);
-                               bio_endio(bio, io_error);
-                       }
+                       /* done with normal IO or empty flush */
+                       trace_block_bio_complete(md->queue, bio);
+                       bio_endio(bio, io_error);
                }
        }
  }
@@@ -755,23 -724,6 +724,6 @@@ static void end_clone_bio(struct bio *c
        blk_update_request(tio->orig, 0, nr_bytes);
  }
  
- static void store_barrier_error(struct mapped_device *md, int error)
- {
-       unsigned long flags;
-       spin_lock_irqsave(&md->barrier_error_lock, flags);
-       /*
-        * Basically, the first error is taken, but:
-        *   -EOPNOTSUPP supersedes any I/O error.
-        *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
-        */
-       if (!md->barrier_error || error == -EOPNOTSUPP ||
-           (md->barrier_error != -EOPNOTSUPP &&
-            error == DM_ENDIO_REQUEUE))
-               md->barrier_error = error;
-       spin_unlock_irqrestore(&md->barrier_error_lock, flags);
- }
  /*
   * Don't touch any member of the md after calling this function because
   * the md may be freed in dm_put() at the end of this function.
@@@ -809,13 -761,11 +761,11 @@@ static void free_rq_clone(struct reques
  static void dm_end_request(struct request *clone, int error)
  {
        int rw = rq_data_dir(clone);
-       int run_queue = 1;
-       bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct mapped_device *md = tio->md;
        struct request *rq = tio->orig;
  
-       if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+       if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
                rq->errors = clone->errors;
                rq->resid_len = clone->resid_len;
  
        }
  
        free_rq_clone(clone);
-       if (unlikely(is_barrier)) {
-               if (unlikely(error))
-                       store_barrier_error(md, error);
-               run_queue = 0;
-       } else
-               blk_end_request_all(rq, error);
-       rq_completed(md, rw, run_queue);
+       blk_end_request_all(rq, error);
+       rq_completed(md, rw, true);
  }
  
  static void dm_unprep_request(struct request *rq)
@@@ -862,16 -805,6 +805,6 @@@ void dm_requeue_unmapped_request(struc
        struct request_queue *q = rq->q;
        unsigned long flags;
  
-       if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-               /*
-                * Barrier clones share an original request.
-                * Leave it to dm_end_request(), which handles this special
-                * case.
-                */
-               dm_end_request(clone, DM_ENDIO_REQUEUE);
-               return;
-       }
        dm_unprep_request(rq);
  
        spin_lock_irqsave(q->queue_lock, flags);
@@@ -961,19 -894,6 +894,6 @@@ static void dm_complete_request(struct 
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
  
-       if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-               /*
-                * Barrier clones share an original request.  So can't use
-                * softirq_done with the original.
-                * Pass the clone to dm_done() directly in this special case.
-                * It is safe (even if clone->q->queue_lock is held here)
-                * because there is no I/O dispatching during the completion
-                * of barrier clone.
-                */
-               dm_done(clone, error, true);
-               return;
-       }
        tio->error = error;
        rq->completion_data = clone;
        blk_complete_request(rq);
@@@ -990,17 -910,6 +910,6 @@@ void dm_kill_unmapped_request(struct re
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
  
-       if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-               /*
-                * Barrier clones share an original request.
-                * Leave it to dm_end_request(), which handles this special
-                * case.
-                */
-               BUG_ON(error > 0);
-               dm_end_request(clone, error);
-               return;
-       }
        rq->cmd_flags |= REQ_FAILED;
        dm_complete_request(clone, error);
  }
@@@ -1119,7 -1028,7 +1028,7 @@@ static void dm_bio_destructor(struct bi
  }
  
  /*
-  * Creates a little bio that is just does part of a bvec.
+  * Creates a little bio that just does part of a bvec.
   */
  static struct bio *split_bvec(struct bio *bio, sector_t sector,
                              unsigned short idx, unsigned int offset,
  
        clone->bi_sector = sector;
        clone->bi_bdev = bio->bi_bdev;
-       clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+       clone->bi_rw = bio->bi_rw;
        clone->bi_vcnt = 1;
        clone->bi_size = to_bytes(len);
        clone->bi_io_vec->bv_offset = offset;
@@@ -1161,7 -1070,6 +1070,6 @@@ static struct bio *clone_bio(struct bi
  
        clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
        __bio_clone(clone, bio);
-       clone->bi_rw &= ~REQ_HARDBARRIER;
        clone->bi_destructor = dm_bio_destructor;
        clone->bi_sector = sector;
        clone->bi_idx = idx;
@@@ -1225,16 -1133,15 +1133,15 @@@ static void __issue_target_requests(str
                __issue_target_request(ci, ti, request_nr, len);
  }
  
- static int __clone_and_map_empty_barrier(struct clone_info *ci)
+ static int __clone_and_map_empty_flush(struct clone_info *ci)
  {
        unsigned target_nr = 0;
        struct dm_target *ti;
  
+       BUG_ON(bio_has_data(ci->bio));
        while ((ti = dm_table_get_target(ci->map, target_nr++)))
                __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
  
-       ci->sector_count = 0;
        return 0;
  }
  
@@@ -1289,9 -1196,6 +1196,6 @@@ static int __clone_and_map(struct clone
        sector_t len = 0, max;
        struct dm_target_io *tio;
  
-       if (unlikely(bio_empty_barrier(bio)))
-               return __clone_and_map_empty_barrier(ci);
        if (unlikely(bio->bi_rw & REQ_DISCARD))
                return __clone_and_map_discard(ci);
  
@@@ -1383,16 -1287,11 +1287,11 @@@ static void __split_and_process_bio(str
  
        ci.map = dm_get_live_table(md);
        if (unlikely(!ci.map)) {
-               if (!(bio->bi_rw & REQ_HARDBARRIER))
-                       bio_io_error(bio);
-               else
-                       if (!md->barrier_error)
-                               md->barrier_error = -EIO;
+               bio_io_error(bio);
                return;
        }
  
        ci.md = md;
-       ci.bio = bio;
        ci.io = alloc_io(md);
        ci.io->error = 0;
        atomic_set(&ci.io->io_count, 1);
        ci.io->md = md;
        spin_lock_init(&ci.io->endio_lock);
        ci.sector = bio->bi_sector;
-       ci.sector_count = bio_sectors(bio);
-       if (unlikely(bio_empty_barrier(bio)))
-               ci.sector_count = 1;
        ci.idx = bio->bi_idx;
  
        start_io_acct(ci.io);
-       while (ci.sector_count && !error)
-               error = __clone_and_map(&ci);
+       if (bio->bi_rw & REQ_FLUSH) {
+               ci.bio = &ci.md->flush_bio;
+               ci.sector_count = 0;
+               error = __clone_and_map_empty_flush(&ci);
+               /* dec_pending submits any data associated with flush */
+       } else {
+               ci.bio = bio;
+               ci.sector_count = bio_sectors(bio);
+               while (ci.sector_count && !error)
+                       error = __clone_and_map(&ci);
+       }
  
        /* drop the extra reference count */
        dec_pending(ci.io, error);
@@@ -1491,22 -1396,14 +1396,14 @@@ static int _dm_request(struct request_q
        part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
        part_stat_unlock();
  
-       /*
-        * If we're suspended or the thread is processing barriers
-        * we have to queue this io for later.
-        */
-       if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-           unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+       /* if we're suspended, we have to queue this io for later */
+       if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
                up_read(&md->io_lock);
  
-               if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
-                   bio_rw(bio) == READA) {
+               if (bio_rw(bio) != READA)
+                       queue_io(md, bio);
+               else
                        bio_io_error(bio);
-                       return 0;
-               }
-               queue_io(md, bio);
                return 0;
        }
  
@@@ -1537,14 -1434,6 +1434,6 @@@ static int dm_request(struct request_qu
        return _dm_request(q, bio);
  }
  
- static bool dm_rq_is_flush_request(struct request *rq)
- {
-       if (rq->cmd_flags & REQ_FLUSH)
-               return true;
-       else
-               return false;
- }
  void dm_dispatch_request(struct request *rq)
  {
        int r;
@@@ -1592,22 -1481,15 +1481,15 @@@ static int setup_clone(struct request *
  {
        int r;
  
-       if (dm_rq_is_flush_request(rq)) {
-               blk_rq_init(NULL, clone);
-               clone->cmd_type = REQ_TYPE_FS;
-               clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
-       } else {
-               r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-                                     dm_rq_bio_constructor, tio);
-               if (r)
-                       return r;
-               clone->cmd = rq->cmd;
-               clone->cmd_len = rq->cmd_len;
-               clone->sense = rq->sense;
-               clone->buffer = rq->buffer;
-       }
+       r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+                             dm_rq_bio_constructor, tio);
+       if (r)
+               return r;
  
+       clone->cmd = rq->cmd;
+       clone->cmd_len = rq->cmd_len;
+       clone->sense = rq->sense;
+       clone->buffer = rq->buffer;
        clone->end_io = end_clone_request;
        clone->end_io_data = tio;
  
@@@ -1648,9 -1530,6 +1530,6 @@@ static int dm_prep_fn(struct request_qu
        struct mapped_device *md = q->queuedata;
        struct request *clone;
  
-       if (unlikely(dm_rq_is_flush_request(rq)))
-               return BLKPREP_OK;
        if (unlikely(rq->special)) {
                DMWARN("Already has something in rq->special.");
                return BLKPREP_KILL;
@@@ -1727,6 -1606,7 +1606,7 @@@ static void dm_request_fn(struct reques
        struct dm_table *map = dm_get_live_table(md);
        struct dm_target *ti;
        struct request *rq, *clone;
+       sector_t pos;
  
        /*
         * For suspend, check blk_queue_stopped() and increment
                if (!rq)
                        goto plug_and_out;
  
-               if (unlikely(dm_rq_is_flush_request(rq))) {
-                       BUG_ON(md->flush_request);
-                       md->flush_request = rq;
-                       blk_start_request(rq);
-                       queue_work(md->wq, &md->barrier_work);
-                       goto out;
-               }
+               /* always use block 0 to find the target for flushes for now */
+               pos = 0;
+               if (!(rq->cmd_flags & REQ_FLUSH))
+                       pos = blk_rq_pos(rq);
+               ti = dm_table_find_target(map, pos);
+               BUG_ON(!dm_target_is_valid(ti));
  
-               ti = dm_table_find_target(map, blk_rq_pos(rq));
                if (ti->type->busy && ti->type->busy(ti))
                        goto plug_and_out;
  
@@@ -1918,7 -1797,6 +1797,6 @@@ out
  static const struct block_device_operations dm_blk_dops;
  
  static void dm_wq_work(struct work_struct *work);
- static void dm_rq_barrier_work(struct work_struct *work);
  
  static void dm_init_md_queue(struct mapped_device *md)
  {
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
        md->queue->unplug_fn = dm_unplug_all;
        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+       blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
  }
  
  /*
@@@ -1972,7 -1851,6 +1851,6 @@@ static struct mapped_device *alloc_dev(
        mutex_init(&md->suspend_lock);
        mutex_init(&md->type_lock);
        spin_lock_init(&md->deferred_lock);
-       spin_lock_init(&md->barrier_error_lock);
        rwlock_init(&md->map_lock);
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
        atomic_set(&md->pending[1], 0);
        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
-       INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
        init_waitqueue_head(&md->eventq);
  
        md->disk->major = _major;
        if (!md->bdev)
                goto bad_bdev;
  
+       bio_init(&md->flush_bio);
+       md->flush_bio.bi_bdev = md->bdev;
+       md->flush_bio.bi_rw = WRITE_FLUSH;
        /* Populate the mapping, nobody knows we exist yet */
        spin_lock(&_minor_lock);
        old_md = idr_replace(&_minor_idr, md, minor);
@@@ -2245,7 -2126,6 +2126,6 @@@ static int dm_init_request_based_queue(
        blk_queue_softirq_done(md->queue, dm_softirq_done);
        blk_queue_prep_rq(md->queue, dm_prep_fn);
        blk_queue_lld_busy(md->queue, dm_lld_busy);
-       blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
  
        elv_register_queue(md->queue);
  
@@@ -2406,43 -2286,6 +2286,6 @@@ static int dm_wait_for_completion(struc
        return r;
  }
  
- static void dm_flush(struct mapped_device *md)
- {
-       dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-       bio_init(&md->barrier_bio);
-       md->barrier_bio.bi_bdev = md->bdev;
-       md->barrier_bio.bi_rw = WRITE_BARRIER;
-       __split_and_process_bio(md, &md->barrier_bio);
-       dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
- }
- static void process_barrier(struct mapped_device *md, struct bio *bio)
- {
-       md->barrier_error = 0;
-       dm_flush(md);
-       if (!bio_empty_barrier(bio)) {
-               __split_and_process_bio(md, bio);
-               /*
-                * If the request isn't supported, don't waste time with
-                * the second flush.
-                */
-               if (md->barrier_error != -EOPNOTSUPP)
-                       dm_flush(md);
-       }
-       if (md->barrier_error != DM_ENDIO_REQUEUE)
-               bio_endio(bio, md->barrier_error);
-       else {
-               spin_lock_irq(&md->deferred_lock);
-               bio_list_add_head(&md->deferred, bio);
-               spin_unlock_irq(&md->deferred_lock);
-       }
- }
  /*
   * Process the deferred bios
   */
@@@ -2452,33 -2295,27 +2295,27 @@@ static void dm_wq_work(struct work_stru
                                                work);
        struct bio *c;
  
-       down_write(&md->io_lock);
+       down_read(&md->io_lock);
  
        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
                spin_lock_irq(&md->deferred_lock);
                c = bio_list_pop(&md->deferred);
                spin_unlock_irq(&md->deferred_lock);
  
-               if (!c) {
-                       clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+               if (!c)
                        break;
-               }
  
-               up_write(&md->io_lock);
+               up_read(&md->io_lock);
  
                if (dm_request_based(md))
                        generic_make_request(c);
-               else {
-                       if (c->bi_rw & REQ_HARDBARRIER)
-                               process_barrier(md, c);
-                       else
-                               __split_and_process_bio(md, c);
-               }
+               else
+                       __split_and_process_bio(md, c);
  
-               down_write(&md->io_lock);
+               down_read(&md->io_lock);
        }
  
-       up_write(&md->io_lock);
+       up_read(&md->io_lock);
  }
  
  static void dm_queue_flush(struct mapped_device *md)
        queue_work(md->wq, &md->work);
  }
  
- static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
- {
-       struct dm_rq_target_io *tio = clone->end_io_data;
-       tio->info.target_request_nr = request_nr;
- }
- /* Issue barrier requests to targets and wait for their completion. */
- static int dm_rq_barrier(struct mapped_device *md)
- {
-       int i, j;
-       struct dm_table *map = dm_get_live_table(md);
-       unsigned num_targets = dm_table_get_num_targets(map);
-       struct dm_target *ti;
-       struct request *clone;
-       md->barrier_error = 0;
-       for (i = 0; i < num_targets; i++) {
-               ti = dm_table_get_target(map, i);
-               for (j = 0; j < ti->num_flush_requests; j++) {
-                       clone = clone_rq(md->flush_request, md, GFP_NOIO);
-                       dm_rq_set_target_request_nr(clone, j);
-                       atomic_inc(&md->pending[rq_data_dir(clone)]);
-                       map_request(ti, clone, md);
-               }
-       }
-       dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-       dm_table_put(map);
-       return md->barrier_error;
- }
- static void dm_rq_barrier_work(struct work_struct *work)
- {
-       int error;
-       struct mapped_device *md = container_of(work, struct mapped_device,
-                                               barrier_work);
-       struct request_queue *q = md->queue;
-       struct request *rq;
-       unsigned long flags;
-       /*
-        * Hold the md reference here and leave it at the last part so that
-        * the md can't be deleted by device opener when the barrier request
-        * completes.
-        */
-       dm_get(md);
-       error = dm_rq_barrier(md);
-       rq = md->flush_request;
-       md->flush_request = NULL;
-       if (error == DM_ENDIO_REQUEUE) {
-               spin_lock_irqsave(q->queue_lock, flags);
-               blk_requeue_request(q, rq);
-               spin_unlock_irqrestore(q->queue_lock, flags);
-       } else
-               blk_end_request_all(rq, error);
-       blk_run_queue(q);
-       dm_put(md);
- }
  /*
   * Swap in a new table, returning the old one for the caller to destroy.
   */
@@@ -2677,23 -2447,17 +2447,17 @@@ int dm_suspend(struct mapped_device *md
         *
         * To get all processes out of __split_and_process_bio in dm_request,
         * we take the write lock. To prevent any process from reentering
-        * __split_and_process_bio from dm_request, we set
-        * DMF_QUEUE_IO_TO_THREAD.
-        *
-        * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
-        * and call flush_workqueue(md->wq). flush_workqueue will wait until
-        * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
-        * further calls to __split_and_process_bio from dm_wq_work.
+        * __split_and_process_bio from dm_request and quiesce the thread
+        * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
+        * flush_workqueue(md->wq).
         */
        down_write(&md->io_lock);
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-       set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
        up_write(&md->io_lock);
  
        /*
-        * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
-        * can be kicked until md->queue is stopped.  So stop md->queue before
-        * flushing md->wq.
+        * Stop md->queue before flushing md->wq in case request-based
+        * dm defers requests to md->wq from md->queue.
         */
        if (dm_request_based(md))
                stop_queue(md->queue);
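The dm core replaces its barrier state machine with two-step flush handling: an empty preflush is cloned to every target first, and when it completes a flush-with-data bio is requeued with REQ_FLUSH cleared so the data proceeds as ordinary I/O. A condensed sketch of that completion decision, with stand-in types and queueing:

/*
 * When the empty preflush for a flush-with-data bio finishes, the bio
 * is requeued with the FLUSH bit cleared; normal I/O and empty flushes
 * complete directly.
 */
#include <stdio.h>

#define X_REQ_FLUSH (1u << 0)

struct bio { unsigned int rw; unsigned int size; };

static void queue_io(struct bio *bio)  { printf("requeue, rw=%#x\n", bio->rw); }
static void bio_done(struct bio *bio)  { (void)bio; puts("complete to caller"); }

static void flush_step_done(struct bio *bio)
{
	if ((bio->rw & X_REQ_FLUSH) && bio->size) {
		/* preflush finished: reissue the data part without FLUSH */
		bio->rw &= ~X_REQ_FLUSH;
		queue_io(bio);
	} else {
		/* ordinary I/O or an empty flush: done */
		bio_done(bio);
	}
}

int main(void)
{
	struct bio b = { .rw = X_REQ_FLUSH, .size = 4096 };

	flush_step_done(&b);	/* first pass: requeued without FLUSH */
	flush_step_done(&b);	/* second pass: completes */
	return 0;
}
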
diff --combined drivers/md/md.c
index dbf822df942a70e7fb701d80936800dca40c2cbf,ed075d19db376493b16d415dc10568d68b4a85d8..225815197a3d69fba134433ab269d0c9a255681b
@@@ -36,7 -36,7 +36,7 @@@
  #include <linux/blkdev.h>
  #include <linux/sysctl.h>
  #include <linux/seq_file.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  #include <linux/buffer_head.h> /* for invalidate_bdev */
  #include <linux/poll.h>
  #include <linux/ctype.h>
@@@ -57,7 -57,6 +57,7 @@@
  #define DEBUG 0
  #define dprintk(x...) ((void)(DEBUG && printk(x)))
  
 +static DEFINE_MUTEX(md_mutex);
  
  #ifndef MODULE
  static void autostart_arrays(int part);
@@@ -227,12 -226,12 +227,12 @@@ static int md_make_request(struct reque
                return 0;
        }
        rcu_read_lock();
-       if (mddev->suspended || mddev->barrier) {
+       if (mddev->suspended) {
                DEFINE_WAIT(__wait);
                for (;;) {
                        prepare_to_wait(&mddev->sb_wait, &__wait,
                                        TASK_UNINTERRUPTIBLE);
-                       if (!mddev->suspended && !mddev->barrier)
+                       if (!mddev->suspended)
                                break;
                        rcu_read_unlock();
                        schedule();
@@@ -283,40 -282,29 +283,29 @@@ EXPORT_SYMBOL_GPL(mddev_resume)
  
  int mddev_congested(mddev_t *mddev, int bits)
  {
-       if (mddev->barrier)
-               return 1;
        return mddev->suspended;
  }
  EXPORT_SYMBOL(mddev_congested);
  
  /*
-  * Generic barrier handling for md
+  * Generic flush handling for md
   */
  
- #define POST_REQUEST_BARRIER ((void*)1)
- static void md_end_barrier(struct bio *bio, int err)
+ static void md_end_flush(struct bio *bio, int err)
  {
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;
-       if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-               set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
  
        rdev_dec_pending(rdev, mddev);
  
        if (atomic_dec_and_test(&mddev->flush_pending)) {
-               if (mddev->barrier == POST_REQUEST_BARRIER) {
-                       /* This was a post-request barrier */
-                       mddev->barrier = NULL;
-                       wake_up(&mddev->sb_wait);
-               } else
-                       /* The pre-request barrier has finished */
-                       schedule_work(&mddev->barrier_work);
+               /* The pre-request flush has finished */
+               schedule_work(&mddev->flush_work);
        }
        bio_put(bio);
  }
  
- static void submit_barriers(mddev_t *mddev)
+ static void submit_flushes(mddev_t *mddev)
  {
        mdk_rdev_t *rdev;
  
                        atomic_inc(&rdev->nr_pending);
                        rcu_read_unlock();
                        bi = bio_alloc(GFP_KERNEL, 0);
-                       bi->bi_end_io = md_end_barrier;
+                       bi->bi_end_io = md_end_flush;
                        bi->bi_private = rdev;
                        bi->bi_bdev = rdev->bdev;
                        atomic_inc(&mddev->flush_pending);
-                       submit_bio(WRITE_BARRIER, bi);
+                       submit_bio(WRITE_FLUSH, bi);
                        rcu_read_lock();
                        rdev_dec_pending(rdev, mddev);
                }
        rcu_read_unlock();
  }
  
- static void md_submit_barrier(struct work_struct *ws)
+ static void md_submit_flush_data(struct work_struct *ws)
  {
-       mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-       struct bio *bio = mddev->barrier;
+       mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+       struct bio *bio = mddev->flush_bio;
  
        atomic_set(&mddev->flush_pending, 1);
  
-       if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-               bio_endio(bio, -EOPNOTSUPP);
-       else if (bio->bi_size == 0)
+       if (bio->bi_size == 0)
                /* an empty barrier - all done */
                bio_endio(bio, 0);
        else {
-               bio->bi_rw &= ~REQ_HARDBARRIER;
+               bio->bi_rw &= ~REQ_FLUSH;
                if (mddev->pers->make_request(mddev, bio))
                        generic_make_request(bio);
-               mddev->barrier = POST_REQUEST_BARRIER;
-               submit_barriers(mddev);
        }
        if (atomic_dec_and_test(&mddev->flush_pending)) {
-               mddev->barrier = NULL;
+               mddev->flush_bio = NULL;
                wake_up(&mddev->sb_wait);
        }
  }
  
- void md_barrier_request(mddev_t *mddev, struct bio *bio)
+ void md_flush_request(mddev_t *mddev, struct bio *bio)
  {
        spin_lock_irq(&mddev->write_lock);
        wait_event_lock_irq(mddev->sb_wait,
-                           !mddev->barrier,
+                           !mddev->flush_bio,
                            mddev->write_lock, /*nothing*/);
-       mddev->barrier = bio;
+       mddev->flush_bio = bio;
        spin_unlock_irq(&mddev->write_lock);
  
        atomic_set(&mddev->flush_pending, 1);
-       INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+       INIT_WORK(&mddev->flush_work, md_submit_flush_data);
  
-       submit_barriers(mddev);
+       submit_flushes(mddev);
  
        if (atomic_dec_and_test(&mddev->flush_pending))
-               schedule_work(&mddev->barrier_work);
+               schedule_work(&mddev->flush_work);
  }
- EXPORT_SYMBOL(md_barrier_request);
+ EXPORT_SYMBOL(md_flush_request);
  
  /* Support for plugging.
   * This mirrors the plugging support in request_queue, but does not
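Note: with the barrier path gone, a personality is expected to hand any bio carrying REQ_FLUSH to md_flush_request() and let the helper above drive the per-device flushes. A minimal sketch of that hand-off, assuming a placeholder make_request function (the raid personalities converted in this series follow the same pattern):

static int example_make_request(mddev_t *mddev, struct bio *bio)
{
	/* Empty and data-carrying flush bios are both handed to md.c;
	 * md_flush_request() strips REQ_FLUSH and resubmits the data part
	 * once the per-device flushes have completed. */
	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
		md_flush_request(mddev, bio);
		return 0;
	}
	/* ... regular read/write handling would go here ... */
	return 0;
}
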
@@@ -697,31 -681,6 +682,6 @@@ static void super_written(struct bio *b
        bio_put(bio);
  }
  
- static void super_written_barrier(struct bio *bio, int error)
- {
-       struct bio *bio2 = bio->bi_private;
-       mdk_rdev_t *rdev = bio2->bi_private;
-       mddev_t *mddev = rdev->mddev;
-       if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-           error == -EOPNOTSUPP) {
-               unsigned long flags;
-               /* barriers don't appear to be supported :-( */
-               set_bit(BarriersNotsupp, &rdev->flags);
-               mddev->barriers_work = 0;
-               spin_lock_irqsave(&mddev->write_lock, flags);
-               bio2->bi_next = mddev->biolist;
-               mddev->biolist = bio2;
-               spin_unlock_irqrestore(&mddev->write_lock, flags);
-               wake_up(&mddev->sb_wait);
-               bio_put(bio);
-       } else {
-               bio_put(bio2);
-               bio->bi_private = rdev;
-               super_written(bio, error);
-       }
- }
  void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
  {
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
-        *
-        * As we might need to resubmit the request if REQ_HARDBARRIER
-        * causes ENOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
-       int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
  
        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
-       bio->bi_rw = rw;
  
        atomic_inc(&mddev->pending_writes);
-       if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-               struct bio *rbio;
-               rw |= REQ_HARDBARRIER;
-               rbio = bio_clone(bio, GFP_NOIO);
-               rbio->bi_private = bio;
-               rbio->bi_end_io = super_written_barrier;
-               submit_bio(rw, rbio);
-       } else
-               submit_bio(rw, bio);
+       submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+                  bio);
  }
  
  void md_super_wait(mddev_t *mddev)
  {
-       /* wait for all superblock writes that were scheduled to complete.
-        * if any had to be retried (due to BARRIER problems), retry them
-        */
+       /* wait for all superblock writes that were scheduled to complete */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
-               while (mddev->biolist) {
-                       struct bio *bio;
-                       spin_lock_irq(&mddev->write_lock);
-                       bio = mddev->biolist;
-                       mddev->biolist = bio->bi_next ;
-                       bio->bi_next = NULL;
-                       spin_unlock_irq(&mddev->write_lock);
-                       submit_bio(bio->bi_rw, bio);
-               }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
@@@ -1071,7 -1007,6 +1008,6 @@@ static int super_90_validate(mddev_t *m
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
-       clear_bit(BarriersNotsupp, &rdev->flags);
  
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
@@@ -1486,7 -1421,6 +1422,6 @@@ static int super_1_validate(mddev_t *md
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
-       clear_bit(BarriersNotsupp, &rdev->flags);
  
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
@@@ -4505,7 -4439,6 +4440,6 @@@ int md_run(mddev_t *mddev
        /* may be over-ridden by personality */
        mddev->resync_max_sectors = mddev->dev_sectors;
  
-       mddev->barriers_work = 1;
        mddev->ok_start_degraded = start_dirty_degraded;
  
        if (start_readonly && mddev->ro == 0)
@@@ -4684,7 -4617,6 +4618,6 @@@ static void md_clean(mddev_t *mddev
        mddev->recovery = 0;
        mddev->in_sync = 0;
        mddev->degraded = 0;
-       mddev->barriers_work = 0;
        mddev->safemode = 0;
        mddev->bitmap_info.offset = 0;
        mddev->bitmap_info.default_offset = 0;
@@@ -5952,7 -5884,7 +5885,7 @@@ static int md_open(struct block_device 
        mddev_t *mddev = mddev_find(bdev->bd_dev);
        int err;
  
 -      lock_kernel();
 +      mutex_lock(&md_mutex);
        if (mddev->gendisk != bdev->bd_disk) {
                /* we are racing with mddev_put which is discarding this
                 * bd_disk.
                /* Wait until bdev->bd_disk is definitely gone */
                flush_scheduled_work();
                /* Then retry the open from the top */
 -              unlock_kernel();
 +              mutex_unlock(&md_mutex);
                return -ERESTARTSYS;
        }
        BUG_ON(mddev != bdev->bd_disk->private_data);
  
        check_disk_size_change(mddev->gendisk, bdev);
   out:
 -      unlock_kernel();
 +      mutex_unlock(&md_mutex);
        return err;
  }
  
@@@ -5984,10 -5916,10 +5917,10 @@@ static int md_release(struct gendisk *d
        mddev_t *mddev = disk->private_data;
  
        BUG_ON(!mddev);
 -      lock_kernel();
 +      mutex_lock(&md_mutex);
        atomic_dec(&mddev->openers);
        mddev_put(mddev);
 -      unlock_kernel();
 +      mutex_unlock(&md_mutex);
  
        return 0;
  }
diff --combined drivers/s390/block/dasd.c
index 38e6fa9a2012fc40cdb25ab3a0089e916d9e5c8d,9b106d83b0cddd20095135d6062fbc824f0ff2cd..aa95f1001761534d187eb06ceab7597faa24f51d
@@@ -21,6 -21,7 +21,6 @@@
  #include <linux/hdreg.h>
  #include <linux/async.h>
  #include <linux/mutex.h>
 -#include <linux/smp_lock.h>
  
  #include <asm/ccwdev.h>
  #include <asm/ebcdic.h>
@@@ -2196,7 -2197,6 +2196,6 @@@ static void dasd_setup_queue(struct das
         */
        blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
        blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
-       blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
  }
  
  /*
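Note: dasd can simply drop the blk_queue_ordered(..., QUEUE_ORDERED_DRAIN) call because drain-only ordering implied no volatile write cache, which is exactly what the new flush machinery assumes by default. A driver whose device does have a write-back cache would instead advertise it with blk_queue_flush(), declared in the include/linux/blkdev.h hunk further down this page; a minimal sketch with a placeholder setup helper:

static void example_setup_cache_flush(struct request_queue *q)
{
	/* Device has a volatile write-back cache and honours FUA writes.
	 * Passing only REQ_FLUSH would make the block layer emulate FUA
	 * with a post-flush instead. */
	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
}
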
@@@ -2235,6 -2235,7 +2234,6 @@@ static int dasd_open(struct block_devic
        if (!block)
                return -ENODEV;
  
 -      lock_kernel();
        base = block->base;
        atomic_inc(&block->open_count);
        if (test_bit(DASD_FLAG_OFFLINE, &base->flags)) {
                goto out;
        }
  
 -      unlock_kernel();
        return 0;
  
  out:
        module_put(base->discipline->owner);
  unlock:
        atomic_dec(&block->open_count);
 -      unlock_kernel();
        return rc;
  }
  
@@@ -2282,8 -2285,10 +2281,8 @@@ static int dasd_release(struct gendisk 
  {
        struct dasd_block *block = disk->private_data;
  
 -      lock_kernel();
        atomic_dec(&block->open_count);
        module_put(block->base->discipline->owner);
 -      unlock_kernel();
        return 0;
  }
  
diff --combined fs/gfs2/rgrp.c
index fb67f593f40856b213f03887642c44a3e4c7ccc5,38b3ea1abaccd6ba9afe45720198d4e156171781..bef3ab6cf5c1aeb2d0f28d7955e4e62c8cb34be2
@@@ -500,7 -500,7 +500,7 @@@ u64 gfs2_ri_total(struct gfs2_sbd *sdp
        for (rgrps = 0;; rgrps++) {
                loff_t pos = rgrps * sizeof(struct gfs2_rindex);
  
 -              if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
 +              if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
                        break;
                error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                           sizeof(struct gfs2_rindex));
@@@ -588,9 -588,7 +588,9 @@@ static int gfs2_ri_update(struct gfs2_i
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
        struct file_ra_state ra_state;
 -      u64 rgrp_count = ip->i_disksize;
 +      u64 rgrp_count = i_size_read(inode);
 +      struct gfs2_rgrpd *rgd;
 +      unsigned int max_data = 0;
        int error;
  
        do_div(rgrp_count, sizeof(struct gfs2_rindex));
                }
        }
  
 +      list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
 +              if (rgd->rd_data > max_data)
 +                      max_data = rgd->rd_data;
 +      sdp->sd_max_rg_data = max_data;
        sdp->sd_rindex_uptodate = 1;
        return 0;
  }
@@@ -628,15 -622,13 +628,15 @@@ static int gfs2_ri_update_special(struc
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
        struct file_ra_state ra_state;
 +      struct gfs2_rgrpd *rgd;
 +      unsigned int max_data = 0;
        int error;
  
        file_ra_state_init(&ra_state, inode->i_mapping);
        for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
                /* Ignore partials */
                if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
 -                  ip->i_disksize)
 +                  i_size_read(inode))
                        break;
                error = read_rindex_entry(ip, &ra_state);
                if (error) {
                        return error;
                }
        }
 +      list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
 +              if (rgd->rd_data > max_data)
 +                      max_data = rgd->rd_data;
 +      sdp->sd_max_rg_data = max_data;
  
        sdp->sd_rindex_uptodate = 1;
        return 0;
@@@ -866,8 -854,7 +866,7 @@@ static void gfs2_rgrp_send_discards(str
                                if ((start + nr_sects) != blk) {
                                        rv = blkdev_issue_discard(bdev, start,
                                                            nr_sects, GFP_NOFS,
-                                                           BLKDEV_IFL_WAIT |
-                                                           BLKDEV_IFL_BARRIER);
+                                                           0);
                                        if (rv)
                                                goto fail;
                                        nr_sects = 0;
@@@ -881,8 -868,7 +880,7 @@@ start_new_extent
                }
        }
        if (nr_sects) {
-               rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-                                        BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+               rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
                if (rv)
                        goto fail;
        }
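Note: with BLKDEV_IFL_WAIT and BLKDEV_IFL_BARRIER removed, blkdev_issue_discard() always waits for completion and the remaining flags argument only selects a secure discard (BLKDEV_DISCARD_SECURE, defined in the blkdev.h hunk below). A minimal sketch with a placeholder helper name:

static int example_secure_discard(struct block_device *bdev, sector_t start,
				  sector_t nr_sects)
{
	/* 0, as gfs2 passes above, is a plain discard; either way the call
	 * only returns once the discard has completed. */
	return blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
				    BLKDEV_DISCARD_SECURE);
}
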
@@@ -1200,8 -1186,7 +1198,8 @@@ out
   * Returns: errno
   */
  
 -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 +                         char *file, unsigned int line)
  {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al = ip->i_alloc;
                return -EINVAL;
  
  try_again:
 -      /* We need to hold the rindex unless the inode we're using is
 -         the rindex itself, in which case it's already held. */
 -      if (ip != GFS2_I(sdp->sd_rindex))
 -              error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
 -      else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
 -              error = gfs2_ri_update_special(ip);
 +      if (hold_rindex) {
 +              /* We need to hold the rindex unless the inode we're using is
 +                 the rindex itself, in which case it's already held. */
 +              if (ip != GFS2_I(sdp->sd_rindex))
 +                      error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
 +              else if (!sdp->sd_rgrps) /* We may not have the rindex read
 +                                          in, so: */
 +                      error = gfs2_ri_update_special(ip);
 +      }
  
        if (error)
                return error;
           try to free it, and try the allocation again. */
        error = get_local_rgrp(ip, &unlinked, &last_unlinked);
        if (error) {
 -              if (ip != GFS2_I(sdp->sd_rindex))
 +              if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
                        gfs2_glock_dq_uninit(&al->al_ri_gh);
                if (error != -EAGAIN)
                        return error;
@@@ -1273,7 -1255,7 +1271,7 @@@ void gfs2_inplace_release(struct gfs2_i
        al->al_rgd = NULL;
        if (al->al_rgd_gh.gh_gl)
                gfs2_glock_dq_uninit(&al->al_rgd_gh);
 -      if (ip != GFS2_I(sdp->sd_rindex))
 +      if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
                gfs2_glock_dq_uninit(&al->al_ri_gh);
  }
  
@@@ -1512,19 -1494,11 +1510,19 @@@ int gfs2_alloc_block(struct gfs2_inode 
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct buffer_head *dibh;
        struct gfs2_alloc *al = ip->i_alloc;
 -      struct gfs2_rgrpd *rgd = al->al_rgd;
 +      struct gfs2_rgrpd *rgd;
        u32 goal, blk;
        u64 block;
        int error;
  
 +      /* Only happens if there is a bug in gfs2, return something distinctive
 +       * to ensure that it is noticed.
 +       */
 +      if (al == NULL)
 +              return -ECANCELED;
 +
 +      rgd = al->al_rgd;
 +
        if (rgrp_contains_block(rgd, ip->i_goal))
                goal = ip->i_goal - rgd->rd_data0;
        else
diff --combined fs/jbd/commit.c
index 3f030e9efea6abfc5a835361b935295b30b5ef0a,484c5e5fa8af50c0440452de0f8ad23d3dfddbc6..85a6883c0aca265b898431eb2a2254632a8a8ef7
@@@ -137,34 -137,10 +137,10 @@@ static int journal_write_commit_record(
        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
  
-       if (journal->j_flags & JFS_BARRIER) {
-               ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
-               /*
-                * Is it possible for another commit to fail at roughly
-                * the same time as this one?  If so, we don't want to
-                * trust the barrier flag in the super, but instead want
-                * to remember if we sent a barrier request
-                */
-               if (ret == -EOPNOTSUPP) {
-                       char b[BDEVNAME_SIZE];
-                       printk(KERN_WARNING
-                               "JBD: barrier-based sync failed on %s - "
-                               "disabling barriers\n",
-                               bdevname(journal->j_dev, b));
-                       spin_lock(&journal->j_state_lock);
-                       journal->j_flags &= ~JFS_BARRIER;
-                       spin_unlock(&journal->j_state_lock);
-                       /* And try again, without the barrier */
-                       set_buffer_uptodate(bh);
-                       set_buffer_dirty(bh);
-                       ret = sync_dirty_buffer(bh);
-               }
-       } else {
+       if (journal->j_flags & JFS_BARRIER)
+               ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
+       else
                ret = sync_dirty_buffer(bh);
-       }
  
        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);
@@@ -318,7 -294,7 +294,7 @@@ void journal_commit_transaction(journal
        int first_tag = 0;
        int tag_flag;
        int i;
 -      int write_op = WRITE;
 +      int write_op = WRITE_SYNC;
  
        /*
         * First job: lock down the current transaction and wait for
diff --combined fs/jbd2/commit.c
index 80910f51d4b447c8f7adc4d5eb26053f0e6ce75f,cb43c605cfaa247ec71e35c3d4ddf051b3a80963..bc6be8bda1cc067d3230acfbe20847b45906ee34
@@@ -134,25 -134,11 +134,11 @@@ static int journal_submit_commit_record
  
        if (journal->j_flags & JBD2_BARRIER &&
            !JBD2_HAS_INCOMPAT_FEATURE(journal,
-                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-               ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
-               if (ret == -EOPNOTSUPP) {
-                       printk(KERN_WARNING
-                              "JBD2: Disabling barriers on %s, "
-                              "not supported by device\n", journal->j_devname);
-                       write_lock(&journal->j_state_lock);
-                       journal->j_flags &= ~JBD2_BARRIER;
-                       write_unlock(&journal->j_state_lock);
-                       /* And try again, without the barrier */
-                       lock_buffer(bh);
-                       set_buffer_uptodate(bh);
-                       clear_buffer_dirty(bh);
-                       ret = submit_bh(WRITE_SYNC_PLUG, bh);
-               }
-       } else {
+                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+               ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+       else
                ret = submit_bh(WRITE_SYNC_PLUG, bh);
-       }
        *cbh = bh;
        return ret;
  }
@@@ -166,29 -152,8 +152,8 @@@ static int journal_wait_on_commit_recor
  {
        int ret = 0;
  
- retry:
        clear_buffer_dirty(bh);
        wait_on_buffer(bh);
-       if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
-               printk(KERN_WARNING
-                      "JBD2: %s: disabling barries on %s - not supported "
-                      "by device\n", __func__, journal->j_devname);
-               write_lock(&journal->j_state_lock);
-               journal->j_flags &= ~JBD2_BARRIER;
-               write_unlock(&journal->j_state_lock);
-               lock_buffer(bh);
-               clear_buffer_dirty(bh);
-               set_buffer_uptodate(bh);
-               bh->b_end_io = journal_end_buffer_io_sync;
-               ret = submit_bh(WRITE_SYNC_PLUG, bh);
-               if (ret) {
-                       unlock_buffer(bh);
-                       return ret;
-               }
-               goto retry;
-       }
  
        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
@@@ -360,7 -325,7 +325,7 @@@ void jbd2_journal_commit_transaction(jo
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
 -      int write_op = WRITE;
 +      int write_op = WRITE_SYNC;
  
        /*
         * First job: lock down the current transaction and wait for
@@@ -701,6 -666,16 +666,16 @@@ start_journal_io
                }
        }
  
+       err = journal_finish_inode_data_buffers(journal, commit_transaction);
+       if (err) {
+               printk(KERN_WARNING
+                       "JBD2: Detected IO errors while flushing file data "
+                      "on %s\n", journal->j_devname);
+               if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+                       jbd2_journal_abort(journal, err);
+               err = 0;
+       }
        /* 
         * If the journal is not located on the file system device,
         * then we must flush the file system device before we issue
        if (commit_transaction->t_flushed_data_blocks &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
-               blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-                       BLKDEV_IFL_WAIT);
+               blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
  
        /* Done it all: now write the commit record asynchronously. */
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
-               if (journal->j_flags & JBD2_BARRIER)
-                       blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-                               BLKDEV_IFL_WAIT);
-       }
-       err = journal_finish_inode_data_buffers(journal, commit_transaction);
-       if (err) {
-               printk(KERN_WARNING
-                       "JBD2: Detected IO errors while flushing file data "
-                      "on %s\n", journal->j_devname);
-               if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
-                       jbd2_journal_abort(journal, err);
-               err = 0;
        }
  
        /* Lo and behold: we have just managed to send a transaction to
@@@ -845,6 -806,11 +806,11 @@@ wait_for_iobuf
        }
        if (!err && !is_journal_aborted(journal))
                err = journal_wait_on_commit_record(journal, cbh);
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+           journal->j_flags & JBD2_BARRIER) {
+               blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+       }
  
        if (err)
                jbd2_journal_abort(journal, err);
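Note: the explicit cache flushes above use the slimmed-down blkdev_issue_flush(); the BLKDEV_IFL_WAIT flag is gone and the call now always waits. A minimal sketch of a filesystem flushing its data device, with a placeholder helper name:

static int example_flush_data_device(struct super_block *sb)
{
	/* Third argument is the optional error-sector pointer; the call
	 * returns only after the flush has completed. */
	return blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
}
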
diff --combined fs/nilfs2/super.c
index 9f4913f78408a80e5c9b1253c19ac092adecb2b2,faa5078ff751c83a511e497fc1f4653b90bfd81d..f3b75206e9560888489856361cb9b0067167267e
@@@ -45,6 -45,7 +45,6 @@@
  #include <linux/parser.h>
  #include <linux/random.h>
  #include <linux/crc32.h>
 -#include <linux/smp_lock.h>
  #include <linux/vfs.h>
  #include <linux/writeback.h>
  #include <linux/kobject.h>
@@@ -177,17 -178,9 +177,9 @@@ static int nilfs_sync_super(struct nilf
  
   retry:
        set_buffer_dirty(nilfs->ns_sbh[0]);
        if (nilfs_test_opt(sbi, BARRIER)) {
                err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-                                         WRITE_SYNC | WRITE_BARRIER);
-               if (err == -EOPNOTSUPP) {
-                       nilfs_warning(sbi->s_super, __func__,
-                                     "barrier-based sync failed. "
-                                     "disabling barriers\n");
-                       nilfs_clear_opt(sbi, BARRIER);
-                       goto retry;
-               }
+                                         WRITE_SYNC | WRITE_FLUSH_FUA);
        } else {
                err = sync_dirty_buffer(nilfs->ns_sbh[0]);
        }
@@@ -341,6 -334,8 +333,6 @@@ static void nilfs_put_super(struct supe
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct the_nilfs *nilfs = sbi->s_nilfs;
  
 -      lock_kernel();
 -
        nilfs_detach_segment_constructor(sbi);
  
        if (!(sb->s_flags & MS_RDONLY)) {
        sbi->s_super = NULL;
        sb->s_fs_info = NULL;
        nilfs_put_sbinfo(sbi);
 -
 -      unlock_kernel();
  }
  
  static int nilfs_sync_fs(struct super_block *sb, int wait)
@@@ -944,6 -941,8 +936,6 @@@ static int nilfs_remount(struct super_b
        struct nilfs_mount_options old_opts;
        int was_snapshot, err;
  
 -      lock_kernel();
 -
        down_write(&nilfs->ns_super_sem);
        old_sb_flags = sb->s_flags;
        old_opts.mount_opt = sbi->s_mount_opt;
        }
   out:
        up_write(&nilfs->ns_super_sem);
 -      unlock_kernel();
        return 0;
  
   restore_opts:
        sbi->s_mount_opt = old_opts.mount_opt;
        sbi->s_snapshot_cno = old_opts.snapshot_cno;
        up_write(&nilfs->ns_super_sem);
 -      unlock_kernel();
        return err;
  }
  
@@@ -1196,6 -1197,7 +1188,6 @@@ nilfs_get_sb(struct file_system_type *f
        put_nilfs(nilfs);
   failed:
        close_bdev_exclusive(sd.bdev, mode);
 -
        return err;
  
   cancel_new:
diff --combined include/linux/blk_types.h
index d36629620a4fb9143c0e37b0b1a16302afda0efc,36edadf5b41a658b54b3d22bbc4437f44f08203a..0437ab6bb54c0b2265c39bb961e88eda09d38739
@@@ -97,7 -97,6 +97,7 @@@ struct bio 
  #define BIO_NULL_MAPPED 9     /* contains invalid user pages */
  #define BIO_FS_INTEGRITY 10   /* fs owns integrity data, not block layer */
  #define BIO_QUIET     11      /* Make BIO Quiet */
 +#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */
  #define bio_flagged(bio, flag)        ((bio)->bi_flags & (1 << (flag)))
  
  /*
@@@ -131,8 -130,6 +131,8 @@@ enum rq_flag_bits 
        /* bio only flags */
        __REQ_UNPLUG,           /* unplug the immediately after submission */
        __REQ_RAHEAD,           /* read ahead, can fail anytime */
 +      __REQ_THROTTLED,        /* This bio has already been subjected to
 +                               * throttling rules. Don't do it again. */
  
        /* request only flags */
        __REQ_SORTED,           /* elevator knows about this request */
        __REQ_FAILED,           /* set if the request failed */
        __REQ_QUIET,            /* don't worry about errors */
        __REQ_PREEMPT,          /* set for "ide_preempt" requests */
-       __REQ_ORDERED_COLOR,    /* is before or after barrier */
        __REQ_ALLOCED,          /* request came from our alloc pool */
        __REQ_COPY_USER,        /* contains copies of user pages */
 -      __REQ_INTEGRITY,        /* integrity metadata has been remapped */
        __REQ_FLUSH,            /* request for cache flush */
        __REQ_IO_STAT,          /* account I/O stat */
        __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
        (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
  #define REQ_COMMON_MASK \
        (REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
-        REQ_META| REQ_DISCARD | REQ_NOIDLE)
+        REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
+ #define REQ_CLONE_MASK                REQ_COMMON_MASK
  
  #define REQ_UNPLUG            (1 << __REQ_UNPLUG)
  #define REQ_RAHEAD            (1 << __REQ_RAHEAD)
 +#define REQ_THROTTLED         (1 << __REQ_THROTTLED)
  
  #define REQ_SORTED            (1 << __REQ_SORTED)
  #define REQ_SOFTBARRIER               (1 << __REQ_SOFTBARRIER)
  #define REQ_FAILED            (1 << __REQ_FAILED)
  #define REQ_QUIET             (1 << __REQ_QUIET)
  #define REQ_PREEMPT           (1 << __REQ_PREEMPT)
- #define REQ_ORDERED_COLOR     (1 << __REQ_ORDERED_COLOR)
  #define REQ_ALLOCED           (1 << __REQ_ALLOCED)
  #define REQ_COPY_USER         (1 << __REQ_COPY_USER)
 -#define REQ_INTEGRITY         (1 << __REQ_INTEGRITY)
  #define REQ_FLUSH             (1 << __REQ_FLUSH)
  #define REQ_IO_STAT           (1 << __REQ_IO_STAT)
  #define REQ_MIXED_MERGE               (1 << __REQ_MIXED_MERGE)
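Note: REQ_FLUSH and REQ_FUA replace REQ_HARDBARRIER as the cache-control flags, and their addition to REQ_COMMON_MASK above means they survive the bio-to-request and clone paths. A request-based driver that advertised them via blk_queue_flush() can then test them directly; a minimal sketch in which the two driver hooks are hypothetical, not kernel API:

/* Hypothetical driver hooks, not kernel API: */
extern void example_issue_cache_flush(struct request *rq);
extern void example_set_fua_bit(struct request *rq);

static void example_prep_request(struct request *rq)
{
	if (rq->cmd_flags & REQ_FLUSH)
		example_issue_cache_flush(rq);	/* empty cache-flush request */
	else if (rq->cmd_flags & REQ_FUA)
		example_set_fua_bit(rq);	/* data must reach media before completion */
}
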
diff --combined include/linux/blkdev.h
index 16f7f1be1acf2d88569955dba40e11d9308cce4b,accbd0e5c89360fba02b2fb32001f2d7de372dba..009b80e49f5361bb119e346e184c796d698dfc42
@@@ -115,7 -115,6 +115,7 @@@ struct request 
        void *elevator_private3;
  
        struct gendisk *rq_disk;
 +      struct hd_struct *part;
        unsigned long start_time;
  #ifdef CONFIG_BLK_CGROUP
        unsigned long long start_time_ns;
         * physical address coalescing is performed.
         */
        unsigned short nr_phys_segments;
 +#if defined(CONFIG_BLK_DEV_INTEGRITY)
 +      unsigned short nr_integrity_segments;
 +#endif
  
        unsigned short ioprio;
  
@@@ -247,7 -243,6 +247,7 @@@ struct queue_limits 
  
        unsigned short          logical_block_size;
        unsigned short          max_segments;
 +      unsigned short          max_integrity_segments;
  
        unsigned char           misaligned;
        unsigned char           discard_misaligned;
@@@ -360,23 -355,20 +360,25 @@@ struct request_queu
        struct blk_trace        *blk_trace;
  #endif
        /*
-        * reserved for flush operations
+        * for flush operations
         */
-       unsigned int            ordered, next_ordered, ordseq;
-       int                     orderr, ordcolor;
-       struct request          pre_flush_rq, bar_rq, post_flush_rq;
-       struct request          *orig_bar_rq;
+       unsigned int            flush_flags;
+       unsigned int            flush_seq;
+       int                     flush_err;
+       struct request          flush_rq;
+       struct request          *orig_flush_rq;
+       struct list_head        pending_flushes;
  
        struct mutex            sysfs_lock;
  
  #if defined(CONFIG_BLK_DEV_BSG)
        struct bsg_class_device bsg_dev;
  #endif
 +
 +#ifdef CONFIG_BLK_DEV_THROTTLING
 +      /* Throttle data */
 +      struct throtl_data *td;
 +#endif
  };
  
  #define QUEUE_FLAG_CLUSTER    0       /* cluster several segments into 1 */
@@@ -472,56 -464,6 +474,6 @@@ static inline void queue_flag_clear(uns
        __clear_bit(flag, &q->queue_flags);
  }
  
- enum {
-       /*
-        * Hardbarrier is supported with one of the following methods.
-        *
-        * NONE         : hardbarrier unsupported
-        * DRAIN        : ordering by draining is enough
-        * DRAIN_FLUSH  : ordering by draining w/ pre and post flushes
-        * DRAIN_FUA    : ordering by draining w/ pre flush and FUA write
-        * TAG          : ordering by tag is enough
-        * TAG_FLUSH    : ordering by tag w/ pre and post flushes
-        * TAG_FUA      : ordering by tag w/ pre flush and FUA write
-        */
-       QUEUE_ORDERED_BY_DRAIN          = 0x01,
-       QUEUE_ORDERED_BY_TAG            = 0x02,
-       QUEUE_ORDERED_DO_PREFLUSH       = 0x10,
-       QUEUE_ORDERED_DO_BAR            = 0x20,
-       QUEUE_ORDERED_DO_POSTFLUSH      = 0x40,
-       QUEUE_ORDERED_DO_FUA            = 0x80,
-       QUEUE_ORDERED_NONE              = 0x00,
-       QUEUE_ORDERED_DRAIN             = QUEUE_ORDERED_BY_DRAIN |
-                                         QUEUE_ORDERED_DO_BAR,
-       QUEUE_ORDERED_DRAIN_FLUSH       = QUEUE_ORDERED_DRAIN |
-                                         QUEUE_ORDERED_DO_PREFLUSH |
-                                         QUEUE_ORDERED_DO_POSTFLUSH,
-       QUEUE_ORDERED_DRAIN_FUA         = QUEUE_ORDERED_DRAIN |
-                                         QUEUE_ORDERED_DO_PREFLUSH |
-                                         QUEUE_ORDERED_DO_FUA,
-       QUEUE_ORDERED_TAG               = QUEUE_ORDERED_BY_TAG |
-                                         QUEUE_ORDERED_DO_BAR,
-       QUEUE_ORDERED_TAG_FLUSH         = QUEUE_ORDERED_TAG |
-                                         QUEUE_ORDERED_DO_PREFLUSH |
-                                         QUEUE_ORDERED_DO_POSTFLUSH,
-       QUEUE_ORDERED_TAG_FUA           = QUEUE_ORDERED_TAG |
-                                         QUEUE_ORDERED_DO_PREFLUSH |
-                                         QUEUE_ORDERED_DO_FUA,
-       /*
-        * Ordered operation sequence
-        */
-       QUEUE_ORDSEQ_STARTED    = 0x01, /* flushing in progress */
-       QUEUE_ORDSEQ_DRAIN      = 0x02, /* waiting for the queue to be drained */
-       QUEUE_ORDSEQ_PREFLUSH   = 0x04, /* pre-flushing in progress */
-       QUEUE_ORDSEQ_BAR        = 0x08, /* original barrier req in progress */
-       QUEUE_ORDSEQ_POSTFLUSH  = 0x10, /* post-flushing in progress */
-       QUEUE_ORDSEQ_DONE       = 0x20,
- };
  #define blk_queue_plugged(q)  test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
  #define blk_queue_tagged(q)   test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
  #define blk_queue_stopped(q)  test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
  #define blk_queue_nonrot(q)   test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
  #define blk_queue_io_stat(q)  test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
  #define blk_queue_add_random(q)       test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
- #define blk_queue_flushing(q) ((q)->ordseq)
  #define blk_queue_stackable(q)        \
        test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
  #define blk_queue_discard(q)  test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
@@@ -602,7 -543,8 +553,8 @@@ static inline void blk_clear_queue_full
   * it already be started by driver.
   */
  #define RQ_NOMERGE_FLAGS      \
-       (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
+       (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \
+        REQ_FLUSH | REQ_FUA)
  #define rq_mergeable(rq)      \
        (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
         (((rq)->cmd_flags & REQ_DISCARD) || \
@@@ -861,7 -803,7 +813,7 @@@ extern void blk_queue_max_segment_size(
  extern void blk_queue_max_discard_sectors(struct request_queue *q,
                unsigned int max_discard_sectors);
  extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 -extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
 +extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
  extern void blk_queue_alignment_offset(struct request_queue *q,
                                       unsigned int alignment);
  extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
@@@ -891,12 -833,8 +843,8 @@@ extern void blk_queue_update_dma_alignm
  extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
  extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
  extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
+ extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
- extern int blk_queue_ordered(struct request_queue *, unsigned);
- extern bool blk_do_ordered(struct request_queue *, struct request **);
- extern unsigned blk_ordered_cur_seq(struct request_queue *);
- extern unsigned blk_ordered_req_seq(struct request *);
- extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
  
  extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
  extern void blk_dump_rq_flags(struct request *, char *);
@@@ -929,27 -867,20 +877,20 @@@ static inline struct request *blk_map_q
                return NULL;
        return bqt->tag_index[tag];
  }
- enum{
-       BLKDEV_WAIT,    /* wait for completion */
-       BLKDEV_BARRIER, /* issue request with barrier */
-       BLKDEV_SECURE,  /* secure discard */
- };
- #define BLKDEV_IFL_WAIT               (1 << BLKDEV_WAIT)
- #define BLKDEV_IFL_BARRIER    (1 << BLKDEV_BARRIER)
- #define BLKDEV_IFL_SECURE     (1 << BLKDEV_SECURE)
- extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
-                       unsigned long);
+ #define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
+ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
  extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
  extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-                       sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
- static inline int sb_issue_discard(struct super_block *sb,
-                                  sector_t block, sector_t nr_blocks)
+                       sector_t nr_sects, gfp_t gfp_mask);
+ static inline int sb_issue_discard(struct super_block *sb, sector_t block,
+               sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
  {
-       block <<= (sb->s_blocksize_bits - 9);
-       nr_blocks <<= (sb->s_blocksize_bits - 9);
-       return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
-                                  BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+       return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
+                                   nr_blocks << (sb->s_blocksize_bits - 9),
+                                   gfp_mask, flags);
  }
  
  extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
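Note: sb_issue_discard() no longer hard-codes GFP_NOFS and the old wait/barrier flags; callers now choose both. A minimal sketch of the filesystem side, with a placeholder helper name:

static int example_discard_blocks(struct super_block *sb, sector_t block,
				  sector_t nr_blocks)
{
	/* Callers now pass the gfp mask and flags; 0 means a plain,
	 * synchronous discard. */
	return sb_issue_discard(sb, block, nr_blocks, GFP_NOFS, 0);
}
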
@@@ -1014,7 -945,7 +955,7 @@@ static inline unsigned int queue_physic
        return q->limits.physical_block_size;
  }
  
 -static inline int bdev_physical_block_size(struct block_device *bdev)
 +static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
  {
        return queue_physical_block_size(bdev_get_queue(bdev));
  }
@@@ -1103,11 -1034,11 +1044,11 @@@ static inline int queue_dma_alignment(s
        return q ? q->dma_alignment : 511;
  }
  
 -static inline int blk_rq_aligned(struct request_queue *q, void *addr,
 +static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
                                 unsigned int len)
  {
        unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 -      return !((unsigned long)addr & alignment) && !(len & alignment);
 +      return !(addr & alignment) && !(len & alignment);
  }
  
  /* assumes size > 256 */
@@@ -1137,7 -1068,6 +1078,7 @@@ static inline void put_dev_sector(Secto
  
  struct work_struct;
  int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 +int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
  
  #ifdef CONFIG_BLK_CGROUP
  /*
@@@ -1181,24 -1111,6 +1122,24 @@@ static inline uint64_t rq_io_start_time
  }
  #endif
  
 +#ifdef CONFIG_BLK_DEV_THROTTLING
 +extern int blk_throtl_init(struct request_queue *q);
 +extern void blk_throtl_exit(struct request_queue *q);
 +extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
 +extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
 +extern void throtl_shutdown_timer_wq(struct request_queue *q);
 +#else /* CONFIG_BLK_DEV_THROTTLING */
 +static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
 +{
 +      return 0;
 +}
 +
 +static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 +static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
 +static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
 +static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
 +#endif /* CONFIG_BLK_DEV_THROTTLING */
 +
  #define MODULE_ALIAS_BLOCKDEV(major,minor) \
        MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
  #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
@@@ -1242,13 -1154,8 +1183,13 @@@ struct blk_integrity 
  extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
  extern void blk_integrity_unregister(struct gendisk *);
  extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
 -extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 -extern int blk_rq_count_integrity_sg(struct request *);
 +extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
 +                                 struct scatterlist *);
 +extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 +extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
 +                                struct request *);
 +extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
 +                                 struct bio *);
  
  static inline
  struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
@@@ -1269,32 -1176,16 +1210,32 @@@ static inline int blk_integrity_rq(stru
        return bio_integrity(rq->bio);
  }
  
 +static inline void blk_queue_max_integrity_segments(struct request_queue *q,
 +                                                  unsigned int segs)
 +{
 +      q->limits.max_integrity_segments = segs;
 +}
 +
 +static inline unsigned short
 +queue_max_integrity_segments(struct request_queue *q)
 +{
 +      return q->limits.max_integrity_segments;
 +}
 +
  #else /* CONFIG_BLK_DEV_INTEGRITY */
  
  #define blk_integrity_rq(rq)                  (0)
 -#define blk_rq_count_integrity_sg(a)          (0)
 -#define blk_rq_map_integrity_sg(a, b)         (0)
 +#define blk_rq_count_integrity_sg(a, b)               (0)
 +#define blk_rq_map_integrity_sg(a, b, c)      (0)
  #define bdev_get_integrity(a)                 (0)
  #define blk_get_integrity(a)                  (0)
  #define blk_integrity_compare(a, b)           (0)
  #define blk_integrity_register(a, b)          (0)
  #define blk_integrity_unregister(a)           do { } while (0);
 +#define blk_queue_max_integrity_segments(a, b)        do { } while (0);
 +#define queue_max_integrity_segments(a)               (0)
 +#define blk_integrity_merge_rq(a, b, c)               (0)
 +#define blk_integrity_merge_bio(a, b, c)      (0)
  
  #endif /* CONFIG_BLK_DEV_INTEGRITY */
  
diff --combined include/linux/fs.h
index 0a81b87ea15813902fbb64108fa8931ffc80cce0,34a1cbcb56154670272e0c2753580cb4a2699ee7..4f34ff6e55585b365db2c419c878e46c67d6d9b0
@@@ -135,12 -135,12 +135,12 @@@ struct inodes_stat_t 
   *                    immediately after submission. The write equivalent
   *                    of READ_SYNC.
   * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
-  * WRITE_BARRIER      Like WRITE_SYNC, but tells the block layer that all
-  *                    previously submitted writes must be safely on storage
-  *                    before this one is started. Also guarantees that when
-  *                    this write is complete, it itself is also safely on
-  *                    storage. Prevents reordering of writes on both sides
-  *                    of this IO.
+  * WRITE_FLUSH                Like WRITE_SYNC but with preceding cache flush.
+  * WRITE_FUA          Like WRITE_SYNC but data is guaranteed to be on
+  *                    non-volatile media on completion.
+  * WRITE_FLUSH_FUA    Combination of WRITE_FLUSH and FUA. The IO is preceded
+  *                    by a cache flush and data is guaranteed to be on
+  *                    non-volatile media on completion.
   *
   */
  #define RW_MASK                       REQ_WRITE
  #define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
  #define WRITE_ODIRECT_PLUG    (WRITE | REQ_SYNC)
  #define WRITE_META            (WRITE | REQ_META)
- #define WRITE_BARRIER         (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-                                REQ_HARDBARRIER)
- /*
-  * These aren't really reads or writes, they pass down information about
-  * parts of device that are now unused by the file system.
-  */
- #define DISCARD_NOBARRIER     (WRITE | REQ_DISCARD)
- #define DISCARD_BARRIER               (WRITE | REQ_DISCARD | REQ_HARDBARRIER)
- #define DISCARD_SECURE                (DISCARD_NOBARRIER | REQ_SECURE)
+ #define WRITE_FLUSH           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+                                REQ_FLUSH)
+ #define WRITE_FUA             (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+                                REQ_FUA)
+ #define WRITE_FLUSH_FUA               (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+                                REQ_FLUSH | REQ_FUA)
  
  #define SEL_IN                1
  #define SEL_OUT               2
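Note: the jbd, jbd2 and nilfs2 hunks above converge on the WRITE_FLUSH_FUA shorthand defined here: a single submission preceded by a cache flush and whose own data is forced to non-volatile media. A minimal sketch of the buffer-head variant those filesystems use, with a placeholder function name:

static int example_write_commit_block(struct buffer_head *bh)
{
	set_buffer_dirty(bh);
	/* The preceding flush orders earlier journal writes; FUA makes the
	 * commit block itself durable before the call returns. */
	return __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
}
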
@@@ -1093,6 -1089,10 +1089,6 @@@ struct file_lock 
  
  #include <linux/fcntl.h>
  
 -/* temporary stubs for BKL removal */
 -#define lock_flocks() lock_kernel()
 -#define unlock_flocks() unlock_kernel()
 -
  extern void send_sigio(struct fown_struct *fown, int fd, int band);
  
  #ifdef CONFIG_FILE_LOCKING
@@@ -1131,8 -1131,6 +1127,8 @@@ extern int vfs_setlease(struct file *, 
  extern int lease_modify(struct file_lock **, int);
  extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
  extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
 +extern void lock_flocks(void);
 +extern void unlock_flocks(void);
  #else /* !CONFIG_FILE_LOCKING */
  static inline int fcntl_getlk(struct file *file, struct flock __user *user)
  {
@@@ -1275,14 -1273,6 +1271,14 @@@ static inline int lock_may_write(struc
        return 1;
  }
  
 +static inline void lock_flocks(void)
 +{
 +}
 +
 +static inline void unlock_flocks(void)
 +{
 +}
 +
  #endif /* !CONFIG_FILE_LOCKING */
  
  
@@@ -1390,7 -1380,7 +1386,7 @@@ struct super_block 
         * Saved mount options for lazy filesystems using
         * generic_show_options()
         */
 -      char *s_options;
 +      char __rcu *s_options;
  };
  
  extern struct timespec current_fs_time(struct super_block *sb);
@@@ -2384,8 -2374,6 +2380,8 @@@ extern ssize_t simple_write_to_buffer(v
  
  extern int generic_file_fsync(struct file *, int);
  
 +extern int generic_check_addressable(unsigned, u64);
 +
  #ifdef CONFIG_MIGRATION
  extern int buffer_migrate_page(struct address_space *,
                                struct page *, struct page *);
@@@ -2462,7 -2450,6 +2458,7 @@@ static const struct file_operations __f
        .release = simple_attr_release,                                 \
        .read    = simple_attr_read,                                    \
        .write   = simple_attr_write,                                   \
 +      .llseek  = generic_file_llseek,                                 \
  };
  
  static inline void __attribute__((format(printf, 1, 2)))